In [1]:
import math
import numpy as np
# from cv2 import Rodrigues
from math import sin, cos, atan2, sqrt
import pandas as pd
from pathlib import Path
import pyproj
from pyproj import Proj, transform
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from tqdm import tqdm
import warnings
import pathlib
import itertools

In [2]:
data_dir = Path("../input/google-smartphone-decimeter-challenge")
bl_trn_fname = 'baseline_locations_train.csv'
bl_tst_fname = 'baseline_locations_test.csv'
sample_fname = 'sample_submission.csv'

# ground truth
p = pathlib.Path(data_dir)
gt_files = list(p.glob('train/*/*/ground_truth.csv'))
gts = []
for gt_file in gt_files:
    gts.append(pd.read_csv(gt_file))
ground_truth = pd.concat(gts)

In [3]:
bl_trn_df = pd.read_csv(data_dir / bl_trn_fname)
bl_tst_df = pd.read_csv(data_dir / bl_tst_fname)
sample_df = pd.read_csv(data_dir / sample_fname)

filtered_nb037 = pd.read_csv('../output/sub_nb037.csv')
bl_tst_df['lngDeg'] = filtered_nb037['lngDeg']
bl_tst_df['latDeg'] = filtered_nb037['latDeg']

# 1. Euler Angles to Rotation Vector  
Euler Angles <-> Rotation Matrix <-> Rotation Vector

In [4]:
# pitch:y
# yaw:z
# roll:x
def an2v(y_delta, z_delta, x_delta):
    '''
    Euler Angles ->Rotation Matrix -> Rotation Vector

    Input：
        1. y_delta          (float): the angle with rotateing around y-axis.
        2. z_delta         (float): the angle with rotateing around z-axis. 
        3. x_delta         (float): the angle with rotateing around x-axis. 
    Output：
        rx/ry/rz             (float): the rotation vector with rotateing 
    
    Code Ref.: https://www.zacobria.com/universal-robots-knowledge-base-tech-support-forum-hints-tips/python-code-example-of-converting-rpyeuler-angles-to-rotation-vectorangle-axis-for-universal-robots/
    (Note：In Code Ref: pitch=y,yaw=z,roll=x. But Google is pitch=x,yaw=z,roll=y)
    '''
    # yaw: z
    Rz_Matrix = np.matrix([
    [math.cos(z_delta), -math.sin(z_delta), 0],
    [math.sin(z_delta), math.cos(z_delta), 0],
    [0, 0, 1]
    ])
    
    # pitch: y
    Ry_Matrix = np.matrix([
    [math.cos(y_delta), 0, math.sin(y_delta)],
    [0, 1, 0],
    [-math.sin(y_delta), 0, math.cos(y_delta)]
    ])
    
    # roll: x
    Rx_Matrix = np.matrix([
    [1, 0, 0],
    [0, math.cos(x_delta), -math.sin(x_delta)],
    [0, math.sin(x_delta), math.cos(x_delta)]
    ])

    R = Rz_Matrix * Ry_Matrix * Rx_Matrix

    theta = math.acos(((R[0, 0] + R[1, 1] + R[2, 2]) - 1) / 2)
    multi = 1 / (2 * math.sin(theta))

    rx = multi * (R[2, 1] - R[1, 2]) * theta
    ry = multi * (R[0, 2] - R[2, 0]) * theta
    rz = multi * (R[1, 0] - R[0, 1]) * theta

    return rx, ry, rz

In [5]:
def v2a(rotation_v):
    '''
    Rotation Vector -> Rotation Matrix -> Euler Angles

    Input：
        rx/ry/rz             (float): the rotation vector with rotateing around x/y/z-axis.
    Output：
        1. y_delta          (float): the angle with rotateing around y-axis.
        2. z_delta         (float): the angle with rotateing around z-axis. 
        3. x_delta         (float): the angle with rotateing around x-axis.  
    '''
    # Rotation Vector -> Rotation Matrix
    R = Rodrigues(rotation_v)[0]

    sq = sqrt(R[2,1] ** 2 +  R[2,2] ** 2)

    if  not (sq < 1e-6) :
        x_delta = atan2(R[2,1] , R[2,2])
        y_delta = atan2(-R[2,0], sq)
        z_delta = atan2(R[1,0], R[0,0])
    else :
        x_delta = atan2(-R[1,2], R[1,1])
        y_delta = atan2(-R[2,0], sq)
        z_delta = 0

    return y_delta, z_delta, x_delta

# 2. Prepare IMU Dataset  
This part is to prepare the dataset for the model. I divided this part into the following steps:  
(1) Load GNSS Log  
(2) Merge sub-dataset (Status/UncalAccel/UncalGyro/UncalMag/OrientationDeg)  
(3) UTC to GpsEpoch  
(4) OrientationDeg to Rotation Vector  
(5) Calibrate Sensors' data  
(6) LatDeg&lngDeg to x/y/z  
(7) Orgainze Data (eg. t1 t2 t3 t4 t5 -> t6)  
(8) Clean Data (unrelated-aixs features and uncalibrated features)  
(9) Add Statistic Features  

In [6]:
def gnss_log_to_dataframes(path):
    '''Load GNSS Log'''
    print('Loading ' + path, flush = True)
    gnss_section_names = {'Raw', 'UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
    with open(path) as f_open:
        datalines = f_open.readlines()

    datas = {k: [] for k in gnss_section_names}
    gnss_map = {k: [] for k in gnss_section_names}
    for dataline in datalines:
        is_header = dataline.startswith('#')
        dataline = dataline.strip('#').strip().split(',')
        # skip over notes, version numbers, etc
        if is_header and dataline[0] in gnss_section_names:
            gnss_map[dataline[0]] = dataline[1:]
        elif not is_header:
            datas[dataline[0]].append(dataline[1:])

    results = dict()
    for k, v in datas.items():
        results[k] = pd.DataFrame(v, columns=gnss_map[k])
    # pandas doesn't properly infer types from these lists by default
    for k, df in results.items():
        for col in df.columns:
            if col == 'CodeType':
                continue
            results[k][col] = pd.to_numeric(results[k][col])

    return results

In [7]:
def UTC2GpsEpoch(df):
    '''UTC to GpsEpoch
    
    utcTimeMillis         : UTC epoch (1970/1/1)
    millisSinceGpsEpoch   : GPS epoch(1980/1/6 midnight 12:00 UTC)
    
    Ref: https://www.kaggle.com/c/google-smartphone-decimeter-challenge/discussion/239187
    '''
    dt_offset = pd.to_datetime('1980-01-06 00:00:00') 
    dt_offset_in_ms = int(dt_offset.value / 1e6)
    df['millisSinceGpsEpoch'] = df['utcTimeMillis'] - dt_offset_in_ms + 18000
    return df

In [8]:
def prepare_imu_data(data_dir, dataset_name, cname, pname, bl_df):
    '''Prepare IMU Dataset (For Train: IMU+GT+BL; For Test: IMU+BL)
    Input：
        1. data_dir: data_dir
        2. dataset_name: dataset name（'train'/'test'）
        3. cname: CollectionName
        4. pname: phoneName
        5. bl_df: baseline's dataframe
    Output：df_all
    '''
    # load GNSS log
    gnss_df = gnss_log_to_dataframes(str(data_dir / dataset_name / cname / pname / f'{pname}_GnssLog.txt'))
    print('sub-dataset shape：')
    print('Raw:', gnss_df['Raw'].shape)
    print('Status:', gnss_df['Status'].shape)
    print('UncalAccel:', gnss_df['UncalAccel'].shape)
    print('UncalGyro:', gnss_df['UncalGyro'].shape)
    print('UncalMag:', gnss_df['UncalMag'].shape)
    print('OrientationDeg:', gnss_df['OrientationDeg'].shape)
    print('Fix:', gnss_df['Fix'].shape)

    # merge sub-datasets
    # accel + gyro
    imu_df = pd.merge_asof(gnss_df['UncalAccel'].sort_values('utcTimeMillis'),
                           gnss_df['UncalGyro'].drop('elapsedRealtimeNanos', axis=1).sort_values('utcTimeMillis'),
                           on = 'utcTimeMillis',
                           direction='nearest')
    # (accel + gyro) + mag
    imu_df = pd.merge_asof(imu_df.sort_values('utcTimeMillis'),
                           gnss_df['UncalMag'].drop('elapsedRealtimeNanos', axis=1).sort_values('utcTimeMillis'),
                           on = 'utcTimeMillis',
                           direction='nearest')
    # ((accel + gyro) + mag) + OrientationDeg
    imu_df = pd.merge_asof(imu_df.sort_values('utcTimeMillis'),
                           gnss_df['OrientationDeg'].drop('elapsedRealtimeNanos', axis=1).sort_values('utcTimeMillis'),
                           on = 'utcTimeMillis',
                           direction='nearest')
   
    # UTC->GpsEpoch
    imu_df = UTC2GpsEpoch(imu_df)

    # print IMU time
    dt_offset = pd.to_datetime('1980-01-06 00:00:00')
    dt_offset_in_ms = int(dt_offset.value / 1e6)
    tmp_datetime = pd.to_datetime(imu_df['millisSinceGpsEpoch'] + dt_offset_in_ms, unit='ms')
    print(f"imu_df time scope: {tmp_datetime.min()} - {tmp_datetime.max()}")


    if dataset_name == 'train':
        # read GT dataset
        gt_path = data_dir / dataset_name / cname / pname / 'ground_truth.csv'
        gt_df = pd.read_csv(gt_path, usecols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg'])

        # print GT time
        tmp_datetime = pd.to_datetime(gt_df['millisSinceGpsEpoch'] + dt_offset_in_ms, unit='ms')
        print(f"gt_df time scope: {tmp_datetime.min()} - {tmp_datetime.max()}")

        # merge GT dataset
        imu_df = pd.merge_asof(gt_df.sort_values('millisSinceGpsEpoch'),
                               imu_df.drop(['elapsedRealtimeNanos'], axis=1).sort_values('millisSinceGpsEpoch'),
                               on = 'millisSinceGpsEpoch',
                               direction='nearest')
    elif dataset_name == 'test':
        # merge smaple_df
        imu_df = pd.merge_asof(sample_df.sort_values('millisSinceGpsEpoch'),
                           imu_df.drop(['elapsedRealtimeNanos'], axis=1).sort_values('millisSinceGpsEpoch'),
                           on = 'millisSinceGpsEpoch',
                           direction='nearest')

    # OrientationDeg -> Rotation Vector
    rxs = []
    rys = []
    rzs = []
    for i in range(len(imu_df)):
        y_delta = imu_df['rollDeg'].iloc[i]
        z_delta = imu_df['yawDeg'].iloc[i]
        x_delta = imu_df['pitchDeg'].iloc[i]
        rx, ry, rz = an2v(y_delta, z_delta, x_delta)
        rxs.append(rx)
        rys.append(ry)
        rzs.append(rz)

    imu_df['ahrsX'] = rxs
    imu_df['ahrsY'] = rys
    imu_df['ahrsZ'] = rzs

    # calibrate sensors' reading
    for axis in ['X', 'Y', 'Z']:
        imu_df['Accel{}Mps2'.format(axis)] = imu_df['UncalAccel{}Mps2'.format(axis)] - imu_df['Bias{}Mps2'.format(axis)]
        imu_df['Gyro{}RadPerSec'.format(axis)] = imu_df['UncalGyro{}RadPerSec'.format(axis)] - imu_df['Drift{}RadPerSec'.format(axis)]
        imu_df['Mag{}MicroT'.format(axis)] = imu_df['UncalMag{}MicroT'.format(axis)] - imu_df['Bias{}MicroT'.format(axis)]

        # clearn bias features
        imu_df.drop(['Bias{}Mps2'.format(axis), 'Drift{}RadPerSec'.format(axis), 'Bias{}MicroT'.format(axis)], axis = 1, inplace = True) 

    if dataset_name == 'train':
        # merge Baseline dataset：imu_df + bl_df = (GT + IMU) + Baseline
        df_all = pd.merge(imu_df.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'}),
                      bl_df.drop(['phone'], axis=1).rename(columns={'latDeg':'latDeg_bl','lngDeg':'lngDeg_bl'}),
                      on = ['collectionName', 'phoneName', 'millisSinceGpsEpoch'])
    elif dataset_name == 'test':
        df_all = pd.merge(imu_df,
              bl_df[(bl_df['collectionName']==cname) & (bl_df['phoneName']==pname)].drop(['phone'], axis=1).rename(columns={'latDeg':'latDeg_bl','lngDeg':'lngDeg_bl'}),
              on = ['millisSinceGpsEpoch'])
        df_all.drop(['phone'], axis=1, inplace=True)
        
    return df_all


In [9]:
def WGS84_to_ECEF(lat, lon, alt):
    # convert to radians
    rad_lat = lat * (np.pi / 180.0)
    rad_lon = lon * (np.pi / 180.0)
    a    = 6378137.0
    # f is the flattening factor
    finv = 298.257223563
    f = 1 / finv   
    # e is the eccentricity
    e2 = 1 - (1 - f) * (1 - f)    
    # N is the radius of curvature in the prime vertical
    N = a / np.sqrt(1 - e2 * np.sin(rad_lat) * np.sin(rad_lat))
    x = (N + alt) * np.cos(rad_lat) * np.cos(rad_lon)
    y = (N + alt) * np.cos(rad_lat) * np.sin(rad_lon)
    z = (N * (1 - e2) + alt)        * np.sin(rad_lat)
    return x, y, z

transformer = pyproj.Transformer.from_crs(
    {"proj":'geocent', "ellps":'WGS84', "datum":'WGS84'},
    {"proj":'latlong', "ellps":'WGS84', "datum":'WGS84'},)
def ECEF_to_WGS84(x,y,z):
    lon, lat, alt = transformer.transform(x,y,z,radians=False)
    return lon, lat, alt

In [10]:
def get_xyz(df_all, dataset_name):
    # baseline: lat/lngDeg -> x/y/z
    df_all['Xbl'], df_all['Ybl'], df_all['Zbl'] = zip(*df_all.apply(lambda x: WGS84_to_ECEF(x.latDeg_bl, x.lngDeg_bl, x.heightAboveWgs84EllipsoidM), axis=1))
    
    if dataset_name == 'train':
        # gt: lat/lngDeg -> x/y/z
        df_all['Xgt'], df_all['Ygt'], df_all['Zgt'] = zip(*df_all.apply(lambda x: WGS84_to_ECEF(x.latDeg_gt, x.lngDeg_gt, x.heightAboveWgs84EllipsoidM), axis=1))
        # copy lat/lngDeg
        lat_lng_df = df_all[['latDeg_gt','lngDeg_gt', 'latDeg_bl', 'lngDeg_bl']]
        df_all.drop(['latDeg_gt','lngDeg_gt', 'latDeg_bl', 'lngDeg_bl'], axis = 1, inplace = True)
    elif dataset_name == 'test':
        # copy lat/lngDeg
        lat_lng_df = df_all[['latDeg_bl', 'lngDeg_bl']]
        df_all.drop(['latDeg_bl', 'lngDeg_bl', 'latDeg','lngDeg',], axis = 1, inplace = True)     

    return lat_lng_df, df_all

In [11]:
def prepare_df_train(df_all_train, window_size):
    '''prepare training dataset with all aixses'''
    tgt_df = df_all_train.copy()
    total_len = len(tgt_df) 
    moving_times = total_len - window_size 
    
    tgt_df.rename(columns = {'yawDeg':'yawZDeg', 'rollDeg':'rollYDeg', 'pitchDeg':'pitchXDeg'}, inplace = True)

    feature_cols = [f for f in list(tgt_df) if f not in ['Xgt', 'Ygt', 'Zgt']]

    # Historical Feature names
    hist_feats = []
    for time_flag in range(1, window_size + 1):
        for fn in feature_cols:
            hist_feats.append(fn + '_' + str(time_flag))

    # Window Sliding
    # t1 t2 t3 t4 t5 -> t6
    # t2 t3 t4 t5 t6 -> t7

    # Add historical data 
    df_train = pd.DataFrame()
    features = []
    xs = []
    ys = []
    zs = []

    for start_idx in range(moving_times):
        feature_list = list()
        x_list = list()
        y_list = list()
        z_list = list()
        for window_idx in range(window_size):
            feature_list.extend(tgt_df[feature_cols].iloc[start_idx + window_idx,:].to_list())
        x_list.append(tgt_df['Xgt'].iloc[start_idx + window_size])
        y_list.append(tgt_df['Ygt'].iloc[start_idx + window_size])
        z_list.append(tgt_df['Zgt'].iloc[start_idx + window_size])

        features.append(feature_list)
        xs.extend(x_list)
        ys.extend(y_list)
        zs.extend(z_list)

    df_train = pd.DataFrame(features, columns = hist_feats)
    df_train['Xgt'] = xs
    df_train['Ygt'] = ys
    df_train['Zgt'] = zs
    
    # clean single-value feature: collectionName_[1-5]\phoneName_[1-5]
    tmp_feats = []
    for fn in list(df_train):
        if (fn.startswith('collectionName_') == False) and (fn.startswith('phoneName_') == False):
            tmp_feats.append(fn)
    df_train = df_train[tmp_feats]

    # clean time feature
    tmp_drop_feats = []
    for f in list(df_train):
        if (f.startswith('millisSinceGpsEpoch') == True) or (f.startswith('timeSinceFirstFixSeconds') == True) or (f.startswith('utcTimeMillis') == True):
            tmp_drop_feats.append(f)
    df_train.drop(tmp_drop_feats, axis = 1, inplace = True)
    
    return df_train

In [12]:
def prepare_df_test(df_all_test, window_size):
    '''prepare testing dataset with all aixses'''
    tgt_df = df_all_test.copy()
    total_len = len(tgt_df) 
    moving_times = total_len - window_size 
    
    tgt_df.rename(columns = {'yawDeg':'yawZDeg', 'rollDeg':'rollYDeg', 'pitchDeg':'pitchXDeg'}, inplace = True)

    feature_cols = [f for f in list(tgt_df) if f not in ['Xgt', 'Ygt', 'Zgt']] 
    
    hist_feats = []
    for time_flag in range(1, window_size + 1):
        for fn in feature_cols:
            hist_feats.append(fn + '_' + str(time_flag))

    # t1 t2 t3 t4 t5 -> t6
    # t2 t3 t4 t5 t6 -> t7
    df_test = pd.DataFrame()
    features = []

    for start_idx in range(moving_times):
        feature_list = list()

        for window_idx in range(window_size):
            feature_list.extend(tgt_df[feature_cols].iloc[start_idx + window_idx,:].to_list())
        features.append(feature_list)

    df_test = pd.DataFrame(features, columns = hist_feats)
    tmp_feats = []
    for fn in list(df_test):
        if (fn.startswith('collectionName_') == False) and (fn.startswith('phoneName_') == False):
            tmp_feats.append(fn)
    df_test = df_test[tmp_feats]

    tmp_drop_feats = []
    for f in list(df_test):
        if (f.startswith('millisSinceGpsEpoch') == True) or (f.startswith('timeSinceFirstFixSeconds') == True) or (f.startswith('utcTimeMillis') == True) or (f.startswith('elapsedRealtimeNanos') == True):
            tmp_drop_feats.append(f)
    df_test.drop(tmp_drop_feats, axis = 1, inplace = True)
    
    return df_test

In [13]:
def remove_other_axis_feats(df_all, tgt_axis):
    '''unrelated-aixs features and uncalibrated features'''
    # Clean unrelated-aixs features
    all_imu_feats = ['UncalAccelXMps2', 'UncalAccelYMps2', 'UncalAccelZMps2',
                     'UncalGyroXRadPerSec', 'UncalGyroYRadPerSec', 'UncalGyroZRadPerSec',
                     'UncalMagXMicroT', 'UncalMagYMicroT', 'UncalMagZMicroT',
                     'ahrsX', 'ahrsY', 'ahrsZ',
                     'AccelXMps2', 'AccelYMps2', 'AccelZMps2',
                     'GyroXRadPerSec', 'GyroZRadPerSec', 'GyroYRadPerSec',
                     'MagXMicroT', 'MagYMicroT', 'MagZMicroT',
                     'yawZDeg', 'rollYDeg', 'pitchXDeg',
                     'Xbl', 'Ybl', 'Zbl']
    tgt_imu_feats = []
    for axis in ['X', 'Y', 'Z']:
        if axis != tgt_axis:
            for f in all_imu_feats:
                if f.find(axis) >= 0:
                    tgt_imu_feats.append(f)
            
    tmp_drop_feats = []
    for f in list(df_all):
        if f.split('_')[0] in tgt_imu_feats:
            tmp_drop_feats.append(f)

    tgt_df = df_all.drop(tmp_drop_feats, axis = 1)
    
    # Clean uncalibrated features
    uncal_feats = [f for f in list(tgt_df) if f.startswith('Uncal') == True]
    tgt_df = tgt_df.drop(uncal_feats, axis = 1)
    
    return tgt_df

In [14]:
def add_stat_feats(data, tgt_axis):
    for f in ['yawZDeg', 'rollYDeg', 'pitchXDeg']:
        if f.find(tgt_axis) >= 0:
            ori_feat = f
            break
            
    cont_feats = ['heightAboveWgs84EllipsoidM', 'ahrs{}'.format(tgt_axis),
           'Accel{}Mps2'.format(tgt_axis), 'Gyro{}RadPerSec'.format(tgt_axis), 'Mag{}MicroT'.format(tgt_axis),
            '{}bl'.format(tgt_axis)] + [ori_feat]
    
    for f in cont_feats:
        data[f + '_' + str(window_size) + '_mean'] = data[[f + f'_{i}' for i in range(1,window_size)]].mean(axis=1)
        data[f + '_' + str(window_size) + '_std'] = data[[f + f'_{i}' for i in range(1,window_size)]].std(axis=1)
        data[f + '_' + str(window_size) + '_max'] = data[[f + f'_{i}' for i in range(1,window_size)]].max(axis=1)
        data[f + '_' + str(window_size) + '_min'] = data[[f + f'_{i}' for i in range(1,window_size)]].min(axis=1)
        data[f + '_' + str(window_size) + '_median'] = data[[f + f'_{i}' for i in range(1,window_size)]].median(axis=1)
    return data

# 3,Modeling
Note: I only use the given axis features for predict the target axis location.  
For example, use features contains x-axis to predict the next x location.  
More, I used LGBM here.  

In [15]:
# LightGBM
params = {
    'metric':'mse',
    'objective':'regression',
    'seed':2021,
    'boosting_type':'gbdt',
    'early_stopping_rounds':10,
    'subsample':0.7,
    'feature_fraction':0.7,
    'bagging_fraction': 0.7,
    'reg_lambda': 10
}
window_size = 30
verbose_flag = True
folds = 5

In [16]:
# collection name
collection_uniq = bl_trn_df['collectionName'].unique()
MTV = [i for i in collection_uniq if 'MTV' in i and ('2021-03' in i or '2021-04' in i)]
SF = [i for i in collection_uniq if 'SF' in i and ('2021-03' in i or '2021-04' in i)]
RWC = [i for i in collection_uniq if 'RWC' in i and ('2021-03' in i or '2021-04' in i)]
SVL = [i for i in collection_uniq if 'SVL' in i and ('2021-03' in i or '2021-04' in i)]
SJC = [i for i in collection_uniq if 'SJC' in i and ('2021-03' in i or '2021-04' in i)]

In [17]:
# Example: I use SJC's dataset for training 
# tgt_cns = ['2021-04-22-US-SJC-1', '2021-04-28-US-SJC-1', '2021-04-29-US-SJC-2']
tgt_cns = SJC
cn2pn_df = bl_trn_df[['collectionName', 'phoneName']].drop_duplicates()

df_trains = []
lat_lng_df_trains = []
for tgt_cn in tqdm(tgt_cns):
    pns = list(cn2pn_df[cn2pn_df['collectionName'] == tgt_cn]['phoneName'].values)
    for tgt_pn in pns: # collectionに対するphone
        print('Prepare Training Dataset：', tgt_cn + '_' + tgt_pn)  
        df_all_train = prepare_imu_data(data_dir, 'train', tgt_cn, tgt_pn, bl_trn_df)
        lat_lng_df_train, df_all_train = get_xyz(df_all_train, 'train')
        df_train = prepare_df_train(df_all_train,  window_size) # 所有轴的数据
        df_trains.append(df_train)
        lat_lng_df_trains.append(lat_lng_df_train)
        print('_'*20)
        
df_train = pd.concat(df_trains, axis = 0)
lat_lng_df_train = pd.concat(lat_lng_df_trains, axis = 0)
print('Final Dataset shape：', df_train.shape)

  0%|          | 0/3 [00:00<?, ?it/s]Prepare Training Dataset： 2021-04-22-US-SJC-1_Pixel4
Loading ../input/google-smartphone-decimeter-challenge/train/2021-04-22-US-SJC-1/Pixel4/Pixel4_GnssLog.txt
sub-dataset shape：
Raw: (69759, 36)
Status: (105134, 13)
UncalAccel: (300038, 8)
UncalGyro: (300038, 8)
UncalMag: (289948, 8)
OrientationDeg: (180408, 5)
Fix: (0, 11)
imu_df time scope: 2021-04-22 21:02:55.866000 - 2021-04-22 21:51:07.442000
gt_df time scope: 2021-04-22 21:02:55.446000 - 2021-04-22 21:51:06.446000
____________________
Prepare Training Dataset： 2021-04-22-US-SJC-1_SamsungS20Ultra
Loading ../input/google-smartphone-decimeter-challenge/train/2021-04-22-US-SJC-1/SamsungS20Ultra/SamsungS20Ultra_GnssLog.txt
sub-dataset shape：
Raw: (103693, 36)
Status: (39416, 13)
UncalAccel: (282622, 8)
UncalGyro: (282622, 8)
UncalMag: (282622, 8)
OrientationDeg: (282623, 5)
Fix: (0, 11)
imu_df time scope: 2021-04-22 21:02:57.127000 - 2021-04-22 21:50:03.236000
gt_df time scope: 2021-04-22 21:02:57

In [18]:
# Example: I choose one of SJC collection from the test dataset as my test dataset, you can choose what as you like
cname_test = '2021-04-22-US-SJC-2'
pname_test = 'SamsungS20Ultra'

# cname_test = '2021-03-16-US-MTV-2'
# pname_test = 'Pixel4Modded'

df_all_test = prepare_imu_data(data_dir, 'test', cname_test, pname_test, bl_tst_df)
lat_lng_df_test, df_all_test = get_xyz(df_all_test, 'test')
df_test = prepare_df_test(df_all_test,  window_size)
print('df_test:', df_test.shape)
print('df_test.columns:', df_test.columns)

Loading ../input/google-smartphone-decimeter-challenge/test/2021-04-22-US-SJC-2/SamsungS20Ultra/SamsungS20Ultra_GnssLog.txt
sub-dataset shape：
Raw: (89737, 36)
Status: (40793, 13)
UncalAccel: (232389, 8)
UncalGyro: (232389, 8)
UncalMag: (232389, 8)
OrientationDeg: (232342, 5)
Fix: (0, 11)
imu_df time scope: 2021-04-22 22:01:14.226000 - 2021-04-22 22:39:57.997000
df_test: (2294, 840)
df_test.columns: Index(['UncalAccelXMps2_1', 'UncalAccelYMps2_1', 'UncalAccelZMps2_1',
       'UncalGyroXRadPerSec_1', 'UncalGyroYRadPerSec_1',
       'UncalGyroZRadPerSec_1', 'UncalMagXMicroT_1', 'UncalMagYMicroT_1',
       'UncalMagZMicroT_1', 'yawZDeg_1',
       ...
       'AccelYMps2_30', 'GyroYRadPerSec_30', 'MagYMicroT_30', 'AccelZMps2_30',
       'GyroZRadPerSec_30', 'MagZMicroT_30', 'heightAboveWgs84EllipsoidM_30',
       'Xbl_30', 'Ybl_30', 'Zbl_30'],
      dtype='object', length=840)


In [19]:
def training(df_train, df_test, tgt_axis, window_size):
    '''For the given axis target to train the model. Also, it has validation and prediciton.'''
    df_train = remove_other_axis_feats(df_train, tgt_axis)
    df_train = add_stat_feats(df_train, tgt_axis)
    df_test = remove_other_axis_feats(df_test, tgt_axis)
    df_test = add_stat_feats(df_test, tgt_axis)
    
    feature_names = [f for f in list(df_train) if f not in ['Xgt', 'Ygt', 'Zgt']]
    target = '{}gt'.format(tgt_axis)

    kfold = KFold(n_splits=folds, shuffle=True, random_state=params['seed'])

    pred_valid = np.zeros((len(df_train),)) 
    pred_test = np.zeros((len(df_test),)) 
    scores = []
    for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train, df_train[target])):
        X_train = df_train.iloc[trn_idx][feature_names]
        Y_train = df_train.iloc[trn_idx][target]
        X_val = df_train.iloc[val_idx][feature_names]
        Y_val = df_train.iloc[val_idx][target]

        model = lgb.LGBMRegressor(**params)
        lgb_model = model.fit(X_train, 
                              Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              verbose=0,
                              eval_metric=params['metric'],
                              early_stopping_rounds=params['early_stopping_rounds'])
        pred_valid[val_idx] = lgb_model.predict(X_val, num_iteration =  lgb_model.best_iteration_)
        pred_test += lgb_model.predict(df_test[feature_names], num_iteration =  lgb_model.best_iteration_)

        scores.append(lgb_model.best_score_['valid']['l2'])
    
    pred_test = pred_test /  kfold.n_splits
    
    if verbose_flag == True:
        print("Each Fold's MSE：{}, Average MSE：{:.4f}".format([np.round(v,2) for v in scores], np.mean(scores)))
        print("-"*60)
    return df_train, df_test, pred_valid, pred_test

In [20]:
df_train_x, df_test_x, pred_valid_x, pred_test_x = training(df_train, df_test, 'X', window_size)
df_train_y, df_test_y, pred_valid_y, pred_test_y = training(df_train, df_test, 'Y', window_size)
df_train_z, df_test_z, pred_valid_z, pred_test_z = training(df_train, df_test, 'Z', window_size)

Each Fold's MSE：[90.15, 119.88, 188.38, 100.33, 93.24], Average MSE：118.3971
------------------------------------------------------------
Each Fold's MSE：[193.1, 202.32, 374.75, 179.97, 149.17], Average MSE：219.8594
------------------------------------------------------------
Each Fold's MSE：[146.97, 162.02, 422.7, 158.9, 124.52], Average MSE：203.0214
------------------------------------------------------------


In [21]:
val_compare_df = pd.DataFrame({'Xgt':df_train_x['Xgt'].values, 'Xpred':pred_valid_x,
                               'Ygt':df_train_y['Ygt'].values, 'Ypred':pred_valid_y,
                                'Zgt':df_train_z['Zgt'].values, 'Zpred':pred_valid_z})

In [22]:
# xyz -> lng, lat
lng_gt, lat_gt, _ = ECEF_to_WGS84(val_compare_df['Xgt'].values,val_compare_df['Ygt'].values,val_compare_df['Zgt'].values)
lng_pred, lat_pred, _ = ECEF_to_WGS84(val_compare_df['Xpred'].values,val_compare_df['Ypred'].values,val_compare_df['Zpred'].values)
lng_test_pred, lat_test_pred, _ = ECEF_to_WGS84(pred_test_x, pred_test_y, pred_test_z)


val_compare_df['latDeg_gt'] = lat_gt
val_compare_df['lngDeg_gt'] = lng_gt
val_compare_df['latDeg_pred'] = lat_pred
val_compare_df['lngDeg_pred'] = lng_pred
test_pred_df = pd.DataFrame({'latDeg':lat_test_pred, 'lngDeg':lng_test_pred})

In [23]:
# From：https://www.kaggle.com/emaerthin/demonstration-of-the-kalman-filter
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(a**0.5)
    dist = 6_367_000 * c
    return dist

# apply train and test

In [24]:
# インデックス累積和
N = [0] + [len(bl_trn_df[bl_trn_df['collectionName']==SJC[n]]) for n in range(len(SJC))]
N = [i for i in itertools.accumulate(N)]

# trainの結果を反映させる
for i in range(len(N)-1):
    bl_trn_df[bl_trn_df['collectionName']==SJC[i]].loc[:,'latDeg'] = val_compare_df['latDeg_pred'][N[i]:N[i+1]]
    bl_trn_df[bl_trn_df['collectionName']==SJC[i]].loc[:,'lngDeg'] = val_compare_df['lngDeg_pred'][N[i]:N[i+1]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [25]:
# Baseline vs. GT
lat_lng_df_train['dist'] = calc_haversine(
                                    lat_lng_df_train.latDeg_gt.reset_index(drop=True), 
                                    lat_lng_df_train.lngDeg_gt.reset_index(drop=True),
                                    bl_trn_df[bl_trn_df['collectionName'].isin(SJC)]['latDeg'].reset_index(drop=True),
                                    bl_trn_df[bl_trn_df['collectionName'].isin(SJC)]['lngDeg'].reset_index(drop=True))
print('dist_50:',np.percentile(lat_lng_df_train['dist'],50) )
print('dist_95:',np.percentile(lat_lng_df_train['dist'],95) )
print('avg_dist_50_95:',(np.percentile(lat_lng_df_train['dist'],50) + np.percentile(lat_lng_df_train['dist'],95))/2)
print('avg_dist:', lat_lng_df_train['dist'].mean())

dist_50: 6.523611331179955
dist_95: 38.87085992092085
avg_dist_50_95: 22.697235626050404
avg_dist: 11.756682178606797


# score

In [26]:
# Baseline vs. GT
lat_lng_df_train['dist'] = calc_haversine(
lat_lng_df_train.latDeg_gt, lat_lng_df_train.lngDeg_gt,                                      lat_lng_df_train.latDeg_bl, lat_lng_df_train.lngDeg_bl)

print('dist_50:',np.percentile(lat_lng_df_train['dist'],50) )
print('dist_95:',np.percentile(lat_lng_df_train['dist'],95) )
print('avg_dist_50_95:',(np.percentile(lat_lng_df_train['dist'],50) + np.percentile(lat_lng_df_train['dist'],95))/2)
print('avg_dist:', lat_lng_df_train['dist'].mean())

dist_50: 6.129357526334204
dist_95: 33.837015811852936
avg_dist_50_95: 19.98318666909357
avg_dist: 10.508449405316306


In [27]:
# IMU Prediction vs. GT
val_compare_df['dist'] = calc_haversine(
val_compare_df.latDeg_gt,                                        val_compare_df.lngDeg_gt, 
val_compare_df.latDeg_pred,                                     val_compare_df.lngDeg_pred)
# IMU预测vsGT（多collection）
print('dist_50:',np.percentile(val_compare_df['dist'],50) )
print('dist_95:',np.percentile(val_compare_df['dist'],95) )
print('avg_dist_50_95:',(np.percentile(val_compare_df['dist'],50) + np.percentile(val_compare_df['dist'],95))/2)
print('avg_dist:', val_compare_df['dist'].mean())

dist_50: 6.839893596084978
dist_95: 19.370614100148384
avg_dist_50_95: 13.10525384811668
avg_dist: 8.366828373091364


In [28]:
bl_tst_df.iloc[bl_tst_df[bl_tst_df['phone']==cname_test + '_' + pname_test].index[window_size:],3] = test_pred_df['latDeg'].values
bl_tst_df.iloc[bl_tst_df[bl_tst_df['phone']==cname_test + '_' + pname_test].index[window_size:],4] = test_pred_df['lngDeg'].values

In [29]:
# bl_trn_df.to_csv('../output/filtered_nb037.csv', index=False)
bl_tst_df.to_csv('../output/sub_nb037_1.csv', index=False)