## 5th Place Post-Processing
This is our team's post-processing pipeline for [Saito](https://www.kaggle.com/saitodevel01)'s baseline

(Some of the inputs from Saito are private, but see the following notebooks for that part.)
* https://www.kaggle.com/saitodevel01/gsdc-optimization-based-smoothing-1st-version
* https://www.kaggle.com/saitodevel01/gsdc-bias-eda
* https://www.kaggle.com/saitodevel01/dsdc-unified-post-processing
* https://www.kaggle.com/saitodevel01/gsdc-bias-correction
* https://www.kaggle.com/saitodevel01/gsdc-vehicle-speed-estimation-by-doppler-shift

In [None]:
!pip install pandarallel

In [None]:
import warnings
warnings.simplefilter('ignore')

from contextlib import contextmanager
from glob import glob
from time import time

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from pathlib import Path
import torch
from scipy import interpolate
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from tqdm.notebook import tqdm
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()

from pandarallel import pandarallel
pandarallel.initialize()

import scipy.sparse
import scipy.sparse.linalg
import multiprocessing

from sklearn.neighbors import KNeighborsClassifier
from utils import * 
import simdkalman

import saito_io_f_v2 as io_f
import constants 
import transform 
import qpsolver
import signal_f_v2 as signal_f
import map_matching
import design_filter
from scipy.interpolate import InterpolatedUnivariateSpline

In [None]:
def area_prediction():
    """Label every train and test collection as highway, tree or downtown.

    Train collections are labelled from a hand-made assignment over the
    sorted names found under ``train/``. Test collections are labelled by two
    1-nearest-neighbour classifiers fitted on per-collection statistics of
    the baseline latitude/longitude:

    * downtown vs. not downtown, from the per-collection standard deviation;
    * highway vs. tree, from the per-collection minimum, fitted on the train
      collections whose target is not downtown.

    Reads ``baseline_locations_train.csv`` and ``baseline_locations_test.csv``
    below ``BASE_DIR``.

    Returns:
        ``(train_highway, train_tree, train_downtown, (test_highway,
        test_tree, test_downtown))``. The train entries are NumPy arrays of
        collection names, the test entries are lists of collection names.
    """
    BASE_DIR = Path('../input/google-smartphone-decimeter-challenge')
    sort_keys = ["collectionName", "phoneName", "millisSinceGpsEpoch"]

    def prepare(frame):
        # Fixed row order plus the 5th dash-separated token of the collection name.
        frame = frame.sort_values(sort_keys).reset_index(drop=True)
        frame['area'] = frame['collectionName'].map(lambda x: x.split('-')[4])
        return frame

    def collection_features(input_df, stat, is_train=False):
        # One row per collection: `stat` of lat/lng, the area token, the list
        # of phones and (train only) the area target.
        grouped = input_df.groupby('collectionName')
        output_df = grouped[['latDeg', 'lngDeg']].agg(stat)
        if is_train:
            output_df = output_df.merge(
                grouped[['area_target']].first(),
                on='collectionName')
        output_df = output_df.merge(
            grouped['area'].first(),
            on='collectionName')
        output_df = output_df.merge(
            grouped['phoneName'].unique().apply(list),
            on='collectionName')
        return output_df

    train_base = prepare(pd.read_csv(BASE_DIR / 'baseline_locations_train.csv'))
    test_base = pd.read_csv(BASE_DIR / 'baseline_locations_test.csv')

    # Hand-made assignment, given as 1-based positions in the sorted listing.
    train_name = np.array(sorted(Path(path).name for path in glob(f'{BASE_DIR}/train/*')))
    train_highway  = train_name[np.arange(1, 22) - 1]
    train_tree     = train_name[np.array([22, 23, 25, 26, 28]) - 1]
    train_downtown = train_name[np.array([24, 27, 29]) - 1]

    train_base['area_target'] = -1
    train_base.loc[train_base['collectionName'].isin(train_highway),  'area_target'] = 0
    train_base.loc[train_base['collectionName'].isin(train_tree),     'area_target'] = 1
    train_base.loc[train_base['collectionName'].isin(train_downtown), 'area_target'] = 2

    # Stage 1: downtown vs. the rest, on the spread of the positions.
    train = collection_features(train_base, 'std', is_train=True)
    train['downtown_target'] = (train['area_target'] == 2).astype(int)
    downtown_model_knn = KNeighborsClassifier(n_neighbors=1)
    downtown_model_knn.fit(
        train[['latDeg', 'lngDeg']],
        train['downtown_target'],
    )

    # Stage 2: highway vs. tree, on the minimum position of the remaining collections.
    train = collection_features(train_base, 'min', is_train=True)
    not_downtown = train['area_target'] != 2
    highway_tree_model_knn = KNeighborsClassifier(n_neighbors=1)
    highway_tree_model_knn.fit(
        train.loc[not_downtown, ['latDeg', 'lngDeg']],
        train.loc[not_downtown, 'area_target'],
    )

    def predict_area(test_base):
        test_base = prepare(test_base)

        downtown_features = collection_features(test_base, 'std')
        downtown_pred = downtown_model_knn.predict(downtown_features[['latDeg', 'lngDeg']])

        test = collection_features(test_base, 'min')
        # Create the column explicitly so it exists even when nothing is downtown.
        test['area_pred'] = np.nan
        test.loc[downtown_pred == 1, 'area_pred'] = 2
        undecided = test['area_pred'].isnull()
        # kNN.predict rejects an empty input, so skip it when everything is downtown.
        if undecided.any():
            test.loc[undecided, 'area_pred'] = highway_tree_model_knn.predict(
                test.loc[undecided, ['latDeg', 'lngDeg']])
        test['area_pred'] = test['area_pred'].astype(int)

        # The frame is indexed by collection name after the groupby/merge steps.
        is_highway = (test['area_pred'] == 0).to_numpy()
        is_tree = (test['area_pred'] == 1).to_numpy()
        test_highway  = test.index[is_highway].tolist()
        test_tree     = test.index[is_tree].tolist()
        # Anything that is neither highway nor tree is reported as downtown.
        test_downtown = test.index[~(is_highway | is_tree)].tolist()
        return (test_highway, test_tree, test_downtown)

    return train_highway, train_tree, train_downtown, predict_area(test_base)
# Collection names per area; the DOWNTOWN lists are read by the classes below.
# Running this cell reads the baseline CSVs and fits the kNN area classifiers.
TRAIN_HIGHWAY, TRAIN_TREEWAY, TRAIN_DOWNTOWN, \
    (TEST_HIGHWAY, TEST_TREEWAY, TEST_DOWNTOWN) = area_prediction()

In [None]:
# Generalized functions of LightGBM
def fit_lgbm(X, y, train_df, params: dict=None, verbose=100, seed: int=42, N_SPLITS: int=5):
    """Fit one LightGBM binary classifier per GroupKFold split.

    Folds are grouped by ``train_df['collectionName']`` so that a collection
    never appears in both the training and the validation part of a fold.

    Args:
        X: feature DataFrame; row i must describe the same sample as row i of
            ``y`` and of ``train_df`` (positional alignment).
        y: binary target Series.
        train_df: frame providing the ``collectionName`` group of every row.
        params: keyword arguments for ``lgbm.LGBMClassifier``; ``None`` uses
            the LightGBM defaults.
        verbose: forwarded to LightGBM as ``early_stopping_rounds``; training
            itself always runs with ``verbose=0``.
        seed: not used; kept so existing calls stay valid.
        N_SPLITS: number of folds.

    Returns:
        ``(oof_pred, models)``: the out-of-fold probability of class 1 for
        every row (in row order) and the list of fitted models, one per fold.
    """
    params = {} if params is None else params
    models = []
    oof_pred = np.zeros(len(y), dtype=np.float64)
    groups = train_df['collectionName'].reset_index(drop=True)

    kf = GroupKFold(n_splits=N_SPLITS)
    for i, (idx_train, idx_valid) in enumerate(kf.split(X, y, groups)):
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        model = lgbm.LGBMClassifier(**params)
        model.fit(x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            early_stopping_rounds=verbose,
            eval_metric='logloss',
            verbose=0)

        pred_i = model.predict_proba(x_valid)[:, 1]
        # Write by position: index labels of X are not guaranteed to be 0..n-1.
        oof_pred[idx_valid] = pred_i
        models.append(model)

    return oof_pred, models

def predict_lgbm(models, feat_df):
    """Average the class-1 probabilities of several fitted models.

    Args:
        models: iterable of fitted classifiers exposing ``predict_proba``.
        feat_df: feature DataFrame; its ``.values`` array is passed to every model.

    Returns:
        1-D array with the mean class-1 probability per row of ``feat_df``.
    """
    per_model = []
    for fitted in models:
        probabilities = fitted.predict_proba(feat_df.values)
        per_model.append(probabilities[:, 1])
    return np.mean(np.array(per_model), axis=0)

# KnnHeight
Map the latitude and longitude from the ground truth to the correct altitude. <br>
If there is a discrepancy between the predicted altitude and the output of this model, the accuracy of the satellite positioning can be expected to be poor.<br>

In [None]:
class KnnHeight:
    """Pipeline step that adds a kNN height estimate and its deviation.

    A distance-weighted kNN regressor is fitted from the train ground-truth
    latitude/longitude to the ground-truth height, then evaluated at the
    predicted latitude/longitude of both frames.
    """
    name = "Knn"
    isPrep = True

    def __init__(self):
        pass

    def main(self, train, test):
        """Add ``heightKNN`` and ``heightDiff`` to both frames (in place).

        Returns:
            The same ``(train, test)`` frames that were passed in.
        """
        regressor = self.fit_knn_height(
            train[['t_latDeg', 't_lngDeg']],
            train['t_heightAboveWgs84EllipsoidM'],
        )

        # Height looked up at the predicted position.
        for frame in (train, test):
            frame['heightKNN'] = self.pred_knn_height(frame[['latDeg', 'lngDeg']], regressor)

        # Absolute gap between the reported height and the kNN height.
        for frame in (train, test):
            gap = frame['heightAboveWgs84EllipsoidM'] - frame['heightKNN']
            frame['heightDiff'] = np.abs(gap)

        return train, test

    def fit_knn_height(self, X: pd.DataFrame, y: pd.Series, n_neighbors=15):
        """Fit and return a distance-weighted ``KNeighborsRegressor`` on ``X`` -> ``y``."""
        regressor = KNeighborsRegressor(weights='distance', n_neighbors=n_neighbors)
        features, target = X.values, y.values
        regressor.fit(features, target)
        return regressor

    def pred_knn_height(self, X: pd.DataFrame, model):
        """Return ``model``'s prediction for the rows of ``X``."""
        features = X.values
        return model.predict(features)

# Outlier Detection
Predict the probability that each point is an outlier, using as features the relative coordinates (computed from the absolute coordinates) over the surrounding ~50 seconds.<br>
For Downtown, the distance to the ground truth is added as a feature.<br>
If the absolute position prediction also outputs the altitude, its difference from the kNN height output can be added as a feature.<br>

In [None]:
class OutlierDetection:
    """Pipeline step that estimates, per point, the probability of being an outlier.

    Two LightGBM models are trained with ``fit_lgbm``: one on the downtown
    collections and one on all other collections. The out-of-fold / test
    probabilities are stored in the ``outlier_rate`` column.

    Args:
        sjc_loss_threshold: a downtown point is a positive training example
            when its ``meter`` value exceeds this threshold.
        other_loss_threshold: same threshold for the non-downtown collections.
    """
    name = "OutlierDetection"
    isPrep = True

    def __init__(self, sjc_loss_threshold, other_loss_threshold):
        self.sjc_loss_threshold   = sjc_loss_threshold
        self.other_loss_threshold = other_loss_threshold

    def main(self, train, test):
        """Add ``dist``, ``area``, ``t_isOutlier`` (train) and ``outlier_rate`` in place.

        Returns:
            The same ``(train, test)`` frames that were passed in.
        """
        # Distance between each point and its snapped counterpart.
        # NOTE(review): snap_to_grid / calc_dist come from `utils`; the original
        # comment calls this the distance from the ground truth - confirm there.
        train_snapped = snap_to_grid(train, 1e10)
        test_snapped = snap_to_grid(test, 1e10)
        train['dist'] = calc_dist(train[['latDeg','lngDeg']], train_snapped[['latDeg','lngDeg']])
        test['dist']  = calc_dist(test[['latDeg','lngDeg']] , test_snapped[['latDeg','lngDeg']])
        del train_snapped, test_snapped

        # add area
        train['area'] = train['phone'].apply(lambda s : s.split('-')[4])
        test['area'] = test['phone'].apply(lambda s : s.split('-')[4])

        params = {
         'reg_alpha': 0.01,
         'reg_lambda': 0.01,
         'num_leaves': 40,
         'n_estimators': 10000,
         'learning_rate': 0.1,
         'random_state': 42,
         'max_depth': -1
        }

        # Downtown
        self._fit_subset(
            train, test,
            use_train_index=train.collectionName.isin(TRAIN_DOWNTOWN),
            use_test_index=test.collectionName.isin(TEST_DOWNTOWN),
            loss_threshold=self.sjc_loss_threshold,
            visibility=22,
            n_splits=3,
            make_features=self.processing_sjc,
            params=params,
        )

        # others
        self._fit_subset(
            train, test,
            use_train_index=~train.collectionName.isin(TRAIN_DOWNTOWN),
            use_test_index=~test.collectionName.isin(TEST_DOWNTOWN),
            loss_threshold=self.other_loss_threshold,
            visibility=26,
            n_splits=5,
            make_features=self.processing_notsjc,
            params=params,
        )

        return train, test

    def _fit_subset(self, train, test, use_train_index, use_test_index,
                    loss_threshold, visibility, n_splits, make_features, params):
        # Train on one subset of collections, store its outlier_rate and print the OOF score.
        # The label column is rewritten for all rows, as each subset has its own threshold.
        train['t_isOutlier'] = (train.meter > loss_threshold).astype(int)

        oof, models = fit_lgbm(make_features(train[use_train_index].reset_index(drop=True), visibility),
                               train.loc[use_train_index, 't_isOutlier'],
                               train_df=train[use_train_index],
                               params=params, N_SPLITS=n_splits)
        train.loc[use_train_index, 'outlier_rate'] = oof

        # The models cannot predict on zero rows; skip a subset that is absent from test.
        if use_test_index.any():
            pred = predict_lgbm(models, make_features(test[use_test_index].reset_index(drop=True), visibility))
            test.loc[use_test_index, 'outlier_rate'] = pred

        print('score', accuracy_score((oof>0.5).astype(int), train.loc[use_train_index, 't_isOutlier']))
        print(confusion_matrix((oof>0.5).astype(int), train.loc[use_train_index, 't_isOutlier']))

    @staticmethod
    def _concat_columns(parts):
        # Join the feature pieces side by side with a single concat.
        if not parts:
            return pd.DataFrame()
        return pd.concat(parts, axis=1)

    @staticmethod
    def _motion_features(input_df, shift_list):
        # Per-phone lat/lng differences and relative changes for every non-zero shift.
        by_phone = input_df.groupby('phone')[['latDeg', 'lngDeg']]
        parts = []
        for i in shift_list:
            if i == 0:
                continue
            parts.append(by_phone.diff(i).add_prefix(f'diff{i}_'))
            parts.append(by_phone.pct_change(i).add_prefix(f'change{i}_'))
        return parts

    def processing_sjc(self, input_df: pd.DataFrame, L=25):
        """Build the downtown features for shifts ``-L..L`` within each phone.

        Columns, in order: ``shift{i}_dist`` for every shift, then for every
        non-zero shift ``diff{i}_latDeg/lngDeg`` and ``change{i}_latDeg/lngDeg``.
        """
        shift_list = list(range(-L, L+1, 1))

        # 'heightDiff' was once shifted together with 'dist' here; only 'dist' is used now.
        by_phone_dist = input_df.groupby('phone')[['dist']]
        parts = [by_phone_dist.shift(i).add_prefix(f'shift{i}_') for i in shift_list]
        parts.extend(self._motion_features(input_df, shift_list))

        return self._concat_columns(parts)

    def processing_notsjc(self, input_df: pd.DataFrame, L=25):
        """Build the non-downtown features for shifts ``-L..L`` within each phone.

        Columns, in order, for every non-zero shift: ``diff{i}_latDeg/lngDeg``
        and ``change{i}_latDeg/lngDeg``.
        """
        shift_list = list(range(-L, L+1, 1))
        return self._concat_columns(self._motion_features(input_df, shift_list))

# Interpolate Outliers by Relpositions
We vary the threshold and the direction of interpolation as follows; after making 20 predictions, we take their weighted average, with weights derived from the threshold.<br>

1. Take 10 threshold values at equal intervals between 0.1 and 0.5.
    1. Where the outlier probability exceeds the threshold, the point is considered an outlier and replaced by NaN.
    2. Accumulate the relative coordinates from the nearest non-NaN point to obtain a new absolute coordinate. (There are two ways to do this: forward and reverse.)
2. Replace the predicted value with the average of these 20 predictions, weighted by 1/threshold.

In [None]:
class InterpolateByRelpos:
    """Pipeline step that replaces likely outliers using relative positions.

    For each of 10 thresholds between ``start`` and ``end``, points whose
    ``outlier_rate`` exceeds the threshold are rebuilt from a neighbouring
    point plus the ``delta_latDeg`` / ``delta_lngDeg`` columns, once walking
    forward and once walking backward. The two walks are averaged, and the 10
    per-threshold results are averaged with weights ``1 / threshold``.

    Args:
        sjc_start, sjc_end: threshold range for the downtown collections.
        other_start, other_end: threshold range for all other collections.
    """
    name = "InterpolateByRelpos"
    isPrep = False

    def __init__(self, sjc_start=0.1, sjc_end=0.5, other_start=0.1, other_end=0.5):
        # downtown
        self.sjc_start = sjc_start
        self.sjc_end = sjc_end

        # others
        self.other_start = other_start
        self.other_end = other_end

    def main(self, train, test):
        """Interpolate both frames and return new ``(train, test)`` frames.

        The returned frames are sorted by ``phone`` and ``millisSinceGpsEpoch``,
        re-indexed from 0, and carry an ``isOutlier`` column set to 0.
        """
        train['area'] = train['phone'].apply(lambda s : s.split('-')[4])
        test['area'] = test['phone'].apply(lambda s : s.split('-')[4])

        train = self._interpolate_split(train, TRAIN_DOWNTOWN)
        test = self._interpolate_split(test, TEST_DOWNTOWN)

        train["isOutlier"] = 0
        test["isOutlier"] = 0

        return train, test

    def _interpolate_split(self, df, downtown_collections):
        # Apply the downtown / other threshold ranges to the matching rows and recombine.
        is_downtown = df.collectionName.isin(downtown_collections)
        return pd.concat([
            self.interpolate_by_relpos(df[is_downtown], self.sjc_start, self.sjc_end),
            self.interpolate_by_relpos(df[~is_downtown], self.other_start, self.other_end),
        ]).sort_values(["phone", "millisSinceGpsEpoch"]).reset_index(drop=True)

    def interpolate_by_relpos(self, df, start, end):
        """Return ``df`` with ``latDeg`` / ``lngDeg`` re-estimated phone by phone.

        Args:
            df: frame with ``phone``, ``latDeg``, ``lngDeg``, ``delta_latDeg``,
                ``delta_lngDeg`` and ``outlier_rate`` columns.
            start, end: first and last of the 10 equally spaced thresholds.

        Returns:
            A new frame grouped by phone (row order kept within a phone) with
            a fresh 0-based index. An empty input yields an empty frame
            instead of failing inside ``pd.concat``.
        """
        if df.empty:
            return df.reset_index(drop=True)

        dfs = [self._interpolate_phone(phone_df, start, end)
               for phone, phone_df in tqdm(df.groupby("phone"))]
        if not dfs:
            # No usable group (e.g. every phone is missing): nothing to interpolate.
            return df.iloc[0:0].reset_index(drop=True)

        return pd.concat(dfs).reset_index(drop=True)

    def _interpolate_phone(self, phone_df, start, end):
        # Re-estimate the track of one phone and return it as a re-indexed copy.
        phone_df = phone_df.reset_index(drop=True)

        lat0 = phone_df["latDeg"].to_numpy(dtype=float)
        lng0 = phone_df["lngDeg"].to_numpy(dtype=float)
        d_lat = phone_df["delta_latDeg"].to_numpy(dtype=float)
        d_lng = phone_df["delta_lngDeg"].to_numpy(dtype=float)
        rate = phone_df["outlier_rate"].to_numpy(dtype=float)

        THRESH_LIST = np.linspace(start, end, 10)
        phone_lats = []
        phone_lngs = []
        for THRESH in THRESH_LIST:
            flagged = rate > THRESH
            fwd_lat, fwd_lng = self._propagate(lat0, lng0, d_lat, d_lng, flagged, forward=True)
            rev_lat, rev_lng = self._propagate(lat0, lng0, d_lat, d_lng, flagged, forward=False)
            phone_lats.append((fwd_lat + rev_lat) / 2)
            phone_lngs.append((fwd_lng + rev_lng) / 2)

        # Smaller thresholds get the larger weight.
        weights = 1 / THRESH_LIST
        phone_df["latDeg"] = np.average(np.array(phone_lats), axis=0, weights=weights)
        phone_df["lngDeg"] = np.average(np.array(phone_lngs), axis=0, weights=weights)
        return phone_df

    @staticmethod
    def _propagate(lat0, lng0, d_lat, d_lng, flagged, forward):
        # Blank the flagged points and rebuild them by walking the deltas in one direction.
        lat = lat0.copy()
        lng = lng0.copy()
        n = len(lat)

        masked = flagged.copy()
        if n:
            # The point the walk starts from is always kept as the anchor.
            masked[0 if forward else -1] = False
        lat[masked] = np.nan
        lng[masked] = np.nan

        if forward:
            for i in range(1, n):
                if np.isnan(lat[i]):
                    lat[i] = lat[i - 1] + d_lat[i]
                    lng[i] = lng[i - 1] + d_lng[i]
        else:
            for i in range(n - 2, -1, -1):
                if np.isnan(lat[i]):
                    lat[i] = lat[i + 1] - d_lat[i + 1]
                    lng[i] = lng[i + 1] - d_lng[i + 1]
        return lat, lng

# StopMean
As in outlier detection, the stop point is predicted from relative coordinates.

In [None]:
class StopMean:
    """Pipeline step that predicts stop points and passes them to ``stopmean``.

    A LightGBM classifier (via ``fit_lgbm``) is trained to predict
    ``speedMps == 0`` from per-phone relative coordinates; the thresholded
    prediction is stored in the ``stop`` column before ``stopmean`` is applied.
    """
    name = "StopMean"
    isPrep = False

    def __init__(self):
        pass

    def main(self, train, test):
        """Add ``target_stop`` (train) and ``stop`` columns, then apply ``stopmean``.

        Returns:
            ``(train, test)`` as returned by ``stopmean``.
        """
        train['target_stop'] = (train['speedMps']==0).astype(int)

        params = {
         'reg_alpha': 0.01,
         'reg_lambda': 0.01,
         'num_leaves': 40,
         'n_estimators': 10000,
         'learning_rate': 0.1,
         'random_state': 42,
         'max_depth': -1
        }

        N_SPLITS = 5
        oof, models = fit_lgbm(self.processing_stopmean(train), train['target_stop'], train_df=train, params=params, N_SPLITS=N_SPLITS)
        pred = predict_lgbm(models, self.processing_stopmean(test))

        # A point counts as stopped when the predicted probability exceeds 0.5.
        train['stop'] = (oof > 0.5).astype(int)
        test['stop'] = (pred > 0.5).astype(int)

        print('score', accuracy_score((oof>0.5).astype(int), train['target_stop']))
        print(confusion_matrix((oof>0.5).astype(int), train['target_stop']))

        # `stopmean` comes from `utils`.
        train = stopmean(train)
        test = stopmean(test)

        return train, test

    def processing_stopmean(self, input_df: pd.DataFrame):
        """Build per-phone relative-coordinate features for shifts -15..15.

        Columns, in order, for every non-zero shift ``i``:
        ``diff{i}_latDeg/lngDeg`` followed by ``change{i}_latDeg/lngDeg``.
        All pieces are joined with a single concat instead of growing the
        frame shift by shift.
        """
        by_phone = input_df.groupby('phone')[['latDeg', 'lngDeg']]

        parts = []
        for i in range(-15, 16, 1):
            if i == 0:
                continue
            parts.append(by_phone.diff(i).add_prefix(f'diff{i}_'))
            parts.append(by_phone.pct_change(i).add_prefix(f'change{i}_'))

        return pd.concat(parts, axis=1)
        

# Cost Minimization

In [None]:
class SaitoOptimization:
    name = "SaitoOptimization"
    isPrep = False
    
    def __init__(self, N_SNAP_TO_GRID_ITERATION):
        self.params_highway = { 'sigma_u'  : 1.0,
                       'sigma_p'  : 3.0,
                       'sigma_a'  : 2.0 * 1e+5,
                       'sigma_v'  : 4.0 * 1e+5,
                       'sigma_d'  : 0.16 * 1e+5,
                       'reject_p' : 7.0,   # [m]
                       'reject_d' : 1.0,   # [m/s]
                       'vmin'     : -0.05, # [m/s]
                       'vmax'     : 50.0,  # [m/s]
                       'Mi8_velocity_timeshift' : 0.46,
                       'use_not_go_back_constraint' : False,
                       'use_map'  : False,
                      }
        self.params_treeway = { 'sigma_u'  : 1.0,
                       'sigma_p'  : 6.0,
                       'sigma_a'  : 0.8 * 1e+5,
                       'sigma_v'  : 3.0 * 1e+5,
                       'sigma_d'  : 0.12 * 1e+5,
                       'reject_p' : 12.0,  # [m]
                       'reject_d' : 1.0,   # [m/s]
                       'vmin'     : -0.05, # [m/s]
                       'vmax'     : 50.0,  # [m/s]
                       'Mi8_velocity_timeshift' : 0.30,
                       'use_not_go_back_constraint' : False,
                       'use_map'  : False,
                      }
        self.params_downtown = { 'sigma_u'  : 1.0,
                        'sigma_p'  : 20.0,
                        'sigma_a'  : 0.4 * 1e+5,
                        'sigma_v'  : 1.0 * 1e+5,
                        'sigma_d'  : 1.3 * 1e+5,
                        'reject_p' : 20.0,  # [m]
                        'reject_d' : 3.0,   # [m/s]
                        'vmin'     : -0.05, # [m/s]
                        'vmax'     : 50.0,  # [m/s]
                        'Mi8_velocity_timeshift' : 0.0,
                        'use_not_go_back_constraint' : False,
                        'use_map'  : True,
                        'threshold_distance_to_nearest_neighbor' : 8.0,
                        'sigma_p_stage2' : 3.0,
                        'num_stage2_iterations' : N_SNAP_TO_GRID_ITERATION,
                       }
        self.DT_X = 1.0
    
    def get_optimization_constants(self, base_df, velocity_df, sensor_df, params, use_sensor):
        """Build the constant matrices and vectors used by ``solve_QP``.

        The state holds ``2 * N_x`` blocks of three values (two blocks per
        grid step of length ``self.DT_X``); consecutive steps are linked by
        the 3x3 transition matrix ``a``.

        Args:
            base_df: frame with ``Time``, ``latDeg`` and ``lngDeg`` columns.
            velocity_df: frame with ``Time``, ``v_east`` and ``v_north`` columns.
            sensor_df: frame with ``Time``, ``cos_th``, ``sin_th``, ``dotV``
                and ``omega`` columns, or ``None`` when no sensor data exists.
            params: parameter dict; reads ``sigma_u``, ``sigma_p``, ``sigma_d``
                and, for the sensor terms, ``sigma_v``, ``sigma_a``, ``vmin``
                and ``use_not_go_back_constraint``.
            use_sensor: build the sensor-based cost and constraint terms.

        Returns:
            dict with ``N_y``, ``N_d``, ``N_x``, ``A``, ``B``, ``R``,
            ``Cp_orig``, ``Lp_orig``, ``Yp_orig``, ``J``, ``Cd_orig``,
            ``Ld_orig``, ``Yd_orig``, ``use_sensor`` and ``use_inquality``.
            When ``sensor_df`` is given it also holds ``N_s`` and
            ``x_index_sensor``, and with ``use_sensor`` additionally ``Cv``,
            ``Gv``, ``hv``, ``Lv``, ``Ca``, ``Ya`` and ``La``.
        """
        const = dict()
        dt    = self.DT_X
        TIME_y = base_df['Time'].values
        TIME_d = velocity_df['Time'].values
        N_y = TIME_y.shape[0]
        N_d = TIME_d.shape[0]
        # The grid length is derived from the position timestamps only.
        N_x = int(np.ceil(np.max(TIME_y) / dt) + 1)
        const['N_y'] = N_y
        const['N_d'] = N_d
        const['N_x'] = N_x

        # Transition between consecutive grid steps: a @ x_k - x_{k+1} per block.
        a = np.array([[1, dt, (1/2)*dt**2],
                      [0,  1,  dt],
                      [0,  0,  1]])
        e3 = scipy.sparse.eye(3)
        # Plain `object` dtype: the `np.object` alias was removed in NumPy 1.24.
        A = np.empty(shape=(2*(N_x-1), 2*N_x), dtype=object)
        for i_x in range(N_x-1):
            A[2*i_x  , 2*i_x  ] = a
            A[2*i_x+1, 2*i_x+1] = a
            A[2*i_x  , 2*i_x+2] = -e3
            A[2*i_x+1, 2*i_x+3] = -e3
        const['A'] = scipy.sparse.bmat(A, format='csr')

        b = np.array([[(1/6)*dt**3,
                       (1/2)*dt**2,
                       dt]]).T
        const['B'] = scipy.sparse.block_diag([b for _ in range(2*(N_x-1))], format='csr')

        diag_R  = np.full(2*N_x - 2, params['sigma_u']**(-2) * dt)
        const['R'] = scipy.sparse.spdiags(diag_R, [0], 2*N_x - 2, 2*N_x - 2, format='csc')

        # Position observations: cubic Hermite weights between the two
        # neighbouring grid steps of every observation time.
        x_index  = np.floor(TIME_y / dt).astype(int)
        alpha    = (TIME_y / dt) - x_index
        coeff_y0 = 1 - 3*alpha**2 + 2*alpha**3
        coeff_y1 =     3*alpha**2 - 2*alpha**3
        coeff_v0 = dt * alpha * (alpha - 1)**2
        coeff_v1 = dt * alpha**2 * (alpha - 1)
        C = np.empty(shape=(2*N_y, 2*N_x), dtype=object)
        # Explicit empty blocks in the first row fix the width of every block column.
        for i_x in range(N_x):
            C[0, 2*i_x  ] = scipy.sparse.coo_matrix((1, 3))
            C[0, 2*i_x+1] = scipy.sparse.coo_matrix((1, 3))
        for i_y in range(N_y):
            i_x = x_index[i_y]
            c_i = np.array([[coeff_y0[i_y], coeff_v0[i_y], 0]])
            C[2*i_y,   2*i_x]   = c_i
            C[2*i_y+1, 2*i_x+1] = c_i
            if i_x < N_x - 1:
                c_iplus = np.array([[coeff_y1[i_y], coeff_v1[i_y], 0]])
                C[2*i_y,   2*i_x+2] = c_iplus
                C[2*i_y+1, 2*i_x+3] = c_iplus
        const['Cp_orig']  = scipy.sparse.bmat(C, format='csr')

        diag_Lp = np.full(2*N_y, params['sigma_p']**(-2))
        const['Lp_orig'] = scipy.sparse.spdiags(diag_Lp, [0], 2*N_y, 2*N_y, format='csr')
        const['Yp_orig'] = base_df[['latDeg', 'lngDeg']].values.flatten()

        # Mean Jacobian over the baseline positions, with its diagonal zeroed.
        # NOTE(review): presumably maps lat/lng rates in degrees to east/north
        # rates - confirm against transform.jacobian_BL_to_EN.
        BLH = transform.BLH(
            lat=np.deg2rad(base_df['latDeg'].values),
            lng=np.deg2rad(base_df['lngDeg'].values),
            hgt=np.zeros(N_y),
        )
        DEG2RAD = np.pi / 180.0
        J = transform.jacobian_BL_to_EN(BLH) * DEG2RAD
        J = np.mean(J, axis=0)
        J[0, 0] = 0
        J[1, 1] = 0
        JJ = scipy.sparse.block_diag([J, J], format='csr')
        const['J'] = J

        # Parameters for the Doppler velocity
        x_index  = np.floor(TIME_d / dt).astype(int)
        alpha    = (TIME_d / dt) - x_index
        coeff_y0 = 1 - 3*alpha**2 + 2*alpha**3
        coeff_y1 =     3*alpha**2 - 2*alpha**3
        coeff_v0 = dt * alpha * (alpha - 1)**2
        coeff_v1 = dt * alpha**2 * (alpha - 1)
        C = np.empty(shape=(N_d, N_x), dtype=object)
        for i_x in range(N_x):
            C[0, i_x] = scipy.sparse.coo_matrix((2, 6))
        for i_d in range(N_d):
            i_x = x_index[i_d]
            c = np.array([[0, coeff_y0[i_d], coeff_v0[i_d], 0, 0, 0],
                          [0, 0, 0, 0, coeff_y0[i_d], coeff_v0[i_d]]])
            C[i_d, i_x] = J @ c
            if i_x < N_x - 1:
                c = np.array([[0, coeff_y1[i_d], coeff_v1[i_d], 0, 0, 0],
                              [0, 0, 0, 0, coeff_y1[i_d], coeff_v1[i_d]]])
                C[i_d, i_x+1] = J @ c
        const['Cd_orig']  = scipy.sparse.bmat(C, format='csr')

        diag_Ld = np.full(2*N_d, params['sigma_d']**(-2))
        const['Ld_orig'] = scipy.sparse.spdiags(diag_Ld, [0], 2*N_d, 2*N_d, format='csr')
        const['Yd_orig'] = velocity_df[['v_east', 'v_north']].values.flatten()

        if sensor_df is None:
            const['use_sensor'] = False
            const['use_inquality'] = False
            return const

        TIME_s = sensor_df['Time'].values
        N_s = TIME_s.shape[0]
        const['N_s'] = N_s
        const['use_sensor'] = use_sensor
        const['use_inquality'] = (use_sensor and params['use_not_go_back_constraint'])
        # Sensor samples are attached to the nearest grid step (no interpolation).
        x_index = np.round(TIME_s / dt).astype(int)
        const['x_index_sensor'] = x_index
        if not use_sensor:
            return const

        # Parameters for the velocity constraint and the velocity cost
        COS_TH = sensor_df['cos_th'].values
        SIN_TH = sensor_df['sin_th'].values
        CV = np.empty(shape=(N_s, N_x), dtype=object)
        GV = np.empty(shape=(N_s, N_x), dtype=object)
        cv = np.array([[0, 1, 0, 0, 0, 0],
                       [0, 0, 0, 0, 1, 0]], dtype=np.float64)
        for i_x in range(N_x):
            CV[0, i_x] = scipy.sparse.coo_matrix((1, 6))
            GV[0, i_x] = scipy.sparse.coo_matrix((1, 6))
        for i_s in range(N_s):
            i_x = x_index[i_s]
            k = np.array([[SIN_TH[i_s], -COS_TH[i_s]]])
            CV[i_s, i_x] = k @ J @ cv
            k = np.array([[-COS_TH[i_s], -SIN_TH[i_s]]])
            GV[i_s, i_x] = k @ J @ cv
        const['Cv'] = scipy.sparse.bmat(CV, format='csr')
        const['Gv'] = scipy.sparse.bmat(GV, format='csr')
        const['hv'] = np.full((N_s, ), -params['vmin'])

        diag_Lv = np.full(N_s, params['sigma_v']**(-2))
        const['Lv'] = scipy.sparse.spdiags(diag_Lv, [0], N_s, N_s, format='csr')

        # Parameters for the acceleration cost
        DOT_V_COS_TH = sensor_df['dotV'] * sensor_df['cos_th'].values
        DOT_V_SIN_TH = sensor_df['dotV'] * sensor_df['sin_th'].values
        OMEGA = sensor_df['omega'].values
        CA = np.empty(shape=(N_s, N_x), dtype=object)
        ca = np.array([[0, 1, 0, 0, 0, 0],
                       [0, 0, 0, 0, 1, 0],
                       [0, 0, 1, 0, 0, 0],
                       [0, 0, 0, 0, 0, 1]], dtype=np.float64)
        for i_x in range(N_x):
            CA[0, i_x] = scipy.sparse.coo_matrix((2, 6))
        for i_s in range(N_s):
            i_x = x_index[i_s]
            k = np.array([[0,  OMEGA[i_s], 1, 0],
                          [-OMEGA[i_s], 0, 0, 1]])
            CA[i_s, i_x] = k @ JJ @ ca
        const['Ca'] = scipy.sparse.bmat(CA, format='csr')
        const['Ya'] = np.stack([DOT_V_COS_TH, DOT_V_SIN_TH], axis=1).flatten()

        diag_La = np.full(2*N_s, params['sigma_a']**(-2))
        const['La'] = scipy.sparse.spdiags(diag_La, [0], 2*N_s, 2*N_s, format='csr')

        return const


    def solve_QP(self, const, p_valid, d_valid):
        """Assemble the quadratic cost from the valid observations and solve it.

        Args:
            const: dict produced by ``get_optimization_constants``.
            p_valid: boolean mask, one entry per position observation.
            d_valid: boolean mask, one entry per velocity observation.

        Returns:
            The solution returned by ``qpsolver`` (the inequality-constrained
            solver when ``const['use_inquality']`` is set).
        """
        def keep_valid(design, weight, target, valid):
            # Every observation owns two consecutive rows, so repeat each flag twice.
            rows = np.stack([valid, valid], axis=1).flatten()
            return design[rows, :], weight[np.ix_(rows, rows)], target[rows]

        Cp, Lp, Yp = keep_valid(const['Cp_orig'], const['Lp_orig'], const['Yp_orig'], p_valid)
        Cd, Ld, Yd = keep_valid(const['Cd_orig'], const['Ld_orig'], const['Yd_orig'], d_valid)

        # Position and velocity terms are always present.
        Q = Cp.T @ (Lp @ Cp) + Cd.T @ (Ld @ Cd)
        q = Cp.T @ (Lp @ Yp) + Cd.T @ (Ld @ Yd)

        if const['use_sensor']:
            Cv, Lv = const['Cv'], const['Lv']
            Ca, La, Ya = const['Ca'], const['La'], const['Ya']
            # The velocity term has no target vector, so it only contributes to Q.
            Q = Q + Cv.T @ (Lv @ Cv) + Ca.T @ (La @ Ca)
            q = q + Ca.T @ (La @ Ya)

        dynamics = dict(R=const['R'], Q=Q, q=q, A=const['A'], B=const['B'])
        if const['use_inquality']:
            return qpsolver.solve_qp_with_inequality(G=const['Gv'], h=const['hv'], **dynamics)
        return qpsolver.solve_qp(**dynamics)


    def get_baseline(self, collection_name):
        df = self.BASELINE_DF[self.BASELINE_DF['collectionName'] == collection_name].copy()
        df.reset_index(drop=True, inplace=True)
        return df

    def get_velocity(self, collection_name):
        df = self.VELOCITY_DF[self.VELOCITY_DF['collectionName'] == collection_name].copy()
        df.reset_index(drop=True, inplace=True)
        return df

    def apply_costmin(self, base_df, velocity_df, sensor_df, params, N_LOOP):
        """Smooth one track by solving the QP ``N_LOOP`` times with outlier rejection.

        After each solve, position observations further than
        ``params['reject_p']`` from the solution and velocity observations
        whose error reaches ``params['reject_d']`` are excluded from the next
        solve. Exclusions are recomputed from the defaults each time, so a
        rejected observation can come back.

        Args:
            base_df: baseline positions of one track.
            velocity_df: velocity observations (``v_east``, ``v_north``).
            sensor_df: sensor frame, or ``None``.
            params: one of the parameter dicts built in ``__init__``.
            N_LOOP: number of solve/reject rounds.

        Returns:
            Copy of ``base_df`` whose ``latDeg`` / ``lngDeg`` hold the last
            solution. With ``N_LOOP == 0`` nothing is solved and an unchanged
            copy of ``base_df`` is returned.
        """
        const = self.get_optimization_constants(base_df, velocity_df, sensor_df, params, use_sensor=True)

        if params['use_map']:
            # Only trust positions that lie close enough to the map.
            distance = map_matching.distance_to_nearest_neighbor(base_df)
            default_p_valid = (distance < params['threshold_distance_to_nearest_neighbor'])
        else:
            default_p_valid = np.full(const['N_y'], True)
        p_valid = default_p_valid

        observed_velocity = velocity_df[['v_east', 'v_north']].values
        V = np.sqrt(np.sum(observed_velocity**2, axis=1))
        default_d_valid = (V < params['vmax'])
        d_valid = default_d_valid

        # Defined up front so that N_LOOP == 0 returns the baseline instead of
        # failing on an unbound name.
        pp_df = base_df.copy()
        for loop in range(N_LOOP):
            X_star = self.solve_QP(const, p_valid, d_valid)
            Y_star = const['Cp_orig'] @ X_star
            Y_star = np.reshape(Y_star, (-1, 2))
            pp_df  = base_df.copy()
            pp_df['latDeg'] = Y_star[:, 0]
            pp_df['lngDeg'] = Y_star[:, 1]
            distance = transform.pd_haversine_distance(pp_df, base_df)
            p_valid = default_p_valid & (distance < params['reject_p'])

            dXYdt = const['Cd_orig'] @ X_star
            dXYdt = np.reshape(dXYdt, (-1, 2))
            v_err = dXYdt - observed_velocity
            v_err = np.sqrt(np.sum(v_err**2, axis=1))
            d_valid = default_d_valid & (v_err < params['reject_d'])

        return pp_df
    
    
    def recalibrate_sensor_by_vehicle_motion(self, base_df, velocity_df, sensor_df, params):
        """Remove a constant heading offset from the sensor data using the GNSS track.

        The QP is solved once without sensor terms. The direction of motion of
        that solution is compared with ``sensor_df['theta']`` at every sample
        where the solved speed exceeds ``20 / 3.6``; the circular mean of the
        difference is subtracted from ``theta``, and ``cos_th`` / ``sin_th``
        are recomputed from the corrected angle.

        Args:
            base_df: baseline positions used to build the QP.
            velocity_df: velocity observations with ``v_east`` / ``v_north``.
            sensor_df: sensor frame with a ``theta`` column. It is modified IN
                PLACE (``theta``, ``cos_th``, ``sin_th``) and also returned.
            params: dict read for ``use_map``, ``vmax`` and, when ``use_map``
                is set, ``threshold_distance_to_nearest_neighbor``.

        Returns:
            The same ``sensor_df`` object with the corrected columns.
        """
        const = self.get_optimization_constants(base_df, velocity_df, sensor_df, params, use_sensor=False)

        if params['use_map']:
            distance = map_matching.distance_to_nearest_neighbor(base_df)
            p_valid  = (distance < params['threshold_distance_to_nearest_neighbor'])
        else:
            p_valid = np.full(const['N_y'], True)

        V = np.sqrt(np.sum(velocity_df[['v_east', 'v_north']].values**2, axis=1))
        d_valid = (V < params['vmax'])

        X_star = self.solve_QP(const, p_valid, d_valid)
        # Six state values per time step; columns 1 and 4 are read as the rates
        # of the two coordinates (presumably latitude / longitude -- confirm
        # against get_optimization_constants).
        X_mat  = np.reshape(X_star, (-1, 6))
        dotB   = X_mat[const['x_index_sensor'], 1]
        dotL   = X_mat[const['x_index_sensor'], 4]
        dotXY  = const['J'] @ np.stack([dotB, dotL], axis=0) # shape = (2, N)
        dotX   = dotXY[0, :]
        dotY   = dotXY[1, :]
        V = np.sqrt(np.sum(dotXY**2, axis=0))
        # Heading is only meaningful while moving; 20 / 3.6 is 20 km/h if the
        # speed is in m/s (assumed).
        cond = (V > (20 / 3.6))
        if np.any(cond):
            trig_moving_direction = signal_f.Trig.from_data(dotX[cond], dotY[cond])
            trig_theta   = signal_f.Trig.from_rad(sensor_df['theta'].values[cond])
            trig_offset  = trig_theta - trig_moving_direction
            # Circular mean of the per-sample offset.
            angle_offset = np.arctan2(np.mean(trig_offset.sin), np.mean(trig_offset.cos))
        else:
            # No sample is fast enough to define a heading. Averaging empty
            # arrays would give NaN and turn every theta / cos_th / sin_th
            # value into NaN, so leave the angle uncorrected instead.
            angle_offset = 0.0
        sensor_df['theta']  = sensor_df['theta'] - angle_offset
        sensor_df['cos_th'] = np.cos(sensor_df['theta'].values)
        sensor_df['sin_th'] = np.sin(sensor_df['theta'].values)

        return sensor_df

    
    def do_postprocess(self, args):
        """Run the full smoothing pipeline for one collection.

        Args:
            args: a ``(train_or_test, collection, params)`` tuple. A single
                tuple argument is used because ``single_main`` feeds this
                method through ``Pool.imap_unordered``.
                ``train_or_test`` selects the sub-directory of ``BASE_DIR``
                that holds the raw logs, ``collection`` is the collection
                name, and ``params`` is the parameter dict for that
                collection's area type.

        Returns:
            The baseline frame of the collection (with an added ``Time``
            column) whose ``latDeg`` / ``lngDeg`` were replaced by the
            smoothed positions.
        """
        train_or_test, collection, params = args

        # Time axis in seconds, relative to the first baseline timestamp.
        base_df = self.get_baseline(collection)
        t_ref   = base_df['millisSinceGpsEpoch'].min()
        base_df['Time'] = 1e-3 * (base_df['millisSinceGpsEpoch'] - t_ref).values

        # Velocity observations share the same time origin; rows whose
        # phoneName is 'Mi8' are additionally shifted by
        # params['Mi8_velocity_timeshift'].
        velocity_df = self.get_velocity(collection)
        velocity_df['Time'] = (1e-3 * (velocity_df['millisSinceGpsEpoch'] - t_ref).values
                               -  params['Mi8_velocity_timeshift'] * (velocity_df['phoneName'] == 'Mi8').astype(float)
                               )
        # Keep only velocity samples inside the time span covered by the baseline.
        velocity_df = velocity_df[(  velocity_df['Time'] >= base_df['Time'].min())
                                  & (velocity_df['Time'] <= base_df['Time'].max())]
        velocity_df.reset_index(drop=True, inplace=True)

        # One sub-directory per phone; its name is also the prefix of the log file.
        phone_list = [path.split('/')[-1] for path in sorted(glob(f'{BASE_DIR}/{train_or_test}/{collection}/*'))]
        sensor_df_list   = []
        # dt_up / dt_down are handed to preprocess_sensor_data together with the
        # sinc filter; presumably the up-sampled and the final sample interval
        # in seconds -- confirm in signal_f.
        dt_up   = 2.5 * 1e-3
        dt_down = self.DT_X
        FLT = design_filter.make_sinc_filter(F_cutoff=2.0, dt=dt_up)
        for phone in phone_list:
            gnss_log_filename = f'{BASE_DIR}/{train_or_test}/{collection}/{phone}/{phone}_GnssLog.txt'
            sensor_df_orig = io_f.read_GnssLog_sensors(gnss_log_filename)
            # Phones whose log fails the availability check are skipped.
            if signal_f.check_sensor_availability(sensor_df_orig):
                sensor_df = signal_f.preprocess_sensor_data(sensor_df_orig, t_ref, dt_up, dt_down, FLT)
                sensor_df = signal_f.remove_different_posture(sensor_df)
                sensor_df = sensor_df[(  sensor_df['Time'] >= base_df['Time'].min())
                                      & (sensor_df['Time'] <= base_df['Time'].max())].copy()
                sensor_df.reset_index(drop=True, inplace=True)
                sensor_df_list.append(sensor_df)
        if len(sensor_df_list) > 0:
            # Only one phone's sensors are used: the one covering the longest time span.
            time_list = [df['Time'].max() - df['Time'].min() for df in sensor_df_list]
            idx = np.argmax(time_list)
            sensor_df = sensor_df_list[idx]
            sensor_df = signal_f.add_calibrated_signals(sensor_df, dt_down)
            sensor_df = self.recalibrate_sensor_by_vehicle_motion(base_df, velocity_df, sensor_df, params)
        else:
            # No usable sensor log in this collection.
            sensor_df = None

        # Stage 1: smoothing with three solve / reject passes.
        pp_df = base_df
        pp_df = self.apply_costmin(pp_df, velocity_df, sensor_df, params, N_LOOP=3)
        if params['use_map']:
            # Stage 2: alternate snapping to the nearest map neighbour with a
            # single-pass smoothing that uses sigma_p_stage2 in place of sigma_p.
            params_stage2 = dict(params)
            params_stage2['sigma_p'] = params['sigma_p_stage2']
            for _ in range(params['num_stage2_iterations']):
                pp_df = map_matching.snap_to_nearest_neighbor(pp_df)
                pp_df = self.apply_costmin(pp_df, velocity_df, sensor_df, params_stage2, N_LOOP=1)
        return pp_df
    

    def main(self, train, test):
        """Smooth the train and the test positions and return both frames.

        Each split is processed collection by collection with the parameter
        set of its area type (highway / treeway / downtown). The train split is
        completed before the test split is started.

        Args:
            train: train positions with ``phone``, ``millisSinceGpsEpoch``,
                ``latDeg``, ``lngDeg`` and ``collectionName`` columns.
            test: test positions with the same columns.

        Returns:
            ``(train, test)`` with ``latDeg`` / ``lngDeg`` replaced by the
            smoothed values. Rows are inner-joined on
            ``phone`` / ``millisSinceGpsEpoch``, so rows of collections that
            appear in none of the collection lists are dropped.
        """
        train = self._postprocess_split(
            'train', train, (TRAIN_HIGHWAY, TRAIN_TREEWAY, TRAIN_DOWNTOWN))
        test = self._postprocess_split(
            'test', test, (TEST_HIGHWAY, TEST_TREEWAY, TEST_DOWNTOWN))
        return train, test

    def _postprocess_split(self, split, base_df, collection_lists):
        """Smooth every listed collection of one split ('train' or 'test').

        ``collection_lists`` is a ``(highway, treeway, downtown)`` triple of
        collection-name sequences. Sets ``self.train_or_test``,
        ``self.BASELINE_DF`` and ``self.VELOCITY_DF`` as a side effect, because
        the per-collection workers read them.
        """
        VELOCITY_PATH = '../input/vehicle-speed-estimation/_doppler_velocity'

        self.train_or_test = split
        self.BASELINE_DF = base_df
        self.VELOCITY_DF = pd.read_csv(f'{VELOCITY_PATH}/doppler_velocity_{split}.csv')

        collection_list_highway, collection_list_treeway, collection_list_downtown = collection_lists
        config = [
            (collection_list_highway,  self.params_highway),
            (collection_list_treeway,  self.params_treeway),
            (collection_list_downtown, self.params_downtown),
        ]
        # One work item per collection, tagged with its split and parameters.
        args_list = [
            (split, collection, params)
            for collection_list, params in config
            for collection in collection_list
        ]

        return pd.merge(
            base_df.drop(columns=["latDeg", "lngDeg"]),
            self.single_main(args_list)[["phone", "millisSinceGpsEpoch", "latDeg", "lngDeg"]],
            on=["phone", "millisSinceGpsEpoch"], how="inner")
        
    def single_main(self, args_list, processes=8):
        """Post-process every collection in ``args_list`` in a worker pool.

        Args:
            args_list: sequence of ``(train_or_test, collection, params)``
                tuples, one per collection, as expected by ``do_postprocess``.
            processes: number of worker processes. Defaults to 8, the value
                that used to be hard-coded.

        Returns:
            All per-collection results concatenated, reduced to the columns
            ``phone``, ``millisSinceGpsEpoch``, ``latDeg``, ``lngDeg``,
            ``collectionName`` and ``phoneName``, and sorted by ``phone`` then
            ``millisSinceGpsEpoch``.
        """
        with multiprocessing.Pool(processes=processes) as pool:
            # Results arrive in completion order; the final sort below makes
            # the output independent of that order.
            results = pool.imap_unordered(self.do_postprocess, args_list)
            results = tqdm(results, total=len(args_list))
            # Drain the iterator while the pool is still alive.
            df_list = list(results)

        columns = ['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg', 'collectionName', 'phoneName']
        sub_df = pd.concat(df_list)
        sub_df = sub_df[columns]
        sub_df = sub_df.sort_values(['phone', 'millisSinceGpsEpoch'])

        return sub_df

# Bias Correction

In [None]:
class BiasCorrection:
    """Pipeline step that removes a constant position bias given in the direction-of-travel frame.

    The bias has two components: ``bias_y`` along the direction of travel and
    ``bias_x`` perpendicular to it (to the right of the direction of travel,
    given the rotation used in ``_bias_correction_phone``). The direction of
    travel is estimated per phone from the time derivative of the positions.
    The unit is whatever ``transform.jacobian_BL_to_EN`` maps radians to
    (presumably metres -- confirm in ``transform``).
    """
    name = 'bias_correction'
    isPrep = False

    def __init__(self, bias_x=0.0, bias_y=0.5):
        self.bias_x = bias_x
        self.bias_y = bias_y

    def main(self, train, test, bias_x=0.0, bias_y=0.5):
        """Apply the correction to both frames and return ``(train, test)``.

        ``bias_x`` / ``bias_y`` are accepted only for backward compatibility
        and are ignored; the values given to the constructor are used.
        """
        train = self._bias_correction(train)
        test = self._bias_correction(test)
        return train, test

    def _bias_correction_phone(self, args):
        """Correct one phone's track.

        Args:
            args: ``(phone, phone_df)`` as produced by ``groupby('phone')``.
                ``phone_df`` must be ordered by ``millisSinceGpsEpoch``
                because the splines below need increasing time.

        Returns:
            A frame with ``phone``, ``millisSinceGpsEpoch`` and the corrected
            ``latDeg`` / ``lngDeg``, carrying the index of ``phone_df``.
        """
        phone, phone_df = args

        B = np.deg2rad(phone_df['latDeg'].values)
        L = np.deg2rad(phone_df['lngDeg'].values)
        H = np.zeros_like(B)
        BLH = transform.BLH(lat=B, lng=L, hgt=H)
        # One 2x2 matrix per sample (see the einsum below).
        J = transform.jacobian_BL_to_EN(BLH)

        # East / north rate from the derivative of cubic splines through the
        # latitude / longitude samples.
        t_ref  = phone_df['millisSinceGpsEpoch'].min()
        TIME   = 1e-3 * (phone_df['millisSinceGpsEpoch'] - t_ref).values
        dotB   = InterpolatedUnivariateSpline(TIME, B, k=3).derivative()(TIME)
        dotL   = InterpolatedUnivariateSpline(TIME, L, k=3).derivative()(TIME)
        dotBL  = np.stack([dotB, dotL], axis=1)
        dotEN  = np.einsum('nij,nj->ni', J, dotBL)
        absV   = np.sqrt(np.sum(dotEN**2, axis=1))
        # Heading measured from north towards east.
        th_az  = np.arctan2(dotEN[:, 0], dotEN[:, 1])

        # The heading is unreliable at low speed: keep it only where the speed
        # exceeds 5 / 3.6 and interpolate cos / sin linearly elsewhere
        # (ext=3 holds the boundary value outside the valid range), then
        # renormalise through arctan2.
        # NOTE(review): fewer than two samples above the speed threshold make
        # the linear spline fail -- decide how such tracks should be handled.
        cos_az = np.cos(th_az)
        sin_az = np.sin(th_az)
        valid  = (absV > (5 / 3.6))
        cos_az = InterpolatedUnivariateSpline(TIME[valid], cos_az[valid], k=1, ext=3)(TIME)
        sin_az = InterpolatedUnivariateSpline(TIME[valid], sin_az[valid], k=1, ext=3)(TIME)
        th_az  = np.arctan2(sin_az, cos_az)
        cos_az = np.cos(th_az)
        sin_az = np.sin(th_az)

        # Rotate the (negated) bias from the travel frame into east / north.
        delta_X  = - self.bias_x
        delta_Y  = - self.bias_y
        delta_E  = (  cos_az * delta_X) + (sin_az * delta_Y)
        delta_N  = (- sin_az * delta_X) + (cos_az * delta_Y)
        delta_EN = np.stack([delta_E, delta_N], axis=0) # shape = (2, N)
        # A single, track-averaged Jacobian is inverted to go back to angles.
        Jinv = np.linalg.inv(np.mean(J, axis=0))
        delta_BL_rad = Jinv @ delta_EN
        delta_BL_deg = np.rad2deg(delta_BL_rad)

        output_df = pd.DataFrame({
            'phone'               : phone_df['phone'],
            'millisSinceGpsEpoch' : phone_df['millisSinceGpsEpoch'],
            'latDeg'              : phone_df['latDeg'] + delta_BL_deg[0, :],
            'lngDeg'              : phone_df['lngDeg'] + delta_BL_deg[1, :],
        })
        return output_df

    def _bias_correction(self, base_df):
        """Return a copy of ``base_df``, sorted by phone and time, with corrected positions.

        All columns of ``base_df`` are kept; only ``latDeg`` / ``lngDeg``
        change. ``base_df`` itself is not modified.
        """
        output_df = base_df.sort_values(['phone', 'millisSinceGpsEpoch']).reset_index(drop=True)
        if output_df.empty:
            # Nothing to correct (and pd.concat would reject an empty list).
            return output_df

        # Group the SORTED frame, not the caller's frame: every per-phone
        # frame is then time-ordered, as the splines require, and keeps the
        # row labels of output_df.
        corrected = pd.concat(
            map(self._bias_correction_phone, output_df.groupby('phone')), axis=0)
        # Assignment aligns on the shared index, so each corrected value lands
        # on the row it was computed from.
        output_df[['latDeg', 'lngDeg']] = corrected[['latDeg', 'lngDeg']]
        return output_df

# Pipeline

In [None]:
class Pipeline:
    """Apply a list of post-processing steps in order and record the train score after each.

    Every step object must expose ``name``, ``isPrep`` and
    ``main(train, test) -> (train, test)``. Results are memoised in the
    module-level ``PROCESSED_DFS`` dict.
    """

    def __init__(self, train, test, processes):
        self.train, self.test = train, test
        self.processes = processes

        # pipe_name: cache key, grows with every executed step.
        # process_name: like pipe_name but without preparation steps; used for
        #               the final file names and the plot title.
        self.pipe_name = ""
        self.process_name = ""
        # Parallel lists: x labels and scores of the summary plot.
        self.process_names = []
        self.cvs = []

    def main(self):
        """Run all steps, write intermediate and final CSVs, plot the scores.

        Returns:
            The final ``(train, test)`` frames.
        """
        print(f"baseline...")
        self.train, baseline_cv = self.check_meter(self.train)
        self.process_names.append("baseline")
        self.cvs.append(baseline_cv)
        print("\n")

        for step in self.processes:
            if step.name == "skip":
                continue

            print(f"{step.name}...")

            # Preparation steps are executed and cached but not scored.
            if step.isPrep == False:
                self.process_names.append(step.name)
                self.process_name += "_" + step.name
            self.pipe_name += "_" + step.name

            # NOTE(review): the cache key is built from step names only, so a
            # step re-run with different parameters reuses the old result
            # unless PROCESSED_DFS is cleared.
            if self.pipe_name not in PROCESSED_DFS:
                self.train, self.test = step.main(self.train, self.test)
                PROCESSED_DFS[self.pipe_name] = [self.train.copy(), self.test.copy()]
            else:
                print(f"loaded past result: {self.pipe_name}")
                cached = PROCESSED_DFS[self.pipe_name]
                self.train = cached[0].copy()
                self.test  = cached[1].copy()

            if step.isPrep == False:
                self.train, step_cv = self.check_meter(self.train)
                self.cvs.append(step_cv)

            print("\n")
            self.train.to_csv(f"train_after{step.name}.csv", index=False)
            self.test.to_csv(f"test_after{step.name}.csv", index=False)

        self.train.to_csv(f"train{self.process_name}_FIN.csv", index=False)
        self.test.to_csv(f"test{self.process_name}_FIN.csv", index=False)

        self.plot()
        return self.train, self.test

    def check_meter(self, input_df: pd.DataFrame, save=False):
        """Attach the per-row error and compute the score of a train frame.

        Adds a ``meter`` column (``vincenty_meter`` applied row-wise) to a
        copy of ``input_df``, prints the mean error and the score, and returns
        ``(frame_with_meter, score)``. The score is the mean over all phones
        of the 50th and 95th percentile of ``meter``.
        """
        output_df = input_df.copy()
        output_df['meter'] = input_df.parallel_apply(vincenty_meter, axis=1)
        if save == True:
            output_df.to_csv('train_output.csv', index=False)

        meter_score = output_df['meter'].mean()
        print(f'meter: {meter_score}')

        # Two entries per phone (p50, p95), all weighted equally.
        scores = []
        for phone in output_df['phone'].unique():
            phone_meter = output_df.loc[output_df['phone'] == phone, 'meter']
            scores.extend(np.percentile(phone_meter, q) for q in (50, 95))

        score = sum(scores) / len(scores)
        print(f'CV: {score}')

        return output_df, score

    def plot(self):
        """Draw the score after each scored step as an annotated line chart."""
        plt.subplots(figsize=(8, 3))
        plt.plot(self.process_names, self.cvs, marker="o")

        # Print each value slightly above its marker.
        for label, cv in zip(self.process_names, self.cvs):
            plt.text(label, cv + 0.05, f"{cv:.2f}")
        plt.grid()
        plt.ylabel("CV")
        # process_name starts with "_"; drop it for the title.
        plt.title(self.process_name[1:])
        plt.show()
# Cache shared by all Pipeline runs in this session: maps the accumulated step
# name string (Pipeline.pipe_name) to the [train, test] frames produced after
# that step. Re-running this cell clears it.
PROCESSED_DFS = {}

# Input

In [None]:
# Load the improved raw-GNSS baseline for train and test, attach the ground
# truth and the relative-position deltas, and score the untouched baseline.
BASE_DIR = Path('../input/google-smartphone-decimeter-challenge')
SAITO_DIR = Path('../input/gsdc-improved-raw-gnss-baseline-result')
OUTPUT_DIR = Path('./output/')
# Path.mkdir is used so the cell does not depend on `os`, which this notebook
# never imports explicitly.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# aggregate ground_truth(common)
# `phone` has the form "<collectionName>_<phoneName>".
train = pd.read_csv(SAITO_DIR / 'raw_gnss_train.csv')
train["collectionName"] = train["phone"].apply(lambda x:x.split("_")[0])
train["phoneName"] = train["phone"].apply(lambda x:x.split("_")[1])

test = pd.read_csv(SAITO_DIR / 'raw_gnss_test.csv')
test["collectionName"] = test["phone"].apply(lambda x:x.split("_")[0])
test["phoneName"] = test["phone"].apply(lambda x:x.split("_")[1])

# Materialise the file list so the progress bar shows the real number of
# ground-truth files instead of a hard-coded count.
ground_truth_paths = list(BASE_DIR.glob('*/*/*/ground_truth.csv'))
train_gt = pd.concat([pd.read_csv(path) for path in tqdm(ground_truth_paths)])
# Prefix the ground-truth coordinates so they can sit next to the estimates.
train_gt.rename(columns={'latDeg':'t_latDeg',
                         'lngDeg':'t_lngDeg',
                         'heightAboveWgs84EllipsoidM':'t_heightAboveWgs84EllipsoidM'}
               ,inplace=True)
train = train.merge(
    train_gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch']
)

# Both merges are inner joins: rows without a matching delta are dropped.
rel_pos = pd.read_csv("../input/chris-baseline/gps-delta-lat-lng.csv").rename(columns={"latDeg": "delta_latDeg", "lngDeg":"delta_lngDeg"})
train = train.merge(rel_pos, on=["phone", "millisSinceGpsEpoch"])
test = test.merge(rel_pos, on=["phone", "millisSinceGpsEpoch"])

train = check_meter(train)
del train_gt

# Apply

In [None]:
# Steps run in list order; each one receives the output of the previous one.
postprocess_steps = [
#     KnnHeight(),

    OutlierDetection(sjc_loss_threshold=9.5, other_loss_threshold=5.6),

    InterpolateByRelpos(sjc_start=0.1, sjc_end=0.5, other_start=0.1, other_end=0.5),

    StopMean(),

    SaitoOptimization(N_SNAP_TO_GRID_ITERATION=1),

    BiasCorrection(bias_x=0.30, bias_y=0.5),
]

pipe = Pipeline(train=train, test=test, processes=postprocess_steps)
train, test = pipe.main()

# The submission file holds only the key columns and the final coordinates.
submission_columns = ["phone", "millisSinceGpsEpoch", "latDeg", "lngDeg"]
test[submission_columns].to_csv("submission.csv", index=False)

# VISUALIZE

In [None]:
# Plot the final test positions, coloured per phone. visualize_trafic is
# defined outside this section; presumably it writes the figure to `savepath`.
visualize_trafic(test, color='phone', savepath=str(OUTPUT_DIR /'submission.html'))

In [None]:
def evaluate_train(train):
    """Tabulate the score of every phone in every collection.

    The score of one (collectionName, phoneName) group is the mean of the
    50th and the 95th percentile of its ``meter`` values.

    Args:
        train: frame with ``collectionName``, ``phoneName`` and ``meter``.

    Returns:
        A new frame with one row per collection (sorted by collection name,
        as produced by ``groupby``) and one column per phone name. Cells of
        phones that are absent from a collection hold ``"-"``.
    """
    dfs = []
    for c, c_df in train.groupby("collectionName"):
        data = {}
        for phone, phone_df in c_df.groupby("phoneName"):
            p_50 = np.percentile(phone_df.meter, 50)
            p_95 = np.percentile(phone_df.meter, 95)
            data[phone] = [(p_50+p_95)/2]

        # Each row is labelled with its own collection name here. Relabelling
        # afterwards from train.collectionName.unique() (order of appearance)
        # would attach wrong names whenever `train` is not sorted by
        # collection.
        dfs.append(pd.DataFrame(data, index=[c]))

    return pd.concat(dfs).fillna("-")
# Per-collection / per-phone score table of the final train result.
score_df = evaluate_train(train)
score_df.to_csv(OUTPUT_DIR / "train_CV.csv")
# Bare expression: shows the table as the notebook cell output.
score_df