# Outlier correction

In [66]:
from pathlib import Path
import pandas as pd

import numpy as np
from tqdm.notebook import tqdm


# Root folder of the competition data set.
data_path = Path("../input/google-smartphone-decimeter-challenge")
df_test = pd.read_csv(data_path / 'baseline_locations_test.csv')
df_sub = pd.read_csv(data_path / 'sample_submission.csv')

# rglob() yields a one-shot generator. Materialise it so the number of
# ground-truth files is known up front: tqdm then derives the progress-bar
# total from the list instead of relying on a hard-coded file count.
truths = list((data_path / 'train').rglob('ground_truth.csv'))

# Only the track keys and the reference position are needed from the
# ground-truth files.
cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg',
        'lngDeg']

df_list = []
for t in tqdm(truths):
    df_phone = pd.read_csv(t, usecols=cols)
    df_list.append(df_phone)
df_truth = pd.concat(df_list, ignore_index=True)

# All columns of the baseline file are kept (no `usecols`): the
# outlier-correction cell further down reads this frame's `phone` column.
df_basepreds = pd.read_csv(data_path / 'baseline_locations_train.csv')

# Pair every ground-truth fix with the baseline prediction for the same
# (collection, phone, timestamp); position columns get the suffixes
# `_truth` and `_basepred`.
df_all = df_truth.merge(df_basepreds, how='inner', on=cols[:3], suffixes=('_truth', '_basepred'))

def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points on the earth (haversine formula).

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude / longitude of the first point(s), in decimal degrees.
    lat2, lon2 : scalar or array-like
        Latitude / longitude of the second point(s), in decimal degrees.

    Returns
    -------
    Distance(s) in metres, computed element-wise. The result has the same
    container type as the inputs (e.g. a pandas Series for Series inputs).
    """
    RADIUS = 6_367_000  # earth radius in metres used by this notebook
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Mathematically a <= 1, but for (near-)antipodal points floating-point
    # rounding can yield 1 + eps, which would make arcsin return NaN.
    a = np.minimum(a, 1.0)
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

# Great-circle distance between each ground-truth fix and the matching
# baseline prediction.
df_all['dist'] = calc_haversine(
    df_all['latDeg_truth'],
    df_all['lngDeg_truth'],
    df_all['latDeg_basepred'],
    df_all['lngDeg_basepred'],
)

  0%|          | 0/73 [00:00<?, ?it/s]

In [67]:
# Summary statistics of the baseline error.
df_all['dist'].describe()

count    131342.000000
mean          3.846848
std          30.739767
min           0.001338
25%           1.210976
50%           2.065769
75%           3.560001
max        8340.257976
Name: dist, dtype: float64

In [68]:
# The ten rows with the largest baseline error, worst first.
df_all[['collectionName', 'dist']].sort_values(by='dist', ascending=False).head(10)

Unnamed: 0,collectionName,dist
64035,2020-09-04-US-SF-1,8340.257976
114354,2020-07-17-US-MTV-2,5050.995543
52894,2021-04-26-US-SVL-1,2254.344928
113362,2020-07-17-US-MTV-2,2026.294654
113360,2020-07-17-US-MTV-2,1934.676643
108223,2021-04-29-US-SJC-2,1599.570433
83930,2020-05-29-US-MTV-1,1128.348831
113361,2020-07-17-US-MTV-2,1044.316856
54443,2021-01-05-US-SVL-1,653.703379
74448,2021-04-15-US-MTV-1,549.061548


# Correct outliers

In [69]:
def correct_outlier(df):
    """Replace isolated position spikes by the midpoint of their neighbours.

    For every row the great-circle distance to the previous row (`dist_pre`)
    and to the next row (`dist_pro`) is computed. A row is treated as an
    outlier when BOTH distances exceed their column's `mean + 2 * std`; its
    `latDeg` / `lngDeg` are then overwritten with the average of the rows
    directly before and after it. Rows are processed in order, so a corrected
    row is used as the "previous" neighbour of a flagged row that follows it.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs `latDeg` and `lngDeg`, plus a track identifier: the `phone`
        column if present, otherwise `collectionName` + '_' + `phoneName`.
        Rows of one track are expected to be consecutive and ordered in time.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. It is modified in place and also
        gains the helper columns `dist_pre`, `dist_pro`, `latDeg_pre`,
        `lngDeg_pre`, `latDeg_pro` and `lngDeg_pro`.
    """
    # Created first only so the helper columns keep their position in the
    # frame; the real values are assigned below.
    df['dist_pre'] = 0.0
    df['dist_pro'] = 0.0

    # Position of the previous / next row (0 where there is none).
    df['latDeg_pre'] = df['latDeg'].shift(periods=1, fill_value=0)
    df['lngDeg_pre'] = df['lngDeg'].shift(periods=1, fill_value=0)
    df['latDeg_pro'] = df['latDeg'].shift(periods=-1, fill_value=0)
    df['lngDeg_pro'] = df['lngDeg'].shift(periods=-1, fill_value=0)
    df['dist_pre'] = calc_haversine(df.latDeg_pre, df.lngDeg_pre, df.latDeg, df.lngDeg)
    df['dist_pro'] = calc_haversine(df.latDeg, df.lngDeg, df.latDeg_pro, df.lngDeg_pro)

    if 'phone' in df.columns:
        track = df['phone']
    else:
        track = df['collectionName'] + '_' + df['phoneName']

    # A row starts (ends) a track when the row before (after) it belongs to a
    # different track. Distances across such a boundary compare unrelated
    # positions, so they are zeroed. Detecting every boundary, rather than only
    # the first/last label per phone, also covers tracks that are not stored
    # contiguously, and the boolean masks work for any index.
    starts = track.ne(track.shift(1)).to_numpy()
    ends = track.ne(track.shift(-1)).to_numpy()
    df.loc[starts, 'dist_pre'] = 0
    df.loc[ends, 'dist_pro'] = 0

    pro_threshold = df['dist_pro'].mean() + (df['dist_pro'].std() * 2)
    pre_threshold = df['dist_pre'].mean() + (df['dist_pre'].std() * 2)
    flagged = ((df['dist_pro'] > pro_threshold) & (df['dist_pre'] > pre_threshold)).to_numpy()

    # Boundary rows carry a zero distance and the thresholds are non-negative,
    # so a flagged row always has both neighbours inside its own track and
    # `pos - 1` / `pos + 1` never leave the frame.
    positions = np.flatnonzero(flagged)
    if positions.size:
        # Work on positional arrays so the result does not depend on the
        # frame's index labels.
        lat = df['latDeg'].to_numpy(dtype=float, copy=True)
        lng = df['lngDeg'].to_numpy(dtype=float, copy=True)
        for pos in positions:
            lat[pos] = (lat[pos - 1] + lat[pos + 1]) / 2
            lng[pos] = (lng[pos - 1] + lng[pos + 1]) / 2
        df['latDeg'] = lat
        df['lngDeg'] = lng

    return df

# Kalman filter

In [70]:
from pathlib import Path
import numpy as np
import pandas as pd
import simdkalman
from tqdm.notebook import tqdm

# Time step of the motion model (presumably one second between fixes -- TODO confirm).
T = 1.0

# Constant-acceleration model. The state has six components: the two observed
# coordinates, followed by their first and second time derivatives.
state_transition = np.array([
    [1, 0, T, 0, 0.5 * T ** 2, 0],
    [0, 1, 0, T, 0, 0.5 * T ** 2],
    [0, 0, 1, 0, T, 0],
    [0, 0, 0, 1, 0, T],
    [0, 0, 0, 0, 1, 0],
    [0, 0, 0, 0, 0, 1],
])

# Per-component process noise on the diagonal plus a small uniform term.
process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.full((6, 6), 1e-9)

# Only the first two state components are observed.
observation_model = np.array([
    [1, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0],
])

# Measurement noise of the two observed coordinates plus a small uniform term.
observation_noise = np.diag([5e-5, 5e-5]) + np.full((2, 2), 1e-9)

# Filter instance built from the matrices defined above.
kf = simdkalman.KalmanFilter(
    state_transition=state_transition,
    process_noise=process_noise,
    observation_model=observation_model,
    observation_noise=observation_noise,
)

In [71]:
def apply_kf_smoothing(df, kf_=kf):
    """Smooth the `latDeg` / `lngDeg` columns of every track with a Kalman smoother.

    A track is one distinct (`collectionName`, `phoneName`) pair. Each track
    is passed to `kf_.smooth` on its own, and the first two components of the
    smoothed state mean replace the track's `latDeg` and `lngDeg`.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs `collectionName`, `phoneName`, `latDeg` and `lngDeg`.
        It is modified in place.
    kf_ : object with a `smooth` method, default: the module-level `kf`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with smoothed coordinates.
    """
    coord_cols = ['latDeg', 'lngDeg']
    track_keys = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()

    for collection, phone in tqdm(track_keys):
        on_track = (df['collectionName'] == collection) & (df['phoneName'] == phone)
        observations = df.loc[on_track, coord_cols].to_numpy()
        # The smoother is fed a batch holding this single series:
        # shape (1, number of rows, 2).
        batch = observations.reshape((1,) + observations.shape)
        result = kf_.smooth(batch)
        state_means = result.states.mean[0]
        for component, column in enumerate(coord_cols):
            df.loc[on_track, column] = state_means[:, component]

    return df

In [72]:
def percentile50(x):
    """Return the 50th percentile (median) of `x`."""
    median = np.percentile(x, q=50)
    return median
def percentile95(x):
    """Return the 95th percentile of `x`."""
    p95 = np.percentile(x, q=95)
    return p95

def get_train_score(df, gt):
    """Score predicted positions against ground truth.

    Predictions and ground truth are inner-joined on
    (`collectionName`, `phoneName`, `millisSinceGpsEpoch`), so rows without a
    partner on the other side are ignored. For every phone
    (`collectionName` + '_' + `phoneName`) the 50th and 95th percentile of the
    great-circle error are averaged; the score is the mean of these per-phone
    values.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictions with the three key columns plus `latDeg` and `lngDeg`.
        Not modified (the merge creates a new frame).
    gt : pandas.DataFrame
        Ground truth with the same key columns plus `latDeg` and `lngDeg`.

    Returns
    -------
    float
        Mean over phones of (P50 error + P95 error) / 2.
    """
    keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    gt = gt.rename(columns={'latDeg': 'latDeg_gt', 'lngDeg': 'lngDeg_gt'})
    df = df.merge(gt, on=keys, how='inner')

    # Distance error of every matched fix.
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])

    # Group by phone and take the 50th and 95th percentile of the error
    # between ground truth and prediction.
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95])

    # Per-phone mean of P50 and P95. The former intermediate column name
    # said "p90" although the 95th percentile is used.
    p50_p95_mean = (res['percentile50'] + res['percentile95']) / 2
    score = p50_p95_mean.mean()
    return score

In [73]:
# Both steps modify the frame in place and return it, so `df_basepreds`,
# `df_correct_outlier` and `kf_smoothed_baseline` all refer to the same object.
df_correct_outlier = df_basepreds.pipe(correct_outlier)
kf_smoothed_baseline = df_correct_outlier.pipe(apply_kf_smoothing)

# Training score of the corrected and smoothed baseline.
get_train_score(df=kf_smoothed_baseline, gt=df_truth)

  0%|          | 0/73 [00:00<?, ?it/s]

4.58290973646303