In [133]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px

pd.options.display.max_columns = 100
pd.options.display.max_rows = 20

In [145]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

def visualize_trafic(df, center={"lat":37.6458, "lon":-122.4056}, zoom=9):
    fig = px.scatter_mapbox(df,
                            # Here, plotly gets, (x,y) coordinates
                            lat="latDeg",
                            lon="lngDeg",
                            
                            #Here, plotly detects color of series
                            color="phone",
                            
                            labels="phone",
                            zoom=zoom,
                            center=center,
                            height=600,
                            width=800)
    fig.update_layout(mapbox_style='stamen-terrain')
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.update_layout(title_text="GPS trafic")
    fig.show()
    
def visualize_collection(df, collection):
    target_df = df[df['collectionName']==collection].copy()
    lat_center = target_df['latDeg'].mean()
    lng_center = target_df['lngDeg'].mean()
    center = {"lat":lat_center, "lon":lng_center}
    
    visualize_trafic(target_df, center)

def add_distance_diff(df):
    # shift(1): 上のやつが1個下に下がる → 前のデータ
    # shift(-1): 下のやつが1個上に上がる → 次のデータ
    df['latDeg_prev'] = df['latDeg'].shift(1)
    df['latDeg_next'] = df['latDeg'].shift(-1)
    df['lngDeg_prev'] = df['lngDeg'].shift(1)
    df['lngDeg_next'] = df['lngDeg'].shift(-1)
    df['phone_prev'] = df['phone'].shift(1)
    df['phone_next'] = df['phone'].shift(-1)
    
    df['latDeg_gt_prev'] = df['latDeg_gt'].shift(1)
    df['latDeg_gt_next'] = df['latDeg_gt'].shift(-1)
    df['lngDeg_gt_prev'] = df['lngDeg_gt'].shift(1)
    df['lngDeg_gt_next'] = df['lngDeg_gt'].shift(-1)
    
    df['latDeg_prev_diff'] = df['latDeg'] - df['latDeg_prev']
    df['latDeg_next_diff'] = df['latDeg_next'] - df['latDeg']
    df['latDeg_gt_prev_diff'] = df['latDeg_gt'] - df['latDeg_gt_prev']
    df['latDeg_gt_next_diff'] = df['latDeg_gt_next'] - df['latDeg_gt']
    
    df['lngDeg_prev_diff'] = df['lngDeg'] - df['lngDeg_prev']
    df['lngDeg_next_diff'] = df['lngDeg_next'] - df['lngDeg']
    df['lngDeg_gt_prev_diff'] = df['lngDeg_gt'] - df['lngDeg_gt_prev']
    df['lngDeg_gt_next_diff'] = df['lngDeg_gt_next'] - df['lngDeg_gt']
    
    df['dist_prev'] = calc_haversine(df['latDeg'], df['lngDeg'], df['latDeg_prev'], df['lngDeg_prev'])
    df['dist_next'] = calc_haversine(df['latDeg'], df['lngDeg'], df['latDeg_next'], df['lngDeg_next'])
    
    df['dist_gt_prev'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg_gt_prev'], df['lngDeg_gt_prev'])
    df['dist_gt_next'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg_gt_next'], df['lngDeg_gt_next'])
    
    df.loc[df['phone']!=df['phone_prev'], ['latDeg_prev', 'lngDeg_prev', 'dist_prev', 
                                           'latDeg_gt_prev', 'lngDeg_gt_prev', 'dist_gt_prev',
                                          'latDeg_prev_diff', 'latDeg_gt_prev_diff',
                                          'lngDeg_prev_diff', 'lngDeg_gt_prev_diff']] = np.nan
    
    df.loc[df['phone']!=df['phone_next'], ['latDeg_next', 'lngDeg_next', 'dist_next', 
                                           'latDeg_gt_next', 'lngDeg_gt_next', 'dist_gt_next',
                                          'latDeg_next_diff', 'latDeg_gt_next_diff',
                                          'lngDeg_next_diff', 'lngDeg_gt_next_diff']] = np.nan
    
    return df


def add_distance_nogt_diff(df):
    # shift(1): 上のやつが1個下に下がる → 前のデータ
    # shift(-1): 下のやつが1個上に上がる → 次のデータ
    df['latDeg_prev'] = df['latDeg'].shift(1)
    df['latDeg_next'] = df['latDeg'].shift(-1)
    df['lngDeg_prev'] = df['lngDeg'].shift(1)
    df['lngDeg_next'] = df['lngDeg'].shift(-1)
    df['phone_prev'] = df['phone'].shift(1)
    df['phone_next'] = df['phone'].shift(-1)
    
    df['latDeg_prev_diff'] = df['latDeg'] - df['latDeg_prev']
    df['latDeg_next_diff'] = df['latDeg_next'] - df['latDeg']
    
    df['lngDeg_prev_diff'] = df['lngDeg'] - df['lngDeg_prev']
    df['lngDeg_next_diff'] = df['lngDeg_next'] - df['lngDeg']

    
    df['dist_prev'] = calc_haversine(df['latDeg'], df['lngDeg'], df['latDeg_prev'], df['lngDeg_prev'])
    df['dist_next'] = calc_haversine(df['latDeg'], df['lngDeg'], df['latDeg_next'], df['lngDeg_next'])
    
    
    df.loc[df['phone']!=df['phone_prev'], ['latDeg_prev', 'lngDeg_prev', 'dist_prev', 
                                          'latDeg_prev_diff', 'lngDeg_prev_diff']] = np.nan
    
    df.loc[df['phone']!=df['phone_next'], ['latDeg_next', 'lngDeg_next', 'dist_next', 
                                           'latDeg_next_diff', 'lngDeg_next_diff']] = np.nan
    
    return df

def percentile50(x):
    return np.percentile(x, 50)
def percentile95(x):
    return np.percentile(x, 95)

def get_train_score(df, gt):
    gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    # df = df.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
    df = df.merge(gt, on=['phone', 'millisSinceGpsEpoch'], how='inner')
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # calc_evaluate_score
    # df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95]) # phoneによってgroupbyし、gtと予測値の差(err)の50%,95%値を求める
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2 
    score = res['p50_p90_mean'].mean()
    return score

def get_all_train_score(df):
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # calc_evaluate_score
    # df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95]) # phoneによってgroupbyし、gtと予測値の差(err)の50%,95%値を求める
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2 
    score = res['p50_p90_mean'].mean()
    return score

In [135]:
INPUT = '../input/google-smartphone-decimeter-challenge'

# filtered_train = pd.read_csv(INPUT + '/' + 'baseline_locations_filtered_train.csv')
filtered_train = pd.read_csv('../output/filtered_nb037.csv')
test = pd.read_csv(INPUT + '/' + 'baseline_locations_test.csv')
sample_sub = pd.read_csv(INPUT + '/' + 'sample_submission.csv')

# ground truth
p = pathlib.Path(INPUT)
gt_files = list(p.glob('train/*/*/ground_truth.csv'))

gts = []
for gt_file in gt_files:
    gts.append(pd.read_csv(gt_file))
ground_truth = pd.concat(gts)

In [136]:
ground_truth_ = ground_truth.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
ground_truth_['phone'] = ground_truth_['collectionName'] + '_' + ground_truth_['phoneName']
all_train = pd.merge(filtered_train, ground_truth_, on=['phone', 'millisSinceGpsEpoch'], how='left')
all_train = add_distance_diff(all_train)

In [137]:
visualize_trafic(all_train)

In [140]:
for idx, df in all_train.groupby('phone'):
    print(idx,get_all_train_score(df))
print()
print(get_all_train_score(all_train))

2020-05-14-US-MTV-1_Pixel4 1.849449478969058
2020-05-14-US-MTV-1_Pixel4XLModded 1.7175490520440424
2020-05-14-US-MTV-2_Pixel4 2.989916134654543
2020-05-14-US-MTV-2_Pixel4XLModded 3.5227205062994864
2020-05-21-US-MTV-1_Pixel4 3.2993641977298838
2020-05-21-US-MTV-2_Pixel4 2.210857597403672
2020-05-21-US-MTV-2_Pixel4XL 2.02230766214569
2020-05-29-US-MTV-1_Pixel4 2.2938083078323537
2020-05-29-US-MTV-1_Pixel4XL 2.1337499786373875
2020-05-29-US-MTV-1_Pixel4XLModded 2.0625640946499866
2020-05-29-US-MTV-2_Pixel4 2.755795977120524
2020-05-29-US-MTV-2_Pixel4XL 2.7649367777530784
2020-06-04-US-MTV-1_Pixel4 2.2335868708768807
2020-06-04-US-MTV-1_Pixel4XL 1.8503749594152414
2020-06-04-US-MTV-1_Pixel4XLModded 2.7780782294825173
2020-06-05-US-MTV-1_Pixel4 2.4344785644139066
2020-06-05-US-MTV-1_Pixel4XL 2.4453378498172635
2020-06-05-US-MTV-1_Pixel4XLModded 2.1831948639382883
2020-06-05-US-MTV-2_Pixel4 2.13955606891771
2020-06-05-US-MTV-2_Pixel4XL 2.0341382287369845
2020-06-11-US-MTV-1_Pixel4 2.1295869

In [114]:
tmp = all_train.copy()

In [146]:
def remove_lowSpeed(_df, dist_thr=0.4):
    df = _df.copy()

    
    df = add_distance_nogt_diff(df)

    _index = df[(df['dist_prev']<dist_thr) | (df['dist_next']<dist_thr)]['latDeg'].index
    df.loc[_index, 'latDeg'] = np.nan
    df.loc[_index, 'lngDeg'] = np.nan
    dfs = []
    for _, df in df.groupby('phone'):
        df = df.interpolate()
        dfs.append(df)
    df = pd.concat(dfs)
    return df[['phone','millisSinceGpsEpoch','latDeg','lngDeg']]

In [147]:
get_train_score(remove_lowSpeed(filtered_train, 0.09048945485730947), ground_truth)

KeyError: 'phone'

In [148]:
filtered_train

Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg
0,2020-05-14-US-MTV-1_Pixel4,1273529463442,37.423526,-122.093928
1,2020-05-14-US-MTV-1_Pixel4,1273529464442,37.423548,-122.094006
2,2020-05-14-US-MTV-1_Pixel4,1273529465442,37.423563,-122.094063
3,2020-05-14-US-MTV-1_Pixel4,1273529466442,37.423572,-122.094097
4,2020-05-14-US-MTV-1_Pixel4,1273529467442,37.423570,-122.094100
...,...,...,...,...
131337,2021-04-29-US-SJC-2_SamsungS20Ultra,1303760315000,37.334467,-121.899609
131338,2021-04-29-US-SJC-2_SamsungS20Ultra,1303760316000,37.334465,-121.899603
131339,2021-04-29-US-SJC-2_SamsungS20Ultra,1303760317000,37.334469,-121.899596
131340,2021-04-29-US-SJC-2_SamsungS20Ultra,1303760318000,37.334476,-121.899589


import optuna

def objective(trial):
    x = trial.suggest_uniform('x', 0, 1.4)
    score = get_train_score(remove_lowSpeed(all_train, x))
    print(x, score)
    return score

study = optuna.create_study()
study.optimize(objective, n_trials=100)
{'x': 0.09048945485730947}