In [1]:
import pandas as pd
import pathlib
from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Baseline location CSVs (train and test splits) from the competition input
# directory; paths are relative to the notebook's working directory.
TRAIN_BASELINE_CSV = "../input/google-smartphone-decimeter-challenge/baseline_locations_train.csv"
TEST_BASELINE_CSV = "../input/google-smartphone-decimeter-challenge/baseline_locations_test.csv"

df_train = pd.read_csv(TRAIN_BASELINE_CSV)
df_test = pd.read_csv(TEST_BASELINE_CSV)

In [3]:
# Distinct collection names in order of first appearance in df_train. The
# scene subsets built from this array select entries by position, so this
# order matters.
train_collectionName = pd.unique(df_train["collectionName"])

In [4]:
# "Highway" subset: rows whose collection is one of the first 21 entries of
# train_collectionName. The selection is positional, so it depends on the
# order in which collections first appear in df_train.
# A slice replaces 21 hand-written index lookups (0..20); unlike explicit
# indexing it does not raise if fewer than 21 collections are present.
N_HIGHWAY_COLLECTIONS = 21
highway_collections = train_collectionName[:N_HIGHWAY_COLLECTIONS]
df_train_highway = df_train[df_train['collectionName'].isin(highway_collections)]

In [5]:
# "Tree" subset: rows whose collection sits at position 21, 22, 24, 25 or 27
# of train_collectionName (positional, hence dependent on that array's order).
tree_mask = df_train['collectionName'].isin(train_collectionName[[21, 22, 24, 25, 27]])
df_train_tree = df_train[tree_mask]

In [6]:
# "Downtown" subset: rows whose collection sits at position 23, 26 or 28 of
# train_collectionName (positional, hence dependent on that array's order).
downtown_mask = df_train['collectionName'].isin(train_collectionName[[23, 26, 28]])
df_train_downtown = df_train[downtown_mask]

In [7]:
# Ground truth: collect every ground_truth.csv found two directory levels
# below train/ and stack them into a single frame.
p = pathlib.Path("../input/google-smartphone-decimeter-challenge")
gt_files = list(p.glob('train/*/*/ground_truth.csv'))
print('ground_truth.csv count : ', len(gt_files))

# One DataFrame per file; tqdm only adds a progress bar around the iteration.
gts = [pd.read_csv(gt_file) for gt_file in tqdm(gt_files)]
# Plain concat: each file's original row index is kept, so index labels repeat.
ground_truth = pd.concat(gts)

ground_truth.csv count :  73


  0%|          | 0/73 [00:00<?, ?it/s]

In [8]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points on the earth (haversine formula).

    All four inputs are array-like (or scalars) given in decimal degrees and
    are combined element-wise. The result is expressed in the same unit as
    the sphere radius used below (6,367,000 -- presumably metres).
    """
    RADIUS = 6_367_000
    # Work in radians from here on.
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2).
    hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    return 2 * RADIUS * np.arcsin(hav**0.5)

In [9]:
def percentile50(x):
    """Return the 50th percentile of ``x`` as computed by ``np.percentile``.

    The function name is significant: when passed to ``DataFrame.agg`` it
    becomes the name of the resulting column.
    """
    return np.percentile(x, q=50)
def percentile95(x):
    """Return the 95th percentile of ``x`` as computed by ``np.percentile``.

    The function name is significant: when passed to ``DataFrame.agg`` it
    becomes the name of the resulting column.
    """
    return np.percentile(x, q=95)


In [10]:
def get_train_score(df, gt):
    """Score estimated positions against ground truth.

    Parameters
    ----------
    df : pandas.DataFrame
        Estimated positions. Must contain 'collectionName', 'phoneName',
        'millisSinceGpsEpoch', 'latDeg' and 'lngDeg'.
    gt : pandas.DataFrame
        Ground-truth positions with the same key columns plus 'latDeg' and
        'lngDeg'. Neither input frame is modified.

    Returns
    -------
    score : float
        For each phone (collectionName + '_' + phoneName) the 50th and 95th
        percentiles of the distance error are averaged; ``score`` is the mean
        of those per-phone values.
    df : pandas.DataFrame
        The inner-merged frame, with an added 'err' column (output of
        ``calc_haversine``) and a 'phone' column.
    """
    # Rename the ground-truth coordinates so they do not collide with the
    # estimate's latDeg/lngDeg in the merge.
    gt = gt.rename(columns={'latDeg': 'latDeg_gt', 'lngDeg': 'lngDeg_gt'})
    # Inner join: rows without a matching ground-truth sample are dropped.
    df = df.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
    # Per-row distance between ground truth and estimate.
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # One group per (collection, phone) pair.
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95])
    # Column name now matches what is computed (the original label said "p90"
    # although the 95th percentile is used).
    res['p50_p95_mean'] = (res['percentile50'] + res['percentile95']) / 2
    score = res['p50_p95_mean'].mean()
    return score, df

In [11]:
# Score each scene subset against the ground truth, keeping the merged frame
# that get_train_score returns alongside each score.
score_highway, df_highway = get_train_score(df=df_train_highway, gt=ground_truth)
score_tree, df_tree = get_train_score(df=df_train_tree, gt=ground_truth)
score_downtown, df_downtown = get_train_score(df=df_train_downtown, gt=ground_truth)

for _label, _score in (("highway :", score_highway),
                       ('tree : ', score_tree),
                       ('downtown : ', score_downtown)):
    print(_label, _score)

highway : 3.452807389502441
tree :  6.173261717576203
downtown :  19.432900281799608


In [13]:
# Collections that ended up in the highway subset.
pd.unique(df_train_highway['collectionName'])

array(['2020-05-14-US-MTV-1', '2020-05-14-US-MTV-2',
       '2020-05-21-US-MTV-1', '2020-05-21-US-MTV-2',
       '2020-05-29-US-MTV-1', '2020-05-29-US-MTV-2',
       '2020-06-04-US-MTV-1', '2020-06-05-US-MTV-1',
       '2020-06-05-US-MTV-2', '2020-06-11-US-MTV-1',
       '2020-07-08-US-MTV-1', '2020-07-17-US-MTV-1',
       '2020-07-17-US-MTV-2', '2020-08-03-US-MTV-1',
       '2020-08-06-US-MTV-2', '2020-09-04-US-SF-1', '2020-09-04-US-SF-2',
       '2021-01-04-US-RWC-1', '2021-01-04-US-RWC-2',
       '2021-01-05-US-SVL-1', '2021-01-05-US-SVL-2'], dtype=object)

In [14]:
# Collections that ended up in the downtown subset.
pd.unique(df_train_downtown['collectionName'])

array(['2021-04-22-US-SJC-1', '2021-04-28-US-SJC-1',
       '2021-04-29-US-SJC-2'], dtype=object)

In [15]:
# Collections that ended up in the tree subset.
pd.unique(df_train_tree['collectionName'])

array(['2021-03-10-US-SVL-1', '2021-04-15-US-MTV-1',
       '2021-04-26-US-SVL-1', '2021-04-28-US-MTV-1',
       '2021-04-29-US-MTV-1'], dtype=object)

In [17]:
# Show the baseline training frame; as the cell's last expression it is
# rendered by the notebook.
df_train

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,37.423579,-122.094114,-34.49,2020-05-14-US-MTV-1_Pixel4
...,...,...,...,...,...,...,...
131337,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760315000,37.334460,-121.899600,-8.09,2021-04-29-US-SJC-2_SamsungS20Ultra
131338,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760316000,37.334472,-121.899583,-7.59,2021-04-29-US-SJC-2_SamsungS20Ultra
131339,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760317000,37.334491,-121.899597,-8.35,2021-04-29-US-SJC-2_SamsungS20Ultra
131340,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760318000,37.334495,-121.899583,-8.73,2021-04-29-US-SJC-2_SamsungS20Ultra
