In [None]:
# import library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px

# utils

In [None]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth with the haversine formula.

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude / longitude of the first point(s), in decimal degrees.
    lat2, lon2 : scalar or array-like
        Latitude / longitude of the second point(s), in decimal degrees.

    Returns
    -------
    scalar or array-like
        Distance in the unit of ``RADIUS`` (metres). NaN inputs yield NaN.
    """
    RADIUS = 6_367_000  # sphere radius used for the earth, in metres
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.minimum(a, 1.0)
    dist = 2 * RADIUS * np.arcsin(a ** 0.5)
    return dist

In [None]:
def visualize_trafic(df, center, zoom=9, mapbox_style='stamen-terrain'):
    """Show the GPS points of ``df`` on an interactive map.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``latDeg``, ``lngDeg`` and ``phoneName``.
    center : dict
        Map centre, passed to plotly as-is (callers use
        ``{"lat": ..., "lon": ...}``).
    zoom : int, default 9
        Initial zoom level of the map.
    mapbox_style : str, default 'stamen-terrain'
        Base map style forwarded to ``fig.update_layout``.
        NOTE(review): the Stamen-hosted styles may need a Stadia Maps
        key in recent plotly versions; 'open-street-map' needs no token
        -- confirm for the environment in use.
    """
    # `labels="phoneName"` was dropped: plotly express expects a dict
    # there and silently ignored the string.
    fig = px.scatter_mapbox(df,

                            # Here, plotly gets, (x,y) coordinates
                            lat="latDeg",
                            lon="lngDeg",

                            # Here, plotly detects color of series
                            color="phoneName",

                            zoom=zoom,
                            center=center,
                            height=600,
                            width=800)
    fig.update_layout(mapbox_style=mapbox_style,
                      margin={"r": 0, "t": 0, "l": 0, "b": 0},
                      title_text="GPS trafic")
    fig.show()
    
def visualize_collection(df, collection):
    """Map all rows of one collection, centred on their mean position."""
    in_collection = df['collectionName'] == collection
    subset = df[in_collection].copy()

    # centre the map on the average coordinate of the selected rows
    center = {
        "lat": subset['latDeg'].mean(),
        "lon": subset['lngDeg'].mean(),
    }
    visualize_trafic(subset, center)

In [None]:
def add_distance_diff(df):
    """Add previous/next-row coordinates, their differences and the
    haversine step distances, for both baseline and ground truth.

    The frame is modified in place and also returned. Neighbours are
    taken by row position (``shift``); values whose neighbour row has a
    different ``phone`` are set to NaN afterwards.
    """
    # neighbouring values (row before / row after)
    for col in ('latDeg', 'lngDeg', 'phone'):
        df[f'{col}_prev'] = df[col].shift(1)
        df[f'{col}_next'] = df[col].shift(-1)

    for col in ('latDeg_gt', 'lngDeg_gt'):
        df[f'{col}_prev'] = df[col].shift(1)
        df[f'{col}_next'] = df[col].shift(-1)

    # coordinate deltas: current - previous, next - current
    for axis in ('latDeg', 'lngDeg'):
        for col in (axis, f'{axis}_gt'):
            df[f'{col}_prev_diff'] = df[col] - df[f'{col}_prev']
            df[f'{col}_next_diff'] = df[f'{col}_next'] - df[col]

    # step distances for baseline ('') and ground truth ('_gt')
    for suffix in ('', '_gt'):
        lat, lng = f'latDeg{suffix}', f'lngDeg{suffix}'
        for side in ('prev', 'next'):
            df[f'dist{suffix}_{side}'] = calc_haversine(
                df[lat], df[lng], df[f'{lat}_{side}'], df[f'{lng}_{side}'])

    # blank out everything computed against a row of another phone
    for side in ('prev', 'next'):
        other_phone = df['phone'] != df[f'phone_{side}']
        affected = [
            f'latDeg_{side}', f'lngDeg_{side}', f'dist_{side}',
            f'latDeg_gt_{side}', f'lngDeg_gt_{side}', f'dist_gt_{side}',
            f'latDeg_{side}_diff', f'latDeg_gt_{side}_diff',
            f'lngDeg_{side}_diff', f'lngDeg_gt_{side}_diff',
        ]
        df.loc[other_phone, affected] = np.nan

    return df

In [None]:
# directory setting
# Root folder of the competition data, given relative to the notebook's
# working directory.
INPUT = '../input/google-smartphone-decimeter-challenge'

In [None]:
# load the baseline location tables and the sample submission from INPUT
train = pd.read_csv(f'{INPUT}/baseline_locations_train.csv')
test = pd.read_csv(f'{INPUT}/baseline_locations_test.csv')
sample_sub = pd.read_csv(f'{INPUT}/sample_submission.csv')

In [None]:
# ground_truth
# Read every ground_truth.csv found two directory levels below train/
# and stack them into a single frame.
p = pathlib.Path(INPUT)
# sorted(): glob yields paths in filesystem order, which is not stable
# across machines; sorting makes the row order reproducible.
gt_files = sorted(p.glob('train/*/*/ground_truth.csv'))

gts = [pd.read_csv(gt_file) for gt_file in gt_files]
# ignore_index=True: each file is read with its own 0..n-1 index, so a
# plain concat would leave duplicated row labels.
ground_truth = pd.concat(gts, ignore_index=True)

# EDA

In [None]:
# preparing data for viz
# Build a 'phone' key (collectionName + '_' + phoneName) on a copy of the
# ground truth and tag its phoneName with '_GT', so ground-truth points
# are coloured separately from the baseline on the map.
tmp1 = ground_truth.copy()
tmp1['phone'] = tmp1['collectionName'] + '_' + tmp1['phoneName']
tmp1['phoneName'] = tmp1['phoneName'] + '_GT'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
# frames the same way (row labels kept, columns aligned by name).
tmp = pd.concat([train, tmp1])

In [None]:
# baseline and ground-truth points of one phone on the map
visualize_trafic(
    tmp.loc[tmp['phone'] == '2020-05-14-US-MTV-1_Pixel4XLModded'],
    center=dict(lat=37.6458, lon=-122.4056),
    zoom=19,
)

In [None]:
# baseline and ground-truth points of a second phone on the map
visualize_trafic(
    tmp.loc[tmp['phone'] == '2020-06-04-US-MTV-1_Pixel4'],
    center=dict(lat=37.41634, lon=-122.0805),
    zoom=19,
)

As you can see in the maps above, the baseline coordinates vary widely  
even though the car is stopped at the start or end of the trip and the ground_truth coordinates do not change.

In [None]:
# suffix the ground-truth position columns with _gt so they do not
# collide with the baseline columns of the same name in the merge
ground_truth = ground_truth.rename(
    columns={
        'latDeg': 'latDeg_gt',
        'lngDeg': 'lngDeg_gt',
        'heightAboveWgs84EllipsoidM': 'heightAboveWgs84EllipsoidM_gt',
    }
)
train = train.merge(
    ground_truth,
    how='inner',
    on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'],
)
# distance between baseline estimate and ground truth
train['dist_err'] = calc_haversine(
    train['latDeg_gt'], train['lngDeg_gt'], train['latDeg'], train['lngDeg'])
train = add_distance_diff(train)

In [None]:
# histogram of train['speedMps']; plt.title labels the axes that
# .hist() has just drawn on
train['speedMps'].hist()
plt.title('Distribution of "speedMps"')

The speed values in the train data come from ground_truth.  
Looking at this distribution, we can see that there are many records with speed = 0.

In [None]:
# label each row as stopped / moving; rows matching neither test
# (e.g. missing speed) are left without a label
train.loc[train['speedMps'].eq(0.0), 'speed0'] = 'speed = 0'
train.loc[train['speedMps'].gt(0.0), 'speed0'] = 'speed > 0'

In [None]:
# baseline error for stopped vs. moving rows; showfliers=False leaves the
# outlier markers out of the plot
sns.boxplot(x='speed0', y='dist_err', data=train, showfliers = False)

We just looked at a few examples, but even when looking at the entire train data,  
the error appears to be large when the car is stopped.

In [None]:
def visualize_err_move_dist(df, phone, reject_outlier=True):
    '''
    Plot, for a single phone, the baseline error (upper panel) and the
    speed together with the per-row move distances of baseline and
    ground truth (lower panel) over millisSinceGpsEpoch.

    With reject_outlier=True, rows whose dist_err is not below
    mean + 3 * std of that phone are dropped before plotting.
    '''
    fig, (ax_err, ax_move) = plt.subplots(figsize=(20, 10), nrows=2, sharex=True)
    phone_df = df[df['phone'] == phone]
    if reject_outlier:
        err = phone_df['dist_err']
        th = (err.std() * 3) + err.mean()
        phone_df = phone_df[phone_df['dist_err'] < th]

    t = phone_df['millisSinceGpsEpoch']
    ax_err.plot(t, phone_df['dist_err'], label='err(baseline)')

    move_series = [
        ('speedMps', 'speedMps'),
        ('dist_prev', 'move dist(baseline)'),
        ('dist_gt_prev', 'move dist(ground_truth)'),
    ]
    for col, label in move_series:
        ax_move.plot(t, phone_df[col], label=label)

    for ax in (ax_err, ax_move):
        ax.legend(loc='upper right')
        ax.grid(color='g', linestyle=':', linewidth=0.3)
    fig.suptitle(phone, fontsize=16)

In [None]:
# time series for the first phone inspected on the map
visualize_err_move_dist(train, phone='2020-05-14-US-MTV-1_Pixel4XLModded')

Let's check the time series of baseline error and move distance  
for the example we just checked on the map.  

As you can see in the graph below,  
the baseline is moving a lot while it is actually stopped at the start and end points.  

I am not sure about the cause,  
but I think an approach that reduces the error while the car is stopped may be effective.

(supplement)  
move_dist(ground_truth) and speedMps are almost identical.   
Is the speed being calculated based on the coordinates?

View all phone results below

In [None]:
# one error / move-distance figure per phone
phones = train['phone'].unique()
for phone in phones:
    visualize_err_move_dist(train, phone)
    # Render the figure now instead of keeping every figure open until
    # the cell finishes; this avoids matplotlib's "more than 20 figures
    # opened" warning and the memory build-up.
    plt.show()