In [None]:
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px
import math

In [None]:
# Root folder of the challenge dataset; every other input path is derived from it.
data_dir = Path("../input/google-smartphone-decimeter-challenge")

# Baseline location estimates for the training collections.
baseline_csv = data_dir / "baseline_locations_train.csv"
train_df = pd.read_csv(baseline_csv)

In [None]:
# get all ground truth dataframe
# Read one ground_truth.csv per (collection, phone) pair found in the baseline
# file. The frames are gathered in a list and concatenated once: concatenating
# inside the loop re-copies the accumulated frame on every iteration.
gt_frames = []
for (collection_name, phone_name), _ in tqdm(train_df.groupby(["collectionName", "phoneName"])):
    path = data_dir / f"train/{collection_name}/{phone_name}/ground_truth.csv"
    gt_frames.append(pd.read_csv(path))
gt_df = pd.concat(gt_frames, ignore_index=True)

# Single key identifying a drive/phone combination.
gt_df['phone'] = gt_df['collectionName'] + '_' + gt_df['phoneName']

# Independent copy of the baseline frame, so later edits to `train_data` do not
# touch `train_df` (same content as re-reading baseline_locations_train.csv).
train_data = train_df.copy()

In [None]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance between two sets of points.

    All four inputs are in decimal degrees and may be scalars or array-likes
    (they are passed straight to ``np.radians``). The result is expressed in
    the unit of ``RADIUS`` below, i.e. metres for a 6,367 km sphere.
    """
    RADIUS = 6_367_000
    phi1, lam1, phi2, lam2 = (
        np.radians(angle) for angle in (lat1, lon1, lat2, lon2)
    )
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    lat_term = np.sin(delta_phi / 2) ** 2
    lon_term = np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam / 2) ** 2
    hav = lat_term + lon_term
    return 2 * RADIUS * np.arcsin(hav ** 0.5)

def percentile50(x):
    """Return the 50th percentile (median) of ``x`` via ``np.percentile``."""
    median = np.percentile(x, q=50)
    return median
def percentile95(x):
    """Return the 95th percentile of ``x`` via ``np.percentile``."""
    upper_tail = np.percentile(x, q=95)
    return upper_tail

def get_train_score(df, gt):
    """Score predicted positions against ground truth.

    For every phone (``collectionName`` + '_' + ``phoneName``) the haversine
    error of each matched sample is computed, the 50th and 95th percentile of
    those errors are averaged, and the mean of that per-phone value over all
    phones is returned.

    Parameters
    ----------
    df : DataFrame with ``collectionName``, ``phoneName``,
        ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg`` (predictions).
    gt : DataFrame with the same five columns holding the true positions.

    Returns
    -------
    The mean over phones of (p50 + p95) / 2 of the distance error, in the
    unit returned by ``calc_haversine``.
    """
    key_cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    # Keep only what the join needs, so unrelated columns shared by both frames
    # are not duplicated with _x/_y suffixes in the merged frame.
    gt = gt[key_cols + ['latDeg', 'lngDeg']].rename(
        columns={'latDeg': 'latDeg_gt', 'lngDeg': 'lngDeg_gt'}
    )
    # Inner join: samples without a ground-truth row at the same epoch are dropped.
    df = df.merge(gt, on=key_cols, how='inner')
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # calc_evaluate_score
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95])
    # Labelled p95 to match what is actually averaged (the old label said p90).
    res['p50_p95_mean'] = (res['percentile50'] + res['percentile95']) / 2
    score = res['p50_p95_mean'].mean()
    return score

In [None]:
# Tag each frame with a numeric source flag in column `gt`:
# 1 marks ground-truth rows, 0 marks baseline rows.
gt_df = gt_df.assign(gt=1)
train_data = train_data.assign(gt=0)

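Now let us introduce some helper functions that convert the raw coordinate values into decimal-degree lat/lng (the helpers below split a packed degrees-and-minutes value into its two parts and then combine them). For more information, look up how to convert degrees-minutes-seconds coordinate pairs to decimal degrees.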

In [None]:
def dm(x):
    """Split a packed ``dddmm.mmmm`` value into ``(degrees, minutes)``.

    ``x`` may be a scalar, NumPy array or pandas Series. The hundreds and above
    form the whole degrees; the remainder is the minutes.

    The split is sign-aware: for a negative ``x`` both parts come back
    negative (``-12230.5`` -> ``(-122, -30.5)``), so ``degrees + minutes / 60``
    is still the correct signed angle. Plain floor division would give
    ``(-123, 69.5)`` here. Non-negative inputs give the same values as before.
    """
    # Truncate toward zero rather than toward -inf.
    degrees = np.sign(x) * (np.abs(x) // 100)
    minutes = x - 100 * degrees

    return degrees, minutes

def decimal_degrees(degrees, minutes):
    """Combine whole degrees and minutes into decimal degrees.

    One degree is 60 minutes, so the minutes contribute ``minutes / 60``.
    """
    fractional_part = minutes / 60
    return degrees + fractional_part

In [None]:
data_path = Path("../input/android-smartphones-high-accuracy-datasets/training")

# rglob returns a generator; materialise and sort it so the file order is
# reproducible and tqdm can derive the real total instead of a hard-coded count.
truths = sorted(data_path.rglob('*chipset.nmea'))
if not truths:
    raise FileNotFoundError(f"no '*chipset.nmea' files found under {data_path}")

df_list = []

for t in tqdm(truths):

    # Comma-separated sentences without a header row. `comment='*'` makes pandas
    # ignore everything from '*' to the end of each line.
    df_phone = pd.read_table(t, sep=',', header=None, index_col=1, parse_dates=True, comment='*')

    # Remember which file every row came from.
    df_phone['phone'] = t
    print(t)

    df_list.append(df_phone)

df_truth = pd.concat(df_list, ignore_index=True)

#The next steps are just input data processing.
#To understand it better, you may want to run it line by line and compare with the previous line.

# Columns 2 and 4 are used as the packed latitude / longitude values.
# NOTE(review): presumably these are the lat/lon fields of the NMEA fix
# sentences in these files - confirm against a sample file.
df_truth = df_truth[[2, 4, 'phone']].rename(columns={2: 'latDeg', 4: 'lngDeg'})

# Drop rows whose column 2 is the literal 'A' (not a coordinate). `.copy()`
# makes the result an independent frame so the assignments below do not raise
# SettingWithCopyWarning.
df_truth = df_truth[df_truth.latDeg != 'A'].copy()

# Parse as float64. Downcasting to float32 keeps only ~7 significant digits,
# which for a packed value around 12 000 is a resolution of about 0.001
# arc-minute, i.e. metre-level error in the final position.
df_truth["latDeg"] = pd.to_numeric(df_truth["latDeg"])
df_truth["lngDeg"] = pd.to_numeric(df_truth["lngDeg"])
df_truth['latDeg'] = decimal_degrees(*dm(df_truth['latDeg']))
# Longitude is negated unconditionally.
# NOTE(review): this assumes every fix is in the western hemisphere (and every
# latitude in the northern one) - confirm, or use the hemisphere fields.
df_truth['lngDeg'] = 0 - decimal_degrees(*dm(df_truth['lngDeg']))

# Source flag for the externally loaded NMEA data.
df_truth['gt'] = 2

df_truth

Now let us check that the newly added data is indeed new and wasn't already included in the basic training datasets.

In [None]:
# Stack the external NMEA positions on top of the challenge ground truth so
# both sources can be drawn on one map; the row index is rebuilt from 0.
ccc = pd.concat(
    objs=[df_truth, gt_df],
    ignore_index=True,
)

In [None]:
# Scatter every position of `ccc` on a map, coloured by its source flag `gt`.
fig = px.scatter_mapbox(
    ccc,
    # Marker coordinates.
    lat="latDeg",
    lon="lngDeg",
    # Column that drives the marker colour.
    color="gt",
    # Initial viewport.
    zoom=9,
    center={"lat": 37.423576, "lon": -122.094132},
    height=600,
    width=800,
)
# One layout update instead of three separate calls; the title typo
# ("trafic") is corrected.
fig.update_layout(
    mapbox_style='open-street-map',
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    title_text="GPS traffic",
)
fig.show()