### Verifying differences between phoneName values

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import os
from glob import glob

import matplotlib.pyplot as plt
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name was removed in 3.8, so use whichever name this install provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

import folium
import branca

Read the data

In [None]:
# Load the baseline location estimates (train and test) and the sample submission
# in a single pass; each path is read with pandas' default CSV settings.
trainfile, testfile, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/google-smartphone-decimeter-challenge/baseline_locations_train.csv",
        "../input/google-smartphone-decimeter-challenge/baseline_locations_test.csv",
        "../input/google-smartphone-decimeter-challenge/sample_submission.csv",
    )
)

In [None]:
datapath = Path("../input/google-smartphone-decimeter-challenge")
# Materialise and sort the matches so that the progress-bar length comes from the
# data (instead of a hard-coded 73) and the read order is reproducible.
truths = sorted((datapath / 'train').rglob('ground_truth.csv'))
if not truths:
    # Fail with a clear message rather than pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"no ground_truth.csv found under {datapath / 'train'}")

# Only the join keys and the true coordinates are needed from each file.
cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']
truth_arr = [pd.read_csv(filepath, usecols=cols) for filepath in tqdm(truths)]

df_truth = pd.concat(truth_arr, ignore_index=True)
# Join baseline estimates to ground truth on (collection, phone, timestamp).
# The overlapping latDeg/lngDeg columns become *_current (baseline) and *_truth.
df_train = pd.merge(
    trainfile,
    df_truth,
    on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'],
    suffixes=("_current", "_truth"),
)

In [None]:
# Fixed seed so the preview shows the same three rows on every Restart & Run All.
df_train.sample(3, random_state=0)

Latitude / longitude difference and a simple squared distance (in degrees², not metres)

In [None]:
# Per-row error of the baseline estimate relative to ground truth (truth - current),
# plus the sum of the squared latitude and longitude differences as a rough error size.
df_train = df_train.assign(
    lat_dif=lambda d: d['latDeg_truth'] - d['latDeg_current'],
    lng_dif=lambda d: d['lngDeg_truth'] - d['lngDeg_current'],
    dummy_dist=lambda d: d['lat_dif']**2 + d['lng_dif']**2,
)

In [None]:
# Distinct phone names, in order of first appearance in df_train.
pnames = pd.unique(df_train['phoneName'])
pnames

In [None]:
# Select the two error columns *before* aggregating. Calling .std() on the whole
# frame also tries the string columns, which raises TypeError on pandas >= 2.0
# (numeric_only now defaults to False) and is wasted work on older versions.
df_train.groupby('phoneName')[['lat_dif', 'lng_dif']].std()

In [None]:
# One scatter panel per phone: longitude error on x, latitude error on y.
# The grid is sized from the number of phones instead of a fixed 3x3, so more
# than nine phones no longer raises IndexError and spare panels are hidden.
n_cols = 3
n_rows = max(1, -(-len(pnames) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5 * n_rows), squeeze=False)
ax = axes.flatten()
for i, pn in enumerate(pnames):
    df_g = df_train[df_train['phoneName']==pn]
    ax[i].scatter(df_g['lng_dif'], df_g['lat_dif'], s=20, c='blue', alpha=0.5)
    # Draw the axes through the origin so the sign of the error is easy to read.
    ax[i].spines['left'].set(position=('data', 0.0))
    ax[i].spines['bottom'].set(position=('data', 0.0))
    ax[i].set_title(pn)
    # Mi8 keeps matplotlib's automatic limits; every other phone gets a fixed window.
    if pn == 'Mi8': continue
    if pn == 'Pixel4':
        ax[i].set_xlim((-0.015, 0.015))
        ax[i].set_ylim((-0.015, 0.015))
    else:
        ax[i].set_xlim((-0.006, 0.003))
        ax[i].set_ylim((-0.006, 0.003))

# Hide grid cells that have no phone assigned to them.
for unused_ax in ax[len(pnames):]:
    unused_ax.set_visible(False)

plt.show()

In [None]:
# One histogram of dummy_dist per phone, with a log-scaled count axis.
# The grid is sized from the number of phones instead of a fixed 3x3, so more
# than nine phones no longer raises IndexError and spare panels are hidden.
n_cols = 3
n_rows = max(1, -(-len(pnames) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5 * n_rows), squeeze=False)
ax = axes.flatten()
for i, pn in enumerate(pnames):
    df_g = df_train[df_train['phoneName']==pn]
    ax[i].hist(df_g['dummy_dist'], bins=20)
    ax[i].set_yscale('log')
    ax[i].set_title(pn)

# Hide grid cells that have no phone assigned to them.
for unused_ax in ax[len(pnames):]:
    unused_ax.set_visible(False)

plt.show()

The variation of the Mi8 is remarkably large compared with the other phones.

### Check a sample collectionName

In [None]:
# Take the fifth distinct collection (index 4) as the worked example and keep
# only the columns needed to draw it on a map.
cname = df_train['collectionName'].unique()[4]
map_cols = ['phoneName','latDeg_current','lngDeg_current','latDeg_truth','lngDeg_truth','dummy_dist']
df_cn = df_train.loc[df_train['collectionName'] == cname, map_cols]
print('ex.', cname)

In [None]:
# Blue -> lime -> red colour scale over dummy_dist for the sample collection;
# the upper end is set to the 95th percentile of dummy_dist.
dist_q95 = np.quantile(df_cn['dummy_dist'], 0.95)
cm = branca.colormap.LinearColormap(colors=['blue','lime','red'], vmin=0, vmax=dist_q95)
cm

In [None]:
# Interactive map of the sample collection: one toggleable layer per phone, with
# ground-truth points coloured by that phone's dummy_dist.
center = df_cn[['latDeg_current','lngDeg_current']].mean().tolist()
m = folium.Map(location=center, zoom_start=10)
for pn, df_pn in df_cn.groupby('phoneName'):
    fg = folium.FeatureGroup(name=pn)
    # Per-phone colour scale, capped at that phone's 95th percentile. It uses its
    # own name so the notebook-level `cm` (the collection-wide scale) is not overwritten.
    phone_cm = branca.colormap.LinearColormap(['blue','lime','red'], vmin=0, vmax=np.quantile(df_pn['dummy_dist'], 0.95))
    # Iterate over named columns rather than unpacking df_pn.values positionally:
    # this no longer depends on df_cn's column order and does not re-bind `pn`.
    for latt, lont, dist in zip(df_pn['latDeg_truth'], df_pn['lngDeg_truth'], df_pn['dummy_dist']):
        folium.Circle(location=[latt, lont], radius=8, color=phone_cm(dist), fill=True).add_to(fg)
    # Label the layer with a marker at the phone's first baseline position.
    folium.Marker(location=df_pn[['latDeg_current','lngDeg_current']].iloc[0].tolist(), popup=pn).add_to(fg)
    fg.add_to(m)

folium.LayerControl(collapsed=False).add_to(m)
m

Although the amount of variation differs between phones, the points that do not deviate are similar across phones.