In [2]:

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
from pathlib import Path
import plotly
import plotly.express as px


import os
# List every file available under the Kaggle input mount, one full path per line.
kaggle_input_files = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file in kaggle_input_files:
    print(input_file)



In [8]:
# Root folder of the competition data; every later read is relative to it.
data_path = Path("../google-smartphone-decimeter-challenge")

# Baseline test positions and the submission template, read in that order.
df_test, df_sub = (
    pd.read_csv(data_path / csv_name)
    for csv_name in ('baseline_locations_test.csv', 'sample_submission.csv')
)

In [9]:
# Columns kept from both the ground-truth files and the baseline predictions.
# The first three identify a single fix and are used as the merge key below.
cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg',
        'lngDeg']

# Materialise the matches so the progress bar gets its total from the data
# instead of a hard-coded count, and sort them so the row order of df_truth
# (and therefore df_all) does not depend on filesystem traversal order.
truths = sorted((data_path / 'train').rglob('ground_truth.csv'))
if not truths:
    # Fail with a clear message rather than pd.concat's "No objects to concatenate".
    raise FileNotFoundError(
        f"No ground_truth.csv files found under {data_path / 'train'}")

df_list = [pd.read_csv(t, usecols=cols) for t in tqdm(truths)]
df_truth = pd.concat(df_list, ignore_index=True)

# Pair every ground-truth fix with the baseline prediction for the same
# collection / phone / timestamp; fixes missing on either side are dropped.
df_basepreds = pd.read_csv(data_path / 'baseline_locations_train.csv', usecols=cols)
df_all = df_truth.merge(df_basepreds, how='inner', on=cols[:3], suffixes=('_truth', '_basepred'))


  0%|          | 0/73 [00:00<?, ?it/s]

In [10]:
def calc_haversine(lat1, lon1, lat2, lon2):
    
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

In [11]:
# Horizontal error of the baseline prediction against ground truth, per fix.
df_all['dist'] = calc_haversine(
    df_all['latDeg_truth'], df_all['lngDeg_truth'],
    df_all['latDeg_basepred'], df_all['lngDeg_basepred'],
)

# A notebook cell only renders its last expression, so the summary statistics
# have to be displayed explicitly or they are silently discarded.
display(df_all['dist'].describe())

# The ten worst baseline fixes and the collection each belongs to.
df_all.sort_values(by='dist', ascending=False)[['collectionName', 'dist']].head(10)

Unnamed: 0,collectionName,dist
64035,2020-09-04-US-SF-1,8340.257976
114354,2020-07-17-US-MTV-2,5050.995543
52894,2021-04-26-US-SVL-1,2254.344928
113362,2020-07-17-US-MTV-2,2026.294654
113360,2020-07-17-US-MTV-2,1934.676643
108223,2021-04-29-US-SJC-2,1599.570433
83930,2020-05-29-US-MTV-1,1128.348831
113361,2020-07-17-US-MTV-2,1044.316856
54443,2021-01-05-US-SVL-1,653.703379
74448,2021-04-15-US-MTV-1,549.061548


In [12]:
# For every test fix, compute the distance to the previous ("pre") and next
# ("pro") fix of the SAME phone. Shifting inside each phone group means a fix
# is never paired with a row from another phone or with a (0, 0) placeholder,
# and it does not require each phone's rows to be contiguous in the frame.
phone_coords = df_test.groupby('phone', sort=False)[['latDeg', 'lngDeg']]
prev_fix = phone_coords.shift(1)
next_fix = phone_coords.shift(-1)

df_test['latDeg_pre'] = prev_fix['latDeg']
df_test['lngDeg_pre'] = prev_fix['lngDeg']
df_test['latDeg_pro'] = next_fix['latDeg']
df_test['lngDeg_pro'] = next_fix['lngDeg']

# The first / last fix of a phone has no neighbour on one side, so the shifted
# coordinates are NaN there; those distances are defined as 0.
df_test['dist_pre'] = calc_haversine(
    df_test['latDeg_pre'], df_test['lngDeg_pre'],
    df_test['latDeg'], df_test['lngDeg'],
).fillna(0.0)
df_test['dist_pro'] = calc_haversine(
    df_test['latDeg'], df_test['lngDeg'],
    df_test['latDeg_pro'], df_test['lngDeg_pro'],
).fillna(0.0)

list_phone = df_test['phone'].unique()
df_test.dist_pre.describe()

count    91486.000000
mean        16.937410
std         12.526582
min          0.000000
25%          5.200745
50%         14.842604
75%         28.551707
max        391.394578
Name: dist_pre, dtype: float64

In [13]:
# Per-direction jump thresholds: mean plus two standard deviations of the
# distance to the previous / next fix.
pre_95, pro_95 = (
    df_test[dist_col].mean() + (df_test[dist_col].std() * 2)
    for dist_col in ('dist_pre', 'dist_pro')
)

# A fix is treated as a spike only when it is far from BOTH of its neighbours.
is_spike = (df_test['dist_pro'] > pro_95) & (df_test['dist_pre'] > pre_95)
ind = df_test.loc[is_spike].index

# Replace each spike with the midpoint of the rows directly before and after it.
# Rows are fixed one at a time, so an already-corrected row feeds into the
# midpoint of the next flagged row. The i-1 / i+1 label arithmetic assumes the
# frame still carries its default integer index -- TODO confirm.
for i in ind:
    for coord in ('latDeg', 'lngDeg'):
        df_test.loc[i, coord] = (df_test.loc[i - 1, coord] + df_test.loc[i + 1, coord]) / 2

In [14]:
!pip install simdkalman
import numpy as np
import pandas as pd
import simdkalman
from tqdm.notebook import tqdm

Collecting simdkalman
  Downloading simdkalman-1.0.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: simdkalman
Successfully installed simdkalman-1.0.2


In [15]:
# Time step between consecutive rows fed to the filter.
T = 1.0

# Six-component state: components 0/1 are the observed pair, 2/3 their first
# derivatives and 4/5 their second derivatives. One step advances
#   x[k] += T * x[k+2] + 0.5 * T**2 * x[k+4]
# which is the identity plus two shifted identities.
state_transition = (
    np.eye(6)
    + T * np.eye(6, k=2)
    + 0.5 * T ** 2 * np.eye(6, k=4)
)

# Diagonal variances plus a small constant added to every entry.
process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.full((6, 6), 1e-9)

# Only the first two state components are observed.
observation_model = np.eye(2, 6, dtype=int)
observation_noise = np.diag([5e-5, 5e-5]) + np.full((2, 2), 1e-9)

# Filter shared by the smoothing step; model matrices first, then their noise terms.
kf = simdkalman.KalmanFilter(
    state_transition=state_transition,
    observation_model=observation_model,
    process_noise=process_noise,
    observation_noise=observation_noise,
)

In [16]:
def apply_kf_smoothing(df, kf_=kf):
    """Smooth the latDeg / lngDeg track of every (collectionName, phoneName) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'collectionName', 'phoneName', 'latDeg' and 'lngDeg'.
        The two coordinate columns are overwritten in place.
    kf_ : object
        Filter exposing ``smooth(data)`` whose result provides ``states.mean``.
        Defaults to the module-level ``kf`` as it was when this function was defined.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after modification.
    """
    path_keys = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection_name, phone_name in tqdm(path_keys):
        on_path = (df['collectionName'] == collection_name) & (df['phoneName'] == phone_name)
        track = df.loc[on_path, ['latDeg', 'lngDeg']].to_numpy()
        # The filter is called with a leading axis of length one (a single track).
        result = kf_.smooth(track[np.newaxis, :, :])
        smoothed_mean = result.states.mean[0]
        # State components 0 and 1 are written back as latitude and longitude.
        df.loc[on_path, 'latDeg'] = smoothed_mean[:, 0]
        df.loc[on_path, 'lngDeg'] = smoothed_mean[:, 1]
    return df

In [17]:
# Smooth the (spike-corrected) test positions; df_test itself is updated in place.
kf_smoothed_baseline = apply_kf_smoothing(df_test)

# Copy the smoothed coordinates into the submission template. Values are
# matched to template rows by index label.
smoothed_coords = {
    coord: kf_smoothed_baseline[coord] for coord in ('latDeg', 'lngDeg')
}
df_sub = df_sub.assign(**smoothed_coords)
df_sub.to_csv('submission.csv', index=False)

  0%|          | 0/48 [00:00<?, ?it/s]