This sample notebook demonstrates how to apply a Kalman filter to slightly improve on the baseline.

The notebook is based on https://www.kaggle.com/jpmiller/baseline-from-host-data

## ensure you have everything you need

In [None]:
!pip install simdkalman

Please read the documentation if you would like to learn more about this Kalman filter (KF) implementation: https://simdkalman.readthedocs.io/en/latest/

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import simdkalman
from tqdm.notebook import tqdm

## define kf model


In [None]:
# Constant-acceleration model for two coordinates.  Judging by the matrices,
# the six state components are [p0, p1, v0, v1, a0, a1]: each position advances
# by T * velocity + 0.5 * T**2 * acceleration and each velocity by T * acceleration.
T = 1.0  # step between consecutive samples (assumes evenly spaced rows - TODO confirm)
state_transition = (
    np.eye(6)                          # every component carries over to the next step
    + T * np.eye(6, k=2)               # p += T * v   and   v += T * a
    + 0.5 * T ** 2 * np.eye(6, k=4)    # p += 0.5 * T^2 * a
)
# Diagonal noise terms plus a constant 1e-9 added to every entry.
process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.full((6, 6), 1e-9)
# Only the first two state components (the positions) are observed.
observation_model = np.eye(2, 6, dtype=int)
observation_noise = np.diag([5e-5, 5e-5]) + np.full((2, 2), 1e-9)

kf = simdkalman.KalmanFilter(
    state_transition=state_transition,
    process_noise=process_noise,
    observation_model=observation_model,
    observation_noise=observation_noise,
)

In [None]:
def apply_kf_smoothing(df, kf_=kf):
    """Smooth the latDeg/lngDeg values of every (collectionName, phoneName) group.

    Each group's positions are handed to ``kf_.smooth`` as one series, and the
    first two components of the smoothed state mean are written back over the
    group's ``latDeg`` and ``lngDeg`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``collectionName``, ``phoneName``, ``latDeg``
        and ``lngDeg``.  It is modified in place.
    kf_ : object providing ``smooth``, optional
        Defaults to the module-level ``kf``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    group_keys = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection_name, phone_name in tqdm(group_keys):
        in_group = (df['collectionName'] == collection_name) & (df['phoneName'] == phone_name)
        track = df.loc[in_group, ['latDeg', 'lngDeg']].to_numpy()
        # Add a leading axis of length 1 so the input has shape (1, n_rows, 2).
        state_mean = kf_.smooth(track[np.newaxis, :, :]).states.mean[0]
        df.loc[in_group, 'latDeg'] = state_mean[:, 0]
        df.loc[in_group, 'lngDeg'] = state_mean[:, 1]
    return df

## evaluate train error

In [None]:
data_path = Path("../input/google-smartphone-decimeter-challenge")
cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']

# rglob returns a generator; materialise and sort it so the progress bar gets
# the real file count (instead of a hard-coded total) and the row order does
# not depend on filesystem traversal order.
truths = sorted((data_path / 'train').rglob('ground_truth.csv'))
if not truths:
    raise FileNotFoundError(f"no ground_truth.csv files found under {data_path / 'train'}")

df_list = [pd.read_csv(t, usecols=cols) for t in tqdm(truths)]
df_truth = pd.concat(df_list, ignore_index=True)

# Smooth the baseline train predictions and join them to the ground truth on
# (collectionName, phoneName, millisSinceGpsEpoch).  The following cells read
# df_all, so it has to be created here for a fresh-kernel run to succeed.
df_basepreds_kf = apply_kf_smoothing(pd.read_csv(data_path / 'baseline_locations_train.csv', usecols=cols))
df_all = df_truth.merge(df_basepreds_kf, how='inner', on=cols[:3], suffixes=('_truth', '_basepred'))

In [None]:
# simplified haversine distance
def calc_haversine(lat1, lon1, lat2, lon2, earth_radius_m=6_367_000):
    """Great-circle distance between two points on a sphere (haversine formula).

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : scalar or array-like
        Coordinates of the two points in decimal degrees.  Array-like inputs
        are combined element-wise.
    earth_radius_m : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6 367 000.

    Returns
    -------
    Same kind as the inputs (scalar, ndarray or Series)
        Distance along the sphere between each pair of points.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally above 1 for (near-)antipodal points, which
    # would make arcsin return NaN; clamp it to the valid domain first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return earth_radius_m * c

In [None]:
# Per-row distance between the ground-truth (*_truth) and predicted
# (*_basepred) coordinates, stored as a new 'dist' column on df_all.
df_all['dist'] = calc_haversine(
    df_all['latDeg_truth'],
    df_all['lngDeg_truth'],
    df_all['latDeg_basepred'],
    df_all['lngDeg_basepred'],
)

print(f'mean error on train dataset: {df_all.dist.mean():.3f}m - slightly better than the baseline, but still a lot of improvements are needed')


In [None]:
# Score = mean of the median and the 95th-percentile distance.
res = df_all['dist'].quantile([0.5, 0.95]).to_numpy()
print(f'score on train dataset: {0.5*(res[0] + res[1])}, 0.5: {res[0]}, 0.95: {res[1]}')


In [None]:
# Disabled experiment: the whole cell is one bare string literal, so none of the
# code inside it runs (as the cell's last expression the text is only echoed).
# The text redefines apply_kf_smoothing to subtract i / 100 times smoothed state
# components 2 and 3 from the smoothed latDeg / lngDeg, then sweeps i over
# 0..99 and prints the train score.
# NOTE(review): best_score is never updated inside that loop, so every i with a
# score below 9999 would be printed.
"""
def apply_kf_smoothing(df, i, kf_=kf):
    unique_paths = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in tqdm(unique_paths):
        cond = np.logical_and(df['collectionName'] == collection, df['phoneName'] == phone)
        data = df[cond][['latDeg', 'lngDeg']].to_numpy()
        data = data.reshape(1, len(data), 2)
        smoothed = kf_.smooth(data)
        df.loc[cond, 'latDeg'] = smoothed.states.mean[0, :, 0] - smoothed.states.mean[0, :, 2] * i / 100
        df.loc[cond, 'lngDeg'] = smoothed.states.mean[0, :, 1] - smoothed.states.mean[0, :, 3] * i / 100
    return df

best_score = 9999

for i in np.arange(100):
    df_basepreds_kf = apply_kf_smoothing(pd.read_csv('../input/google-smartphone-decimeter-challenge/baseline_locations_train.csv', usecols=cols),
                                         i=i)
    df_all = df_truth.merge(df_basepreds_kf, how='inner', on=cols[:3], suffixes=('_truth', '_basepred'))
    df_all['dist'] = calc_haversine(df_all.latDeg_truth, df_all.lngDeg_truth, 
        df_all.latDeg_basepred, df_all.lngDeg_basepred)
    res = df_all.dist.quantile([0.5, 0.95]).to_numpy()
    score = 0.5*(res[0] + res[1])
    if score < best_score:
        print(f'i: {i} score on train dataset: {score}, 0.5: {res[0]}, 0.95: {res[1]}')
    #print(f'score on train dataset: {df_all.dist.mean():.3f}m - slightly better than the baseline, but still a lot of improvements are needed')
"""

In [None]:
# Disabled experiment: the whole cell is one bare string literal, so none of the
# code inside it runs (as the cell's last expression the text is only echoed).
# The text sweeps i over 0..99 with the shift factor (0.01 + 0.2 * i / 100)
# applied to smoothed state components 2 and 3, and prints the train score.
# NOTE(review): best_score is never updated inside that loop.
"""
def apply_kf_smoothing(df, i, kf_=kf):
    unique_paths = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in tqdm(unique_paths):
        cond = np.logical_and(df['collectionName'] == collection, df['phoneName'] == phone)
        data = df[cond][['latDeg', 'lngDeg']].to_numpy()
        data = data.reshape(1, len(data), 2)
        smoothed = kf_.smooth(data)
        df.loc[cond, 'latDeg'] = smoothed.states.mean[0, :, 0] - smoothed.states.mean[0, :, 2] * (0.01 + 0.2 * i / 100)
        df.loc[cond, 'lngDeg'] = smoothed.states.mean[0, :, 1] - smoothed.states.mean[0, :, 3] * (0.01 + 0.2 * i / 100)
    return df

best_score = 9999

for i in np.arange(100):
    df_basepreds_kf = apply_kf_smoothing(pd.read_csv('../input/google-smartphone-decimeter-challenge/baseline_locations_train.csv', usecols=cols),
                                         i=i)
    df_all = df_truth.merge(df_basepreds_kf, how='inner', on=cols[:3], suffixes=('_truth', '_basepred'))
    df_all['dist'] = calc_haversine(df_all.latDeg_truth, df_all.lngDeg_truth, 
        df_all.latDeg_basepred, df_all.lngDeg_basepred)
    res = df_all.dist.quantile([0.5, 0.95]).to_numpy()
    score = 0.5*(res[0] + res[1])
    if score < best_score:
        print(f'i: {i} score on train dataset: {score}, 0.5: {res[0]}, 0.95: {res[1]}')
    #print(f'score on train dataset: {df_all.dist.mean():.3f}m - slightly better than the baseline, but still a lot of improvements are needed')
"""

In [None]:
# Disabled experiment: the whole cell is one bare string literal, so none of the
# code inside it runs (as the cell's last expression the text is only echoed).
# The text sweeps i over 0..99 with the shift factor (0.012 + 0.05 * i / 100)
# applied to smoothed state components 2 and 3, and prints the train score.
# NOTE(review): best_score is never updated inside that loop.
"""
def apply_kf_smoothing(df, i, kf_=kf):
    unique_paths = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in tqdm(unique_paths):
        cond = np.logical_and(df['collectionName'] == collection, df['phoneName'] == phone)
        data = df[cond][['latDeg', 'lngDeg']].to_numpy()
        data = data.reshape(1, len(data), 2)
        smoothed = kf_.smooth(data)
        df.loc[cond, 'latDeg'] = smoothed.states.mean[0, :, 0] - smoothed.states.mean[0, :, 2] * (0.012 + 0.05 * i / 100)
        df.loc[cond, 'lngDeg'] = smoothed.states.mean[0, :, 1] - smoothed.states.mean[0, :, 3] * (0.012 + 0.05 * i / 100)
    return df

best_score = 9999

for i in np.arange(100):
    df_basepreds_kf = apply_kf_smoothing(pd.read_csv('../input/google-smartphone-decimeter-challenge/baseline_locations_train.csv', usecols=cols),
                                         i=i)
    df_all = df_truth.merge(df_basepreds_kf, how='inner', on=cols[:3], suffixes=('_truth', '_basepred'))
    df_all['dist'] = calc_haversine(df_all.latDeg_truth, df_all.lngDeg_truth, 
        df_all.latDeg_basepred, df_all.lngDeg_basepred)
    res = df_all.dist.quantile([0.5, 0.95]).to_numpy()
    score = 0.5*(res[0] + res[1])
    if score < best_score:
        print(f'i: {i} score on train dataset: {score}, 0.5: {res[0]}, 0.95: {res[1]}')
    #print(f'score on train dataset: {df_all.dist.mean():.3f}m - slightly better than the baseline, but still a lot of improvements are needed')
"""

In [None]:
# Disabled experiment: the whole cell is one bare string literal, so none of the
# code inside it runs (as the cell's last expression the text is only echoed).
# The text defines a per-phone variant of apply_kf_smoothing (shift factor
# looked up from the d01 / d02 tables) and prints a score per phoneName.
"""
#best_score = 9999
def apply_kf_smoothing(df, i=0, kf_=kf):
    d01={'Mi8': 1,
  'Pixel4XL': 1,
  'Pixel4': 1,
  'Pixel4XLModded': 2,
  'Pixel5': 3,
    'SamsungS20Ultra': 8,
    'Pixel4Modded': 0}
    d02={'Mi8': 18,
  'Pixel4XL': 21,
  'Pixel4': 27,
  'Pixel4XLModded': 26,
  'Pixel5': 9,
    'SamsungS20Ultra': 6,
    'Pixel4Modded': 18}
    unique_paths = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in tqdm(unique_paths):
        cond = np.logical_and(df['collectionName'] == collection, df['phoneName'] == phone)
        data = df[cond][['latDeg', 'lngDeg']].to_numpy()
        data = data.reshape(1, len(data), 2)
        smoothed = kf_.smooth(data)
        df.loc[cond, 'latDeg'] = smoothed.states.mean[0, :, 0] - smoothed.states.mean[0, :, 2] * (d01[phone] / 100 - 0.01 + d02[phone]/2000)
        df.loc[cond, 'lngDeg'] = smoothed.states.mean[0, :, 1] - smoothed.states.mean[0, :, 3] * (d01[phone] / 100 - 0.01 + d02[phone]/2000)
    return df

best_score={'Mi8': 9991,
'Pixel4XL': 9991,
'Pixel4': 9991,
'Pixel4XLModded': 9992,
'Pixel5': 9993,
'SamsungS20Ultra': 9998,
'Pixel4Modded': 9990}
for i in [0]:#np.arange(40):
    df_basepreds_kf = apply_kf_smoothing(pd.read_csv('../input/google-smartphone-decimeter-challenge/baseline_locations_train.csv', usecols=cols),
                                         i=i)
    df_all = df_truth.merge(df_basepreds_kf, how='inner', on=cols[:3], suffixes=('_truth', '_basepred'))
    df_all['dist'] = calc_haversine(df_all.latDeg_truth, df_all.lngDeg_truth, 
        df_all.latDeg_basepred, df_all.lngDeg_basepred)
    for phone in df_all['phoneName'].unique():
        df_phone = df_all[df_all['phoneName'] == phone]
        res = df_phone.dist.quantile([0.5, 0.95]).to_numpy()
        score = 0.5*(res[0] + res[1])
        if score < best_score[phone]:
            best_score[phone] = score
            print(f'i: {i} score on phone {phone} train sub-dataset: {score}, 0.5: {res[0]}, 0.95: {res[1]}')
        #print(f'score on train dataset: {df_all.dist.mean():.3f}m - slightly better than the baseline, but still a lot of improvements are needed')
"""

In [None]:
# Disabled experiment: the whole cell is one bare string literal, so none of the
# code inside it runs (as the cell's last expression the text is only echoed).
# The text sweeps i over 0..99 with the shift factor i / 100 and prints a score
# for every collectionName at every i.
"""#best_score = 9999
def apply_kf_smoothing(df, i, kf_=kf):
    unique_paths = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in tqdm(unique_paths):
        cond = np.logical_and(df['collectionName'] == collection, df['phoneName'] == phone)
        data = df[cond][['latDeg', 'lngDeg']].to_numpy()
        data = data.reshape(1, len(data), 2)
        smoothed = kf_.smooth(data)
        df.loc[cond, 'latDeg'] = smoothed.states.mean[0, :, 0] - smoothed.states.mean[0, :, 2] * (i / 100)
        df.loc[cond, 'lngDeg'] = smoothed.states.mean[0, :, 1] - smoothed.states.mean[0, :, 3] * (i / 100)
    return df

for i in np.arange(100):
    df_basepreds_kf = apply_kf_smoothing(pd.read_csv('../input/google-smartphone-decimeter-challenge/baseline_locations_train.csv', usecols=cols),
                                         i=i)
    df_all = df_truth.merge(df_basepreds_kf, how='inner', on=cols[:3], suffixes=('_truth', '_basepred'))
    df_all['dist'] = calc_haversine(df_all.latDeg_truth, df_all.lngDeg_truth, 
        df_all.latDeg_basepred, df_all.lngDeg_basepred)
    for collection in df_all['collectionName'].unique():
        df_collection = df_all[df_all['collectionName'] == collection]
        res = df_collection.dist.quantile([0.5, 0.95]).to_numpy()
        score = 0.5*(res[0] + res[1])
        #if score < best_score:
        print(f'i: {i} score on collection {collection} train sub-dataset: {score}, 0.5: {res[0]}, 0.95: {res[1]}')
        #print(f'score on train dataset: {df_all.dist.mean():.3f}m - slightly better than the baseline, but still a lot of improvements are needed')
"""

In [None]:
# Show the first rows of df_all; an earlier cell must already have created it.
df_all.head()

## prepare a submission based on the sample submission

In [None]:

#best_score = 9999
def apply_kf_smoothing(df, i=0, kf_=kf):
    """Kalman-smooth each (collectionName, phoneName) group and apply a per-phone shift.

    For every group the latDeg/lngDeg values are smoothed with ``kf_.smooth``.
    The value written back is smoothed state component 0 (resp. 1) minus
    component 2 (resp. 3) scaled by a factor looked up from the phone name.
    Components 2 and 3 are presumably the velocity terms of the filter state
    - confirm against the model passed as ``kf_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``collectionName``, ``phoneName``, ``latDeg`` and
        ``lngDeg``.  It is modified in place.
    i : int, optional
        Not used by the body; kept so existing calls that pass ``i=`` still work.
    kf_ : object providing ``smooth``, optional
        Defaults to the module-level ``kf``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If a phone name is missing from the lookup tables below.
    """
    # Two tuning tables; the shift factor is coarse / 100 - 0.01 + fine / 2000.
    coarse_by_phone = {
        'Mi8': 1,
        'Pixel4XL': 1,
        'Pixel4': 1,
        'Pixel4XLModded': 2,
        'Pixel5': 3,
        'SamsungS20Ultra': 8,
        'Pixel4Modded': 0,
    }
    fine_by_phone = {
        'Mi8': 18,
        'Pixel4XL': 21,
        'Pixel4': 27,
        'Pixel4XLModded': 26,
        'Pixel5': 9,
        'SamsungS20Ultra': 6,
        'Pixel4Modded': 18,
    }
    group_keys = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection_name, phone_name in tqdm(group_keys):
        in_group = (df['collectionName'] == collection_name) & (df['phoneName'] == phone_name)
        track = df.loc[in_group, ['latDeg', 'lngDeg']].to_numpy()
        # Add a leading axis of length 1 so the input has shape (1, n_rows, 2).
        state_mean = kf_.smooth(track[np.newaxis, :, :]).states.mean[0]
        shift = coarse_by_phone[phone_name] / 100 - 0.01 + fine_by_phone[phone_name]/2000
        df.loc[in_group, 'latDeg'] = state_mean[:, 0] - state_mean[:, 2] * shift
        df.loc[in_group, 'lngDeg'] = state_mean[:, 1] - state_mean[:, 3] * shift
    return df

# Best score seen so far per phone, seeded with large placeholder values.
best_score = dict(
    Mi8=9991,
    Pixel4XL=9991,
    Pixel4=9991,
    Pixel4XLModded=9992,
    Pixel5=9993,
    SamsungS20Ultra=9998,
    Pixel4Modded=9990,
)

# Single evaluation pass; `i` is only forwarded and echoed in the messages.
i = 0
df_basepreds_kf = apply_kf_smoothing(
    pd.read_csv('../input/google-smartphone-decimeter-challenge/baseline_locations_train.csv', usecols=cols),
    i=i,
)
df_all = df_truth.merge(df_basepreds_kf, how='inner', on=cols[:3], suffixes=('_truth', '_basepred'))
df_all['dist'] = calc_haversine(
    df_all['latDeg_truth'],
    df_all['lngDeg_truth'],
    df_all['latDeg_basepred'],
    df_all['lngDeg_basepred'],
)

# Score each phone separately: mean of its median and 95th-percentile distance.
for phone in df_all['phoneName'].unique():
    df_phone = df_all.loc[df_all['phoneName'].eq(phone)]
    res = df_phone['dist'].quantile([0.5, 0.95]).to_numpy()
    score = 0.5*(res[0] + res[1])
    if not (score < best_score[phone]):
        continue
    best_score[phone] = score
    print(f'i: {i} score on phone {phone} train sub-dataset: {score}, 0.5: {res[0]}, 0.95: {res[1]}')


In [None]:
# Score over all rows of df_all: mean of the median and 95th-percentile distance.
res = df_all['dist'].quantile([0.5, 0.95]).to_numpy()
score = 0.5 * res.sum()
print(f'i: {i} score on the train dataset: {score}, 0.5: {res[0]}, 0.95: {res[1]}')

In [None]:
# Smooth the baseline test predictions and write them out using the sample
# submission as the template for rows and columns.
test_base = pd.read_csv(
    '../input/google-smartphone-decimeter-challenge/baseline_locations_test.csv')

sub = pd.read_csv('../input/google-smartphone-decimeter-challenge/sample_submission.csv')

# apply_kf_smoothing modifies test_base in place and returns the same object.
kf_smoothed_baseline = apply_kf_smoothing(test_base)

# assign() aligns the new columns on the index, so a row-count mismatch would
# silently leave NaN (or dropped) coordinates in the submission; fail loudly.
if len(kf_smoothed_baseline) != len(sub):
    raise ValueError(
        f'baseline has {len(kf_smoothed_baseline)} rows but the sample submission has {len(sub)}')

sub = sub.assign(
    latDeg=kf_smoothed_baseline['latDeg'],
    lngDeg=kf_smoothed_baseline['lngDeg'],
)
sub.to_csv('submission.csv', index=False)