In [None]:
!pip install simdkalman

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import simdkalman
from tqdm.notebook import tqdm
import itertools
from skopt import gp_minimize
from skopt.space import Real, Integer

In [None]:
# Starting values for the Kalman-filter settings. They share their names with
# the parameters of make_kalman_filter and are rebound later in the notebook
# when the optimizer result is unpacked.
T, size = 1.0, 4
noise, obs_noise = 1e-5, 5e-5

def make_shifted_matrix(vec):
    """Build an upper-triangular matrix whose i-th row is `vec` shifted right by i.

    Row i consists of i leading zeros followed by the first len(vec) - i
    entries of `vec`, so every row has the same length as `vec`.

    Parameters
    ----------
    vec : list
        Values of the first row. Must be a list, because rows are built with
        list concatenation.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(vec), len(vec)).
    """
    n = len(vec)
    rows = [[0] * shift + vec[:n - shift] for shift in range(n)]
    return np.array(rows)

def make_state_vector(T, size):
    """Build the first row of the state-transition matrix.

    The row has the form ``[1, 0, T, 0, T**2/2!, 0, T**3/3!, 0, ...]``
    truncated to `size` entries: the Taylor coefficients ``T**k / k!`` placed
    at the even positions, with zeros in between.

    Parameters
    ----------
    T : float
        Time step between consecutive samples.
    size : int
        Requested number of entries. The result always contains at least the
        two leading entries ``[1, 0]``.

    Returns
    -------
    list
        The coefficient row as a plain Python list.
    """
    vector = [1, 0]
    term = T      # current Taylor term T**k / k!, starting at k = 1
    order = 1     # k of the term currently held in `term`
    for i in range(size - 2):
        if i % 2 == 0:
            vector.append(term)
            # Advance to the next term by multiplying by the *original* time
            # step: T**(k+1) / (k+1)! = (T**k / k!) * (T / (k+1)).
            order += 1
            term *= T / order
        else:
            vector.append(0)
    return vector

def make_noise_vector(noise, size):
    """Return `size` noise levels that halve after every pair of entries.

    The result is ``[noise, noise, noise/2, noise/2, noise/4, ...]`` cut off
    after `size` values.

    Parameters
    ----------
    noise : float
        Level used for the first two entries.
    size : int
        Number of entries to produce.

    Returns
    -------
    list
        The noise levels as a plain Python list.
    """
    levels = []
    current = noise
    for start in range(0, size, 2):
        # Every pair after the first uses half the previous pair's level.
        if start > 0:
            current *= 0.5
        levels.extend([current] * min(2, size - start))
    return levels

def make_kalman_filter(T, size, noise, obs_noise):
    """Assemble a simdkalman.KalmanFilter from four scalar settings.

    Parameters
    ----------
    T : float
        Time step forwarded to make_state_vector.
    size : int
        Dimension of the state vector. The observation model below assumes
        size >= 2.
    noise : float
        Base process-noise level forwarded to make_noise_vector.
    obs_noise : float
        Variance placed on the diagonal of the 2x2 observation-noise matrix.

    Returns
    -------
    simdkalman.KalmanFilter
        Filter that observes the first two state components.
    """
    # Added to the noise matrices. np.ones(k) is one-dimensional, so
    # broadcasting adds this value to every entry, not only to the diagonal.
    jitter = 1e-9

    first_row = make_state_vector(T, size)
    observe_first = [1] + [0] * (size - 1)
    observe_second = [0, 1] + [0] * (size - 2)

    return simdkalman.KalmanFilter(
        state_transition=make_shifted_matrix(first_row),
        process_noise=np.diag(make_noise_vector(noise, size)) + np.ones(size) * jitter,
        observation_model=np.array([observe_first, observe_second]),
        observation_noise=np.diag([obs_noise] * 2) + np.ones(2) * jitter,
    )

def apply_kf_smoothing(df, kf_):
    """Smooth every (collectionName, phoneName) track with a Kalman smoother.

    The input frame is left untouched; the smoothed coordinates are written
    into a copy. This matters when the function is called several times on
    the same frame (for example once per model of an ensemble): each call
    then starts from the same data instead of smoothing the previous call's
    output.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'collectionName', 'phoneName', 'latDeg' and
        'lngDeg'.
    kf_ : object
        Object with a ``smooth(data)`` method whose result exposes
        ``states.mean`` indexable as ``[0, :, k]`` (e.g. a
        simdkalman.KalmanFilter).

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with 'latDeg' and 'lngDeg' replaced by smoothed values.
    """
    smoothed_df = df.copy()
    unique_paths = smoothed_df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in unique_paths:
        cond = (smoothed_df['collectionName'] == collection) & (smoothed_df['phoneName'] == phone)
        # NOTE(review): rows are smoothed in their existing order; presumably
        # each track is already sorted by time -- confirm against the data.
        track = smoothed_df.loc[cond, ['latDeg', 'lngDeg']].to_numpy()
        # smooth() receives a batch of one track: (1, n_samples, 2).
        track = track.reshape(1, len(track), 2)
        smoothed = kf_.smooth(track)
        smoothed_df.loc[cond, 'latDeg'] = smoothed.states.mean[0, :, 0]
        smoothed_df.loc[cond, 'lngDeg'] = smoothed.states.mean[0, :, 1]
    return smoothed_df

In [None]:
data_path = Path("../input/google-smartphone-decimeter-challenge")

# Path.rglob returns a one-shot generator. calculate_location is called once
# per optimizer evaluation, so the paths are kept in a list that can be
# iterated more than once.
truths = list((data_path / 'train').rglob('ground_truth.csv'))

# Cache of ground-truth frames, filled by the first calculate_location call.
df_list = []
cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']

def calculate_location(truths, kf):
    """Join ground-truth positions with Kalman-smoothed baseline positions.

    The ground-truth CSV files are read only on the first call and cached in
    the module-level `df_list`; later calls reuse the cached frames. The
    training baseline is re-read and smoothed with `kf` on every call.

    Parameters
    ----------
    truths : iterable of path-like
        Paths of the ground_truth.csv files. Only consulted while the cache
        is still empty.
    kf : object
        Kalman filter forwarded to apply_kf_smoothing.

    Returns
    -------
    pandas.DataFrame
        Inner join on the first three entries of `cols`, with the coordinate
        columns suffixed '_truth' and '_basepred'.
    """
    if not df_list:
        # Load once: the ground truth does not depend on the filter settings.
        df_list.extend(pd.read_csv(t, usecols=cols) for t in truths)
    df_truth = pd.concat(df_list, ignore_index=True)

    baseline = pd.read_csv(data_path / 'baseline_locations_train.csv', usecols=cols)
    df_basepreds_kf = apply_kf_smoothing(baseline, kf_=kf)
    df_all = df_truth.merge(df_basepreds_kf, how='inner', on=cols[:3], suffixes=('_truth', '_basepred'))
    return df_all

def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points given in decimal degrees.

    Uses the haversine formula with a sphere radius of 6,367,000. Inputs may
    be scalars or array-likes; array-likes are processed element-wise.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.

    Returns
    -------
    float or array-like
        Distance(s) in the unit of the radius constant.
    """
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    hav = np.sin(delta_phi/2.0)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2.0)**2
    central_angle = 2 * np.arcsin(hav**0.5)
    return 6_367_000 * central_angle

def get_error(truths, kf):
    """Mean haversine distance between ground truth and smoothed baseline.

    Parameters
    ----------
    truths : iterable of path-like
        Ground-truth file paths, forwarded to calculate_location.
    kf : object
        Kalman filter, forwarded to calculate_location.

    Returns
    -------
    float
        Mean of the per-row distances computed by calc_haversine.
    """
    merged = calculate_location(truths, kf)
    merged['dist'] = calc_haversine(
        merged['latDeg_truth'], merged['lngDeg_truth'],
        merged['latDeg_basepred'], merged['lngDeg_basepred'],
    )
    return merged['dist'].mean()

def optimize(params):
    """Objective function for the hyper-parameter search.

    Parameters
    ----------
    params : sequence
        ``(T, half_size, noise, obs_noise)``. The state size passed to the
        filter is ``2 * half_size``.

    Returns
    -------
    float
        Error reported by get_error for these settings; it is also printed.
    """
    T, half_size, noise, obs_noise = params
    size = half_size * 2
    error = get_error(truths, make_kalman_filter(T, size, noise, obs_noise))
    print(f'T = {T}, size = {size}, noise = {noise}, obs_noise = {obs_noise} => error = {error:.3f}m')
    return error

In [None]:
# Search space, in the order in which optimize() unpacks its argument.
space = [
    Real(0.5, 1.5, name='T'),
    Integer(1, 4, name='half_size'),
    Real(1e-7, 1e-4, "log-uniform", name='noise'),
    Real(1e-7, 1e-4, "log-uniform", name='obs_noise'),
]

# A fixed seed makes the search reproducible under Restart & Run All.
# NOTE(review): if the installed scikit-optimize uses n_initial_points=10 by
# default, n_calls=10 means every evaluation is a random initial point --
# confirm and raise n_calls if model-guided steps are wanted.
result = gp_minimize(optimize, space, n_calls=10, random_state=0)

In [None]:
# Unpack the best point found by the search and report all four settings,
# mirroring the per-iteration message printed by optimize().
T, half_size, noise, obs_noise = result.x
size = half_size * 2
print(f'Best result: T = {T}, size = {size}, noise = {noise}, obs_noise = {obs_noise} => error = {result.fun:.3f}m')

In [None]:
# Keep the parameter sets of the `num_models` evaluations with the lowest
# error, best first (ties keep their evaluation order).
num_models = 5
ranked_evals = sorted(enumerate(result.func_vals), key=lambda pair: pair[1])
best_param_indices = [idx for idx, _ in ranked_evals[:num_models]]
best_params = [result.x_iters[idx] for idx in best_param_indices]
best_params

In [None]:
# Hard-coded parameter set in the order [T, half_size, noise, obs_noise].
# This assignment replaces the best_params list derived from the optimizer
# result in the previous cell.
best_params = [[1.5, 2, 2.1714133956113952e-06, 1.719317114286542e-05]]

In [None]:
test_base = pd.read_csv(
    '../input/google-smartphone-decimeter-challenge/baseline_locations_test.csv')
sub = pd.read_csv('../input/google-smartphone-decimeter-challenge/sample_submission.csv')

# One array of smoothed coordinates per parameter set; averaged below.
latDeg = []
lngDeg = []
for T, half_size, noise, obs_noise in best_params:
    size = half_size * 2
    kf = make_kalman_filter(T, size, noise, obs_noise)
    # Every model must start from the raw baseline, so the smoother gets its
    # own copy of the frame instead of the shared `test_base`.
    kf_smoothed_baseline = apply_kf_smoothing(test_base.copy(), kf)
    # Store plain arrays so the collected results cannot alias a frame that
    # is modified later.
    latDeg.append(kf_smoothed_baseline['latDeg'].to_numpy())
    lngDeg.append(kf_smoothed_baseline['lngDeg'].to_numpy())

# Ensemble by averaging the per-model coordinates row by row.
sub = sub.assign(
    latDeg=np.mean(latDeg, axis=0),
    lngDeg=np.mean(lngDeg, axis=0),
)
sub.to_csv('submission.csv', index=False)

In [None]:
!