This is the most accurate notebook that [colum2131](https://www.kaggle.com/columbia2131), [tubo213](https://www.kaggle.com/tubotubo) and [penguin46](https://www.kaggle.com/ryotayoshinobu) created before merging with [chris](https://www.kaggle.com/chris62) and [Akio Saito](https://www.kaggle.com/saitodevel01).
A description of each processing step can be found in [this discussion](https://www.kaggle.com/c/google-smartphone-decimeter-challenge/discussion/261739). Please leave a comment if anything is unclear.

In [None]:
%%capture
!pip install pandarallel 
!pip install vincenty
!pip install simdkalman

In [None]:
import warnings
warnings.simplefilter('ignore')

from contextlib import contextmanager
from glob import glob
from time import time
import pickle

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from pathlib import Path
import torch
from scipy import interpolate
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from tqdm.notebook import tqdm
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()

import simdkalman
from pandarallel import pandarallel
pandarallel.initialize()
from vincenty import vincenty
import cupy as cp

In [None]:
import pyproj
from pyproj import Proj, transform

def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance between two points on the earth.

    All four arguments are scalars or array-likes in decimal degrees; the
    result follows numpy broadcasting of the inputs. The sphere radius used
    is 6,367,000, so the result is in the same unit as that radius.
    """
    earth_radius = 6_367_000
    phi1, lam1, phi2, lam2 = (np.radians(deg) for deg in (lat1, lon1, lat2, lon2))

    half_dphi = (phi2 - phi1) / 2.0
    half_dlam = (lam2 - lam1) / 2.0
    hav = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2

    central_angle = 2 * np.arcsin(hav**0.5)
    return earth_radius * central_angle

def WGS84_to_ECEF(lat, lon, alt):
    """Convert geodetic coordinates to earth-centred, earth-fixed (x, y, z).

    lat and lon are in decimal degrees. alt is a height above the ellipsoid
    in the same length unit as the semi-major axis constant (6378137.0).
    Inputs may be scalars or numpy-broadcastable arrays.
    """
    semi_major = 6378137.0
    flattening = 1 / 298.257223563
    # Squared eccentricity of the ellipsoid.
    ecc_sq = 1 - (1 - flattening) * (1 - flattening)

    phi = lat * (np.pi / 180.0)
    lam = lon * (np.pi / 180.0)

    # Radius of curvature in the prime vertical.
    prime_vertical = semi_major / np.sqrt(1 - ecc_sq * np.sin(phi) * np.sin(phi))

    x = (prime_vertical + alt) * np.cos(phi) * np.cos(lam)
    y = (prime_vertical + alt) * np.cos(phi) * np.sin(lam)
    z = (prime_vertical * (1 - ecc_sq) + alt) * np.sin(phi)
    return x, y, z


# Module-level pyproj transformer from geocentric (ECEF) coordinates to
# longitude/latitude on the WGS84 ellipsoid. Built once and reused by
# ECEF_to_WGS84 so the CRS objects are not re-created on every call.
transformer = pyproj.Transformer.from_crs(
    {"proj":'geocent', "ellps":'WGS84', "datum":'WGS84'},
    {"proj":'latlong', "ellps":'WGS84', "datum":'WGS84'},)


def ECEF_to_WGS84(x,y,z):
    """Convert ECEF (x, y, z) to geodetic coordinates.

    Uses the module-level ``transformer``. The transformer yields
    (longitude, latitude, altitude); this function reorders the result and
    returns (lat, lng, alt), with angles in degrees (``radians=False``).
    """
    geodetic = transformer.transform(x, y, z, radians=False)
    return geodetic[1], geodetic[0], geodetic[2]


# Row-wise helpers for DataFrame.apply(axis=1):
# W2E maps a row with latDeg / lngDeg / heightKNN to an ECEF (x, y, z) tuple.
# E2W maps a row with x / y / z back to a (lat, lng, alt) tuple.
W2E = lambda r: WGS84_to_ECEF(r.latDeg, r.lngDeg, r.heightKNN)
E2W = lambda r: ECEF_to_WGS84(r.x, r.y, r.z)


def visualize_trafic(df:pd.DataFrame,lat='latDeg',lng='lngDeg',color=None,savepath=None,zoom=9,center={"lat":37.423576, "lon":-122.094132}):
    """Show GPS points on an interactive plotly mapbox scatter plot.

    Parameters
    ----------
    df : frame holding the points to draw.
    lat, lng : names of the latitude / longitude columns in ``df``.
    color : optional column name used by plotly to colour the points.
    savepath : if given, the figure is also written to this path as HTML.
    zoom : initial map zoom level.
    center : dict with ``lat`` / ``lon`` keys for the initial map centre.
        The default dict is only read, never modified.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    fig = px.scatter_mapbox(df,
                            lat=lat,
                            lon=lng,
                            color=color,
                            zoom=zoom,
                            center=center,
                            height=600,
                            width=800)
    fig.update_layout(mapbox_style='stamen-terrain')
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.update_layout(title_text="GPS trafic")
    fig.show()
    # Identity comparison is the idiomatic (and overload-safe) None check.
    if savepath is not None:
        fig.write_html(savepath)

# Reading-data

In [None]:
def vincenty_meter(r, lat='latDeg', lng='lngDeg', tlat='t_latDeg', tlng='t_lngDeg'):
    """Vincenty distance between a row's estimated and target position.

    ``r`` is a row-like object indexed by column name. (lat, lng) name the
    estimated position and (tlat, tlng) the target position. The value
    returned by ``vincenty`` is multiplied by 1000 (the vincenty package
    reports kilometres by default, so the result is in metres).
    """
    estimated = (r[lat], r[lng])
    target = (r[tlat], r[tlng])
    distance = vincenty(estimated, target)
    return distance * 1000


def check_meter(input_df: pd.DataFrame, save=False):
    """Score a frame of predictions against the ground truth and print the result.

    A ``meter`` column (per-row ``vincenty_meter`` error, computed in parallel
    with pandarallel) is added to a copy of ``input_df``. Two numbers are
    printed: the mean error, and a "CV" score that averages the 50th and 95th
    error percentiles of every phone. When ``save`` is exactly ``True`` the
    scored frame is also written to ``train_output.csv``.

    Returns the copy with the added ``meter`` column; ``input_df`` is unchanged.
    """
    output_df = input_df.copy()
    output_df['meter'] = input_df.parallel_apply(vincenty_meter, axis=1)

    if save == True:
        output_df.to_csv('train_output.csv', index=False)

    meter_score = output_df['meter'].mean()
    print(f'meter: {meter_score}') # 2.533116208067488

    # Collect p50 and p95 of the error for each phone, in order of appearance.
    scores = []
    for phone in output_df['phone'].unique():
        phone_errors = output_df.loc[output_df['phone']==phone, 'meter']
        for q in (50, 95):
            scores.append(np.percentile(phone_errors, q))

    score = sum(scores) / len(scores)
    print(f'CV: {score}') # 3.53009109589041

    return output_df


def get_groundtruth():
    """Load every ``ground_truth.csv`` under ``BASE_DIR/train`` into one frame.

    The files matching ``train/*/*/ground_truth.csv`` are read in sorted path
    order and concatenated once. The columns ``latDeg``, ``lngDeg`` and
    ``heightAboveWgs84EllipsoidM`` are renamed with a ``t_`` prefix (appended
    as the last columns) so they do not clash with the baseline columns when
    merged.

    Raises
    ------
    FileNotFoundError
        If no ground-truth file is found (instead of an obscure KeyError).
    """
    pattern = str(BASE_DIR / 'train/*/*/ground_truth.csv')
    # Sorted so the row order does not depend on filesystem listing order.
    paths = sorted(glob(pattern))
    if not paths:
        raise FileNotFoundError(f'no ground truth files match {pattern}')

    # Single concat instead of growing a frame inside the loop (quadratic copy).
    output_df = pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)

    _columns = ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM']
    output_df[['t_'+col for col in _columns]] = output_df[_columns]
    output_df = output_df.drop(columns=_columns)
    return output_df

In [None]:
# Use the first CUDA device when torch can see one, otherwise the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Root of the competition input data; all reads below are relative to it.
BASE_DIR = Path('../input/google-smartphone-decimeter-challenge')

# Baseline position estimates for train / test and the submission template.
train_base = pd.read_csv(BASE_DIR / 'baseline_locations_train.csv')
test_base = pd.read_csv(BASE_DIR / 'baseline_locations_test.csv')
sample = pd.read_csv(BASE_DIR / 'sample_submission.csv')

# Attach the ground truth (t_* columns) to every baseline row. This is an
# inner merge, so baseline rows without a matching ground-truth row are dropped.
train_base = train_base.merge(
    get_groundtruth(), on=['collectionName', 'phoneName', 'millisSinceGpsEpoch']
)
# Print the score of the raw baseline and add the per-row `meter` error column.
train_base = check_meter(train_base)

# kNN(height)

In [None]:
def fit_knn_height(X: pd.DataFrame, y: pd.Series, n_neighbors=15):
    """Fit a distance-weighted k-nearest-neighbours regressor.

    ``X`` holds the feature columns and ``y`` the regression target; both are
    passed to scikit-learn as plain numpy arrays. Returns the fitted model.
    """
    features = X.values
    target = y.values
    regressor = KNeighborsRegressor(weights='distance', n_neighbors=n_neighbors)
    regressor.fit(features, target)
    return regressor


def pred_knn_height(X: pd.DataFrame, model):
    """Return ``model.predict`` evaluated on the numpy values of ``X``."""
    features = X.values
    predictions = model.predict(features)
    return predictions

In [None]:
# Learn height as a function of position from the ground-truth track:
# features are the true lat/lng, the target is the true ellipsoid height.
knn_model = fit_knn_height(
    train_base[['t_latDeg', 't_lngDeg']],
    train_base['t_heightAboveWgs84EllipsoidM'])

# Predict a height for every *baseline* position (train and test) ...
train_base['heightKNN'] = pred_knn_height(train_base[['latDeg', 'lngDeg']], knn_model)
test_base['heightKNN'] = pred_knn_height(test_base[['latDeg', 'lngDeg']], knn_model)
# ... and keep the absolute gap between the baseline height and the kNN height.
# The SJC outlier model uses it as a feature (see `processing`).
train_base['heightDiff'] = np.abs(train_base['heightAboveWgs84EllipsoidM'] - train_base['heightKNN'])
test_base['heightDiff']  = np.abs(test_base['heightAboveWgs84EllipsoidM'] - test_base['heightKNN'])

# The model is not needed any more; free the memory.
del knn_model

# outlier detection

In [None]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Haversine distance between (lat1, lon1) and (lat2, lon2).

    Inputs are scalars or array-likes in decimal degrees. A sphere of radius
    6,367,000 is used. This re-declares the helper of the same name defined
    earlier in the notebook with the same formula.
    """
    rad = [np.radians(deg) for deg in (lat1, lon1, lat2, lon2)]
    dlat = rad[2] - rad[0]
    dlon = rad[3] - rad[1]

    a = np.sin(dlat/2.0)**2 + np.cos(rad[0]) * np.cos(rad[2]) * np.sin(dlon/2.0)**2
    central_angle = 2 * np.arcsin(a**0.5)
    return 6_367_000 * central_angle

def generate_waypoints(train=train_base.copy()):
    """Build a dense set of ground-truth "road" points for the SJC area.

    For every SJC collection the ground-truth track (``t_latDeg``,
    ``t_lngDeg``) is linearly interpolated over time on a grid with a step of
    19 (in units of ``millisSinceGpsEpoch``), from the first to the last
    timestamp of the collection (end exclusive).

    Parameters
    ----------
    train : frame with ``phone``, ``collectionName``, ``millisSinceGpsEpoch``,
        ``t_latDeg`` and ``t_lngDeg`` columns. The default is a snapshot of
        ``train_base`` taken when this function is defined.

    Returns
    -------
    DataFrame with ``collectionName``, ``latDeg``, ``lngDeg`` and ``gt_time``,
    de-duplicated and with a fresh RangeIndex.
    """
    # Work on a private copy. The original added columns to the argument
    # itself, which for the default value is an object shared by every call.
    train = train.copy()
    # The area code is the 5th dash-separated token of the phone identifier.
    train['area'] = train['phone'].apply(lambda s : s.split('-')[4])
    train = train[train.area=="SJC"]

    # Augment by interpolation.
    dfs = []
    for c, c_df in tqdm(train.groupby("collectionName")):
        c_df = c_df.sort_values("millisSinceGpsEpoch")

        f_lat = interpolate.interp1d(c_df.millisSinceGpsEpoch, c_df.t_latDeg, kind='linear')
        f_lng = interpolate.interp1d(c_df.millisSinceGpsEpoch, c_df.t_lngDeg, kind='linear')

        start_time = c_df.millisSinceGpsEpoch.min()
        end_time = c_df.millisSinceGpsEpoch.max()

        # NOTE(review): the step of 19 is an unexplained constant kept from
        # the original code.
        times = range(start_time, end_time, 19)

        dfs.append(pd.DataFrame({
            "collectionName": c,
            "latDeg": f_lat(times),
            "lngDeg": f_lng(times),
            "gt_time": times,
        }))

    return pd.concat(dfs).drop_duplicates().reset_index(drop=True)

def closest_point(point, waypoints):
    """Return the waypoint nearest to ``point`` by haversine distance (on GPU).

    ``point`` is a [lat, lng] pair in degrees and ``waypoints`` is converted
    with ``cp.array`` and indexed as rows of (lat, lng) in degrees. The
    closest waypoint is returned as a plain Python list, in degrees.
    """
    wp_deg = cp.array(waypoints)
    wp_rad = cp.radians(cp.array(wp_deg))
    pt_rad = cp.radians(cp.array([point]))

    delta = pt_rad - wp_rad
    hav = cp.sin(delta[:,0]/2.0)**2 + cp.cos(wp_rad[:,0])*cp.cos(pt_rad[:,0])*cp.sin(delta[:,1]/2.0)**2
    dist = 6_367_000 * (2 * cp.arcsin(hav**0.5))

    nearest = dist.argmin()
    return wp_deg[nearest].get().tolist()

def calc_dist(pred, waypoints):
    """Row-wise haversine distance between two sequences of points (on GPU).

    ``pred`` and ``waypoints`` are sequences of [lat, lng] pairs in degrees,
    paired element by element. Returns the distances as a numpy array
    (copied back from the GPU with ``.get()``).
    """
    pred_rad = cp.radians(cp.array(pred))
    wp_rad = cp.radians(cp.array(waypoints))

    delta = pred_rad - wp_rad
    hav = cp.sin(delta[:,0]/2.0)**2 + cp.cos(wp_rad[:,0])*cp.cos(pred_rad[:,0])*cp.sin(delta[:,1]/2.0)**2
    central_angle = 2 * cp.arcsin(hav**0.5)
    return (6_367_000 * central_angle).get()

def add_latlng(df):
    """Add a ``latlng`` column holding ``[latDeg, lngDeg]`` lists.

    The column is added to ``df`` in place and the same frame is returned.
    """
    pairs = []
    for lat, lng in zip(df['latDeg'], df['lngDeg']):
        pairs.append([lat, lng])
    df['latlng'] = pairs
    return df

def snap_to_grid(sub, train_waypoints, threshhold=1e10):
    """Compute each row's distance to the nearest training waypoint.

    Parameters
    ----------
    sub : frame with ``phone``, ``millisSinceGpsEpoch``, ``latDeg``, ``lngDeg``.
    train_waypoints : (lat, lng) waypoints in degrees, as accepted by
        ``closest_point``.
    threshhold : kept for interface compatibility only. The original code
        used it to build snapped ``_latDeg_`` / ``_lngDeg_`` columns, but
        those were never returned, so that dead computation was removed.

    Returns
    -------
    A new frame with ``phone``, ``millisSinceGpsEpoch`` and ``dist`` (distance
    to the matched waypoint). ``sub`` itself is not modified.
    """
    # Empty input: the GPU helpers index 2-D arrays and would fail, so return
    # an empty result with the expected columns instead.
    if sub.empty:
        empty = sub[["phone", "millisSinceGpsEpoch"]].copy()
        empty["dist"] = np.array([], dtype=np.float64)
        return empty

    # Copy first: callers pass a filtered slice of a larger frame and the
    # helper columns below must not leak into (or warn about) that slice.
    sub = add_latlng(sub.copy())
    sub['matched_point'] = sub['latlng'].apply(lambda x:closest_point(x,train_waypoints))
    sub['dist'] = calc_dist(sub['latlng'].tolist(),sub['matched_point'].tolist())

    return sub[["phone", "millisSinceGpsEpoch", "dist"]].copy()

def processing(input_df: pd.DataFrame, L=25):
    """Build per-phone lag/lead features for the SJC outlier classifier.

    For every offset ``i`` in ``-L..L`` (within each ``phone`` group):

    * ``shift{i}_heightDiff`` / ``shift{i}_dist`` - shifted values (i = 0 included),
    * ``diff{i}_latDeg`` / ``diff{i}_lngDeg`` - differences (i = 0 skipped),

    plus one pair ``change{L}_latDeg`` / ``change{L}_lngDeg`` holding the
    percentage change at the largest offset only.

    NOTE(review): the original computed ``pct_change`` after the loop using
    the leaked loop variable, i.e. only for ``i == L``. That may have been an
    indentation slip (a later variant of this function computes it for every
    offset), but the feature set is kept unchanged here because the model
    settings were tuned against it.

    Returns a frame with the same row order as ``input_df``.
    """
    shift_list = list(range(-L, L+1, 1))
    by_phone = input_df.groupby('phone')

    # Collect the feature blocks and concatenate once; growing a frame with
    # concat inside the loop re-copies all earlier columns on every iteration.
    blocks = []
    for i in shift_list:
        blocks.append(by_phone[['heightDiff', 'dist']].shift(i).add_prefix(f'shift{i}_'))

    for i in shift_list:
        if i == 0:
            continue
        blocks.append(by_phone[['latDeg', 'lngDeg']].diff(i).add_prefix(f'diff{i}_'))

    # Explicit name for what used to be the leaked loop variable.
    last_shift = shift_list[-1]
    blocks.append(
        by_phone[['latDeg', 'lngDeg']].pct_change(last_shift).add_prefix(f'change{last_shift}_')
    )

    return pd.concat(blocks, axis=1)

@contextmanager
def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    """Context manager that reports the wall-clock time of its body.

    The elapsed seconds are formatted with ``format_str`` (optionally wrapped
    in ``prefix`` / ``suffix``) and sent to ``logger.info`` when a logger is
    given, otherwise printed. Nothing is reported if the body raises.
    """
    template = format_str
    if prefix:
        template = str(prefix) + template
    if suffix:
        template = template + str(suffix)

    started = time()
    yield
    message = template.format(time() - started)

    emit = logger.info if logger else print
    emit(message)

def fit_lgbm(X, y, train_df=train_base, params: dict=None, verbose=100, seed: int=42):
    """Train one LightGBM classifier per GroupKFold fold and return OOF scores.

    Folds are grouped by ``train_df['collectionName']`` so that a collection
    never appears in both the training and the validation part. The number of
    folds is read from the module-level ``N_SPLITS`` at call time.

    Parameters
    ----------
    X, y : features and binary target, row-aligned by position.
    train_df : frame providing ``collectionName`` for the rows of ``X``, in
        the same row order. Defaults to ``train_base`` as it was when this
        function was defined.
    params : keyword arguments for ``lgbm.LGBMClassifier`` (``None`` means
        library defaults).
    verbose : logging period passed to ``fit``. It is also used as the
        early-stopping patience, as in the original code.
    seed : unused; kept for interface compatibility.

    Returns
    -------
    (oof_pred, models) : out-of-fold probability of class 1 for each row, and
    the list of fitted models (one per fold).
    """
    # `**None` would raise a TypeError; fall back to the library defaults.
    params = {} if params is None else params
    models = []
    oof_pred = np.zeros(len(y), dtype=np.float64)
    groups = train_df['collectionName'].reset_index(drop=True)

    kf = GroupKFold(n_splits=N_SPLITS)
    for i, (idx_train, idx_valid) in enumerate(kf.split(X, y, groups)):
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        model = lgbm.LGBMClassifier(**params)
        with timer(prefix='fit fold={} '.format(i + 1)):
            # NOTE(review): `early_stopping_rounds` / `verbose` as fit()
            # arguments were removed in LightGBM 4 - confirm the installed
            # version before upgrading.
            model.fit(x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                early_stopping_rounds=verbose,
                eval_metric='logloss',
                verbose=verbose)

        pred_i = model.predict_proba(x_valid)[:, 1]
        # Write by position. The original indexed with `x_valid.index`, which
        # is only correct when X carries a 0..n-1 RangeIndex.
        oof_pred[idx_valid] = pred_i
        models.append(model)

    return oof_pred, models

def predict_lgbm(models, feat_df):
    """Average the class-1 probability predicted by every model.

    Each model's ``predict_proba`` is called on ``feat_df.values``; column 1
    of every result is stacked and averaged over the models.
    """
    fold_probs = []
    for fitted in models:
        proba = fitted.predict_proba(feat_df.values)
        fold_probs.append(proba[:, 1])
    stacked = np.array(fold_probs)
    return np.mean(stacked, axis=0)

### Only SJC

In [None]:
# Generate roads: dense ground-truth waypoints for the SJC collections.
train_waypoints = generate_waypoints()
# Area code = 5th dash-separated token of the phone identifier.
train_base['area'] = train_base['phone'].apply(lambda s : s.split('-')[4])
test_base['area'] = test_base['phone'].apply(lambda s : s.split('-')[4])

# Attach `dist` (distance to the nearest waypoint) to the SJC rows. The merge
# is an outer join on (phone, millisSinceGpsEpoch), so rows of other areas
# are kept and get NaN in `dist`.
train_base = train_base.merge(
    snap_to_grid(
        train_base.loc[train_base.area=="SJC"],
        train_waypoints[["latDeg", "lngDeg"]]
    ), on=["phone", "millisSinceGpsEpoch"], how="outer")

# Same for the test set, still measured against the *training* waypoints.
test_base = test_base.merge(
    snap_to_grid(
        test_base.loc[test_base.area=="SJC"],
        train_waypoints[["latDeg", "lngDeg"]]
    ), on=["phone", "millisSinceGpsEpoch"], how="outer")

In [None]:
# A row is labelled an outlier when its `meter` error exceeds this threshold.
loss_threshold = 9.5
# Half-width L of the lag/lead window passed to `processing`.
visibility     = 22

# Binary target for the outlier classifier.
train_base['t_isOutlier'] = (train_base.meter > loss_threshold).astype(int)

# LightGBM hyper-parameters. n_estimators is only an upper bound because
# fit_lgbm trains with early stopping.
params = {
 'reg_alpha': 0.01,
 'reg_lambda': 0.01, 
 'num_leaves': 40,
 'n_estimators': 10000,
 'learning_rate': 0.1,
 'random_state': 42,
 'max_depth': -1
}

# fit_lgbm reads N_SPLITS from module scope at call time.
N_SPLITS = 3
# Train on the SJC rows only. The features come from a frame with a fresh
# RangeIndex, while target and groups are the same rows in the same order.
oof, models = fit_lgbm(processing(train_base[train_base.area=="SJC"].reset_index(drop=True), visibility), 
                       train_base.loc[train_base.area=="SJC", 't_isOutlier'], 
                       train_df=train_base[train_base.area=="SJC"],
                       params=params)
pred = predict_lgbm(models, processing(test_base[test_base.area=="SJC"].reset_index(drop=True), visibility))

# Threshold the probabilities at 0.5 to get the predicted outlier flag.
train_base.loc[train_base.area=="SJC", 'isOutlier'] = (oof > 0.5).astype(int)
test_base.loc[test_base.area=="SJC", 'isOutlier'] = (pred > 0.5).astype(int)
# This one test collection is reset here; the non-SJC model below predicts it
# instead (see `use_test_index`).
test_base.loc[test_base.collectionName=="2021-04-02-US-SJC-1", 'isOutlier'] = 0


# Out-of-fold accuracy and confusion matrix. The predictions are passed as the
# first argument, so the matrix rows are predictions and the columns the targets.
print('score', accuracy_score((oof>0.5).astype(int), train_base.loc[train_base.area=="SJC", 't_isOutlier']))
print(confusion_matrix((oof>0.5).astype(int), train_base.loc[train_base.area=="SJC", 't_isOutlier']))

### OtherSJC

In [None]:
def processing(input_df: pd.DataFrame, L=25):
    """Build per-phone position-difference features for the non-SJC outlier model.

    For every non-zero offset ``i`` in ``-L..L`` (within each ``phone`` group)
    the columns ``diff{i}_latDeg`` / ``diff{i}_lngDeg`` are produced, followed
    by a single pair ``change{L}_latDeg`` / ``change{L}_lngDeg`` holding the
    percentage change at the largest offset only.

    NOTE(review): the original computed ``pct_change`` after the loop using
    the leaked loop variable, i.e. only for ``i == L``. That may have been an
    indentation slip, but the feature set is kept unchanged here because the
    model settings were tuned against it.

    Returns a frame with the same row order as ``input_df``.
    """
    shift_list = list(range(-L, L+1, 1))
    position = input_df.groupby('phone')[['latDeg', 'lngDeg']]

    # Build all blocks first and concatenate once instead of growing a frame
    # inside the loop (which re-copies every earlier column each time).
    blocks = [
        position.diff(i).add_prefix(f'diff{i}_')
        for i in shift_list
        if i != 0
    ]

    # Explicit name for what used to be the leaked loop variable.
    last_shift = shift_list[-1]
    blocks.append(position.pct_change(last_shift).add_prefix(f'change{last_shift}_'))

    return pd.concat(blocks, axis=1)

In [None]:
# Outlier threshold on the `meter` error and window half-width for the
# non-SJC model (both differ from the SJC settings above).
loss_threshold = 5.6 
visibility     = 26   

# Re-label the outlier target with the new threshold (overwrites the SJC one).
train_base['t_isOutlier'] = (train_base.meter > loss_threshold).astype(int)

# LightGBM hyper-parameters. n_estimators is only an upper bound because
# fit_lgbm trains with early stopping.
params = {
 'reg_alpha': 0.01,
 'reg_lambda': 0.01, 
 'num_leaves': 40,
 'n_estimators': 10000,
 'learning_rate': 0.1,
 'random_state': 42,
 'max_depth': -1
}

# fit_lgbm reads N_SPLITS from module scope at call time.
N_SPLITS = 5
# Train on everything that is not SJC. For the test set, also predict the one
# SJC collection that was reset to 0 after the SJC model.
use_train_index = train_base.area!="SJC"
use_test_index = (test_base.area!="SJC") | (test_base.collectionName=="2021-04-02-US-SJC-1")
oof, models = fit_lgbm(processing(train_base[use_train_index].reset_index(drop=True), visibility), 
                       train_base.loc[use_train_index, 't_isOutlier'].reset_index(drop=True), 
                       train_df=train_base[use_train_index].reset_index(drop=True),
                       params=params)
pred = predict_lgbm(models, processing(test_base[use_test_index].reset_index(drop=True), visibility))

# Threshold the probabilities at 0.5 to get the predicted outlier flag.
train_base.loc[use_train_index, 'isOutlier'] = (oof > 0.5).astype(int)
test_base.loc[use_test_index, 'isOutlier'] = (pred > 0.5).astype(int)


# Out-of-fold accuracy and confusion matrix (predictions are the first argument).
print('score', accuracy_score((oof>0.5).astype(int), train_base.loc[use_train_index, 't_isOutlier']))
print(confusion_matrix((oof>0.5).astype(int), train_base.loc[use_train_index, 't_isOutlier']))

### Interpolate outliers

In [None]:
# Blank out the position of every predicted outlier ...
train_base.loc[train_base.isOutlier==1, ["latDeg", "lngDeg", "heightAboveWgs84EllipsoidM"]] = np.nan
test_base.loc[test_base.isOutlier==1, ["latDeg", "lngDeg", "heightAboveWgs84EllipsoidM"]] = np.nan

# ... and fill the gaps per phone by interpolating over the remaining rows.
# limit_direction='both' also fills gaps at the start and end of a track.
# Concatenating the groups regroups the rows by phone and resets the index.
dfs = []
for phone, phone_df in train_base.groupby("phone"):
    phone_df.loc[:, ["latDeg", "lngDeg", 'heightAboveWgs84EllipsoidM']] \
        = phone_df.loc[:, ["latDeg", "lngDeg", "heightAboveWgs84EllipsoidM"]].interpolate(limit_area=None, limit_direction='both')
    dfs.append(phone_df.copy())
train_base = pd.concat(dfs).reset_index(drop=True)

# Same treatment for the test set.
dfs = []
for phone, phone_df in test_base.groupby("phone"):
    phone_df.loc[:, ["latDeg", "lngDeg", "heightAboveWgs84EllipsoidM"]] \
         = phone_df.loc[:, ["latDeg", "lngDeg", "heightAboveWgs84EllipsoidM"]].interpolate(limit_area=None, limit_direction='both')
    dfs.append(phone_df.copy())
test_base = pd.concat(dfs).reset_index(drop=True)

# The outliers have been repaired, so clear the flag. The Kalman step later
# reads `isOutlier` and would otherwise blank these rows again.
train_base["isOutlier"] = 0
test_base["isOutlier"] = 0

In [None]:
# Re-score the training set after outlier interpolation (refreshes `meter`).
train_base = check_meter(train_base)

# StopMean

In [None]:
def processing(input_df: pd.DataFrame):
    """Build per-phone motion features for the stop classifier.

    For every non-zero offset ``i`` in ``-15..15`` (within each ``phone``
    group) four columns are produced, in this order:
    ``diff{i}_latDeg``, ``diff{i}_lngDeg``, ``change{i}_latDeg``,
    ``change{i}_lngDeg`` (difference and percentage change of the position).

    Returns a frame with the same row order as ``input_df``.
    """
    shift_list = list(range(-15, 16, 1))
    position = input_df.groupby('phone')[['latDeg', 'lngDeg']]

    # Collect the blocks and concatenate once; growing a frame with concat in
    # the loop re-copies every earlier column on each iteration.
    blocks = []
    for i in shift_list:
        if i == 0:
            continue
        blocks.append(position.diff(i).add_prefix(f'diff{i}_'))
        blocks.append(position.pct_change(i).add_prefix(f'change{i}_'))

    return pd.concat(blocks, axis=1)

@contextmanager
def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    """Measure and report how long the ``with`` body took.

    The elapsed time in seconds is rendered through ``format_str`` with the
    optional ``prefix`` / ``suffix`` attached. The message goes to
    ``logger.info`` if a logger is supplied, otherwise to ``print``. Nothing
    is reported if the body raises.
    """
    pieces = [str(prefix) if prefix else '', format_str, str(suffix) if suffix else '']
    template = ''.join(pieces)

    t0 = time()
    yield
    elapsed = time() - t0

    report = template.format(elapsed)
    if logger:
        logger.info(report)
        return
    print(report)
        

def fit_lgbm(X, y, params: dict=None, verbose=100, seed: int=42):
    """Train one LightGBM classifier per GroupKFold fold and return OOF scores.

    Folds are grouped by the module-level ``train_base['collectionName']``,
    so ``X`` and ``y`` must hold one row per row of ``train_base`` in the same
    order. The number of folds is read from the module-level ``N_SPLITS``.

    Parameters
    ----------
    X, y : features and binary target, row-aligned by position.
    params : keyword arguments for ``lgbm.LGBMClassifier`` (``None`` means
        library defaults).
    verbose : logging period passed to ``fit``. It is also used as the
        early-stopping patience, as in the original code.
    seed : unused; kept for interface compatibility.

    Returns
    -------
    (oof_pred, models) : out-of-fold probability of class 1 for each row, and
    the list of fitted models (one per fold).
    """
    # `**None` would raise a TypeError; fall back to the library defaults.
    params = {} if params is None else params
    models = []
    oof_pred = np.zeros(len(y), dtype=np.float64)

    kf = GroupKFold(n_splits=N_SPLITS)
    for i, (idx_train, idx_valid) in enumerate(kf.split(X, y, train_base['collectionName'])):
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        model = lgbm.LGBMClassifier(**params)
        with timer(prefix='fit fold={} '.format(i + 1)):
            # NOTE(review): `early_stopping_rounds` / `verbose` as fit()
            # arguments were removed in LightGBM 4 - confirm the installed
            # version before upgrading.
            model.fit(x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                early_stopping_rounds=verbose,
                eval_metric='logloss',
                verbose=verbose)

        pred_i = model.predict_proba(x_valid)[:, 1]
        # Write by position. The original indexed with `x_valid.index`, which
        # is only correct when X carries a 0..n-1 RangeIndex.
        oof_pred[idx_valid] = pred_i
        models.append(model)

    return oof_pred, models


def predict_lgbm(models, feat_df):
    """Return the mean class-1 probability over all ``models``.

    Every model scores ``feat_df.values`` with ``predict_proba``; the second
    column of each result is kept and the per-row mean across models is
    returned.
    """
    positive_probs = [
        estimator.predict_proba(feat_df.values)[:, 1]
        for estimator in models
    ]
    return np.mean(np.array(positive_probs), axis=0)


def stopmean(input_df: pd.DataFrame):
    """Replace the positions inside each detected stop by their mean position.

    ``input_df`` needs ``phone``, ``stop`` (0/1 flag), ``latDeg`` and
    ``lngDeg`` columns and a 0..n-1 RangeIndex with each phone's rows stored
    contiguously: index labels are used as row positions, as in the original.

    Per phone, isolated single non-stop samples between two stop samples are
    treated as part of the stop. Every maximal run of stop samples gets a
    ``stop_id`` (1, 2, ... across the whole frame; 0 means "moving"), and
    ``latDeg`` / ``lngDeg`` of the rows in a run are overwritten with the mean
    of that run.

    Returns a new frame with the added ``stop_id`` column; ``input_df`` is
    left untouched.
    """
    output_df = input_df.copy()
    stop_index = []  # [start, end) position pairs, one per stop run

    for _, sub_df in input_df.groupby('phone'):
        _index = sub_df.index
        _stops = sub_df['stop'].tolist()
        # Close one-sample gaps: 1,0,1 -> 1,1,1 (applied left to right, so a
        # filled sample can help close the next gap).
        for i in range(1, len(_stops)-1):
            if _stops[i-1] == 1 and  _stops[i] == 0 and  _stops[i+1] == 1:
                _stops[i] = 1

        before = 1 if _stops[0] == 1 else 0

        tmp = []
        if before:
            # The track starts inside a stop.
            tmp.append(_index[0])

        for i, flag in enumerate(_stops):
            if flag == 1 and before == 0:
                tmp.append(_index[i])      # run starts here
            elif flag == 0 and before == 1:
                tmp.append(_index[i])      # run ended just before this row
                stop_index.append(tmp)
                tmp = []
            before = flag

        if tmp:
            # The track ends inside a stop: close the run after the last row.
            tmp.append(_index[-1]+1)
            stop_index.append(tmp)

    # Fill the ids in a plain array and assign the column once. The original
    # used chained assignment (`output_df.iloc[a:b]['stop_id'] = ...`), which
    # writes to a temporary object and is silently a no-op under pandas
    # Copy-on-Write.
    stop_ids = np.zeros(len(output_df), dtype=np.int64)
    for i, (indexi, indexj) in enumerate(stop_index):
        stop_ids[indexi:indexj] = i+1
    output_df['stop_id'] = stop_ids

    stopid2deg = output_df.loc[output_df['stop_id']!=0].groupby('stop_id')[['latDeg', 'lngDeg']].mean()
    stopid2deg.columns = ['stop_latDeg', 'stop_lngDeg']

    output_df = output_df.merge(stopid2deg, on='stop_id', how='left')
    in_stop = output_df['stop_id']!=0
    output_df.loc[in_stop, ['latDeg', 'lngDeg']] = output_df.loc[in_stop, ['stop_latDeg', 'stop_lngDeg']].values

    output_df = output_df.drop(columns=['stop_latDeg', 'stop_lngDeg'])

    return output_df

In [None]:
# Target for the stop classifier: 1 where the ground-truth speed is exactly zero.
train_base['target_stop'] = (train_base['speedMps']==0).astype(int)

# LightGBM hyper-parameters. n_estimators is only an upper bound because
# fit_lgbm trains with early stopping.
params = {
 'reg_alpha': 0.01,
 'reg_lambda': 0.01, 
 'num_leaves': 40,
 'n_estimators': 10000,
 'learning_rate': 0.1,
 'random_state': 42,
 'max_depth': -1
}

# fit_lgbm reads N_SPLITS from module scope at call time.
N_SPLITS = 5
oof, models = fit_lgbm(processing(train_base), train_base['target_stop'], params=params)
pred = predict_lgbm(models, processing(test_base))

# Threshold at 0.5: out-of-fold predictions for train, fold-averaged for test.
train_base['stop'] = (oof > 0.5).astype(int)
test_base['stop'] = (pred > 0.5).astype(int)

# Out-of-fold accuracy and confusion matrix (predictions are the first argument).
print('score', accuracy_score((oof>0.5).astype(int), train_base['target_stop']))
print(confusion_matrix((oof>0.5).astype(int), train_base['target_stop']))

# Drop the temporary target column and free the fitted models.
del train_base['target_stop'] 
del models

In [None]:
# Collapse every predicted stop to its mean position (adds a `stop_id` column).
train_base = stopmean(train_base)
test_base = stopmean(test_base)

In [None]:
# Re-score the training set after the stop-mean step (refreshes `meter`).
train_base = check_meter(train_base)

# Kalman Filter

In [None]:
def interpolate_beforekm(input_df: pd.DataFrame):
    """Resample every phone track onto a regular 1000-unit time grid.

    For each phone a grid of ``millisSinceGpsEpoch`` values is built from the
    first to the last timestamp of that phone (as returned by groupby
    ``first`` / ``last``) with a step of 1000. Grid times missing from the
    data are inserted as new rows, and ``latDeg``, ``lngDeg`` and
    ``isOutlier`` are interpolated linearly in time (``method='index'``).

    The result carries a ``target`` column: 1 for rows whose timestamp lies on
    the grid, 0 for original rows that fall between grid points.

    Returns a frame with columns ``phone``, ``millisSinceGpsEpoch``,
    ``latDeg``, ``lngDeg``, ``isOutlier``, ``target``, sorted by
    (phone, millisSinceGpsEpoch) with a fresh RangeIndex. An empty input
    yields an empty frame with these columns.
    """
    columns = ['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg', 'isOutlier']
    if len(input_df) == 0:
        return pd.DataFrame(columns=columns + ['target'])

    first_dict = dict(input_df.groupby('phone')['millisSinceGpsEpoch'].first())
    last_dict = dict(input_df.groupby('phone')['millisSinceGpsEpoch'].last())

    # Collect the per-phone frames and concatenate once. The original grew
    # `time_df` inside the loop, re-copying all earlier phones each time.
    frames = []
    for phone in input_df['phone'].unique():
        # Filter this phone's rows once and reuse them below.
        phone_rows = input_df.loc[input_df['phone']==phone, columns]
        _list = np.arange(first_dict[phone], last_dict[phone]+1000, 1000, dtype=int)
        times = phone_rows['millisSinceGpsEpoch'].unique()

        grid = pd.DataFrame({
            'phone': phone,
            'millisSinceGpsEpoch': _list})
        # Only add grid times that are not already present in the data.
        _df = pd.concat([
            grid.loc[~grid['millisSinceGpsEpoch'].isin(times)],
            phone_rows])

        _df['target'] = 0
        _df.loc[_df['millisSinceGpsEpoch'].isin(_list), 'target'] = 1

        # Interpolate against the timestamp, so uneven gaps are weighted
        # correctly.
        _df = _df.sort_values('millisSinceGpsEpoch')
        _df.index = _df['millisSinceGpsEpoch'].values
        for col in ('latDeg', 'lngDeg', 'isOutlier'):
            _df[col] = _df[col].interpolate(method='index', limit_direction='both')

        frames.append(_df)

    time_df = pd.concat(frames)
    time_df = time_df.sort_values(['phone', 'millisSinceGpsEpoch']).reset_index(drop=True)
    return time_df


def kalmanfilter_interpolate(input_df: pd.DataFrame, base_df: pd.DataFrame, params: list):
    """Kalman-smooth the on-grid rows, then interpolate back to the off-grid rows.

    ``input_df`` is a resampled frame with a ``target`` column (1 = on the
    regular time grid, 0 = off-grid). Rows with target 1 are smoothed by
    ``apply_kf_smoothing``; rows with target 0 are blanked and re-filled per
    phone by time-indexed interpolation between the smoothed positions.

    ``input_df`` is modified in place (values and index). The result is
    ``base_df`` without ``latDeg`` / ``lngDeg`` / ``isOutlier``, inner-merged
    with the updated ``input_df`` on (phone, millisSinceGpsEpoch).
    """
    position_cols = ['latDeg', 'lngDeg']
    on_grid = (input_df['target']==1)

    smoothed = apply_kf_smoothing(
        input_df.loc[on_grid].reset_index(drop=True),
        params
    )
    input_df.loc[on_grid, position_cols] = smoothed[position_cols].values

    # Off-grid rows are discarded and rebuilt from their smoothed neighbours.
    for col in position_cols:
        input_df.loc[input_df['target']==0, col] = np.nan

    input_df.index = input_df['millisSinceGpsEpoch'].values
    for phone in input_df['phone'].unique():
        phone_rows = (input_df['phone']==phone)
        for col in position_cols:
            input_df.loc[phone_rows, col] = input_df.loc[phone_rows, col].interpolate(method='index', limit_direction='both')

    stripped = base_df.drop(columns=['latDeg', 'lngDeg', 'isOutlier'], axis=1)
    return stripped.merge(input_df, on=['phone', 'millisSinceGpsEpoch'])


###KALMAN FILTER###
def make_shifted_matrix(vec):
    """Build an upper-triangular matrix whose i-th row is ``vec`` shifted right by i.

    ``vec`` must be a list. Row i is ``i`` zeros followed by the first
    ``len(vec) - i`` entries of ``vec``. Returns a numpy array.
    """
    n = len(vec)
    rows = [[0] * shift + vec[:n - shift] for shift in range(n)]
    return np.array(rows)


def make_state_vector(T, size):
    """First row of the state-transition matrix for a 2-D kinematic model.

    The state interleaves the two coordinates, so the row is
    ``[1, 0, T, 0, T**2/2, 0, T**3/6, 0, ...]`` truncated to ``size`` entries:
    the Taylor coefficients ``T**k / k!`` in the even slots and zeros in the
    odd slots. For ``size`` below 2 the two-element ``[1, 0]`` is returned.

    Bug fix: the original updated the running term with ``T *= T / step``,
    which multiplies by the already-updated value. That gives ``T**4/12``
    instead of ``T**3/6`` for the third coefficient. Results for ``size <= 6``
    are unchanged.
    """
    vector = [1, 0]
    term = T      # current coefficient T**k / k!
    step = 2      # next factorial divisor (k + 1)
    for i in range(size - 2):
        if i % 2 == 0:
            vector.append(term)
            # Advance T**k/k! -> T**(k+1)/(k+1)! using the *original* T.
            term *= T / step
            step += 1
        else:
            vector.append(0)
    return vector


def make_noise_vector(noise, size):
    """Return ``size`` noise levels, halving the level after every pair.

    Entries 0 and 1 equal ``noise``, entries 2 and 3 equal ``noise * 0.5``,
    entries 4 and 5 ``noise * 0.25``, and so on.
    """
    levels = []
    current = noise
    for position in range(size):
        starts_new_pair = position > 0 and position % 2 == 0
        if starts_new_pair:
            current = current * 0.5
        levels.append(current)
    return levels


def make_kalman_filter(T, size, noise, obs_noise):
    """Assemble a ``simdkalman.KalmanFilter`` for a 2-D track.

    Parameters
    ----------
    T : time-step parameter fed to ``make_state_vector``.
    size : dimension of the state vector.
    noise : base process-noise level (see ``make_noise_vector``).
    obs_noise : variance placed on the diagonal of the observation noise.

    Only the first two state components are observed. A constant 1e-9 is
    added to every entry of both noise matrices.
    """
    transition = make_shifted_matrix(make_state_vector(T, size))

    process_cov = np.diag(make_noise_vector(noise, size)) + np.ones(size) * 1e-9

    first_row = [1] + [0] * (size - 1)
    second_row = [0, 1] + [0] * (size - 2)
    observation = np.array([first_row, second_row])

    observation_cov = np.diag([obs_noise] * 2) + np.ones(2) * 1e-9

    return simdkalman.KalmanFilter(
        state_transition=transition,
        process_noise=process_cov,
        observation_model=observation,
        observation_noise=observation_cov)


def apply_kf_smoothing(df, params):
    """Kalman-smooth ``latDeg`` / ``lngDeg`` of every phone track in ``df``.

    Parameters
    ----------
    df : frame with ``phone``, ``latDeg``, ``lngDeg`` and ``isOutlier``. Rows
        with ``isOutlier > 0.5`` are blanked to NaN before the smoother runs.
        The frame is modified in place and also returned.
    params : ``[T, half_size, noise, obs_noise]``; the state dimension is
        ``2 * half_size`` (see ``make_kalman_filter``).
    """
    # Blank the outliers once. The original repeated this identical,
    # phone-independent assignment once per phone.
    outlier_rows = df.isOutlier>0.5
    df.loc[outlier_rows, 'latDeg'] = np.nan
    df.loc[outlier_rows, 'lngDeg'] = np.nan

    T, half_size, noise, obs_noise = params
    size = half_size * 2
    kf = make_kalman_filter(T, size, noise, obs_noise)

    unique_paths = df['phone'].unique()
    for phone in tqdm(unique_paths):
        _index = (df['phone']==phone)
        data = df.loc[_index,['latDeg', 'lngDeg']].to_numpy()
        # Reshape to (1, n_steps, 2): a single series of 2-D observations.
        data = data.reshape(1, len(data), 2)
        smoothed = kf.smooth(data)
        # The first two state components are the smoothed lat / lng.
        df.loc[_index, 'latDeg'] = smoothed.states.mean[0, :, 0]
        df.loc[_index, 'lngDeg'] = smoothed.states.mean[0, :, 1]

    return df

In [None]:
# Resample every track onto the regular time grid expected by the smoother.
train_timedf = interpolate_beforekm(train_base)
test_timedf = interpolate_beforekm(test_base)

# Kalman parameters in the order [T, half_size, noise, obs_noise]
# (unpacked in apply_kf_smoothing).
params = [1.5, 2, 1.3376883684997819e-07, 9.861453983492513e-07]
trainkm_df = kalmanfilter_interpolate(train_timedf, train_base, params)
testkm_df = kalmanfilter_interpolate(test_timedf, test_base, params)

# The resampled frames were modified in place and are no longer needed.
del train_timedf
del test_timedf
del params

In [None]:
# Re-score the training set after Kalman smoothing (refreshes `meter`).
trainkm_df = check_meter(trainkm_df)

# collectionNameMean

In [None]:
def collectionNamemean(df,cols:list,weight_col=None):
    """Average ``cols`` across the phones of a collection at each timestamp.

    The frame is pivoted to one row per (collectionName, millisSinceGpsEpoch)
    with one column per phoneName. Within each collection, gaps between a
    phone's samples are interpolated over time (``limit_area='inside'``, so
    no extrapolation beyond a phone's first/last sample).

    Parameters
    ----------
    df : frame with ``collectionName``, ``phoneName``, ``millisSinceGpsEpoch``
        and the columns in ``cols`` (plus ``weight_col`` if given).
    cols : names of the value columns to average.
    weight_col : optional column of per-row weights. If given, a weighted
        mean across phones is used, otherwise a plain mean.

    Returns
    -------
    A copy of ``df`` in which ``cols`` have been replaced by the per-timestamp
    averages, joined back on ``millisSinceGpsEpoch`` only.

    NOTE(review): the join key does not include ``collectionName``, so this
    presumably relies on timestamps being unique across collections - confirm.
    """
    df_copy = df.copy()

    if weight_col != None:
        # Pivot to (collection, time) x phone, then interpolate each phone's
        # values over time inside every collection.
        tmp_base = pd.pivot_table(data=df_copy,index=['collectionName','millisSinceGpsEpoch'],columns=['phoneName'],values=cols+[weight_col])\
        .reset_index(level='collectionName').groupby('collectionName')\
        .apply(lambda x:x.interpolate(limit_area='inside',method='index'))
        for col in cols:
            # value * weight, divided by the row-wise sum of the weights over
            # phones; summing over phones below then gives the weighted mean.
            tmp_base[col] = tmp_base[col] * tmp_base[weight_col]
            tmp_base[col] = tmp_base[col].values / tmp_base[weight_col].sum(axis=1).values.reshape(-1,1)

        meta_df = tmp_base[cols[0]].sum(axis=1,skipna=True).reset_index().copy()
        meta_df.columns = ['millisSinceGpsEpoch',cols[0]]

        if len(cols) >=2:
            for i in range(1,len(cols)):
                # reset_index() on the unnamed row-sum Series names the value
                # column 0; rows line up with meta_df by position.
                meta_df[cols[i]] = tmp_base[cols[i]].sum(axis=1,skipna=True).reset_index()[0]

    else:
        # Unweighted variant: same pivot and interpolation, plain mean over phones.
        tmp_base = pd.pivot_table(data=df_copy,index=['collectionName','millisSinceGpsEpoch'],columns=['phoneName'],values=cols)\
        .reset_index(level='collectionName').groupby('collectionName')\
        .apply(lambda x:x.interpolate(limit_area='inside',method='index'))
    
        meta_df = tmp_base[cols[0]].mean(axis=1,skipna=True).reset_index().copy()
        meta_df.columns = ['millisSinceGpsEpoch',cols[0]]
    
        if len(cols) >=2:
            for i in range(1,len(cols)):
                meta_df[cols[i]] = tmp_base[cols[i]].mean(axis=1,skipna=True).reset_index()[0]

    # Replace the original value columns by the averaged ones.
    output_df = pd.merge(df_copy.drop(columns=cols),meta_df,how='left',on=['millisSinceGpsEpoch'])
    return output_df


In [None]:
# Bring both frames into a deterministic row order with a fresh RangeIndex.
trainkm_df = trainkm_df.sort_values(["collectionName", "phoneName", "millisSinceGpsEpoch"])
trainkm_df = trainkm_df.reset_index(drop=True)

testkm_df = testkm_df.sort_values(["phone", "millisSinceGpsEpoch"])
testkm_df = testkm_df.reset_index(drop=True)

# Per-row weight: the sum of 1/sigma^2 over the 100 pseudorange sigma columns.
# Missing sigmas become inf and therefore contribute 0.
# NOTE(review): the addition aligns on the index, so this assumes the rows of
# train_pseudorange.csv are already in the same order as trainkm_df (the test
# file is explicitly sorted below, the train file is not) - TODO confirm.
satelite_train = pd.read_csv("../input/satelite/train_pseudorange.csv").fillna(np.inf)
range_colums = [f"pseudoranges_sigma_{i}" for i in range(100)]
trainkm_df["acc_satelite"] = 0
for c in range_colums:
    trainkm_df["acc_satelite"] += 1 / satelite_train[c]**2

# Same weight for the test set, after sorting the file like testkm_df.
satelite_test = pd.read_csv("../input/satelite/test_pseudorange.csv")
satelite_test = satelite_test.sort_values(["phone", "millisSinceGpsEpoch"])
satelite_test = satelite_test.reset_index(drop=True).fillna(np.inf)
range_colums = [f"pseudoranges_sigma_{i}" for i in range(100)]
testkm_df["acc_satelite"] = 0
for c in range_colums:
    testkm_df["acc_satelite"] += 1 / satelite_test[c]**2

# Weighted average of the phones of a collection at each timestamp.
trainmean_df = collectionNamemean(trainkm_df, ['latDeg','lngDeg'], 'acc_satelite')
testmean_df = collectionNamemean(testkm_df, ['latDeg','lngDeg'], 'acc_satelite')

In [None]:
# Re-score the training set after the per-collection averaging.
trainmean_df = check_meter(trainmean_df)

# back shift

In [None]:
def get_Dm_back(lat1, lon1, lat2, lon2, D):
    """Find the point at haversine distance ``D`` from point 2 towards point 1.

    Starting from (lat1, lon1), the offset from (lat2, lon2) is rescaled in
    degree space by ``D / current_distance`` until the distance last measured
    is within 0.0001 of ``D``. If both points coincide there is no direction
    to move in and (lat2, lon2) is returned unchanged.

    Returns (lat, lon) in degrees.
    """
    separation = calc_haversine(lat1, lon1, lat2, lon2)
    if separation == 0:
        return lat2, lon2

    while abs(D - separation) > 0.0001:
        separation = calc_haversine(lat1, lon1, lat2, lon2)
        scale = D/separation
        new_lat = lat2 - scale * (lat2 - lat1)
        new_lon = lon2 - scale * (lon2 - lon1)
        lat1, lon1 = new_lat, new_lon

    return lat1, lon1


def backshift(input_df: pd.DataFrame, d=0.42):
    """Shift every position back by ``d`` towards the previous position.

    Within each phone track (rows in stored order), every row except the
    first is replaced by the point at distance ``d`` from it in the direction
    of the preceding row (see ``get_Dm_back``). The first row of a track
    stays where it is.

    Returns a new frame, grouped by phone with a fresh RangeIndex, where
    ``latDeg`` / ``lngDeg`` hold the shifted positions (they become the last
    two columns).
    """
    move_back = np.vectorize(get_Dm_back)

    dfs = []
    for pName, phone_df in tqdm(input_df.groupby(["phone"])):
        phone_df = phone_df.reset_index(drop=True).copy()
        phone_df["latDeg_moved"] = phone_df.latDeg
        phone_df["lngDeg_moved"] = phone_df.lngDeg

        # A one-row track has no previous point. np.vectorize also raises on
        # size-0 inputs, so such tracks are passed through unchanged.
        if len(phone_df) > 1:
            lat = phone_df.latDeg.to_numpy()
            lng = phone_df.lngDeg.to_numpy()
            lat_moved, lng_moved = move_back(lat[:-1], lng[:-1], lat[1:], lng[1:], d)
            phone_df.loc[1:, "latDeg_moved"] = lat_moved
            phone_df.loc[1:, "lngDeg_moved"] = lng_moved

        dfs.append(phone_df)

    output_df = pd.concat(dfs).reset_index(drop=True)
    output_df = output_df.drop(columns=['latDeg', 'lngDeg'])
    output_df = output_df.rename(columns={'latDeg_moved': 'latDeg', 'lngDeg_moved': 'lngDeg'})

    return output_df

In [None]:
# Stop mean: run again on the averaged tracks, reusing the `stop` flags
# predicted earlier (stop_id is recomputed).
trainmean_df = stopmean(trainmean_df)
testmean_df = stopmean(testmean_df)

# Back shift: move every position back towards its predecessor by the
# default distance of `backshift`.
trainshift_df = backshift(trainmean_df)
testshift_df = backshift(testmean_df)

In [None]:
# Re-score the training set after stop-mean and back shift.
trainshift_df = check_meter(trainshift_df)

# snap to grid & submission

In [None]:
def snap_to_grid(df,threshhold):
    """Snap positions to the nearest known waypoint when it is close enough.

    For every row the nearest waypoint is looked up with the per-area model in
    the module-level ``knn_dict`` (its prediction is used as a row label into
    the module-level ``wps`` table). Rows whose haversine distance to that
    waypoint is below ``threshhold`` get the waypoint's ``latDeg`` / ``lngDeg``.

    Parameters
    ----------
    df : frame with ``phone``, ``latDeg``, ``lngDeg`` and optionally ``area``.
        If ``area`` is missing it is derived from the second-to-last
        dash-separated token of ``phone`` and removed again before returning.
    threshhold : maximum distance for snapping.

    Returns
    -------
    A new frame with the same columns as ``df``; ``df`` itself is not modified
    (the original wrote helper columns into the caller's frame).
    """
    df = df.copy()
    # Nothing to snap. An empty frame would also make pd.concat fail below.
    if df.empty:
        return df

    no_area = 'area' not in df.columns
    if no_area:
        df['area'] = df['phone'].apply(lambda x:x.split('-')[-2])

    dfs = []
    for area,gdf in tqdm(df.groupby('area')):
        if area in ['MTV', 'SF', 'RWC', 'SVL', 'SJC']:
            nearest = knn_dict[area].predict(gdf[['latDeg','lngDeg']]).astype(int)
        else:
            # Areas that only exist in the test set have no model: use
            # waypoint 0 as a placeholder. This is fine because such rows are
            # rejected by the distance threshold.
            nearest = 0
        dfs.append(pd.Series(nearest, index=gdf.index, name='neighbored'))
    # Restore the original row order before using the labels positionally.
    neighbored = pd.concat(dfs).sort_index().values

    df[["latDeg_near", "lngDeg_near", "millisSinceGpsEpoch_near"]] = wps.loc[neighbored, ["latDeg", "lngDeg", "millisSinceGpsEpoch"]].values
    df["d_near"] = np.vectorize(calc_haversine)(df.latDeg, df.lngDeg, df.latDeg_near, df.lngDeg_near)
    close_enough = df['d_near']<threshhold
    df.loc[close_enough,'latDeg'] = df.loc[close_enough,'latDeg_near']
    df.loc[close_enough,'lngDeg'] = df.loc[close_enough,'lngDeg_near']

    helper_cols = ['latDeg_near','lngDeg_near','millisSinceGpsEpoch_near','d_near']
    if no_area:
        helper_cols.append('area')

    return df.drop(columns=helper_cols)


# Waypoint table and per-area nearest-neighbour models used by snap_to_grid.
wps = pd.read_csv('../input/snap-data/train_waypoint.csv')
# NOTE(security): pickle.load can execute arbitrary code while loading; only
# use a knn_dict.pkl that comes from a trusted source.
with open('../input/snap-data/knn_dict.pkl', 'rb') as dic:
    knn_dict = pickle.load(dic)

In [None]:
# Snap SJC collections with a threshold of 100 and all other collections
# with a threshold of 2. The two parts are concatenated SJC-first, so the
# row order of the result differs from the input.
sjc_train_index = trainshift_df['collectionName'].map(lambda x: 'SJC' in x)
trainsnap_df = pd.concat([
    snap_to_grid(trainshift_df[sjc_train_index], 100),
    snap_to_grid(trainshift_df[~sjc_train_index], 2)
]).reset_index(drop=True)

# Same split and thresholds for the test set.
sjc_test_index = testshift_df['collectionName'].map(lambda x: 'SJC' in x)
testsnap_df = pd.concat([
    snap_to_grid(testshift_df[sjc_test_index], 100),
    snap_to_grid(testshift_df[~sjc_test_index], 2)
]).reset_index(drop=True)

In [None]:
# Final training score after snapping to the waypoint grid.
trainsnap_df = check_meter(trainsnap_df)

In [None]:
# Build the submission: keep the sample's rows and key columns and take the
# positions from the final test frame. This is an inner merge, so a sample
# row without a matching prediction would be dropped.
sub = sample.drop(columns=['latDeg', 'lngDeg'], axis=1).merge(
        testsnap_df[['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']],
        on=['phone', 'millisSinceGpsEpoch']
)
sub.to_csv('submission.csv', index=False)

In [None]:
# Plot the back-shifted training tracks (before snapping), coloured by collection.
visualize_trafic(trainshift_df, color='collectionName')

In [None]:
# Plot the back-shifted test tracks (before snapping), coloured by collection.
visualize_trafic(testshift_df, color='collectionName')