This notebook implements the contents of [my discussion](https://www.kaggle.com/competitions/smartphone-decimeter-2022/discussion/322596).  
Averaging the positions of all phones recorded in the same drive ("phones mean") improves the score on the train set, but it cannot improve the test set because each test drive contains only one phone.

In [None]:
import os
from glob import glob
from dataclasses import dataclass

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from scipy.interpolate import InterpolatedUnivariateSpline

# Location of the competition data and of its train split.
INPUT_PATH = '../input/smartphone-decimeter-2022'
TRAIN_PATH = os.path.join(INPUT_PATH, 'train')

# WGS84 reference-ellipsoid parameters used by ECEF_to_BLH.
# The two axes are lengths (metres in the WGS84 definition); the two
# eccentricities are dimensionless squared values (e^2 and e'^2).
WGS84_SEMI_MAJOR_AXIS = 6378137.0
WGS84_SEMI_MINOR_AXIS = 6356752.314245
WGS84_SQUARED_FIRST_ECCENTRICITY  = 6.69437999013e-3
WGS84_SQUARED_SECOND_ECCENTRICITY = 6.73949674226e-3

# Sphere radius used by haversine_distance; the returned distance is in the
# same unit as this value (presumably metres, i.e. the mean Earth radius).
HAVERSINE_RADIUS = 6_371_000

In [None]:
@dataclass
class ECEF:
    """Cartesian position(s) stored as three separate component arrays.

    The three components are expected to share one shape; ``to_numpy`` stacks
    them and ``from_numpy`` builds an instance from an array whose last axis
    holds the (x, y, z) triple.
    """
    # Component arrays (np.ndarray is the array type; np.array is a function).
    x: np.ndarray
    y: np.ndarray
    z: np.ndarray

    def to_numpy(self):
        """Return the components stacked along a new leading axis: (3, ...)."""
        return np.stack([self.x, self.y, self.z], axis=0)

    @staticmethod
    def from_numpy(pos):
        """Build an ECEF from an array whose last axis has length 3.

        Only the last (component) axis is removed, so an input of shape
        (N, 3) always yields components of shape (N,), including N == 1.
        An unrestricted ``np.squeeze`` would collapse a single-row input to
        0-d scalars and silently drop the batch dimension.

        Raises:
            ValueError: if ``pos`` is 0-d or its last axis is not of length 3.
        """
        pos = np.asarray(pos)
        if pos.ndim == 0 or pos.shape[-1] != 3:
            raise ValueError(
                f'expected an array with a last axis of length 3, got shape {pos.shape}'
            )
        return ECEF(x=pos[..., 0], y=pos[..., 1], z=pos[..., 2])

@dataclass
class BLH:
    """Geodetic position(s): latitude, longitude and height.

    ECEF_to_BLH fills ``lat``/``lng`` with radians (results of ``arctan2``),
    and haversine_distance applies trigonometric functions to them directly,
    so angles are expected in radians.
    """
    # Annotated with np.ndarray (the array type) rather than np.array, which
    # is a factory function. Callers in this file also pass a plain scalar
    # for hgt; dataclasses do not enforce annotations, so that keeps working.
    lat: np.ndarray
    lng: np.ndarray
    hgt: np.ndarray

def ECEF_to_BLH(ecef):
    """Convert ECEF coordinates to geodetic latitude, longitude and height.

    Latitude comes from a closed-form (non-iterative) approximation, the
    formula commonly attributed to Bowring, on the WGS84 ellipsoid described
    by the module-level constants.

    Args:
        ecef: object exposing ``x``, ``y`` and ``z`` (scalars or arrays of a
            common shape), in the same length unit as the WGS84 axes.

    Returns:
        BLH with ``lat`` and ``lng`` in radians and ``hgt`` in the length
        unit of the inputs.
    """
    a = WGS84_SEMI_MAJOR_AXIS
    b = WGS84_SEMI_MINOR_AXIS
    e2  = WGS84_SQUARED_FIRST_ECCENTRICITY
    e2_ = WGS84_SQUARED_SECOND_ECCENTRICITY
    x = ecef.x
    y = ecef.y
    z = ecef.z
    # Distance from the rotation (z) axis.
    r = np.sqrt(x**2 + y**2)
    # Auxiliary angle that seeds the closed-form latitude expression.
    t = np.arctan2(z * (a/b), r)
    B = np.arctan2(z + (e2_*b)*np.sin(t)**3, r - (e2*a)*np.cos(t)**3)
    L = np.arctan2(y, x)
    # Height via h = r*cos(B) + z*sin(B) - a*sqrt(1 - e2*sin(B)^2).
    # It is algebraically equal to r/cos(B) - N (N = prime-vertical radius)
    # but stays finite and correct at the poles, where cos(B) -> 0 and the
    # quotient form degenerates to -N.
    sin_B = np.sin(B)
    H = r * np.cos(B) + z * sin_B - a * np.sqrt(1 - e2 * sin_B**2)
    return BLH(lat=B, lng=L, hgt=H)

def haversine_distance(blh_1, blh_2):
    """Great-circle distance between two sets of points on a sphere.

    Both arguments expose ``lat`` and ``lng`` in radians; heights are
    ignored. The result is in the unit of HAVERSINE_RADIUS and follows
    numpy broadcasting of the inputs.
    """
    lat_1, lat_2 = blh_1.lat, blh_2.lat
    delta_lat = lat_2 - lat_1
    delta_lng = blh_2.lng - blh_1.lng

    # Haversine of the central angle between the two points.
    lat_term = np.sin(delta_lat/2)**2
    lng_term = np.cos(lat_1) * np.cos(lat_2) * np.sin(delta_lng/2)**2
    hav = lat_term + lng_term

    return (2 * HAVERSINE_RADIUS) * np.arcsin(np.sqrt(hav))

def pandas_haversine_distance(df1, df2):
    """Row-wise haversine distance between two latitude/longitude tables.

    Both frames must provide 'LatitudeDegrees' and 'LongitudeDegrees'
    columns. Rows are paired by position, not by index or timestamp.
    """
    def _to_radian_blh(frame):
        # Degrees -> radians; height is irrelevant for the haversine formula.
        lat_deg = frame['LatitudeDegrees'].to_numpy()
        lng_deg = frame['LongitudeDegrees'].to_numpy()
        return BLH(lat=np.deg2rad(lat_deg), lng=np.deg2rad(lng_deg), hgt=0)

    first = _to_radian_blh(df1)
    second = _to_radian_blh(df2)
    return haversine_distance(first, second)


def ecef_to_lat_lng(tripID, gnss_df, UnixTimeMillis):
    """Interpolate WLS ECEF positions to latitude/longitude at given times.

    Args:
        tripID: value written to the 'tripId' column of the result.
        gnss_df: frame with 'utcTimeMillis' and the three
            'WlsPosition{X,Y,Z}EcefMeters' columns; it may hold several rows
            per timestamp and rows with missing positions.
        UnixTimeMillis: timestamps at which positions are evaluated.

    Returns:
        DataFrame with 'tripId', 'UnixTimeMillis', 'LatitudeDegrees' and
        'LongitudeDegrees'.
    """
    ecef_columns = ['WlsPositionXEcefMeters', 'WlsPositionYEcefMeters', 'WlsPositionZEcefMeters']
    columns = ['utcTimeMillis'] + ecef_columns
    # Drop missing positions BEFORE de-duplicating, so an epoch whose first
    # row is NaN still contributes one of its valid rows. Then sort, because
    # the spline constructor requires increasing x values.
    ecef_df = (gnss_df[columns]
               .dropna()
               .drop_duplicates(subset='utcTimeMillis')
               .sort_values('utcTimeMillis')
               .reset_index(drop=True))
    ecef = ECEF.from_numpy(ecef_df[ecef_columns].to_numpy())
    blh  = ECEF_to_BLH(ecef)

    TIME = ecef_df['utcTimeMillis'].to_numpy()
    # ext=3: outside the observed time range, return the boundary value
    # instead of extrapolating the spline.
    lat = InterpolatedUnivariateSpline(TIME, blh.lat, ext=3)(UnixTimeMillis)
    lng = InterpolatedUnivariateSpline(TIME, blh.lng, ext=3)(UnixTimeMillis)
    return pd.DataFrame({
        'tripId' : tripID,
        'UnixTimeMillis'   : UnixTimeMillis,
        'LatitudeDegrees'  : np.degrees(lat),
        'LongitudeDegrees' : np.degrees(lng),
    })

def calc_score(tripID, pred_df, gt_df):
    """Score a trip as the average of the 50th and 95th percentile errors.

    Errors are the row-wise haversine distances between ``pred_df`` and
    ``gt_df``. ``tripID`` is accepted but not used in the computation.
    """
    errors = pandas_haversine_distance(pred_df, gt_df)
    p50 = np.quantile(errors, 0.50)
    p95 = np.quantile(errors, 0.95)
    return (p50 + p95) / 2

In [None]:
def _mean_over_phones(frame, column):
    """Average `column` across all tripIds at every timestamp of `frame`.

    The data is pivoted to one column per tripId, gaps are filled by
    index-based interpolation in both directions, and the row mean is taken.
    Returns a frame with 'UnixTimeMillis' and `column`.
    """
    wide = pd.pivot_table(
        data=frame,
        index=['UnixTimeMillis'],
        columns=['tripId'],
        values=[column]
    )
    wide = wide.interpolate(method='index', limit_direction='both')
    averaged = wide.mean(axis=1).reset_index()
    averaged.columns = ['UnixTimeMillis', column]
    return averaged


score_list = []
for group_dir in tqdm(glob('../input/gsdc2-saito-latlon-baseline/train/*')):
    # Gather the baseline tracks stored one level below this directory
    # (presumably one sub-directory per phone of the same drive).
    tracks = [
        pd.read_csv(os.path.join(sub_dir, 'baseline.csv'))
        for sub_dir in glob(os.path.join(group_dir, '*'))
    ]
    phone_df = pd.concat(tracks)

    lat_df = _mean_over_phones(phone_df, 'LatitudeDegrees')
    lon_df = _mean_over_phones(phone_df, 'LongitudeDegrees')

    # Attach the averaged position to every original (tripId, time) row.
    keys_df = phone_df[['tripId', 'UnixTimeMillis']]
    phonemean_df = keys_df.merge(lat_df, on='UnixTimeMillis', how='left')
    phonemean_df = phonemean_df.merge(lon_df, on='UnixTimeMillis', how='left')

    for tripId, grp_df in phonemean_df.groupby('tripId'):
        gt_path = os.path.join(TRAIN_PATH, tripId, 'ground_truth.csv')
        score = calc_score(tripId, grp_df, pd.read_csv(gt_path))
        print(f'{tripId:<45}: score = {score:.3f}')
        score_list.append(score)

mean_score = np.mean(score_list)
print(f'mean_score = {mean_score:.3f}')

In [None]:
def _mean_over_phones(frame, column):
    """Average `column` across all tripIds at every timestamp of `frame`.

    The data is pivoted to one column per tripId, gaps are filled by
    index-based interpolation in both directions, and the row mean is taken.
    Returns a frame with 'UnixTimeMillis' and `column`.
    """
    wide = pd.pivot_table(
        data=frame,
        index=['UnixTimeMillis'],
        columns=['tripId'],
        values=[column]
    )
    wide = wide.interpolate(method='index', limit_direction='both')
    averaged = wide.mean(axis=1).reset_index()
    averaged.columns = ['UnixTimeMillis', column]
    return averaged


# Keep only the key columns of the sample submission; the coordinates are
# replaced by the predictions computed below.
sub_df = pd.read_csv(os.path.join(INPUT_PATH, 'sample_submission.csv'))
sub_df = sub_df.drop(columns=['LatitudeDegrees', 'LongitudeDegrees'])

pred_dfs  = []
for phone_dir in tqdm(glob('../input/gsdc2-saito-latlon-baseline/test/*')):
    # Gather the baseline tracks stored one level below this directory.
    phone_df = pd.concat([
        pd.read_csv(os.path.join(path, 'baseline.csv'))
        for path in glob(os.path.join(phone_dir, '*'))
    ])

    lat_df = _mean_over_phones(phone_df, 'LatitudeDegrees')
    lon_df = _mean_over_phones(phone_df, 'LongitudeDegrees')

    # Attach the averaged position to every original (tripId, time) row.
    phonemean_df = phone_df[['tripId', 'UnixTimeMillis']]\
                    .merge(lat_df, on='UnixTimeMillis', how='left')\
                    .merge(lon_df, on='UnixTimeMillis', how='left')
    pred_dfs.append(phonemean_df)

# how='left' keeps every row (and the row order) of the sample submission.
# The default inner join would silently drop rows that have no prediction
# and produce a submission with the wrong number of rows.
sub_df = sub_df.merge(pd.concat(pred_dfs), on=['tripId', 'UnixTimeMillis'], how='left')
n_missing = int(sub_df[['LatitudeDegrees', 'LongitudeDegrees']].isna().any(axis=1).sum())
if n_missing:
    print(f'WARNING: {n_missing} submission rows have no prediction')
sub_df.to_csv('submission.csv', index=False)
display(sub_df)