## Summary

This is the program for my first submission (LB: 4.906).

Instead of using the Kalman filter, this program formulates location estimation as a quadratic programming problem.

## Libraries

In [None]:
%%writefile constants.py

import json
import datetime
from collections import defaultdict
import numpy as np

# Reference epoch 1980-01-06, once as a date and once as a naive datetime
# (the names indicate this is used as the GPS time origin).
GPS_ORIGIN_DAY       = datetime.date(1980, 1, 6)
GPS_ORIGIN_DATETIME  = datetime.datetime(1980, 1, 6)
# Whole-second offsets for GLONASS and BeiDou time handling.
# NOTE(review): presumably offsets relative to GPS time -- confirm against the code that consumes them.
GLONASS_LEAP_SECONDS = 18
BEIDOU_LEAP_SECONDS  = 14
# Fixed UTC+3 time zone labelled 'MSK'.
TZ_MSK = datetime.timezone(datetime.timedelta(hours=+3), 'MSK')

# WGS84 reference-ellipsoid parameters (axis lengths presumably in metres).
# The non-squared eccentricities are derived from the squared ones.
WGS84_SEMI_MAJOR_AXIS = 6378137.0
WGS84_SEMI_MINOR_AXIS = 6356752.314245
WGS84_SQUARED_FIRST_ECCENTRICITY  = 6.69437999013e-3
WGS84_SQUARED_SECOND_ECCENTRICITY = 6.73949674226e-3
WGS84_FIRST_ECCENTRICITY  = np.sqrt(WGS84_SQUARED_FIRST_ECCENTRICITY)
WGS84_SECOND_ECCENTRICITY = np.sqrt(WGS84_SQUARED_SECOND_ECCENTRICITY)

# Speed of light (presumably m/s).
LIGHT_SPEED = 299792458.0

# Earth rotation rate and gravitational parameter
# (presumably rad/s and m^3/s^2 respectively -- units are not stated here).
OMEGA_EARTH = 7.2921151467e-5
MU_EARTH    = 3.986005e+14

# Carrier frequencies (values are consistent with Hz). Galileo E1/E5a and
# QZSS J1/J5 reuse the GPS L1/L5 values.
FREQ_GPS_L1  = 1.575420e+09
FREQ_GPS_L5  = 1.176450e+09
FREQ_GAL_E1  = FREQ_GPS_L1
FREQ_GAL_E5A = FREQ_GPS_L5
FREQ_QZS_J1  = FREQ_GPS_L1
FREQ_QZS_J5  = FREQ_GPS_L5
FREQ_BDS_B1I = 1.561098e+09
# GLONASS G1 nominal frequency and a per-channel step.
# NOTE(review): presumably combined as NOMINAL + k * DELTA for channel k -- confirm at the use site.
FREQ_GLO_G1_NOMINAL = 1602.00 * 1e+6
FREQ_GLO_G1_DELTA   = 562.5 * 1e+3

# Constellation name -> integer constellation code.
# NOTE(review): presumably matches the constellationType column of the raw logs -- confirm.
CONSTELLATION_TYPE_MAP = {
    'GPS'     : 1,
    'GLONASS' : 3,
    'QZSS'    : 4,
    'BEIDOU'  : 5,
    'GALILEO' : 6,
}

# Bit index -> human-readable name of a raw measurement state flag.
RAW_STATE_BIT_MAP = {
     0: "Code Lock",
     1: "Bit Sync",
     2: "Subframe Sync",
     3: "Time Of Week Decoded State",
     4: "Millisecond Ambiguity",
     5: "Symbol Sync",
     6: "GLONASS String Sync",
     7: "GLONASS Time Of Day Decoded",
     8: "BEIDOU D2 Bit Sync",
     9: "BEIDOU D2 Subframe Sync",
    10: "Galileo E1BC Code Lock",
    11: "Galileo E1C 2^nd^ Code Lock",
    12: "Galileo E1B Page Sync",
    13: "SBAS Sync",
    14: "Time Of Week Known",
    15: "GLONASS Time Of Day Known",
}
# Reverse lookup: flag name -> bit index.
RAW_STATE_BIT_INV_MAP = { value : key for key, value in RAW_STATE_BIT_MAP.items() }

# Constellation name -> single-letter system identifier.
SYSTEM_NAME_MAP = {
    'GPS'     : 'G',
    'GLONASS' : 'R',
    'GALILEO' : 'E',
    'BEIDOU'  : 'C',
    'QZSS'    : 'J',
}

# Keys 1..24 -> signed integer channel number.
# NOTE(review): presumably GLONASS slot number -> FDMA frequency channel; the table is a
# hard-coded snapshot and may need updating -- verify against a current almanac.
GLONASS_FREQ_CHANNEL_MAP = {
    1 : 1,
    2 : -4,
    3 : 5,
    4 : 6,
    5 : 1,
    6 : -4,
    7 : 5,
    8 : 6,
    9 : -2,
    10 : -7,
    11 : 0,
    12 : -1,
    13 : -2,
    14 : -7,
    15 : 0,
    16 : -1,
    17 : 4,
    18 : -3,
    19 : 3,
    20 : 2,
    21 : 4,
    22 : -3,
    23 : 3,
    24 : 2,
}

# QZSS PRN -> satellite id (note that 199 maps to 3 and 195 maps to 4).
QZSS_PRN_SVID_MAP = {
    193 : 1,
    194 : 2,
    199 : 3,
    195 : 4,
}

# Initial geodetic position: latitude 37.5 deg and longitude -122.2 deg converted
# to radians, plus a height of 0.
INIT_B = np.deg2rad(  37.5)
INIT_L = np.deg2rad(-122.2)
INIT_H = 0.0

# Measurement-selection thresholds.
# NOTE(review): units are not stated here; names suggest Hz, dB-Hz, nanoseconds and
# metres respectively -- confirm at the use sites.
FREQ_TOL = 100.0
Cn0DbHz_THRESHOLD = 20.0
ReceivedSvTimeUncertaintyNanos_THRESHOLD = 100
RAW_PSEUDO_RANGE_THRESHOLD = 50_000 * 1e+3

# Time margins for clock, orbit and ionosphere data.
# NOTE(review): presumably the maximum age/offset tolerated when matching auxiliary products -- confirm.
CLOCK_TIME_MARGIN = datetime.timedelta(seconds=90)
ORBIT_TIME_MARGIN = datetime.timedelta(hours=3)
IONO_TIME_MARGIN  = datetime.timedelta(hours=2)

# Small tolerance, elevation cutoff (7 deg expressed in radians) and a default
# tropospheric delay (names suggest metres for the *_M values).
EPSILON_M = 0.01
ELEVATION_CUTOFF = np.deg2rad(7.0)
DEFAULT_TROPO_DELAY_M = 2.48

# Sphere radius used by the haversine distance in transform.py.
HAVERSINE_RADIUS = 6_371_000

In [None]:
%%writefile transform.py

import numpy as np
from dataclasses import dataclass

import constants as C

@dataclass
class ECEF:
    """Earth-centred, Earth-fixed Cartesian coordinates.

    The three components are kept as separate attributes; each may be a
    scalar or an array, as long as all three share a common shape.
    """
    x: np.array
    y: np.array
    z: np.array

    def to_numpy(self):
        """Return the components stacked on a new leading axis, i.e. (3, ...)."""
        components = (self.x, self.y, self.z)
        return np.stack(components, axis=0)

    @staticmethod
    def from_numpy(pos):
        """Build an ECEF from an array whose last axis holds (x, y, z).

        The last axis is split into three equal parts and every length-one
        axis of each part is squeezed away, so a single (3,) position yields
        zero-dimensional components.
        """
        parts = np.split(pos, 3, axis=-1)
        x, y, z = (np.squeeze(part) for part in parts)
        return ECEF(x=x, y=y, z=z)

@dataclass
class BLH:
    """Geodetic position: latitude (B), longitude (L) and height (H).

    The conversion functions in this module feed ``lat`` and ``lng``
    straight into trigonometric functions, i.e. they expect radians.
    """
    lat: np.array
    lng: np.array
    hgt: np.array

@dataclass
class ENU:
    """Local tangent-plane offsets: east, north and up components."""
    east: np.array
    north: np.array
    up: np.array

@dataclass
class AZEL:
    """Line-of-sight angles: elevation, azimuth and zenith angle."""
    elevation: np.array
    azimuth: np.array
    zenith: np.array

def BLH_to_ECEF(blh):
    """Convert geodetic coordinates to ECEF on the WGS84 ellipsoid.

    Parameters
    ----------
    blh : BLH
        Latitude and longitude in radians, height in the unit of the
        ellipsoid axes. Scalars and arrays are both accepted.

    Returns
    -------
    ECEF
        Cartesian coordinates with the same shape as the inputs.
    """
    semi_major = C.WGS84_SEMI_MAJOR_AXIS
    ecc_sq = C.WGS84_SQUARED_FIRST_ECCENTRICITY
    sin_lat, cos_lat = np.sin(blh.lat), np.cos(blh.lat)
    sin_lng, cos_lng = np.sin(blh.lng), np.cos(blh.lng)
    # Radius of curvature in the prime vertical at this latitude.
    prime_vertical = semi_major / np.sqrt(1 - ecc_sq*sin_lat**2)
    # Distance from the rotation axis, shared by the x and y components.
    axis_distance = (prime_vertical + blh.hgt) * cos_lat
    return ECEF(
        x=axis_distance * cos_lng,
        y=axis_distance * sin_lng,
        z=((1 - ecc_sq) * prime_vertical + blh.hgt) * sin_lat,
    )

def ECEF_to_BLH_approximate(ecef):
    """Convert ECEF coordinates to geodetic ones with a closed-form formula.

    No iteration is performed: an auxiliary angle is computed first and the
    latitude follows from a single arctangent. The height is obtained by
    dividing by cos(latitude), so it is ill-conditioned close to the poles.

    Parameters
    ----------
    ecef : ECEF
        Cartesian coordinates (scalars or arrays).

    Returns
    -------
    BLH
        Latitude and longitude in radians, height in the unit of the input.
    """
    semi_major = C.WGS84_SEMI_MAJOR_AXIS
    semi_minor = C.WGS84_SEMI_MINOR_AXIS
    ecc1_sq = C.WGS84_SQUARED_FIRST_ECCENTRICITY
    ecc2_sq = C.WGS84_SQUARED_SECOND_ECCENTRICITY
    x, y, z = ecef.x, ecef.y, ecef.z

    # Distance from the rotation axis.
    axis_distance = np.sqrt(x**2 + y**2)
    # Auxiliary angle used by the closed-form latitude expression.
    theta = np.arctan2(z * (semi_major/semi_minor), axis_distance)
    lat = np.arctan2(z + (ecc2_sq*semi_minor)*np.sin(theta)**3,
                     axis_distance - (ecc1_sq*semi_major)*np.cos(theta)**3)
    lng = np.arctan2(y, x)
    prime_vertical = semi_major / np.sqrt(1 - ecc1_sq*np.sin(lat)**2)
    hgt = (axis_distance / np.cos(lat)) - prime_vertical
    return BLH(lat=lat, lng=lng, hgt=hgt)

# Default ECEF -> BLH conversion used by the rest of this module.
ECEF_to_BLH = ECEF_to_BLH_approximate

def ECEF_to_ENU(pos, base):
    """Express ``pos`` in the local east/north/up frame anchored at ``base``.

    Parameters
    ----------
    pos, base : ECEF
        Target position(s) and frame origin(s).

    Returns
    -------
    ENU
        Offset of ``pos`` from ``base`` along the local east, north and up
        axes at the geodetic location of ``base``.
    """
    delta_x = pos.x - base.x
    delta_y = pos.y - base.y
    delta_z = pos.z - base.z

    # The local frame orientation depends only on the origin's lat/lng.
    origin = ECEF_to_BLH(base)
    sin_lat, cos_lat = np.sin(origin.lat), np.cos(origin.lat)
    sin_lng, cos_lng = np.sin(origin.lng), np.cos(origin.lng)

    east = -sin_lng*delta_x + cos_lng*delta_y
    north = -sin_lat*cos_lng*delta_x - sin_lat*sin_lng*delta_y + cos_lat*delta_z
    up = cos_lat*cos_lng*delta_x + cos_lat*sin_lng*delta_y + sin_lat*delta_z
    return ENU(east=east, north=north, up=up)

def ENU_to_AZEL(enu):
    """Convert an east/north/up vector to elevation, azimuth and zenith angle.

    All angles are in radians. Azimuth is ``arctan2(east, north)``, i.e.
    measured from north towards east; the zenith angle is the complement
    of the elevation.
    """
    horizontal = np.sqrt(enu.east**2 + enu.north**2)
    elevation = np.arctan2(enu.up, horizontal)
    return AZEL(
        elevation=elevation,
        azimuth=np.arctan2(enu.east, enu.north),
        zenith=(0.5 * np.pi) - elevation,
    )

def ECEF_to_AZEL(pos, base):
    """Return the elevation/azimuth/zenith of ``pos`` as seen from ``base``.

    Both arguments are ECEF positions; the result is an AZEL.
    """
    local_offset = ECEF_to_ENU(pos, base)
    return ENU_to_AZEL(local_offset)

def haversine_distance(blh_1, blh_2):
    """Great-circle distance between two positions on a sphere.

    Only ``lat`` and ``lng`` (radians) are used; heights are ignored. The
    sphere radius is ``C.HAVERSINE_RADIUS``, so the result is in that unit.
    """
    lat_1, lat_2 = blh_1.lat, blh_2.lat
    half_dlat = (lat_2 - lat_1) / 2
    half_dlng = (blh_2.lng - blh_1.lng) / 2
    # Haversine of the central angle between the two points.
    hav = np.sin(half_dlat)**2 + np.cos(lat_1) * np.cos(lat_2) * np.sin(half_dlng)**2
    return 2 * C.HAVERSINE_RADIUS * np.arcsin(np.sqrt(hav))

def hubenys_distance(blh_1, blh_2):
    """Approximate ellipsoidal distance between two positions (Hubeny's formula).

    Parameters
    ----------
    blh_1, blh_2 : BLH
        Positions with ``lat`` and ``lng`` in radians. Heights are ignored.

    Returns
    -------
    Distance in the unit of ``C.WGS84_SEMI_MAJOR_AXIS``. The longitude
    difference is used as-is (no wrap-around at +/-180 degrees).
    """
    Rx = C.WGS84_SEMI_MAJOR_AXIS
    E2 = C.WGS84_SQUARED_FIRST_ECCENTRICITY
    num_M = Rx * (1 - E2)
    Dy = blh_1.lat - blh_2.lat
    Dx = blh_1.lng - blh_2.lng
    # Curvature radii are evaluated at the mean latitude of the two points.
    P  = 0.5 * (blh_1.lat + blh_2.lat)
    W  = np.sqrt(1 - E2 * np.sin(P)**2)
    M  = num_M / W**3   # meridian radius of curvature
    N  = Rx / W         # prime-vertical radius of curvature
    d2 = (Dy * M)**2 + (Dx * N * np.cos(P))**2
    return np.sqrt(d2)

def jacobian_BLH_to_ECEF(blh):
    """Jacobian of the geodetic -> ECEF conversion (see ``BLH_to_ECEF``).

    Parameters
    ----------
    blh : BLH
        Latitude and longitude in radians, height in the unit of the
        ellipsoid axes. Scalars or arrays of a common shape.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(..., 3, 3)`` where ``J[..., i, j]`` is the partial
        derivative of ``(X, Y, Z)[i]`` with respect to ``(B, L, H)[j]``.
    """
    a  = C.WGS84_SEMI_MAJOR_AXIS
    e2 = C.WGS84_SQUARED_FIRST_ECCENTRICITY
    B = blh.lat
    L = blh.lng
    H = blh.hgt
    cos_B = np.cos(B)
    sin_B = np.sin(B)
    cos_L = np.cos(L)
    sin_L = np.sin(L)
    # Prime-vertical radius of curvature and its derivative w.r.t. latitude.
    N = a / np.sqrt(1 - e2*sin_B**2)
    dNdB = a * e2 * sin_B * cos_B * (1 - e2*sin_B**2)**(-3/2)
    N_plus_H = N + H
    cos_B_cos_L = cos_B * cos_L
    cos_B_sin_L = cos_B * sin_L
    sin_B_cos_L = sin_B * cos_L
    sin_B_sin_L = sin_B * sin_L

    dXdB = dNdB*cos_B_cos_L - N_plus_H*sin_B_cos_L
    dYdB = dNdB*cos_B_sin_L - N_plus_H*sin_B_sin_L
    # Z = ((1 - e2)*N + H) * sin(B): the height term is NOT scaled by
    # (1 - e2), so the cosine term must use ((1 - e2)*N + H).
    dZdB = (1-e2)*dNdB*sin_B + ((1-e2)*N + H)*cos_B

    dXdL = - N_plus_H * cos_B_sin_L
    dYdL =   N_plus_H * cos_B_cos_L
    dZdL = np.zeros_like(dXdL)

    dXdH = cos_B_cos_L
    dYdH = cos_B_sin_L
    dZdH = sin_B

    J = np.stack([[dXdB, dXdL, dXdH],
                  [dYdB, dYdL, dYdH],
                  [dZdB, dZdL, dZdH]], axis=0)
    # Move the two matrix axes to the end so that batch axes come first.
    axes = list(range(2, J.ndim)) + [0, 1]
    J = np.transpose(J, axes)
    return J

def jacobian_ECEF_to_ENU(blh):
    """Rotation matrix mapping ECEF offsets to east/north/up at ``blh``.

    Parameters
    ----------
    blh : BLH
        Only ``lat`` and ``lng`` (radians) are read; scalars or arrays.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(..., 3, 3)``; rows are the east, north and up
        axes, columns are the ECEF X, Y and Z directions.
    """
    sin_lat, cos_lat = np.sin(blh.lat), np.cos(blh.lat)
    sin_lng, cos_lng = np.sin(blh.lng), np.cos(blh.lng)

    east_x = -sin_lng
    east_row = [east_x, cos_lng, np.zeros_like(east_x)]
    north_row = [-sin_lat*cos_lng, -sin_lat*sin_lng, cos_lat]
    up_row = [cos_lat*cos_lng, cos_lat*sin_lng, sin_lat]

    stacked = np.stack([east_row, north_row, up_row], axis=0)
    # Batch axes (if any) go first, the 3x3 matrix axes last.
    return np.moveaxis(stacked, (0, 1), (-2, -1))

def pd_haversine_distance(df1, df2):
    """Row-wise haversine distance between two frames of positions.

    Both frames must provide ``latDeg`` and ``lngDeg`` columns in degrees;
    rows are paired by position (the underlying values are used, not the
    index). Heights are set to 0.
    """
    def _to_blh(df):
        # Degrees -> radians, as expected by haversine_distance.
        return BLH(
            lat=np.deg2rad(df['latDeg'].values),
            lng=np.deg2rad(df['lngDeg'].values),
            hgt=0,
        )

    return haversine_distance(_to_blh(df1), _to_blh(df2))

In [None]:
%%writefile area_prediction.py

import numpy as np
import pandas as pd
from pathlib import Path
from glob import glob
from sklearn.neighbors import KNeighborsClassifier

# Root directory of the competition input data.
BASE_DIR = Path('../input/google-smartphone-decimeter-challenge')

# Baseline positions of the training drives, ordered by collection, phone and time.
# NOTE: this file is read at import time.
train_base = pd.read_csv(BASE_DIR / 'baseline_locations_train.csv')
train_base = train_base.sort_values([
    "collectionName", "phoneName", "millisSinceGpsEpoch"
]).reset_index(drop=True)

# Fifth '-'-separated token of the collection name.
# NOTE(review): presumably a city/area code -- confirm against the collection naming scheme.
train_base['area'] = train_base['collectionName'].map(lambda x: x.split('-')[4])

# Hand-assigned area labels, given as 1-based positions in the sorted list of
# train/ directory names.
# NOTE(review): the labels silently shift if the directory listing changes, and
# splitting on '/' assumes POSIX-style paths.
train_name = np.array(sorted(path.split('/')[-1] for path in glob(f'{BASE_DIR}/train/*')))
train_highway  = train_name[np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]) - 1]
train_tree     = train_name[np.array([22,23,25,26,28]) - 1]
train_downtown = train_name[np.array([24,27,29]) - 1]

# Area label per row: 0 = highway, 1 = tree, 2 = downtown, -1 = unlabelled.
train_base['area_target'] = -1
train_base.loc[train_base['collectionName'].isin(train_highway),  'area_target'] = 0
train_base.loc[train_base['collectionName'].isin(train_tree),     'area_target'] = 1
train_base.loc[train_base['collectionName'].isin(train_downtown), 'area_target'] = 2

def processing_downtown(input_df: pd.DataFrame, is_train=False):
    """Build per-collection features for the downtown classifier.

    Parameters
    ----------
    input_df : pd.DataFrame
        Needs ``collectionName``, ``latDeg``, ``lngDeg``, ``area`` and
        ``phoneName`` columns (plus ``area_target`` when ``is_train``).
    is_train : bool
        When true, the first ``area_target`` of each collection is included.

    Returns
    -------
    pd.DataFrame
        One row per collection: standard deviation of ``latDeg``/``lngDeg``,
        optionally ``area_target``, the first ``area`` value and the list of
        distinct phone names.
    """
    by_collection = input_df.groupby('collectionName')

    extras = []
    if is_train:
        extras.append(by_collection[['area_target']].first())
    extras.append(by_collection['area'].first())
    extras.append(by_collection['phoneName'].unique().apply(list))

    output_df = by_collection[['latDeg', 'lngDeg']].std()
    for extra in extras:
        output_df = output_df.merge(extra, on='collectionName')
    return output_df

# Per-collection lat/lng standard deviations with a binary target:
# 1 for collections labelled downtown (area_target == 2), 0 otherwise.
train = processing_downtown(train_base, is_train=True)
train['downtown_target'] = (train['area_target']==2).astype(int)

# 1-nearest-neighbour classifier "downtown vs. not" on the (latDeg, lngDeg) spread features.
# NOTE: fitted at import time.
downtown_model_knn = KNeighborsClassifier(n_neighbors=1)
downtown_model_knn.fit(
    train[['latDeg', 'lngDeg']],
    train['downtown_target'],
)

def processing_highway_tree(input_df: pd.DataFrame, is_train=False):
    """Build per-collection features for the highway/tree classifier.

    Parameters
    ----------
    input_df : pd.DataFrame
        Needs ``collectionName``, ``latDeg``, ``lngDeg``, ``area`` and
        ``phoneName`` columns (plus ``area_target`` when ``is_train``).
    is_train : bool
        When true, the first ``area_target`` of each collection is included.

    Returns
    -------
    pd.DataFrame
        One row per collection: minimum of ``latDeg``/``lngDeg``, optionally
        ``area_target``, the first ``area`` value and the list of distinct
        phone names.
    """
    by_collection = input_df.groupby('collectionName')

    extras = []
    if is_train:
        extras.append(by_collection[['area_target']].first())
    extras.append(by_collection['area'].first())
    extras.append(by_collection['phoneName'].unique().apply(list))

    output_df = by_collection[['latDeg', 'lngDeg']].min()
    for extra in extras:
        output_df = output_df.merge(extra, on='collectionName')
    return output_df

# Per-collection minimum lat/lng features (this rebinds `train`, replacing the
# downtown feature table built earlier).
train = processing_highway_tree(train_base, is_train=True)

# 1-nearest-neighbour classifier highway (0) vs. tree (1), fitted on every
# collection that is not labelled downtown (area_target != 2).
# NOTE: fitted at import time.
highway_tree_model_knn = KNeighborsClassifier(n_neighbors=1)
highway_tree_model_knn.fit(
    train.loc[train['area_target']!=2, ['latDeg', 'lngDeg']],
    train.loc[train['area_target']!=2, 'area_target'],
)

def predict_area(test_base):
    """Classify every collection in ``test_base`` as highway, tree or downtown.

    A first 1-NN model decides "downtown vs. not" from the per-collection
    lat/lng spread; the remaining collections are split into highway and
    tree by a second 1-NN model on the per-collection minimum lat/lng.

    Parameters
    ----------
    test_base : pd.DataFrame
        Baseline positions with ``collectionName``, ``phoneName``,
        ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg`` columns. The
        frame is not modified.

    Returns
    -------
    tuple of three lists
        ``(highway, tree, downtown)`` collection names, each in sorted
        collection order. All three are empty for an empty input.
    """
    # sort_values returns a new frame, so the caller's data stays untouched.
    test_base = test_base.sort_values([
        "collectionName", "phoneName", "millisSinceGpsEpoch"
    ]).reset_index(drop=True)
    test_base['area'] = test_base['collectionName'].map(lambda x: x.split('-')[4])

    spread_features = processing_downtown(test_base)
    if len(spread_features) == 0:
        # Nothing to classify; the KNN models reject zero-row input.
        return ([], [], [])
    downtown_pred = downtown_model_knn.predict(spread_features[['latDeg', 'lngDeg']])
    is_downtown = np.asarray(downtown_pred) == 1

    # Both feature tables are grouped by collectionName, hence share row order.
    location_features = processing_highway_tree(test_base)
    area_pred = np.full(len(location_features), 2, dtype=int)
    undecided = ~is_downtown
    # Only query the second model when something is left to classify:
    # predicting on zero rows raises in scikit-learn.
    if undecided.any():
        area_pred[undecided] = highway_tree_model_knn.predict(
            location_features.loc[undecided, ['latDeg', 'lngDeg']])

    collections = location_features.index
    test_highway = collections[area_pred == 0].tolist()
    test_tree = collections[area_pred == 1].tolist()
    # Anything that is neither highway nor tree counts as downtown.
    test_downtown = collections[(area_pred != 0) & (area_pred != 1)].tolist()
    return (test_highway, test_tree, test_downtown)

## main.py

In [None]:
import multiprocessing
import glob
import numpy as np
import pandas as pd
import scipy.sparse
import scipy.sparse.linalg
from tqdm.notebook import tqdm

import transform
import area_prediction

# Root directory of the competition input data.
INPUT_PATH = '../input/google-smartphone-decimeter-challenge'

# Train and test baseline positions stacked into one frame. The original row
# indices of both files are kept, so index values are not unique here.
BASELINE_DF = pd.concat([pd.read_csv(f'{INPUT_PATH}/baseline_locations_train.csv'),
                         pd.read_csv(f'{INPUT_PATH}/baseline_locations_test.csv'),
                         ], axis=0)
def get_baseline(collection_name):
    """Return a fresh copy of the baseline rows of one collection.

    The returned frame has a new 0..n-1 index and is independent of
    ``BASELINE_DF``.
    """
    in_collection = BASELINE_DF['collectionName'] == collection_name
    return BASELINE_DF[in_collection].copy().reset_index(drop=True)

def get_optimization_constants(base_df, sigma_y):
    """Assemble the sparse matrices of the trajectory-smoothing QP.

    The unknown is a state trajectory on a regular time grid with step
    ``dt = 1`` second. Each grid node carries two 3-element state blocks,
    one for latitude and one for longitude; the 3x3 matrix ``a`` below has
    the form of a constant-acceleration (position, velocity, acceleration)
    transition over ``dt`` and ``b`` the matching response to a constant
    third-derivative input.

    Parameters
    ----------
    base_df : pd.DataFrame
        Observations with ``millisSinceGpsEpoch``, ``latDeg`` and
        ``lngDeg`` columns.
    sigma_y : float
        Observation standard deviation; every observation is weighted by
        ``sigma_y**-2``.

    Returns
    -------
    dict
        ``N_y``/``N_x``: number of observations / grid nodes.
        ``A``, ``B``: dynamics matrices; row block ``i`` of ``A`` encodes
        ``a @ x_i - x_{i+1}`` (consumed by ``solve_QP``).
        ``R``: diagonal input weight.
        ``C_orig``: maps the state vector to the interleaved
        (lat, lng) observations by cubic Hermite interpolation between
        the two neighbouring grid nodes.
        ``L_orig``: diagonal observation weight.
        ``Y_orig``: observations flattened as lat0, lng0, lat1, lng1, ...
    """
    const = dict()
    dt = 1.0
    t0 = base_df['millisSinceGpsEpoch'].min()
    # Observation times in seconds since the first observation.
    TIME_y = 1e-3 * (base_df['millisSinceGpsEpoch'] - t0).values
    N_y = TIME_y.shape[0]
    N_x = int(np.ceil(np.max(TIME_y) / dt) + 1)
    const['N_y'] = N_y
    const['N_x'] = N_x

    a = np.array([[1, dt, (1/2)*dt**2],
                  [0,  1,  dt],
                  [0,  0,  1]])
    e3 = scipy.sparse.eye(3)
    # Block layout for scipy.sparse.bmat; unset cells stay None (= zero block).
    # The builtin `object` is used because the `np.object` alias no longer
    # exists in current NumPy releases.
    A = np.empty(shape=(2*(N_x-1), 2*N_x), dtype=object)
    for i_x in range(N_x-1):
        A[2*i_x  , 2*i_x  ] = a
        A[2*i_x+1, 2*i_x+1] = a
        A[2*i_x  , 2*i_x+2] = -e3
        A[2*i_x+1, 2*i_x+3] = -e3
    const['A'] = scipy.sparse.bmat(A, format='csr')

    b = np.array([[(1/6)*dt**3,
                   (1/2)*dt**2,
                   dt]]).T
    const['B'] = scipy.sparse.block_diag([b for _ in range(2*(N_x-1))], format='csr')

    sigma_u = 1.0
    diag_R  = np.full(2*N_x - 2, sigma_u**(-2) * dt)
    const['R'] = scipy.sparse.spdiags(diag_R, [0], 2*N_x - 2, 2*N_x - 2, format='csc')

    # Grid node at or before each observation and the fractional offset to it.
    x_index  = np.floor(TIME_y / dt).astype(int)
    alpha    = (TIME_y / dt) - x_index
    # Cubic Hermite basis functions evaluated at alpha.
    coeff_y0 = 1 - 3*alpha**2 + 2*alpha**3
    coeff_y1 =     3*alpha**2 - 2*alpha**3
    coeff_v0 = alpha * (alpha - 1)**2
    coeff_v1 = alpha**2 * (alpha - 1)
    C = np.empty(shape=(2*N_y, 2*N_x), dtype=object)
    # Explicit empty blocks in the first block row fix the width of every
    # block column, including columns no observation touches.
    for i_x in range(N_x):
        C[0, 2*i_x  ] = scipy.sparse.coo_matrix((1, 3))
        C[0, 2*i_x+1] = scipy.sparse.coo_matrix((1, 3))
    for i_y in range(N_y):
        i_x = x_index[i_y]
        c_i = np.array([[coeff_y0[i_y], coeff_v0[i_y], 0]])
        C[2*i_y,   2*i_x]   = c_i
        C[2*i_y+1, 2*i_x+1] = c_i
        # The last grid node has no right-hand neighbour to interpolate with.
        if i_x < N_x - 1:
            c_iplus = np.array([[coeff_y1[i_y], coeff_v1[i_y], 0]])
            C[2*i_y,   2*i_x+2] = c_iplus
            C[2*i_y+1, 2*i_x+3] = c_iplus
    const['C_orig']  = scipy.sparse.bmat(C, format='csr')

    diag_L = np.full(2*N_y, sigma_y**(-2))
    const['L_orig'] = scipy.sparse.spdiags(diag_L, [0], 2*N_y, 2*N_y, format='csr')

    const['Y_orig'] = base_df[['latDeg', 'lngDeg']].values.flatten()

    return const

def solve_QP(const, valid):
    """Solve the smoothing QP using only the observations flagged valid.

    The weighted least-squares fit to the selected observations is
    minimised together with the input penalty, subject to the linear
    dynamics, by solving the corresponding saddle-point (KKT) system.

    Parameters
    ----------
    const : dict
        Matrices produced by ``get_optimization_constants``.
    valid : 1-D boolean array
        One flag per observation; each flag applies to both the latitude
        and the longitude row of that observation.

    Returns
    -------
    numpy.ndarray
        Smoothed values at ALL observation times (valid or not),
        interleaved as lat0, lng0, lat1, lng1, ...
    """
    dynamics = const['A']
    input_map = const['B']
    input_weight = const['R']
    obs_all = const['C_orig']

    # Each observation owns two consecutive rows (latitude, longitude).
    row_mask = np.repeat(valid, 2)
    obs = obs_all[row_mask, :]
    obs_weight = const['L_orig'][np.ix_(row_mask, row_mask)]
    target = const['Y_orig'][row_mask]

    # B R^-1 B^T, the block left over after eliminating the inputs.
    input_term = input_map @ scipy.sparse.linalg.spsolve(input_weight, input_map.T)
    normal_matrix = obs.T @ (obs_weight @ obs)
    normal_rhs = obs.T @ (obs_weight @ target)

    n_constraints, n_states = dynamics.shape
    kkt = scipy.sparse.bmat([[normal_matrix, dynamics.T],
                             [dynamics, -input_term]], format='csc')
    rhs = np.concatenate([normal_rhs, np.zeros(n_constraints)], axis=0)
    solution = scipy.sparse.linalg.spsolve(kkt, rhs)

    # The leading entries are the states; the rest are multipliers.
    states = solution[0:n_states]
    return obs_all @ states

def do_postprocess(args):
    """Smooth one collection, iteratively rejecting outlying baseline points.

    Parameters
    ----------
    args : tuple
        ``(collection_name, params)`` where ``params`` provides
        ``'sigma_y'`` and ``'reject_m'``. Packed into one argument so the
        function can be mapped over a process pool.

    Returns
    -------
    pd.DataFrame
        Copy of the collection's baseline rows with ``latDeg``/``lngDeg``
        replaced by the result of the third (last) solve.
    """
    collection_name, params = args
    base_df = get_baseline(collection_name)
    const = get_optimization_constants(base_df, params['sigma_y'])
    reject_m = params['reject_m']

    # Start with every observation accepted.
    keep = np.ones(const['N_y'], dtype=bool)
    n_rounds = 3
    for _ in range(n_rounds):
        smoothed = np.reshape(solve_QP(const, keep), (-1, 2))
        pp_df = base_df.copy()
        pp_df['latDeg'] = smoothed[:, 0]
        pp_df['lngDeg'] = smoothed[:, 1]
        # Observations that moved too far from the smoothed track are
        # excluded from the next solve.
        moved = transform.pd_haversine_distance(pp_df, base_df)
        keep = (moved < reject_m)
    return pp_df

def make_postprocessing_df(config):
    """Run ``do_postprocess`` for every collection in parallel.

    Parameters
    ----------
    config : iterable of (collection_list, params)
        Each entry pairs a list of collection names with the parameter
        dict to use for them.

    Returns
    -------
    pd.DataFrame
        All per-collection results concatenated and sorted by ``phone``
        and ``millisSinceGpsEpoch``.
    """
    args_list = [
        (collection_name, params)
        for collection_list, params in config
        for collection_name in collection_list
    ]
    n_workers = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=n_workers) as pool:
        # Results arrive in completion order; the final sort restores a
        # deterministic row order.
        results = pool.imap_unordered(do_postprocess, args_list)
        df_list = list(tqdm(results, total=len(args_list)))
    combined = pd.concat(df_list, axis=0)
    return combined.sort_values(['phone', 'millisSinceGpsEpoch'])

def print_score(output_df):
    """Print the training score of ``output_df``.

    For every phone the haversine error against its ground-truth file is
    computed row by row; the phone score is the mean of the 50th and 95th
    error percentiles, and the printed value is the mean over phones.
    """
    per_phone_scores = []
    for gid, phone_df in output_df.groupby('phone'):
        drive, phone = gid.split('_')
        gt_df = pd.read_csv(f'{INPUT_PATH}/train/{drive}/{phone}/ground_truth.csv')
        error = transform.pd_haversine_distance(phone_df, gt_df)
        p50 = np.quantile(error, 0.50)
        p95 = np.quantile(error, 0.95)
        per_phone_scores.append(np.mean([p50, p95]))
    score = np.mean(per_phone_scores)
    print(f'train score: {score:.3f}')
    return

def main():
    """Smooth train and test baselines, print the train score, write CSVs.

    Training collections use hand-assigned area labels (1-based positions
    in the sorted train/ directory listing); test collections are
    classified with ``area_prediction.predict_area``. Writes
    ``smoothing_1st_train.csv``, ``smoothing_1st_test.csv`` and
    ``submission.csv`` to the working directory.
    """
    # Per-area smoothing parameters, ordered highway, treeway, downtown.
    params_by_area = (
        {'sigma_y': 3.0, 'reject_m': 7.0},
        {'sigma_y': 6.0, 'reject_m': 11.0},
        {'sigma_y': 30.0, 'reject_m': 19.0},
    )

    train_paths = glob.glob(f'{INPUT_PATH}/train/*')
    collection_list_all = np.array(sorted(path.split('/')[-1] for path in train_paths))
    ordinals_highway = np.arange(1, 22)
    ordinals_treeway = np.array([22, 23, 25, 26, 28])
    ordinals_downtown = np.array([24, 27, 29])
    train_lists = (
        collection_list_all[ordinals_highway - 1],
        collection_list_all[ordinals_treeway - 1],
        collection_list_all[ordinals_downtown - 1],
    )
    train_pp_df = make_postprocessing_df(list(zip(train_lists, params_by_area)))
    print_score(train_pp_df)

    test_base = pd.read_csv(f'{INPUT_PATH}/baseline_locations_test.csv')
    # predict_area returns (highway, treeway, downtown), matching params_by_area.
    test_lists = area_prediction.predict_area(test_base)
    test_pp_df = make_postprocessing_df(list(zip(test_lists, params_by_area)))

    train_pp_df.to_csv('smoothing_1st_train.csv', index=False)
    test_pp_df.to_csv('smoothing_1st_test.csv', index=False)

    submission_columns = ['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']
    test_pp_df[submission_columns].to_csv('submission.csv', index=False)
    return

In [None]:
main()