In [1]:
# import library
import os
import pandas as pd
import numpy as np
np.random.seed(71)
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
import pathlib
import plotly
import plotly.express as px
import simdkalman
from scipy.interpolate import interp1d

from pathlib import Path
import pyproj
from pyproj import Proj, transform # 地理的な位置を示す情報を扱うときに、座標系・測地系変換を行ったり、2点間の距離・方位角を計算したりできる。

import optuna

In [2]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points on a sphere (haversine formula).

    Inputs are in decimal degrees and may be scalars, NumPy arrays or pandas
    Series; they are combined element-wise. The result is expressed in the
    same unit as ``RADIUS`` (6,367,000, i.e. metres for an Earth-sized sphere).
    NaN inputs produce NaN outputs.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Rounding can push `a` marginally outside [0, 1] (e.g. for antipodal
    # points), which would make arcsin return NaN; clamp it to the valid
    # domain. NaN values pass through unchanged.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

    
def percentile50(x):
    """Return the 50th percentile of ``x`` (computed with ``np.percentile``)."""
    p50 = np.percentile(x, q=50)
    return p50
def percentile95(x):
    """Return the 95th percentile of ``x`` (computed with ``np.percentile``)."""
    p95 = np.percentile(x, q=95)
    return p95

def get_train_score(df, gt):
    """Score predictions ``df`` against ground truth ``gt``.

    Both frames are joined on ``phone`` and ``millisSinceGpsEpoch`` (inner
    join, so only timestamps present in both are scored). For every phone the
    50th and 95th percentile of the haversine error are averaged; the value
    returned is the mean of that per-phone average over all phones.
    """
    truth = gt.rename(columns={'latDeg': 'latDeg_gt', 'lngDeg': 'lngDeg_gt'})
    paired = df.merge(truth, on=['phone', 'millisSinceGpsEpoch'], how='inner')

    # distance between the ground-truth position and the predicted position
    paired['err'] = calc_haversine(
        paired['latDeg_gt'], paired['lngDeg_gt'],
        paired['latDeg'], paired['lngDeg'],
    )

    # per-phone 50th / 95th percentile of the error
    per_phone = paired.groupby('phone')['err'].agg([percentile50, percentile95])
    per_phone_score = (per_phone['percentile50'] + per_phone['percentile95']) / 2
    return per_phone_score.mean()

In [3]:
# directory setting
INPUT = '../input/google-smartphone-decimeter-challenge'

# Train positions: the output of an earlier notebook (nb037) is used in place
# of the competition's baseline_locations_train.csv.
base_train = pd.read_csv('../output/filtered_nb037.csv')
# `phone` is split on '_' to recover its two parts (collection name, phone name).
base_train['collectionName'] = [phone.split('_')[0] for phone in base_train['phone']]
base_train['phoneName'] = [phone.split('_')[1] for phone in base_train['phone']]

# Test positions: likewise taken from an earlier notebook's output
# (other candidates tried here: sub_nb037.csv, fixed_base_test.csv).
base_test = pd.read_csv('../output/sub_nb037_5.csv')

sample_sub = pd.read_csv(f'{INPUT}/sample_submission.csv')

# ground_truth: one ground_truth.csv per train/<dir>/<dir>/ folder
p = pathlib.Path(INPUT)
gt_files = list(p.glob('train/*/*/ground_truth.csv'))
print('ground_truth.csv count : ', len(gt_files))

gts = [pd.read_csv(gt_file) for gt_file in tqdm(gt_files)]
ground_truth = pd.concat(gts)
ground_truth['phone'] = ground_truth['collectionName'] + '_' + ground_truth['phoneName']


ground_truth.csv count :  73


  0%|          | 0/73 [00:00<?, ?it/s]

# Reject outlier
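- Drop (set to NaN) any point whose distance to the previous point or to the next point of the same phone exceeds the threshold `th` (43 m in the pipeline cells below).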

In [4]:
def add_distance_diff(df):
    """Attach previous/next positions and the distances to them, in place.

    Neighbours are taken with ``shift`` on the current row order, so the frame
    is expected to be ordered so that consecutive rows of one phone are
    consecutive in time. Adds ``latDeg_prev/next``, ``lngDeg_prev/next``,
    ``phone_prev/next`` and ``dist_prev/next`` (haversine distance). Where the
    neighbouring row belongs to a different phone (or does not exist) the
    neighbour's position and distance are NaN.

    The input frame is modified and also returned.
    """
    # neighbouring rows: shift(1) is the previous row, shift(-1) the next one
    for col in ('latDeg', 'lngDeg', 'phone'):
        df[f'{col}_prev'] = df[col].shift(1)
        df[f'{col}_next'] = df[col].shift(-1)

    for side in ('prev', 'next'):
        df[f'dist_{side}'] = calc_haversine(
            df['latDeg'], df['lngDeg'], df[f'latDeg_{side}'], df[f'lngDeg_{side}'])

    # a neighbour from another phone is not a real neighbour: blank it out
    for side in ('prev', 'next'):
        other_phone = df['phone'] != df[f'phone_{side}']
        df.loc[other_phone, [f'latDeg_{side}', f'lngDeg_{side}', f'dist_{side}']] = np.nan

    return df

# Kalman filter

In [5]:
# Time step of the motion model (one step per row of the smoothed track).
T = 1.0

# State vector: [lat, lng, lat velocity, lng velocity, lat accel, lng accel].
# Constant-acceleration model: pos += T * vel + 0.5 * T^2 * acc, vel += T * acc.
state_transition = (
    np.eye(6)
    + T * np.eye(6, k=2)
    + 0.5 * T ** 2 * np.eye(6, k=4)
)

# Per-state process noise on the diagonal plus a small constant on every entry.
process_noise = np.full((6, 6), 1e-9) + np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6])

# Only the two position states are observed.
observation_model = np.array([
    [1, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0],
])
observation_noise = np.full((2, 2), 1e-9) + np.diag([5e-5, 5e-5])

kf = simdkalman.KalmanFilter(
    state_transition=state_transition,
    process_noise=process_noise,
    observation_model=observation_model,
    observation_noise=observation_noise,
)

def apply_kf_smoothing(df_, kf_=kf):
    """Return a copy of ``df_`` with latDeg/lngDeg replaced by Kalman-smoothed values.

    Each (collectionName, phoneName) pair is smoothed independently, using the
    rows in the order they appear in the frame. ``kf_`` must provide a
    ``smooth(data)`` method whose result exposes ``states.mean`` with the
    position in the first two state components.
    """
    smoothed_df = df_.copy()
    pos_cols = ['latDeg', 'lngDeg']
    paths = smoothed_df[['collectionName', 'phoneName']].drop_duplicates()

    for collection, phone in paths.itertuples(index=False, name=None):
        on_path = (smoothed_df['collectionName'] == collection) & (smoothed_df['phoneName'] == phone)
        # one track, shaped (1, n_samples, 2)
        observations = smoothed_df.loc[on_path, pos_cols].to_numpy().reshape(1, -1, 2)
        result = kf_.smooth(observations)
        smoothed_df.loc[on_path, pos_cols] = result.states.mean[0, :, :2]

    return smoothed_df

# Phone mean prediction
- Use the average of the predictions of the phones in the same collection (at the same timestamp) as the final prediction.

In [6]:
def make_lerp_data(df):
    '''
    Build linearly interpolated lat/lng rows for timestamps that other phones
    of the same collection have but a given phone lacks.

    A missing (phone, time) row is filled only when the rows immediately before
    and after it (in time, on the combined time grid) belong to the same phone
    and both carry a position. Returns just the newly created rows, with the
    same columns as `df`.
    '''
    wanted_columns = df.columns

    # Every collection timestamp x every phone of that collection; rows absent
    # from `df` come out of the left merge with NaN positions.
    times = df[['collectionName', 'millisSinceGpsEpoch']].drop_duplicates()
    phones = df[['collectionName', 'phoneName']].drop_duplicates()
    grid = times.merge(phones, on='collectionName', how='outer')

    lerp_df = grid.merge(df, on=['collectionName', 'millisSinceGpsEpoch', 'phoneName'], how='left')
    lerp_df['phone'] = lerp_df['collectionName'] + '_' + lerp_df['phoneName']
    lerp_df = lerp_df.sort_values(['phone', 'millisSinceGpsEpoch'])

    # neighbouring rows (previous / next) of every row
    neighbour_sources = (
        ('latDeg', 'latDeg'),
        ('lngDeg', 'lngDeg'),
        ('phone', 'phone'),
        ('millisSinceGpsEpoch', 'time'),
    )
    for source, stem in neighbour_sources:
        lerp_df[f'{stem}_prev'] = lerp_df[source].shift(1)
        lerp_df[f'{stem}_next'] = lerp_df[source].shift(-1)

    # keep only the rows to be interpolated: no position of their own and the
    # same phone on both sides
    is_gap = lerp_df['latDeg'].isnull()
    same_phone_before = lerp_df['phone'] == lerp_df['phone_prev']
    same_phone_after = lerp_df['phone'] == lerp_df['phone_next']
    lerp_df = lerp_df[is_gap & same_phone_before & same_phone_after].copy()

    # fraction of the way from the previous to the next timestamp
    fraction = (lerp_df['millisSinceGpsEpoch'] - lerp_df['time_prev']) / (lerp_df['time_next'] - lerp_df['time_prev'])
    for col in ('latDeg', 'lngDeg'):
        lerp_df[col] = lerp_df[f'{col}_prev'] + ((lerp_df[f'{col}_next'] - lerp_df[f'{col}_prev']) * fraction)

    # a neighbour that was itself missing leaves NaN behind; drop those rows
    lerp_df = lerp_df[lerp_df['latDeg'].notnull()]

    return lerp_df[wanted_columns]


def calc_mean_pred(df, lerp_df):
    '''
    Replace every position by the mean over all phones of the same collection
    at the same timestamp (original rows plus the interpolated rows in
    `lerp_df`). Returns one row per row of `df`, in the same order, with columns
    collectionName, phoneName, millisSinceGpsEpoch, latDeg, lngDeg.
    '''
    keys = ['collectionName', 'millisSinceGpsEpoch']

    pooled = pd.concat([df, lerp_df])
    collection_mean = pooled.groupby(keys)[['latDeg', 'lngDeg']].mean().reset_index()

    skeleton = df[['collectionName', 'phoneName', 'millisSinceGpsEpoch']].copy()
    return skeleton.merge(collection_mean, on=keys, how='left')

# Remove Device 

In [7]:
import pandas as pd
import numpy as np
np.random.seed(71)

def get_removedevice(input_df: pd.DataFrame, device: str) -> pd.DataFrame:
    """Replace the positions of ``device`` by values interpolated from the other phones.

    For every collection that contains ``device`` and at least one other phone,
    the latDeg/lngDeg of the ``device`` rows are discarded and re-created by
    linear interpolation over ``millisSinceGpsEpoch`` from the remaining rows
    of that collection. Only gaps with valid data on both sides are filled
    (``limit_area='inside'``); rows that cannot be filled keep their original
    position. Collections with a single phone, or without ``device``, are
    returned unchanged.

    Parameters
    ----------
    input_df : DataFrame with at least ``collectionName``, ``phoneName``,
        ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg``. It is not modified.
    device : value of ``phoneName`` whose positions are to be replaced.

    Returns
    -------
    DataFrame with the same columns and index labels as ``input_df``, sorted
    by index.
    """
    if input_df.empty:
        return input_df.copy()

    pos_cols = ['latDeg', 'lngDeg']

    # Work on a copy so the caller's frame does not gain an 'index' column.
    work = input_df.copy()
    work['index'] = work.index  # remembered so the original index can be restored
    work = work.sort_values('millisSinceGpsEpoch')
    work.index = work['millisSinceGpsEpoch'].values  # time becomes the interpolation axis

    parts = []
    for _, subdf in work.groupby('collectionName'):
        phones = subdf['phoneName'].unique()

        # nothing to do: a single phone in the collection, or the device is absent
        if len(phones) == 1 or device not in phones:
            parts.append(subdf)
            continue

        subdf = subdf.copy()  # never write into a groupby slice
        original_pos = subdf[pos_cols].to_numpy()

        # discard the device's own positions
        is_device = (subdf['phoneName'] == device).to_numpy()
        subdf.loc[is_device, pos_cols] = np.nan

        # Fill NaNs linearly along the time index, only where values exist on
        # both sides. Restricted to the position columns so text columns are
        # never handed to interpolate().
        subdf[pos_cols] = subdf[pos_cols].interpolate(
            method='index', limit_area='inside').to_numpy()

        # whatever could not be interpolated falls back to the original value
        still_missing = subdf['latDeg'].isnull().to_numpy()
        if still_missing.any():
            subdf.loc[still_missing, pos_cols] = original_pos[still_missing]

        parts.append(subdf)

    if not parts:
        # e.g. every collectionName is NaN, so groupby produced no group
        return input_df.copy()

    output_df = pd.concat(parts)
    output_df.index = output_df['index'].values
    output_df = output_df.sort_index()

    return output_df.drop(columns='index')

# Position Shift

In [8]:
def compute_dist(oof, gt=ground_truth):
    """Haversine error statistics of predictions ``oof`` against ``gt``.

    Both frames are joined on ``phone`` and ``millisSinceGpsEpoch``. Returns a
    pair: first the mean over phones of the per-phone 50% and 95% error
    quantiles, averaged (a one-element pandas object labelled ``dst``); second
    a frame with one row per phone and columns ``phone``, ``q50``, ``q95``.
    """
    paired = oof.merge(gt, on=['phone', 'millisSinceGpsEpoch'])
    errors = pd.DataFrame({
        'phone': paired.phone,
        'dst': calc_haversine(paired.latDeg_x, paired.lngDeg_x, paired.latDeg_y, paired.lngDeg_y),
    })

    # error quantiles per phone
    by_phone = errors.groupby('phone')
    q50 = by_phone.quantile(.50)
    q95 = by_phone.quantile(.95)

    per_phone = (
        q50.reset_index().rename(columns={'dst': 'q50'})
        .merge(q95.reset_index().rename(columns={'dst': 'q95'}))
    )
    return (q50.mean() + q95.mean())/2, per_phone

def WGS84_to_ECEF(lat, lon, alt):
    """Convert geodetic coordinates to Cartesian x, y, z.

    ``lat`` / ``lon`` are in degrees and ``alt`` is the height above the
    ellipsoid, in the same unit as the semi-major axis (6378137.0). Works
    element-wise on scalars or arrays. Returns the tuple ``(x, y, z)``.
    """
    deg_to_rad = np.pi / 180.0
    phi = lat * deg_to_rad
    lam = lon * deg_to_rad

    semi_major = 6378137.0          # semi-major axis of the ellipsoid
    inv_flattening = 298.257223563
    flattening = 1 / inv_flattening
    ecc_sq = 1 - (1 - flattening) * (1 - flattening)

    sin_phi = np.sin(phi)
    cos_phi = np.cos(phi)
    # radius of curvature in the prime vertical
    prime_vertical = semi_major / np.sqrt(1 - ecc_sq * sin_phi * sin_phi)

    x = (prime_vertical + alt) * cos_phi * np.cos(lam)
    y = (prime_vertical + alt) * cos_phi * np.sin(lam)
    z = (prime_vertical * (1 - ecc_sq) + alt) * sin_phi
    return x, y, z

# Coordinate transformer from geocentric (x, y, z) to geographic lat/long,
# both defined on the WGS84 ellipsoid/datum.
transformer = pyproj.Transformer.from_crs(
    dict(proj="geocent", ellps="WGS84", datum="WGS84"),
    dict(proj="latlong", ellps="WGS84", datum="WGS84"),
)



def ECEF_to_WGS84(x,y,z):
    """Convert geocentric x, y, z to geographic coordinates in degrees.

    Uses the module-level ``transformer``. Note the order of the returned
    tuple: ``(lon, lat, alt)``.
    """
    geographic = transformer.transform(x, y, z, radians=False)
    lon, lat, alt = geographic
    return lon, lat, alt


def position_shift(fname, a):
    """Move every point ``a`` metres towards the previous point of the same phone.

    Positions are converted to ECEF (with a fixed ellipsoidal height of
    63.5), shifted along the straight line joining the previous fix and the
    current one, and converted back to latitude/longitude:

        new = previous + (current - previous) * (1 - a / distance)

    so a positive ``a`` pulls a point back towards its predecessor and a
    negative ``a`` pushes it further away. Rows for which no finite shifted
    position exists (the first row of each phone, or a zero distance to the
    previous point) keep their original latDeg/lngDeg.

    Parameters
    ----------
    fname : DataFrame with ``phone``, ``millisSinceGpsEpoch``, ``latDeg`` and
        ``lngDeg`` (despite the name, a frame and not a file name). It is not
        modified.
    a : shift distance in metres.

    Returns
    -------
    DataFrame with columns ``phone``, ``millisSinceGpsEpoch``, ``latDeg``,
    ``lngDeg``, sorted by phone and time.
    """
    out_cols = ['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']
    height = 63.5  # fixed height above the ellipsoid used for every point

    # sort_values returns a new frame, so the caller's data is never touched;
    # repeated calls on the same input therefore give the same result.
    d = fname.sort_values(['phone', 'millisSinceGpsEpoch'])

    # WGS84_to_ECEF is element-wise, so whole columns can be converted at once.
    ecef = dict(zip(('x', 'y', 'z'), WGS84_to_ECEF(d['latDeg'], d['lngDeg'], height)))

    # previous point of the same phone (NaN on the first row of each phone)
    has_prev = d['phone'].eq(d['phone'].shift(1))
    prev = {axis: values.shift(1).where(has_prev) for axis, values in ecef.items()}
    # displacement from the previous point to the current one, and its length
    diff = {axis: ecef[axis] - prev[axis] for axis in ecef}
    dist = np.sqrt(diff['x']**2 + diff['y']**2 + diff['z']**2)

    scale = 1 - a / dist
    shifted = [(prev[axis] + diff[axis] * scale).to_numpy() for axis in ('x', 'y', 'z')]
    lng, lat, _alt = ECEF_to_WGS84(*shifted)
    lng = np.array(lng, dtype=float)
    lat = np.array(lat, dtype=float)

    # fall back to the original coordinate wherever the shift gave no finite value
    bad_lng = ~np.isfinite(lng)
    lng[bad_lng] = d['lngDeg'].to_numpy()[bad_lng]
    bad_lat = ~np.isfinite(lat)
    lat[bad_lat] = d['latDeg'].to_numpy()[bad_lat]

    d = d.assign(latDeg=lat, lngDeg=lng)
    return d[out_cols]

def objective(trial):
    """Optuna objective for the position-shift distance ``a`` (searched in [-1, 1]).

    Reads the module-level ``filtered`` and ``ground_truth`` frames and
    returns the ``compute_dist`` score as a plain float (lower is better).
    """
    a = trial.suggest_uniform('a', -1, 1)
    # Hand position_shift a copy: every trial must be evaluated on the same,
    # unmodified `filtered`, whether or not position_shift alters its argument.
    score, _ = compute_dist(position_shift(filtered.copy(), a), ground_truth)
    # compute_dist returns a one-element pandas object; Optuna expects a float.
    return float(np.squeeze(score))

# remove low Speed

In [9]:
def add_distance_nogt_diff(df):
    """Attach previous/next positions, coordinate differences and distances, in place.

    Neighbours are taken with ``shift`` on the current row order (``shift(1)``
    is the previous row, ``shift(-1)`` the next one). Adds
    ``latDeg_prev/next``, ``lngDeg_prev/next``, ``phone_prev/next``,
    ``latDeg_prev_diff/next_diff``, ``lngDeg_prev_diff/next_diff`` and
    ``dist_prev/next`` (haversine distance). Where the neighbouring row
    belongs to a different phone (or does not exist) all values derived from
    that neighbour are NaN.

    The input frame is modified and also returned.
    """
    for col in ('latDeg', 'lngDeg', 'phone'):
        df[f'{col}_prev'] = df[col].shift(1)
        df[f'{col}_next'] = df[col].shift(-1)

    # signed coordinate steps: previous -> current and current -> next
    for col in ('latDeg', 'lngDeg'):
        df[f'{col}_prev_diff'] = df[col] - df[f'{col}_prev']
        df[f'{col}_next_diff'] = df[f'{col}_next'] - df[col]

    for side in ('prev', 'next'):
        df[f'dist_{side}'] = calc_haversine(
            df['latDeg'], df['lngDeg'], df[f'latDeg_{side}'], df[f'lngDeg_{side}'])

    # a neighbour from another phone is not a real neighbour: blank it out
    for side in ('prev', 'next'):
        other_phone = df['phone'] != df[f'phone_{side}']
        derived = [f'latDeg_{side}', f'lngDeg_{side}', f'dist_{side}',
                   f'latDeg_{side}_diff', f'lngDeg_{side}_diff']
        df.loc[other_phone, derived] = np.nan

    return df

def remove_lowSpeed(_df, dist_thr=0.4):
    """Re-estimate the positions of points that barely moved.

    A point whose distance to the previous or to the next point of the same
    phone is below ``dist_thr`` has its latDeg/lngDeg discarded; the gaps are
    then filled per phone by linear interpolation over the row order (with
    fill towards both ends, so leading/trailing gaps take the nearest value).

    Parameters
    ----------
    _df : DataFrame with ``phone``, ``millisSinceGpsEpoch``, ``latDeg`` and
        ``lngDeg``, ordered so that neighbouring rows of a phone are
        neighbours in time. It is not modified.
    dist_thr : distance threshold, in the unit returned by ``calc_haversine``.

    Returns
    -------
    DataFrame with columns ``phone``, ``millisSinceGpsEpoch``, ``latDeg``,
    ``lngDeg``, grouped by phone.
    """
    out_cols = ['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']
    pos_cols = ['latDeg', 'lngDeg']

    df = _df.copy()
    df[pos_cols] = df[pos_cols].astype(float)
    df = add_distance_nogt_diff(df)

    # Positional boolean mask: unlike selecting by index label, this stays
    # correct when index labels are repeated.
    too_close = ((df['dist_prev'] < dist_thr) | (df['dist_next'] < dist_thr)).to_numpy()
    df.loc[too_close, pos_cols] = np.nan

    # Interpolate phone by phone. Only the position columns are interpolated,
    # so helper and text columns are never handed to interpolate().
    parts = []
    for _, phone_df in df.groupby('phone'):
        phone_df = phone_df[out_cols].copy()
        phone_df[pos_cols] = phone_df[pos_cols].interpolate(
            method='linear', limit=None, limit_direction='both').to_numpy()
        parts.append(phone_df)

    if not parts:
        # empty input: nothing to concatenate
        return df[out_cols]
    return pd.concat(parts)


def objective_rmls(trial):
    """Optuna objective for the ``remove_lowSpeed`` threshold (searched in [0.5, 0.9]).

    Reads the module-level ``filtered`` and ``ground_truth`` frames and
    returns the resulting train score (lower is better).
    """
    dist_thr = trial.suggest_uniform('x', 0.5, 0.9)
    candidate = remove_lowSpeed(filtered, dist_thr)
    return get_train_score(candidate, ground_truth)

# Tune the remove_lowSpeed threshold with Optuna (default direction: minimise).
study = optuna.create_study()
# objective_rmls reads the global `filtered`, which is only created by the
# training pipeline cell further down. On a fresh kernel (Restart & Run All)
# it does not exist yet, so skip the search instead of failing with NameError.
# NOTE(review): the threshold hard-coded in the pipeline cells was presumably
# found with `filtered` as it is just before the remove-low-speed step —
# confirm which stage `filtered` was at when this search was run.
if 'filtered' in globals():
    study.optimize(objective_rmls, n_trials=100)
else:
    print('`filtered` is not defined yet: run the training pipeline cell first, then re-run this cell.')

# phones mean

In [10]:
def mean_with_other_phones(df_):
    """Average each phone's track with the other phones of the same collection.

    For every phone, the tracks of the other phones in its collection are
    linearly interpolated (extrapolating if needed) at that phone's own
    timestamps and averaged with its own position. Another phone contributes
    only on the index window ``[start, stop)``, where ``start`` is the last
    sample earlier than that phone's first timestamp (0 if there is none) and
    ``stop`` is the last sample earlier than its last timestamp. Samples
    outside the window keep their own value.

    Expects ``collectionName``, ``phoneName``, ``millisSinceGpsEpoch``,
    ``latDeg`` and ``lngDeg``, with every phone's rows in increasing time
    order. Returns a modified copy; the input is left untouched.
    """
    df = df_.copy()
    track_cols = ['millisSinceGpsEpoch', 'latDeg', 'lngDeg']
    pos_cols = ['latDeg', 'lngDeg']

    for collection in df['collectionName'].drop_duplicates():
        in_collection = (df['collectionName'] == collection).to_numpy()
        phones = df.loc[in_collection, 'phoneName'].drop_duplicates().tolist()

        # row mask and (time, lat, lng) array of every phone in the collection
        masks = {
            phone: in_collection & (df['phoneName'] == phone).to_numpy()
            for phone in phones
        }
        tracks = {phone: df.loc[mask, track_cols].to_numpy() for phone, mask in masks.items()}

        averaged = {}
        for current, own_track in tracks.items():
            own_times = own_track[:, 0]
            # column 0: number of contributing phones (starts at 1 = itself);
            # columns 1-2: running sums of lat / lng
            acc = np.ones(own_track.shape, dtype=np.float64)
            acc[:, 1:] = own_track[:, 1:]

            # Phones do not share timestamps, so the others are interpolated.
            for other, other_track in tracks.items():
                if other == current:
                    continue

                position_at = interp1d(other_track[:, 0],
                                       other_track[:, 1:],
                                       axis=0,
                                       kind='linear',
                                       copy=False,
                                       bounds_error=None,
                                       fill_value='extrapolate',
                                       assume_sorted=True)

                # last own sample before the other track starts / ends (0 if none)
                before_start = np.flatnonzero(own_times < other_track[0, 0])
                before_end = np.flatnonzero(own_times < other_track[-1, 0])
                start_idx = before_start[-1] if before_start.size else 0
                stop_idx = before_end[-1] if before_end.size else 0

                if stop_idx - start_idx > 0:
                    window = slice(start_idx, stop_idx)
                    acc[window, 0] += 1
                    acc[window, 1:] += position_at(own_times[window])

            # sums -> means
            acc[:, 1] /= acc[:, 0]
            acc[:, 2] /= acc[:, 0]
            averaged[current] = acc

        for phone, mask in masks.items():
            df.loc[mask, pos_cols] = averaged[phone][:, 1:]

    return df

# train

In [11]:
# Training pipeline: runs every post-processing stage on the train positions
# and prints the train score after each one.

# reject outlier
# add_distance_diff works in place and returns the same object, so `train_ro`
# and `base_train` are one and the same frame from here on.
train_ro = add_distance_diff(base_train)
# distance threshold, in the unit returned by calc_haversine
th = 43
# a point too far from its previous OR its next point loses its position (NaN)
train_ro.loc[((train_ro['dist_prev'] > th) | (train_ro['dist_next'] > th)), ['latDeg', 'lngDeg']] = np.nan

# kalman filter
# NOTE(review): the NaN positions created above are handed to the smoother
# as-is; presumably simdkalman treats NaN as a missing observation — confirm.
cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']
train_ro_kf = apply_kf_smoothing(train_ro[cols])

# phone mean pred
train_lerp = make_lerp_data(train_ro_kf)
train_mean_pred = calc_mean_pred(train_ro_kf, train_lerp)

# get_train_score joins on `phone`, so rebuild that key on both frames
train_ro_kf['phone'] = train_ro_kf['collectionName'] + '_' + train_ro_kf['phoneName']
train_mean_pred['phone'] = train_mean_pred['collectionName'] + '_' + train_mean_pred['phoneName']

print('reject outlier + kalmanfilter: ', get_train_score(train_ro_kf, ground_truth))
print('phone mean pred : ', get_train_score(train_mean_pred, ground_truth))

# reduce to the four columns phone, millisSinceGpsEpoch, latDeg, lngDeg
train_mean_pred = train_mean_pred.drop('collectionName', axis=1)
train_mean_pred = train_mean_pred.drop('phoneName', axis=1)
train_mean_pred = train_mean_pred.reindex(['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg'], axis='columns')
# `filtered` is the running result of the stages below; it is also the global
# read by the Optuna objectives defined in earlier cells.
filtered = train_mean_pred

# remove device
# collectionName / phoneName are re-derived from `phone` for the helper and
# dropped again afterwards
filtered['collectionName'] =filtered['phone'].map(lambda x: x.split('_')[0])
filtered['phoneName'] = filtered['phone'].map(lambda x: x.split('_')[1])
filtered = get_removedevice(filtered, 'SamsungS20Ultra')
filtered = filtered.drop(columns=['collectionName', 'phoneName'], axis=1)

# phones mean
filtered['collectionName'] =filtered['phone'].map(lambda x: x.split('_')[0])
filtered['phoneName'] = filtered['phone'].map(lambda x: x.split('_')[1])
filtered = mean_with_other_phones(filtered)
filtered = filtered.drop(columns=['collectionName', 'phoneName'], axis=1)
print('phones mean :', get_train_score(filtered, ground_truth))

# remove lowSpeed
# NOTE(review): the threshold is presumably the best value found by the
# objective_rmls study — confirm.
filtered = remove_lowSpeed(filtered, 0.6939300630849313)
print('remove low speed: ', get_train_score(filtered, ground_truth))

# position shift
# NOTE(review): `a` is presumably the best value found with `objective` — confirm.
filtered = position_shift(filtered, a=0.6602905068929037)
print('position shift: ', get_train_score(filtered, ground_truth))

# to csv
filtered.to_csv('../output/filtered_nb046.csv', index=False)

# score
# Nothing changed since the previous print, so this repeats the position-shift score.
print('ro, kf, pm, rm, psm, rmls, ps: ', get_train_score(filtered, ground_truth))

reject outlier + kalmanfilter:  4.498849117070059
phone mean pred :  4.034688363261449
phones mean : 3.5362898482112963
remove low speed:  3.486164724134517
position shift:  3.4184543506885996
ro, kf, pm, rm, psm, rmls, ps:  3.4184543506885996


# submission

In [45]:
# Submission pipeline: the same stages as the training pipeline, applied to
# the test positions.

# template for the submission file
# Plain assignment: `submission` and `sample_sub` are the same object until
# `submission` is rebound below.
submission = sample_sub

# reject outlier
base_test = add_distance_diff(base_test)
# same threshold as in the training pipeline
th = 43
base_test.loc[((base_test['dist_prev'] > th) | (base_test['dist_next'] > th)), ['latDeg', 'lngDeg']] = np.nan

# kalman filter
# apply_kf_smoothing reads collectionName and phoneName, which are not derived
# in this cell: base_test must already contain them.
test_kf = apply_kf_smoothing(base_test)

# phone mean pred
test_lerp = make_lerp_data(test_kf)
test_mean_pred = calc_mean_pred(test_kf, test_lerp)
# Column assignment aligns on the index.
# NOTE(review): assumes test_mean_pred has the same row order (and index) as
# the sample submission — confirm.
submission['latDeg'] = test_mean_pred['latDeg']
submission['lngDeg'] = test_mean_pred['lngDeg']

# Remove Device
# collectionName / phoneName are re-derived from `phone` for the helper and
# dropped again afterwards
submission['collectionName'] = submission['phone'].map(lambda x: x.split('_')[0])
submission['phoneName'] = submission['phone'].map(lambda x: x.split('_')[1])
submission = get_removedevice(submission, 'SamsungS20Ultra')
submission = submission.drop(columns=['collectionName', 'phoneName'], axis=1)

# phones mean
submission['collectionName'] =submission['phone'].map(lambda x: x.split('_')[0])
submission['phoneName'] = submission['phone'].map(lambda x: x.split('_')[1])
submission = mean_with_other_phones(submission)
submission = submission.drop(columns=['collectionName', 'phoneName'], axis=1)

# remove lowSpeed (same threshold as in the training pipeline)
submission = remove_lowSpeed(submission, 0.6939300630849313)

# position shift (same `a` as in the training pipeline)
submission = position_shift(submission, a=0.6602905068929037)

# submission
# Writing the file is currently disabled.
# submission.to_csv('../output/sub_nb046.csv', index=False)