# Position shift

In [4]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import pyproj
from pyproj import Proj, transform # 地理的な位置を示す情報を扱うときに、座標系・測地系変換を行ったり、2点間の距離・方位角を計算したりできる。

import optuna

def calc_haversine(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance between two lat/lon points.

    All four arguments are angles in degrees; they are converted with
    ``np.radians``, so scalars, NumPy arrays and pandas Series all work
    element-wise. The result is expressed in the unit of the sphere radius
    used below (6,367,000 -- presumably metres).
    """
    sphere_radius = 6_367_000  # radius of the sphere the distance is measured on

    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine of the central angle, built from its two additive terms.
    lat_term = np.sin(delta_phi/2)**2
    lon_term = np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    hav = lat_term + lon_term

    # Distance between the two points (e.g. an estimate and its ground truth).
    return 2 * sphere_radius * np.arcsin(hav**0.5)

def compute_dist(fname, fname2 = 'gt.csv'):
    """Score predicted positions against ground truth, per phone.

    Parameters
    ----------
    fname : path-like, file-like or pandas.DataFrame
        Predictions with columns ``phone``, ``millisSinceGpsEpoch``,
        ``latDeg`` and ``lngDeg``. A DataFrame is used as-is; anything else
        is handed to ``pd.read_csv``.
    fname2 : path-like, file-like or pandas.DataFrame, default 'gt.csv'
        Ground truth with the same columns, handled the same way.

    Returns
    -------
    tuple
        ``(score, per_phone)``. ``score`` is the mean over phones of the
        50th-percentile error averaged with the mean over phones of the
        95th-percentile error; it is a one-element Series indexed by
        ``'dst'``. ``per_phone`` is a DataFrame with columns ``phone``,
        ``q50`` and ``q95``.
    """
    # Accept in-memory frames as well as CSV paths so the output of a
    # transformation step can be scored without a round trip through disk.
    oof = fname if isinstance(fname, pd.DataFrame) else pd.read_csv(fname)
    gt = fname2 if isinstance(fname2, pd.DataFrame) else pd.read_csv(fname2)

    # Inner join: only rows present in both inputs are scored. The shared
    # latDeg/lngDeg columns get the _x (prediction) / _y (truth) suffixes.
    df = oof.merge(gt, on = ['phone', 'millisSinceGpsEpoch'])
    dst_oof = calc_haversine(df.latDeg_x, df.lngDeg_x, df.latDeg_y, df.lngDeg_y)
    scores = pd.DataFrame({'phone': df.phone, 'dst': dst_oof})

    # Distance-error quantiles per phone, computed once and reused for both
    # the summary score and the per-phone table.
    scores_grp = scores.groupby('phone')
    q50 = scores_grp.quantile(.50)
    q95 = scores_grp.quantile(.95)

    d50 = q50.reset_index()
    d50.columns = ['phone', 'q50']
    d95 = q95.reset_index()
    d95.columns = ['phone', 'q95']
    return (q50.mean() + q95.mean())/2, d50.merge(d95)

def WGS84_to_ECEF(lat, lon, alt):
    """Convert geodetic WGS84 coordinates to Earth-centred Cartesian x, y, z.

    ``lat`` and ``lon`` are in degrees and ``alt`` is the height above the
    ellipsoid, in the same length unit as the semi-major axis below.
    Inputs may be scalars or array-likes; the result is the tuple
    ``(x, y, z)``.
    """
    deg_to_rad = np.pi / 180.0
    phi = lat * deg_to_rad
    lam = lon * deg_to_rad

    semi_major = 6378137.0            # semi-major axis of the ellipsoid
    inv_flattening = 298.257223563
    flattening = 1 / inv_flattening
    ecc_sq = 1 - (1 - flattening) * (1 - flattening)  # first eccentricity squared

    sin_phi = np.sin(phi)
    cos_phi = np.cos(phi)

    # Radius of curvature in the prime vertical.
    prime_vertical = semi_major / np.sqrt(1 - ecc_sq * sin_phi * sin_phi)

    # Distance from the rotation axis, shared by the x and y components.
    axis_dist = (prime_vertical + alt) * cos_phi
    x = axis_dist * np.cos(lam)
    y = axis_dist * np.sin(lam)
    z = (prime_vertical * (1 - ecc_sq) + alt) * sin_phi
    return x, y, z

# Shared pyproj transformer used by ECEF_to_WGS84: source is geocentric
# (x, y, z) coordinates, target is longitude/latitude, both on WGS84.
# Built once at module level so every conversion reuses it.
transformer = pyproj.Transformer.from_crs(
    crs_from={"proj": "geocent", "ellps": "WGS84", "datum": "WGS84"},
    crs_to={"proj": "latlong", "ellps": "WGS84", "datum": "WGS84"},
)



def ECEF_to_WGS84(x,y,z):
    """Convert geocentric x, y, z back to WGS84 via the shared ``transformer``.

    Returns ``(lon, lat, alt)`` -- longitude first -- with angles in degrees
    (``radians=False``). Scalars and arrays are both passed straight through
    to ``transformer.transform``.
    """
    geodetic = transformer.transform(x, y, z, radians=False)
    lon_deg, lat_deg, height = geodetic
    return lon_deg, lat_deg, height

# Locations of the competition input data.
datadir = Path('../input/google-smartphone-decimeter-challenge/')
testdir = datadir / 'test'
traindir = datadir / 'train'

# The sample submission defines the column set every exported file uses.
sample_sub = pd.read_csv(datadir/'sample_submission.csv')
sub_columns = sample_sub.columns

# Re-export the baseline predictions restricted to the submission columns.
baseline_train = pd.read_csv(datadir / 'baseline_locations_train.csv')
baseline_train[sub_columns].to_csv('btrain.csv',index = False)
baseline_test = pd.read_csv(datadir / 'baseline_locations_test.csv')
baseline_test[sub_columns].to_csv('btest.csv',index = False)

msge = 'millisSinceGpsEpoch'

# Read train/<collection>/<phone>/ground_truth.csv for every collection and
# phone, then concatenate once. DataFrame.append was removed in pandas 2.0
# and re-copied the accumulated frame on every iteration.
gt = pd.concat(
    [
        pd.read_csv(traindir/collection/phone_dir/'ground_truth.csv')
        for collection in os.listdir(traindir)
        for phone_dir in os.listdir(traindir/collection)
    ]
)

# Build the same '<collection>_<phone>' key the prediction files use.
gt['phone'] = gt['collectionName'] + '_' + gt['phoneName']
gt[sub_columns].to_csv('gt.csv', index = False)
gt['heightAboveWgs84EllipsoidM'].describe()

count    131342.000000
mean         87.028847
std          56.999876
min          31.160000
25%          37.160000
50%          63.520000
75%         122.330000
max         247.850000
Name: heightAboveWgs84EllipsoidM, dtype: float64

In [None]:
# Score the unmodified baseline train predictions against the ground truth.
# `score` is the overall metric returned by compute_dist; `scores` is its
# per-phone q50/q95 table, displayed as the cell output.
score, scores = compute_dist('btrain.csv','gt.csv')
print(score)
scores

In [11]:
def position_shift(fname, a, height=63.5):
    """Shift every position by a fixed distance along its direction of travel.

    For each row, the vector from the previous row of the same phone to the
    current row is computed in Earth-centred Cartesian coordinates. The
    current point is then replaced by ``previous + diff * (1 - a / dist)``,
    i.e. moved ``a`` length units back toward the previous point (forward,
    away from it, when ``a`` is negative). Rows without a usable shift -- the
    first row of each phone, or a zero step length -- keep their original
    latitude/longitude.

    Parameters
    ----------
    fname : path-like
        CSV with at least ``phone``, ``millisSinceGpsEpoch``, ``latDeg`` and
        ``lngDeg`` columns.
    a : float
        Shift distance, in the length unit of the Cartesian coordinates.
    height : float, default 63.5
        Ellipsoidal height assigned to every row before the conversion (the
        input carries no usable height of its own).

    Returns
    -------
    pandas.DataFrame
        Rows sorted by phone and time, restricted to ``sub_columns``, with
        ``latDeg``/``lngDeg`` replaced by the shifted positions.
    """
    d = pd.read_csv(fname)
    d['heightAboveWgs84EllipsoidM'] = height

    # WGS84_to_ECEF is written with NumPy ufuncs, so whole columns can be
    # converted in one call instead of row by row.
    d['x'], d['y'], d['z'] = WGS84_to_ECEF(
        d['latDeg'], d['lngDeg'], d['heightAboveWgs84EllipsoidM'])

    d = d.sort_values(['phone', msge])

    # True where the previous row belongs to the same phone.
    same_phone = d['phone'].eq(d['phone'].shift(1))
    for fi in ['x', 'y', 'z']:
        # Previous position of the same phone (NaN on each phone's first row).
        d[fi + 'p'] = d[fi].shift(1).where(same_phone)
        # Step from the previous position to the current one.
        d[fi + 'diff'] = d[fi] - d[fi + 'p']

    # Length of that step.
    d['dist'] = np.sqrt(d['xdiff']**2 + d['ydiff']**2 + d['zdiff']**2)
    for fi in ['x', 'y', 'z']:
        d[fi + 'new'] = d[fi + 'p'] + d[fi + 'diff'] * (1 - a / d['dist'])

    lng, lat, _ = ECEF_to_WGS84(
        d['xnew'].values, d['ynew'].values, d['znew'].values)
    lng = np.asarray(lng, dtype=float)
    lat = np.asarray(lat, dtype=float)

    # Keep the original coordinates wherever the shifted ones are unusable.
    # Checking for non-finite values covers NaN as well as an infinite
    # result coming back from the coordinate transform.
    unusable = ~(np.isfinite(lng) & np.isfinite(lat))
    d['latDeg'] = np.where(unusable, d['latDeg'].to_numpy(), lat)
    d['lngDeg'] = np.where(unusable, d['lngDeg'].to_numpy(), lng)

    return d[sub_columns]

def objective(trial):
    """Optuna objective: train score after shifting the baseline by ``a``.

    ``a`` is sampled from [-1, 1]. position_shift returns a DataFrame, so the
    shifted predictions are written to 'btrain_shifted.csv' and that path is
    what gets scored against 'gt.csv'. The score is returned as a plain
    float, which is what Optuna expects from an objective.
    """
    # suggest_float replaces the deprecated suggest_uniform.
    a = trial.suggest_float('a', -1, 1)

    shifted_path = 'btrain_shifted.csv'
    position_shift('btrain.csv', a).to_csv(shifted_path, index=False)

    score, _ = compute_dist(shifted_path, 'gt.csv')
    # compute_dist yields the score wrapped in a one-element container;
    # unwrap it to a scalar.
    return float(np.ravel(score)[0])

In [14]:
# Search for the shift distance `a` that minimises the train score.
# The sampler is seeded so that Restart & Run All reproduces the same trials;
# the best value of an earlier, unseeded run is the constant used in the
# cells further down.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

[32m[I 2021-06-13 16:47:07,314][0m A new study created in memory with name: no-name-64a78885-ac22-4346-9d49-8c49c20eb55a[0m
[32m[I 2021-06-13 16:47:14,240][0m Trial 0 finished with value: 5.210776192950242 and parameters: {'a': 0.32732961296509044}. Best is trial 0 with value: 5.210776192950242.[0m
[32m[I 2021-06-13 16:47:20,832][0m Trial 1 finished with value: 5.191607573728321 and parameters: {'a': 0.5121084706173713}. Best is trial 1 with value: 5.191607573728321.[0m
[32m[I 2021-06-13 16:47:27,506][0m Trial 2 finished with value: 5.193063800543382 and parameters: {'a': 0.4797307221793101}. Best is trial 1 with value: 5.191607573728321.[0m
[32m[I 2021-06-13 16:47:35,535][0m Trial 3 finished with value: 5.190931813591363 and parameters: {'a': 0.5678369778818746}. Best is trial 3 with value: 5.190931813591363.[0m
[32m[I 2021-06-13 16:47:42,850][0m Trial 4 finished with value: 5.404028042059111 and parameters: {'a': -0.3283256518512281}. Best is trial 3 with value: 5.19

{'a': 0.6602905068929037}: leaderboard (LB) score 5.531

## フィルター済みtrain

In [6]:
# Apply the position shift to the filtered train file and save the result.
input_file = '../output/filtered_nb016.csv'
# `a` is pinned to the value recorded in the note above (lb 5.531).
# Alternative: a=study.best_params['a'], which needs the study to have been
# run in this kernel.
filtered = position_shift(input_file, a=0.6602905068929037)
filtered.to_csv('../output/filtered_nb017.csv', index=False)

### nb023, nb016

In [7]:
# Same position shift, applied to the second filtered train file.
input_file = '../output/filtered_nb016_2.csv'
# `a` is pinned to the recorded value; a=study.best_params['a'] would need
# the study to have been run in this kernel.
filtered = position_shift(input_file, a=0.6602905068929037)
filtered.to_csv('../output/filtered_nb017_2.csv', index=False)

## submission

1. phone mean prediction
2. remove device
3. position shift #######

In [20]:
# Position-shift the nb016 submission and save it as the nb017 submission.
input_file = '../output/sub_nb016.csv'
# Use the recorded shift distance, as every other export cell does, instead
# of study.best_params['a']: that lookup only works while a finished study is
# alive in the kernel, and a fresh study may settle on a different value.
submission = position_shift(input_file, a=0.6602905068929037)
submission.to_csv('../output/sub_nb017.csv', index=False)

1. phone mean prediction
2. position shift #######
3. remove device

In [8]:
# Position-shift the nb005 submission and save the result.
input_file = '../output/sub_nb005.csv'
# `a` is pinned to the recorded value; a=study.best_params['a'] would need
# the study to have been run in this kernel.
submission = position_shift(input_file, a=0.6602905068929037)
submission.to_csv('../output/sub_nb017_1.csv', index=False)

### 二度目のposition shift

In [9]:
# Second pass: shift the already-shifted nb017 submission once more.
input_file = '../output/sub_nb017.csv'
# `a` is pinned to the recorded value; a=study.best_params['a'] would need
# the study to have been run in this kernel.
submission = position_shift(input_file, a=0.6602905068929037)
# NOTE(review): a later cell (input sub_nb016_2.csv) writes to this same
# path, so this file does not survive a full run -- confirm the intended name.
submission.to_csv('../output/sub_nb017_2.csv', index=False)

### nb023, nb016

In [10]:
# Position-shift the sub_nb016_2 submission and save the result.
input_file = '../output/sub_nb016_2.csv'
# `a` is pinned to the recorded value; a=study.best_params['a'] would need
# the study to have been run in this kernel.
submission = position_shift(input_file, a=0.6602905068929037)
# NOTE(review): the earlier "second position shift" cell writes to this same
# path, so its output is overwritten here -- confirm that is intended.
submission.to_csv('../output/sub_nb017_2.csv', index=False)

nb023_1, nb016_3

In [12]:
# Position-shift the sub_nb016_3 submission and save the result.
input_file = '../output/sub_nb016_3.csv'
# `a` is pinned to the recorded value; a=study.best_params['a'] would need
# the study to have been run in this kernel.
submission = position_shift(input_file, a=0.6602905068929037)
submission.to_csv('../output/sub_nb017_3.csv', index=False)