In [2]:
import numpy as np 
import pandas as pd
import seaborn as sns
import pathlib

import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import plotly.express as px
%matplotlib inline

  shapely_geos_version, geos_capi_version_string


In [3]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between pairs of points (haversine formula).

    All four arguments are latitudes / longitudes in decimal degrees. They may
    be scalars or array-likes; each is converted to radians with NumPy and the
    formula is evaluated element-wise.

    Returns the distance in the unit of the sphere radius used below
    (6,367,000 -- presumably metres; confirm against the scoring definition).
    """
    sphere_radius = 6_367_000
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))
    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2
    # Haversine term: sin^2(dphi / 2) + cos(phi1) * cos(phi2) * sin^2(dlam / 2)
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * sphere_radius * np.arcsin(hav ** 0.5)

    
def percentile50(x):
    """Return the 50th percentile of ``x`` as computed by ``np.percentile``.

    Kept as a named function so that, when passed to a groupby ``.agg([...])``,
    the resulting column is labelled ``'percentile50'``.
    """
    return np.percentile(x, q=50)
def percentile95(x):
    """Return the 95th percentile of ``x`` as computed by ``np.percentile``.

    Kept as a named function so that, when passed to a groupby ``.agg([...])``,
    the resulting column is labelled ``'percentile95'``.
    """
    return np.percentile(x, q=95)

def get_train_score(df, gt):
    """Score predicted positions against ground truth.

    ``df`` holds predictions and ``gt`` holds ground truth; both must carry
    ``phone``, ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg`` columns.
    The frames are inner-joined on (``phone``, ``millisSinceGpsEpoch``), so
    rows present in only one of them do not contribute to the score.

    For every phone the 50th and 95th percentile of the haversine error are
    averaged; the return value is the mean of that per-phone average.
    Neither input frame is modified.
    """
    truth = gt.rename(columns={'latDeg': 'latDeg_gt', 'lngDeg': 'lngDeg_gt'})
    # Join key is the combined 'phone' column (an earlier variant keyed on
    # collectionName / phoneName separately).
    matched = df.merge(truth, on=['phone', 'millisSinceGpsEpoch'], how='inner')

    # Distance between each ground-truth position and its prediction.
    matched['err'] = calc_haversine(
        matched['latDeg_gt'], matched['lngDeg_gt'],
        matched['latDeg'], matched['lngDeg'],
    )

    # Group by phone and take the 50th / 95th percentile of the error
    # (difference between ground truth and prediction).
    per_phone = matched.groupby('phone')['err'].agg([percentile50, percentile95])
    per_phone_mean = (per_phone['percentile50'] + per_phone['percentile95']) / 2
    return per_phone_mean.mean()
INPUT = '../input/google-smartphone-decimeter-challenge'
# Collect every ground_truth.csv found two directory levels below train/.
p = pathlib.Path(INPUT)
gt_files = [*p.glob('train/*/*/ground_truth.csv')]
print('ground_truth.csv count : ', len(gt_files))

# Read each file and stack them into a single frame (original per-file row
# indices are kept, so the resulting index is not unique).
gts = [pd.read_csv(gt_file) for gt_file in gt_files]
ground_truth = pd.concat(gts)
# Combined "<collectionName>_<phoneName>" key, used to join with predictions.
ground_truth['phone'] = ground_truth['collectionName'] + '_' + ground_truth['phoneName']

ground_truth.csv count :  73


In [74]:
def ensembling(main, support, coeff1, coeff2):
    """Blend two prediction frames with a per-column weighted average.

    Rows are paired by position (row ``i`` of ``main`` with row ``i`` of
    ``support``), ignoring the index. Only the columns at positions 2 and 3
    are blended (presumably latitude and longitude -- confirm against the CSV
    layout); every other column of the result is taken from ``main``.

    Parameters
    ----------
    main, support : pandas.DataFrame
        Frames with the same number of rows, in the same row order, and at
        least four columns.
    coeff1 : float
        Weight given to ``main`` for column 2; ``support`` gets ``1 - coeff1``.
    coeff2 : float
        Weight given to ``main`` for column 3; ``support`` gets ``1 - coeff2``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``main`` whose columns 2 and 3 hold the blended values.
        Neither input frame is modified.

    Raises
    ------
    ValueError
        If the frames differ in row count. Positional blending of frames of
        different length would pair unrelated rows, so it is rejected up front
        instead of failing midway or silently ignoring trailing rows.
    """
    if len(main) != len(support):
        raise ValueError(
            "main and support must have the same number of rows "
            f"(got {len(main)} and {len(support)})"
        )

    ense = main.copy()
    for col_pos, weight in ((2, coeff1), (3, coeff2)):
        # Work on plain arrays so the blend is positional, not index-aligned.
        main_vals = main.iloc[:, col_pos].to_numpy()
        support_vals = support.iloc[:, col_pos].to_numpy()
        # Replacing the column by label lets it take the dtype of the blend
        # instead of round-tripping the whole frame through an object array.
        ense[ense.columns[col_pos]] = (main_vals * weight) + (support_vals * (1.0 - weight))
    return ense

In [127]:
# Train-set blend: filtered nb046 output as main, filtered nb056 output as support.
path1, path2 = '../output/filtered_nb046.csv', '../output/filtered_nb056.csv'
tmp1, tmp2 = (pd.read_csv(csv_path) for csv_path in (path1, path2))

# main gets weight 0.02 on column 2 and 0.09 on column 3; support gets the rest.
en1 = ensembling(main=tmp1, support=tmp2, coeff1=0.02, coeff2=0.09)
# Last expression of the cell, so the notebook displays the score.
get_train_score(en1, ground_truth)

3.142271664685327

In [139]:
# Second-level blend of two earlier blends: en1 is weighted 0.2 / 0.1 and en2
# receives the remainder.
# NOTE(review): en2 is only assigned in a cell that appears further down, so
# running the visible cells top to bottom on a fresh kernel would fail here
# unless en2 is also defined elsewhere -- confirm the intended cell order.
en3 = ensembling(main=en1, support=en2, coeff1=0.2, coeff2=0.1)
get_train_score(en3, ground_truth)

3.1384583490343028

In [123]:
# Train-set blend: filtered nb065 output as main, filtered nb046 output as support.
# path3 / path4 / tmp3 / tmp4 are reused with different files in another cell.
path3, path4 = '../output/filtered_nb065.csv', '../output/filtered_nb046.csv'
tmp3, tmp4 = map(pd.read_csv, (path3, path4))

# main gets weight 0.5 on column 2 and 0.7 on column 3; support gets the rest.
en4 = ensembling(main=tmp3, support=tmp4, coeff1=0.5, coeff2=0.7)
get_train_score(en4, ground_truth)

3.4011460757204977

In [140]:
# Train-set blend: filtered nb065 output as main, filtered nb056 output as support.
# Rebinds path3 / path4 / tmp3 / tmp4, which another cell also assigns.
path3, path4 = '../output/filtered_nb065.csv', '../output/filtered_nb056.csv'
tmp3, tmp4 = map(pd.read_csv, (path3, path4))

# main gets weight 0.05 on column 2 and 0.09 on column 3; support gets the rest.
en2 = ensembling(main=tmp3, support=tmp4, coeff1=0.05, coeff2=0.09)
get_train_score(en2, ground_truth)

3.137867005883899

In [142]:
# Load the two submission files that get blended below.
# Note the pairing: path2 -> nb065, path1 -> nb056 (both names are rebound here).
path2, path1 = '../output/sub_nb065.csv', '../output/sub_nb056.csv'
sub065, sub056 = (pd.read_csv(csv_path) for csv_path in (path2, path1))

In [149]:
# Blend the submission frames with nb065 as main and nb056 as support, using
# the same weights (0.05 / 0.09) as the nb065 + nb056 train-set blend.
sub2 = ensembling(main=sub065, support=sub056, coeff1=0.05, coeff2=0.09)

In [152]:
# Write the blended submission; index=False keeps the row index out of the CSV.
# NOTE(review): the file is tagged nb060 while the inputs are nb065 / nb056 --
# confirm this is the intended output name.
sub2.to_csv(path_or_buf="../output/sub_nb060_1.csv", index=False)