In [39]:
import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from scipy.interpolate import interp1d

In [40]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance between two sets of points.

    All four inputs are array-like (or scalars) in decimal degrees and are
    combined element-wise. The result is expressed in the unit of the sphere
    radius used below (6,367,000, i.e. an Earth radius in metres).
    """
    earth_radius = 6_367_000
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine of the central angle between the two points.
    hav = np.sin(delta_phi/2)**2 + \
        np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2

    return 2 * earth_radius * np.arcsin(hav**0.5)

def percentile50(x):
    """Return the 50th percentile (median) of `x`.

    Kept as a named function because `groupby(...).agg` uses the function
    name as the label of the resulting column.
    """
    median = np.percentile(x, q=50)
    return median
def percentile95(x):
    """Return the 95th percentile of `x`.

    Kept as a named function because `groupby(...).agg` uses the function
    name as the label of the resulting column.
    """
    upper_tail = np.percentile(x, q=95)
    return upper_tail

def get_train_score(df, gt):
    """Score predicted positions against ground truth.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictions with columns `phone`, `millisSinceGpsEpoch`, `latDeg`
        and `lngDeg`.
    gt : pandas.DataFrame
        Ground truth with columns `collectionName`, `phoneName`,
        `millisSinceGpsEpoch`, `latDeg` and `lngDeg`. It is not modified.

    Returns
    -------
    float
        Mean over phones of the average of the 50th and 95th percentile of
        the haversine error between prediction and ground truth. Only rows
        present in both frames (inner join on `phone` and
        `millisSinceGpsEpoch`) contribute.
    """
    # Build the join key on a copy: assigning `gt['phone']` directly would add
    # or overwrite a column on the caller's DataFrame as a side effect.
    gt = gt.assign(phone=gt['collectionName'] + '_' + gt['phoneName'])
    gt = gt.rename(columns={'latDeg': 'latDeg_gt', 'lngDeg': 'lngDeg_gt'})
    df = df.merge(gt, on=['phone', 'millisSinceGpsEpoch'], how='inner')

    # Distance error of every matched sample.
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])

    # Group by phone and take the 50th and 95th percentile of the error
    # between ground truth and prediction.
    res = df.groupby('phone')['err'].agg([percentile50, percentile95])
    res['p50_p95_mean'] = (res['percentile50'] + res['percentile95']) / 2
    return res['p50_p95_mean'].mean()

In [41]:
def apply_gauss_smoothing(df_, params):
    """Smooth every (collectionName, phoneName) track with blended Gaussians.

    `params` must provide `sz_1`, `sz_2` and `sz_crit`; the square root of
    each is used as the sigma of a 1-D Gaussian filter. For latitude and
    longitude independently, the track is filtered with the `sz_1` and the
    `sz_2` sigma and the two results are blended sample by sample. The blend
    weight of the `sz_1` result is |smoothed step| / smoothed |step| (steps
    being consecutive differences, filtered with the `sz_crit` sigma), with
    a trailing 0 appended so the weight has one entry per sample.

    Returns a smoothed copy; `df_` itself is left untouched.
    """
    sigma_1 = np.sqrt(params['sz_1'])
    sigma_2 = np.sqrt(params['sz_2'])
    sigma_crit = np.sqrt(params['sz_crit'])

    def blend(track):
        # Two smoothed versions of the same coordinate series.
        smooth_1 = gaussian_filter1d(track, sigma_1)
        smooth_2 = gaussian_filter1d(track, sigma_2)

        # Ratio of the smoothed signed step to the smoothed absolute step;
        # the 1e-9 keeps the division defined when the track does not move.
        step = track[1:] - track[:-1]
        signed = gaussian_filter1d(step, sigma_crit)
        magnitude = gaussian_filter1d(np.abs(step), sigma_crit)
        weight = np.append(np.abs(signed / (1e-9 + magnitude)), [0])

        return smooth_1 * weight + smooth_2 * (1.0 - weight)

    out = df_.copy()
    paths = out[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in paths:
        on_path = (out['collectionName'] == collection) & (out['phoneName'] == phone)
        # Read both coordinates before writing either one back.
        coords = out.loc[on_path, ['latDeg', 'lngDeg']].to_numpy()

        out.loc[on_path, 'latDeg'] = blend(coords[:, 0])
        out.loc[on_path, 'lngDeg'] = blend(coords[:, 1])

    return out
def _last_index_before(times, threshold):
    """Return the last position in `times` whose value is < `threshold` (0 if none)."""
    hits = np.flatnonzero(times < threshold)
    return int(hits[-1]) if hits.size else 0


def mean_with_other_phones(df_):
    """Average each phone's track with the other phones of the same collection.

    For every `collectionName`, each phone's `latDeg`/`lngDeg` samples are
    averaged with the positions of the other phones of that collection,
    linearly interpolated (or extrapolated) at the current phone's
    `millisSinceGpsEpoch` timestamps.

    Only rows in the slice `start_idx:stop_idx` receive a contribution from
    another phone, where `start_idx` is the last row whose timestamp precedes
    the other phone's first timestamp (0 if there is none) and `stop_idx` is
    the last row whose timestamp precedes the other phone's last timestamp.
    Rows outside that slice keep their own value.

    NOTE(review): because the slice end is exclusive, row `stop_idx` itself is
    not averaged, and row `start_idx` may lie before the other track begins
    (its value is then extrapolated). This matches the previous behaviour and
    is kept on purpose so scores stay comparable; confirm whether it is
    intended.

    Rows of each phone are assumed to be ordered by time (the interpolator is
    built with `assume_sorted=True`).

    Returns a corrected copy; `df_` itself is left untouched.
    """
    df = df_.copy()
    track_columns = ['millisSinceGpsEpoch', 'latDeg', 'lngDeg']

    for collection in df['collectionName'].drop_duplicates().to_numpy():
        in_collection = df['collectionName'] == collection
        phones = df.loc[in_collection, 'phoneName'].drop_duplicates().to_numpy()

        # Snapshot every phone's track first so that all corrections are
        # computed from the uncorrected positions.
        masks = {phone: in_collection & (df['phoneName'] == phone) for phone in phones}
        tracks = {phone: df.loc[mask, track_columns].to_numpy()
                  for phone, mask in masks.items()}

        corrections = {}
        for current, track in tracks.items():
            times = track[:, 0]

            # Column 0 counts the tracks contributing to each row (the phone
            # itself counts once); columns 1: hold the running lat/lng sums.
            # Plain `float` replaces the `np.float` alias, which is deprecated
            # since NumPy 1.20 and removed in NumPy 1.24.
            acc = np.ones(track.shape, dtype=float)
            acc[:, 1:] = track[:, 1:]

            # Phone timestamps do not match completely, so interpolate.
            for other, other_track in tracks.items():
                if other == current:
                    continue

                loc = interp1d(other_track[:, 0],
                               other_track[:, 1:],
                               axis=0,
                               kind='linear',
                               copy=False,
                               bounds_error=None,
                               fill_value='extrapolate',
                               assume_sorted=True)

                start_idx = _last_index_before(times, other_track[0, 0])
                stop_idx = _last_index_before(times, other_track[-1, 0])

                if stop_idx > start_idx:
                    acc[start_idx:stop_idx, 0] += 1
                    acc[start_idx:stop_idx, 1:] += loc(times[start_idx:stop_idx])

            # Sum of positions divided by the number of contributing tracks.
            corrections[current] = acc[:, 1:] / acc[:, [0]]

        for phone, mask in masks.items():
            df.loc[mask, ['latDeg', 'lngDeg']] = corrections[phone]

    return df

In [42]:
# Post-process the test baseline: Gaussian smoothing followed by averaging with
# the other phones of the same collection.
# (A stray `base_train` expression used to open this cell; that name is only
# defined in a later cell, so it raised NameError on a fresh kernel.)
test_base = pd.read_csv('../input/google-smartphone-decimeter-challenge/baseline_locations_test.csv')
sub = pd.read_csv('../input/google-smartphone-decimeter-challenge/sample_submission.csv')

smoothed_baseline = apply_gauss_smoothing(test_base, {'sz_1' : 0.85, 'sz_2' : 5.65, 'sz_crit' : 1.5})
smoothed_baseline = mean_with_other_phones(smoothed_baseline)

# Writing the submission is currently disabled:
# sub = sub.assign(latDeg=smoothed_baseline.latDeg, lngDeg=smoothed_baseline.lngDeg)
# sub.to_csv('submission.csv', index=False)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [43]:
import pathlib

# Root folder of the competition data.
INPUT = '../input/google-smartphone-decimeter-challenge'

# Training predictions to post-process. Previously used alternatives:
# base_train = pd.read_csv(INPUT + '/' + 'baseline_locations_train.csv')
# base_train = pd.read_csv('../output/filtered_nb037.csv')
base_train = pd.read_csv('../output/filtered_nb041.csv')
# `phone` has the form '<collectionName>_<phoneName>'; split it back into its parts.
base_train['collectionName'] = base_train['phone'].apply(lambda phone: phone.split('_')[0])
base_train['phoneName'] = base_train['phone'].apply(lambda phone: phone.split('_')[1])

# Test predictions. Previously used alternatives:
# base_test = pd.read_csv('../output/sub_nb037.csv')
# base_test = pd.read_csv('../output/fixed_base_test.csv')
base_test = pd.read_csv('../output/sub_nb037_5.csv')

sample_sub = pd.read_csv(f'{INPUT}/sample_submission.csv')

# Ground truth: one ground_truth.csv per train/<collection>/<phone> folder.
p = pathlib.Path(INPUT)
gt_files = list(p.glob('train/*/*/ground_truth.csv'))
print('ground_truth.csv count : ', len(gt_files))

gts = [pd.read_csv(gt_path) for gt_path in gt_files]
ground_truth = pd.concat(gts)
# Same '<collectionName>_<phoneName>' key as in the prediction frames.
ground_truth['phone'] = ground_truth['collectionName'] + '_' + ground_truth['phoneName']

ground_truth.csv count :  73


In [44]:
# Reference score: the loaded training predictions without any post-processing in this notebook.
get_train_score(base_train, ground_truth)

3.8035925918757334

In [45]:
# Gaussian smoothing only, applied to the training predictions, then scored.
# NOTE: this rebinds `smoothed_baseline`, which an earlier cell assigned from the test data.
smoothed_baseline = apply_gauss_smoothing(base_train, {'sz_1' : 0.85, 'sz_2' : 5.65, 'sz_crit' : 1.5})
get_train_score(smoothed_baseline, ground_truth)

3.9062696173443814

In [48]:
# Gaussian smoothing followed by averaging with the other phones, then scored.
# Relies on `smoothed_baseline` holding the smoothed training data from the previous cell.
smoothed_baseline1 = mean_with_other_phones(smoothed_baseline)
get_train_score(smoothed_baseline1, ground_truth)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


3.5874074412848453

In [46]:
# Averaging with the other phones only (no Gaussian smoothing), then scored.
train = mean_with_other_phones(base_train)
get_train_score(train, ground_truth)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


3.4515872201655102

In [49]:
# Preview the first rows of the cross-phone-averaged training frame.
train.head()

Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg,collectionName,phoneName
0,2020-05-14-US-MTV-1_Pixel4,1273529463442,37.423549,-122.094006,2020-05-14-US-MTV-1,Pixel4
1,2020-05-14-US-MTV-1_Pixel4,1273529464442,37.423563,-122.094056,2020-05-14-US-MTV-1,Pixel4
2,2020-05-14-US-MTV-1_Pixel4,1273529465442,37.423571,-122.09409,2020-05-14-US-MTV-1,Pixel4
3,2020-05-14-US-MTV-1_Pixel4,1273529466442,37.423568,-122.094091,2020-05-14-US-MTV-1,Pixel4
4,2020-05-14-US-MTV-1_Pixel4,1273529467442,37.423571,-122.094106,2020-05-14-US-MTV-1,Pixel4
