In [3]:
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px

from shapely.geometry import Point
import osmnx as ox
import momepy
import geopandas as gpd
import folium

from pandarallel import pandarallel
pandarallel.initialize()

  shapely_geos_version, geos_capi_version_string


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [4]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance between two sets of coordinates.

    Args:
        lat1, lon1: latitude / longitude of the first point(s), decimal degrees.
        lat2, lon2: latitude / longitude of the second point(s), decimal degrees.
            Inputs may be scalars or array-likes and are combined element-wise.

    Returns:
        The distance(s) scaled by the sphere radius used below
        (6,367,000 -- presumably metres).
    """
    sphere_radius = 6_367_000
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = np.sin(half_dphi)**2 + \
        np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2
    return 2 * sphere_radius * np.arcsin(hav**0.5)

def visualize_trafic(df, center={"lat":37.6458, "lon":-122.4056}, zoom=9):
    """Display GPS fixes on an interactive map, coloured by phone.

    Args:
        df: frame with ``latDeg``, ``lngDeg`` and ``phone`` columns.
        center: dict with ``lat`` / ``lon`` keys giving the initial map centre.
            The default dict is shared between calls; it is only read here.
        zoom: initial zoom level passed to plotly.

    The figure is shown with ``fig.show()``; nothing is returned.
    """
    scatter_options = dict(
        # Columns plotly uses for the (x, y) position of each marker.
        lat="latDeg",
        lon="lngDeg",
        # Column plotly uses to assign one colour per series.
        color="phone",
        labels="phone",
        zoom=zoom,
        center=center,
        height=600,
        width=800,
    )
    fig = px.scatter_mapbox(df, **scatter_options)
    fig.update_layout(
        mapbox_style='stamen-terrain',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()

    
def visualize_collection(df, collection):
    """Plot the rows of a single collection, centred on their mean position.

    Args:
        df: frame with ``collectionName``, ``latDeg``, ``lngDeg`` (and the
            columns ``visualize_trafic`` needs).
        collection: value of ``collectionName`` to select.
    """
    in_collection = df['collectionName'] == collection
    subset = df[in_collection].copy()
    map_center = {
        "lat": subset['latDeg'].mean(),
        "lon": subset['lngDeg'].mean(),
    }
    visualize_trafic(subset, map_center)

def percentile50(x):
    """Return the 50th percentile of ``x`` as computed by ``np.percentile``."""
    return np.percentile(x, q=50)
def percentile95(x):
    """Return the 95th percentile of ``x`` as computed by ``np.percentile``."""
    return np.percentile(x, q=95)

def get_train_score(df, gt):
    """Score predicted positions against ground truth.

    Args:
        df: predictions with ``phone``, ``millisSinceGpsEpoch``, ``latDeg``
            and ``lngDeg`` columns.
        gt: ground truth with the same key columns plus ``latDeg`` / ``lngDeg``.

    Returns:
        The mean over phones of (50th percentile error + 95th percentile
        error) / 2, where the error is the haversine distance between each
        prediction and its ground-truth row.
    """
    truth = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    # Inner join: only epochs present in both frames contribute to the score.
    merged = df.merge(truth, on=['phone', 'millisSinceGpsEpoch'], how='inner')
    merged['err'] = calc_haversine(
        merged['latDeg_gt'], merged['lngDeg_gt'],
        merged['latDeg'], merged['lngDeg'],
    )
    # Group by phone and take the 50th / 95th percentile of the error between
    # ground truth and prediction.
    per_phone = merged.groupby('phone')['err'].agg([percentile50, percentile95])
    per_phone_score = (per_phone['percentile50'] + per_phone['percentile95']) / 2
    return per_phone_score.mean()

def get_all_train_score(df):
    """Score a frame that already carries ground-truth columns.

    Args:
        df: frame with ``phone``, ``latDeg``, ``lngDeg``, ``latDeg_gt`` and
            ``lngDeg_gt`` columns.

    Returns:
        The mean over phones of (50th percentile error + 95th percentile
        error) / 2.

    Note:
        An ``err`` column (haversine distance to ground truth) is written onto
        the frame that was passed in, so the caller's frame is modified.
    """
    df['err'] = calc_haversine(
        df['latDeg_gt'], df['lngDeg_gt'],
        df['latDeg'], df['lngDeg'],
    )
    # Group by phone and take the 50th / 95th percentile of the error between
    # ground truth and prediction.
    per_phone = df.groupby('phone')['err'].agg([percentile50, percentile95])
    per_phone_score = (per_phone['percentile50'] + per_phone['percentile95']) / 2
    return per_phone_score.mean()

In [19]:
data_dir = Path("../input/google-smartphone-decimeter-challenge")
# Alternative input (raw baseline instead of the filtered predictions):
# train_df = pd.read_csv(data_dir / "baseline_locations_train.csv")
train_df = pd.read_csv('../output/filtered_nb046.csv')

# `phone` has the form "<collectionName>_<phoneName>"; split it once and
# reuse the parts for both derived columns.
phone_parts = train_df['phone'].str.split('_')
train_df['collectionName'] = phone_parts.str[0]
train_df['phoneName'] = phone_parts.str[1]

# Load the ground truth of every (collection, phone) pair present in train_df.
# The frames are gathered in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies all previously loaded rows on every
# iteration. Only the group keys are needed, so the group frame is discarded.
ground_truth_frames = []
for (collection_name, phone_name), _ in tqdm(train_df.groupby(["collectionName", "phoneName"])):
    path = data_dir / f"train/{collection_name}/{phone_name}/ground_truth.csv"
    ground_truth_frames.append(pd.read_csv(path))
ground_truth = pd.concat(ground_truth_frames, ignore_index=True)

ground_truth['phone'] = ground_truth['collectionName'] + '_' + ground_truth['phoneName']

# Bucket the collection names by the area code embedded in the name.
collection_uniq = train_df['collectionName'].unique()
SJC = [i for i in collection_uniq if 'SJC' in i]
MTV = [i for i in collection_uniq if 'MTV' in i]
SVL = [i for i in collection_uniq if 'SVL' in i]
SF = [i for i in collection_uniq if 'SF' in i]
RWC = [i for i in collection_uniq if 'RWC' in i]


# Collection analysed in the rest of the notebook (second RWC collection).
target_collection = RWC[1]
target_train = train_df[train_df["collectionName"]==target_collection]
target_gt = ground_truth[ground_truth["collectionName"]==target_collection]

  0%|          | 0/73 [00:00<?, ?it/s]

In [20]:
# create grid point
# Build the grid: ground-truth positions densified by linear interpolation.
line_points = target_gt[['latDeg','lngDeg']].copy()

# A row is a "switch" when either coordinate differs from the next row.
# The comparison yields a boolean DataFrame, and indexing a DataFrame with a
# boolean DataFrame only masks values (every row label survives), so it is
# reduced to one boolean per row before it is used to select index labels.
switches = line_points.ne(line_points.shift(-1)).any(axis=1)
idx = switches[switches].index

# Insert `num_interpolate` empty rows after each switch row: the label
# idx + 0.5 sorts between a row and its successor. All placeholders are
# created in one go instead of concatenating and re-sorting once per pass.
num_interpolate = 5
placeholders = pd.DataFrame(index=(idx + 0.5).repeat(num_interpolate))
line_points = pd.concat([line_points, placeholders]).sort_index()
line_points = line_points.reset_index(drop=True)
# Fill the empty rows with evenly spaced points between their neighbours.
line_points = line_points.interpolate(method='linear')

In [21]:
def find_closest_point(point, points, max_thr=19, min_thr=16):
    """Find the candidate in ``points`` that is closest to ``point``.

    Args:
        point: row / mapping with scalar ``latDeg`` and ``lngDeg`` entries.
        points: DataFrame of candidate positions with ``latDeg`` and
            ``lngDeg`` columns.
        max_thr: largest accepted nearest distance (inclusive), in the units
            returned by ``calc_haversine``.
        min_thr: smallest accepted nearest distance (inclusive).

    Returns:
        The values of the nearest candidate row as a list (in the column order
        of ``points``), or ``None`` when the nearest distance lies outside
        ``[min_thr, max_thr]`` or no distance could be computed.
    """
    # The scalar coordinates of `point` broadcast against the candidate
    # columns, so no repeated copy of the point (and no index alignment
    # between two frames) is needed. The distances are computed only once.
    distances = calc_haversine(points['latDeg'], points['lngDeg'],
                               point['latDeg'], point['lngDeg'])
    nearest = distances.min()
    if min_thr <= nearest <= max_thr:
        # argmin() is positional, so it must be paired with iloc; loc would
        # only be correct when the index happens to be 0..n-1.
        return list(points.iloc[distances.argmin()])
    return None

def apply_grid_point(x, closest_point):
    """Replace a row's coordinates with its snapped grid point, if it has one.

    Args:
        x: one train row; ``x.name`` is matched against the index of
            ``closest_point``.
        closest_point: per-row lookup result, indexed like the train rows;
            each entry is either a ``[latDeg, lngDeg]`` pair or null.

    Returns:
        The same row object ``x``; ``latDeg`` / ``lngDeg`` are overwritten in
        place when a non-null entry is found, otherwise it is left untouched.
    """
    match = closest_point[closest_point.index==x.name]
    is_missing = match.isnull().values == True
    if not is_missing:
        snapped = match.values[0]
        x['latDeg'] = snapped[0]
        x['lngDeg'] = snapped[1]
    return x

In [22]:
# Snap the predictions of the target collection onto the interpolated grid.
# Step 1: for every row, look up the nearest grid point within 0..50 distance
# units (None when there is none). Step 2: overwrite the coordinates of the
# rows that received a grid point.
grid_points = line_points[['latDeg', 'lngDeg']]
train_tmp = target_train.copy()
closest_point = train_tmp[['latDeg','lngDeg']].parallel_apply(
    find_closest_point, points=grid_points, max_thr=50, min_thr=0, axis=1
)
train_tmp[['latDeg', 'lngDeg']] = train_tmp[['latDeg', 'lngDeg']].parallel_apply(
    apply_grid_point, closest_point=closest_point, axis=1
)

In [23]:
# Compare the score of the target collection before and after snapping.
for label, frame in (('before snap to grid:', target_train),
                     ('afeter snap to grid:', train_tmp)):
    print(label, get_train_score(frame, ground_truth))

before snap to grid: 2.034397909893134
afeter snap to grid: 1.5025287114699282
