In [1]:
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px

import gc
import optuna

from shapely.geometry import Point
import osmnx as ox
import momepy
import geopandas as gpd
import folium

from pandarallel import pandarallel
pandarallel.initialize()

  shapely_geos_version, geos_capi_version_string


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

def percentile50(x):
    return np.percentile(x, 50)
def percentile95(x):
    return np.percentile(x, 95)

def get_train_score(df, gt):
    gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    # df = df.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
    df = df.merge(gt, on=['phone', 'millisSinceGpsEpoch'], how='inner')
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # calc_evaluate_score
    # df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95]) # phoneによってgroupbyし、gtと予測値の差(err)の50%,95%値を求める
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2 
    score = res['p50_p90_mean'].mean()
    return score

def get_all_train_score(df):
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # calc_evaluate_score
    # df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95]) # phoneによってgroupbyし、gtと予測値の差(err)の50%,95%値を求める
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2 
    score = res['p50_p90_mean'].mean()
    return score

In [3]:
def find_closest_point(point, points, max_thr=19, min_thr=16):
    """ Find closest point from a list of points. """
    df_ = pd.DataFrame({'latDeg':point['latDeg'].repeat(len(points)), 
                        'lngDeg':point['lngDeg'].repeat(len(points))},
                        columns=['latDeg', 'lngDeg'])
    # return minimum distance points
    distance = calc_haversine(points['latDeg'], points['lngDeg'],
                                 df_['latDeg'], df_['lngDeg']).min()
    if min_thr <= distance <= max_thr:
        return points.loc[calc_haversine(points['latDeg'], points['lngDeg'],
                            df_['latDeg'], df_['lngDeg']).argmin()]


def apply_grid_point(x, closest_point):
    '''
    input: 
        x: train row
        closest_point: closest point or None
    '''
    idx = x.name
    closest_point1 = closest_point[closest_point.index==idx]
    if closest_point1.isnull().values == True:
        pass
    else:
        x['latDeg'] = closest_point1.values[0]['latDeg']
        x['lngDeg'] = closest_point1.values[0]['lngDeg']
    return x

In [4]:
data_dir = Path("../input/google-smartphone-decimeter-challenge")
# train_df = pd.read_csv(data_dir / "baseline_locations_train.csv")
train_df = pd.read_csv('../output/filtered_nb046.csv')
train_df['collectionName'] = train_df['phone'].apply(lambda x: x.split('_')[0])
train_df['phoneName'] = train_df['phone'].apply(lambda x: x.split('_')[1])

# get all ground truth dataframe
gt_df = pd.DataFrame()
for (collection_name, phone_name), df in tqdm(train_df.groupby(["collectionName", "phoneName"])):
    path = data_dir / f"train/{collection_name}/{phone_name}/ground_truth.csv"
    df = pd.read_csv(path)  
    gt_df = pd.concat([gt_df, df]).reset_index(drop=True)   

  0%|          | 0/73 [00:00<?, ?it/s]

# preprocessing

In [6]:
collection_uniq = train_df['collectionName'].unique()
MTV = [i for i in collection_uniq if 'MTV' in i]
SF = [i for i in collection_uniq if 'SF' in i]
RWC = [i for i in collection_uniq if 'RWC' in i]
SVL = [i for i in collection_uniq if 'SVL' in i]
SJC = [i for i in collection_uniq if 'SJC' in i]
collections = [SJC]

In [7]:
target_collection = SJC[1]
target_gt_df = gt_df[gt_df["collectionName"]==target_collection].reset_index(drop=True)
# for scoring
target_gt_df['phone'] = target_gt_df['collectionName'] + '_' + target_gt_df['phoneName']

train = train_df[train_df['collectionName']==target_collection]

# change pd.DataFrame -> gpd.GeoDataFrame
target_gt_df["geometry"] = [Point(p) for p in target_gt_df[["lngDeg", "latDeg"]].to_numpy()]
target_gt_gdf = gpd.GeoDataFrame(target_gt_df, geometry=target_gt_df["geometry"])

# get road data from open street map by osmnx
offset = 0.1**5
bbox = target_gt_gdf.bounds + [-offset, -offset, offset, offset]
east = bbox["minx"].min()
west = bbox["maxx"].max()
south = bbox["miny"].min()
north = bbox["maxy"].max()
G = ox.graph.graph_from_bbox(north, south, east, west, network_type='drive')


nodes, edges = momepy.nx_to_gdf(G)


edges = edges.dropna(subset=["geometry"]).reset_index(drop=True)
hits = bbox.apply(lambda row: list(edges.sindex.intersection(row)), axis=1)
tmp = pd.DataFrame({
    # index of points table
    "pt_idx": np.repeat(hits.index, hits.apply(len)),
    # ordinal position of line - access via iloc later
    "line_i": np.concatenate(hits.values)
})
# Join back to the lines on line_i; we use reset_index() to 
# give us the ordinal position of each line
tmp = tmp.join(edges.reset_index(drop=True), on="line_i")
# Join back to the original points to get their geometry
# rename the point geometry as "point"
tmp = tmp.join(target_gt_gdf.geometry.rename("point"), on="pt_idx")
# Convert back to a GeoDataFrame, so we can do spatial ops
tmp = gpd.GeoDataFrame(tmp, geometry="geometry", crs=target_gt_gdf.crs)


tmp["snap_dist"] = tmp.geometry.distance(gpd.GeoSeries(tmp.point))

# Discard any lines that are greater than tolerance from points
tolerance = 0.0005  
tmp = tmp.loc[tmp.snap_dist <= tolerance]
# Sort on ascending snap distance, so that closest goes to top
tmp = tmp.sort_values(by=["snap_dist"])

# group by the index of the points and take the first, which is the
# closest line 
closest = tmp.groupby("pt_idx").first()
# construct a GeoDataFrame of the closest lines
closest = gpd.GeoDataFrame(closest, geometry="geometry")
closest = closest.drop_duplicates("line_i").reset_index(drop=True)


line_points_list = []
split = 50  # param: number of split in each LineString
for dist in range(0, split, 1):
    dist = dist/split
    line_points = closest["geometry"].interpolate(dist, normalized=True)
    line_points_list.append(line_points)
line_points = pd.concat(line_points_list).reset_index(drop=True)
line_points = line_points.reset_index().rename(columns={0:"geometry"})
line_points["lngDeg"] = line_points["geometry"].x
line_points["latDeg"] = line_points["geometry"].y


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None



In [8]:
train = train_df.copy()
train = train[train['collectionName']==target_collection]
train1 = train.copy()
# parallel applyをすると出力が変わり、エラーが出る
closest_point = train1[['latDeg','lngDeg']].apply(lambda x: find_closest_point(x, line_points[['latDeg', 'lngDeg']], max_thr=19, min_thr=16), axis=1)
train1[['latDeg', 'lngDeg']] = train1[['latDeg', 'lngDeg']].parallel_apply(apply_grid_point, closest_point=closest_point, axis=1)

In [9]:
print(get_train_score(train_df, target_gt_df))
print(get_train_score(train1, target_gt_df))

14.898988862403762
14.701296923190238


In [85]:
for c in collection:
    target_collection = c
    train = train_df.copy()
    train = train[train['collectionName']==c]
    train1 = train.copy()

    target_gt_df = gt_df[gt_df["collectionName"]==target_collection].reset_index(drop=True)
    # for scoring
    target_gt_df['phone'] = target_gt_df['collectionName'] + '_' + target_gt_df['phoneName']

    train = train_df[train_df['collectionName']==target_collection]

    # change pd.DataFrame -> gpd.GeoDataFrame
    target_gt_df["geometry"] = [Point(p) for p in target_gt_df[["lngDeg", "latDeg"]].to_numpy()]
    target_gt_gdf = gpd.GeoDataFrame(target_gt_df, geometry=target_gt_df["geometry"])

    # get road data from open street map by osmnx
    offset = 0.1**5
    bbox = target_gt_gdf.bounds + [-offset, -offset, offset, offset]
    east = bbox["minx"].min()
    west = bbox["maxx"].max()
    south = bbox["miny"].min()
    north = bbox["maxy"].max()
    G = ox.graph.graph_from_bbox(north, south, east, west, network_type='drive')


    nodes, edges = momepy.nx_to_gdf(G)


    edges = edges.dropna(subset=["geometry"]).reset_index(drop=True)
    hits = bbox.apply(lambda row: list(edges.sindex.intersection(row)), axis=1)
    tmp = pd.DataFrame({
        # index of points table
        "pt_idx": np.repeat(hits.index, hits.apply(len)),
        # ordinal position of line - access via iloc later
        "line_i": np.concatenate(hits.values)
    })
    # Join back to the lines on line_i; we use reset_index() to 
    # give us the ordinal position of each line
    tmp = tmp.join(edges.reset_index(drop=True), on="line_i")
    # Join back to the original points to get their geometry
    # rename the point geometry as "point"
    tmp = tmp.join(target_gt_gdf.geometry.rename("point"), on="pt_idx")
    # Convert back to a GeoDataFrame, so we can do spatial ops
    tmp = gpd.GeoDataFrame(tmp, geometry="geometry", crs=target_gt_gdf.crs)


    tmp["snap_dist"] = tmp.geometry.distance(gpd.GeoSeries(tmp.point))

    # Discard any lines that are greater than tolerance from points
    tolerance = 0.0005  
    tmp = tmp.loc[tmp.snap_dist <= tolerance]
    # Sort on ascending snap distance, so that closest goes to top
    tmp = tmp.sort_values(by=["snap_dist"])

    # group by the index of the points and take the first, which is the
    # closest line 
    closest = tmp.groupby("pt_idx").first()
    # construct a GeoDataFrame of the closest lines
    closest = gpd.GeoDataFrame(closest, geometry="geometry")
    closest = closest.drop_duplicates("line_i").reset_index(drop=True)


    line_points_list = []
    split = 50  # param: number of split in each LineString
    for dist in range(0, split, 1):
        dist = dist/split
        line_points = closest["geometry"].interpolate(dist, normalized=True)
        line_points_list.append(line_points)
    line_points = pd.concat(line_points_list).reset_index(drop=True)
    line_points = line_points.reset_index().rename(columns={0:"geometry"})
    line_points["lngDeg"] = line_points["geometry"].x
    line_points["latDeg"] = line_points["geometry"].y


    def objective_snap_to_grid(trial):
        X = trial.suggest_int('X', 15, 45)
        closest_point = train1[['latDeg','lngDeg']].apply(lambda x: find_closest_point(x, line_points[['latDeg', 'lngDeg']], thr=X), axis=1)
        train1[['latDeg', 'lngDeg']] = train1[['latDeg', 'lngDeg']].parallel_apply(apply_grid_point, closest_point=closest_point, axis=1)
        score = get_train_score(train1, target_gt_df)
        return score

    print('original score:', get_train_score(train1, target_gt_df))
    study = optuna.create_study()
    study.optimize(objective_snap_to_grid, n_trials=30)
    print(c, study.best_params, study.best_value)

    del train
    del train1
    gc.collect()


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None

[32m[I 2021-07-25 20:28:55,064][0m A new study created in memory with name: no-name-399e864c-e596-438c-97a0-8fa09d84fbac[0m


original score: 15.734804800549322


[32m[I 2021-07-25 20:30:06,015][0m Trial 0 finished with value: 14.252638818159372 and parameters: {'X': 27}. Best is trial 0 with value: 14.252638818159372.[0m
[32m[I 2021-07-25 20:31:14,500][0m Trial 1 finished with value: 15.31097822296365 and parameters: {'X': 37}. Best is trial 0 with value: 14.252638818159372.[0m


2021-04-22-US-SJC-1 {'X': 27} 14.252638818159372



Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None

[32m[I 2021-07-25 20:31:19,764][0m A new study created in memory with name: no-name-118798bd-2bc2-4981-b2fd-709f22105ae7[0m


original score: 14.898988862403762


[32m[I 2021-07-25 20:32:04,655][0m Trial 0 finished with value: 16.33398552397798 and parameters: {'X': 37}. Best is trial 0 with value: 16.33398552397798.[0m
[32m[I 2021-07-25 20:32:54,064][0m Trial 1 finished with value: 16.33398552397798 and parameters: {'X': 26}. Best is trial 0 with value: 16.33398552397798.[0m


2021-04-28-US-SJC-1 {'X': 37} 16.33398552397798



Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None

[32m[I 2021-07-25 20:33:00,104][0m A new study created in memory with name: no-name-8a5ca312-d5da-48de-be0b-e4b614622946[0m


original score: 12.501141483952647


[32m[I 2021-07-25 20:33:50,407][0m Trial 0 finished with value: 14.585359649432977 and parameters: {'X': 33}. Best is trial 0 with value: 14.585359649432977.[0m
[32m[I 2021-07-25 20:34:37,881][0m Trial 1 finished with value: 14.692779746734674 and parameters: {'X': 34}. Best is trial 0 with value: 14.585359649432977.[0m


2021-04-29-US-SJC-2 {'X': 33} 14.585359649432977
