# Visualization of final submissions

In this notebook I visualize our final submissions (Private: 2.872, Public: 3.487).



In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import copy
import plotly.express as px
import plotly.graph_objects as go
import pyproj
import json
import bisect
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from matplotlib.colors import ListedColormap, BoundaryNorm
import pickle
import random
from tqdm.notebook import tqdm
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
import warnings
# Suppress every Python warning for the rest of the session.
warnings.simplefilter('ignore')
# When a DataFrame is displayed: cap the row count at 30, but never hide columns.
pd.set_option('display.max_rows',30)
pd.set_option('display.max_columns',None)

In [None]:
def visualize_trafic(df, center={"lat":37.423576, "lon":-122.094132}, zoom=9):
    """Draw GPS points on an interactive map, one colour per phone.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``latDeg``, ``lngDeg`` and ``phoneName``.
    center : dict
        Initial map centre as ``{"lat": ..., "lon": ...}``. The default dict
        is only read here, never mutated.
    zoom : int
        Initial zoom level of the map.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    fig = px.scatter_mapbox(
        df,
        # Point coordinates.
        lat="latDeg",
        lon="lngDeg",
        # One colour (and legend entry) per distinct phone name.
        color="phoneName",
        # `labels` must be a dict mapping column name -> displayed label;
        # the previous bare string was not a valid value for this argument.
        labels={
            "phoneName": "Phone",
            "latDeg": "Latitude (deg)",
            "lngDeg": "Longitude (deg)",
        },
        zoom=zoom,
        center=center,
        height=500,
        width=750,
    )
    # NOTE(review): the Stamen-hosted tile styles may no longer be served
    # without a provider key in recent Plotly versions — confirm the map
    # still renders, otherwise switch to e.g. "open-street-map".
    fig.update_layout(
        mapbox_style='stamen-terrain',
        # Keep a non-zero top margin: the title is drawn inside it, so a top
        # margin of 0 leaves the title no room.
        margin={"r": 0, "t": 40, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()
    
def visualize_collection(df, collection, center={"lat":37.423576, "lon":-122.094132}, zoom=9):
    """Plot the GPS points of a single collection.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``collectionName`` column plus whatever columns
        ``visualize_trafic`` needs.
    collection : str
        Value of ``collectionName`` to keep.
    center : dict
        Map centre as ``{"lat": ..., "lon": ...}``; forwarded unchanged.
        (Previously this argument was overwritten by a hard-coded value and
        therefore ignored.)
    zoom : int
        Initial zoom level; forwarded to ``visualize_trafic``.
    """
    df_traj = df[df['collectionName'] == collection]
    visualize_trafic(df_traj, center, zoom)

In [None]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance between two sets of points.

    All four inputs are scalars or array-likes in decimal degrees; they are
    combined element-wise. The result is expressed in the unit of the radius
    constant used below (6,367,000, i.e. an Earth radius in metres).
    """
    earth_radius = 6_367_000

    # Work in radians from here on.
    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine of the central angle between the two points.
    hav = np.sin(delta_phi/2)**2 + \
        np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2

    return 2 * earth_radius * np.arcsin(hav**0.5)

In [None]:
def percentile50(x):
    """Return the 50th percentile (median) of ``x`` via ``np.percentile``."""
    median = np.percentile(x, q=50)
    return median
def percentile95(x):
    """Return the 95th percentile of ``x`` via ``np.percentile``."""
    upper = np.percentile(x, q=95)
    return upper

def get_train_score(df, gt):
    """Score predictions against ground truth.

    Predictions and ground truth are inner-joined on
    (``collectionName``, ``phoneName``, ``millisSinceGpsEpoch``); rows without
    an exact match on all three keys are dropped. For each phone (collection
    name and phone name joined by ``_``) the 50th and 95th percentile of the
    haversine error are averaged, and the mean of those per-phone values is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictions with the three key columns plus ``latDeg`` / ``lngDeg``.
    gt : pandas.DataFrame
        Ground truth with the same columns.

    Returns
    -------
    float
        Mean over phones of ``(p50 + p95) / 2`` of the distance error.
    """
    join_keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']

    # Rename the ground-truth coordinates so they survive the merge next to
    # the predicted ones. Both inputs are rebound, not modified in place.
    gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    df = df.merge(gt, on=join_keys, how='inner')

    # Per-row distance between ground truth and prediction.
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])

    # Per-phone percentiles; the result columns are named after the
    # aggregation functions (``percentile50`` / ``percentile95``).
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95])

    # This is the mean of p50 and p95 (the column used to be mislabelled p90).
    res['p50_p95_mean'] = (res['percentile50'] + res['percentile95']) / 2
    score = res['p50_p95_mean'].mean()
    return score

In [None]:
def _mean_haversine(df_pred, df_gt):
    """Mean distance between each ground-truth row and its time-nearest prediction.

    Every ground-truth row is paired, within the same collection and phone,
    with the prediction whose ``millisSinceGpsEpoch`` is closest, provided the
    gap is at most 100000 (100 s if the column is in milliseconds, as its name
    suggests). Ground-truth rows with no prediction inside that tolerance get
    NaN coordinates, which makes the returned mean NaN.
    """
    compared_cols = ["latDeg_truth","lngDeg_truth","latDeg_pred","lngDeg_pred"]
    # merge_asof requires both frames to be sorted by the `on` key.
    df_merged = pd.merge_asof(
        df_gt.sort_values('millisSinceGpsEpoch'),
        df_pred.sort_values('millisSinceGpsEpoch'),
        on="millisSinceGpsEpoch", by=["collectionName", "phoneName"],
        direction='nearest', tolerance=100000, suffixes=('_truth', '_pred'))
    return calc_haversine(*df_merged[compared_cols].to_numpy().transpose()).mean()


def eval_all(df_pred, df_gt):
    """Evaluate predictions per collection and over the whole data set.

    Parameters
    ----------
    df_pred : pandas.DataFrame
        Predictions with ``collectionName``, ``phoneName``,
        ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg``.
    df_gt : pandas.DataFrame
        Ground truth with the same columns.

    Returns
    -------
    pandas.DataFrame
        One row per collection found in ``df_gt`` (sorted by name) followed by
        a final ``'all'`` row, with the columns ``collection``, ``haversine``
        (mean nearest-in-time distance) and ``score`` (``get_train_score``).
    """
    scores = []
    for collection in sorted(df_gt['collectionName'].unique()):
        df_pred_col = df_pred[df_pred['collectionName'] == collection]
        df_gt_col = df_gt[df_gt['collectionName'] == collection]
        scores.append([
            collection,
            _mean_haversine(df_pred_col, df_gt_col),
            get_train_score(df_pred_col, df_gt_col),
        ])

    # Same two metrics over all collections at once.
    scores.append(['all', _mean_haversine(df_pred, df_gt), get_train_score(df_pred, df_gt)])

    df_scores = pd.DataFrame(scores, columns=['collection', 'haversine', 'score'])
    return df_scores

In [None]:
# Root directory of the competition data set.
datapath = Path("../input/google-smartphone-decimeter-challenge/")
# Every ground_truth.csv found anywhere below train/. The paths are sorted
# because rglob does not guarantee an order, and the row order of df_gt
# should be the same on every run.
ground_truths = sorted((datapath / "train").rglob("ground_truth.csv"))
# Stack all ground-truth files into a single frame with a fresh 0..n-1 index.
df_gt = pd.concat([pd.read_csv(filepath) for filepath in ground_truths], ignore_index=True)

In [None]:
# Predictions on the training set.
df_train = pd.read_csv('../input/210804-sub-c/train_submission.csv')

# Predictions submitted for the test set.
df_sub = pd.read_csv('../input/210804-sub-c/submission.csv')
# Split `phone` at its first underscore into collection name and phone name.
# The vectorised .str.split avoids building one pd.Series per row; n=1 keeps
# any further underscore inside the phone name (assumes collection names
# contain none — TODO confirm).
tmp = df_sub['phone'].str.split('_', n=1, expand=True)
df_sub['collectionName'] = tmp[0]
df_sub['phoneName'] = tmp[1]

# Train

In [None]:
# Per-collection and overall metrics of the train predictions; the returned
# frame is the cell's displayed output.
eval_all(df_train, df_gt)

In [None]:
# Map of all points in the train predictions, coloured by phone.
visualize_trafic(df_train)

# Submission

In [None]:
# Map of all points in the submission, coloured by phone.
visualize_trafic(df_sub)