# What is this?

This notebook is a short script that implements this competition's evaluation function.
If you have ideas to make this function better - **awesome**! 

If you found this helpful please consider upvoting this notebook :)


In [None]:
# (x1, y1), (x2, y2) --> distance in km 
!pip install vincenty

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
import lightgbm as lgbm
from vincenty import vincenty

import os

In [None]:
"""
Submissions are scored on the mean of the 50th and 95th percentile distance errors.
For every phone and at every millisSinceGpsEpoch,
the horizontal distance (in meters) is computed between the predicted lat/lng and the ground truth lat/lng.
These distance errors form a distribution from which the 50th and 95th percentile errors are calculated 
(i.e. the 95th percentile error is the value, in meters, for which 95% of the distance errors are smaller).
The 50th and 95th percentile errors are then averaged for each phone.
Lastly, the mean of these averaged values is calculated across all phones in the test set.
"""
# Column labels shared by the metric helpers below: `evaluation` builds a frame
# with these columns and `apply_vincenty` selects them from each row.
# Each list is ordered (latitude, longitude), both in degrees.
pred_cols = ["predsLatDeg", "predsLngDeg"]  # predicted position
real_cols = ["realLatDeg", "realLngDeg"]  # ground-truth position

def apply_vincenty(row: pd.Series):
    """Return the Vincenty distance, in km, between one row's real and predicted points.

    Meant to be used with ``DataFrame.apply(..., axis=1)``, so ``row`` is a
    single row (a Series), not a DataFrame.

    Parameters
    ----------
    row : pd.Series
        Must contain the labels listed in ``real_cols`` and ``pred_cols``
        (latitude first, longitude second, in degrees).

    Returns
    -------
    float or None
        Distance in kilometres as returned by ``vincenty``. The library
        returns ``None`` when its iteration fails to converge.
    """
    # vincenty documents its inputs as (lat, lng) tuples, so hand it plain
    # float tuples instead of numpy arrays sliced out of the row.
    real_point = tuple(float(value) for value in row[real_cols])
    pred_point = tuple(float(value) for value in row[pred_cols])
    return vincenty(real_point, pred_point)

def distance(group: pd.DataFrame):
    """Score one group of rows: the mean of the 50th and 95th percentile error.

    Each row's error is the value returned by ``apply_vincenty`` (km),
    converted to meters.
    """
    km_to_m = 1000
    errors_m = km_to_m * group.apply(apply_vincenty, axis=1)

    # One call yields both percentiles, in the order requested.
    median_err, tail_err = np.percentile(errors_m, [50, 95])
    return (median_err + tail_err) / 2

    
def evaluation(phones: pd.Series, predsLat: pd.Series, predsLan: pd.Series, realLat: pd.Series, realLan: pd.Series):
    """Compute the competition score for a set of predictions.

    For every distinct value in ``phones`` the rows are scored with
    ``distance`` (mean of the 50th and 95th percentile horizontal error, in
    meters); the result is the mean of those per-phone scores.

    Parameters
    ----------
    phones : pd.Series
        Grouping key for each row; one score is computed per distinct value.
    predsLat, predsLan : pd.Series
        Predicted latitude / longitude in degrees.
    realLat, realLan : pd.Series
        Ground-truth latitude / longitude in degrees.

    Returns
    -------
    float
        Mean over phones of ``(p50 + p95) / 2``.

    Notes
    -----
    The inputs are combined into one DataFrame, so Series inputs are aligned
    on their index.
    """
    coord_cols = ["realLatDeg", "realLngDeg", "predsLatDeg", "predsLngDeg"]
    all_df = pd.DataFrame({
        "realLatDeg": realLat,
        "realLngDeg": realLan,
        "predsLatDeg": predsLat,
        "predsLngDeg": predsLan,
        "phone": phones,
    })

    # Select the coordinate columns explicitly so the grouping column is not
    # passed into `distance` (newer pandas deprecates that) and every row
    # handed to the distance function is purely numeric.
    errors_per_phone = all_df.groupby("phone")[coord_cols].apply(distance)
    return errors_per_phone.mean()
    
    

    

In [None]:
# Load the competition CSVs from the Kaggle input directory.
# `train`: judging by the file name, the baseline location estimates for the
# training collections; it is later merged with the ground truth on
# ("phone", "millisSinceGpsEpoch").
train = pd.read_csv("../input/google-smartphone-decimeter-challenge/baseline_locations_train.csv")
# `bl_test` and `sample_submission` are loaded here but not used in the cells shown in this notebook section.
bl_test =  pd.read_csv("../input/google-smartphone-decimeter-challenge/baseline_locations_test.csv")
sample_submission = pd.read_csv("../input/google-smartphone-decimeter-challenge/sample_submission.csv")

In [None]:
# Build ground_truths_df from every train/<collection>/<phone>/ground_truth.csv:
#   columns: phone (collectionName + "_" + phoneName), latDeg, lngDeg, millisSinceGpsEpoch
# then merge it into `train` on (phone, millisSinceGpsEpoch). After the merge the
# ground-truth coordinates carry the "_y" suffix (latDeg_y, lngDeg_y) while the
# original `train` coordinates keep their names.
train_root = "../input/google-smartphone-decimeter-challenge/train"

# Collect the per-phone frames in a list and concatenate once: DataFrame.append
# was removed in pandas 2.0 and re-copies the whole frame on every iteration.
ground_truth_frames = []
for collection in sorted(os.listdir(train_root)):
    collection_dir = os.path.join(train_root, collection)
    if not os.path.isdir(collection_dir):
        continue
    for phone in sorted(os.listdir(collection_dir)):
        phone_dir = os.path.join(collection_dir, phone)
        if not os.path.isdir(phone_dir):
            continue
        temp_df = pd.read_csv(os.path.join(phone_dir, "ground_truth.csv"))
        temp_df["phone"] = temp_df["collectionName"] + "_" + temp_df["phoneName"]
        ground_truth_frames.append(
            temp_df[["phone", "latDeg", "lngDeg", "millisSinceGpsEpoch"]]
        )

if not ground_truth_frames:
    raise FileNotFoundError(
        "No ground_truth.csv files found under {}".format(train_root)
    )
ground_truths_df = pd.concat(ground_truth_frames, ignore_index=True)

# Guard the merge so re-running this cell does not try to add the "_y"
# columns a second time.
if "latDeg_y" not in train.columns:
    train = train.merge(
        ground_truths_df,
        on=["phone", "millisSinceGpsEpoch"],
        suffixes=("", "_y"),
    )

In [None]:
# Sanity check: score the baseline latDeg/lngDeg in `train` against the merged
# ground truth (the "_y" columns).
# Rows are grouped by the unique `phone` id (collectionName_phoneName) rather
# than by `phoneName`, which would pool the same device model across collections.
# The original note expected a value of ~7 when grouping by `phoneName`; the
# per-`phone` score may differ, so re-run to confirm.
evaluation(
    phones=train["phone"],
    predsLat=train["latDeg"],
    predsLan=train["lngDeg"],
    realLat=train["latDeg_y"],
    realLan=train["lngDeg_y"],
)
