In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


**Loading train, test & submission files**

In [None]:
# Read the baseline train/test location tables and the sample submission
# from the competition input directory, in that order.
trainfile, testfile, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/google-smartphone-decimeter-challenge/baseline_locations_train.csv",
        "../input/google-smartphone-decimeter-challenge/baseline_locations_test.csv",
        "../input/google-smartphone-decimeter-challenge/sample_submission.csv",
    )
)

In [None]:
# Show the raw baseline training table as the cell output.
trainfile

**Extracting ground truths and aligning with train inputs**

In [None]:
datapath = Path("../input/google-smartphone-decimeter-challenge")
# Materialise the matches so tqdm derives the real total from the list length
# instead of relying on a hard-coded file count.
truths = list((datapath / 'train').rglob('ground_truth.csv'))


cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch','phone','heightAboveWgs84EllipsoidM','latDeg',
       'lngDeg',]
# One frame per ground-truth file, restricted to the columns listed in `cols`.
truth_list = [pd.read_csv(filepath, usecols=cols) for filepath in tqdm(truths)]

truth_data = pd.concat(truth_list, ignore_index=True)

train = trainfile[cols]

# `suffixes` must be a length-2 sequence (left, right); the earlier 3-tuple is
# rejected by pandas as soon as a non-key column overlaps.
# NOTE(review): no `on=` is given, so the join keys are every column name shared
# by `train` and `truth_data.iloc[:, 3:]`. Which columns those are depends on
# the column order of ground_truth.csv -- confirm this yields the intended
# *_current / *_truth columns that the later cells reference.
train = train.merge(truth_data.iloc[:, 3:], suffixes=("_current", "_truth"))

In [None]:
# Show the merged training frame as the cell output.
train

In [None]:
# Print the column summary (dtypes, non-null counts) of the merged training frame.
train.info()

**Checking the correlation between current & truth coordinates**

In [None]:
# Pairwise correlation of every column from position 3 onwards.
columns_after_keys = train.iloc[:, 3:]
columns_after_keys.corr()

* In the correlation table above, we can see that ***latDeg_current is highly correlated with latDeg_truth*** and ***lngDeg_current is highly correlated with lngDeg_truth***.
* That would help us to build a baseline model

In [None]:
test = testfile.copy()

# Print, for each categorical column, how many distinct values train and test
# contain and what those values are.
separator = "----------------------------------------------"
for position, column in enumerate(("collectionName", "phoneName")):
    if position > 0:
        # Blank gap between the two report sections.
        print("\n")

    print("############### {} unique values ##############################".format(column))
    for label, frame in (("train", train), ("test", test)):
        print("{}: {}".format(label, frame[column].nunique()))
        print(frame[column].unique())
        print(separator)

**Note:** collectionName is not useful as a feature because its values differ between the train and test data, but **phoneName** might be helpful.

**One-hot encoding for 'phoneName' of train and test data**

In [None]:
# One indicator column per distinct phoneName value, built separately for
# the train and test frames.
train_phone = pd.get_dummies(train["phoneName"])
test_phone = pd.get_dummies(test["phoneName"])

print(f"train_phone shape:{train_phone.shape}")
print(f"test_phone shape:{test_phone.shape}")

**Aligning train & test columns**

In [None]:
# Give both indicator frames the union of their columns; a phone missing on
# one side becomes an all-zero column there.
train_phone1, test_phone1 = train_phone.align(test_phone, join="outer", axis=1, fill_value=0)

# Report the shapes of the *aligned* frames (the pre-alignment frames were
# being printed here, which could never show a change).
print("Updated train shape {}".format(train_phone1.shape))

print("Updated test shape {}".format(test_phone1.shape))

* Basically, there is no change in the one-hot encoded phoneName columns for train or test data

**Train-test data after addition of one-hot encoded columns**

In [None]:
# Training features: every column from position 3 onwards plus the aligned
# phone indicator columns.
train1 = pd.concat([train.iloc[:, 3:], train_phone1], axis=1)

# Test features: the two columns at positions 3-4 plus the aligned indicators.
# They are renamed to the training feature names so that an estimator fitted on
# the train frame sees identical feature names at predict time (recent
# scikit-learn releases reject mismatching names).
# NOTE(review): assumes positions 3-4 of `test` are latDeg/lngDeg; the rename is
# a no-op for any other column names -- confirm against the test CSV.
test_coordinates = test.iloc[:, 3:5].rename(
    columns={"latDeg": "latDeg_current", "lngDeg": "lngDeg_current"}
)
test1 = pd.concat([test_coordinates, test_phone1], axis=1)

In [None]:
# Shape of the assembled training frame, followed by its column index.
print(f"train1_shape: {train1.shape}")
train1.columns

In [None]:
# Shape of the assembled test frame, followed by its column index.
print(f"test1-shape: {test1.shape}")
test1.columns

**Plotting current vs truth coordinates**

In [None]:
# Only the first 200 rows are plotted to keep the figures readable.
first_rows = train1.iloc[:200]

# Figure 1: baseline ("current") and ground-truth positions on shared axes.
fig, ax = plt.subplots(figsize=[10, 5])
ax.plot(first_rows["latDeg_current"], first_rows["lngDeg_current"], "bo", label="current")
ax.plot(first_rows["latDeg_truth"], first_rows["lngDeg_truth"], "r*", label="truth")
ax.set_title("current vs truth", fontweight="bold")
ax.set_xlabel("latDeg")
ax.set_ylabel("lngDeg")
ax.legend()

# Figure 2: per-coordinate scatter of the baseline value against the truth.
fig, (ax_lat, ax_lng) = plt.subplots(1, 2, figsize=[15, 5])
ax_lat.plot(first_rows["latDeg_current"], first_rows["latDeg_truth"], "bo")
ax_lat.set_title("lat (current vs truth)", fontweight="bold")

ax_lng.plot(first_rows["lngDeg_current"], first_rows["lngDeg_truth"], "bo")
ax_lng.set_title("lng (current vs truth)", fontweight="bold")

**Input-output**

In [None]:
# for lat
columns1 = list(train1.columns[4:])
X1,y1 = train1.loc[:,[train1.columns[0]]+columns1],train1["latDeg_truth"].values
  
#for lng
columns2 = list(train1.columns[4:])
X2,y2 = train1.loc[:,[train1.columns[1]]+columns2],train1["lngDeg_truth"].values


Xt1 = test1.loc[:,[test1.columns[0]]+columns1] #for lat
Xt2 = test1.loc[:,[test1.columns[1]]+columns2] #for lng

print("X1 columns:", X1.columns.tolist())
print("Xt1 columns:", Xt1.columns.tolist())
print("\n")
print("X2 columns:", X2.columns.tolist())
print("Xt2 columns:", Xt2.columns.tolist())

In [None]:
# Both coordinate models are split with the same fraction and the same seed.
split_options = {"test_size": 0.3, "random_state": 10}

xtr1, xval1, ytr1, yval1 = train_test_split(X1, y1, **split_options)
xtr2, xval2, ytr2, yval2 = train_test_split(X2, y2, **split_options)

print(f"xtr1 shape:{xtr1.shape}; xval1 shape:{xval1.shape}")
print(f"xtr2 shape:{xtr2.shape}; xval2 shape:{xval2.shape}")

**Defining a function to get prediction and ground truth distance estimation (in meters)**

[Haversine formula for distance estimation using GPS co-ordinates](https://stackoverflow.com/questions/15736995/how-can-i-quickly-estimate-the-distance-between-two-latitude-longitude-points)

In [None]:
from math import radians, cos, sin, asin, sqrt
def lat_lon_dist(df):
    """
    Great-circle (haversine) distance in metres between truth and prediction.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``latDeg_truth``, ``lngDeg_truth``,
        ``latDeg_pred`` and ``lngDeg_pred`` in decimal degrees. Rows are read
        by position, so the frame may carry any index.

    Returns
    -------
    list of float
        One distance per row of ``df``, in row order; empty for an empty frame.
    """
    # Convert each coordinate column to radians in one vectorised step.
    lat_truth, lon_truth, lat_pred, lon_pred = (
        np.radians(df[column].to_numpy(dtype=float))
        for column in ("latDeg_truth", "lngDeg_truth", "latDeg_pred", "lngDeg_pred")
    )

    # haversine formula
    dlon = lon_pred - lon_truth
    dlat = lat_pred - lat_truth
    a = np.sin(dlat / 2) ** 2 + np.cos(lat_truth) * np.cos(lat_pred) * np.sin(dlon / 2) ** 2
    # Round-off can push `a` marginally outside [0, 1]; keep sqrt/arcsin in domain.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    # Radius of earth in kilometers is 6371; the factor 1000 converts to metres.
    return (6371 * c * 1000).tolist()

In [None]:
# Positional row numbers (within xtr1) of the samples recorded by each phone,
# i.e. where that phone's indicator column equals 1.
idx_Mi8 = np.flatnonzero((xtr1["Mi8"] == 1).to_numpy())
idx_Pixel4 = np.flatnonzero((xtr1["Pixel4"] == 1).to_numpy())
idx_Pixel4Modded = np.flatnonzero((xtr1["Pixel4Modded"] == 1).to_numpy())
idx_Pixel4XL = np.flatnonzero((xtr1["Pixel4XL"] == 1).to_numpy())
idx_Pixel4XLModded = np.flatnonzero((xtr1["Pixel4XLModded"] == 1).to_numpy())
idx_Pixel5 = np.flatnonzero((xtr1["Pixel5"] == 1).to_numpy())
idx_SamsungS20Ultra = np.flatnonzero((xtr1["SamsungS20Ultra"] == 1).to_numpy())

In [None]:
# Show the latitude training features as the cell output.
xtr1

**Model building**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

# Baseline: one linear regression per coordinate, fitted on the training split.
# The evaluation cells below read `pred_yval1` / `pred_yval2`, so these must be
# produced here for the notebook to run top to bottom.
lr1 = LinearRegression()  # selected as a starting point
model_lat = lr1.fit(xtr1, ytr1)
pred_yval1 = model_lat.predict(xval1)  # prediction for val data (lat)

lr2 = LinearRegression()
model_lng = lr2.fit(xtr2, ytr2)
pred_yval2 = model_lng.predict(xval2)  # prediction for val data (long)

# Experiment: Gaussian Process on the Mi8 rows only, using the first feature
# column as the single input and the matching latitude targets.
# NOTE(review): this model is fitted but not used by any later cell, and an
# exact GP fit grows quickly with the number of rows -- confirm it is still
# wanted before running on the full split.
gp1_Mi8 = GaussianProcessRegressor()
gp1_Mi8.fit(
    xtr1.to_numpy()[idx_Mi8, 0].reshape(-1, 1),
    ytr1[idx_Mi8].reshape(-1, 1),
)





**Model evaluation on validation data**

In [None]:
# Validation frame: the latitude input followed by all longitude-model inputs.
# `drop` is a boolean flag; True discards the shuffled split index so rows are
# renumbered 0..n-1.
val_df = pd.concat([xval1[["latDeg_current"]], xval2], axis=1).reset_index(drop=True)

**Adding truth & predicted lat-long values to val_df**

In [None]:
#truth
val_df["latDeg_truth"] = yval1
val_df["lngDeg_truth"] = yval2

#pred
val_df["latDeg_pred"] = pred_yval1
val_df["lngDeg_pred"] = pred_yval2

In [None]:
# Per-row distance (metres) between the truth and predicted coordinates.
val_df["dist"] = lat_lon_dist(val_df)

In [None]:
# Reversing one-hot encoding for phoneName: the label of the largest value in
# each row of the columns between the first two and the last five.
phone = val_df.iloc[:, 2:-5].idxmax(axis=1)

# Keep the first two and the last three columns, then attach the phone name.
leading_part = val_df.iloc[:, :2]
trailing_part = val_df.iloc[:, -3:]
val_df1 = pd.concat([leading_part, trailing_part], axis=1)
val_df1["phoneName"] = phone

# Move the newly added last column to the front.
ordered_columns = val_df1.columns.tolist()
ordered_columns.insert(0, ordered_columns.pop())
val_df1 = val_df1[ordered_columns]

**Box-plot analysis of dist for each phone**

In [None]:
import seaborn as sns
plt.figure(figsize=[15,7])

# ax, fig = plt.subplots(figsize=[15,7])
sns.boxplot(x="phoneName", y="dist",data=val_df1)
plt.ylabel("Dist (m)") # distance in meters
#plt.ylim([0,30]) # for better visualization

**Preparing evaluation score for each phone (50th & 95th percentile)**

In [None]:
# Phones in order of first appearance, and the distances belonging to each.
phone_names = val_df1.phoneName.unique().tolist()
dist_per_phone = [val_df1.loc[val_df1.phoneName == ph, "dist"] for ph in phone_names]

# Per-phone 50th and 95th percentile of the distance, and their mean.
val_df2 = pd.DataFrame()
val_df2["phoneName"] = phone_names
val_df2["dist_50"] = [np.percentile(distances, 50) for distances in dist_per_phone]
val_df2["dist_95"] = [np.percentile(distances, 95) for distances in dist_per_phone]
val_df2["avg_dist_50_95"] = val_df2.iloc[:, 1:].to_numpy().mean(axis=1)
print("Val evaluation details:\n", val_df2)

# Final score: the mean over phones of the per-phone averages.
rule = "------------------------------------------------------"
print("\n")
print(rule)
print(f"Final val evaluation score: {val_df2.iloc[:, -1].mean()}")
print(rule)

**Training the model on the complete data and predicting for the test data**

In [None]:
# LinearRegression is not imported anywhere else in the notebook, so bring it
# into scope here to keep this cell runnable on a fresh kernel.
from sklearn.linear_model import LinearRegression

# Refit one linear model per coordinate on all training rows, then predict
# the corresponding coordinate for the test inputs.
lr1 = LinearRegression()
model_lat = lr1.fit(X1, y1)
pred_yt1 = model_lat.predict(Xt1)  # prediction for test data (lat)

lr2 = LinearRegression()
model_lng = lr2.fit(X2, y2)
pred_yt2 = model_lng.predict(Xt2)  # prediction for test data (long)

In [None]:
# Build the submission as a new frame: the two identifier columns from the test
# data plus the predicted coordinates. `assign` returns a fresh DataFrame, so
# nothing is written onto a slice of `test` and no pandas warning needs to be
# silenced globally.
submission = test[['phone', 'millisSinceGpsEpoch']].assign(
    latDeg=pred_yt1.tolist(),
    lngDeg=pred_yt2.tolist(),
)

In [None]:
# Write the submission frame to submission.csv in the working directory,
# leaving out the index column.
submission.to_csv("./submission.csv",index=False)