In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

Read datasets

In [79]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import gmplot
from IPython.display import IFrame
import tensorflow as tf
import datetime
from sklearn.model_selection import train_test_split
from tensorflow import keras
import seaborn as sns

%matplotlib notebook

In [80]:
df = pd.read_csv('google-smartphone-decimeter-challenge/train/2020-05-14-US-MTV-1/Pixel4/Pixel4_derived.csv', nrows=1)
derived = df.drop(0)
df = pd.read_csv('google-smartphone-decimeter-challenge/train/2020-05-14-US-MTV-1/Pixel4/ground_truth.csv', nrows=1)
ground = df.drop(0)

for dirname, _, filenames in os.walk('google-smartphone-decimeter-challenge/train'):
    for filename in filenames:
        if 'derived.csv' in filename:
            r = os.path.join(dirname, filename)
            derived = derived.append(pd.read_csv(os.path.join(dirname, filename)))
        elif 'ground_truth.csv' in filename:
            ground = ground.append(pd.read_csv(os.path.join(dirname, filename)))

bsln_trn = pd.read_csv('google-smartphone-decimeter-challenge/baseline_locations_train.csv')
bsln_tst = pd.read_csv('google-smartphone-decimeter-challenge/baseline_locations_test.csv')
target = ground.merge(bsln_trn, how='inner', on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], suffixes=('_grnd', '_bsln'))

Submissions are scored on the mean of the 50th and 95th percentile distance errors. For every phone and at every millisSinceGpsEpoch,<br> the horizontal distance (in meters) is computed between the predicted lat/lng and the ground truth lat/lng.<br> These distance errors form a distribution from which the 50th and 95th percentile errors are calculated (i.e. the 95th percentile error is the value, in meters, for which 95% of the distance errors are smaller).<br> The 50th and 95th percentile errors are then averaged for each phone.<br> Lastly, the mean of these averaged values is calculated across all phones in the test set.<br><br>

https://en.wikipedia.org/wiki/Haversine_formula

The harversine function determines the "Great circle" distance between 2 latlon datapoints.<br>
We are using this measurement to accurratly determine as needed the `horizontal distance` between two points on earth.



In [81]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

In [82]:
def haversine_50thP_95thP_mean(lat1, lon1, lat2, lon2):
    """Using calc_haversine formula to calculate the mean of  the mean of the 50th and 95th percentile distance errors
    The Competition evaluation metric
    """
    haversine = calc_haversine(lat1, lon1, lat2, lon2)
    return (np.percentile(haversine, 95) + np.median(haversine)) / 2

In [83]:
haversine_50thP_95thP_mean(target['latDeg_grnd'], target['lngDeg_grnd'], target['latDeg_bsln'], target['lngDeg_bsln'])


6.265253011167563

As implied in the compition name, we would like to reduce this error to at least under 1m

### Baseline vs ground truth spatial difference

In [84]:
target['bsln_grnd_distance'] = calc_haversine(target['latDeg_grnd'], target['lngDeg_grnd'], target['latDeg_bsln'], target['lngDeg_bsln'])
plt.figure()
plt.plot(range(target.shape[0]), target['bsln_grnd_distance'])

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x2b35789e1c0>]

In [85]:
target['bsln_grnd_distance'].describe()

count    131342.000000
mean          3.846848
std          30.739767
min           0.001338
25%           1.210976
50%           2.065769
75%           3.560001
max        8340.257976
Name: bsln_grnd_distance, dtype: float64

As we can see the error is quite large using simply the baseline data (Weighted Least Squars on derived data).<br>
Further more we have some very strange outlier measurements that with very large errors that are unacceptable at any circumstances and do not even imply on simple GPS accurracy.<br> These outlier samples can range all the way from 40m error to 8km error.<br> In the future we would use outlier detection algorithm to detect these samples and then clean them.

In [86]:
colors_lat = (target['latDeg_grnd'] == target['latDeg_bsln']).apply(lambda x: 'g' if x else 'r')
size_lat = (target['latDeg_grnd'] == target['latDeg_bsln']).apply(lambda x: 30 if x else 1)
colors_lng = (target['lngDeg_grnd'] == target['lngDeg_bsln']).apply(lambda x: 'g' if x else 'r')
size_lng = (target['lngDeg_grnd'] == target['lngDeg_bsln']).apply(lambda x: 30 if x else 1)

In [87]:
plt.figure(figsize=(12,5))
plt.subplot(121); plt.title('Baseline vs. ground truth latDeg'); 
plt.scatter(target['latDeg_grnd'], target['latDeg_bsln'], color=colors_lat, s=size_lat)
plt.xlabel('ground truth latDeg'); plt.ylabel('baseline latDeg')
plt.subplot(122); plt.title('Baseline vs. ground truth lngDeg');
plt.scatter(target['lngDeg_grnd'], target['lngDeg_bsln'], color=colors_lng, s=size_lng)
plt.xlabel('ground truth lngDeg'); plt.ylabel('baseline lngDeg')

<IPython.core.display.Javascript object>

Text(0, 0.5, 'baseline lngDeg')

In [88]:
if np.all(bsln_trn.reset_index()['millisSinceGpsEpoch'] == ground.reset_index()['millisSinceGpsEpoch']):
    print("Baseline and ground truth data are synchronized")

Baseline and ground truth data are synchronized


In [89]:
plt.savefig('Baseline_vs_ground_truth_latlon.png')

#### Plot collection route on a map using ground truth route vs. baseline route

In [90]:
def compareRoutes(true_route, predicted_route, mapName='map'):
    """Recieve as input two routes one as the ground truth and the other as the predicted.
    The ground truth would be drawn as line on the map.
    The predicted would be drawn as heatmap.
    
    Parameters
    ----------
    true_route : numpy 2d array
        The true route the sample has went through
    predicted_route : numpy 2d array
        The predicted route
    mapName : str
        defualt = 'map'
        The name of the generated html google map
        
    Output
    ------
    mapName.html file containing the google map.
    """
    gmap1 = gmplot.GoogleMapPlotter(true_route[true_route.shape[0]//2,0], true_route[true_route.shape[0]//2,1], 14, apikey='AIzaSyB0ONxmQBgtM14DqTRDrYBBUw2-woWkCIE', map_type='hybrid')
    gmap1.plot(true_route[:,0],true_route[:,1],'cornflowerblue', edge_width=2)
    gmap1.heatmap(predicted_route[:,0],predicted_route[:,1])
    gmap1.draw( "{}.html".format(mapName) )
    # Print the map to notebook
    IFrame(src="./{}.html".format(mapName), width=700, height=600)

In [91]:
t = target[target['collectionName'] == target['collectionName'].sample().iloc[0]]
compareRoutes(np.array(t[['latDeg_grnd','lngDeg_grnd']])[100:], np.array(t[['latDeg_bsln','lngDeg_bsln']])[100:])
IFrame(src='./map.html', width=700, height=600)

From the following map we can see the baseline predicted values vary around the ground truth

----------------------------------

In [92]:
def plotSatelite(ax, x, phone_cor=None):
    """Plot satelite measurements for a specific sample.
    """
    ax.scatter(x['xSatPosM'], x['ySatPosM'], x['zSatPosM'], c='r',s=50)
    ax.quiver(x['xSatPosM'], x['ySatPosM'], x['zSatPosM'], x['xSatVelMps'], x['ySatVelMps'], x['zSatVelMps'], length=(x['xSatVelMps']**2+x['ySatVelMps']**2+x['zSatVelMps']**2)**0.5)
    if isinstance(x.name, (int, np.integer)):
        ax.text(x['xSatPosM'], x['ySatPosM'], x['zSatPosM']+10, str(x.name))
    if phone_cor is not None:
        ax.plot([x['xSatPosM'], phone_cor[0]], [x['ySatPosM'], phone_cor[1]], [x['zSatPosM'], phone_cor[2]], c='g')

In [93]:
def plotSphere(ax, r=6731000, center=(0,0,0), hRange=(0, 2 * np.pi), vRange=(0, np.pi), phoneLat=0, phoneLng=0):
    theta = np.array([np.linspace(hRange[0], hRange[1], 50)])
    theta = np.ones_like(theta).T @ theta
    phi = np.array([np.linspace(vRange[0], vRange[1], 50)])
    phi = np.ones_like(phi).T @ phi
    phi = phi.T
    
    xx = r * np.sin(phi) * np.cos(theta) + center[0]
    yy = r * np.sin(phi) * np.sin(theta) + center[1]
    zz = r * np.cos(phi) + center[2]
    
    ax.plot_surface(xx, yy,zz)
    
    plt.show()

In [94]:
def latlonToXYZ(lat, lon):
    r=6731000 # Earth radius
    return (float(r * np.sin(lat) * np.cos(lon)), float(r * np.sin(lat) * np.sin(lon)), float(r * np.cos(lat)))

## Geospacial data analysis on derived dataset

### Single sample 3d visualization
for single android measurment visualize the specific point on earth the phone exists and the connected satelites,
Exibit the velocity of every satelite and the distance from the phone measured as the corrected pseudo range

In [95]:
derived.reset_index(inplace=True)
ground.reset_index(inplace=True)

Index of every sample is attributed to the collection, the phone used in the collection set <br>(The data is collected using multiple android phones in driving car)<br>

In [96]:
derived['millisSinceGpsEpoch_drvd'] = derived['millisSinceGpsEpoch']
derived_idx = derived.set_index(['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'svid'])

#### ground truth vs. derived `millisSinceGpsEpoch` comparison
* The baseline and derived are correlated. moreover baseline is genrated using derived dataset
* It seems that unfortunatly there is a difference between the timestamp of the samples in the ground truth
  and the baseline as shown in the following. there are some samples that are the same regarding the timestamp and others which not

In [97]:
if np.all(bsln_trn.reset_index()['millisSinceGpsEpoch'] == ground['millisSinceGpsEpoch']):
    print("Baseline and ground truth data are synchronized")
if derived['millisSinceGpsEpoch'].nunique() == ground['millisSinceGpsEpoch'].nunique():
    print("Derived and ground truth data has the same number of time ephocs")
    if np.all(derived['millisSinceGpsEpoch'] == ground['millisSinceGpsEpoch']):
        print("Derived and ground truth data are synchronized")
    else:
        print("Derived and ground truth data are not synchronized")
else:
    print("Derived and ground truth data has different number of time ephocs")

Baseline and ground truth data are synchronized
Derived and ground truth data has different number of time ephocs


##### Choose random sample from derived data to display satelites

In [98]:
sample = derived[['collectionName', 'phoneName', 'millisSinceGpsEpoch']].sample()
while not np.any(ground['millisSinceGpsEpoch']==sample.values[0,2]):
    sample = derived[['collectionName', 'phoneName', 'millisSinceGpsEpoch']].sample()
sample_target = ground[ground['millisSinceGpsEpoch']==sample.values[0,2]]

ax = plt.figure().add_subplot(projection='3d')
#sample = derived_orig[['collectionName', 'phoneName', 'millisSinceGpsEpoch']].sample()
satelites = derived_idx.loc[sample.values[0,0], sample.values[0,1], sample.values[0,2]]
phone_cor = latlonToXYZ(sample_target['latDeg'], sample_target['lngDeg'])
satelites.apply(lambda sat: plotSatelite(ax, sat, phone_cor), axis=1)
plt.title("collectionName: {}  || Phone: {} ||  timestamp: {} ms".format(sample.values[0,0], sample.values[0,1], sample.values[0,2]))
ax.scatter(phone_cor[0],phone_cor[1],phone_cor[2], c='g', s=100)
ax.text(phone_cor[0]+1000,phone_cor[1]+1000,phone_cor[2]+1000, "lat:{}, lng:{}".format(float(sample_target['latDeg'],), float(sample_target['lngDeg'])))
plotSphere(ax)
plt.show()

<IPython.core.display.Javascript object>

### Feature Engineering & data preperation

#### Mean latDeg & lngDeg in baseline dataset over phones at the same epoch
Each collection is made by driving car using one or more androind deviced to collect GNSS logs.<br>
At each epoch several devices can collect GNSS data. Each of them should have the same lat/lng position<br>
Therefore we would average over the devices the lat lng degrees.<br>
As we can see not all the phones in our collections are synchronyzed.<br>
Therefore we would like to mean the bucket of quarter of the seconds of epochs to leave some space for unsynchronized phones to average themeself.<br>
The choice of 250 milliseconds was made by trial and error.<br>
We using averaging the prediction whereas an assumption on the speed between each 250ms bucket is at max 65 mph (California highway speed limit) which between each bucket leave room for 3.6m error after averaging. most of the driving in the dataset is <br>made within the city, therefore an presumably the average speed is much lower so the error is much less critical.<br> Further invistagation regarding the speed at which the samples where taken will be introduced later in the project.

In [187]:
bsln_trn['row'] = bsln_trn.index
bsln_trn['sSinceGpsEpoch'] = bsln_trn.millisSinceGpsEpoch//250
bsln_trn['millisSinceFirstEpoch'] = bsln_trn.millisSinceGpsEpoch - min(bsln_trn.millisSinceGpsEpoch)

df = bsln_trn.groupby(['collectionName', 'sSinceGpsEpoch']).agg({'latDeg': [ np.mean ], 'latDeg_prv': [ np.mean ], 
                                                                'lngDeg': [ np.mean ], 'lngDeg_prv': [ np.mean ], 'phoneName': [list], 'millisSinceFirstEpoch': [list],  'row': [len]})
bsln_mean_smoothed = pd.merge(bsln_trn, df.reset_index(), how='left', on=['collectionName', 'sSinceGpsEpoch'], suffixes=('raw', 'mean'))
haversine_50thP_95thP_mean(target['latDeg_grnd'], target['lngDeg_grnd'], bsln_mean_smoothed[('latDeg','mean')], bsln_mean_smoothed[('lngDeg','mean')])

5.859044545034475

In [195]:
print("{}m error rate from our evaluation metric.\n an impovement of {}m".format(haversine_50thP_95thP_mean(target['latDeg_grnd'], target['lngDeg_grnd'], bsln_mean_smoothed[('latDeg','mean')], bsln_mean_smoothed[('lngDeg','mean')]),haversine_50thP_95thP_mean(target['latDeg_grnd'], target['lngDeg_grnd'], target['latDeg_bsln'], target['lngDeg_bsln'])-haversine_50thP_95thP_mean(target['latDeg_grnd'], target['lngDeg_grnd'], bsln_mean_smoothed[('latDeg','mean')], bsln_mean_smoothed[('lngDeg','mean')])))

5.859044545034475m error rate from our evaluation metric.
 an impovement of 0.40620846613308803m


An improvement of 4 decimeters, not much but still not bad

In [192]:
target[['bsln_smoothed_grnd_distance', 'bsln_grnd_distance']].describe()

Unnamed: 0,bsln_smoothed_grnd_distance,bsln_grnd_distance
count,131342.0,131342.0
mean,3.588409,3.846848
std,30.530647,30.739767
min,0.001338,0.001338
25%,1.041625,1.210976
50%,1.836816,2.065769
75%,3.229373,3.560001
max,8340.257976,8340.257976


All metrics look a bit better for our smoothed version

In [196]:
plt.figure(figsize=(12,8))
plt.subplot(211); plt.plot(range(target.shape[0]), target['bsln_smoothed_grnd_distance']); plt.title('bsln_smoothed_grnd_distance'); plt.ylabel('horizontal distance (m)')
plt.subplot(212); plt.plot(range(target.shape[0]), target['bsln_grnd_distance']); plt.title('bsln_grnd_distance'); plt.xlabel('no. sample'); plt.ylabel('horizontal distance (m)')

<IPython.core.display.Javascript object>

Text(0, 0.5, 'horizontal distance (m)')

Some smaller peaks had been smoothed. some had been lowered down, while some more accurate measurements had been pulled upwards<br>
In the future we need to smooth these error using kelman filter to allow reasonable distances between the datapoints

##### Calculate corrected pseudo range

With these derived values, a corrected pseudorange (i.e. a closer approximation to the geometric range from the phone to the satellite) can be computed as: correctedPrM = rawPrM + satClkBiasM - isrbM - ionoDelayM - tropoDelayM. The baseline locations are computed using correctedPrM and the satellite positions, using a standard Weighted Least Squares (WLS) solver, with the phone's position (x, y, z), clock bias (t), and isrbM for each unique signal type as states for each epoch.

In [99]:
derived['correctedPrM'] = derived['rawPrM'] + derived['satClkBiasM'] - derived['isrbM'] - derived['ionoDelayM'] -derived['tropoDelayM']

#####  Foreach sample take the previous location of the phone

In [100]:
bsln_trn['latDeg_prv'] = bsln_trn['latDeg']
bsln_trn['lngDeg_prv'] = bsln_trn['lngDeg']
bsln_trn.loc[bsln_trn['collectionName']==bsln_trn.shift()['collectionName'], 'latDeg_prv'] = bsln_trn.shift()['latDeg']
bsln_trn.loc[bsln_trn['collectionName']==bsln_trn.shift()['collectionName'], 'latDeg_prv'] = bsln_trn.shift()['lngDeg']

##### Add datetime timestamps

For the ease of analysis

In [101]:
baseTime = datetime.datetime(1980,1,6,0,0,0,0)
derived['epoch_timestamp'] = derived['millisSinceGpsEpoch'].apply(lambda x: datetime.datetime.fromtimestamp(baseTime.timestamp()+x/1000.0))
ground['epoch_timestamp'] = ground['millisSinceGpsEpoch'].apply(lambda x: datetime.datetime.fromtimestamp(baseTime.timestamp()+x/1000.0))
bsln_trn['epoch_timestamp'] = bsln_trn['millisSinceGpsEpoch'].apply(lambda x: datetime.datetime.fromtimestamp(baseTime.timestamp()+x/1000.0))

In [102]:
features = {'correctedPrM'}

In [103]:
piv = derived.pivot_table(values=list(features), index=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], columns=['svid'])
grouped = derived[['phoneName','collectionName', 'millisSinceGpsEpoch', 'receivedSvTimeInGpsNanos', 'epoch_timestamp']].groupby(['phoneName', 'millisSinceGpsEpoch']).max()
piv = piv.merge(grouped, on=['phoneName', 'millisSinceGpsEpoch'])



In [104]:
piv.fillna(0, inplace=True)

### Try and correlate the derived data to the ground truth

As we seen before, the number of derived epochs is different that the number of epoch is ground truth dataset. <br>
This is a huge problem because we need to correlate the target data to the training data. <br>
Our main objective is to predict accurratly the lat/lng position of the phone in the world using the derived data and baseline data.
<br>
One obsticle in doing so is that the derived dataset grouped by time epochs does not necesseraly own the same time epochs as baseline.<br>
This problem reoccurres again when trying to compare to the target data.<br>
So before we even do any training we need to figure out the cause for this incosistency. Or at least when does it occurres and at which scale.

In [105]:
bsln = bsln_trn.set_index('millisSinceGpsEpoch').sort_index()
piv = piv.reset_index().set_index('millisSinceGpsEpoch').sort_index()
df = pd.merge_asof(bsln, piv, on='millisSinceGpsEpoch',by='phoneName', suffixes=('_bsln', '_piv'), direction='nearest', tolerance=1)

In [107]:
print("No. of rows that are outside the tolerance range are {}.\nThere are total {} samples".format(df['epoch_timestamp_piv'].isna().sum(), df.shape[0]))

No. of rows that are outside the tolerance range are 1013.
There are total 131342 samples


In [108]:
bsln_trn.shape[0]-piv.shape[0]

1003

In [109]:
print("{}% of the baseline is missing from derived".format((df['epoch_timestamp_piv'].isna().sum()/df.shape[0])*100))

0.7712689010369874% of the baseline is missing from derived


As we noticed before the derived data has 1003 missing records from baseline.<br>
Therefore we can be satisfied with 1013 missing which is a very small percentage of the data.<br>
nethertheless it is still part of the data and we should decide what to do next with our null values.

In [936]:
data = pd.merge(df, ground[['phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']], how='inner', left_on=['phoneName', 'millisSinceGpsEpoch'], right_on=['phoneName','millisSinceGpsEpoch'], suffixes=('_bsln', '_grnd'))

--------------------------------------------------------------------------------------------------------
## Prepare data for training

In [1032]:
data.columns

Index([       'collectionName_bsln',        'millisSinceGpsEpoch',
                        'phoneName',                'latDeg_bsln',
                      'lngDeg_bsln', 'heightAboveWgs84EllipsoidM',
                            'phone',       'epoch_timestamp_bsln',
                       'latDeg_prv',                 'lngDeg_prv',
                ('correctedPrM', 1),          ('correctedPrM', 2),
                ('correctedPrM', 3),          ('correctedPrM', 4),
                ('correctedPrM', 5),          ('correctedPrM', 6),
                ('correctedPrM', 7),          ('correctedPrM', 8),
                ('correctedPrM', 9),         ('correctedPrM', 10),
               ('correctedPrM', 11),         ('correctedPrM', 12),
               ('correctedPrM', 13),         ('correctedPrM', 14),
               ('correctedPrM', 15),         ('correctedPrM', 16),
               ('correctedPrM', 17),         ('correctedPrM', 18),
               ('correctedPrM', 19),         ('correctedPrM', 

In [1030]:
data.drop(['latDeg_bsln_prev', 'lngDeg_bsln_prev'], axis=1, inplace=True)

KeyError: "['latDeg_bsln_prev' 'lngDeg_bsln_prev'] not found in axis"

In [938]:
target = data[['latDeg_grnd', 'lngDeg_grnd']]

In [1037]:
train = data
train = train.fillna(0)
X = train.drop(['collectionName_bsln', 'collectionName_piv', 'phoneName', 'phone', 'epoch_timestamp_bsln', 'epoch_timestamp_piv', 'latDeg_grnd', 'lngDeg_grnd'], axis=1)
X.fillna(0, inplace=True)

## Split for train/test the data & normilize

normalizer = tf.keras.layers.experimental.preprocessing.Normalization()
normalizer.adapt(x_train)

In [1038]:
from sklearn.preprocessing import StandardScaler
n_features = x_train.shape[1]
scaler_x = StandardScaler()
scaler_y = StandardScaler()
x_normed = scaler_x.fit_transform(X)
y_normed = scaler_y.fit_transform(target)
x_train, x_test, y_train, y_test = train_test_split(x_normed, y_normed, random_state=0, test_size=0.2)

---------------------------------------------------------------------------

## DecisionTreeRegressor / GradientBoostingRegresor 

In [942]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

dtr = DecisionTreeRegressor()
dtr.fit(x_train, y_train)
dtr.score(x_test, y_test)

dtr = GradientBoostingRegressor()
dtr.fit(x_train, y_train[:,0])
dtr.score(x_test, y_test[:,0])

dtr_lng = GradientBoostingRegressor()
dtr_lng.fit(x_train, y_train[:,1])
dtr_lng.score(x_test, y_test[:,1)
                             
X['lat_prd'] = dtr.predict(normed)
X['lng_prd'] = dtr_lng(normed)
model.predict(x_test)

In [None]:
pred = X[['latDeg_bsln', 'lngDeg_bsln', 'lat_prd', 'lng_prd']]
pred['lat_diff_bsln_pred'] = pred.latDeg_bsln - pred.lat_prd
pred['lng_diff_bsln_pred'] = pred.lngDeg_bsln - pred.lng_prd
pred[['latDeg_grnd', 'lngDeg_grnd']]=target
pred['lat_diff_grnd_pred'] = pred.latDeg_grnd - pred.lat_prd
pred['lng_diff_grnd_pred'] = pred.lngDeg_grnd - pred.lng_prd
pred['lat_diff_grnd_bsln'] = pred.latDeg_grnd - pred.latDeg_bsln
pred['lng_diff_grnd_bsln'] = pred.lngDeg_grnd - pred.lngDeg_bsln
calc_haversine(pred.latDeg_grnd, pred.lngDeg_grnd, pred.lat_prd, pred.lng_prd)

#### Simple sequential model

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(n_features,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(40, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(5, activation='relu'),
    tf.keras.layers.Dense(2)
])

model.compile(loss='mean_squared_error',
                optimizer=tf.keras.optimizers.Adam(0.1))
model.summary()

## Neural Network with recural inputs

In [1039]:
lrate = 0.001
input_dim = x_train.shape[1]
output_dim = 2
def build_model():
    inputs = keras.layers.Input((input_dim,))
    x = keras.layers.Dense(128, activation='relu')(inputs)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dense(128, activation='relu')(x)
    x = keras.layers.Dropout(.3)(x)
    
    ox = x
    
    x = keras.layers.Dense(128, activation='relu')(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dense(128, activation='relu')(x)
    x = keras.layers.Dropout(.3)(x)
    
    x = keras.layers.Add()([x, ox])
    
    x = keras.layers.Dense(128, activation='relu')(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dense(128, activation='relu')(x)
    x = keras.layers.Dropout(.3)(x)
    
    outputs = keras.layers.Dense(output_dim, activation='linear')(x)
    
    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(lrate), loss='mean_squared_error')
    return model
model = build_model()
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, 44)]         0                                            
__________________________________________________________________________________________________
dense_154 (Dense)               (None, 128)          5760        input_17[0][0]                   
__________________________________________________________________________________________________
batch_normalization_38 (BatchNo (None, 128)          512         dense_154[0][0]                  
__________________________________________________________________________________________________
dense_155 (Dense)               (None, 128)          16512       batch_normalization_38[0][0]     
____________________________________________________________________________________________

In [1020]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(2, input_shape=(2,), use_bias=False, kernel_initializer=tf.keras.initializers.Ones()
)
])

model.compile(loss='mean_squared_error',
                optimizer=tf.keras.optimizers.Adam(0.001))

model.summary()

Model: "sequential_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_139 (Dense)            (None, 2)                 4         
Total params: 4
Trainable params: 4
Non-trainable params: 0
_________________________________________________________________


In [1040]:
%%time
#stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
mcp_save = tf.keras.callbacks.ModelCheckpoint('.mdl_wts.hdf5', save_best_only=True, monitor='val_loss', mode='max')

history = model.fit(x_train, y_train, epochs=50, validation_data=(x_test, y_test), callbacks=[mcp_save])

Train on 105073 samples, validate on 26269 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Wall time: 37min 8s


In [1041]:
reconstructed_model = tf.keras.models.load_model(".mdl_wts.hdf5")
model = reconstructed_model

In [1017]:
latlon = np.array(X_val[['latDeg_bsln', 'lngDeg_bsln']])
lossCalcHarvestine(val_truth_arr, latlon)

array([3.58684154, 2.74590055, 1.8884092 , ..., 1.33866108, 1.24941385,
       1.46261318])

array([[  37.361492, -122.000114],
       [  37.34547 , -121.96718 ],
       [  37.334896, -121.96249 ],
       ...,
       [  37.607555, -122.37543 ],
       [  37.609642, -122.37939 ],
       [  37.60104 , -122.40292 ]], dtype=float32)

In [1051]:
val_set = train[(train['collectionName_bsln'] == '2020-05-14-US-MTV-1')&(train['phoneName']=='Pixel4')]
X_val = val_set.drop(['collectionName_bsln', 'collectionName_piv', 'phoneName', 'phone', 'epoch_timestamp_bsln', 'epoch_timestamp_piv', 'latDeg_grnd', 'lngDeg_grnd'], axis=1)
X_val.fillna(0, inplace=True)
val_truth = ground[(ground['collectionName'] == '2020-05-14-US-MTV-1')&(ground['phoneName']=='Pixel4')]
X_val_norm = scaler.fit_transform(X_val)
#val_truth_norm = scaler.fit_transform(val_truth)
val_truth_arr = np.array(val_truth[['latDeg', 'lngDeg']])
latlon = np.array(X_val)
pred = model.predict(X_val_norm)
pred = scaler_y.inverse_transform(pred)
#compareRoutes(val_truth_arr, pred)
compareRoutes(val_truth_arr, pred)
IFrame(src='./map.html', width=700, height=600)

In [842]:
np.abs(x_test[:,1:3] - y_test)

array([[ 37.51211117, 122.63692053],
       [ 37.43037396, 121.6421317 ],
       [ 37.74772266, 122.65096019],
       ...,
       [ 38.17209756, 122.24047398],
       [ 36.28248354, 120.92908478],
       [ 38.52916946, 123.70680213]])

In [845]:
np.abs(pred - y_test)

array([[0.01833343, 0.15006336],
       [0.01492554, 0.04188257],
       [0.0396296 , 0.15798037],
       ...,
       [0.07108695, 0.03483078],
       [0.05172424, 0.18113454],
       [0.07873785, 0.0939684 ]])

In [996]:
def lossCalcHarvestine(y_true, y_pred):
    return calc_haversine(y_true[:, 0], y_true[:, 1], y_pred[:, 0], y_pred[:, 1])

In [None]:
plotPoints(y, c)
IFrame(src='./map.html', width=700, height=600)

In [None]:
ground.head(1)

In [None]:
c = bsln_trn[bsln_trn['collectionName']=='2020-05-14-US-MTV-1'][['latDeg','lngDeg']]
c = np.array(c)

In [None]:
IFrame(src='./map.html', width=700, height=600)