# 0 - Import

In [None]:
# Basic
import pickle as pkl

# Data
import pandas as pd
import numpy as np

# Sklearn 
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import haversine_distances
from sklearn.neighbors import NearestNeighbors

# Plot
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from location_prediction.data_visualisation import chart_plot, chart_subplot, map_view_prediction, single_map_view, visualize_filtering

# Model
from location_prediction.model import filtering_df, compute_distance_from_point, compute_k_avg_distance, get_living_location, get_adress_from_coordinates

# Use the 'iframe' renderer so that the notebook does not lag on scroll
# when a lot of plotly graphs are displayed
import plotly.io as pio
pio.renderers.default = 'iframe'

# 1 - Feature Selection

## 1.1 - Mean Walking Speed Estimation

In [None]:
# Walking speed is assumed to lie between 4 and 5 km/h

# Convert both bounds of the interval from km/h to m/s
seconds_per_hour = 60 * 60
slow_pace = (4 * 1e3) / seconds_per_hour
fast_pace = (5 * 1e3) / seconds_per_hour

# The midpoint of the interval is taken as the mean walking speed
mean = (slow_pace + fast_pace) / 2

print("We'll define the mean walking speed as {}".format(mean))

## 1.2 - GPS Measure Uncertainty Estimation

### 1.2.1 - Find a dense area

In [None]:
# Read our data
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle as soon as the data is loaded.
with open('./data/gps_df.pkl', 'rb') as gps_file:
    gps_df = pkl.load(gps_file)

In [None]:
# On the map view, user 5e8f9 has a dense area of measures corresponding to a specific location.
# The measures taken at this location are used to estimate the uncertainty of the GPS measures.

# Keep only the measures of user 5e8f9.
# Filtering first and copying afterwards avoids duplicating the whole dataset, and gives an
# independent frame that later cells can modify (pandas may otherwise emit a SettingWithCopyWarning).
is_relevant_user = gps_df.user_id == '5e8f9b28f3b9a5516fe24bc39f74837617d6026abf8cb4de6c6989923860f4a2'
relevant_user = gps_df[is_relevant_user].copy()

# Arguments for plotting: the map is centred and zoomed on the dense area
plot_args = dict(
    title='Map view of a dense area for user 5e8f9',
    title_x=0.5,
    width=1400,
    height=700,
    mapbox=dict(
        style="open-street-map",
        center=go.layout.mapbox.Center(
            lat=48.98961,
            lon=2.231469,
        ),
        zoom=18,
    ),
)

# Display the target location on a map view
single_map_view(relevant_user, plot_args, img_name='incertainty_estimation.png')

### 1.2.2 - Cluster the dense area

In [None]:
# Cluster the GPS coordinates of this user so that the dense location ends up in a single
# cluster, which gives access to all the measures taken at this location.

# scikit-learn's haversine metric expects every point as [latitude, longitude] in radians,
# so the columns must be selected in that order before the conversion.
data = np.radians(relevant_user[["lat", "lon"]].to_numpy())

# Cluster the GPS coordinates to isolate the dense area.
# With the haversine metric, eps is an angle in radians (distance on the unit sphere).
# NOTE(review): eps was chosen when the coordinates were passed as [lon, lat]; re-check that
# the dense area is still isolated as intended.
dbscan_kwargs = {
    'eps': 5e-7,
    'min_samples': 5,
    'metric': 'haversine',
    'metric_params': None,
    'algorithm': 'auto',
    'leaf_size': 30,
    'p': None,
    'n_jobs': -1,
}
model = DBSCAN(**dbscan_kwargs)
model.fit(data)
relevant_user.loc[:, "label"] = model.labels_

# Visualize the result of the clustering
plot_args = {
    'title': 'DBSCAN Clustering of the GPS Coordinates for user 5e8f9',
    'title_x': 0.5,
    'width': 1400,
    'height': 700,
}

chart_plot(
    graph=px.scatter,
    dataframe=relevant_user,
    x=relevant_user.lon,
    y=relevant_user.lat,
    plot_args=plot_args,
    color=relevant_user.label,
    ranges=([2.226, 2.236], [48.985, 48.995]),
    img_name=None,
)

<font size="3">
    <b>Note </b> : We see that DBSCAN successfully isolates the dense area
</font>

### 1.2.3 - Estimate the uncertainty

In [None]:
# Once all the points of the location are known, the uncertainty is estimated as the mean
# distance between the points of the cluster and the centre of the cluster.

# Cluster labels change from one run to another, so the cluster is identified through a
# known point of the location. A tiny absolute tolerance is used instead of strict float
# equality, and a missing reference point is reported explicitly.
reference_lat, reference_lon = 48.98954170975, 2.2313633158346
is_reference = (
    np.isclose(relevant_user.lat, reference_lat, rtol=0, atol=1e-9)
    & np.isclose(relevant_user.lon, reference_lon, rtol=0, atol=1e-9)
)
if not is_reference.any():
    raise ValueError('Reference point of the dense area not found in the measures of user 5e8f9')
label = relevant_user.loc[is_reference, 'label'].iloc[0]

# Filter first, then copy, to get an independent frame without duplicating all the rows
interesting_cluster = relevant_user[relevant_user.label == label].copy()

# The centre of the cluster is the mean of its coordinates
cluster_center = interesting_cluster.lon.mean(), interesting_cluster.lat.mean()
cluster_coord = interesting_cluster[['lon', 'lat']].to_numpy()

# Append the coordinates of the centre to the data to visualize it
x = interesting_cluster.lon.to_list() + [cluster_center[0]]
y = interesting_cluster.lat.to_list() + [cluster_center[1]]

# Points of the cluster in blue, centre (last point) in red
color = ['blue'] * len(interesting_cluster) + ['red']

# Plot the cluster and its centre.
# plot_args is the dictionary defined in the previous clustering cell.
# Both ranges are given as [min, max] so that neither axis is drawn inverted.
chart_plot(
    graph=px.scatter,
    dataframe=interesting_cluster,
    x=x,
    y=y,
    plot_args=plot_args,
    color=color,
    ranges=([2.2312, 2.2318], [48.9894, 48.9899]),
    img_name=None,
)

# Mean of the distances between the points of the cluster and the centre of the cluster
# NOTE(review): the unit and the expected coordinate order depend on
# compute_distance_from_point; confirm that the result is really in km.
distance_from_center = compute_distance_from_point([cluster_center], cluster_coord)
print('We can estimate incertitude as : {} km'.format(distance_from_center.mean()))

<font size="3">
    <b>Note </b> : We estimate the uncertainty at around 10 m
</font>

# 2 - Visualizing Filtering

In [None]:
# Read our data
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle as soon as the data is loaded.
with open('./data/gps_df.pkl', 'rb') as gps_file:
    gps_df = pkl.load(gps_file)

In [None]:
# To visualize the effect of the filtering, the GPS coordinates of some users are displayed
# before and after the filtering.
# NOTE(review): 1.25 equals the mean walking speed computed in section 1.1; the meaning of
# 0.15 is defined by filtering_df — confirm against its documentation.
filtered_df = filtering_df(gps_df, 1.25, 0.15)

# Persist the filtered dataset; the context manager guarantees the file is flushed and closed
with open('./data/filtered_gps_df.pkl', 'wb') as filtered_file:
    pkl.dump(filtered_df, filtered_file)

# Arguments for plot
plot_args = {
    'width': 1400,
    'height': 700,
    'title': 'Visualizing GPS coordiantes before (left) and after (right) filtering',
    'title_x': 0.5,
}

# Visualize the filtering
visualize_filtering(
    dataframe=gps_df,
    filtered_dataframe=filtered_df,
    plot_args=plot_args,
    n_users=2,
    img_name='viz_filtering.png',
)

# 3 - Prediction

## 3.1 - Elbow Curve

In [None]:
# Read our filtered dataset (the context manager closes the file handle once loaded)
with open('./data/filtered_gps_df.pkl', 'rb') as filtered_file:
    filtered_gps_df = pkl.load(filtered_file)

# Compute the k-th distances
distances = compute_k_avg_distance(
    dataframe=filtered_gps_df,
    n_neighbors=3,
)

# Plotting arguments
plot_args = {
    'title': 'K-distance Elbow Curve',
    'title_x': 0.5,
    'width': 1400,
    'height': 700,
    'xaxis_title': 'Samples',
    'yaxis_title': 'Distance',
}

# Plot the elbow curve: the x axis is simply the index of each sample,
# and the ranges zoom on the part of the curve where the elbow is located
chart_plot(
    graph=px.line,
    dataframe=distances,
    x=list(range(len(distances))),
    y=distances,
    plot_args=plot_args,
    ranges=([11800, 13200], [-0.02, 0.1]),
    img_name='elbow.png',
)

## 3.2 - Get the prediction

In [None]:
# Read our datasets (context managers close every file handle deterministically)
with open('./data/gps_df.pkl', 'rb') as gps_file:
    gps_df = pkl.load(gps_file)
with open('./data/filtered_gps_df.pkl', 'rb') as filtered_file:
    filtered_gps_df = pkl.load(filtered_file)

# Setting up the arguments for DBSCAN
# With the haversine metric, eps is an angle in radians (distance on the unit sphere)
dbscan_kwargs = {
    'eps': 5e-5,
    'min_samples': 3,
    'metric': 'haversine',
    'metric_params': None,
    'algorithm': 'auto',
    'leaf_size': 30,
    'p': None,
    'n_jobs': -1,
}

# Plotting arguments
plot_args = {
    'title': 'DBSCAN Clustering of the GPS Coordinates',
    'title_x': 0.5,
    'width': 1400,
    'height': 700,
}

# Apply DBSCAN; the centre of a cluster is obtained with the 'mean' method
clustered_gps_df_mean, prediction_mean = get_living_location(filtered_gps_df, dbscan_kwargs, 'mean')
with open('./data/clustered_gps_df_mean.pkl', 'wb') as clustered_mean_file:
    pkl.dump(clustered_gps_df_mean, clustered_mean_file)
with open('./data/prediction_mean.pkl', 'wb') as prediction_mean_file:
    pkl.dump(prediction_mean, prediction_mean_file)


# Apply DBSCAN; the centre of a cluster is obtained with the 'dist' method
# (described by the author as the point with the most nearest neighbors)
clustered_gps_df_dist, prediction_dist = get_living_location(filtered_gps_df, dbscan_kwargs, 'dist')
with open('./data/clustered_gps_df_dist.pkl', 'wb') as clustered_dist_file:
    pkl.dump(clustered_gps_df_dist, clustered_dist_file)
with open('./data/prediction_dist.pkl', 'wb') as prediction_dist_file:
    pkl.dump(prediction_dist, prediction_dist_file)

# 4 - Visualize DBSCAN clustering against ground-truth

In [None]:
# Read our clustered dataset (the context manager closes the file handle once loaded)
with open('./data/clustered_gps_df_dist.pkl', 'rb') as clustered_file:
    clustered_gps_df_dist = pkl.load(clustered_file)

# Both figures show longitude on the x axis and latitude on the y axis, in agreement with
# the axis titles below; the ranges are therefore given as (longitude range, latitude range).
axis_ranges = ([-5, 15], [42, 52])

# Arguments for plotting DBSCAN result
plot_args = {
    'title': 'DBSCAN Clustering of the GPS Coordinates',
    'title_x': 0.5,
    'width': 1400,
    'height': 700,
    'xaxis_title': 'Longitude',
    'yaxis_title': 'Latitude',
}

# Plot the DBSCAN result: one color per cluster label
chart_plot(
    px.scatter,
    clustered_gps_df_dist,
    clustered_gps_df_dist.lon,
    clustered_gps_df_dist.lat,
    plot_args,
    color=clustered_gps_df_dist.label,
    ranges=axis_ranges,
    img_name='DBSCAN_cluster.png',
)

# Arguments for plotting ground-truth clusters
plot_args = {
    'title': 'User Clustering of the GPS Coordinates',
    'title_x': 0.5,
    'width': 1400,
    'height': 700,
    'xaxis_title': 'Longitude',
    'yaxis_title': 'Latitude',
}

# Plot the ground-truth clusters: one color per user (first 4 characters of the user id)
chart_plot(
    px.scatter,
    clustered_gps_df_dist,
    clustered_gps_df_dist.lon,
    clustered_gps_df_dist.lat,
    plot_args,
    color=clustered_gps_df_dist.user_id.apply(lambda x: x[:4]),
    ranges=axis_ranges,
    img_name='user_cluster.png',
)

# 5 - Display the location prediction on a map view with the GPS coordinates

In [None]:
# Get our data and the prediction of the mean method.
# The prediction is read back from disk, like in the other map-view cells, so that this cell
# does not depend on a variable left in memory by the prediction cell.
with open('./data/gps_df.pkl', 'rb') as gps_file:
    gps_df = pkl.load(gps_file)
with open('./data/prediction_mean.pkl', 'rb') as prediction_file:
    prediction_mean = pkl.load(prediction_file)

# Plot the prediction result against the original data using the mean method
map_view_prediction(gps_df, prediction_mean)

In [None]:
# Get our filtered dataset and prediction (context managers close the file handles once loaded)
with open('./data/filtered_gps_df.pkl', 'rb') as filtered_file:
    filtered_gps_df = pkl.load(filtered_file)
with open('./data/prediction_mean.pkl', 'rb') as prediction_file:
    prediction_mean = pkl.load(prediction_file)

# Plot the prediction result against the filtered data using the mean method
map_view_prediction(filtered_gps_df, prediction_mean)

In [None]:
# Get our filtered dataset and prediction (context managers close the file handles once loaded)
with open('./data/filtered_gps_df.pkl', 'rb') as filtered_file:
    filtered_gps_df = pkl.load(filtered_file)
with open('./data/prediction_dist.pkl', 'rb') as prediction_file:
    prediction_dist = pkl.load(prediction_file)

# Plot the prediction result against the filtered data using the dist method
map_view_prediction(filtered_gps_df, prediction_dist, 'map_prediction.png')

<font size="3">
    <b>Note 1</b> : According to the graph, the 'dist' method gives better results when predicting the living location of a user. The points are more centered on the location, so the predicted longitude and latitude are more precise.
    <br>
    <b>Note 2</b> : We also see that for some users the prediction can be wrong; for example, for user 38b9, the predicted living location is a veterinary clinic.
</font>

# 6 - Get the living address of the users

In [None]:
# Get our prediction (the context manager closes the file handle once loaded)
with open('./data/prediction_dist.pkl', 'rb') as prediction_file:
    prediction_dist = pkl.load(prediction_file)

# Get the address of the living location from the GPS coordinates
prediction = get_adress_from_coordinates(prediction_dist)

# Last expression of the cell: displayed as the cell output in the notebook
prediction