# 0 - Import

In [None]:
# Basic
import pickle as pkl

# Data
import pandas as pd
import numpy as np

# Geospatial 
from sklearn.metrics.pairwise import haversine_distances

# Date
import calendar
from dateutil.parser import parse

# Plot
from location_prediction.data_visualisation import chart_plot, chart_subplot, map_view
import plotly.express as px
import plotly.graph_objects as go

# Model
from location_prediction.model import generate_distance_travelled

# Render plotly figures in an iframe so that the notebook does not lag on scroll
# when a lot of graphs are displayed
import plotly.io as pio
pio.renderers.default = 'iframe'

# 1 - Preparing the dataset

In [None]:
# Load the raw records (the sheet is read without a header row) and give the
# four positional columns explicit names
df = (
    pd.read_excel('./data/data_test_datascience.xlsx', header=None)
    .rename(columns={0: 'user_id', 1: 'lon', 2: 'lat', 3: 'timestamp_client'})
)
df

## 1.1 - Timestamp

In [None]:
# Inspect the timestamp column to see how its values are stored
df.timestamp_client

**Note**: Timestamps are stored as strings.

In [None]:
# Turn every timestamp value into a datetime object, one element at a time
df['timestamp_client'] = df['timestamp_client'].map(parse)

## 1.2 - Data Cleaning

In [None]:
# Report how many rows are exact duplicates of an earlier row
print('We have {} duplicates in the dataset \n'.format(df.duplicated().sum()))

# Drop the duplicated rows
df = df.drop_duplicates()

# Report the number of missing values of every column
for col, n_missing in df.isnull().sum().items():
    print('We have {} missing values for the features {} in the dataset'.format(n_missing, col))


## 1.3 - Sorting (user_id, timestamp)

In [None]:
# Order the records by user id, then by time within each user
df = df.sort_values(by=['user_id', 'timestamp_client'])

In [None]:
# We dump the cleaned dataset so that it can be read back quickly later on.
# The context manager guarantees the file handle is flushed and closed
# (a bare `open(...)` passed to `pkl.dump` is never closed explicitly).
with open('./data/gps_df.pkl', 'wb') as pickle_file:
    pkl.dump(df, pickle_file)

# 2 - Exploratory Data Analysis

## 2.1 - Samples

In [None]:
# Load our clean dataset; the context manager closes the file handle once the
# pickle has been read (a bare `open(...)` passed to `pkl.load` is never closed)
with open('./data/gps_df.pkl', 'rb') as pickle_file:
    gps_df = pkl.load(pickle_file)

# Display the number of samples in the dataset
print('We have {} samples in the dataset'.format(len(gps_df)))

## 2.2 - Users

In [None]:
# Count the distinct user ids present in the dataset
print('We have {} different users in the dataset '.format(gps_df.user_id.unique().size))

In [None]:
# Count the records of each user (one row per user id)
count = gps_df.groupby('user_id').count()
short_user_id = [user_id[:4] + '...' for user_id in count.index]
timestamp = count['timestamp_client']

# Layout settings of the samples-per-user bar chart
plot_args = dict(
    title='Distribution of samples per user',
    title_x=0.5,
    width=1400,
    height=600,
    xaxis_title='User id',
    yaxis_title='Number of sample',
    showlegend=False,
)

# Draw the distribution of samples per user with our custom plotting function
chart_plot(
    graph=px.bar,
    dataframe=count,
    x=short_user_id,
    y=timestamp,
    plot_args=plot_args,
    color=count.index,
    img_name='sample_per_user.png',
)

In [None]:
# Descriptive statistics of the number of samples per user.
# Each statistic is computed for every column of `count`; the value of the
# first column is the one printed.
print('Sample per user means is : {} \n'.format(count.mean().iloc[0]))
print('Sample per user median is : {} \n'.format(count.median().iloc[0]))
print('Sample per user standard deviation is : {} \n'.format(count.std().iloc[0]))
print('Sample per user max is : {} \n'.format(count.max().iloc[0]))
print('Sample per user min is : {} \n'.format(count.min().iloc[0]))

## 2.3 - Date

In [None]:
# Oldest record, newest record and the span between them
first_record = gps_df['timestamp_client'].min()
last_record = gps_df['timestamp_client'].max()

print('Less recent date : {} \n'.format(first_record))
print('Most recent date : {}\n'.format(last_record))
print('Tracking Period : {}'.format(last_record - first_record))

### 2.3.1 - Sample per month

In [None]:
# Count the records falling in each month
monthly_grouper = pd.Grouper(key='timestamp_client', freq='M')
sample_per_month = gps_df.groupby([monthly_grouper]).count()

# Layout settings of the samples-per-month bar chart
plot_args = dict(
    title='Number of samples per month',
    title_x=0.5,
    width=1450,
    height=600,
    xaxis_title='Month',
    yaxis_title='Number of sample',
)

# Draw the number of samples per month with our custom plotting function
chart_plot(
    graph=px.bar,
    dataframe=sample_per_month,
    x=sample_per_month.index,
    y=sample_per_month['lon'],
    plot_args=plot_args,
    img_name='sample_per_month.png',
)

### 2.3.2 Sample per day

In [None]:
# Count the records falling in each day
daily_grouper = pd.Grouper(key='timestamp_client', freq='D')
sample_per_day = gps_df.groupby([daily_grouper]).count()

# Layout settings of the samples-per-day bar chart
plot_args = dict(
    title='Number of samples per day',
    title_x=0.5,
    width=1400,
    height=600,
    xaxis_title='Day',
    yaxis_title='Number of sample',
)

# Draw the number of samples per day with our custom plotting function
chart_plot(
    graph=px.bar,
    dataframe=sample_per_day,
    x=sample_per_day.index,
    y=sample_per_day['lon'],
    plot_args=plot_args,
    img_name='sample_per_day.png',
)

### 2.3.3 Sample per week day

In [None]:
# Helper producing, for one user, the number of samples recorded on each week day
def generate_sample_per_day(dataframe, unique_id):
    """
    This function is used to generate the number of samples per week day for one user.

    Args :
        dataframe (pandas.Dataframe):
            The dataframe containing the GPS coordinates of all the users.
            It is left untouched: the work is done on a copy of the user's rows.

        unique_id (string):
            The unique id of a user

    Returns :
        sample_per_day.index (pandas.Index):
            The week day names, ordered from Monday to Sunday

        sample_per_day.timestamp_client (pandas.Series):
            A series containing the number of records for each week day
            (NaN for a week day on which the user has no record)
    """

    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # Filter first, then copy: only the rows of this user are duplicated
    # (instead of the whole dataset) and the new column is written on an
    # independent frame rather than on a slice of another one.
    user_df = dataframe.loc[dataframe.user_id == unique_id].copy()
    user_df['day'] = user_df.timestamp_client.apply(lambda x: calendar.day_name[x.day_of_week])

    # Reindex so that the days always come out in calendar order, whatever
    # order the groupby produced them in
    sample_per_day = user_df.groupby('day').count().reindex(days)

    return sample_per_day.index, sample_per_day.timestamp_client

# Layout settings of the per-user week-day subplots
plot_args = dict(
    width=1400,
    height=1000,
    title_text='Distribution of sample among days for each user',
    title_x=0.5,
    x_title='Day',
    y_title='Number of sample',
)

# Draw the distribution of sample among days for each user with our custom
# subplot function
chart_subplot(
    go.Bar,
    gps_df,
    generate_data=generate_sample_per_day,
    plot_args=plot_args,
    n_columns=3,
    img_name='sample_per_week_day.png',
)

### 2.3.4 - Time elapsed between two consecutive measures

In [None]:
# Helper producing, for one user, the time elapsed between consecutive measures
def generate_time_elapse(dataframe, unique_id):
    """
    Compute, for one user, the difference between each timestamp and the previous one.

    Args :
        dataframe (pandas.Dataframe):
            The dataframe containing the GPS coordinates of all the users

        unique_id (string):
            The unique id of a user

    Returns :
        timestamps (pandas.Series):
            The `timestamp_client` values of the user, in the order of the dataframe

        timestamps.diff() (pandas.Series):
            The difference between each timestamp and the one of the previous row
            (the first row has no previous row, so its value is missing)
    """

    belongs_to_user = dataframe['user_id'] == unique_id
    timestamps = dataframe.loc[belongs_to_user, 'timestamp_client']

    return timestamps, timestamps.diff()

# Arguments for plotting the distribution of the time elapsed between two
# consecutive measures for each user (title typo "Distribtuion" fixed).
# NOTE(review): generate_time_elapse returns the raw result of Series.diff();
# confirm the plotted values really are in seconds, as the y label states.
plot_args = {
    'width':1400, 
    'height':1000, 
    'title_text': 'Distribution of the time elapsed between two measure',
    'title_x': 0.5,
    'x_title': 'Timestamp',
    'y_title': 'Time Elapsed (s)'
}

# Our custom function to plot the distribution of the time elapsed between two consecutive measure for each user
chart_subplot(
    go.Scatter, 
    gps_df, 
    generate_data = generate_time_elapse, 
    plot_args = plot_args, 
    n_columns= 3, 
    img_name = 'time_elapse.png'
)

## 2.4 - GPS Coordinates

### 2.4.1 - GPS overview

In [None]:
# Layout settings shared by the two GPS scatter plots below
plot_args = dict(
    title='Overview of GPS coordinates group by user',
    title_x=0.5,
    width=1400,
    height=600,
    xaxis_title='Longitude',
    yaxis_title='Latitude',
)

# Scatter plot of every GPS coordinate, coloured by the first four characters
# of the user id
chart_plot(
    graph=px.scatter,
    dataframe=gps_df,
    x=gps_df['lon'],
    y=gps_df['lat'],
    plot_args=plot_args,
    color=gps_df['user_id'].map(lambda user_id: user_id[:4]),
    img_name='gps_data.png',
)

# Same scatter plot, restricted to a dense zone through `ranges`
# NOTE(review): the second range is given as [60, 40] (descending) - confirm
# this ordering is what chart_plot expects for the latitude axis.
chart_plot(
    graph=px.scatter,
    dataframe=gps_df,
    x=gps_df['lon'],
    y=gps_df['lat'],
    plot_args=plot_args,
    color=gps_df['user_id'].map(lambda user_id: user_id[:4]),
    ranges=([-10, 10], [60, 40]),
    img_name='gps_data_zoom.png',
)

### 2.4.2 - GPS coordinates per user

In [None]:
# Helper extracting the GPS coordinates of one user
def generate_user_coordinate(dataframe, unique_id):
    """
    This function is used to select the longitudes and latitudes recorded for one user.

    Args :
        dataframe (pandas.Dataframe):
            The dataframe containing the GPS coordinates of all the users

        unique_id (string):
            The unique id of a user

    Returns :
        user_rows['lon'] (pandas.Series):
            A series containing the longitude of the user

        user_rows['lat'] (pandas.Series):
            A series containing the latitude of the user
    """

    belongs_to_user = dataframe['user_id'] == unique_id
    user_rows = dataframe.loc[belongs_to_user]

    return user_rows['lon'], user_rows['lat']


# Layout settings of the per-user GPS subplots
plot_args = dict(
    width=1400,
    height=1000,
    title_text='Visualization of GPS coordinates per user',
    title_x=0.5,
    x_title='Longitude',
    y_title='Latitude',
)

# Draw one scatter subplot of GPS coordinates per user with our custom
# subplot function
chart_subplot(
    graph=go.Scatter,
    dataframe=gps_df,
    generate_data=generate_user_coordinate,
    n_columns=3,
    plot_args=plot_args,
    img_name='gps_per_user.png',
)

### 2.4.3 - Distance travelled

In [None]:
# Helper producing, for one user, the distance travelled between consecutive measures
def generate_user_distance_travelled(dataframe, unique_id):
    """
    This function is used to generate the distance travelled between two measures for one user.

    The great-circle (haversine) distance is computed between each point and the
    previous one, on a sphere of radius 6371 km. Users with zero or one record
    are supported (the previous implementation raised an IndexError for them).

    Args :
        dataframe (pandas.Dataframe):
            The dataframe containing the GPS coordinates of all the users.
            `lat` and `lon` are read as degrees. The dataframe is left untouched.

        unique_id (string):
            The unique id of a user

    Returns :
        user_df.timestamp_client (pandas.Series) :
            A series containing the timestamp of the user

        user_df.distance_travelled (pandas.Series):
            A series containing the distance travelled (km) since the previous
            record of the user; 0.0 for the first record
    """

    earth_radius_km = 6371

    # Filter first, then copy: only the rows of this user are duplicated and
    # the new column is written on an independent frame.
    user_df = dataframe.loc[dataframe.user_id == unique_id].copy()

    lat = np.radians(user_df['lat'].to_numpy(dtype=float))
    lon = np.radians(user_df['lon'].to_numpy(dtype=float))

    # Haversine formula applied to every pair of consecutive points at once.
    # Slicing (rather than an explicit loop) also copes with 0 or 1 point:
    # the arrays below are then simply empty.
    half_dlat = (lat[1:] - lat[:-1]) / 2
    half_dlon = (lon[1:] - lon[:-1]) / 2
    hav = np.sin(half_dlat) ** 2 + np.cos(lat[:-1]) * np.cos(lat[1:]) * np.sin(half_dlon) ** 2
    # Guard against rounding pushing the value marginally outside [0, 1]
    hav = np.clip(hav, 0.0, 1.0)

    # The first record has no predecessor, so its distance stays at 0.0
    distance_travelled = np.zeros(len(user_df))
    distance_travelled[1:] = 2 * np.arcsin(np.sqrt(hav)) * earth_radius_km

    user_df['distance_travelled'] = distance_travelled

    return user_df.timestamp_client, user_df.distance_travelled


# Layout settings of the per-user distance subplots
plot_args = dict(
    width=1400,
    height=1000,
    title_text='Distribution of distance travelled between two points',
    title_x=0.5,
    x_title='Timestamp',
    y_title='Distance Travelled (km)',
)

# Draw the distance travelled between two points for each user with our
# custom subplot function
chart_subplot(
    graph=go.Scatter,
    dataframe=gps_df,
    generate_data=generate_user_distance_travelled,
    n_columns=3,
    plot_args=plot_args,
    img_name='distance_travelled.png',
)

In [None]:
# Descriptive Statistics on distance travelled
# Start from `gps_df` (the clean dataset loaded from the pickle in section 2)
# rather than `df`: `df` only exists when section 1 has been run in the same
# session, so resuming the notebook from the pickle raised a NameError here.
gps_df = generate_distance_travelled(gps_df)
distance_travelled = gps_df['distance_travelled']

print('Distance travelled mean : {} km'.format(distance_travelled.mean()))
print('Distance travelled median : {} km'.format(distance_travelled.median()))
print('Distance travelled standard deviation : {} km'.format(distance_travelled.std()))

## 2.5 - Map view

In [None]:
# Plot a map view of the GPS coordinates of each user
# NOTE(review): the second argument is presumably the name of the exported
# image, like `img_name` in the chart helpers - confirm against map_view
map_view(gps_df, 'map_view.png')