# Description
Given the details of a Sendy order, this model will use historic data to predict an accurate time for the arrival of the rider at the destination of a package.

In [1]:
import numpy as np
import pandas as pd

# Importing necessary libraries for data wrangling, manipulation and visualization

To begin, we will create a basic regression model with the scikit-learn library and the dataset provided by the Sendy Logistics Challenge on Zindi. The data can be found __[here.](https://zindi.africa/competitions/sendy-logistics-challenge/data)__

In [49]:
# This base version of our model will attempt to find a linear relationship within the dataset.
# First, we import the necessary part of the scikit-learn library and create the estimator,
# then load the dataset into a Pandas DataFrame.
from sklearn.linear_model import LinearRegression

# Ordinary least-squares estimator; it is only instantiated here and fitted in a later cell.
# Without this definition the later `eta_model.fit(...)` cell fails on a fresh kernel.
eta_model = LinearRegression()

# Train.csv is read relative to the notebook's working directory.
training_data = pd.read_csv("Train.csv")
training_data.head()

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Arrival at Destination - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival
0,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,...,10:39:55 AM,4,20.4,,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745
1,Order_No_25375,User_Id_2285,Bike,3,Personal,12,5,11:16:16 AM,12,5,...,12:17:22 PM,16,26.4,,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993
2,Order_No_1899,User_Id_265,Bike,3,Business,30,2,12:39:25 PM,30,2,...,1:00:38 PM,3,,,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155,455
3,Order_No_9336,User_Id_1402,Bike,3,Business,15,5,9:25:34 AM,15,5,...,10:05:27 AM,9,19.2,,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855,1341
4,Order_No_27883,User_Id_1737,Bike,1,Personal,13,1,9:55:18 AM,13,1,...,10:25:37 AM,9,15.4,,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770,1214


Next, we will train our model on the single variable that currently has the highest
correlation with our target variable (the total time from pickup to arrival at the
destination): the total distance in kilometers, based on road distance.

In [39]:
# Fit the baseline on distance alone and show the learned slope
# (change in the target per additional kilometre).
distance_feature = training_data[['Distance (KM)']]
arrival_duration = training_data['Time from Pickup to Arrival']
eta_model.fit(X=distance_feature, y=arrival_duration)
eta_model.coef_

array([101.11500695])

In [50]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train, test = train_test_split(training_data, random_state=12345, test_size=0.2)

n_train_rows, n_test_rows = len(train), len(test)
print('Training set has', n_train_rows, 'rows')
print('Test set has', n_test_rows, 'rows')

Training set has 16960 rows
Test set has 4241 rows


It's a start, but there's plenty of bias present with a dataset as large as this one. Let's begin to think of other ways to paint a clearer relationship, using scaling and additional features.

In [46]:
# Refit the baseline on the training split only before predicting the hold-out rows.
# The earlier fit used every row of training_data, which includes the rows that are now
# in `test`, so predicting with that fit would not be an out-of-sample prediction.
eta_model.fit(X=train[['Distance (KM)']], y=train['Time from Pickup to Arrival'])
d_pred = eta_model.predict(X=test[['Distance (KM)']])

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Pickup - Weekday (Mo = 1),Pickup - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id
0,Order_No_19248,User_Id_3355,Bike,3,Business,27,3,4:44:10 PM,27,3,...,3,5:06:47 PM,8,,,-1.333275,36.870815,-1.305249,36.82239,Rider_Id_192
1,Order_No_12736,User_Id_3647,Bike,3,Business,17,5,12:57:35 PM,17,5,...,5,1:25:37 PM,5,,,-1.272639,36.794723,-1.277007,36.823907,Rider_Id_868
2,Order_No_768,User_Id_2154,Bike,3,Business,27,4,11:08:14 AM,27,4,...,4,11:57:54 AM,5,22.8,,-1.290894,36.822971,-1.276574,36.851365,Rider_Id_26
3,Order_No_15332,User_Id_2910,Bike,3,Business,17,1,1:51:35 PM,17,1,...,1,2:16:52 PM,5,24.5,,-1.290503,36.809646,-1.303382,36.790658,Rider_Id_685
4,Order_No_21373,User_Id_1205,Bike,3,Business,11,2,11:30:28 AM,11,2,...,2,11:56:04 AM,6,24.4,,-1.281081,36.814423,-1.266467,36.792161,Rider_Id_858


In [53]:
from sklearn.neighbors import KNeighborsRegressor
# Rebind eta_model to a k-nearest-neighbours regressor built with scikit-learn's default settings.
eta_model = KNeighborsRegressor()

In [54]:
# Fit the current estimator on the training split, with distance as the only feature.
train_distance = train[['Distance (KM)']]
train_duration = train['Time from Pickup to Arrival']
eta_model.fit(X=train_distance, y=train_duration)

KNeighborsRegressor()

In [55]:
# Predict the target for every row of the hold-out split from its distance.
test_distance = test[['Distance (KM)']]
d_pred = eta_model.predict(X=test_distance)

In [56]:
# Display the predictions produced for the hold-out split.
d_pred

array([ 953. , 1260.6, 1837.2, ...,  444.4, 1837.2, 2680.2])

In [22]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [58]:
# Mean squared error of the predictions in d_pred against the observed hold-out values.
observed_duration = test['Time from Pickup to Arrival']
mean_squared_error(observed_duration, d_pred)

747717.1665644895

In [59]:
# Mean absolute error of the predictions in d_pred against the observed hold-out values.
observed_duration = test['Time from Pickup to Arrival']
mean_absolute_error(observed_duration, d_pred)

616.4966281537372

In [None]:
# Trying different values of K (number of neighbours) to see how the mean absolute error changes



In [63]:
# Fit one k-nearest-neighbours regressor per neighbourhood size on the training split
# and predict the hold-out split, using distance as the only feature.
neighbor_counts = (1, 3, 5, 50)
train_distance = train[['Distance (KM)']]
train_duration = train['Time from Pickup to Arrival']
test_distance = test[['Distance (KM)']]

fitted_knns = {}
knn_predictions = {}
for k in neighbor_counts:
    regressor = KNeighborsRegressor(n_neighbors=k)
    regressor.fit(X=train_distance, y=train_duration)
    fitted_knns[k] = regressor
    knn_predictions[k] = regressor.predict(X=test_distance)

# Keep the original per-K names, which the reporting cell reads.
knn1, knn3, knn5, knn50 = (fitted_knns[k] for k in neighbor_counts)
y_pred1, y_pred3, y_pred5, y_pred50 = (knn_predictions[k] for k in neighbor_counts)

In [64]:
# Report the mean absolute error on the hold-out split for each K tried above.
# The header reads 'MAE' because every value below comes from mean_absolute_error;
# the previous 'MSE' header mislabelled the metric.
observed_duration = test['Time from Pickup to Arrival']
print('MAE')
for k_label, predictions in (
    ('K = 1\t', y_pred1),
    ('K = 3\t', y_pred3),
    ('K = 5\t', y_pred5),
    ('K = 50\t', y_pred50),
):
    print(k_label, mean_absolute_error(y_true=observed_duration, y_pred=predictions))

MSE
K = 1	 638.8592313133695
K = 3	 606.9334276507112
K = 5	 616.4966281537372
K = 50	 556.5961471351098


In [None]:
# error did not change a lot is still significantly higher, in fact as the K increased the 
