<a href="https://colab.research.google.com/github/pszachew/Delivery-Prediction/blob/master/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [274]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime
from imblearn.over_sampling import SMOTE

In [263]:
def get_preprocess():
  deliveries = pd.read_json('https://raw.githubusercontent.com/pszachew/Delivery-Prediction/master/data_v2/deliveries.jsonl', lines=True)
  sessions = pd.read_json('https://raw.githubusercontent.com/pszachew/Delivery-Prediction/master/data_v2/sessions.jsonl', lines=True)
  products = pd.read_json('https://raw.githubusercontent.com/pszachew/Delivery-Prediction/master/data_v2/products.jsonl', lines=True)
  users = pd.read_json('https://raw.githubusercontent.com/pszachew/Delivery-Prediction/master/data_v2/users.jsonl', lines=True)
  ls=['monday', 'tuesday','wednesday', 'thursday', 'friday', 'saturday', 'sunday']
  deliveries['delivery_timestamp'] = deliveries['delivery_timestamp'].apply(lambda x: pd.to_datetime(x))
  deliveries['delivery_weekday'] = deliveries['delivery_timestamp'].apply(lambda x: ls[x.weekday()])
  deliveries['delivery_day'] = deliveries['delivery_timestamp'].apply(lambda x: datetime.datetime(x.year, x.month, x.day))
  deliveries['purchase_timestamp'] = deliveries['purchase_timestamp'].apply(lambda x: pd.to_datetime(x))
  deliveries['purchase_day'] = deliveries['purchase_timestamp'].apply(lambda x: datetime.datetime(x.year, x.month, x.day))
  deliveries['diff_days'] = (deliveries['delivery_day'] - deliveries['purchase_day']).apply(lambda x: x.days)
  deliveries['purchase_morning'] = deliveries['purchase_timestamp'].apply(lambda x: 1 if x.hour<=14 else 0)
  all_info = deliveries.merge(sessions.merge(products, on='product_id', how='left'), on='purchase_id', how='left')
  all_info = all_info.merge(users, on='user_id', how='left')
  dec_var = ["delivery_company", 'diff_days', 'purchase_morning', 'city', 'delivery_weekday']
  all_info = all_info.drop(all_info.index[all_info['diff_days'].isin([0,5])], axis=0)
  all_info = all_info[dec_var]
  city_dum = pd.get_dummies(all_info['city'], drop_first=True)
  weekday_dum = pd.get_dummies(all_info['delivery_weekday'], drop_first=True)
  delivery_company_dum = pd.get_dummies(all_info['delivery_company'], drop_first=True)
  delivery_company_dum = delivery_company_dum.rename(columns={360:'delivery_360', 254:'delivery_254', 516:'delivery_516', 620:'delivery_620'})
  all_info.drop('delivery_company', axis=1, inplace=True)
  all_info.drop('city', axis=1, inplace=True)
  all_info.drop('delivery_weekday', axis=1, inplace=True)
  final = pd.concat([all_info, city_dum, delivery_company_dum, weekday_dum], axis=1)
  return final.drop('diff_days', axis=1), final['diff_days']

In [264]:
X, y = get_preprocess()

In [211]:
X

Unnamed: 0,purchase_morning,Kraków,Poznań,Radom,Szczecin,Warszawa,Wrocław,delivery_360,delivery_516,delivery_620,friday,monday,saturday,thursday,tuesday,wednesday
0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0
3,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0
4,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13755,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0
13756,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0
13757,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0
13758,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0


In [234]:
X

Unnamed: 0,purchase_morning,Kraków,Poznań,Radom,Szczecin,Warszawa,Wrocław,delivery_360,delivery_516,delivery_620,friday,monday,saturday,thursday,tuesday,wednesday
0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0
3,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0
4,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13755,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0
13756,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0
13757,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0
13758,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0


In [315]:
oversample = SMOTE(k_neighbors=10, sampling_strategy='all')

In [316]:
over_X, over_y = oversample.fit_resample(X,y)

0        3
1        2
2        3
3        2
4        3
        ..
24627    4
24628    4
24629    4
24630    4
24631    4
Name: diff_days, Length: 24632, dtype: int64

## **Logistic Regression**

In [236]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [237]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [238]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pre = model.predict(X_test)
print(accuracy_score(y_test, y_pre))
print(classification_report(y_test, y_pre))

0.7018440346589647
              precision    recall  f1-score   support

           1       0.69      0.55      0.61       556
           2       0.66      0.71      0.68      2028
           3       0.75      0.74      0.75      1917

    accuracy                           0.70      4501
   macro avg       0.70      0.67      0.68      4501
weighted avg       0.70      0.70      0.70      4501



In [220]:
cm = confusion_matrix(y_test, y_pre)

In [221]:
print(cm)

[[ 305  244    7    0]
 [ 129 1433  456   10]
 [   5  479 1082   45]
 [   0   12  252   42]]


## **SVM**

In [265]:
from sklearn.svm import SVC

In [266]:
kernel = 'poly'

In [267]:
mod_svm = SVC(kernel=kernel)
mod_svm.fit(X_train, y_train)
accuracy_score(y_test, mod_svm.predict(X_test))

0.7191735169962231

In [268]:
print(classification_report(y_test, mod_svm.predict(X_test)))

              precision    recall  f1-score   support

           1       0.71      0.56      0.63       556
           2       0.69      0.71      0.70      2028
           3       0.75      0.77      0.76      1917

    accuracy                           0.72      4501
   macro avg       0.72      0.68      0.70      4501
weighted avg       0.72      0.72      0.72      4501



## **KNN**

In [226]:
from sklearn.neighbors import KNeighborsClassifier

In [230]:
neigh = KNeighborsClassifier(n_neighbors = 5)

In [231]:
neigh.fit(X_train, y_train)

KNeighborsClassifier()

In [232]:
print(classification_report(y_test, neigh.predict(X_test)))

              precision    recall  f1-score   support

           1       0.61      0.53      0.56       556
           2       0.63      0.68      0.65      2028
           3       0.60      0.58      0.59      1611
           4       0.41      0.34      0.37       306

    accuracy                           0.60      4501
   macro avg       0.56      0.53      0.55      4501
weighted avg       0.60      0.60      0.60      4501



## **MLPClassifier**

In [317]:
from sklearn.neural_network import MLPClassifier

In [318]:
X_train, X_test, y_train, y_test = train_test_split(over_X, over_y, test_size=0.33, random_state=1)

In [319]:
MLPclass = MLPClassifier(hidden_layer_sizes=(16,16), random_state=10, activation='relu')
MLPclass.fit(X_train, y_train)
y_pred = MLPclass.predict(X_test)



In [320]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.78      0.88      0.83      2027
           2       0.65      0.53      0.59      2124
           3       0.61      0.51      0.56      2000
           4       0.71      0.89      0.79      1978

    accuracy                           0.70      8129
   macro avg       0.69      0.70      0.69      8129
weighted avg       0.69      0.70      0.69      8129



In [289]:
confusion_matrix(y_test, y_pred)

array([[1773,  203,   41,   10],
       [ 444, 1087,  415,  178],
       [  72,  330, 1058,  540],
       [   2,   11,  225, 1740]])