<a href="https://colab.research.google.com/github/pszachew/Delivery-Prediction/blob/master/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime

In [97]:
def _parse_timestamps(column):
  """Parse a Series of timestamp values into a datetime64 Series."""
  # Values are parsed one at a time, exactly like the original row-wise
  # ``apply``; the outer call only guarantees a datetime64 dtype (also for an
  # empty column) so the ``.dt`` accessor is always available afterwards.
  return pd.to_datetime(column.map(pd.to_datetime))


def get_preprocess(data_url='https://raw.githubusercontent.com/pszachew/Delivery-Prediction/master/data_v2'):
  """Load the raw delivery data and build the feature matrix and the target.

  Reads ``deliveries.jsonl``, ``sessions.jsonl``, ``products.jsonl`` and
  ``users.jsonl`` from ``data_url``, derives the delivery time in whole
  calendar days, joins the four tables and one-hot encodes the categorical
  columns.

  Parameters
  ----------
  data_url : str, optional
      Base URL or local directory that contains the four JSON-lines files.
      Defaults to the project's ``data_v2`` folder on GitHub, so calling the
      function without arguments behaves as before.

  Returns
  -------
  X : pandas.DataFrame
      ``purchase_morning`` followed by 0/1 dummy columns for ``city`` (named
      after the city) and ``delivery_company`` (named ``delivery_<id>``).
      The first level of each categorical is dropped.
  y : pandas.Series
      ``diff_days``: number of calendar days between purchase and delivery.
  """
  base = str(data_url).rstrip('/')
  deliveries, sessions, products, users = (
      pd.read_json(f'{base}/{name}.jsonl', lines=True)
      for name in ('deliveries', 'sessions', 'products', 'users')
  )

  # A row without both timestamps has no computable label; drop it up front
  # instead of failing while the dates are being processed.
  deliveries = deliveries.dropna(subset=['purchase_timestamp', 'delivery_timestamp'])

  purchased = _parse_timestamps(deliveries['purchase_timestamp'])
  delivered = _parse_timestamps(deliveries['delivery_timestamp'])
  # Compare calendar dates (midnight-normalised), not elapsed hours.
  purchase_day = purchased.dt.normalize()
  delivery_day = delivered.dt.normalize()
  deliveries = deliveries.assign(
      delivery_timestamp=delivered,
      delivery_day=delivery_day,
      purchase_timestamp=purchased,
      purchase_day=purchase_day,
      diff_days=(delivery_day - purchase_day).dt.days,
      # 1 when the purchase hour is 14 or earlier (i.e. up to 14:59), else 0.
      purchase_morning=(purchased.dt.hour <= 14).astype('int64'),
  )

  session_products = sessions.merge(products, on='product_id', how='left')
  all_info = (
      deliveries
      .merge(session_products, on='purchase_id', how='left')
      .merge(users, on='user_id', how='left')
  )

  # Deliveries taking 0 or 5 days are excluded from the data set.
  # NOTE(review): the notebook does not say why - presumably rare classes; confirm.
  excluded_days = [0, 5]
  keep = ~all_info['diff_days'].isin(excluded_days)
  features = all_info.loc[keep, ['delivery_company', 'diff_days', 'purchase_morning', 'city']]

  # dtype is pinned so the dummies are 0/1 integers regardless of the pandas
  # version's default.
  city_dum = pd.get_dummies(features['city'], drop_first=True, dtype='uint8')
  # ``prefix`` names every company column ``delivery_<id>``, including IDs
  # that were not known when the notebook was written.
  delivery_company_dum = pd.get_dummies(
      features['delivery_company'], prefix='delivery', drop_first=True, dtype='uint8'
  )

  X = pd.concat([features[['purchase_morning']], city_dum, delivery_company_dum], axis=1)
  return X, features['diff_days']

In [162]:
# Build the feature matrix X and the delivery-time target y; get_preprocess reads the raw data on every call.
X, y = get_preprocess()

In [184]:
# Cap the target at 3: every value above 3 is folded into class 3.
y = y.clip(upper=3)
y

0        3
1        2
2        3
3        2
4        3
        ..
13755    3
13756    2
13757    3
13758    2
13759    3
Name: diff_days, Length: 13639, dtype: int64

In [182]:
# Preview the feature matrix returned by get_preprocess.
X

Unnamed: 0,purchase_morning,Kraków,Poznań,Radom,Szczecin,Warszawa,Wrocław,delivery_360,delivery_516,delivery_620
0,1,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,0,1,0
3,1,0,0,0,0,1,0,1,0,0
4,1,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
13755,1,0,0,0,1,0,0,0,1,0
13756,1,0,0,0,1,0,0,0,0,1
13757,0,0,0,0,1,0,0,0,0,1
13758,1,0,0,0,0,0,0,0,0,1


In [183]:
# Show the capped target once more (same content as the output of the capping cell).
y

0        3
1        2
2        3
3        2
4        3
        ..
13755    3
13756    2
13757    3
13758    2
13759    3
Name: diff_days, Length: 13639, dtype: int64

## **Logistic Regression**

In [195]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [196]:
# Hold out 33% of the rows for evaluation; random_state=1 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [197]:
# Baseline classifier: logistic regression with scikit-learn's default settings.
model = LogisticRegression().fit(X_train, y_train)
y_pre = model.predict(X_test)

# Overall accuracy first, then the per-class precision / recall / F1 table.
print(accuracy_score(y_true=y_test, y_pred=y_pre))
print(classification_report(y_true=y_test, y_pred=y_pre))

0.6631859586758498
              precision    recall  f1-score   support

           1       0.70      0.53      0.60       556
           2       0.61      0.72      0.66      2028
           3       0.73      0.64      0.68      1917

    accuracy                           0.66      4501
   macro avg       0.68      0.63      0.65      4501
weighted avg       0.67      0.66      0.66      4501



In [107]:
# Confusion matrix of the logistic-regression predictions (true labels first, predictions second).
cm = confusion_matrix(y_test, y_pre)

In [59]:
# NOTE(review): the saved output of this cell is 4x4, while the classification
# report above lists only classes 1-3, and the execution counts of the
# confusion-matrix cells (107, 59) are lower than that of the cell capping y
# (184). The printed matrix therefore looks stale - re-run after a kernel restart.
print(cm)

[[ 295  255    6    0]
 [ 112 1491  425    0]
 [  14  650  947    0]
 [   0   62  244    0]]


## **SVM**

In [148]:
from sklearn.svm import SVC

In [159]:
# Kernel passed to the SVC in the next cell.
kernel = 'poly'

In [160]:
# Train a support-vector classifier with the selected kernel; the cell's
# output is its accuracy on the held-out split.
mod_svm = SVC(kernel=kernel).fit(X_train, y_train)
accuracy_score(y_true=y_test, y_pred=mod_svm.predict(X_test))

0.6974005776494112

In [161]:
# Per-class metrics for the SVM; note that this runs mod_svm.predict(X_test) again
# rather than reusing the predictions computed for the accuracy score.
print(classification_report(y_test, mod_svm.predict(X_test)))

              precision    recall  f1-score   support

           1       0.69      0.61      0.65       556
           2       0.65      0.75      0.69      2028
           3       0.77      0.67      0.72      1917

    accuracy                           0.70      4501
   macro avg       0.70      0.68      0.69      4501
weighted avg       0.71      0.70      0.70      4501



## **KNN**

In [152]:
from sklearn.neighbors import KNeighborsClassifier

In [153]:
# k-nearest-neighbours classifier that votes over the 3 closest training rows.
neigh = KNeighborsClassifier(n_neighbors = 3)

In [154]:
# Fit on the training split; the fitted estimator's repr is the cell output.
neigh.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [155]:
# Per-class precision / recall / F1 of the KNN model on the held-out split.
print(classification_report(y_test, neigh.predict(X_test)))

              precision    recall  f1-score   support

           1       0.58      0.53      0.55       556
           2       0.60      0.65      0.62      2028
           3       0.70      0.66      0.68      1917

    accuracy                           0.64      4501
   macro avg       0.63      0.61      0.62      4501
weighted avg       0.64      0.64      0.64      4501



## **MLPClassifier**

In [185]:
from sklearn.neural_network import MLPClassifier

In [186]:
# Same call (same test_size and random_state) as the split in the Logistic Regression
# section, so with unchanged X and y it reproduces the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [193]:
# Feed-forward network with two hidden layers (128 and 64 units, ReLU);
# random_state is fixed so the weight initialisation is repeatable.
MLPclass = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    random_state=1,
).fit(X_train, y_train)
y_pred = MLPclass.predict(X_test)

In [194]:
# Per-class precision / recall / F1 of the MLP on the held-out split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.69      0.61      0.65       556
           2       0.64      0.78      0.70      2028
           3       0.80      0.63      0.71      1917

    accuracy                           0.70      4501
   macro avg       0.71      0.68      0.69      4501
weighted avg       0.71      0.70      0.70      4501

