# Machine Learning

We will use all the prior order information to generate features, and use the train data to create the target variables.

From exploratory analysis, we learned that purchase pattern at the department level is pretty consistence, and the reorder pattern are similar to orders created by users before. Also, reorder on average fall under 15 days for new clients, and gradually decrease over time to 2-4 days. Hence, we will test the following features
- product id, and number of re-ordered
- product that fall under the last purchases, and days since last order
- what number of order

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.classification import *
sns.set_style('ticks')

  import pandas.util.testing as tm


In [2]:
aisles = pd.read_csv('./data/aisles.csv')
dept = pd.read_csv('./data/departments.csv')
orders = pd.read_csv('./data/orders.csv')
products = pd.read_csv('./data/products.csv')
orders_p = pd.read_csv('./data/order_products__prior.csv')
orders_tr = pd.read_csv('./data/order_products__train.csv')

In [3]:
prior_order = orders.query('eval_set == "prior"')
train_order = orders.query('eval_set == "train"')
test_order = orders.query('eval_set == "test"')

In [12]:
orders.shape

(3421083, 7)

In [13]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [15]:
orders_p.shape

(32434489, 4)

In [16]:
orders_tr.shape

(1384617, 4)

#### Getting product details

In [4]:
product_full_detail = pd.merge(products, dept, on='department_id').merge(aisles, on='aisle_id')

### Obtain a list of last purchase order by user

In [5]:
last_purchase = prior_order[prior_order['order_number'] == prior_order.groupby(['user_id'])['order_number'].transform('max')]
last_purchase_list = last_purchase['order_id'].tolist()

In [6]:
last_purchase.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
9,2550362,1,prior,10,4,8,30.0
24,839880,2,prior,14,3,10,13.0
37,1402502,3,prior,12,1,15,15.0
43,2557754,4,prior,5,5,13,0.0
48,157374,5,prior,4,1,18,19.0


### Feature Creations

- last purchase, whether a particular product was purchased in the latest order per user
- get the duration since last purchase (will be zero if the product didn't exist in latest orders)
- get day of week of the purchase
- get aisle_id and department_id

In [7]:
order_p_detail = orders_p.merge(prior_order[['order_id','user_id','order_number']], on=['order_id'], how='left')
order_p_detail.loc[order_p_detail['order_id'].isin(last_purchase_list), 'last_purchase'] = 1
order_p_detail.loc[~order_p_detail['order_id'].isin(last_purchase_list), 'last_purchase'] = 0
order_p_detail = order_p_detail.merge(last_purchase[['order_id', 'days_since_prior_order', 'order_dow']], on=['order_id'], how='left').fillna(0)
order_p_detail = order_p_detail.merge(product_full_detail[['product_id', 'aisle_id', 'department_id']], on=['product_id'], how='left')

- get purchases times by product
- get the nth purchase number by users

In [8]:
user_df = order_p_detail.groupby(['user_id', 'product_id', 'aisle_id','department_id']).agg({'order_number':'max', 'add_to_cart_order': 'count', 'last_purchase':'max', 'days_since_prior_order':['max', 'mean']}).reset_index()

In [9]:
user_df.columns = ['user_id','product_id','aisle_id','department_id','order_number','add_to_cart_order','last_purchase','days_since_prior_order', 'avg_day_order']

In [10]:
user_df.head()

Unnamed: 0,user_id,product_id,aisle_id,department_id,order_number,add_to_cart_order,last_purchase,days_since_prior_order,avg_day_order
0,1,196,77,7,10,10,1.0,30.0,3.0
1,1,10258,117,19,10,9,1.0,30.0,3.333333
2,1,10326,24,4,5,1,0.0,0.0,0.0
3,1,12427,23,19,10,10,1.0,30.0,3.0
4,1,13032,121,14,10,3,1.0,30.0,10.0


In [11]:
user_df.shape

(13307953, 9)

In [66]:
train_id = train_order.user_id.unique().tolist()

In [67]:
user_df_test = user_df.loc[user_df['user_id'].isin(train_id)]

In [68]:
user_df_test

Unnamed: 0,user_id,product_id,aisle_id,department_id,order_number,add_to_cart_order,last_purchase,days_since_prior_order,avg_day_order
0,1,196,77,7,10,10,1.0,30.0,3.000000
1,1,10258,117,19,10,9,1.0,30.0,3.333333
2,1,10326,24,4,5,1,0.0,0.0,0.000000
3,1,12427,23,19,10,10,1.0,30.0,3.000000
4,1,13032,121,14,10,3,1.0,30.0,10.000000
...,...,...,...,...,...,...,...,...,...
13307948,206209,43961,123,4,12,3,0.0,0.0,0.000000
13307949,206209,44325,131,9,7,1,0.0,0.0,0.000000
13307950,206209,48370,54,17,11,1,0.0,0.0,0.000000
13307951,206209,48697,19,13,7,1,0.0,0.0,0.000000


In [69]:
train_target = pd.merge(train_order[['order_id', 'user_id']], orders_tr, on=['order_id'], how='left').drop(['order_id','add_to_cart_order'], axis=1)
train_target = train_target.rename(columns={'reordered': 'target'})

In [70]:
train_target.head()

Unnamed: 0,user_id,product_id,target
0,1,196,1
1,1,25133,1
2,1,38928,1
3,1,26405,1
4,1,39657,1


In [71]:
train_df = pd.merge(user_df_test, train_target, on=['user_id', 'product_id'], how='outer').fillna(0).set_index(['user_id','product_id'])

In [72]:
train_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,aisle_id,department_id,order_number,add_to_cart_order,last_purchase,days_since_prior_order,avg_day_order,target
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,196,77.0,7.0,10.0,10.0,1.0,30.0,3.0,1.0
1,10258,117.0,19.0,10.0,9.0,1.0,30.0,3.333333,1.0
1,10326,24.0,4.0,5.0,1.0,0.0,0.0,0.0,0.0
1,12427,23.0,19.0,10.0,10.0,1.0,30.0,3.0,0.0
1,13032,121.0,14.0,10.0,3.0,1.0,30.0,10.0,1.0


In [73]:
train_df = train_df.astype({'target':'int', 'last_purchase':'str'})

In [None]:
# train_df[['aisle_id', 'department_id']] = train_df[['aisle_id', 'department_id']].astype('str')

In [74]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9030454 entries, (1, 196) to (206209, 37966)
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   aisle_id                float64
 1   department_id           float64
 2   order_number            float64
 3   add_to_cart_order       float64
 4   last_purchase           object 
 5   days_since_prior_order  float64
 6   avg_day_order           float64
 7   target                  int32  
dtypes: float64(6), int32(1), object(1)
memory usage: 587.0+ MB


In [75]:
%%time
exp = setup(train_df, target='target')

IntProgress(value=0, description='Processing: ', max=13)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
aisle_id,Numeric
department_id,Numeric
order_number,Numeric
add_to_cart_order,Numeric
last_purchase,Categorical
days_since_prior_order,Numeric
avg_day_order,Numeric
target,Label


 


MemoryError: Unable to allocate 17.2 MiB for an array with shape (4515227,) and data type int32

In [30]:
%%time
compare_models(fold=2, blacklist=['knn', 'ridge', 'ada', 'et', 'lda', 'svm','rf','gbc', 'catboost'])

Wall time: 20min 1s


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Extreme Gradient Boosting,0.9322,0.8949,0.68,0.5678,0.6188,0.582
1,Light Gradient Boosting Machine,0.9322,0.897,0.68,0.5678,0.6188,0.582
2,Decision Tree Classifier,0.9272,0.8321,0.6944,0.5391,0.607,0.5676
3,Logistic Regression,0.9239,0.8344,0.2474,0.568,0.3446,0.3108
4,Quadratic Discriminant Analysis,0.922,0.8492,0.4623,0.5206,0.4897,0.4477
5,Naive Bayes,0.908,0.8188,0.4718,0.4365,0.4535,0.4033


##### Based on above results, create model for top 3 algorithms and bottom one. Save off as pkl for ease of access later.

In [34]:
%%time
naiveb = create_model('nb', fold=2)
pred_naive = predict_model(naiveb)
save_model(naiveb, model_name='./data/naive_b')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Naive Bayes,0.9081,0.8193,0.4717,0.4374,0.4539,0.4038


Transformation Pipeline and Model Succesfully Saved
Wall time: 58.3 s


In [35]:
%%time
xgb = create_model('xgboost', fold=2)
pred_xgb = predict_model(xgb)
save_model(xgb, model_name='./data/xgb')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Extreme Gradient Boosting,0.9323,0.8948,0.682,0.5681,0.6198,0.583


Transformation Pipeline and Model Succesfully Saved
Wall time: 8min 44s


In [37]:
%%time
lightgbm = create_model('lightgbm', fold=2)
pred_light = predict_model(lightgbm)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Light Gradient Boosting Machine,0.9323,0.8971,0.682,0.5681,0.6198,0.583


TypeError: save_model() got multiple values for argument 'model'

In [38]:
save_model(lightgbm, model_name='./data/light')

Transformation Pipeline and Model Succesfully Saved


In [39]:
decisiontree = create_model('dt', fold=2)
pred_dt = predict_model(decisiontree)
save_model(decisiontree, model_name='./data/dt')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Decision Tree Classifier,0.9287,0.8431,0.6952,0.5466,0.612,0.5733


Transformation Pipeline and Model Succesfully Saved


##### Create separate formatted test dataset for ease of access later.

In [40]:
test_id = test_order.user_id.unique().tolist()
df_for_submit = user_df.loc[user_df['user_id'].isin(test_id)].set_index(['user_id','product_id'])

In [42]:
df_for_submit.to_csv('./data/df_submit.csv')