# Machine Learning

We will use all the prior order information to generate features, and use the train data to create the target variables.

From exploratory analysis, we learned that purchase pattern at the department level is pretty consistence, and the reorder pattern are similar to orders created by users before. Also, reorder on average fall under 15 days for new clients, and gradually decrease over time to 2-4 days. Hence, we will test the following features
- product id, and number of re-ordered
- product that fall under the last purchases, and days since last order
- what number of order

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.classification import *
sns.set_style('ticks')

  import pandas.util.testing as tm


In [2]:
aisles = pd.read_csv('./data/aisles.csv')
dept = pd.read_csv('./data/departments.csv')
orders = pd.read_csv('./data/orders.csv')
products = pd.read_csv('./data/products.csv')
orders_p = pd.read_csv('./data/order_products__prior.csv')
orders_tr = pd.read_csv('./data/order_products__train.csv')

In [3]:
prior_order = orders.query('eval_set == "prior"')
train_order = orders.query('eval_set == "train"')
test_order = orders.query('eval_set == "test"')

#### Getting product details

In [4]:
product_full_detail = pd.merge(products, dept, on='department_id').merge(aisles, on='aisle_id')

### Obtain a list of last purchase order by user

In [5]:
last_purchase = prior_order[prior_order['order_number'] == prior_order.groupby(['user_id'])['order_number'].transform('max')]
last_purchase_list = last_purchase['order_id'].tolist()

In [19]:
last_purchase.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
9,2550362,1,prior,10,4,8,30.0
24,839880,2,prior,14,3,10,13.0
37,1402502,3,prior,12,1,15,15.0
43,2557754,4,prior,5,5,13,0.0
48,157374,5,prior,4,1,18,19.0


### Feature Creations

- last purchase, whether a particular product was purchased in the latest order per user
- get the duration since last purchase (will be zero if the product didn't exist in latest orders)
- get aisle_id and department_id

In [6]:
order_p_detail = orders_p.merge(prior_order[['order_id','user_id','order_number']], on=['order_id'], how='left')
order_p_detail.loc[order_p_detail['order_id'].isin(last_purchase_list), 'last_purchase'] = 1
order_p_detail.loc[~order_p_detail['order_id'].isin(last_purchase_list), 'last_purchase'] = 0
order_p_detail = order_p_detail.merge(last_purchase[['order_id', 'days_since_prior_order']], on=['order_id'], how='left').fillna(0)
order_p_detail = order_p_detail.merge(product_full_detail[['product_id', 'aisle_id', 'department_id']], on=['product_id'], how='left')

- get purchases times by product
- get the nth purchase number by users

In [7]:
user_df = order_p_detail.groupby(['user_id', 'product_id','aisle_id','department_id']).agg({'order_number':'max', 'add_to_cart_order': 'count', 'reordered':'sum', 'last_purchase':'max', 'days_since_prior_order':'max'}).reset_index()

In [8]:
user_df.head()

Unnamed: 0,user_id,product_id,aisle_id,department_id,order_number,add_to_cart_order,reordered,last_purchase,days_since_prior_order
0,1,196,77,7,10,10,9,1.0,30.0
1,1,10258,117,19,10,9,8,1.0,30.0
2,1,10326,24,4,5,1,0,0.0,0.0
3,1,12427,23,19,10,10,9,1.0,30.0
4,1,13032,121,14,10,3,2,1.0,30.0


In [52]:
user_df.shape

(13307953, 9)

In [29]:
train_order.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
10,1187899,1,train,11,4,8,14.0
25,1492625,2,train,15,1,11,30.0
49,2196797,5,train,5,0,11,6.0
74,525192,7,train,21,2,11,6.0
78,880375,8,train,4,1,14,10.0


In [40]:
train_id = train_order.user_id.unique().tolist()

In [41]:
user_df_test = user_df.loc[user_df['user_id'].isin(train_id)]

In [42]:
user_df_test

Unnamed: 0,user_id,product_id,aisle_id,department_id,order_number,add_to_cart_order,reordered,last_purchase,days_since_prior_order
0,1,196,77,7,10,10,9,1.0,30.0
1,1,10258,117,19,10,9,8,1.0,30.0
2,1,10326,24,4,5,1,0,0.0,0.0
3,1,12427,23,19,10,10,9,1.0,30.0
4,1,13032,121,14,10,3,2,1.0,30.0
...,...,...,...,...,...,...,...,...,...
13307948,206209,43961,123,4,12,3,2,0.0,0.0
13307949,206209,44325,131,9,7,1,0,0.0,0.0
13307950,206209,48370,54,17,11,1,0,0.0,0.0
13307951,206209,48697,19,13,7,1,0,0.0,0.0


In [9]:
train_target = pd.merge(train_order[['order_id', 'user_id']], orders_tr, on=['order_id'], how='left').drop(['order_id','add_to_cart_order'], axis=1)
train_target = train_target.rename(columns={'reordered': 'target'})

In [43]:
train_target.head()

Unnamed: 0,user_id,product_id,target
0,1,196,1
1,1,25133,1
2,1,38928,1
3,1,26405,1
4,1,39657,1


In [45]:
train_df = pd.merge(user_df_test, train_target, on=['user_id', 'product_id'], how='outer').fillna(0).set_index('user_id')

In [46]:
train_df.head()

Unnamed: 0_level_0,product_id,aisle_id,department_id,order_number,add_to_cart_order,reordered,last_purchase,days_since_prior_order,target
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,196,77.0,7.0,10.0,10.0,9.0,1.0,30.0,1.0
1,10258,117.0,19.0,10.0,9.0,8.0,1.0,30.0,1.0
1,10326,24.0,4.0,5.0,1.0,0.0,0.0,0.0,0.0
1,12427,23.0,19.0,10.0,10.0,9.0,1.0,30.0,0.0
1,13032,121.0,14.0,10.0,3.0,2.0,1.0,30.0,1.0


In [51]:
train_df.loc[1]

Unnamed: 0_level_0,product_id,aisle_id,department_id,order_number,add_to_cart_order,reordered,last_purchase,days_since_prior_order,target
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,196,77.0,7.0,10.0,10.0,9.0,1.0,30.0,1
1,10258,117.0,19.0,10.0,9.0,8.0,1.0,30.0,1
1,10326,24.0,4.0,5.0,1.0,0.0,0.0,0.0,0
1,12427,23.0,19.0,10.0,10.0,9.0,1.0,30.0,0
1,13032,121.0,14.0,10.0,3.0,2.0,1.0,30.0,1
1,13176,24.0,4.0,5.0,2.0,1.0,0.0,0.0,0
1,14084,91.0,16.0,1.0,1.0,0.0,0.0,0.0,0
1,17122,24.0,4.0,5.0,1.0,0.0,0.0,0.0,0
1,25133,21.0,16.0,10.0,8.0,7.0,1.0,30.0,1
1,26088,23.0,19.0,2.0,2.0,1.0,0.0,0.0,1


In [48]:
train_df = train_df.astype({'target':'int'})

In [49]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9030454 entries, 1 to 206209
Data columns (total 9 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   product_id              int64  
 1   aisle_id                float64
 2   department_id           float64
 3   order_number            float64
 4   add_to_cart_order       float64
 5   reordered               float64
 6   last_purchase           float64
 7   days_since_prior_order  float64
 8   target                  int32  
dtypes: float64(7), int32(1), int64(1)
memory usage: 654.5 MB


In [50]:
exp1 = setup(train_df, target='target')

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,3571
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(9030454, 9)"
4,Missing Values,False
5,Numeric Features,7
6,Categorical Features,1
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [111]:
compare_models(fold=2)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Light Gradient Boosting Machine,0.9132,0.8011,0.1425,0.6184,0.2316,0.2042
1,Gradient Boosting Classifier,0.9131,0.7975,0.1494,0.607,0.2397,0.2111
2,Extreme Gradient Boosting,0.913,0.797,0.1442,0.611,0.2333,0.2055
3,CatBoost Classifier,0.9129,0.8011,0.1415,0.6111,0.2297,0.2022
4,Ada Boost Classifier,0.9117,0.795,0.1261,0.5884,0.2076,0.1811
5,Ridge Classifier,0.9095,0.0,0.046,0.5858,0.0852,0.0729
6,Linear Discriminant Analysis,0.9076,0.7819,0.1843,0.4903,0.2679,0.2293
7,Logistic Regression,0.9064,0.6316,0.0792,0.4447,0.1343,0.1096
8,K Neighbors Classifier,0.9033,0.6505,0.1013,0.3948,0.1612,0.1285
9,SVM - Linear Kernel,0.8967,0.0,0.1707,0.3734,0.224,0.179


In [53]:
catboost = create_model('catboost', fold=2)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.9128,0.8009,0.1405,0.6077,0.2282,0.2007
1,0.9131,0.8013,0.1425,0.6144,0.2313,0.2038
Mean,0.9129,0.8011,0.1415,0.6111,0.2297,0.2022
SD,0.0001,0.0002,0.001,0.0033,0.0015,0.0016


In [54]:
logistcs = create_model('lr', fold=2)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.9059,0.6357,0.0841,0.4344,0.141,0.1146
1,0.9069,0.6276,0.0742,0.4549,0.1276,0.1046
Mean,0.9064,0.6316,0.0792,0.4447,0.1343,0.1096
SD,0.0005,0.0041,0.005,0.0103,0.0067,0.005


In [55]:
lightgbm = create_model('lightgbm', fold=2)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.9131,0.8011,0.1439,0.614,0.2331,0.2054
1,0.9133,0.8011,0.1411,0.6229,0.23,0.203
Mean,0.9132,0.8011,0.1425,0.6184,0.2316,0.2042
SD,0.0001,0.0,0.0014,0.0045,0.0015,0.0012


In [110]:
light_tuned = tune_model('lightgbm', fold=2)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.9131,0.8014,0.1439,0.6131,0.2331,0.2054
1,0.9133,0.8018,0.1434,0.6193,0.2328,0.2054
Mean,0.9132,0.8016,0.1436,0.6162,0.233,0.2054
SD,0.0001,0.0002,0.0003,0.0031,0.0001,0.0


In [56]:
rfc = create_model('rf', fold=2)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.8964,0.7066,0.162,0.3576,0.223,0.1758
1,0.8963,0.7069,0.1617,0.3564,0.2225,0.1752
Mean,0.8963,0.7067,0.1618,0.357,0.2227,0.1755
SD,0.0001,0.0002,0.0002,0.0006,0.0003,0.0003


In [109]:
rfc_tuned = tune_model('rf', fold=2)

IntProgress(value=0, description='Processing: ', max=11)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa


KeyboardInterrupt: 

In [57]:
test_id = test_order.user_id.unique().tolist()
user_df_submit = user_df.loc[user_df['user_id'].isin(test_id)]

In [58]:
pred = predict_model(rfc, data=user_df_submit)

In [61]:
rfc_submit = pred.loc[pred['Label']==1]

In [63]:
rfc_submit = rfc_submit[['user_id','product_id']]

In [68]:
random_forest = pd.DataFrame(rfc_submit)

In [73]:
random_forest.shape

(219051, 2)

In [99]:
rfc_final = pd.merge(test_order[['order_id', 'user_id']], random_forest, on=['user_id'], how='left')

In [100]:
rfc_final.fillna(0,inplace=True)

In [101]:
rfc_final.drop(columns='user_id', inplace=True)

In [102]:
rfc_final['product_id'] = rfc_final['product_id'].astype('int')

In [103]:
rfc_final.head()

Unnamed: 0,order_id,product_id
0,2774568,22035
1,2774568,39190
2,2774568,47766
3,329954,0
4,1528013,0


In [104]:
def merge_products(x):
    return " ".join(list(x.astype('str')))

In [105]:
rfc_df =  rfc_final.groupby(['order_id'])['product_id'].aggregate(merge_products).reset_index()

In [106]:
rfc_df.columns = ['order_id','products']

In [107]:
rfc_df.shape

(75000, 2)

In [108]:
rfc_df.to_csv("./data/rfc_model.csv", index=False)