In [1]:
import pandas as pd
import numpy as np

In [2]:
import sagemaker
import boto3

# SageMaker session, plus the default bucket and execution role used with it.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# A single boto3 session is reused for the region lookup and for both service
# clients, instead of constructing a fresh Session object for each of them.
boto_session = boto3.Session()
region = boto_session.region_name

sm = boto_session.client(service_name="sagemaker", region_name=region)
s3 = boto_session.client(service_name="s3", region_name=region)

# Data overview

In [3]:
# Directory holding the raw CSV extracts. Defined once so the notebook can be
# pointed at another copy of the data by editing a single line.
DATA_DIR = "./data-clarify"

# Lookup tables (aisle / department / product metadata).
aisles = pd.read_csv(DATA_DIR + "/aisles.csv")
departments = pd.read_csv(DATA_DIR + "/departments.csv")
products = pd.read_csv(DATA_DIR + "/products.csv")

# Order headers and order line items.
orders = pd.read_csv(DATA_DIR + "/orders.csv")
order_products_prior = pd.read_csv(DATA_DIR + "/order_products__prior.csv")
order_products_train = pd.read_csv(DATA_DIR + "/order_products__train.csv")

## Merging train data with orders data and products data

In [40]:
# Build the working table in one chain:
#   1. inner join on order_id keeps only line items whose order exists in `orders`
#   2. left join on product_id adds the product columns without dropping rows
train_df = (
    order_products_train
    .merge(orders, how='inner', on='order_id')
    .merge(products, how='left', on='product_id')
)
train_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id
0,1,49302,1,1,112108,train,4,4,10,9.0,Bulgarian Yogurt,120,16
1,1,11109,2,1,112108,train,4,4,10,9.0,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16
2,1,10246,3,0,112108,train,4,4,10,9.0,Organic Celery Hearts,83,4
3,1,49683,4,0,112108,train,4,4,10,9.0,Cucumber Kirby,83,4
4,1,43633,5,1,112108,train,4,4,10,9.0,Lightly Smoked Sardines in Olive Oil,95,15


## Features creation

Calculate how many times each user has bought each product (a running count per user–product pair).

In [41]:
# cumcount() numbers the rows of each (user_id, product_id) group 0, 1, 2, ...
# in row order; adding 1 turns that into a 1-based running purchase count.
# NOTE(review): every row in the displayed head has the value 1 - confirm
# whether a (user, product) pair can actually repeat within this table.
user_product_groups = train_df.groupby(['user_id', 'product_id'])
train_df['user_buy_product_times'] = user_product_groups.cumcount() + 1
train_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,user_buy_product_times
0,1,49302,1,1,112108,train,4,4,10,9.0,Bulgarian Yogurt,120,16,1
1,1,11109,2,1,112108,train,4,4,10,9.0,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,1
2,1,10246,3,0,112108,train,4,4,10,9.0,Organic Celery Hearts,83,4,1
3,1,49683,4,0,112108,train,4,4,10,9.0,Cucumber Kirby,83,4,1
4,1,43633,5,1,112108,train,4,4,10,9.0,Lightly Smoked Sardines in Olive Oil,95,15,1



## Product level features

(1) Product's average add-to-cart-order

(2) Total times the product was ordered

(3) Total times the product was reordered

(4) Reorder percentage of a product

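(5) Total unique users of a product

(6) Number of the product's line items that are a user's first purchase of it (`order_first_time_total_cnt`)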




In [42]:
# One row per product_id, aggregated over every line item in train_df:
#   mean_add_to_cart_order      mean of add_to_cart_order
#   total_orders                number of line items (count of `reordered`)
#   total_reorders              sum of `reordered`
#   reorder_percentage          mean of `reordered` (a fraction, not multiplied by 100)
#   unique_users                number of distinct user_id values
#   order_first_time_total_cnt  line items where user_buy_product_times == 1
prod_features = (
    train_df
    .groupby('product_id')
    .agg(
        mean_add_to_cart_order=pd.NamedAgg(column='add_to_cart_order', aggfunc='mean'),
        total_orders=pd.NamedAgg(column='reordered', aggfunc='count'),
        total_reorders=pd.NamedAgg(column='reordered', aggfunc='sum'),
        reorder_percentage=pd.NamedAgg(column='reordered', aggfunc='mean'),
        # The built-in 'nunique' reducer replaces a per-group Python lambda.
        unique_users=pd.NamedAgg(column='user_id', aggfunc='nunique'),
        # Series.sum() on the boolean mask is vectorised; the builtin sum()
        # would iterate over the group element by element.
        order_first_time_total_cnt=pd.NamedAgg(
            column='user_buy_product_times',
            aggfunc=lambda times: (times == 1).sum(),
        ),
    )
    # Turn the product_id index back into an ordinary column (no inplace mutation).
    .reset_index()
)




In [43]:
# Preview the first five product-level feature rows.
prod_features.head(n=5)

Unnamed: 0,product_id,mean_add_to_cart_order,total_orders,total_reorders,reorder_percentage,unique_users,order_first_time_total_cnt
0,1,6.921053,76,49,0.644737,76,76
1,2,15.75,4,1,0.25,4,4
2,3,4.5,6,6,1.0,6,6
3,4,8.954545,22,14,0.636364,22,22
4,5,5.0,1,1,1.0,1,1


## Aisle and department features


(7) Reorder percentage, Total orders and reorders of a product aisle

(8) Mean and std of aisle add-to-cart-order

(9) Aisle unique users



In [44]:
# One row per aisle_id: mean/std of add_to_cart_order, count/sum/mean of
# `reordered`, and the number of distinct users seen in the aisle.
aisle_features = (
    train_df
    .groupby('aisle_id')
    .agg(
        aisle_mean_add_to_cart_order=pd.NamedAgg(column='add_to_cart_order', aggfunc='mean'),
        aisle_std_add_to_cart_order=pd.NamedAgg(column='add_to_cart_order', aggfunc='std'),
        aisle_total_orders=pd.NamedAgg(column='reordered', aggfunc='count'),
        aisle_total_reorders=pd.NamedAgg(column='reordered', aggfunc='sum'),
        aisle_reorder_percentage=pd.NamedAgg(column='reordered', aggfunc='mean'),
        # The built-in 'nunique' reducer replaces a per-group Python lambda.
        aisle_unique_users=pd.NamedAgg(column='user_id', aggfunc='nunique'),
    )
    # Turn the aisle_id index back into an ordinary column (no inplace mutation).
    .reset_index()
)

In [45]:
# Preview the first five aisle-level feature rows.
aisle_features.head(n=5)

Unnamed: 0,aisle_id,aisle_mean_add_to_cart_order,aisle_std_add_to_cart_order,aisle_total_orders,aisle_total_reorders,aisle_reorder_percentage,aisle_unique_users
0,1,8.088556,6.804272,2936,1754,0.597411,2611
1,2,9.727085,7.689724,3873,1932,0.498838,3592
2,3,9.730701,7.927592,17449,10429,0.597685,11057
3,4,10.861652,8.278374,9917,5001,0.504286,7880
4,5,10.866437,8.541065,2905,860,0.296041,2711


#### Department features
(10) Reorder percentage, Total orders and reorders of a product department

(11) Mean and std of department add-to-cart-order

(12) Department unique users

In [46]:
# One row per department_id: mean/std of add_to_cart_order, count/sum/mean of
# `reordered`, and the number of distinct users seen in the department.
dpt_features = (
    train_df
    .groupby('department_id')
    .agg(
        department_mean_add_to_cart_order=pd.NamedAgg(column='add_to_cart_order', aggfunc='mean'),
        department_std_add_to_cart_order=pd.NamedAgg(column='add_to_cart_order', aggfunc='std'),
        department_total_orders=pd.NamedAgg(column='reordered', aggfunc='count'),
        department_total_reorders=pd.NamedAgg(column='reordered', aggfunc='sum'),
        department_reorder_percentage=pd.NamedAgg(column='reordered', aggfunc='mean'),
        # The built-in 'nunique' reducer replaces a per-group Python lambda.
        department_unique_users=pd.NamedAgg(column='user_id', aggfunc='nunique'),
    )
    # Turn the department_id index back into an ordinary column (no inplace mutation).
    .reset_index()
)

In [47]:
# Preview the first five department-level feature rows.
dpt_features.head(n=5)

Unnamed: 0,department_id,department_mean_add_to_cart_order,department_std_add_to_cart_order,department_total_orders,department_total_reorders,department_reorder_percentage,department_unique_users
0,1,9.439926,7.732969,100426,56168,0.559297,51071
1,2,8.518106,7.719445,1795,697,0.388301,1725
2,3,8.541679,7.231486,48394,30692,0.634211,36424
3,4,8.431048,6.934668,409087,271886,0.664617,96927
4,5,5.645052,6.363332,5598,3397,0.606824,3105


#### Categorical features

(13) Binary encoding of aisle feature

(14) Binary encoding of department feature



In [48]:
# combine all previous dataframes
prod_features = prod_features.merge(products, on = 'product_id', how = 'left')
prod_features = prod_features.merge(aisle_features, on = 'aisle_id', how = 'left')
prod_features = prod_features.merge(aisles, on = 'aisle_id', how = 'left')
prod_features = prod_features.merge(dpt_features, on = 'department_id', how = 'left')
prod_features = prod_features.merge(departments, on = 'department_id', how = 'left')
prod_features.head()



Unnamed: 0,product_id,mean_add_to_cart_order,total_orders,total_reorders,reorder_percentage,unique_users,order_first_time_total_cnt,product_name,aisle_id,department_id,...,aisle_reorder_percentage,aisle_unique_users,aisle,department_mean_add_to_cart_order,department_std_add_to_cart_order,department_total_orders,department_total_reorders,department_reorder_percentage,department_unique_users,department
0,1,6.921053,76,49,0.644737,76,76,Chocolate Sandwich Cookies,61,19,...,0.564429,8077,cookies cakes,9.562097,7.922739,118862,69102,0.581363,57302,snacks
1,2,15.75,4,1,0.25,4,4,All-Seasons Salt,104,13,...,0.16532,7378,spices seasonings,10.115051,8.131759,81242,29498,0.363088,47599,pantry
2,3,4.5,6,6,1.0,6,6,Robust Golden Unsweetened Oolong Tea,94,7,...,0.52773,7083,tea,7.13826,6.908445,114046,75060,0.658155,61482,beverages
3,4,8.954545,22,14,0.636364,22,22,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,...,0.601504,10780,frozen meals,9.439926,7.732969,100426,56168,0.559297,51071,frozen
4,5,5.0,1,1,1.0,1,1,Green Chile Anytime Sauce,5,13,...,0.296041,2711,marinades meat preparation,10.115051,8.131759,81242,29498,0.363088,47599,pantry


In [49]:
# Remove the product name and the join keys, which are not saved as features.
# The frame is rebound rather than mutated in place, and labels that are
# already gone are ignored, so re-running this cell does not raise KeyError.
prod_features = prod_features.drop(
    columns=['product_name', 'aisle_id', 'department_id'],
    errors='ignore',
)
prod_features.head()

Unnamed: 0,product_id,mean_add_to_cart_order,total_orders,total_reorders,reorder_percentage,unique_users,order_first_time_total_cnt,aisle_mean_add_to_cart_order,aisle_std_add_to_cart_order,aisle_total_orders,...,aisle_reorder_percentage,aisle_unique_users,aisle,department_mean_add_to_cart_order,department_std_add_to_cart_order,department_total_orders,department_total_reorders,department_reorder_percentage,department_unique_users,department
0,1,6.921053,76,49,0.644737,76,76,9.505711,8.010219,9980,...,0.564429,8077,cookies cakes,9.562097,7.922739,118862,69102,0.581363,57302,snacks
1,2,15.75,4,1,0.25,4,4,10.469663,8.02574,9279,...,0.16532,7378,spices seasonings,10.115051,8.131759,81242,29498,0.363088,47599,pantry
2,3,4.5,6,6,1.0,6,6,8.585964,7.730768,9376,...,0.52773,7083,tea,7.13826,6.908445,114046,75060,0.658155,61482,beverages
3,4,8.954545,22,14,0.636364,22,22,9.4778,7.693313,18221,...,0.601504,10780,frozen meals,9.439926,7.732969,100426,56168,0.559297,51071,frozen
4,5,5.0,1,1,1.0,1,1,10.866437,8.541065,2905,...,0.296041,2711,marinades meat preparation,10.115051,8.131759,81242,29498,0.363088,47599,pantry


In [50]:
# (rows, columns) of the product feature table after the merges and the drop.
prod_features.shape

(39123, 21)

In [51]:
# Per-column dtypes of the product feature table.
prod_features.dtypes

product_id                             int64
mean_add_to_cart_order               float64
total_orders                           int64
total_reorders                         int64
reorder_percentage                   float64
unique_users                           int64
order_first_time_total_cnt             int64
aisle_mean_add_to_cart_order         float64
aisle_std_add_to_cart_order          float64
aisle_total_orders                     int64
aisle_total_reorders                   int64
aisle_reorder_percentage             float64
aisle_unique_users                     int64
aisle                                 object
department_mean_add_to_cart_order    float64
department_std_add_to_cart_order     float64
department_total_orders                int64
department_total_reorders              int64
department_reorder_percentage        float64
department_unique_users                int64
department                            object
dtype: object

## User level features 

(15) Total orders by a user

(16) Total products user has bought

(17) Total unique products user has bought

(18) User's total reordered products

(19) User's overall reorder percentage



In [52]:
# One row per user_id, aggregated over every line item in train_df:
#   total_orders_by_user          distinct order_number values
#   total_products_by_user        number of line items (count of product_id)
#   total_unique_product_by_user  distinct product_id values
#   total_reorders_by_user        sum of `reordered`
#   reorder_propotion_by_user     mean of `reordered`
user_features = (
    train_df
    .groupby('user_id')
    .agg(
        # The built-in 'nunique' reducer replaces a per-group Python lambda.
        total_orders_by_user=pd.NamedAgg(column='order_number', aggfunc='nunique'),
        total_products_by_user=pd.NamedAgg(column='product_id', aggfunc='count'),
        total_unique_product_by_user=pd.NamedAgg(column='product_id', aggfunc='nunique'),
        total_reorders_by_user=pd.NamedAgg(column='reordered', aggfunc='sum'),
        # Column name is kept exactly as spelled: it is part of the frame that
        # gets pickled later in this notebook.
        reorder_propotion_by_user=pd.NamedAgg(column='reordered', aggfunc='mean'),
    )
    # Turn the user_id index back into an ordinary column (no inplace mutation).
    .reset_index()
)

In [53]:
# Preview the first five user-level feature rows.
user_features.head(n=5)

Unnamed: 0,user_id,total_orders_by_user,total_products_by_user,total_unique_product_by_user,total_reorders_by_user,reorder_propotion_by_user
0,1,1,11,11,10,0.909091
1,2,1,31,31,12,0.387097
2,5,1,9,9,4,0.444444
3,7,1,9,9,8,0.888889
4,8,1,18,18,4,0.222222


#### Per-order user features

(20) Average order size of a user

(21) User's mean share of reordered items per order



In [54]:
# Per (user_id, order_number) statistics: the number of line items in the
# order and the mean of `reordered` within it. These are averaged per user
# in the following cell.
per_order = train_df.groupby(['user_id', 'order_number'])
user_features2 = per_order.agg(
    average_order_size=('reordered', 'count'),
    reorder_in_order=('reordered', 'mean'),
).reset_index()
user_features2.head(20)

Unnamed: 0,user_id,order_number,average_order_size,reorder_in_order
0,1,11,11,0.909091
1,2,15,31,0.387097
2,5,5,9,0.444444
3,7,21,9,0.888889
4,8,4,18,0.222222
5,9,4,22,1.0
6,10,6,4,0.0
7,13,13,5,0.8
8,14,14,11,0.727273
9,17,41,6,0.5


In [55]:
# Collapse the per-order rows to one row per user by taking the mean of both
# per-order statistics.
mean_of_each = dict.fromkeys(['average_order_size', 'reorder_in_order'], 'mean')
user_features3 = user_features2.groupby('user_id').agg(mean_of_each).reset_index()
user_features3.head()

Unnamed: 0,user_id,average_order_size,reorder_in_order
0,1,11,0.909091
1,2,31,0.387097
2,5,9,0.444444
3,7,9,0.888889
4,8,18,0.222222


In [56]:

# Append the per-order averages to the user table; the left join keeps every
# user row. Note: user_features is rebuilt from itself, so re-running this
# cell alone merges user_features3 a second time.
user_features = pd.merge(user_features, user_features3, how='left', on='user_id')
user_features.head()

Unnamed: 0,user_id,total_orders_by_user,total_products_by_user,total_unique_product_by_user,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order
0,1,1,11,11,10,0.909091,11,0.909091
1,2,1,31,31,12,0.387097,31,0.387097
2,5,1,9,9,4,0.444444,9,0.444444
3,7,1,9,9,8,0.888889,9,0.888889
4,8,1,18,18,4,0.222222,18,0.222222


In [57]:
# Persist the product-level feature table in the same directory as the raw CSVs.
path = "./data-clarify/product_features.pkl"
pd.to_pickle(prod_features, path)

In [58]:
# Persist the user-level feature table in the same directory as the raw CSVs.
path = "./data-clarify/user_features.pkl"
pd.to_pickle(user_features, path)

In [59]:
# Read the product feature pickle back and verify the round trip explicitly,
# rather than relying on a visual check of the first rows.
df = pd.read_pickle("./data-clarify/product_features.pkl")
pd.testing.assert_frame_equal(df, prod_features)
df.head()

Unnamed: 0,product_id,mean_add_to_cart_order,total_orders,total_reorders,reorder_percentage,unique_users,order_first_time_total_cnt,aisle_mean_add_to_cart_order,aisle_std_add_to_cart_order,aisle_total_orders,...,aisle_reorder_percentage,aisle_unique_users,aisle,department_mean_add_to_cart_order,department_std_add_to_cart_order,department_total_orders,department_total_reorders,department_reorder_percentage,department_unique_users,department
0,1,6.921053,76,49,0.644737,76,76,9.505711,8.010219,9980,...,0.564429,8077,cookies cakes,9.562097,7.922739,118862,69102,0.581363,57302,snacks
1,2,15.75,4,1,0.25,4,4,10.469663,8.02574,9279,...,0.16532,7378,spices seasonings,10.115051,8.131759,81242,29498,0.363088,47599,pantry
2,3,4.5,6,6,1.0,6,6,8.585964,7.730768,9376,...,0.52773,7083,tea,7.13826,6.908445,114046,75060,0.658155,61482,beverages
3,4,8.954545,22,14,0.636364,22,22,9.4778,7.693313,18221,...,0.601504,10780,frozen meals,9.439926,7.732969,100426,56168,0.559297,51071,frozen
4,5,5.0,1,1,1.0,1,1,10.866437,8.541065,2905,...,0.296041,2711,marinades meat preparation,10.115051,8.131759,81242,29498,0.363088,47599,pantry


In [60]:
# Read the user feature pickle back and verify the round trip explicitly,
# rather than relying on a visual check of the first rows.
df = pd.read_pickle("./data-clarify/user_features.pkl")
pd.testing.assert_frame_equal(df, user_features)
df.head()

Unnamed: 0,user_id,total_orders_by_user,total_products_by_user,total_unique_product_by_user,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order
0,1,1,11,11,10,0.909091,11,0.909091
1,2,1,31,31,12,0.387097,31,0.387097
2,5,1,9,9,4,0.444444,9,0.444444
3,7,1,9,9,8,0.888889,9,0.888889
4,8,1,18,18,4,0.222222,18,0.222222


In [62]:
# Per-column dtypes of the in-memory product feature table (the same
# expression is also evaluated in an earlier cell of this notebook).
prod_features.dtypes

product_id                             int64
mean_add_to_cart_order               float64
total_orders                           int64
total_reorders                         int64
reorder_percentage                   float64
unique_users                           int64
order_first_time_total_cnt             int64
aisle_mean_add_to_cart_order         float64
aisle_std_add_to_cart_order          float64
aisle_total_orders                     int64
aisle_total_reorders                   int64
aisle_reorder_percentage             float64
aisle_unique_users                     int64
aisle                                 object
department_mean_add_to_cart_order    float64
department_std_add_to_cart_order     float64
department_total_orders                int64
department_total_reorders              int64
department_reorder_percentage        float64
department_unique_users                int64
department                            object
dtype: object

In [63]:
# Per-column dtypes of the in-memory user feature table.
user_features.dtypes

user_id                           int64
total_orders_by_user              int64
total_products_by_user            int64
total_unique_product_by_user      int64
total_reorders_by_user            int64
reorder_propotion_by_user       float64
average_order_size                int64
reorder_in_order                float64
dtype: object