In [29]:
import pandas as pd
import numpy as np
import gc
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

myfolder = '../OriginalDataset/'
print('loading files ...')

prior = pd.read_csv(myfolder + 'order_products__prior.csv', dtype={'order_id': np.uint32,
           'product_id': np.uint16, 'reordered': np.uint8, 'add_to_cart_order': np.uint8})

train_orders = pd.read_csv(myfolder + 'order_products__train.csv', dtype={'order_id': np.uint32,
           'product_id': np.uint16, 'reordered': np.int8, 'add_to_cart_order': np.uint8 })

orders = pd.read_csv(myfolder + 'orders.csv', dtype={'order_hour_of_day': np.uint8,
           'order_number': np.uint8, 'order_id': np.uint32, 'user_id': np.uint32,
           'order_dow': np.uint8, 'days_since_prior_order': np.float32})

orders.eval_set = orders.eval_set.replace({'prior': 0, 'train': 1, 'test':2}).astype(np.uint8)
orders.days_since_prior_order = orders.days_since_prior_order.fillna(30).astype(np.uint8)

products = pd.read_csv(myfolder + 'products.csv', dtype={'product_id': np.uint16,
            'aisle_id': np.uint8, 'department_id': np.uint8},
             usecols=['product_id', 'aisle_id', 'department_id'])

print('done loading')

loading files ...
done loading


  orders.eval_set = orders.eval_set.replace({'prior': 0, 'train': 1, 'test':2}).astype(np.uint8)


In [30]:
print('merge prior and orders and keep train separate ...')

orders_products = orders.merge(prior, how = 'inner', on = 'order_id')
train_orders = train_orders.merge(orders[['user_id','order_id']], left_on = 'order_id', right_on = 'order_id', how = 'inner')

del prior
gc.collect()

merge prior and orders and keep train separate ...


169

In [31]:
print('Creating features I ...')

# sort orders and products to get the rank or the reorder frequency
prdss = orders_products.sort_values(['user_id', 'order_number', 'product_id'], ascending=True)
prdss['product_time'] = prdss.groupby(['user_id', 'product_id']).cumcount()+1

# getting products ordered first and second times to calculate probability later
sub1 = prdss[prdss['product_time'] == 1].groupby('product_id').size().to_frame('prod_first_orders')
sub2 = prdss[prdss['product_time'] == 2].groupby('product_id').size().to_frame('prod_second_orders')
sub1['prod_orders'] = prdss.groupby('product_id')['product_id'].size()
sub1['prod_reorders'] = prdss.groupby('product_id')['reordered'].sum()
sub2 = sub2.reset_index().merge(sub1.reset_index())
sub2['prod_reorder_probability'] = sub2['prod_second_orders']/sub2['prod_first_orders']
sub2['prod_reorder_ratio'] = sub2['prod_reorders']/sub2['prod_orders']
prd = sub2[['product_id', 'prod_orders','prod_reorder_probability', 'prod_reorder_ratio']]

del sub1, sub2, prdss
gc.collect()

Creating features I ...


0

In [32]:
prd.isna().sum()

product_id                  0
prod_orders                 0
prod_reorder_probability    0
prod_reorder_ratio          0
dtype: int64

In [33]:
print('Creating features II ...')

# extracting prior information (features) by user
users = orders[orders['eval_set'] == 0].groupby(['user_id'])['order_number'].max().to_frame('user_orders')
users['user_period'] = orders[orders['eval_set'] == 0].groupby(['user_id'])['days_since_prior_order'].sum()
users['user_mean_days_since_prior'] = orders[orders['eval_set'] == 0].groupby(['user_id'])['days_since_prior_order'].mean()

# merging features about users and orders into one dataset
us = orders_products.groupby('user_id').size().to_frame('user_total_products')
us['eq_1'] = orders_products[orders_products['reordered'] == 1].groupby('user_id')['product_id'].size()
us['gt_1'] = orders_products[orders_products['order_number'] > 1].groupby('user_id')['product_id'].size()
us['user_reorder_ratio'] = us['eq_1'] / us['gt_1']
us.drop(['eq_1', 'gt_1'], axis = 1, inplace = True)
us['user_distinct_products'] = orders_products.groupby(['user_id'])['product_id'].nunique()

# the average basket size of the user
users = users.reset_index().merge(us.reset_index())
users['user_average_basket'] = users['user_total_products'] / users['user_orders']

us = orders[orders['eval_set'] != 0]
us = us[['user_id', 'order_id', 'eval_set', 'days_since_prior_order']]
users = users.merge(us)

del us
gc.collect()

Creating features II ...


0

In [41]:
us = orders[orders['eval_set'] != 0]
us.

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
10,1187899,1,1,11,4,8,14
25,1492625,2,1,15,1,11,30
38,2774568,3,2,13,5,15,11
44,329954,4,2,6,3,12,30
49,2196797,5,1,5,0,11,6


In [42]:
us.user_id.value_counts()

user_id
1         1
137478    1
137468    1
137469    1
137470    1
         ..
68740     1
68741     1
68742     1
68743     1
206209    1
Name: count, Length: 206209, dtype: int64

In [45]:
users

Unnamed: 0,user_id,user_orders,user_period,user_mean_days_since_prior,user_total_products,user_reorder_ratio,user_distinct_products,user_average_basket,order_id,eval_set,days_since_prior_order
0,1,10,206,20.600000,59,0.759259,18,5.900000,1187899,1,14
1,2,14,228,16.285714,195,0.510989,102,13.928571,1492625,1,30
2,3,12,163,13.583333,88,0.705128,33,7.333333,2774568,2,11
3,4,5,85,17.000000,18,0.071429,17,3.600000,329954,2,30
4,5,4,70,17.500000,37,0.538462,23,9.250000,2196797,1,6
...,...,...,...,...,...,...,...,...,...,...,...
206204,206205,3,70,23.333333,32,0.533333,24,10.666667,1716008,1,10
206205,206206,67,279,4.164179,285,0.480427,150,4.253731,1043943,2,0
206206,206207,16,245,15.312500,223,0.658291,92,13.937500,2821651,2,14
206207,206208,49,387,7.897959,677,0.720301,198,13.816327,803273,2,4


In [34]:
users.isna().sum()

user_id                          0
user_orders                      0
user_period                      0
user_mean_days_since_prior       0
user_total_products              0
user_reorder_ratio            3045
user_distinct_products           0
user_average_basket              0
order_id                         0
eval_set                         0
days_since_prior_order           0
dtype: int64

In [35]:
print('Finalizing features and the main data file  ...')
# merging orders and products and grouping by user and product and calculating features for the user/product combination
data = orders_products.groupby(['user_id', 'product_id']).size().to_frame('up_orders')
data['up_first_order'] = orders_products.groupby(['user_id', 'product_id'])['order_number'].min()
data['up_last_order'] = orders_products.groupby(['user_id', 'product_id'])['order_number'].max()
data['up_average_cart_position'] = orders_products.groupby(['user_id', 'product_id'])['add_to_cart_order'].mean()
data = data.reset_index()

#merging previous data with users
data = data.merge(prd, on = 'product_id')
data = data.merge(users, on = 'user_id')

#user/product combination features about the particular order
data['up_order_rate'] = data['up_orders'] / data['user_orders']
data['up_orders_since_last_order'] = data['user_orders'] - data['up_last_order']

Finalizing features and the main data file  ...


In [36]:
data.isna().sum()

user_id                           0
product_id                        0
up_orders                         0
up_first_order                    0
up_last_order                     0
up_average_cart_position          0
prod_orders                       0
prod_reorder_probability          0
prod_reorder_ratio                0
user_orders                       0
user_period                       0
user_mean_days_since_prior        0
user_total_products               0
user_reorder_ratio            47895
user_distinct_products            0
user_average_basket               0
order_id                          0
eval_set                          0
days_since_prior_order            0
up_order_rate                     0
up_orders_since_last_order        0
dtype: int64

In [37]:
data = data.merge(train_orders[['user_id', 'product_id', 'reordered']], 
                  how = 'left', on = ['user_id', 'product_id'])
data = data.merge(products, on = 'product_id')

del orders_products     #, orders, train_orders

In [38]:
data.isna().sum()

user_id                              0
product_id                           0
up_orders                            0
up_first_order                       0
up_last_order                        0
up_average_cart_position             0
prod_orders                          0
prod_reorder_probability             0
prod_reorder_ratio                   0
user_orders                          0
user_period                          0
user_mean_days_since_prior           0
user_total_products                  0
user_reorder_ratio               47895
user_distinct_products               0
user_average_basket                  0
order_id                             0
eval_set                             0
days_since_prior_order               0
up_order_rate                        0
up_orders_since_last_order           0
reordered                     12449508
aisle_id                             0
department_id                        0
dtype: int64

In [13]:
print('setting dtypes for data ...')

#reduce the size by setting data types
data = data.astype(dtype= {'user_id' : np.uint32, 'product_id'  : np.uint16,
            'up_orders'  : np.uint8, 'up_first_order' : np.uint8, 'up_last_order' : np.uint8,
            'up_average_cart_position' : np.uint8, 'prod_orders' : np.uint16, 
            'prod_reorder_probability' : np.float16,   
            'prod_reorder_ratio' : np.float16, 'user_orders' : np.uint8,
            'user_period' : np.uint8, 'user_mean_days_since_prior' : np.uint8,
            'user_total_products' : np.uint8, 'user_reorder_ratio' : np.float16, 
            'user_distinct_products' : np.uint8, 'user_average_basket' : np.uint8,
            'order_id'  : np.uint32, 'eval_set' : np.uint8, 
            'days_since_prior_order' : np.uint8, 'up_order_rate' : np.float16, 
            'up_orders_since_last_order':np.uint8,
            'aisle_id': np.uint8, 'department_id': np.uint8})

data['reordered'].fillna(0, inplace=True)  # replace NaN with zeros (not reordered) 
data['reordered']=data['reordered'].astype(np.uint8)

gc.collect()

setting dtypes for data ...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['reordered'].fillna(0, inplace=True)  # replace NaN with zeros (not reordered)


0

In [39]:
data.isna().sum()

user_id                              0
product_id                           0
up_orders                            0
up_first_order                       0
up_last_order                        0
up_average_cart_position             0
prod_orders                          0
prod_reorder_probability             0
prod_reorder_ratio                   0
user_orders                          0
user_period                          0
user_mean_days_since_prior           0
user_total_products                  0
user_reorder_ratio               47895
user_distinct_products               0
user_average_basket                  0
order_id                             0
eval_set                             0
days_since_prior_order               0
up_order_rate                        0
up_orders_since_last_order           0
reordered                     12449508
aisle_id                             0
department_id                        0
dtype: int64

In [None]:
print('Preparing Train and Test sets ...')

# filter by eval_set (train=1, test=2) and dropp the id's columns (not part of training features) 
# but keep prod_id and user_id in test

train = data[data['eval_set'] == 1].drop(['eval_set', 'user_id', 'product_id', 'order_id'], axis = 1)
test =  data[data['eval_set'] == 2].drop(['eval_set', 'user_id', 'reordered'], axis = 1)

check =  data.drop(['eval_set', 'user_id', 'reordered'], axis = 1)

del data
gc.collect()