# Establish Connection with G-Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so the
# dataset files stored there can be read like local files.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
%cd drive/My Drive/NBR

/content/drive/My Drive/NBR


# Import Required Libraries

In [None]:
import numpy as np
from pandas import DataFrame, Series
import warnings
warnings.filterwarnings("ignore")
import pandas as pd

# Read Datasets

In [None]:
# Load the six CSV tables from the current working directory.
# `prior` and `train` hold order/product rows, `orders` holds one row per
# order (including the eval_set label), `products` maps product_id to its
# name, aisle and department.
aisles = pd.read_csv('aisles.csv')
departments = pd.read_csv('departments.csv')
products = pd.read_csv('products.csv')
prior = pd.read_csv('order_products__prior.csv')
train = pd.read_csv('order_products__train.csv')
orders = pd.read_csv('orders.csv')

In [None]:
# Assemble one long table of order lines:
#   1. stack the 'prior' and 'train' order/product rows,
#   2. attach order metadata (outer join, so orders without any product row
#      are kept as well -- presumably the 'test' orders selected later),
#   3. attach product metadata,
#   4. order the rows chronologically per user.
prior_train = (
    pd.concat([prior, train], ignore_index=True)
    .merge(orders, on='order_id', how='outer')
    .merge(products, on='product_id', how='left')
    .sort_values(by=['user_id', 'order_number', 'product_id'])
)

In [None]:
# Keep only the first 1% of rows to make the notebook fast to run. Because the
# table is sorted by user_id this keeps the lowest user ids.
# NOTE(review): the cut is made on rows, so the last retained user's history
# may be truncated; re-running this cell shrinks the table again.
prior_train = prior_train[ : len(prior_train)//100]

In [None]:
# Preview the merged and subsampled order-line table.
prior_train

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id
24076664,2539329,196.0,1.0,0.0,1,prior,1,2,8,,Soda,77.0,7.0
24076666,2539329,12427.0,3.0,0.0,1,prior,1,2,8,,Original Beef Jerky,23.0,19.0
24076665,2539329,14084.0,2.0,0.0,1,prior,1,2,8,,Organic Unsweetened Vanilla Almond Milk,91.0,16.0
24076667,2539329,26088.0,4.0,0.0,1,prior,1,2,8,,Aged White Cheddar Popcorn,23.0,19.0
24076668,2539329,26405.0,5.0,0.0,1,prior,1,2,8,,XL Pick-A-Size Paper Towel Rolls,54.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21386310,2255814,28204.0,3.0,1.0,2192,prior,23,6,11,30.0,Organic Fuji Apple,24.0,4.0
21386308,2255814,36011.0,1.0,1.0,2192,prior,23,6,11,30.0,Organic Fat Free Milk,84.0,16.0
21386314,2255814,38777.0,7.0,1.0,2192,prior,23,6,11,30.0,Organic Green Seedless Grapes,123.0,4.0
20090088,2118953,1463.0,4.0,1.0,2192,prior,24,3,8,11.0,Organic Milk,84.0,16.0


## The train data preparation

In [None]:
# Build labelled training pairs from the users that have a 'test' order:
# their last 'prior' order is held out as the label and every earlier order
# is used as purchase history.
test_users=prior_train[prior_train.eval_set=='test'].user_id.values
X_test_users=pd.DataFrame(test_users, columns=['user_id'])
prior_train_test_users=pd.merge(X_test_users, prior_train, on='user_id', how='left')
# Number of the last 'prior' order per user (the held-out order).
total_orders_user=prior_train_test_users[prior_train_test_users.eval_set=='prior'].groupby('user_id')['order_number'].max()
prior_train_test_users['total_orders_user']=prior_train_test_users.user_id.map(total_orders_user)
# History = all orders strictly before the held-out one.
X_train_test_users=prior_train_test_users[prior_train_test_users.order_number < prior_train_test_users.total_orders_user]
## Candidate rows: one per (user_id, product_id) pair that occurs in the history
print('build user_id & product_id')
userid_productid=X_train_test_users.groupby(['user_id','product_id'])['order_number'].size().reset_index()
X_test_users=pd.merge(X_test_users,userid_productid[['user_id','product_id']], on='user_id', how='left')
#### Label Y: products flagged reordered==1 in the held-out (last prior) order
Y_train_test_users=prior_train_test_users[(prior_train_test_users.order_number ==\
                                          prior_train_test_users.total_orders_user)&\
                                          (prior_train_test_users.reordered==1)][['user_id','product_id', 'reordered']]
# Left merge + fillna(0): candidate pairs missing from the held-out order get
# reordered=0. fillna(0) applies to every column, so a missing product_id
# (user without history rows) also becomes 0.
X_test_users=pd.merge(X_test_users, Y_train_test_users, on=['user_id','product_id'], how='left').fillna(0)

build user_id & product_id


In [None]:
# Preview the history rows (orders before each user's held-out order).
X_train_test_users

Unnamed: 0,user_id,order_id,product_id,add_to_cart_order,reordered,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,total_orders_user
0,3,1374495,9387.0,1.0,0.0,prior,1,1,14,,Granny Smith Apples,24.0,4.0,12
1,3,1374495,15143.0,3.0,0.0,prior,1,1,14,,Blueberry Pint,24.0,4.0,12
2,3,1374495,16797.0,4.0,0.0,prior,1,1,14,,Strawberries,24.0,4.0,12
3,3,1374495,17668.0,2.0,0.0,prior,1,1,14,,Unsweetened Chocolate Almond Breeze Almond Milk,91.0,16.0,12
4,3,1374495,21903.0,7.0,0.0,prior,1,1,14,,Organic Baby Spinach,123.0,4.0,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121746,2191,1485199,21616.0,3.0,0.0,prior,6,4,15,30.0,Organic Baby Arugula,123.0,4.0,7
121747,2191,1485199,27966.0,2.0,0.0,prior,6,4,15,30.0,Organic Raspberries,123.0,4.0,7
121748,2191,1485199,39275.0,6.0,0.0,prior,6,4,15,30.0,Organic Blueberries,123.0,4.0,7
121749,2191,1485199,40604.0,4.0,0.0,prior,6,4,15,30.0,Feta Cheese Crumbles,21.0,16.0,7


In [None]:
# Preview the labelled (user_id, product_id, reordered) candidate pairs.
X_test_users

Unnamed: 0,user_id,product_id,reordered
0,3,248.0,0.0
1,3,1005.0,0.0
2,3,1819.0,0.0
3,3,7503.0,0.0
4,3,8021.0,0.0
...,...,...,...
47386,2191,45681.0,0.0
47387,2191,46667.0,0.0
47388,2191,46676.0,0.0
47389,2191,47626.0,1.0


## Feature Creation

In [None]:
def transform_data(df_X_train,X_train, status):
    """Attach user, product and user-x-product features to candidate pairs.

    Parameters
    ----------
    df_X_train : pandas.DataFrame
        Order-line history the features are computed from. The code reads the
        columns ``user_id``, ``product_id``, ``order_number``,
        ``days_since_prior_order``, ``reordered``, ``add_to_cart_order``,
        ``aisle_id``, ``department_id`` and ``total_orders_user``.
        ``total_orders_user - order_number`` is interpreted as "how many
        orders ago" a line was bought (1 = most recent history order).
    X_train : pandas.DataFrame
        One row per candidate pair; must contain ``user_id`` and
        ``product_id`` (a ``reordered`` label column may also be present).
    status : str
        Kept for backward compatibility ('train' / 'test'). It used to select
        positional column offsets; the window sums are now computed from
        column names, so the value no longer influences the result.

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of ``X_train`` followed by 17 aggregate
        features, 20 binary ``last_{i}st_order`` flags and four window sums
        (``last_five_orders_sum``, ``last_10_orders_sum``,
        ``last_15_orders_sum``, ``last_20_orders_sum``). Missing values are
        filled with 0. Neither input frame is modified.
    """
    pair_keys = ['user_id', 'product_id']
    # Work on a copy so the caller's frame (often a slice of a bigger frame)
    # is not mutated and no SettingWithCopyWarning is triggered.
    X_train = X_train.copy()
    print(X_train.shape)
    by_user = df_X_train.groupby('user_id')
    by_product = df_X_train.groupby('product_id')
    by_pair = df_X_train.groupby(pair_keys)

    #### total orders for an individual user
    print('total orders for an individual user')
    total_orders_user = by_user['order_number'].max()
    X_train['total_orders_user'] = X_train.user_id.map(total_orders_user)
    #### total unique products for an individual user
    print('total unique products for an individual user')
    unique_items_user = by_user['product_id'].nunique()
    X_train['unique_items_user'] = X_train.user_id.map(unique_items_user)
    #### total items for an individual user
    print('total items for an individual user')
    total_items_user = by_user['product_id'].size()
    X_train['total_items_user'] = X_train.user_id.map(total_items_user)
    #### average items per order for an individual user
    X_train['average_items_per_order'] = X_train.total_items_user / X_train.total_orders_user
    #### average days between each order for an individual user
    print('average days between each order for an individual user')
    # Collapse to one value per order first so large baskets do not dominate
    # the per-user mean; orders without a gap value (NaN) are dropped.
    days_per_order = df_X_train.groupby(['user_id', 'order_number'])['days_since_prior_order'].agg('mean')
    average_days_per_order = (
        days_per_order.dropna().reset_index()
        .groupby(['user_id'])['days_since_prior_order'].mean()
    )
    X_train['average_days_per_order'] = X_train.user_id.map(average_days_per_order)
    #### appear rate for this product in the user baskets
    print('appear rate for this product in the user baskets')
    appear_num = by_pair['order_number'].agg('count').reset_index()
    appear_num = appear_num.rename(columns={'order_number': 'appear_num'})
    X_train = pd.merge(X_train, appear_num, on=pair_keys, how='left')
    X_train['appear_rate'] = X_train.appear_num / X_train.total_orders_user
    X_train = X_train.drop('appear_num', axis=1)
    #### total reorder number for this product for this user
    print('total reorder number for this product for this user')
    reorder_num = by_pair['reordered'].agg(reorder_num='sum').reset_index()
    X_train = pd.merge(X_train, reorder_num, on=pair_keys, how='left')
    #### mean values of add to cart order for this product for this user
    print('mean values of add to cart order for this product for this user')
    add_to_cart_order_mean = by_pair['add_to_cart_order'].agg('mean').reset_index()
    X_train = pd.merge(X_train, add_to_cart_order_mean, on=pair_keys, how='left')
    #### order interval for this product for this user
    print('order interval for this product for this user')
    order_interval_product = by_pair['order_number'].agg(last_order='max').reset_index()
    X_train = pd.merge(X_train, order_interval_product, on=pair_keys, how='left')
    X_train['total_order_minus_last_order'] = X_train['total_orders_user'] - X_train['last_order']
    #### last order of this product / total orders
    X_train['last_order_ratio'] = X_train['last_order'].values / np.array(X_train['total_orders_user'].values, dtype='float')
    #### 'aisle_id','department_id' for a product
    print('aisle_id,department_id for a product')
    # Columns are selected with a list: tuple-style selection
    # (['aisle_id','department_id'] without the inner brackets) is rejected
    # by current pandas versions.
    aisle_dep = by_pair[['aisle_id', 'department_id']].agg('mean').reset_index()
    X_train = pd.merge(X_train, aisle_dep, on=pair_keys, how='left')
    #### the user number who bought this product
    print('the user number who bought this product')
    user_num_product = by_product['user_id'].nunique()
    X_train['user_num_product'] = X_train.product_id.map(user_num_product)
    #### orders which have this product
    print('orders which have this product')
    product_total_orders = by_product['user_id'].size()
    X_train['product_total_orders'] = X_train.product_id.map(product_total_orders)
    #### re-orders which have this product
    print('re-orders which have this product')
    product_total_reorders = df_X_train[df_X_train.reordered == 1].groupby('product_id')['user_id'].size()
    X_train['product_total_reorders'] = X_train.product_id.map(product_total_reorders)
    X_train['reorder_ratio'] = X_train['product_total_reorders'] / X_train['product_total_orders']
    #### order sequence for this product (one hot encoder)
    print('order sequence (one hot encoder) for this product (last 20 orders for this user)')
    # "Orders ago" is kept in a local Series instead of being written back
    # into the caller's history frame.
    orders_ago = df_X_train['total_orders_user'] - df_X_train['order_number']
    history_pairs = df_X_train[pair_keys]
    flag_columns = []
    for i in range(1, 21):
        flag_name = 'last_{}st_order'.format(i)
        bought_i_orders_ago = history_pairs[orders_ago == i].copy()
        bought_i_orders_ago[flag_name] = np.ones(len(bought_i_orders_ago))
        # fillna(0) runs on the whole frame: pairs not bought i orders ago get
        # a 0 flag, and NaNs left in earlier feature columns become 0 as well.
        X_train = pd.merge(X_train, bought_i_orders_ago, on=pair_keys, how='left').fillna(0)
        flag_columns.append(flag_name)
    # Window sums are taken over the flag columns *by name*. Positional
    # slicing is fragile here because the number of leading columns differs
    # between labelled and unlabelled inputs, and a slice that is too wide
    # silently picks up the sum columns added just before it.
    windows = (
        (5, 'last_five_orders_sum'),
        (10, 'last_10_orders_sum'),
        (15, 'last_15_orders_sum'),
        (20, 'last_20_orders_sum'),
    )
    for width, sum_name in windows:
        X_train[sum_name] = X_train[flag_columns[:width]].sum(axis=1)
    print(X_train.shape)
    return X_train

In [None]:
# Re-display the history rows that are passed to transform_data below.
X_train_test_users

Unnamed: 0,user_id,order_id,product_id,add_to_cart_order,reordered,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,total_orders_user
0,3,1374495,9387.0,1.0,0.0,prior,1,1,14,,Granny Smith Apples,24.0,4.0,12
1,3,1374495,15143.0,3.0,0.0,prior,1,1,14,,Blueberry Pint,24.0,4.0,12
2,3,1374495,16797.0,4.0,0.0,prior,1,1,14,,Strawberries,24.0,4.0,12
3,3,1374495,17668.0,2.0,0.0,prior,1,1,14,,Unsweetened Chocolate Almond Breeze Almond Milk,91.0,16.0,12
4,3,1374495,21903.0,7.0,0.0,prior,1,1,14,,Organic Baby Spinach,123.0,4.0,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121746,2191,1485199,21616.0,3.0,0.0,prior,6,4,15,30.0,Organic Baby Arugula,123.0,4.0,7
121747,2191,1485199,27966.0,2.0,0.0,prior,6,4,15,30.0,Organic Raspberries,123.0,4.0,7
121748,2191,1485199,39275.0,6.0,0.0,prior,6,4,15,30.0,Organic Blueberries,123.0,4.0,7
121749,2191,1485199,40604.0,4.0,0.0,prior,6,4,15,30.0,Feta Cheese Crumbles,21.0,16.0,7


In [None]:
# Re-display the labelled candidate pairs that are passed to transform_data below.
X_test_users

Unnamed: 0,user_id,product_id,reordered
0,3,248.0,0.0
1,3,1005.0,0.0
2,3,1819.0,0.0
3,3,7503.0,0.0
4,3,8021.0,0.0
...,...,...,...
47386,2191,45681.0,0.0
47387,2191,46667.0,0.0
47388,2191,46676.0,0.0
47389,2191,47626.0,1.0


In [None]:
# Add features to the labelled pairs, computed from the history rows only
# (the held-out order is not part of X_train_test_users).
X_test_users = transform_data(X_train_test_users,X_test_users,'train')

(47391, 3)
total orders for an individual user
total unique products for an individual user
total items for an individual user
average days between each order for an individual user
appear rate for this product in the user baskets
total reorder number for this product for this user
mean values of add to cart order for this product for this user
order interval for this product for this user
aisle_id,department_id for a product
the user number who bought this product
orders which have this product
re-orders which have this product
order sequence (one hot encoder) for this product (last 20 orders for this user)
(47391, 44)


In [None]:
# Preview the labelled pairs with their feature columns attached.
X_test_users

Unnamed: 0,user_id,product_id,reordered,total_orders_user,unique_items_user,total_items_user,average_items_per_order,average_days_per_order,appear_rate,reorder_num,...,last_15st_order,last_16st_order,last_17st_order,last_18st_order,last_19st_order,last_20st_order,last_five_orders_sum,last_10_orders_sum,last_15_orders_sum,last_20_orders_sum
0,3,248.0,0.0,11,33,82,7.454545,11.8,0.090909,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
1,3,1005.0,0.0,11,33,82,7.454545,11.8,0.090909,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0
2,3,1819.0,0.0,11,33,82,7.454545,11.8,0.272727,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,3.0,5.0
3,3,7503.0,0.0,11,33,82,7.454545,11.8,0.090909,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,3,8021.0,0.0,11,33,82,7.454545,11.8,0.090909,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47386,2191,45681.0,0.0,6,64,71,11.833333,23.6,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0
47387,2191,46667.0,0.0,6,64,71,11.833333,23.6,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0
47388,2191,46676.0,0.0,6,64,71,11.833333,23.6,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0
47389,2191,47626.0,1.0,6,64,71,11.833333,23.6,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0


In [None]:
# Second source of training rows: users that have a 'train' order.
# Only products flagged reordered==1 in that order are kept, so every row
# produced here is a positive example.
userid_productid=prior_train[(prior_train.eval_set=='train')&(prior_train.reordered==1)][['user_id','product_id','reordered']].reset_index()
X_train_users=userid_productid[['user_id','product_id','reordered']]
train_users=pd.DataFrame(X_train_users.user_id.unique(), columns=['user_id'])
# History for these users = all of their 'prior' order lines.
prior_train_selected=pd.merge(train_users, prior_train[prior_train.eval_set=='prior'], on='user_id', how='left')
# The maximum is taken over every eval_set in prior_train (not just 'prior'),
# which is what transform_data uses as the reference point for "orders ago".
total_orders_user=prior_train.groupby('user_id')['order_number'].max()
prior_train_selected['total_orders_user']=prior_train_selected.user_id.map(total_orders_user)

In [None]:
# Add features to the positive pairs, computed from the users' 'prior' history.
X_train_users=transform_data(prior_train_selected,X_train_users,'train')

(8795, 3)
total orders for an individual user
total unique products for an individual user
total items for an individual user
average days between each order for an individual user
appear rate for this product in the user baskets
total reorder number for this product for this user
mean values of add to cart order for this product for this user
order interval for this product for this user
aisle_id,department_id for a product
the user number who bought this product
orders which have this product
re-orders which have this product
order sequence (one hot encoder) for this product (last 20 orders for this user)
(8795, 44)


In [None]:
# Thin out pairs whose last_20_orders_sum is 0:
#   * negatives (reordered==0): keep a single row per user, the one with the
#     smallest product_id;
#   * positives (reordered==1): keep all rows.
# Pairs with a non-zero last_20_orders_sum are kept unchanged.
# NOTE(review): X_test_users is overwritten, so re-running this cell filters
# an already filtered frame.
reordered_0=X_test_users[(X_test_users.last_20_orders_sum==0)&(X_test_users.reordered==0)]
reordered_1=X_test_users[(X_test_users.last_20_orders_sum==0)&(X_test_users.reordered==1)]
X_test_users=X_test_users[X_test_users.last_20_orders_sum!=0]
reordered_0_seleted=reordered_0.groupby(['user_id'])['product_id'].min().reset_index()
reordered_0=pd.merge(reordered_0_seleted, reordered_0, on=['user_id', 'product_id'], how='left')
X_test_users=pd.concat([X_test_users, reordered_0, reordered_1],ignore_index=True)

In [None]:
# Full training set: labelled pairs from the 'test' users stacked on top of
# the positive pairs from the 'train' users.
X=pd.concat([X_test_users, X_train_users],ignore_index=True)
# X.to_csv('X.csv')

In [None]:
# Report the size of the training set, the number of rows per label
# (index 0 = not reordered, index 1 = reordered) and the label fractions.
n_examples = X.shape[0]
label_counts = np.bincount(np.array(X.reordered.values, dtype='int'))
print(n_examples)
print(label_counts)
print(label_counts/float(n_examples))

46884
[33377 13507]
[0.71190598 0.28809402]


In [None]:
# Build the pairs to predict for: users that have a 'test' order, with their
# complete 'prior' history. X_test_users is re-created from scratch here and
# replaces the training frame of the same name used above.
test_users=prior_train[prior_train.eval_set=='test'].user_id.values
X_test_users=pd.DataFrame(test_users, columns=['user_id'])
prior_train_test_users=pd.merge(X_test_users, prior_train, on='user_id', how='left')
# Maximum order_number per user, computed before the filter below so that
# rows of every eval_set are taken into account.
total_orders_user=prior_train_test_users.groupby('user_id')['order_number'].max()
prior_train_test_users['total_orders_user']=prior_train_test_users.user_id.map(total_orders_user)
prior_train_test_users=prior_train_test_users[prior_train_test_users.eval_set=='prior']
## Candidate rows: one per (user_id, product_id) pair seen in the prior orders
print('build user_id,product_id')
userid_productid=prior_train_test_users.groupby(['user_id','product_id'])['order_number'].size().reset_index()
X_test_users=pd.merge(X_test_users,userid_productid[['user_id','product_id']], on='user_id', how='left')

build user_id,product_id


In [None]:
# Add the same feature columns to the unlabelled prediction pairs.
X_test=transform_data(prior_train_test_users,X_test_users,'test')

(50744, 2)
total orders for an individual user
total unique products for an individual user
total items for an individual user
average days between each order for an individual user
appear rate for this product in the user baskets
total reorder number for this product for this user
mean values of add to cart order for this product for this user
order interval for this product for this user
aisle_id,department_id for a product
the user number who bought this product
orders which have this product
re-orders which have this product
order sequence (one hot encoder) for this product (last 20 orders for this user)
(50744, 43)


In [None]:
# Preview the feature matrix used for prediction (user_id and product_id
# are still the first two columns).
X_test

Unnamed: 0,user_id,product_id,total_orders_user,unique_items_user,total_items_user,average_items_per_order,average_days_per_order,appear_rate,reorder_num,add_to_cart_order,...,last_15st_order,last_16st_order,last_17st_order,last_18st_order,last_19st_order,last_20st_order,last_five_orders_sum,last_10_orders_sum,last_15_orders_sum,last_20_orders_sum
0,3,248.0,12,33,88,7.333333,12.090909,0.083333,0.0,3.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
1,3,1005.0,12,33,88,7.333333,12.090909,0.083333,0.0,5.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0
2,3,1819.0,12,33,88,7.333333,12.090909,0.250000,2.0,2.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,4.0
3,3,7503.0,12,33,88,7.333333,12.090909,0.083333,0.0,6.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,3,8021.0,12,33,88,7.333333,12.090909,0.083333,0.0,5.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50739,2191,45681.0,7,69,78,11.142857,20.833333,0.142857,0.0,19.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
50740,2191,46667.0,7,69,78,11.142857,20.833333,0.142857,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0
50741,2191,46676.0,7,69,78,11.142857,20.833333,0.142857,0.0,18.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0
50742,2191,47626.0,7,69,78,11.142857,20.833333,0.285714,1.0,5.500000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0


# Building the model for NBR

In [None]:
# import libraries to create model
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import warnings
warnings.filterwarnings("ignore")
import joblib

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint
from keras import backend as K
import h5py
from tensorflow.keras.layers import BatchNormalization
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Shuffle the rows, then separate the label (Y) from the features (X).
# NOTE(review): no random seed is set in the visible cells, so the row order
# (and therefore the folds built from it) differs between runs.
X=X.take(np.random.permutation(len(X)))
Y=X['reordered']
# The id columns are identifiers, not features, and are dropped with the label.
# NOTE(review): the in-place drop means this cell cannot be run twice on the
# same X (the columns no longer exist the second time).
X.drop(['reordered','user_id', 'product_id'], axis=1, inplace=True)
X.columns

Index(['total_orders_user', 'unique_items_user', 'total_items_user',
       'average_items_per_order', 'average_days_per_order', 'appear_rate',
       'reorder_num', 'add_to_cart_order', 'last_order',
       'total_order_minus_last_order', 'last_order_ratio', 'aisle_id',
       'department_id', 'user_num_product', 'product_total_orders',
       'product_total_reorders', 'reorder_ratio', 'last_1st_order',
       'last_2st_order', 'last_3st_order', 'last_4st_order', 'last_5st_order',
       'last_6st_order', 'last_7st_order', 'last_8st_order', 'last_9st_order',
       'last_10st_order', 'last_11st_order', 'last_12st_order',
       'last_13st_order', 'last_14st_order', 'last_15st_order',
       'last_16st_order', 'last_17st_order', 'last_18st_order',
       'last_19st_order', 'last_20st_order', 'last_five_orders_sum',
       'last_10_orders_sum', 'last_15_orders_sum', 'last_20_orders_sum'],
      dtype='object')

In [None]:
# Preview the shuffled feature matrix (label and id columns removed).
X

Unnamed: 0,total_orders_user,unique_items_user,total_items_user,average_items_per_order,average_days_per_order,appear_rate,reorder_num,add_to_cart_order,last_order,total_order_minus_last_order,...,last_15st_order,last_16st_order,last_17st_order,last_18st_order,last_19st_order,last_20st_order,last_five_orders_sum,last_10_orders_sum,last_15_orders_sum,last_20_orders_sum
29623,6,90,121,20.166667,27.600000,0.166667,0.0,19.000000,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0
39372,25,175,308,12.320000,1.000000,0.080000,1.0,7.500000,25,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12988,2,28,38,19.000000,13.000000,0.500000,0.0,5.000000,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0
34591,13,75,148,11.384615,15.083333,0.230769,2.0,15.666667,12,1,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,6.0
12560,6,130,191,31.833333,14.800000,0.166667,0.0,41.000000,1,5,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4230,41,220,526,12.829268,8.600000,0.048780,1.0,10.500000,38,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,3.0
44396,13,78,253,19.461538,18.166667,0.538462,6.0,13.142857,13,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,6.0,7.0
36214,4,10,13,3.250000,21.333333,0.250000,0.0,3.000000,3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0
8999,18,83,206,11.444444,12.235294,0.333333,5.0,6.833333,18,0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,5.0,9.0


## Split the training data (80% : Train data + 20% : Validation data) for cross-validation

In [None]:
from sklearn.model_selection import StratifiedKFold
# Split (X, Y) into 5 stratified folds and store each fold's train/validation
# arrays as pickle files in the working directory. Fold numbers in the file
# names start at 1.
skf = StratifiedKFold(n_splits=5)
i=1
for train_index, valid_index  in skf.split(X, Y):
    fold_files = (
        ('X_train_modify_{}.pkl', X, train_index),
        ('Y_train_modify_{}.pkl', Y, train_index),
        ('X_valid_modify_{}.pkl', X, valid_index),
        ('Y_valid_modify_{}.pkl', Y, valid_index),
    )
    for name_template, source, row_positions in fold_files:
        joblib.dump(source.values[row_positions], name_template.format(i))
    i += 1

## NN Model

In [None]:
def NNmodel(X_scaled_train, y_train,X_scaled_valid, y_valid,optimizer,batch_size,nb_epoch, c1, c2, c3, c4,c5,c6,drop1, drop2, init, testnumber):
    """Train a feed-forward binary classifier and checkpoint its best weights.

    The network is Dense(c1) -> BN -> Dropout(drop1) -> Dense(c2) -> BN ->
    Dropout(drop2) -> Dense(c3) -> BN -> Dense(c4) -> BN -> Dense(1, sigmoid).

    Parameters
    ----------
    X_scaled_train, y_train : array-like
        Training features and 0/1 labels.
    X_scaled_valid, y_valid : array-like
        Validation features and labels; ``val_loss`` on this set decides
        which weights are checkpointed.
    optimizer : str or optimizer object
        Passed to ``model.compile``.
    batch_size, nb_epoch : int
        Passed to ``model.fit`` as ``batch_size`` and ``epochs``.
    c1, c2, c3, c4 : int
        Widths of the four hidden layers.
    c5, c6, init :
        Accepted for backward compatibility; not used by the current
        architecture.
    drop1, drop2 : float
        Dropout rates after the first and second hidden layer.
    testnumber : int
        Fold identifier; the best weights are written to
        ``model_{testnumber}.best.hdf5`` and the model summary is printed
        when it equals 1.

    Returns
    -------
    The fitted model in its state after the final epoch (the best weights
    are the ones stored in the checkpoint file).
    """
    from sklearn.utils.class_weight import compute_class_weight

    model=Sequential()
    # Size the input layer from the data passed in, not from a notebook-level
    # variable that may be stale or undefined.
    model.add(Dense(c1, input_dim=X_scaled_train.shape[1], activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(drop1))
    model.add(Dense(c2, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(drop2))
    model.add(Dense(c3, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(c4, activation='relu'))
    # model.add(BatchNormalization())
    # model.add(Dense(c5, activation='relu'))
    # model.add(BatchNormalization())
    # model.add(Dense(c6, activation='relu'))  #init = init,
    model.add(BatchNormalization())
    model.add(Dense(1, activation='sigmoid'))

    # 'balanced' weights counter the class imbalance of the labels.
    labels = np.unique(y_train)
    print(labels)
    balanced_weights = compute_class_weight('balanced', classes = labels, y = y_train)
    print(balanced_weights)
    # Keras expects a {class_index: weight} mapping; labels are 0 and 1.
    class_weights = {0:balanced_weights[0], 1:balanced_weights[1]}
    model.compile(loss = 'binary_crossentropy', optimizer = optimizer,metrics=['accuracy'])
    if testnumber == 1:
        # summary() prints by itself; wrapping it in print() adds a stray "None".
        model.summary()
    filepath="model_{}.best.hdf5".format(testnumber)
    # Keep only the weights of the epoch with the lowest validation loss.
    saveBestModel = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, mode='min')
    model.fit(X_scaled_train, y_train, batch_size=batch_size, epochs=nb_epoch, class_weight=class_weights, validation_data=(X_scaled_valid, y_valid),callbacks=[saveBestModel],verbose=0)
    return model

In [None]:
def NNmodel_prediction(X_scaled_test,optimizer,batch_size,c1, c2, c3, c4,c5,c6,drop1, drop2, init,testnumber):
    """Predict with the checkpointed weights of one fold.

    Rebuilds the architecture used by ``NNmodel`` (the layer widths and
    dropout rates must match the ones used for training), loads
    ``model_{testnumber}.best.hdf5`` and returns the sigmoid outputs for
    ``X_scaled_test``.

    Parameters
    ----------
    X_scaled_test : array-like
        Features to score; its column count defines the input layer size.
    optimizer : str or optimizer object
        Passed to ``model.compile``.
    batch_size : int
        Batch size used by ``model.predict``.
    c1, c2, c3, c4 : int
        Widths of the four hidden layers.
    c5, c6, init :
        Accepted for backward compatibility; not used.
    drop1, drop2 : float
        Dropout rates after the first and second hidden layer.
    testnumber : int
        Fold identifier selecting the weight file to load.

    Returns
    -------
    The output of ``model.predict`` (one probability column).
    """
    model=Sequential()
    # Size the input layer from the data being scored instead of relying on a
    # notebook-level X_train variable.
    model.add(Dense(c1, input_dim=X_scaled_test.shape[1], activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(drop1))
    model.add(Dense(c2, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(drop2))
    model.add(Dense(c3, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(c4, activation='relu'))
    # model.add(BatchNormalization())
    # model.add(Dense(c5, activation='relu'))
    # model.add(BatchNormalization())
    # model.add(Dense(c6, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(1, activation='sigmoid'))

    # Restore the best weights saved by the training checkpoint for this fold.
    filepath="model_{}.best.hdf5".format(testnumber)
    model.load_weights(filepath)
    model.compile(loss = 'binary_crossentropy', optimizer = optimizer,metrics=['accuracy'])
    y_pred_test = model.predict(X_scaled_test, batch_size=batch_size, verbose=0)
    return y_pred_test

## 5-Fold Cross-Validation

In [None]:
# Train one network per fold and collect each network's predictions for the
# prediction pairs in X_test.
y_pred_all=[]
# Feature columns of X_test = everything after user_id and product_id.
feature_columns = list(X_test.columns)[2:]
for i in range(1,6):
    ## load data
    X_train=joblib.load('X_train_modify_{}.pkl'.format(i))
    Y_train=joblib.load('Y_train_modify_{}.pkl'.format(i))
    # The validation arrays must come from the held-out fold files; loading
    # the training files here would make val_loss (and therefore the
    # checkpoint selection) a measure of training fit.
    X_valid=joblib.load('X_valid_modify_{}.pkl'.format(i))
    Y_valid=joblib.load('Y_valid_modify_{}.pkl'.format(i))
    ## scale data (the scaler is fitted on the training fold only)
    scaler=MinMaxScaler()
    X_scaled_train=scaler.fit_transform(X_train)
    X_scaled_valid=scaler.transform(X_valid)
    X_scaled_test=scaler.transform(X_test[feature_columns].values)
    ## train data
    NNmodel(X_scaled_train, Y_train,X_scaled_valid, Y_valid,'adamax', 1000, 200, 500, 250, 250, 100, 100, 100, 0.4, 0.4, 'normal',i) #1000,500, 500, 400, 400,200, 150,150,0.4, 0.4,
    ## predict data with the best checkpoint of this fold
    y_pred_test=NNmodel_prediction(X_scaled_test,'adamax', 1000, 500, 250, 250, 100, 100, 100, 0.4, 0.4, 'normal',i) #800, 400, 400, 300, 200, 200, 100, 100, 0.4, 0.4,
    y_pred_all.append(y_pred_test)

[0. 1.]
[0.70235197 1.73547103]
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 500)               21000     
                                                                 
 batch_normalization (BatchN  (None, 500)              2000      
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 500)               0         
                                                                 
 dense_1 (Dense)             (None, 250)               125250    
                                                                 
 batch_normalization_1 (Batc  (None, 250)              1000      
 hNormalization)                                                 
                                                                 
 dropout_1 (Dropout)    

In [None]:
# Stack the per-fold predictions and average them across folds (axis 0),
# giving one ensemble probability per prediction pair.
y_pred_all=np.array(y_pred_all)
y_pred_average=np.average(y_pred_all, axis=0)

## Prediction

In [None]:
# Probability cut-off above which a product is put into the predicted basket.
# NOTE(review): 0.24 is presumably hand-tuned -- confirm how it was chosen.
thresh=0.24
y_pred_binary=np.where(y_pred_average[:, 0]>thresh, 1,0)
X_test_new=X_test.copy()
X_test_new['pred']=y_pred_binary
# Store the fold-averaged probability, i.e. the same values the binary
# decision above is based on (not the output of the last fold only).
X_test_new['prediction_pro']=y_pred_average[:, 0]
X_test_new['product_id']=X_test_new['product_id'].astype(int)
# One comma-separated string of predicted product ids per user.
next_baskets=X_test_new[X_test_new.pred==1].groupby('user_id')['product_id'].apply(lambda x: ', '.join([str(e) for e in set(x)]))

In [None]:
# Preview the predicted baskets (index: user_id, value: product id string).
next_baskets

user_id
3       18370, 23650, 17668, 18599, 24810, 9387, 48523...
4       21573, 17769, 35469, 37646, 26576, 19057, 2707...
6       27521, 48679, 8424, 21903, 45007, 38293, 49401...
11      17794, 33037, 30480, 10644, 15261, 20383, 3357...
12      11520, 17794, 44422, 37646, 14992, 31506, 4968...
                              ...                        
2178    40706, 10504, 48395, 21137, 34451, 24852, 4968...
2181    21376, 30338, 36036, 27077, 29373, 24810, 3947...
2183    17794, 47492, 47626, 25355, 21903, 24852, 2946...
2188    13834, 40592, 19348, 5782, 39321, 39581, 47393...
2191    34050, 24964, 39812, 47626, 13451, 21903, 9366...
Name: product_id, Length: 791, dtype: object

In [None]:
# Lookup table product_id -> product_name, used to print readable baskets.
key = products["product_id"].values
value = products["product_name"].values
m = dict(zip(key, value))

In [None]:
def print_products(basket):
    """Print the product name for every id in a basket string.

    ``basket`` is a string of integer product ids separated by ', ' (comma
    and space). Each id is looked up in the module-level dictionary ``m``
    and the resulting name is printed on its own line, in the order given.
    """
    product_ids = (int(token) for token in basket.split(', '))
    for product_id in product_ids:
        print(m[product_id])

In [None]:
# Print the predicted next basket of one user as product names.
# next_baskets is indexed by user_id, so the user must have at least one
# predicted product for the lookup to succeed.
user_id = 3
print(f"For user: {user_id}")
print("*"*60)
print("Predicted Next Basket is:")
print("*"*60)
print_products(next_baskets[user_id])
print("*"*60)

For user: 3
************************************************************
Predicted Next Basket is:
************************************************************
Organic Shredded Mozzarella
Electrolyte Water
Unsweetened Chocolate Almond Breeze Almond Milk
Garlic Couscous
Organic Lightly Salted Brown Rice Cakes
Granny Smith Apples
Birthday Cake Light Ice Cream
Whole Almonds
Green Beans
Organic Baby Spinach
Crackers, Crispy, Cheddar
Organic Whole String Cheese
Organic Peeled Whole Baby Carrots
Vanilla Unsweetened Almond Milk
Organic Avocado
Organic Baby Carrots
All Natural No Stir Creamy Almond Butter
Strawberries
************************************************************


## Conclusion
In this project, I developed a neural network model that predicts which products will be in a user's next basket. Combined with feature engineering, the model captures both the sequential purchasing behavior and the general taste of each user.