In [1]:
#importing required libraries
import pandas as pd
import numpy as np
import glob
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os
os.getcwd()

'/kaggle/working'

In [3]:
os.chdir("../input")
#Finding the list of files to uplod
pattern = '*.csv'
csv_files = glob.glob(pattern)
print(csv_files)

['orders.csv', 'departments.csv', 'products.csv', 'order_products__train.csv', 'order_products__prior.csv', 'aisles.csv']


In [4]:
#uploading files
aisles = pd.read_csv("aisles.csv", dtype={'aisle_id': np.uint8, 'aisle': 'category'})
departments = pd.read_csv("departments.csv", dtype={'department_id': np.uint8, 'department': 'category'})
orders = pd.read_csv("orders.csv", dtype = {'order_id': np.uint32,
                                            'user_id': np.uint32,
                                            'eval_set': 'category',
                                            'order_number': np.uint8,
                                            'order_dow': np.uint8,
                                            'order_hour_of_day': np.uint8})
orders_products_prior = pd.read_csv("order_products__prior.csv", dtype={'order_id': np.uint32,
                                                                        'product_id': np.uint16,
                                                                        'add_to_cart_order': np.uint8,
                                                                        'reordered': bool})
orders_products_train = pd.read_csv("order_products__train.csv", dtype={'order_id': np.uint32,
                                                                        'product_id': np.uint16,
                                                                        'add_to_cart_order': np.uint8,
                                                                        'reordered': bool})
products = pd.read_csv("products.csv", dtype={'product_id': np.uint16,
                                              'aisle_id': np.uint8,
                                              'department_id': np.uint8})

In [5]:
data_list = [aisles, departments, orders, 
             orders_products_prior, orders_products_train, products]
data_list_name = ['aisles', 'departments', 'orders', 
'orders_products_prior', 'orders_products_train', 'products']

j=0
for i in data_list:
    i.name = data_list_name[j]
    j += 1

In [6]:
#Checking for null values
#Creating a function to find null values
def null_columns(x):
    y = x.columns[x.isnull().any()]
    return y

In [7]:
for dataset in data_list:
    if len(null_columns(dataset)) == 0:
        print('Dataset ' + dataset.name + ' has no null values ')
    else:
        print('Dataset '+ dataset.name + ' has null values in column '+ str([i for i in null_columns(dataset)]))

Dataset aisles has no null values 
Dataset departments has no null values 
Dataset orders has null values in column ['days_since_prior_order']
Dataset orders_products_prior has no null values 
Dataset orders_products_train has no null values 
Dataset products has no null values 


In [8]:
#Dataset orders has null values in column ['days_since_prior_order'] 
# Other Columns have no null values
#Finding the number of null values in orders.days_since_prior_order
sum(orders.days_since_prior_order.isnull())

206209

In [9]:
#There are 206209 null values
#Lets examine the null values
orders.loc[orders.days_since_prior_order.isnull()==True,['user_id', 'order_number']]

Unnamed: 0,user_id,order_number
0,1,1
11,2,1
26,3,1
39,4,1
45,5,1
50,6,1
54,7,1
75,8,1
79,9,1
83,10,1


In [10]:
#days_since_prior_order attribue for first order for all the users is null so it can be changed to 0

orders.days_since_prior_order = orders.days_since_prior_order.fillna(int(0))

## Feature Engineering

In [11]:
prior_prods = pd.DataFrame()
prior_prods['prior_orders'] = orders_products_prior.groupby(orders_products_prior.product_id).size().astype(np.int32)
prior_prods['prior_reorders'] = orders_products_prior['reordered'].groupby(orders_products_prior.product_id).sum().astype(np.float32)
prior_prods['prior_reorder_rate'] = (prior_prods.prior_reorders / prior_prods.prior_orders).astype(np.float32)
prior_products = products.join(prior_prods, on='product_id')

In [12]:
prior_products['is_gluten_free'] = prior_products['product_name'].str.contains('gluten' and 'free').astype(np.uint8)
prior_products['is_organic'] = prior_products['product_name'].str.contains('organic' or 'Organic').astype(np.uint8)
prior_products['is_low_fat'] = prior_products['product_name'].str.contains('low' and 'fat').astype(np.uint8)

In [13]:
prior_avg_add_to_cart = orders_products_prior.groupby('product_id')['add_to_cart_order'].mean().to_frame('prior_avg_add_to_cart').astype(np.float32).reset_index()

In [14]:
prior_products = prior_products.merge(prior_avg_add_to_cart, how='left', left_on='product_id', right_on='product_id')
prior_products = prior_products.replace([np.inf, -np.inf], np.nan)
prior_products = prior_products.fillna(0)

In [15]:
# mean order hour and day for all the products
ord_plus_orderprior = orders_products_prior.merge(orders, how='left', on='order_id')
ord_days = ord_plus_orderprior.groupby('product_id')['order_dow'].mean().to_frame('product_avg_orderdow').astype(np.float32).reset_index()
ord_days = ord_days.merge(ord_plus_orderprior.groupby
               ('product_id')['order_hour_of_day'].mean().to_frame
               ('product_avg_orderhour').astype(np.float32).reset_index(), on = 'product_id')
prior_products = prior_products.merge(ord_days, how = 'left', on='product_id')

In [16]:
#Top_100_reordered_products_prior
top_100_reordered_products = pd.DataFrame(orders_products_prior.loc[orders_products_prior.reordered == 
                                                       True].groupby('product_id').size().sort_values(ascending=False).head(100).reset_index(name='counts'))
top_100_reordered_products_list = list(top_100_reordered_products.product_id)
is_top_100_reordered = []
for i in prior_products.product_id:
    is_top_100_reordered += [i in top_100_reordered_products_list]
prior_products['is_top_100_reordered'] = is_top_100_reordered
prior_products['is_top_100_reordered'] = prior_products['is_top_100_reordered'].astype(np.uint8)

In [17]:
prior_products.head().T

Unnamed: 0,0,1,2,3,4
product_id,1,2,3,4,5
product_name,Chocolate Sandwich Cookies,All-Seasons Salt,Robust Golden Unsweetened Oolong Tea,Smart Ones Classic Favorites Mini Rigatoni Wit...,Green Chile Anytime Sauce
aisle_id,61,104,94,38,5
department_id,19,13,7,1,13
prior_orders,1852,90,277,329,15
prior_reorders,1136,12,203,147,9
prior_reorder_rate,0.613391,0.133333,0.732852,0.446809,0.6
is_gluten_free,0,0,0,0,0
is_organic,0,0,0,0,0
is_low_fat,0,0,0,0,0


In [18]:
#Merging with products
orders_products_prior = orders_products_prior.merge(products, on='product_id')
orders_products_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,33120,1,True,Organic Egg Whites,86,16
1,26,33120,5,False,Organic Egg Whites,86,16
2,120,33120,13,False,Organic Egg Whites,86,16
3,327,33120,5,True,Organic Egg Whites,86,16
4,390,33120,28,True,Organic Egg Whites,86,16


In [19]:
#Top_30_reordered_aisle_prior
top_30_reordered_aisles = pd.DataFrame(orders_products_prior.loc[orders_products_prior.reordered ==
                                                                 True].groupby('aisle_id').size().sort_values(ascending=False).head(30).reset_index(name='counts'))
top_30_reordered_aisles_list = list(top_30_reordered_aisles.aisle_id)

#Top_10_reordered_deparments_prior
top_10_reordered_department = pd.DataFrame(orders_products_prior.loc[orders_products_prior.reordered == 
                                                       True].groupby('department_id').size().sort_values(ascending=False).head(10).reset_index(name='counts'))
top_10_reordered_departments_list = list(top_10_reordered_department.department_id)

In [20]:
#Merging with products
#Adding Order info to prior orders
orders_products_prior = orders_products_prior.merge(orders, on='order_id')
orders_products_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,True,Organic Egg Whites,86,16,202279,prior,3,5,9,8.0
1,2,28985,2,True,Michigan Organic Kale,83,4,202279,prior,3,5,9,8.0
2,2,9327,3,False,Garlic Powder,104,13,202279,prior,3,5,9,8.0
3,2,45918,4,True,Coconut Butter,19,13,202279,prior,3,5,9,8.0
4,2,30035,5,False,Natural Sweetener,17,13,202279,prior,3,5,9,8.0


### User_features

In [21]:
#User Features
user = pd.DataFrame()
user['mean_days_between_orders'] = orders.loc[orders.eval_set=='prior',].groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
user['number_of_orders'] = orders.loc[orders.eval_set=='prior',].groupby('user_id').size().astype(np.int16)
user = user.reset_index()
user.head()

Unnamed: 0,user_id,mean_days_between_orders,number_of_orders
0,1,17.6,10
1,2,14.142858,14
2,3,11.083333,12
3,4,11.0,5
4,5,10.0,4


In [22]:
# Order similarity per user
orders_set = orders_products_prior.groupby(['user_id', 'order_number'])['product_id'].apply(set)
orders_set = orders_set.to_frame('products').reset_index()
orders_set['next_order_products'] = orders_set.groupby('user_id')['products'].shift(-1)
orders_set = orders_set.dropna()

# The Jaccard similarity measures similarity between 
# finite sample sets, and is defined as the cardinality of the intersection of sets 
# divided by the cardinality of the union of the sample sets. 

def jaccard_similarity(x,y):
    intersection_len = len(set.intersection(x,y))
    union_len = len(set.union(x,y))
    return intersection_len/union_len

orders_set['jaccard_similarity'] = orders_set.apply(lambda x: jaccard_similarity(x['products'], x['next_order_products']), axis=1)

In [23]:
order_similarity = orders_set.groupby('user_id')['jaccard_similarity'].mean().to_frame('mean_order_similarity').astype(np.float32).reset_index()

In [24]:
#User features based on prior orders
prior_users = pd.DataFrame()
prior_users['prior_total_items'] = orders_products_prior.groupby('user_id').size().astype(np.int16)
prior_users['prior_all_products'] = orders_products_prior.groupby('user_id')['product_id'].apply(set)
prior_users['total_distinct_products'] = prior_users.prior_all_products.map(len).astype(np.int16)

prior_users = prior_users.reset_index()
prior_users = prior_users.merge(user, on = 'user_id')
del user
prior_users['average_products_per_basket'] = (prior_users.prior_total_items / prior_users.number_of_orders).astype(np.float32)
prior_users = pd.DataFrame(prior_users)
prior_users = prior_users.merge(order_similarity, how='left', on = 'user_id')
prior_users.head()

Unnamed: 0,user_id,prior_total_items,prior_all_products,total_distinct_products,mean_days_between_orders,number_of_orders,average_products_per_basket,mean_order_similarity
0,1,59,"{17122, 196, 46149, 26405, 14084, 13032, 26088...",18,17.6,10,5.9,0.581393
1,2,195,"{45066, 2573, 18961, 1559, 32792, 23, 22559, 1...",102,14.142858,14,13.928572,0.140149
2,3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,11.083333,12,7.333333,0.253725
3,4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,11.0,5,3.6,0.0
4,5,37,"{11777, 28289, 40706, 48775, 20754, 6808, 1398...",23,10.0,4,9.25,0.210317


In [25]:
user_days = orders_products_prior.groupby('user_id')['order_dow'].mean().to_frame('user_avg_orderdow').astype(np.float32).reset_index()
user_days = user_days.merge(orders_products_prior.groupby
                           ('user_id')['order_hour_of_day'].mean().to_frame
                            ('user_avg_orderhour').astype(np.float32).reset_index(), on = 'user_id')
prior_users = prior_users.merge(user_days, how= 'left', on = 'user_id')
prior_users.head().T

Unnamed: 0,0,1,2,3,4
user_id,1,2,3,4,5
prior_total_items,59,195,88,18,37
prior_all_products,"{17122, 196, 46149, 26405, 14084, 13032, 26088...","{45066, 2573, 18961, 1559, 32792, 23, 22559, 1...","{17668, 44683, 48523, 21903, 14992, 21137, 324...","{21573, 42329, 17769, 35469, 37646, 1200, 1905...","{11777, 28289, 40706, 48775, 20754, 6808, 1398..."
total_distinct_products,18,102,33,17,23
mean_days_between_orders,17.6,14.1429,11.0833,11,10
number_of_orders,10,14,12,5,4
average_products_per_basket,5.9,13.9286,7.33333,3.6,9.25
mean_order_similarity,0.581393,0.140149,0.253725,0,0.210317
user_avg_orderdow,2.64407,2.00513,1.01136,4.72222,1.62162
user_avg_orderhour,10.5424,10.441,16.3523,13.1111,15.7297


### User X Product Features

In [26]:
#User X Product features
orders_products_prior['user_product'] = orders_products_prior.product_id + orders_products_prior.user_id * 100000


In [27]:
userXproduct = orders_products_prior.groupby('user_product').size().astype(np.uint32).reset_index(name = 'nb_counts')
a = orders_products_prior.groupby('user_product')['order_id'].max().astype(np.uint32).reset_index(name = 'last_order_id')
avg_pos_in_cart = orders_products_prior.groupby('user_product')['add_to_cart_order'].mean().astype(np.float32).reset_index(name = 'mean_pos_in_cart')
mean_reorder = orders_products_prior.groupby('user_product')['reordered'].mean().astype(np.float32).reset_index(name = 'UP_mean_reorder_rate')

In [28]:
userXproduct = userXproduct.merge(a, on='user_product')
userXproduct = userXproduct.merge(avg_pos_in_cart, on='user_product')
userXproduct = userXproduct.merge(mean_reorder, on='user_product')

In [29]:
userXproduct.head()

Unnamed: 0,user_product,nb_counts,last_order_id,mean_pos_in_cart,UP_mean_reorder_rate
0,462,3,2557077,7.333333,0.666667
1,1077,2,2320368,21.0,0.5
2,2857,2,1132311,13.5,0.5
3,3176,3,1190356,14.0,0.666667
4,3210,1,1532072,8.0,0.0


In [30]:
#Train test Split
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']

In [31]:
def create_dummies(df, colname):
    col_dummies = pd.get_dummies(df[colname], prefix=colname)
    col_dummies.drop(col_dummies.columns[0], axis=1, inplace=True)
    df = pd.concat([df, col_dummies], axis=1)
    df.drop( colname, axis = 1, inplace = True )
    return df

In [32]:
orders_products_train_1 = orders_products_train.copy()
orders_products_train_1.set_index(['order_id', 'product_id'], inplace=True)

In [33]:
def df_initiate(df, labels_given=False):
    order_list = []
    product_list = []
    labels = []
    i=0
    for row in df.itertuples():
        i+=1
        if i%10000 == 0: print('order row',i)
        order_id = row.order_id
        user_id = row.user_id
        user_products = prior_users.loc[prior_users.user_id == user_id, 'prior_all_products'].values[0]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in orders_products_train_1.index for product in user_products]
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    if labels_given:
        df['y'] = labels
    del order_list
    del product_list
    return df

In [34]:
def user_related_features(df):
    df['user_id'] = df.order_id.map(orders.user_id).astype(np.uint32)
    df = df.merge(prior_users, on = 'user_id', how = 'left').drop('prior_all_products', axis=1)
    return df

In [35]:
def order_related_features(df):
    df = df.merge(orders[['order_id', 'order_dow','order_hour_of_day', 'days_since_prior_order']], on = 'order_id', how='left')
    df['days_since_ratio'] = (df.days_since_prior_order / df.mean_days_between_orders).astype(np.float32)
    return df

In [36]:
def product_related_features(df):
    prior_products_x = prior_products.drop('product_name', axis=1)
    df = df.merge(prior_products_x, on='product_id', how='left')
    return df

In [37]:
def user_product(df):
    df['user_product'] = df.product_id + df.user_id * 100000
    df = df.merge(userXproduct, on='user_product', how='left')
    df = df.rename(columns = {'nb_counts' : 'UP_orders', 'last_order_id': 'UP_last_order_id', 'mean_pos_in_cart': 'UP_mean_pos_in_cart'})
    df['UP_orders_ratio'] = (df.UP_orders / df.number_of_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.number_of_orders - df.UP_last_order_id.map(orders.order_number)
    df.drop(['UP_last_order_id', 'user_product', 'user_id'], axis=1, inplace=True)
    return df

In [38]:
def top_aisle_department(df):
    is_aisle_top_30_reordered = []
    for i in df.aisle_id:
        is_aisle_top_30_reordered += [i in top_30_reordered_aisles_list]
    df['is_aisle_top_30_reordered'] = is_aisle_top_30_reordered
    df['is_aisle_top_30_reordered'] = df['is_aisle_top_30_reordered'].astype(np.uint8)    

    is_department_top_10_reordered = []
    for i in df.department_id:
        is_department_top_10_reordered += [i in top_10_reordered_departments_list]
    df['is_department_top_10_reordered'] = is_department_top_10_reordered
    df['is_department_top_10_reordered'] = df['is_department_top_10_reordered'].astype(np.uint8)
    return df

In [39]:
def dummy_var(df):
    df = create_dummies(df, 'order_dow')
    df = create_dummies(df, 'order_hour_of_day')
    return df

In [None]:
print("Training Set")
df_train = df_initiate(train_orders, labels_given=True)

Training Set


In [None]:
df_train = user_related_features(df_train)
df_train = order_related_features(df_train)
df_train = product_related_features(df_train)
df_train = user_product(df_train)
df_train = top_aisle_department(df_train)
df_train = dummy_var(df_train)

In [None]:
df_test = df_initiate(test_orders)

In [None]:
df_test = user_related_features(df_test)
df_test = order_related_features(df_test)
df_test = product_related_features(df_test)
df_test = user_product(df_test)
df_test = top_aisle_department(df_test)
df_test = dummy_var(df_test)

In [None]:
len(set(df_train.order_id))

In [None]:
df_train.to_pickle('/kaggle/working/df_train')
df_test.to_pickle('/kaggle/working/df_test')

In [None]:
import lightgbm as lgb

In [None]:
cols_to_use = ['prior_total_items',
       'total_distinct_products', 'mean_days_between_orders',
       'number_of_orders', 'average_products_per_basket',
       'mean_order_similarity', 'user_avg_orderdow', 'user_avg_orderhour',
       'days_since_prior_order', 'days_since_ratio', 'prior_orders', 'prior_reorders', 'prior_reorder_rate',
       'is_gluten_free', 'is_organic', 'is_low_fat', 'prior_avg_add_to_cart',
       'product_avg_orderdow', 'product_avg_orderhour', 'is_top_100_reordered',
       'UP_orders', 'UP_mean_pos_in_cart', 'UP_mean_reorder_rate',
       'UP_orders_ratio', 'UP_orders_since_last', 'is_aisle_top_30_reordered',
       'is_department_top_10_reordered', 'order_dow_1', 'order_dow_2',
       'order_dow_3', 'order_dow_4', 'order_dow_5', 'order_dow_6',
       'order_hour_of_day_1', 'order_hour_of_day_2', 'order_hour_of_day_3',
       'order_hour_of_day_4', 'order_hour_of_day_5', 'order_hour_of_day_6',
       'order_hour_of_day_7', 'order_hour_of_day_8', 'order_hour_of_day_9',
       'order_hour_of_day_10', 'order_hour_of_day_11', 'order_hour_of_day_12',
       'order_hour_of_day_13', 'order_hour_of_day_14', 'order_hour_of_day_15',
       'order_hour_of_day_16', 'order_hour_of_day_17', 'order_hour_of_day_18',
       'order_hour_of_day_19', 'order_hour_of_day_20', 'order_hour_of_day_21',
       'order_hour_of_day_22', 'order_hour_of_day_23']

In [None]:
labels = df_train['y']

In [None]:
d_train = lgb.Dataset(df_train[cols_to_use], label=labels)

In [None]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 500

In [None]:
model = lgb.train(params, d_train, ROUNDS)

In [None]:
preds = model.predict(df_test[cols_to_use])

In [None]:
df_test['pred'] = preds
TRESHOLD = 0.22

In [None]:
d = dict()
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

In [None]:
sub = pd.DataFrame.from_dict(d, orient='index')

# sub.reset_index(inplace=True)
# sub.columns = ['order_id', 'products']
# sub.to_csv('/kaggle/working/sub.csv', index=False)

In [None]:
os.listdir('/kaggle/working/')

In [None]:
sub.shape