Prediction of product reorder in an order<br>
This code is based on the analysis done in clusterl<br>
https://www.kaggle.com/paulantoine/light-gbm-benchmark-0-3692

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np
import pandas as pd
import math
import lightgbm as lgb
import matplotlib.pyplot as plt
import operator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
IDIR = '../input/'

In [None]:
# The prior and train order_products files are read with the same column dtypes.
_order_products_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}

print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv', dtype=dict(_order_products_dtypes))

print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv', dtype=dict(_order_products_dtypes))

print('loading orders')
_orders_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'eval_set': 'category',
    'order_number': np.int16,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
    'days_since_prior_order': np.float32,
}
orders = pd.read_csv(IDIR + 'orders.csv', dtype=_orders_dtypes)

print('loading products')
# Only the three columns named in usecols are read; the 'order_id' dtype
# entry is carried over unchanged from the original cell.
_products_dtypes = {
    'product_id': np.uint16,
    'order_id': np.int32,
    'aisle_id': np.uint8,
    'department_id': np.uint8,
}
products = pd.read_csv(IDIR + 'products.csv',
                       dtype=_products_dtypes,
                       usecols=['product_id', 'aisle_id', 'department_id'])


In [None]:
# Report the shape and column names of each loaded frame.
for template, frame in (('priors {}: {}', priors),
                        ('orders {}: {}', orders),
                        ('train {}: {}', train)):
    print(template.format(frame.shape, ', '.join(frame.columns)))

In [None]:
print('computing product f')
# Per-product counts over all prior order lines: how often the product was
# ordered, how often it was a reorder, and the ratio of the two.
_reordered_by_product = priors['reordered'].groupby(priors.product_id)
prods = pd.DataFrame({
    'orders': _reordered_by_product.size().astype(np.int32),
    'reorders': _reordered_by_product.sum().astype(np.float32),
})
prods['reorder_rate'] = (prods['reorders'] / prods['orders']).astype(np.float32)
# Attach the statistics, then index by product_id (kept as a column too) so
# later cells can use the frame as a product_id lookup.
products = products.join(prods, on='product_id').set_index('product_id', drop=False)
del prods

In [None]:
print('add order info to priors')
# Index orders by order_id while keeping the column, so the frame can be
# joined on order_id here and used as an order_id lookup in later cells.
orders = orders.set_index('order_id', drop=False)
# The join duplicates order_id as 'order_id_' (rsuffix); discard that copy.
priors = priors.join(orders, on='order_id', rsuffix='_').drop('order_id_', axis=1)

In [None]:
print('computing user f')
# Per-user statistics computed from the orders table.
_orders_by_user = orders.groupby('user_id')
usr = pd.DataFrame({
    'average_days_between_orders': _orders_by_user['days_since_prior_order'].mean().astype(np.float32),
    'nb_orders': _orders_by_user.size().astype(np.int16),
})

In [None]:
# Per-user statistics computed from the prior order lines.
_priors_by_user = priors.groupby('user_id')
users = pd.DataFrame()
users['total_items'] = _priors_by_user.size().astype(np.int16)
users['all_products'] = _priors_by_user['product_id'].apply(set)
users['total_distinct_items'] = users['all_products'].map(len).astype(np.int16)
users['user_max_order_num'] = _priors_by_user['order_number'].max().astype(np.int16)
# Largest number of prior order lines the user has for any single product.
_lines_per_user_product = priors.groupby(['user_id', 'product_id']).size()
users['total_buy_max'] = _lines_per_user_product.groupby(level='user_id').max().astype(np.int16)
# Bring in the order-level statistics computed in the previous cell.
users = users.join(usr)
del usr
users['average_basket'] = (users['total_items'] / users['nb_orders']).astype(np.float32)
print('user f', users.shape)

In [None]:
print('compute userXproduct f - this is long...')
# Combine user_id and product_id into one integer key per (user, product).
# NOTE(review): user_id is loaded as int32 and product_id as uint16, so this
# arithmetic is done in int32; user_id * 100000 exceeds the int32 range for
# user ids above 21474 and would wrap silently. features() builds its lookup
# key with the same arithmetic, so both places must be changed together
# (e.g. cast user_id to int64 in both) - TODO confirm against the data.
priors['user_product'] = priors.user_id * 100000 + priors.product_id

In [None]:
# For every user_product key accumulate a 4-tuple:
#   (number of order lines,
#    largest  (order_number, order_id) seen,
#    smallest (order_number, order_id) seen,
#    sum of add_to_cart_order)
d = {}
for row in priors.itertuples():
    key = row.user_product
    stamp = (row.order_number, row.order_id)
    seen = d.get(key)
    if seen is None:
        d[key] = (1, stamp, stamp, row.add_to_cart_order)
    else:
        count, latest, earliest, pos_sum = seen
        d[key] = (count + 1,
                  max(latest, stamp),
                  min(earliest, stamp),
                  pos_sum + row.add_to_cart_order)

In [None]:
print('to dataframe (less memory)')
# One row per user_product key; the tuple stored for each key is spread over
# the columns (named in the next cell). The dict is deleted once converted.
userXproduct = pd.DataFrame.from_dict(d, orient='index')
del d

In [None]:
userXproduct.columns = ['nb_orders', 'last_order_id','first_order_number', 'sum_pos_in_cart']
# The two middle columns hold (order_number, order_id) tuples: keep the
# order_id of the latest one and the order_number of the earliest one.
userXproduct['nb_orders'] = userXproduct['nb_orders'].astype(np.int16)
userXproduct['last_order_id'] = (
    userXproduct['last_order_id'].map(operator.itemgetter(1)).astype(np.int32))
userXproduct['first_order_number'] = (
    userXproduct['first_order_number'].map(operator.itemgetter(0)).astype(np.int16))
userXproduct['sum_pos_in_cart'] = userXproduct['sum_pos_in_cart'].astype(np.int16)
print('user X product f', len(userXproduct))

userXproduct.head()

In [None]:
# priors is not referenced by any later cell; drop the reference.
del priors

In [None]:
print('split orders : train, test')
# Select the orders of each evaluation set by their eval_set tag.
_eval_set = orders['eval_set']
test_orders = orders.loc[_eval_set == 'test']
train_orders = orders.loc[_eval_set == 'train']

In [None]:
# Index train by (order_id, product_id), keeping both columns, so features()
# can test `(order_id, product) in train.index` when building labels.
train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

In [None]:
def features(selected_orders, labels_given=False):
    """Build the candidate (order, product) feature matrix for a set of orders.

    For every order in ``selected_orders`` one candidate row is created per
    product in the ordering user's ``users.all_products`` set. Features are
    looked up from the module-level frames ``users``, ``orders``, ``products``
    and ``userXproduct``; ``train`` is read only when ``labels_given`` is true.

    Parameters
    ----------
    selected_orders : DataFrame
        Rows must expose ``order_id`` and ``user_id``.
    labels_given : bool, default False
        When true, a candidate is labelled 1 if ``(order_id, product_id)`` is
        in ``train.index`` and 0 otherwise.

    Returns
    -------
    (df, labels) : tuple
        ``df`` holds the feature columns plus ``order_id`` and ``product_id``;
        both identifiers are kept because later cells group predictions by
        them. ``labels`` is an int8 array aligned with ``df`` (empty when
        ``labels_given`` is false).
    """
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    for i, row in enumerate(selected_orders.itertuples(), start=1):
        if i % 10000 == 0:
            print('order row', i)
        order_id = row.order_id
        user_products = users.all_products[row.user_id]
        # The same (unmodified) set is iterated for products and labels, so
        # both lists stay aligned.
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in train.index for product in user_products]

    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list

    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] = df.user_id.map(users.average_basket)
    df['user_total_buy_max'] = df.user_id.map(users.total_buy_max).astype(np.int16)

    print('order related features')
    df['order_dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    # A NaN ratio (missing operand or 0/0) is replaced by 0.
    df['days_since_ratio'] = (df.days_since_prior_order / df.user_average_days_between_orders).fillna(0).astype(np.float32)

    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate).astype(np.float32)

    print('user_X_product related features')
    # Same key arithmetic as priors['user_product'], which indexes userXproduct.
    # NOTE(review): both operands are int32 here, so user_id * 100000 can wrap
    # for large user ids; any change must be mirrored where the key is built.
    df['z'] = df.user_id * 100000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    df['UP_reorder_rate'] = ((df.UP_orders-1) / (df.user_total_orders-1).astype(np.float32))

    # Lookups shared by several features below; computed once.
    last_order = df.UP_last_order_id
    orders_since_last = df.user_total_orders - last_order.map(orders.order_number)
    orders_since_first = df.user_total_orders - df.z.map(userXproduct.first_order_number)

    df['UP_orders_since_last'] = orders_since_last
    # Hour and day-of-week differences are circular (wrap at 24 and 7).
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - last_order.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    df['UP_delta_dow_vs_last'] = abs(df.order_dow - last_order.map(orders.order_dow)).map(lambda x: min(x, 7-x)).astype(np.int8)
    # np.float was only an alias of the builtin float (i.e. float64) and has
    # been removed from NumPy; use the explicit dtype.
    df['UP_drop_chance'] = orders_since_last.astype(np.float64)
    df['UP_chance_vs_bought'] = orders_since_first.astype(np.float32)
    df['UP_chance'] = (df.UP_orders - 1)/orders_since_first.astype(np.float32)
    df['UP_chance_ratio'] = (1/orders_since_last - (df.UP_orders - 1)/orders_since_first).astype(np.float32)
    df.drop(['UP_last_order_id','z'], axis=1, inplace=True)
    # order_id and product_id are intentionally NOT dropped: the original
    # `df.drop([...], axis=1)` discarded its result (a no-op), and the later
    # cells rely on both columns being present.
    print(df.dtypes)
    print(df.memory_usage())
    return (df, labels)

In [None]:
# Build candidate rows and 0/1 labels for the orders tagged 'train'.
df_train, labels = features(train_orders, labels_given=True)

In [None]:
# Columns fed to LightGBM. order_id and product_id are deliberately absent:
# they stay in the frames only to group predictions per order.
# The list order defines the column order of every frame passed to the model.
features_to_use = ['user_total_orders', 'user_total_items',
       'total_distinct_items', 'user_average_days_between_orders',
       'user_average_basket', 'order_dow',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
       'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
       'UP_delta_hour_vs_last', 'UP_delta_dow_vs_last', 'UP_drop_chance',
       'UP_chance_vs_bought', 'user_total_buy_max', 'UP_chance', 'UP_chance_ratio','aisle_id']

In [None]:
# Hold out 20% of the candidate rows (fixed seed) as a CV set for choosing
# parameters and the decision threshold. The split is over rows, so the
# candidates of a single order can land on both sides.
d_train, d_cv, l_train, l_cv = train_test_split(df_train, labels, test_size=0.2, random_state=42)

In [None]:
print('formating for lgb')
# Only the features_to_use columns are passed to LightGBM; aisle_id and
# department_id are declared categorical.
d_train_gbm = lgb.Dataset(d_train[features_to_use],
                      label=l_train,
                      categorical_feature=['aisle_id', 'department_id']) 

In [None]:
# Baseline parameters for the CV model. The commented-out keys are not passed,
# so LightGBM uses its own defaults for them.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
#     'num_leaves': 96,
#     'max_depth': 10,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.95,
#     'bagging_freq': 5
}
# Number of boosting rounds passed to lgb.train.
ROUNDS = 100

In [None]:
print('light GBM train :-)')
# Model fitted on the 80% split only; used for CV scoring and feature importance.
bst = lgb.train(params, d_train_gbm, ROUNDS)

In [None]:
# Plot the feature importances of the CV model.
lgb.plot_importance(bst)

In [None]:
#Checking on CV
print('light GBM predict')
# Predict on exactly the columns the model was trained on. d_cv also carries
# order_id and product_id, which must not be fed to the model.
preds = bst.predict(d_cv[features_to_use])


In [None]:
# Store the predicted values next to the CV candidates.
d_cv['pred'] = preds

In [None]:
# Attach the held-out labels from the split so rows can be compared with 'pred'.
d_cv['given_label'] = l_cv
d_cv.head()

In [None]:
#Collect, per CV order, the space-separated ids of the products labelled 1
given_by_order = {}
for row in d_cv.itertuples():
    if row.given_label == 1:
        # Explicit membership test instead of a bare `except:` so unrelated
        # errors are not silently swallowed.
        if row.order_id in given_by_order:
            given_by_order[row.order_id] += ' ' + str(row.product_id)
        else:
            given_by_order[row.order_id] = str(row.product_id)

# Orders without any positive label get the literal 'None'.
for order in d_cv.order_id:
    if order not in given_by_order:
        given_by_order[order] = 'None'

#Creating dataframe of given products: one row per order_id
given_prods = pd.DataFrame(list(given_by_order.items()), columns=['order_id', 'products'])
#print(given_prods.head())

In [None]:
#Eval function
def eval_fun(labels, preds):
    """Return (precision, recall, f1) for one order.

    Parameters
    ----------
    labels : str
        Space-separated true product ids (or the literal 'None').
    preds : str
        Space-separated predicted product ids (or the literal 'None').

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``; f1 is 0.0 when nothing overlaps.
        'None' is treated as an ordinary token, so 'None' vs 'None' scores 1.0.
    """
    labels = labels.split(' ')
    preds = preds.split(' ')
    # intersect1d de-duplicates, so each shared id is counted once.
    n_common = len(np.intersect1d(labels, preds))
    # str.split always yields at least one token, so neither length is zero.
    # Plain true division replaces np.float, which was removed from NumPy.
    precision = n_common / len(preds)
    recall = n_common / len(labels)
    if precision + recall == 0:
        return (precision, recall, 0.0)
    f1 = 2 * precision * recall / (precision + recall)
    return (precision, recall, f1)

In [None]:
##Creating label column based on thresold
# Must be an iterable of thresholds: the loop below iterates over it
# (a bare float here raises TypeError). Add more values to sweep.
thresold = [0.5]
f1_scores=[]
for t in thresold:
    d_cv['pred_label'] = np.where(d_cv['pred'] > t, 1, 0)

    #Collect, per order, the space-separated ids of the predicted products
    pred_by_order = {}
    for row in d_cv.itertuples():
        if row.pred_label == 1:
            # Explicit membership test instead of a bare `except:`.
            if row.order_id in pred_by_order:
                pred_by_order[row.order_id] += ' ' + str(row.product_id)
            else:
                pred_by_order[row.order_id] = str(row.product_id)

    # Orders with no predicted product get the literal 'None'.
    for order in d_cv.order_id:
        if order not in pred_by_order:
            pred_by_order[order] = 'None'

    #Creating dataframe of predicted products
    pred_prods = pd.DataFrame(list(pred_by_order.items()), columns=['order_id', 'products'])
    #print(pred_prods.head())

    #Merging predicted and given: products_x = predicted, products_y = given
    merge_eval = pd.merge(pred_prods, given_prods, how='inner', on='order_id')
    #print(merge_eval.head())

    # eval_fun expects (labels, preds): pass the given products first so that
    # precision and recall are not swapped (f1 itself is symmetric).
    res = [eval_fun(entry.products_y, entry.products_x)
           for entry in merge_eval.itertuples(index=False)]

    res = pd.DataFrame(np.array(res), columns=['precision', 'recall', 'f1'])
    #print(res.head())

    # Mean of the per-order F1 scores.
    f1_score = np.mean(res['f1'])

    f1_scores.append(f1_score)

print(f1_scores)

### Best Features
We obtain the best accuracy when we train on all the features 

### Best Parameters
number of leaves: 96(keeps on increasing so limiting)<br>
depth: 10<br>
feature fraction: 0.85 (0.4855)<br>
bagging fraction: 0.75 (0.4856)<br>
bagging frequency: 5<br>
learning rate: 0.01<br>
rounds: 100<br>

#### Please Note:
These are the best values of parameters we have obtained after looping through range of values for each parameter. <br>
We obtain a score of 36.4% (public) with these parameters; if we use the parameters originally mentioned in the reference code, we get a score of 38.09% (public). <br>
$\therefore$ we shall go ahead with the parameters mentioned in the reference code


### Predicting on test

In [None]:
# Dataset for the final model: all candidate rows of df_train (train and CV
# splits together), restricted to features_to_use.
d_train_bst = lgb.Dataset(df_train[features_to_use],
                      label=labels,
                      categorical_feature=['aisle_id', 'department_id'])  # , 'order_hour_of_day', 'dow'

In [None]:
# Parameters for the final model. This rebinds the global `params` and
# `ROUNDS`, which gbm_tune() also reads when it is called later.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 100

In [None]:
print('light GBM train :-)')
# Final model, trained on d_train_bst; used for the test predictions.
bst2 = lgb.train(params, d_train_bst, ROUNDS)

In [None]:
#Predicting on test
# labels_given is left False, so the returned label array is empty and ignored.
df_test, _ = features(test_orders)

In [None]:
print('light GBM predict')
# Predict with the final model on the training feature columns only.
preds = bst2.predict(df_test[features_to_use])

In [None]:
# Store the predictions next to the test candidates (read as row.pred below).
df_test['pred'] = preds

In [None]:
# df_train is not referenced by any later cell (d_train / d_cv are used instead).
del df_train

In [None]:
##Writing a function to generate f-score by giving threshold and top n features
def gbm_tune(n_features, thresold):
    """Train on the top-``n_features`` features and return the mean CV F1.

    Relies on module-level state: ``imp_features`` (list of (name, importance)
    sorted by importance, defined in the "Selecting top features" cells, which
    must have been run first), ``d_train``/``l_train``, ``d_cv``,
    ``given_prods``, ``params``, ``ROUNDS`` and ``eval_fun``.

    Side effect: overwrites the ``pred`` and ``pred_label`` columns of the
    global ``d_cv`` frame.

    Parameters
    ----------
    n_features : int
        Number of leading entries of ``imp_features`` to train on.
    thresold : float
        A candidate is predicted as reordered when its score is above this.

    Returns
    -------
    float
        Mean of the per-order F1 scores on the CV split.
    """
    feat_imp = [name for name, _ in imp_features[:n_features]]

    #Formatting for lgbm (best features)
    # Declare as categorical only those categorical columns that are actually
    # among the selected features; a hard-coded list fails or mis-declares
    # when 'aisle_id' is not selected or 'department_id' is.
    categorical = [col for col in ('aisle_id', 'department_id') if col in feat_imp]
    d_train_ftune = lgb.Dataset(d_train[feat_imp], label=l_train, categorical_feature=categorical)

    #lgbm training for best features
    lgbc_ftune = lgb.train(params, d_train_ftune, ROUNDS)

    #Predicting
    d_cv['pred'] = lgbc_ftune.predict(d_cv[feat_imp])

    ##Creating label column based on thresold
    d_cv['pred_label'] = np.where(d_cv['pred'] > thresold, 1, 0)

    #Collect, per order, the space-separated ids of the predicted products
    pred_by_order = {}
    for row in d_cv.itertuples():
        if row.pred_label == 1:
            # Explicit membership test instead of a bare `except:`.
            if row.order_id in pred_by_order:
                pred_by_order[row.order_id] += ' ' + str(row.product_id)
            else:
                pred_by_order[row.order_id] = str(row.product_id)

    # Orders with no predicted product get the literal 'None'.
    for order in d_cv.order_id:
        if order not in pred_by_order:
            pred_by_order[order] = 'None'

    #Creating dataframe of predicted products
    pred_prods = pd.DataFrame(list(pred_by_order.items()), columns=['order_id', 'products'])

    #Merging predicted and given: products_x = predicted, products_y = given
    merge_eval = pd.merge(pred_prods, given_prods, how='inner', on='order_id')

    # eval_fun expects (labels, preds); f1 is symmetric, so the score is the
    # same as before, but precision/recall are no longer swapped.
    res = [eval_fun(entry.products_y, entry.products_x)
           for entry in merge_eval.itertuples(index=False)]

    res = pd.DataFrame(np.array(res), columns=['precision', 'recall', 'f1'])

    f1_score = np.mean(res['f1'])

    return f1_score

In [None]:
# #Plotting for n-features vs f1-score
# thresold = [0.2,0.22,0.3,0.35,0.4,0.5,0.55]
# f1_scores = []
# for t in thresold:
#     f1 = gbm_tune(26,t)
#     print('t',t)
#     print('f1',f1)
#     f1_scores.append(f1)    


In [None]:
# plt.plot(thresold,f1_scores)

In [None]:
# TRESHOLD = 0.35 #Obtained using CV dataset
# d = dict()
# for row in df_test.itertuples():
#     if row.pred > TRESHOLD:
#         try:
#             d[row.order_id] += ' ' + str(row.product_id)
#         except:
#             d[row.order_id] = str(row.product_id)

# for order in test_orders.order_id:
#     if order not in d:
#         d[order] = 'None'

In [None]:
# sub = pd.DataFrame.from_dict(d, orient='index')

In [None]:
# #First submission
# sub.reset_index(inplace=True)
# sub.columns = ['order_id', 'products']

In [None]:
# sub.to_csv('sub0.csv', index=False)

In [None]:
TRESHOLD = 0.22 #Obtained using CV dataset
# order_id -> space-separated ids of the products predicted above the threshold
d = dict()
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        # Explicit membership test instead of a bare `except:` so unrelated
        # errors (e.g. a missing column) are not silently swallowed.
        if row.order_id in d:
            d[row.order_id] += ' ' + str(row.product_id)
        else:
            d[row.order_id] = str(row.product_id)

# Every test order needs a row; orders with no prediction get the literal 'None'.
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

In [None]:
# One row per order_id (as index) with the product string as the single column.
sub = pd.DataFrame.from_dict(d, orient='index')

In [None]:
#First submission
# Turn the order_id index into a regular column and name both columns.
sub = sub.reset_index()
sub.columns = ['order_id', 'products']

In [None]:
# Write the submission to the current directory, without the row index.
sub.to_csv('sub_LGBM.csv', index=False)

In [None]:
# Preview the first rows of the submission.
sub.head()

### Selecting top features

In [None]:
# Feature names and importances are taken from `bst`, the model trained on the
# CV split (not the final model `bst2`).
feature_names = list(bst.feature_name())
feature_importances = list(bst.feature_importance())
print(len(feature_names))
print(feature_importances)

In [None]:
#Pair every feature with its importance, most important first
# (ties keep their original relative order); gbm_tune() reads this list.
imp_features = sorted(zip(feature_names, feature_importances),
                      key=operator.itemgetter(1),
                      reverse=True)

imp_features

In [None]:
# #Plotting for n-features vs f1-score
# n_top_features = [5,10,15,20,25]
# f1_scores = []
# for n in n_top_features:
#     f1 = gbm_tune(n,0.22)
#     print('f1',f1)
#     print('n',n)
#     f1_scores.append(f1)
    
# plt.plot(n_top_features,f1_scores)

$\therefore$ best f1-score is obtained when all the features are used

### Parameter Tuning

In [None]:
# ##Grid Search to find best parameters
# f1_scores=[]
# #num_leaves = [60,96,120,200,500]
# #max_depth = [7,10,20,-1]
# #feature_fraction = [0.7,0.75,0.80,0.85,0.90,0.95]
# #bagging_fraction = [0.7,0.75,0.80,0.85,0.90,0.95,1.0]
# # bagging_freq = [1,5,10,20,100]
# # learning_rate = [0.001,0.005,0.01]
# rounds=[10,50,100,500]

# #for l in num_leaves:
# #for m in max_depth:
# #for f in feature_fraction:
# # for b in bagging_fraction:
# # for bf in bagging_freq:
# # for lr in learning_rate:
# for r in rounds:
#     params = {
#         'learning_rate': [lr],
#         'num_leaves': [96],
#         'boosting_type' : ['gbdt'],
#         'objective' : ['binary'],
#         'max_depth': 10,
#         'feature_fraction': 0.85,
#         'bagging_fraction': 0.75,
#         'bagging_freq': 5
#     }
    
#     #Formatting for lgbm(Best params)
#     d_train_ptune = lgb.Dataset(d_train[features_to_use], label=l_train, categorical_feature=['aisle_id','department_id'])
    
#     #lgbm training for best params
#     lgbc_ptune = lgb.train(params, d_train_ptune, r)
    
#     #Predicting
#     d_cv_ptune = d_cv[features_to_use]
#     pred_gbm = lgbc_ptune.predict(d_cv_ptune)
#     d_cv['pred'] = pred_gbm
    
#     ##Creating label column based on thresold
#     d_cv['pred_label'] = np.where(d_cv['pred'] > 0.22, 1, 0)
    
#     pred_prods = {}
#     for row in d_cv.itertuples():
#         if row.pred_label == 1:
#             try:
#                 pred_prods[row.order_id] += ' ' + str(row.product_id)
#             except:
#                 pred_prods[row.order_id] = str(row.product_id)
    
#     for order in d_cv.order_id:
#         if order not in pred_prods:
#             pred_prods[order] = 'None'
            
#     #Creating dataframe of predicted products
#     pred_prods = pd.DataFrame.from_dict(pred_prods, orient='index')
#     pred_prods.reset_index(inplace=True)
#     pred_prods.columns = ['order_id','products']
    
#     merge_eval = pd.merge(pred_prods, given_prods, how='inner', on='order_id')
#     #print(merge_eval.head())
    
#     res = list()
#     for entry in merge_eval.itertuples():
#         res.append(eval_fun(entry[2], entry[3]))
    
#     res = pd.DataFrame(np.array(res), columns=['precision', 'recall', 'f1'])
#     #print(res.head())
    
#     f1_score = np.mean(res['f1'])
    
#     f1_scores.append(f1_score)
    
# print('f1 scores:',f1_scores)
# #plt.plot(num_leaves,f1_scores)
# #plt.plot(max_depth,f1_scores)
# #plt.plot(feature_fraction,f1_scores)
# # plt.plot(bagging_fraction,f1_scores)
# # plt.plot(learning_rate,f1_scores)
# plt.plot(rounds,f1_scores)

# plt.xlabel('rounds')
# plt.ylabel('f-score')
# plt.title('f-score vs rounds')
    


In [None]:
##Best Parameters
# number of leaves: 96(keeps on increasing so limiting)
# depth: 10
# feature fraction: 0.85 (0.4855)
# bagging fraction: 0.75 (0.4856)
# bagging frequency: 5
# learning rate: 0.01
# rounds=100

In [None]:
# #Training GBM model for Grid Search
# grd_cv = lgb.LGBMClassifier(boosting_type= 'gbdt')

In [None]:
# # #Creating the grid
# gridCV = GridSearchCV(grd_cv, gridParams,
#                       verbose=0,
#                       cv=4,
#                       n_jobs=2)

In [None]:
# # #Running the Grid
# gridCV.fit(d_train, l_train)

In [None]:
# Display the F1 scores collected by the threshold sweep cell.
f1_scores