In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
import math 
import gc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
### List of created features
##product features
#number of orders
#number of reorders
# reorder rate (number of reorders / number of orders)

##user features
#average days between orders
# number of orders
# total items
# all products
# total distinct items
# average basket

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
# Directory prefix (with trailing slash) prepended to every CSV filename read below.
IDIR = '../input/'

In [None]:
# The prior and train order_products files are read with the same dtype map,
# so it is declared once and reused.
_order_products_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}

print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv', dtype=_order_products_dtypes)

print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv', dtype=_order_products_dtypes)

print('loading orders')
_orders_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'eval_set': 'category',
    'order_number': np.int16,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
    'days_since_prior_order': np.float32,
}
orders = pd.read_csv(IDIR + 'orders.csv', dtype=_orders_dtypes)

print('loading products')
# Only the id columns are kept (see usecols); the 'order_id' dtype entry is not
# among the columns that are read.
_products_dtypes = {
    'product_id': np.uint16,
    'order_id': np.int32,
    'aisle_id': np.uint8,
    'department_id': np.uint8,
}
products = pd.read_csv(
    IDIR + 'products.csv',
    dtype=_products_dtypes,
    usecols=['product_id', 'aisle_id', 'department_id'],
)

In [None]:
# Print "<name> <shape>: <comma-separated columns>" for each loaded table.
for _name, _frame in (('priors', priors), ('orders', orders), ('train', train)):
    print('{} {}: {}'.format(_name, _frame.shape, ', '.join(_frame.columns)))

In [None]:
print('computing product f')
# One groupby over the prior order lines, keyed by product_id.
_reordered_by_product = priors['reordered'].groupby(priors.product_id)
prods = pd.DataFrame({
    'orders': _reordered_by_product.size().astype(np.int32),
    'reorders': _reordered_by_product.sum().astype(np.float32),
})
prods['reorder_rate'] = (prods['reorders'] / prods['orders']).astype(np.float32)
# Attach the per-product stats, then index by product_id (kept as a column too)
# so that products.<col> can be used as a lookup Series.
products = products.join(prods, on='product_id').set_index('product_id', drop=False)
# Drop the helper objects so they do not keep references to priors alive.
del prods, _reordered_by_product

In [None]:
print('add order info to priors')
# Index orders by order_id while keeping the column, so it can be joined here
# and used as a lookup table later on.
orders = orders.set_index('order_id', drop=False)
# The duplicated order_id column coming from orders gets the '_' suffix and is discarded.
priors = priors.join(orders, on='order_id', rsuffix='_').drop('order_id_', axis=1)

In [None]:
### user features 1
print('computing user f')
# Per-user aggregates computed over every row of orders.
_orders_by_user = orders.groupby('user_id')
usr = pd.DataFrame({
    'average_days_between_orders': _orders_by_user['days_since_prior_order'].mean().astype(np.float32),
    'nb_orders': _orders_by_user.size().astype(np.int16),
})
del _orders_by_user

In [None]:
### user features 2
_priors_by_user = priors.groupby('user_id')
# Number of prior order lines for every (user, product) pair.
_lines_per_user_product = priors.groupby(['user_id', 'product_id']).size()

users = pd.DataFrame()
users['total_items'] = _priors_by_user.size().astype(np.int16)
users['all_products'] = _priors_by_user['product_id'].apply(set)
users['total_distinct_items'] = users['all_products'].map(len).astype(np.int16)
users['user_max_order_num'] = _priors_by_user['order_number'].max().astype(np.int16)
# Largest number of times the user bought any single product.
users['total_buy_max'] = _lines_per_user_product.groupby(level='user_id').max().astype(np.int16)
# Release the helpers so they do not keep references to priors alive.
del _priors_by_user, _lines_per_user_product

users = users.join(usr)
del usr
users['average_basket'] = (users['total_items'] / users['nb_orders']).astype(np.float32)
print('user f', users.shape)

In [None]:
print('compute userXproduct f - this is long...')
# Composite (user, product) key: user_id * 100000 + product_id.
# NOTE(review): user_id is read as int32, so user_id * 100000 appears to exceed the
# int32 range once user_id > 21474 and may wrap silently, which could make distinct
# (user, product) pairs share a key. Confirm the resulting dtype. features() builds
# its lookup key with the same formula, so both places would have to change together.
priors['user_product'] = priors.product_id + priors.user_id * 100000

In [None]:
# Aggregate the prior order lines per user_product key into a tuple:
#   (number of lines,
#    largest (order_number, order_id) seen,
#    smallest (order_number, order_id) seen,
#    sum of add_to_cart_order)
d = {}
for row in priors.itertuples():
    key = row.user_product
    stamp = (row.order_number, row.order_id)
    seen = d.get(key)
    if seen is None:
        d[key] = (1, stamp, stamp, row.add_to_cart_order)
    else:
        count, latest, earliest, pos_sum = seen
        d[key] = (count + 1,
                  max(latest, stamp),
                  min(earliest, stamp),
                  pos_sum + row.add_to_cart_order)

In [None]:
print('to dataframe (less memory)')
# One row per user_product key; the tuple elements become positional columns.
userXproduct = pd.DataFrame.from_dict(d, orient='index')
# The dict is no longer needed once the frame exists.
del d

In [None]:
# Name the tuple positions, then shrink each column to a compact dtype.
userXproduct.columns = ['nb_orders', 'last_order_id','first_order_number', 'sum_pos_in_cart']
userXproduct['nb_orders'] = userXproduct['nb_orders'].astype(np.int16)
# last_order_id holds (order_number, order_id) of the latest order: keep the order_id.
userXproduct['last_order_id'] = userXproduct['last_order_id'].map(lambda pair: pair[1]).astype(np.int32)
# first_order_number holds (order_number, order_id) of the earliest order: keep the order_number.
userXproduct['first_order_number'] = userXproduct['first_order_number'].map(lambda pair: pair[0]).astype(np.int16)
userXproduct['sum_pos_in_cart'] = userXproduct['sum_pos_in_cart'].astype(np.int16)
print('user X product f', len(userXproduct))

userXproduct.head()

In [None]:
# priors is not referenced again below; drop the name so the frame can be freed.
del priors

In [None]:
print('split orders : train, test')
test_orders = orders.loc[orders['eval_set'] == 'test']
train_orders = orders.loc[orders['eval_set'] == 'train']

# Index train by (order_id, product_id), keeping both columns, so membership of a
# pair can be tested with `(order_id, product_id) in train.index`.
train = train.set_index(['order_id', 'product_id'], drop=False)

In [None]:
def features(selected_orders, labels_given=False):
    """Build one feature row per (order, candidate product) pair.

    For every row of ``selected_orders`` the candidate products are all products
    in ``users.all_products`` for that order's user. Features are looked up from
    the module-level tables ``users``, ``orders``, ``products`` and
    ``userXproduct``; ``train`` is used for labels.

    Parameters
    ----------
    selected_orders : DataFrame
        Rows providing ``order_id`` and ``user_id`` (iterated with itertuples).
    labels_given : bool, default False
        When True, a label is produced for every candidate: 1 if
        ``(order_id, product_id)`` is in ``train.index``, else 0.

    Returns
    -------
    (df, labels) : tuple
        ``df`` keeps ``order_id`` and ``product_id`` followed by the feature
        columns; ``labels`` is an int8 array (empty when ``labels_given`` is False).
    """
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    for i, row in enumerate(selected_orders.itertuples(), start=1):
        if i%10000 == 0: print('order row',i)
        order_id = row.order_id
        user_products = users.all_products[row.user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in train.index for product in user_products]

    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list

    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    df['user_total_buy_max'] = df.user_id.map(users.total_buy_max).astype(np.int16)

    print('order related features')
    df['order_dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    # NaN ratios are replaced by 0.
    df['days_since_ratio'] = (df.days_since_prior_order / df.user_average_days_between_orders).map(lambda x: 0 if math.isnan(x) else x).astype(np.float32)

    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate).astype(np.float32)

    print('user_X_product related features')
    # Same composite key formula that indexes userXproduct.
    df['z'] = df.user_id * 100000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    df['UP_reorder_rate'] = ((df.UP_orders-1) / (df.user_total_orders-1).astype(np.float32))

    # These lookups feed several columns below; compute each once.
    last_order_number = df.UP_last_order_id.map(orders.order_number)
    first_order_number = df.z.map(userXproduct.first_order_number)
    orders_since_last = df.user_total_orders - last_order_number
    orders_since_first = df.user_total_orders - first_order_number

    df['UP_orders_since_last'] = orders_since_last
    # Hour and day-of-week distances are circular (wrap at 24 and 7).
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    df['UP_delta_dow_vs_last'] = abs(df.order_dow - df.UP_last_order_id.map(orders.order_dow)).map(lambda x: min(x, 7-x)).astype(np.int8)
    # np.float was removed in NumPy 1.24; np.float64 is the dtype it used to resolve to.
    df['UP_drop_chance'] = orders_since_last.astype(np.float64)
    df['UP_chance_vs_bought'] = orders_since_first.astype(np.float32)
    df['UP_chance'] = (df.UP_orders - 1)/orders_since_first.astype(np.float32)
    df['UP_chance_ratio'] = (1/orders_since_last - (df.UP_orders - 1)/orders_since_first).astype(np.float32)
    df.drop(['UP_last_order_id','z'], axis=1, inplace=True)
    #df.drop(['order_id','product_id'], axis=1)
    print(df.dtypes)
    print(df.memory_usage())
    return (df, labels)
    
    

In [None]:
# Build the candidate feature table and labels for the 'train' orders.
df_train, labels = features(train_orders, labels_given=True)
df_train.head()

### Implementing GBM

In [None]:
# Feature-only view of the training table: the two id columns are removed.
df_train_var = df_train.drop(columns=['order_id', 'product_id'])

In [None]:
# Hold out 20% of the candidate rows for evaluation (fixed seed).
_split = train_test_split(
    df_train_var,
    labels,
    test_size=0.2,
    random_state=42,
)
d_train, d_test, l_train, l_test = _split
del _split

In [None]:
# Display the feature columns that the GBM model is trained on.
d_train.columns

In [None]:
# Wrap the training split in a LightGBM Dataset; the two id-like product
# attributes are declared categorical.
_categorical_cols = ['aisle_id','department_id']
d_train_gbm = lgb.Dataset(
    d_train,
    label=l_train,
    categorical_feature=_categorical_cols,
)

In [None]:
# Initial, untuned LightGBM parameters for the first model.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'binary_logloss'},
    num_leaves=96,
    max_depth=10,
    feature_fraction=0.9,
    bagging_fraction=0.95,
    bagging_freq=5,
)
# Number of boosting rounds passed to lgb.train.
ROUNDS = 100

In [None]:
# Train the booster on the training split for ROUNDS boosting rounds.
lgbc = lgb.train(params=params, train_set=d_train_gbm, num_boost_round=ROUNDS)

In [None]:
# Bar chart of the trained booster's feature importances.
lgb.plot_importance(booster=lgbc)
plt.show()

In [None]:
#Predicting
# Score only the model's feature columns. This cell and the next ones append
# 'pred' and 'label' to d_test, so a re-run must not feed those back to the model.
_feature_cols = [c for c in d_test.columns if c not in ('pred', 'label')]
pred_gbm = lgbc.predict(d_test[_feature_cols])
d_test['pred'] = pred_gbm

In [None]:
# Turn predicted probabilities into 0/1 labels using a fixed cut-off.
thresold = 0.5
d_test['label'] = (d_test['pred'] > thresold).astype(int)

In [None]:
##Checking accuracy score
# scikit-learn metrics take (y_true, y_pred): ground truth first, prediction second.
acc = accuracy_score(l_test, d_test['label'])
print(acc)

In [None]:
##Checking f-score
# scikit-learn metrics take (y_true, y_pred): ground truth first, prediction second.
f1_gbm = f1_score(l_test, d_test['label'])
print(f1_gbm)

In [None]:
# Sweep the decision threshold and record the F1 score at each value.
# Note: d_test['label'] is overwritten on every iteration and ends up holding
# the labels for the last threshold tried.
thresold = np.arange(0.1,0.61,0.02)
f1_gbm=[]
max_f1=0
# Defined up front so later cells can read it even if no threshold reaches F1 > 0.
max_t = None
for t in thresold:
    d_test['label'] = np.where(d_test['pred'] > t, 1, 0)
    # (y_true, y_pred) order as expected by scikit-learn.
    f = f1_score(l_test, d_test['label'])
    f1_gbm.append(f)
    if f > max_f1:
        max_f1 = f
        max_t = t

In [None]:
# F1 score as a function of the decision threshold, plus the best pair found by the sweep.
plt.plot(thresold,f1_gbm)
print('max f1:',max_f1)
print('thresold:',max_t)

In [None]:
# Candidate hyper-parameter values for the grid search.
gridParams = dict(
    learning_rate=[0.001, 0.005, 0.01],
    n_estimators=[10, 40, 100],
    num_leaves=[6, 8, 12, 16],
    max_depth=[-1, 4, 10],
)

In [None]:
# df_train is intentionally kept (the `del` stays commented out): the Random Forest
# cells below still use it. Only a garbage-collection pass is run here.
# del df_train
gc.collect()

In [None]:
#Training GBM model for Grid Search
# bst = lgb.LGBMClassifier(boosting_type= 'gbdt')

In [None]:
#Creating the grid
# gridCV = GridSearchCV(bst, gridParams,
#                       verbose=0,
#                       cv=4,
#                       n_jobs=2)

In [None]:
# l_train

In [None]:
#Running the Grid
#gridCV.fit(d_train, l_train)

In [None]:
# print(gridCV.best_params_)
# print(gridCV.best_score_)

### Random Forest

In [None]:
# Fit a Random Forest on every column of df_train (the id columns
# order_id / product_id are part of df_train and therefore included).
rf = RandomForestClassifier(random_state=42)
rf.fit(df_train, labels)

# Rank the fitted features by importance, highest first.
feature_importances = pd.DataFrame(
    {'importance': rf.feature_importances_},
    index=df_train.columns,
).sort_values('importance', ascending=False)
print(feature_importances)



In [None]:
# Column positions of the n_features most important features, best first.
n_features = 10
rf_imp_features = np.argsort(rf.feature_importances_)[::-1][:n_features]

In [None]:
# Keep only the top-ranked columns of df_train (selected by position).
df_train_imp = df_train.iloc[:,rf_imp_features]
df_train_imp.head()

In [None]:
##Predicting
df_test,_ = features(test_orders)
df_test_var = df_test.drop(['order_id','product_id'], axis=1)

print('RF predict')
# rf was fitted with rf.fit(df_train, labels), i.e. on all df_train columns including
# order_id and product_id. Predicting on df_test_var (ids dropped) would give the model
# fewer features than it was fitted with, so the same columns are selected here.
pred = rf.predict(df_test[df_train.columns])



In [None]:
#Predicting probabilities
# Use exactly the columns rf was fitted on (all df_train columns, ids included);
# df_test_var lacks the id columns and would not match the fitted feature count.
pred_prob = rf.predict_proba(df_test[df_train.columns])

In [None]:
# Store the Random Forest class predictions next to the candidate rows.
df_test['predict'] = pred

In [None]:
# Preview the test candidates together with their predicted class.
df_test.head()

In [None]:
#Order - Product
# Map each order_id to a space-separated string of the product_ids predicted as 1.
d = dict()
for row in df_test.itertuples():
    if row.predict==1:
        # Explicit membership test instead of a bare `except:`, which would also
        # have hidden unrelated errors.
        if row.order_id in d:
            d[row.order_id] += ' ' + str(row.product_id)
        else:
            d[row.order_id] = str(row.product_id)

# Orders without any predicted product get the literal string 'None'.
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'
        
sub_rf0 = pd.DataFrame.from_dict(d, orient='index')

sub_rf0.reset_index(inplace=True)
sub_rf0.columns = ['order_id', 'products']

In [None]:
# Preview the submission table (one row per test order).
sub_rf0.head()

In [None]:
# Write only the order_id / products columns; without index=False the DataFrame's
# positional index would be written as an extra unnamed first column.
sub_rf0.to_csv('sub_rf0.csv', header=True, index=False)

In [None]:
### GBM restricted to the top Random Forest features
# Select the top-ranked columns by position and wrap them in a LightGBM Dataset.
df_train_top = df_train.iloc[:, rf_imp_features]
train_gbm = lgb.Dataset(data=df_train_top, label=labels)

In [None]:
##Creating validation dataset
# NOTE(review): 'test.svm' is not written anywhere in the visible notebook, and
# df_valid_gbm is not referenced again below. Confirm whether this cell is still needed.
df_valid_gbm = train_gbm.create_valid('test.svm')

In [None]:
# Initial, untuned parameters for the top-feature GBM; AUC is the reported metric.
param = {
    'num_leaves': 31,
    'num_trees': 100,
    'objective': 'binary',
    'metric': 'auc',
}

In [None]:
# Train the top-feature booster.
# NOTE(review): `param` also sets 'num_trees': 100. LightGBM presumably treats num_trees
# as an alias for the number of boosting rounds and lets it take precedence over
# num_round. Confirm which of the two values is intended.
num_round = 10
bst = lgb.train(param, train_gbm, num_round)

In [None]:
#top features for test dataset
# Positional selection: relies on df_test having its columns in the same order as df_train.
df_test_top = df_test.iloc[:,rf_imp_features]

In [None]:
#predicting for test
# NOTE(review): lgb.train above was called without a validation set or early stopping,
# so bst.best_iteration is presumably unset and all trained iterations are used. Confirm.
pred_gbm = bst.predict(df_test_top, num_iteration=bst.best_iteration)

In [None]:
# Display the predictions returned by bst.predict for the test candidates.
pred_gbm

In [None]:
# Store the top-feature GBM predictions next to the candidate rows.
df_test['pred'] = pred_gbm