In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [6]:
# Importing data

order_total = pd.read_csv('order_final')

In [7]:
#order_total.head()
order_total = order_total.drop(columns = ['product_name','aisle','department','order_number','order_id'])

In [8]:
target = order_total['reordered']
order_total = order_total.drop(columns = ['reordered'])

In [9]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
def Labelencoder_feature(x):
    le=LabelEncoder()
    x=le.fit_transform(x)
    return x

In [10]:
order_total=order_total.apply(Labelencoder_feature)

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import validation_curve
from sklearn.metrics import confusion_matrix

In [13]:
X_train, X_test, y_train, y_test = train_test_split(order_total, target, test_size=0.2, random_state=42)
scaler_x = MinMaxScaler((-1,1))
X_train = scaler_x.fit_transform(X_train)
X_test = scaler_x.transform(X_test)

In [14]:
#Linear Regression

C_param_range = [0.001,0.01,0.1,1,10,100]

table = pd.DataFrame(columns = ['C_parameter','Accuracy'])
table['C_parameter'] = C_param_range

j = 0
for i in C_param_range:
    
    # Apply logistic regression model to training data
    Logreg = LogisticRegression(penalty = 'l2', C = i,random_state = 42)
    Logreg.fit(X_train,y_train)
    
    # Predict using model
    y_pred_lr = Logreg.predict(X_test)
    
    # Saving accuracy score in table
    table.iloc[j,1] = accuracy_score(y_test,y_pred_lr)
    j += 1
    
table   

Unnamed: 0,C_parameter,Accuracy
0,0.001,0.674274
1,0.01,0.674176
2,0.1,0.674181
3,1.0,0.674179
4,10.0,0.674179
5,100.0,0.674179


In [15]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(bootstrap=True,n_estimators=100,criterion='entropy')
rf.fit(X_train, y_train)


y_predict_rf = rf.predict(X_test)


cnf_matrix = confusion_matrix(y_test, y_predict_rf)
print(cnf_matrix)
Accuracy_rf=rf.score(X_test,y_test)
print(Accuracy_rf)

[[ 190314  384901]
 [ 135738 1047511]]
0.7039239927573154


In [16]:
cv_scores_test= cross_val_score(rf,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(rf,X_train,y_train,cv=5,scoring='roc_auc')

print(cv_scores_test)

cv_scores_rf_test= cv_scores_test.mean()
cv_scores_rf_train= cv_scores_train.mean()
cv_scores_std_rf= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_rf_test))
print ('Mean cross validation train score: ' +str(cv_scores_rf_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_rf))

[0.68222615 0.68097095 0.6834969  0.68139081 0.68253596]
Mean cross validation test score: 0.6821241551915708
Mean cross validation train score: 0.7027360521453098
Standard deviation in cv scores: 0.0008868012642297641


In [17]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
gbc = GradientBoostingClassifier(subsample=0.8, learning_rate=0.05 , n_estimators=100, random_state=5, max_depth=9, max_leaf_nodes=100)
gbc.fit(X_train, y_train)

#Predict using the model:

y_predict_gbc = gbc.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_gbc)
print(cnf_matrix)
Accuracy_gbc=gbc.score(X_test,y_test)
print(Accuracy_gbc)

[[ 128010  447205]
 [  82704 1100545]]
0.6986523465933906


In [18]:
cv_scores_test= cross_val_score(gbc,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(gbc,X_train,y_train,cv=5,scoring='roc_auc')

print(cv_scores_test)

cv_scores_gbc_test= cv_scores_test.mean()
cv_scores_gbc_train= cv_scores_train.mean()
cv_scores_std_gbc= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_gbc_test))
print ('Mean cross validation train score: ' +str(cv_scores_gbc_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_gbc))

[0.69258306 0.69057517 0.69410721 0.69274026 0.69474203]
Mean cross validation test score: 0.6929495463454713
Mean cross validation train score: 0.6929003184251197
Standard deviation in cv scores: 0.0014400681334626522


In [19]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)


y_predict_lgbm = lgbm.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_predict_lgbm)
print(cnf_matrix)
Accuracy_lgbm=lgbm.score(X_test,y_test)
print(Accuracy_lgbm)

[[ 134675  440540]
 [  90401 1092848]]
0.6980654707744941


In [20]:
cv_scores_test= cross_val_score(lgbm,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(lgbm,X_train,y_train,cv=5,scoring='roc_auc')

print(cv_scores_test)

cv_scores_lgbm_test= cv_scores_test.mean()
cv_scores_lgbm_train= cv_scores_train.mean()
cv_scores_std_lgbm= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_lgbm_test))
print ('Mean cross validation train score: ' +str(cv_scores_lgbm_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_lgbm))

[0.69174668 0.68962431 0.69289536 0.6912938  0.69349087]
Mean cross validation test score: 0.6918102031992472
Mean cross validation train score: 0.6920108599188091
Standard deviation in cv scores: 0.0013454402382618263


In [21]:
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [22]:
#grid search  hyperparameter tuning
parameters = {
#     'task' : ['predict'],
#     'boosting': ['gbdt' ],
#     'objective': ['root_mean_squared_error'],
     'num_iterations': [  1500, 2000,5000  ],
     'learning_rate':[  0.05, 0.005 ],
#    'num_leaves':[ 7, 15, 31  ],
    'max_depth' :[ 5,10],
    'min_data_in_leaf':[15,25 ],
#   'feature_fraction': [ 0.6, 0.8,  0.9],
#     'bagging_fraction': [  0.6, 0.8 ],
#     'bagging_freq': [   100, 200, 400  ],
     
 }

gsearch_lgb = GridSearchCV(lgbm, param_grid = parameters, n_jobs=6, verbose=10)
model = gsearch_lgb.fit(X_train,y_train)
 

print("The best parameters are: /n",  gsearch_lgb.best_params_)

# Store the model for prediction (chapter 5)
model = gsearch_lgb.best_estimator_
print(gsearch_lgb.best_score_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
The best parameters are: /n {'learning_rate': 0.05, 'max_depth': 10, 'min_data_in_leaf': 25, 'num_iterations': 5000}
0.7086032777950576


In [26]:

lgbm_best = LGBMClassifier(learning_rate = 0.05, max_depth = 10, min_data_in_leaf = 25, num_iterations = 5000)
lgbm_best.fit(X_train, y_train)


y_predict_lgbm = lgbm_best.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_predict_lgbm)
print(cnf_matrix)
Accuracy_lgbm=lgbm_best.score(X_test,y_test)
print(Accuracy_lgbm)

[[ 163478  411737]
 [  99697 1083552]]
0.7091586748434998


In [27]:
cv_scores_test= cross_val_score(lgbm_best,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(lgbm_best,X_train,y_train,cv=5,scoring='roc_auc')

print(cv_scores_test)

cv_scores_lgbm_test= cv_scores_test.mean()
cv_scores_lgbm_train= cv_scores_train.mean()
cv_scores_std_lgbm= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_lgbm_test))
print ('Mean cross validation train score: ' +str(cv_scores_lgbm_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_lgbm))

[0.70798909 0.70577142 0.70865416 0.70788279 0.70898313]
Mean cross validation test score: 0.7078561191057784
Mean cross validation train score: 0.715335696810019
Standard deviation in cv scores: 0.001119944493690363


In [19]:
#paramGrid = {"max_depth":[5,10],
 #           "colsample_bytree":[0.3,0.4]}  


#lgbm = LGBMClassifier(objective='binary', num_boost_round=10)

#gs = GridSearchCV(lgbm, paramGrid, cv=3, verbose=2, n_jobs=1)


#model = gs.fit(X_train, y_train)

# Print the best parameters
#print("The best parameters are: /n",  gs.best_params_)

# Store the model for prediction (chapter 5)
#model = gs.best_estimator_

In [28]:
m_Labels = ['LGBM','Random Forest', 'Gradient Boosting']
scores_test = [cv_scores_lgbm_test, cv_scores_rf_test, cv_scores_gbc_test]
scores_train = [cv_scores_lgbm_train, cv_scores_rf_train, cv_scores_gbc_test]
accuracies = [Accuracy_lgbm, Accuracy_rf, Accuracy_gbc]

score_tab_acc = pd.DataFrame(list(zip(m_Labels, accuracies)), 
               columns =['Algorithm', 'Model accuracy score']) 

score_tab = pd.DataFrame(list(zip(m_Labels, scores_train, scores_test)), 
               columns =['Algorithm', 'ROC-AUC train score', 'ROC-AUC test score' ]) 

print(score_tab_acc)
print("\n")
print(score_tab)

           Algorithm  Model accuracy score
0               LGBM              0.709159
1      Random Forest              0.703924
2  Gradient Boosting              0.698652


           Algorithm  ROC-AUC train score  ROC-AUC test score
0               LGBM             0.715336            0.707856
1      Random Forest             0.702736            0.682124
2  Gradient Boosting             0.692950            0.692950


In [29]:
lgbm_importances = list(lgbm_best.feature_importances_)
imp=np.sort(lgbm_importances)
table=pd.DataFrame(list(zip(order_total,imp)),columns =['Features', 'Importance scores']) 
print(table)

                 Features  Importance scores
0              Unnamed: 0                  0
1                 user_id               8815
2                eval_set              12036
3               order_dow              12601
4       order_hour_of_day              14472
5  days_since_prior_order              15528
6              product_id              18244
7       add_to_cart_order              19833
8                aisle_id              22745
9           department_id              25726


In [30]:
sum_lgbm = sum(lgbm_importances)
percentages = lgbm_importances / sum_lgbm
table['Importance scores'] = table['Importance scores'] / sum_lgbm
print(table)

                 Features  Importance scores
0              Unnamed: 0           0.000000
1                 user_id           0.058767
2                eval_set           0.080240
3               order_dow           0.084007
4       order_hour_of_day           0.096480
5  days_since_prior_order           0.103520
6              product_id           0.121627
7       add_to_cart_order           0.132220
8                aisle_id           0.151633
9           department_id           0.171507


In [31]:
importances = list(gbc.feature_importances_)
imp=np.sort(importances)
table=pd.DataFrame(list(zip(order_total,imp)),columns =['Features', 'Importance scores']) 
print(table)

                 Features  Importance scores
0              Unnamed: 0           0.000000
1                 user_id           0.001423
2                eval_set           0.005301
3               order_dow           0.007436
4       order_hour_of_day           0.011417
5  days_since_prior_order           0.059939
6              product_id           0.152800
7       add_to_cart_order           0.158253
8                aisle_id           0.258121
9           department_id           0.345309


In [32]:
importances = list(rf.feature_importances_)
imp=np.sort(importances)
table=pd.DataFrame(list(zip(order_total,imp)),columns =['Features', 'Importance scores']) 
print(table)

                 Features  Importance scores
0              Unnamed: 0           0.000000
1                 user_id           0.032358
2                eval_set           0.047516
3               order_dow           0.069548
4       order_hour_of_day           0.077645
5  days_since_prior_order           0.082274
6              product_id           0.107432
7       add_to_cart_order           0.172446
8                aisle_id           0.201412
9           department_id           0.209367


In [33]:

from sklearn.metrics import f1_score
print("LGBM score: ", f1_score(y_test,y_predict_lgbm))
print("GBC score: ", f1_score(y_test,y_predict_gbc))
print("Rf score: ", f1_score(y_test,y_predict_rf))

LGBM score:  0.8090622570969686
GBC score:  0.8059651431582363
Rf score:  0.8009531816240713
