In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Importing data

order_total = pd.read_csv('order_final')

In [3]:
#order_total.head()
order_total = order_total.drop(columns = ['product_name','aisle','department'])

In [5]:
target = order_total['reordered']
order_total = order_total.drop(columns = ['reordered'])

In [6]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
def Labelencoder_feature(x):
    le=LabelEncoder()
    x=le.fit_transform(x)
    return x

In [7]:
order_total=order_total.apply(Labelencoder_feature)

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import validation_curve
from sklearn.metrics import confusion_matrix

In [10]:
X_train, X_test, y_train, y_test = train_test_split(order_total, target, test_size=0.2, random_state=42)
scaler_x = MinMaxScaler((-1,1))
X_train = scaler_x.fit_transform(X_train)
X_test = scaler_x.transform(X_test)

In [11]:
#Linear Regression

C_param_range = [0.001,0.01,0.1,1,10,100]

table = pd.DataFrame(columns = ['C_parameter','Accuracy'])
table['C_parameter'] = C_param_range

j = 0
for i in C_param_range:
    
    # Apply logistic regression model to training data
    Logreg = LogisticRegression(penalty = 'l2', C = i,random_state = 42)
    Logreg.fit(X_train,y_train)
    
    # Predict using model
    y_pred_lr = Logreg.predict(X_test)
    
    # Saving accuracy score in table
    table.iloc[j,1] = accuracy_score(y_test,y_pred_lr)
    j += 1
    
table   

Unnamed: 0,C_parameter,Accuracy
0,0.001,0.705246
1,0.01,0.705353
2,0.1,0.705378
3,1.0,0.705382
4,10.0,0.705383
5,100.0,0.705382


In [12]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(bootstrap=True,n_estimators=100,criterion='entropy')
rf.fit(X_train, y_train)


y_predict_rf = rf.predict(X_test)


cnf_matrix = confusion_matrix(y_test, y_predict_rf)
print(cnf_matrix)
Accuracy_rf=rf.score(X_test,y_test)
print(Accuracy_rf)

[[ 266568  308647]
 [  99457 1083792]]
0.7679201848886301


In [13]:
cv_scores_test= cross_val_score(rf,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(rf,X_train,y_train,cv=5,scoring='roc_auc')

print(cv_scores_test)

cv_scores_rf_test= cv_scores_test.mean()
cv_scores_rf_train= cv_scores_train.mean()
cv_scores_std_rf= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_rf_test))
print ('Mean cross validation train score: ' +str(cv_scores_rf_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_rf))

[0.77598613 0.77688362 0.77851716 0.77660452 0.7786397 ]
Mean cross validation test score: 0.7773262241401726
Mean cross validation train score: 0.795390733152289
Standard deviation in cv scores: 0.0010635947514436753


In [14]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
gbc = GradientBoostingClassifier(subsample=0.8, learning_rate=0.05 , n_estimators=100, random_state=5, max_depth=9, max_leaf_nodes=100)
gbc.fit(X_train, y_train)

#Predict using the model:

y_predict_gbc = gbc.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_gbc)
print(cnf_matrix)
Accuracy_gbc=gbc.score(X_test,y_test)
print(Accuracy_gbc)

[[ 234940  340275]
 [  89260 1093989]]
0.7557328441185034


In [15]:
cv_scores_test= cross_val_score(gbc,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(gbc,X_train,y_train,cv=5,scoring='roc_auc')

print(cv_scores_test)

cv_scores_gbc_test= cv_scores_test.mean()
cv_scores_gbc_train= cv_scores_train.mean()
cv_scores_std_gbc= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_gbc_test))
print ('Mean cross validation train score: ' +str(cv_scores_gbc_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_gbc))

[0.77696562 0.7771163  0.7785517  0.77759421 0.78005489]
Mean cross validation test score: 0.7780565456403374
Mean cross validation train score: 0.7782898192799717
Standard deviation in cv scores: 0.0011424966277863398


In [16]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)


y_predict_lgbm = lgbm.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_predict_lgbm)
print(cnf_matrix)
Accuracy_lgbm=lgbm.score(X_test,y_test)
print(Accuracy_lgbm)

[[ 238716  336499]
 [  92103 1091146]]
0.7562634208036104


In [17]:
cv_scores_test= cross_val_score(lgbm,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(lgbm,X_train,y_train,cv=5,scoring='roc_auc')

print(cv_scores_test)

cv_scores_lgbm_test= cv_scores_test.mean()
cv_scores_lgbm_train= cv_scores_train.mean()
cv_scores_std_lgbm= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_lgbm_test))
print ('Mean cross validation train score: ' +str(cv_scores_lgbm_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_lgbm))

[0.77758338 0.77761489 0.77880001 0.77818276 0.78038659]
Mean cross validation test score: 0.7785135252646231
Mean cross validation train score: 0.7790891940491722
Standard deviation in cv scores: 0.0010365859088685163


In [18]:
#grid search  hyperparameter tuning
parameters = {
#     'task' : ['predict'],
#     'boosting': ['gbdt' ],
#     'objective': ['root_mean_squared_error'],
     'num_iterations': [  1500, 2000,5000  ],
     'learning_rate':[  0.05, 0.005 ],
#    'num_leaves':[ 7, 15, 31  ],
    'max_depth' :[ 5,10],
    'min_data_in_leaf':[15,25 ],
#   'feature_fraction': [ 0.6, 0.8,  0.9],
#     'bagging_fraction': [  0.6, 0.8 ],
#     'bagging_freq': [   100, 200, 400  ],
     
 }

gsearch_lgb = GridSearchCV(lgbm, param_grid = parameters, n_jobs=6, verbose=10)
model = gsearch_lgb.fit(X_train,y_train)
 

print("The best parameters are: /n",  gsearch_lgb.best_params_)

# Store the model for prediction (chapter 5)
model = gsearch_lgb.best_estimator_
print(gsearch_lgb.best_score_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




The best parameters are: /n {'learning_rate': 0.05, 'max_depth': 5, 'min_data_in_leaf': 15, 'num_iterations': 1500}
0.7516102789190134


In [20]:

lgbm_best = LGBMClassifier(learning_rate = 0.05, max_depth = 5, min_data_in_leaf = 15, num_iterations = 1500)
lgbm_best.fit(X_train, y_train)


y_predict_lgbm = lgbm_best.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_predict_lgbm)
print(cnf_matrix)
Accuracy_lgbm=lgbm_best.score(X_test,y_test)
print(Accuracy_lgbm)



[[ 248189  327026]
 [  95372 1087877]]
0.7597914998544184


In [21]:
cv_scores_test= cross_val_score(lgbm_best,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(lgbm_best,X_train,y_train,cv=5,scoring='roc_auc')

print(cv_scores_test)

cv_scores_lgbm_test= cv_scores_test.mean()
cv_scores_lgbm_train= cv_scores_train.mean()
cv_scores_std_lgbm= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_lgbm_test))
print ('Mean cross validation train score: ' +str(cv_scores_lgbm_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_lgbm))







































[0.78523854 0.78502068 0.78636645 0.7861716  0.78784202]
Mean cross validation test score: 0.7861278613373788
Mean cross validation train score: 0.7883425896100268
Standard deviation in cv scores: 0.001001392226219488


In [19]:
#paramGrid = {"max_depth":[5,10],
 #           "colsample_bytree":[0.3,0.4]}  


#lgbm = LGBMClassifier(objective='binary', num_boost_round=10)

#gs = GridSearchCV(lgbm, paramGrid, cv=3, verbose=2, n_jobs=1)


#model = gs.fit(X_train, y_train)

# Print the best parameters
#print("The best parameters are: /n",  gs.best_params_)

# Store the model for prediction (chapter 5)
#model = gs.best_estimator_

In [22]:
m_Labels = ['LGBM','Random Forest', 'Gradient Boosting']
scores_test = [cv_scores_lgbm_test, cv_scores_rf_test, cv_scores_gbc_test]
scores_train = [cv_scores_lgbm_train, cv_scores_rf_train, cv_scores_gbc_test]
accuracies = [Accuracy_lgbm, Accuracy_rf, Accuracy_gbc]

score_tab_acc = pd.DataFrame(list(zip(m_Labels, accuracies)), 
               columns =['Algorithm', 'Model accuracy score']) 

score_tab = pd.DataFrame(list(zip(m_Labels, scores_train, scores_test)), 
               columns =['Algorithm', 'ROC-AUC train score', 'ROC-AUC test score' ]) 

print(score_tab_acc)
print("\n")
print(score_tab)

           Algorithm  Model accuracy score
0               LGBM              0.759791
1      Random Forest              0.767920
2  Gradient Boosting              0.755733


           Algorithm  ROC-AUC train score  ROC-AUC test score
0               LGBM             0.788343            0.786128
1      Random Forest             0.795391            0.777326
2  Gradient Boosting             0.778057            0.778057


In [30]:
lgbm_importances = list(lgbm_best.feature_importances_)
imp=np.sort(importances)
table=pd.DataFrame(list(zip(order_total,imp)),columns =['Features', 'Importance scores']) 
print(table)

                  Features  Importance scores
0               Unnamed: 0                  0
1                 order_id               1072
2                  user_id               1944
3                 eval_set               2279
4             order_number               2986
5                order_dow               3400
6        order_hour_of_day               3435
7   days_since_prior_order               3588
8               product_id               3700
9        add_to_cart_order               4584
10                aisle_id               8419
11           department_id               8499


In [36]:
sum_lgbm = sum(lgbm_importances)
percentages = lgbm_importances / sum_lgbm
table['Importance scores'] = table['Importance scores'] / sum_lgbm
print(table)

                  Features  Importance scores
0               Unnamed: 0           0.000000
1                 order_id           0.024416
2                  user_id           0.044276
3                 eval_set           0.051906
4             order_number           0.068009
5                order_dow           0.077438
6        order_hour_of_day           0.078235
7   days_since_prior_order           0.081720
8               product_id           0.084271
9        add_to_cart_order           0.104405
10                aisle_id           0.191751
11           department_id           0.193573


In [27]:
importances = list(gbc.feature_importances_)
imp=np.sort(importances)
table=pd.DataFrame(list(zip(order_total,imp)),columns =['Features', 'Importance scores']) 
print(table)

                  Features  Importance scores
0               Unnamed: 0           0.000000
1                 order_id           0.000079
2                  user_id           0.000133
3                 eval_set           0.000642
4             order_number           0.000652
5                order_dow           0.001827
6        order_hour_of_day           0.019695
7   days_since_prior_order           0.031294
8               product_id           0.067394
9        add_to_cart_order           0.078385
10                aisle_id           0.141282
11           department_id           0.658618


In [28]:
importances = list(rf.feature_importances_)
imp=np.sort(importances)
table=pd.DataFrame(list(zip(order_total,imp)),columns =['Features', 'Importance scores']) 
print(table)

                  Features  Importance scores
0               Unnamed: 0           0.000000
1                 order_id           0.036849
2                  user_id           0.047058
3                 eval_set           0.049744
4             order_number           0.065573
5                order_dow           0.067067
6        order_hour_of_day           0.072756
7   days_since_prior_order           0.113658
8               product_id           0.116192
9        add_to_cart_order           0.120472
10                aisle_id           0.125124
11           department_id           0.185507


In [26]:

from sklearn.metrics import f1_score
print("LGBM score: ", f1_score(y_test,y_predict_lgbm))
print("GBC score: ", f1_score(y_test,y_predict_gbc))
print("Rf score: ", f1_score(y_test,y_predict_rf))

LGBM score:  0.8374236765208501
GBC score:  0.8358995733736565
Rf score:  0.8415553436596357
