### Objective : In this Notebook we will choose the best performing model out of various models and fine tune it

In [1]:
#import libraries here; add more as necessary.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import math
import warnings
warnings.filterwarnings("ignore")

#Models
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE , RFECV

from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from pandas import ExcelWriter
from openpyxl import load_workbook

## Load Data

In [2]:
X_train = pd.read_csv(r'C:\Users\psahu\Documents\My_projects\online_shoppers\data\Transformed_X_train_shoppersData.csv')
y_train = pd.read_csv(r'C:\Users\psahu\Documents\My_projects\online_shoppers\data\Transformed_y_train.csv')

In [3]:
X_train.shape , y_train.shape

((7398, 68), (7398, 1))

## Declare Important Functions and variables

In [68]:
lr  = LogisticRegression()
svc = SVC(random_state = 50)
knn = KNeighborsClassifier()
rfc = RandomForestClassifier(random_state = 50)
abc = AdaBoostClassifier(random_state=50) # AdaBoostClassifier
gbc = GradientBoostingClassifier(random_state=50) # GradientBoostingClassifier
xgb = XGBClassifier(random_state = 50)


clf_list = [lr, knn,rfc,abc , gbc, xgb]

# Multi Model Performance

In [17]:
print(X_train.shape)
y_train.shape

(7398, 68)


(7398, 1)

In [63]:
y_train.value_counts()

Revenue
0          6271
1          1127
dtype: int64

In [69]:
def fit_model( clf , param_grid, X=X_train, y=y_train):

    
    scorer = { 
        'accuracy'   : 'accuracy',
         'roc_auc'   : 'roc_auc',
            'f1'     : 'f1',
         'precision' : 'precision',
            'recall' : 'recall'    
                }
    
    myratio = {1: round( 1 *  y_train['Revenue'].value_counts()[0],0).astype(int) } 
    clf_model = Pipeline(
        [
      ('sampling', SMOTE(sampling_strategy= myratio )),
        ('clf', clf)
    ])

    # cv uses StratifiedKFold
    # scoring r2_score available as parameter
    start = time.time()
    grid = GridSearchCV(estimator=clf_model, param_grid= {}, scoring= scorer ,  cv=5 , refit ='recall')
    print("Training {} :".format(clf.__class__.__name__))
    grid.fit(X, y)
    end = time.time()
    time_taken = round((end-start)/60,2)
    #Recall_score = round(grid.best_score_,4)
    
    accuracy = grid.cv_results_['mean_test_accuracy']
    roc_auc = grid.cv_results_['mean_test_roc_auc']
    f1 = grid.cv_results_['mean_test_f1']
    precision = grid.cv_results_['mean_test_precision']
    recall = grid.cv_results_['mean_test_recall']


    print(clf.__class__.__name__)
    print("Time taken : {} mins".format(time_taken))
    print("Accuracy : {}".format(accuracy))
    print("*"*40)
    
    return accuracy, roc_auc, f1, precision, recall, time_taken

In [70]:
# Find best classification algorithm

clf_names = []
clf_accuracy = []
clf_roc_auc = []
clf_f1 = []
clf_precision = []
clf_recall = []
#clf_scores = []
#clf_best_ests = []
clf_time_taken = []
clf_dict = {}

for clf in clf_list:
    accuracy, roc_auc, f1, precision, recall, time_taken = fit_model(clf, {})
    clf_names.append(clf.__class__.__name__)
    clf_accuracy.append(accuracy)
    clf_roc_auc.append(roc_auc)
    clf_f1.append(f1)
    clf_precision.append(precision)
    clf_recall.append(recall)
#   clf_scores.append(best_score)
#   clf_best_ests.append(best_est)
    clf_time_taken.append(time_taken)

Training LogisticRegression :
LogisticRegression
Time taken : 0.04 mins
Accuracy : [0.84495724]
****************************************
Training KNeighborsClassifier :
KNeighborsClassifier
Time taken : 0.06 mins
Accuracy : [0.75182656]
****************************************
Training RandomForestClassifier :
RandomForestClassifier
Time taken : 0.17 mins
Accuracy : [0.88537525]
****************************************
Training AdaBoostClassifier :
AdaBoostClassifier
Time taken : 0.11 mins
Accuracy : [0.87294008]
****************************************
Training GradientBoostingClassifier :
GradientBoostingClassifier
Time taken : 0.29 mins
Accuracy : [0.88497094]
****************************************
Training XGBClassifier :
XGBClassifier
Time taken : 0.33 mins
Accuracy : [0.88470031]
****************************************


In [71]:
#Round 
clf_accuracy = list(np.round(clf_accuracy,5))
clf_roc_auc = list(np.round(clf_roc_auc,5))
clf_f1 = list(np.round(clf_f1,5))
clf_precision = list(np.round(clf_precision,5))
clf_recall = list(np.round(clf_recall,5))

In [73]:
# Create clf_df dataframe from clf_dict
clf_dict['Accuracy'] = clf_accuracy
clf_dict['Roc_Auc'] = clf_roc_auc
clf_dict['F1'] =   clf_f1
clf_dict['Precision'] = clf_precision
clf_dict['Recall'] = clf_recall
clf_dict['Time_Taken (mins)'] = clf_time_taken
#clf_dict['roc_auc'] = clf_scores
#clf_dict['best_est'] = clf_best_ests
clf_df = pd.DataFrame(clf_dict, index=clf_names)
clf_df

Unnamed: 0,Accuracy,Roc_Auc,F1,Precision,Recall,Time_Taken (mins)
LogisticRegression,[0.84496],[0.84062],[0.54098],[0.49387],[0.59988],0.04
KNeighborsClassifier,[0.75183],[0.76091],[0.43645],[0.3339],[0.63084],0.06
RandomForestClassifier,[0.88538],[0.90939],[0.62135],[0.62629],[0.61757],0.17
AdaBoostClassifier,[0.87294],[0.8951],[0.60417],[0.57571],[0.63619],0.11
GradientBoostingClassifier,[0.88497],[0.91549],[0.64911],[0.60766],[0.69744],0.29
XGBClassifier,[0.8847],[0.91611],[0.65105],[0.60467],[0.70544],0.33


# Get the best Hyper-Parameters using Grid Search

In [161]:
#!pip install pactools

In [79]:
from sklearn.metrics import  make_scorer
from imblearn.pipeline import Pipeline
from pactools.grid_search import GridSearchCVProgressBar

In [80]:
clf = XGBClassifier(random_state = 50)

In [81]:
def fit_classifier(clf, param_grid,  X=X_train, y=y_train): 
    
    scorer = {'recall' : 'recall' , 
              'accuracy' : 'accuracy'}
    
    myratio = {1: round( 1 *  y_train['Revenue'].value_counts()[0],0).astype(int) }
    
    clf_model = Pipeline(  [
        ('sampling', SMOTE(sampling_strategy= myratio )),
        ('clf', clf)
    ])

    start = time.time()
    grid = GridSearchCV(clf_model, param_grid=param_grid, cv = 5 ,scoring= scorer, verbose = 10 , refit ='recall', n_jobs=-1 )
    print("Training {} :".format(clf.__class__.__name__))
    grid.fit(X, y)
    end = time.time()
    time_taken = round((end-start)/3600,2)

    print(clf.__class__.__name__)
    print("Time taken : {} hours".format(time_taken))
    print("Best score : {}".format(round(grid.best_score_,4)))
    print("*"*40)
    
    return grid.best_score_, grid.best_estimator_, time_taken

In [83]:
param_grid = {
 "clf__learning_rate"    : [0.05, 0.1, 0.3  ] ,
 'clf__n_estimators'     : [100 , 200 , 300],
 "clf__max_depth"        : [4, 5 ,6, 7],
 "clf__min_child_weight" : [1, 2 , 3 ],
 "clf__gamma"            : [0.0 ,0.1,0.2],
 "clf__colsample_bytree" : [0.5 , 0.7, 1.0]
}

In [84]:
xgb_best_score, xgb_best_est, time_taken = fit_classifier(XGBClassifier(random_state = 50), param_grid , X_train, y_train)
xgb_best_est

Training XGBClassifier :
Fitting 5 folds for each of 972 candidates, totalling 4860 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  9

XGBClassifier
Time taken : 5.59 hours
Best score : 0.7436
****************************************


Pipeline(steps=[('sampling', SMOTE(sampling_strategy={1: 6271})),
                ('clf',
                 XGBClassifier(colsample_bytree=0.7, gamma=0.0,
                               learning_rate=0.05, max_depth=4,
                               min_child_weight=3, random_state=50))])

In [85]:
print("Best: %f using %s in %f hours" % (xgb_best_score, xgb_best_est , time_taken))

Best: 0.743591 using Pipeline(steps=[('sampling', SMOTE(sampling_strategy={1: 6271})),
                ('clf',
                 XGBClassifier(colsample_bytree=0.7, gamma=0.0,
                               learning_rate=0.05, max_depth=4,
                               min_child_weight=3, random_state=50))]) in 5.590000 hours
