In [64]:
#Importing all necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json
import math
import os
import random

import seaborn as sns
from collections import defaultdict
import datetime

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier


% matplotlib inline

In [65]:
#Cleaned & feature engineered master combined dataset for modeling later with categorical values and OHE
master_df = pd.read_csv("4_clean_master.csv")

In [66]:
master_df.head()

Unnamed: 0,id_customer,id_offer,num_times_received,num_times_viewed,num_times_completed,offer_successful,reward,duration,difficulty,email,...,income,membership_days,membership_year,gender_F,gender_M,gender_O,Baby_Boomer,Gen-X,Millenial,Gen-Z
0,7997,7.0,1,1,1,1,5,7,5,1,...,100000.0,1314,2017,1,0,0,1,0,0,0
1,15044,3.0,1,1,0,0,2,7,10,1,...,70000.0,962,2018,0,1,0,1,0,0,0
2,3729,9.0,2,2,2,1,5,5,5,1,...,53000.0,1038,2018,0,1,0,1,0,0,0
3,3060,4.0,1,0,0,0,0,4,0,1,...,51000.0,1128,2017,0,1,0,1,0,0,0
4,11411,1.0,1,1,0,0,5,10,20,1,...,57000.0,1189,2017,1,0,0,1,0,0,0


In [67]:
master_df.shape

(55222, 26)

In [68]:
master_df.columns

Index([u'id_customer', u'id_offer', u'num_times_received', u'num_times_viewed',
       u'num_times_completed', u'offer_successful', u'reward', u'duration',
       u'difficulty', u'email', u'mobile', u'social', u'web', u'offer_bogo',
       u'offer_discount', u'offer_informational', u'income',
       u'membership_days', u'membership_year', u'gender_F', u'gender_M',
       u'gender_O', u'Baby_Boomer', u'Gen-X', u'Millenial', u'Gen-Z'],
      dtype='object')

## Final Clean up for Modeling

In [69]:
master_df_backup = master_df.copy()

In [70]:
# Drop all informational rows.
master_df.drop(master_df[master_df.offer_informational == 1].index, inplace=True)

# Drop all rows where id_offer is NaN.
master_df.drop(master_df[master_df.id_offer.isnull()].index, inplace=True)

# Drop the columns not needed for the model.
master_df.drop(columns=['id_customer', 'num_times_received',
                       'num_times_viewed', 'num_times_completed',
                       'email','mobile','social','web', 'offer_informational', 'membership_year'],
                        inplace=True, axis=1)

# Remove duplicate rows, if any exist.
master_df.drop_duplicates(inplace=True)

In [71]:
master_df.head()

Unnamed: 0,id_offer,offer_successful,reward,duration,difficulty,offer_bogo,offer_discount,income,membership_days,gender_F,gender_M,gender_O,Baby_Boomer,Gen-X,Millenial,Gen-Z
0,7.0,1,5,7,5,1,0,100000.0,1314,1,0,0,1,0,0,0
1,3.0,0,2,7,10,0,1,70000.0,962,0,1,0,1,0,0,0
2,9.0,1,5,5,5,1,0,53000.0,1038,0,1,0,1,0,0,0
4,1.0,0,5,10,20,0,1,57000.0,1189,1,0,0,1,0,0,0
5,1.0,0,5,10,20,0,1,71000.0,1767,1,0,0,1,0,0,0


In [72]:
master_df.shape

(43999, 16)

In [73]:
#Let's check and final benchmark the OFR (Offer Success Rate)
TSR=float(master_df.offer_successful.sum()*1.00 / master_df.offer_successful.count()*1.00)
print("Total Offer Success Rate = ", TSR)

('Total Offer Success Rate = ', 0.6323098252233005)


In [74]:
#Now we have to do feature normalization using SKLEARN minmaxscaler
# 'reward', 'difficulty', 'duration', and 'income'
features_to_scale = ['reward', 'difficulty', 'duration', 'income']

for col in features_to_scale:
    scaler = MinMaxScaler()
    master_df[col] = scaler.fit_transform(master_df.loc[:, [col]])

In [75]:
master_df.head()

Unnamed: 0,id_offer,offer_successful,reward,duration,difficulty,offer_bogo,offer_discount,income,membership_days,gender_F,gender_M,gender_O,Baby_Boomer,Gen-X,Millenial,Gen-Z
0,7.0,1,0.375,0.4,0.0,1,0,0.777778,1314,1,0,0,1,0,0,0
1,3.0,0,0.0,0.4,0.333333,0,1,0.444444,962,0,1,0,1,0,0,0
2,9.0,1,0.375,0.0,0.0,1,0,0.255556,1038,0,1,0,1,0,0,0
4,1.0,0,0.375,1.0,1.0,0,1,0.3,1189,1,0,0,1,0,0,0
5,1.0,0,0.375,1.0,1.0,0,1,0.455556,1767,1,0,0,1,0,0,0


In [76]:
model_df = master_df.copy()

In [77]:
#Let's save this final model df to a csv

if not os.path.exists("model_data"):
    os.mkdir("model_data")

model_df.to_csv(os.path.join("model_data", "model.csv"), index=False)


### Now's its time for Modeling

In [78]:
#read the model csv and load to pandas df
df_model = pd.read_csv(os.path.join("model_data", "model.csv"))

In [79]:
df_model.head()

Unnamed: 0,id_offer,offer_successful,reward,duration,difficulty,offer_bogo,offer_discount,income,membership_days,gender_F,gender_M,gender_O,Baby_Boomer,Gen-X,Millenial,Gen-Z
0,7.0,1,0.375,0.4,0.0,1,0,0.777778,1314,1,0,0,1,0,0,0
1,3.0,0,0.0,0.4,0.333333,0,1,0.444444,962,0,1,0,1,0,0,0
2,9.0,1,0.375,0.0,0.0,1,0,0.255556,1038,0,1,0,1,0,0,0
3,1.0,0,0.375,1.0,1.0,0,1,0.3,1189,1,0,0,1,0,0,0
4,1.0,0,0.375,1.0,1.0,0,1,0.455556,1767,1,0,0,1,0,0,0


In [80]:
df_model.shape

(43999, 16)

### Dataset Split into Train, Test, Valiation using Sklearn SSS

In [81]:
#Y_LABEL is "offer_successful" - Need to remove that from feature dataset (X Labels)
y_class_df = df_model['offer_successful']

In [82]:
y_class_df.shape

(43999,)

In [83]:
X_features_df = df_model.drop('offer_successful', axis=1)

In [84]:
X_features_df.shape

(43999, 15)

In [85]:
X = X_features_df.to_numpy()

In [86]:
y = y_class_df.to_numpy()

In [87]:
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

In [88]:
sss.get_n_splits(X, y)

5

In [89]:
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

('TRAIN:', array([15757, 23200, 42672, ..., 18492, 31724, 16045]), 'TEST:', array([37517, 18063, 25562, ..., 16297,  6046, 30484]))
('TRAIN:', array([ 2469, 40422, 13652, ..., 19095, 30243, 43770]), 'TEST:', array([39205, 39680, 12886, ...,  5481, 18904, 14282]))
('TRAIN:', array([30422, 15282,  6838, ..., 29157, 42694, 28504]), 'TEST:', array([33002, 36202, 40517, ..., 21554, 32431, 28485]))
('TRAIN:', array([12107,   210, 35075, ..., 20255, 35085, 37337]), 'TEST:', array([ 3832,  3769, 16396, ..., 20065, 31974, 41435]))
('TRAIN:', array([32050,  6915, 15039, ..., 30742, 36705, 19524]), 'TEST:', array([13732, 24984, 28889, ..., 42276, 19761, 11719]))


In [90]:
print("Shapes of X and y - Train and Test")
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Shapes of X and y - Train and Test
((35199, 15), (35199,), (8800, 15), (8800,))


In [91]:
sss2 = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=0)

In [92]:
sss2.get_n_splits(X_train, y_train)

5

In [93]:
for train_index, val_index in sss.split(X_train, y_train):
    print("TRAIN:", train_index, "VAL:", val_index)
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

('TRAIN:', array([12095, 28792, 18283, ...,  6328, 18998,  6010]), 'VAL:', array([26053,  4839, 15356, ..., 34377, 10516, 19950]))
('TRAIN:', array([34021, 10497,   743, ..., 33276, 10682,  6018]), 'VAL:', array([12540, 11626,  1336, ...,  2821, 12608,  5210]))
('TRAIN:', array([ 2403, 12895,  3894, ..., 21363, 14496,  6832]), 'VAL:', array([ 4358, 19833, 20380, ..., 24728,  3667, 23511]))
('TRAIN:', array([ 4247, 24541,  2260, ..., 28552, 29522, 19564]), 'VAL:', array([22263, 16051, 33155, ..., 28381,  8945, 26563]))
('TRAIN:', array([14129, 27468,  5658, ...,  4596, 27874, 10909]), 'VAL:', array([12201,  9518, 17151, ..., 23357, 23043, 12452]))


In [94]:
print("Shapes of X and y - Train and Validation")
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

Shapes of X and y - Train and Validation
((28159, 15), (28159,), (7040, 15), (7040,))


In [95]:
# convert features/labels to numpy float32
X_train = X_train.astype('float32')
y_train = y_train.astype('float32')

X_test = X_test.astype('float32')
y_test = y_test.astype('float32')

X_val = X_val.astype('float32')
y_val = y_val.astype('float32')

### Metric Evaluator

In [161]:
def get_all_metrics(model_name, x_features, y_label):
    y_predictions = model_name.predict(x_features)
    tn, fp, fn, tp = confusion_matrix(y_label, y_predictions).ravel()
    
    #Precision = TruePositives / (TruePositives + FalsePositives)
    precision = float(tp*1.00/(tp+fp)*1.00)
    
    #Recall = TruePositives / (TruePositives + FalseNegatives)
    recall = float(tp*1.00/(tp+fn)*1.00)
    
    #Accuracy Score
    accuracy = accuracy_score(y_label, y_predictions)
    return precision, recall, accuracy   

def f_score(precision, recall, beta):
    f_score = float((1 + beta**2) * (precision * recall) * 1.00 / (((beta**2) * precision) + recall) * 1.00)
    return f_score
    
def print_all_metrics(model, X, y):
    precision, recall, accuracy = get_all_metrics(model, X, y)
    f1_score = f_score(precision, recall, 1)
    f2_score = f_score(precision, recall, 2)
    
    print("Model: ")
    print(model)
    print("Precision: ", precision, "Recall: ", recall, "Accuracy: ",  accuracy)
    print("F1 Score: ", f1_score, "F2 Score: ", f2_score)
    
    

### Model A: SKlearn LR (Benchmark)

In [125]:
clf_lr = LogisticRegression(random_state=0, max_iter=5000)
clf_lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [126]:
%%time
clf_lr.fit(X_train, y_train)

CPU times: user 108 ms, sys: 0 ns, total: 108 ms
Wall time: 107 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [127]:
#testing some predictions from X_test randomly
n = random.randint(0,len(X_test)-1)

for i in range(0, 10):
    n = random.randint(0,len(X_test)-1)
    result = clf_lr.predict([X_test[n]])
    result = int(result)
    print(n, result)

(3520, 0)
(2221, 0)
(6761, 1)
(1860, 1)
(2635, 1)
(2959, 0)
(751, 0)
(8611, 0)
(1686, 1)
(381, 1)


In [163]:
#LR Metrics
print_all_metrics(clf_lr, X_test, y_test)

Model: 
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
('Precision: ', 0.7185173932857359, 'Recall: ', 0.8501078360891445, 'Accuracy: ', 0.694659090909091)
('F1 Score: ', 0.7787931176422163, 'F2 Score: ', 0.820070044037588)


### Model B: SKlearn - SVC 

In [107]:
%%time

C_range = np.logspace(-2, 4, 10)
gamma_range = np.logspace(-9, 3, 10)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv, verbose=100)
grid.fit(X_val, y_val)

print("The best hyperparameters are ", grid.best_params_, " with a score of ", grid.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] C=0.01, gamma=1e-09 .............................................
[CV] ............. C=0.01, gamma=1e-09, score=0.6484375, total=   1.4s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.3s remaining:    0.0s
[CV] C=0.01, gamma=1e-09 .............................................
[CV] ............. C=0.01, gamma=1e-09, score=0.6484375, total=   1.4s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.6s remaining:    0.0s
[CV] C=0.01, gamma=1e-09 .............................................
[CV] ............. C=0.01, gamma=1e-09, score=0.6484375, total=   1.4s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.9s remaining:    0.0s
[CV] C=0.01, gamma=1e-09 .............................................
[CV] ............. C=0.01, gamma=1e-09, score=0.6484375, total=   1.5s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elap

[CV] . C=0.01, gamma=2.154434690031878, score=0.6484375, total=   1.6s
[Parallel(n_jobs=1)]: Done  37 out of  37 | elapsed:  1.5min remaining:    0.0s
[CV] C=0.01, gamma=2.154434690031878 .................................
[CV] . C=0.01, gamma=2.154434690031878, score=0.6484375, total=   1.6s
[Parallel(n_jobs=1)]: Done  38 out of  38 | elapsed:  1.6min remaining:    0.0s
[CV] C=0.01, gamma=2.154434690031878 .................................
[CV] . C=0.01, gamma=2.154434690031878, score=0.6484375, total=   1.6s
[Parallel(n_jobs=1)]: Done  39 out of  39 | elapsed:  1.6min remaining:    0.0s
[CV] C=0.01, gamma=2.154434690031878 .................................
[CV] . C=0.01, gamma=2.154434690031878, score=0.6484375, total=   1.6s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.7min remaining:    0.0s
[CV] C=0.01, gamma=46.41588833612773 .................................
[CV] . C=0.01, gamma=46.41588833612773, score=0.6484375, total=   1.4s
[Parallel(n_jobs=1)]: Done  41 out of  41

[CV]  C=0.046415888336127774, gamma=0.00021544346900318823, score=0.680397727273, total=   1.7s
[Parallel(n_jobs=1)]: Done  73 out of  73 | elapsed:  3.0min remaining:    0.0s
[CV] C=0.046415888336127774, gamma=0.00021544346900318823 ............
[CV]  C=0.046415888336127774, gamma=0.00021544346900318823, score=0.675426136364, total=   1.7s
[Parallel(n_jobs=1)]: Done  74 out of  74 | elapsed:  3.0min remaining:    0.0s
[CV] C=0.046415888336127774, gamma=0.00021544346900318823 ............
[CV]  C=0.046415888336127774, gamma=0.00021544346900318823, score=0.675426136364, total=   1.7s
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  3.1min remaining:    0.0s
[CV] C=0.046415888336127774, gamma=0.004641588833612773 ..............
[CV]  C=0.046415888336127774, gamma=0.004641588833612773, score=0.6796875, total=   1.8s
[Parallel(n_jobs=1)]: Done  76 out of  76 | elapsed:  3.1min remaining:    0.0s
[CV] C=0.046415888336127774, gamma=0.004641588833612773 ..............
[CV]  C=0.046415888

[CV]  C=0.21544346900318834, gamma=4.6415888336127725e-07, score=0.6484375, total=   1.6s
[CV] C=0.21544346900318834, gamma=4.6415888336127725e-07 .............
[CV]  C=0.21544346900318834, gamma=4.6415888336127725e-07, score=0.6484375, total=   1.6s
[CV] C=0.21544346900318834, gamma=4.6415888336127725e-07 .............
[CV]  C=0.21544346900318834, gamma=4.6415888336127725e-07, score=0.6484375, total=   1.6s
[CV] C=0.21544346900318834, gamma=1e-05 ..............................
[CV]  C=0.21544346900318834, gamma=1e-05, score=0.681818181818, total=   1.8s
[CV] C=0.21544346900318834, gamma=1e-05 ..............................
[CV]  C=0.21544346900318834, gamma=1e-05, score=0.681818181818, total=   1.8s
[CV] C=0.21544346900318834, gamma=1e-05 ..............................
[CV]  C=0.21544346900318834, gamma=1e-05, score=0.686789772727, total=   1.8s
[CV] C=0.21544346900318834, gamma=1e-05 ..............................
[CV]  C=0.21544346900318834, gamma=1e-05, score=0.671164772727, total=

[CV] ......... C=1.0, gamma=1e-05, score=0.682528409091, total=   2.0s
[CV] C=1.0, gamma=1e-05 ..............................................
[CV] ......... C=1.0, gamma=1e-05, score=0.686079545455, total=   2.0s
[CV] C=1.0, gamma=1e-05 ..............................................
[CV] ......... C=1.0, gamma=1e-05, score=0.675426136364, total=   2.0s
[CV] C=1.0, gamma=1e-05 ..............................................
[CV] ......... C=1.0, gamma=1e-05, score=0.678977272727, total=   2.0s
[CV] C=1.0, gamma=0.00021544346900318823 .............................
[CV]  C=1.0, gamma=0.00021544346900318823, score=0.682528409091, total=   1.9s
[CV] C=1.0, gamma=0.00021544346900318823 .............................
[CV]  C=1.0, gamma=0.00021544346900318823, score=0.683238636364, total=   1.9s
[CV] C=1.0, gamma=0.00021544346900318823 .............................
[CV]  C=1.0, gamma=0.00021544346900318823, score=0.683238636364, total=   1.9s
[CV] C=1.0, gamma=0.00021544346900318823 ............

[CV]  C=4.6415888336127775, gamma=0.00021544346900318823, score=0.682528409091, total=   2.2s
[CV] C=4.6415888336127775, gamma=0.00021544346900318823 ..............
[CV]  C=4.6415888336127775, gamma=0.00021544346900318823, score=0.676846590909, total=   2.2s
[CV] C=4.6415888336127775, gamma=0.00021544346900318823 ..............
[CV]  C=4.6415888336127775, gamma=0.00021544346900318823, score=0.6796875, total=   2.2s
[CV] C=4.6415888336127775, gamma=0.004641588833612773 ................
[CV]  C=4.6415888336127775, gamma=0.004641588833612773, score=0.691761363636, total=   2.0s
[CV] C=4.6415888336127775, gamma=0.004641588833612773 ................
[CV]  C=4.6415888336127775, gamma=0.004641588833612773, score=0.691051136364, total=   2.0s
[CV] C=4.6415888336127775, gamma=0.004641588833612773 ................
[CV]  C=4.6415888336127775, gamma=0.004641588833612773, score=0.696022727273, total=   2.0s
[CV] C=4.6415888336127775, gamma=0.004641588833612773 ................
[CV]  C=4.64158883361

[CV]  C=21.54434690031882, gamma=0.004641588833612773, score=0.688920454545, total=   2.1s
[CV] C=21.54434690031882, gamma=0.004641588833612773 .................
[CV]  C=21.54434690031882, gamma=0.004641588833612773, score=0.708806818182, total=   2.1s
[CV] C=21.54434690031882, gamma=0.004641588833612773 .................
[CV]  C=21.54434690031882, gamma=0.004641588833612773, score=0.705255681818, total=   2.2s
[CV] C=21.54434690031882, gamma=0.004641588833612773 .................
[CV]  C=21.54434690031882, gamma=0.004641588833612773, score=0.681818181818, total=   2.2s
[CV] C=21.54434690031882, gamma=0.004641588833612773 .................
[CV]  C=21.54434690031882, gamma=0.004641588833612773, score=0.700284090909, total=   2.1s
[CV] C=21.54434690031882, gamma=0.1 ..................................
[CV]  C=21.54434690031882, gamma=0.1, score=0.642045454545, total=   2.4s
[CV] C=21.54434690031882, gamma=0.1 ..................................
[CV]  C=21.54434690031882, gamma=0.1, score=0

[CV] ......... C=100.0, gamma=0.1, score=0.645596590909, total=   2.6s
[CV] C=100.0, gamma=0.1 ..............................................
[CV] ......... C=100.0, gamma=0.1, score=0.636363636364, total=   2.5s
[CV] C=100.0, gamma=0.1 ..............................................
[CV] ......... C=100.0, gamma=0.1, score=0.656960227273, total=   2.3s
[CV] C=100.0, gamma=0.1 ..............................................
[CV] ......... C=100.0, gamma=0.1, score=0.618607954545, total=   2.5s
[CV] C=100.0, gamma=0.1 ..............................................
[CV] ......... C=100.0, gamma=0.1, score=0.649857954545, total=   2.6s
[CV] C=100.0, gamma=2.154434690031878 ................................
[CV]  C=100.0, gamma=2.154434690031878, score=0.650568181818, total=   1.7s
[CV] C=100.0, gamma=2.154434690031878 ................................
[CV]  C=100.0, gamma=2.154434690031878, score=0.645596590909, total=   1.7s
[CV] C=100.0, gamma=2.154434690031878 .............................

[CV]  C=464.1588833612773, gamma=2.154434690031878, score=0.650568181818, total=   1.7s
[CV] C=464.1588833612773, gamma=2.154434690031878 ....................
[CV]  C=464.1588833612773, gamma=2.154434690031878, score=0.645596590909, total=   1.7s
[CV] C=464.1588833612773, gamma=2.154434690031878 ....................
[CV]  C=464.1588833612773, gamma=2.154434690031878, score=0.652698863636, total=   1.7s
[CV] C=464.1588833612773, gamma=2.154434690031878 ....................
[CV]  C=464.1588833612773, gamma=2.154434690031878, score=0.653409090909, total=   1.7s
[CV] C=464.1588833612773, gamma=2.154434690031878 ....................
[CV]  C=464.1588833612773, gamma=2.154434690031878, score=0.654119318182, total=   1.7s
[CV] C=464.1588833612773, gamma=46.41588833612773 ....................
[CV]  C=464.1588833612773, gamma=46.41588833612773, score=0.650568181818, total=   1.8s
[CV] C=464.1588833612773, gamma=46.41588833612773 ....................
[CV]  C=464.1588833612773, gamma=46.4158883361

[CV]  C=2154.4346900318824, gamma=2.154434690031878, score=0.652698863636, total=   1.7s
[CV] C=2154.4346900318824, gamma=2.154434690031878 ...................
[CV]  C=2154.4346900318824, gamma=2.154434690031878, score=0.654119318182, total=   1.7s
[CV] C=2154.4346900318824, gamma=46.41588833612773 ...................
[CV]  C=2154.4346900318824, gamma=46.41588833612773, score=0.650568181818, total=   1.9s
[CV] C=2154.4346900318824, gamma=46.41588833612773 ...................
[CV]  C=2154.4346900318824, gamma=46.41588833612773, score=0.646306818182, total=   1.8s
[CV] C=2154.4346900318824, gamma=46.41588833612773 ...................
[CV]  C=2154.4346900318824, gamma=46.41588833612773, score=0.649147727273, total=   1.8s
[CV] C=2154.4346900318824, gamma=46.41588833612773 ...................
[CV]  C=2154.4346900318824, gamma=46.41588833612773, score=0.649857954545, total=   1.8s
[CV] C=2154.4346900318824, gamma=46.41588833612773 ...................
[CV]  C=2154.4346900318824, gamma=46.415

[CV]  C=10000.0, gamma=46.41588833612773, score=0.649857954545, total=   1.8s
[CV] C=10000.0, gamma=46.41588833612773 ..............................
[CV]  C=10000.0, gamma=46.41588833612773, score=0.651278409091, total=   1.8s
[CV] C=10000.0, gamma=1000.0 .........................................
[CV] .... C=10000.0, gamma=1000.0, score=0.649857954545, total=   1.7s
[CV] C=10000.0, gamma=1000.0 .........................................
[CV] .... C=10000.0, gamma=1000.0, score=0.647017045455, total=   1.8s
[CV] C=10000.0, gamma=1000.0 .........................................
[CV] .... C=10000.0, gamma=1000.0, score=0.649147727273, total=   1.8s
[CV] C=10000.0, gamma=1000.0 .........................................
[CV] .... C=10000.0, gamma=1000.0, score=0.649147727273, total=   1.8s
[CV] C=10000.0, gamma=1000.0 .........................................
[CV] .... C=10000.0, gamma=1000.0, score=0.649857954545, total=   1.8s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 40.1min fi

#### ('The best hyperparameters are ', {'C': 2154.4346900318824, 'gamma': 0.00021544346900318823}, ' with a score of ', 0.7214488636363636)

In [130]:
%%time
clf_svm = SVC(C=2154.4346900318824, gamma=0.00021544346900318823).fit(X_train, y_train)
#('The best hyperparameters are ', {'C': 215`4.4346900318824, 'gamma': 0.00021544346900318823}, ' 
# with a score of ', 0.7214488636363636)

CPU times: user 8min 48s, sys: 124 ms, total: 8min 49s
Wall time: 8min 49s


In [143]:
#testing some predictions from X_test randomly
n = random.randint(0,len(X_test)-1)

for i in range(0, 10):
    n = random.randint(0,len(X_test)-1)
    result = clf_svm.predict([X_test[n]])
    result = int(result)
    print(n, result)

(4172, 1)
(3895, 1)
(5466, 1)
(4974, 1)
(4589, 0)
(3875, 0)
(262, 1)
(8150, 1)
(2572, 1)
(4202, 1)


In [164]:
#SVM Metrics
print_all_metrics(clf_svm, X_test, y_test)

Model: 
SVC(C=2154.43469003, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.000215443469003,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
('Precision: ', 0.752022453359749, 'Recall: ', 0.8186556434219986, 'Accuracy: ', 0.7146590909090909)
('F1 Score: ', 0.7839256518371914, 'F2 Score: ', 0.8044008052837919)


### C. Model:  Gradient Boost 

In [145]:
##SKlearn GradientBoostingClassifier
clf_grad_boost = GradientBoostingClassifier(random_state=0, learning_rate=0.1, n_estimators=1000, max_depth=10)
clf_grad_boost

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              n_iter_no_change=None, presort='auto', random_state=0,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [146]:
%%time
clf_grad_boost.fit(X_train, y_train)

CPU times: user 2min 36s, sys: 31.9 ms, total: 2min 37s
Wall time: 2min 37s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              n_iter_no_change=None, presort='auto', random_state=0,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [147]:
#testing some predictions from X_test randomly
n = random.randint(0,len(X_test)-1)

for i in range(0, 10):
    n = random.randint(0,len(X_test)-1)
    result = clf_grad_boost.predict([X_test[n]])
    result = int(result)
    print(n, result)

(4877, 1)
(1682, 0)
(170, 1)
(7440, 0)
(1021, 1)
(8027, 1)
(3588, 0)
(3184, 1)
(7079, 1)
(4040, 1)


In [165]:
#metrics for gradient boosting
print_all_metrics(clf_grad_boost, X_test, y_test)

Model: 
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              n_iter_no_change=None, presort='auto', random_state=0,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
('Precision: ', 0.9139690358902182, 'Recall: ', 0.9336808051761323, 'Accuracy: ', 0.9025)
('F1 Score: ', 0.9237197724039828, 'F2 Score: ', 0.9296707229778094)


### D. Model:  RandomForrestClassifier

In [151]:
clf_rf = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=1000)
clf_rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [152]:
%%time
clf_rf.fit(X_train, y_train)

CPU times: user 13.4 s, sys: 48 ms, total: 13.4 s
Wall time: 13.4 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [166]:
#metrics for RandomForrest
print_all_metrics(clf_rf, X_test, y_test)

Model: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
('Precision: ', 0.7693923723335488, 'Recall: ', 0.8556793673616103, 'Accuracy: ', 0.7465909090909091)
('F1 Score: ', 0.8102450646698436, 'F2 Score: ', 0.836907607931374)


## Results

In [172]:
print("Metrics from all the models:\n")
print_all_metrics(clf_lr, X_test, y_test)
print("\n")
print_all_metrics(clf_svm, X_test, y_test)
print("\n")
print_all_metrics(clf_grad_boost, X_test, y_test)
print("\n")
print_all_metrics(clf_rf, X_test, y_test)

Metrics from all the models:

Model: 
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
('Precision: ', 0.7185173932857359, 'Recall: ', 0.8501078360891445, 'Accuracy: ', 0.694659090909091)
('F1 Score: ', 0.7787931176422163, 'F2 Score: ', 0.820070044037588)


Model: 
SVC(C=2154.43469003, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.000215443469003,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
('Precision: ', 0.752022453359749, 'Recall: ', 0.8186556434219986, 'Accuracy: ', 0.7146590909090909)
('F1 Score: ', 0.7839256518371914, 'F2 Score: ', 0.8044008052837919)


Model: 
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1

## Summary & Next steps

### GradientBoostingClassifier is the best model with 90% accuracy and above 90% precision, recall, f1 and f2 scores as well