In [1]:
# Load Dataset
# Pull the pre-split train/test features and labels from the project helper.
# The shapes echoed under this cell are not printed by any statement here, so
# they presumably come from get_dataset() itself — TODO confirm.
from helpers import get_dataset

(X_train, y_train), (X_test, y_test) = get_dataset()

X_train - (11388, 8) , y_train - (11388,)
X_test - (3603, 8) , y_test - (3603,)


In [2]:
# imports
from helpers import print_analysis, average
import joblib

In [4]:
# 1. Logistic Regression

from sklearn.linear_model import LogisticRegression

# Build the estimator first, then fit it on the training split.
clf_LR = LogisticRegression(C=0.2, max_iter=1000000, verbose=1)
clf_LR.fit(X_train, y_train)

# Hard labels are passed through the project's average() helper before they
# are scored; y_score is column 1 of predict_proba.
y_pred = average(clf_LR.predict(X_test))
y_score = clf_LR.predict_proba(X_test)[:, 1]

print_analysis(y_test, y_pred, y_score)

# Persist the fitted model; the path list returned by dump is the cell output.
joblib.dump(clf_LR, './models/logistic.pkl')

Accuracy: 0.7502081598667777
Precison: 0.8413251961639059
Recall: 0.5733808674985146
F1 Score: 0.6819787985865725
ROC Score:  0.7891448182808477
Confusion Matrix:
[[1738  182]
 [ 718  965]]
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


['./models/logistic.pkl']

In [9]:
# 2. KNN

from sklearn.neighbors import KNeighborsClassifier

# 80-neighbour classifier, constructed and fitted in one expression
# (fit returns the estimator itself).
clf_neigh = KNeighborsClassifier(n_neighbors=80).fit(X_train, y_train)

# Test-split labels (post-processed by the average() helper) and the
# column-1 probabilities used as the ranking score.
y_pred_KN = average(clf_neigh.predict(X_test))
y_score_KN = clf_neigh.predict_proba(X_test)[:, 1]

print_analysis(y_test, y_pred_KN, y_score_KN)

# Persist the fitted model; the path list returned by dump is the cell output.
joblib.dump(clf_neigh, './models/knn.pkl')

Accuracy: 0.8184845961698585
Precison: 0.8819599109131403
Recall: 0.7058823529411765
F1 Score: 0.7841584158415842
ROC Score:  0.8281140139136464
Confusion Matrix:
[[1761  159]
 [ 495 1188]]


['./models/knn.pkl']

In [10]:
# 3. Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree with a fixed seed, constructed and fitted in one go
# (fit returns the estimator itself).
clf_dt = DecisionTreeClassifier(max_depth=5, random_state=0).fit(X_train, y_train)

# Test-split labels (post-processed by the average() helper) and the
# column-1 probabilities used as the ranking score.
y_pred_dt = average(clf_dt.predict(X_test))
y_score_dt = clf_dt.predict_proba(X_test)[:, 1]

print_analysis(y_test, y_pred_dt, y_score_dt)

# Persist the fitted tree; it is reloaded later as a base model of the ensemble.
joblib.dump(clf_dt, './models/decision_tree.pkl')

Accuracy: 0.8157091312794893
Precison: 0.9027667984189723
Recall: 0.6785502079619726
F1 Score: 0.7747625508819539
ROC Score:  0.7893105379778175
Confusion Matrix:
[[1797  123]
 [ 541 1142]]


['./models/decision_tree.pkl']

In [11]:
# 4. Random Forest
from sklearn.ensemble import RandomForestClassifier

# Seeded forest with depth-6 trees, constructed and fitted in one expression
# (fit returns the estimator itself).
clf_RF = RandomForestClassifier(max_depth=6, random_state=0).fit(X_train, y_train)

# Test-split labels (post-processed by the average() helper) and the
# column-1 probabilities used as the ranking score.
y_pred_RF = average(clf_RF.predict(X_test))
y_score_RF = clf_RF.predict_proba(X_test)[:, 1]

print_analysis(y_test, y_pred_RF, y_score_RF)

# Persist the fitted forest; it is reloaded later as a base model of the ensemble.
joblib.dump(clf_RF, './models/random_forest.pkl')

Accuracy: 0.8201498751040799
Precison: 0.8935361216730038
Recall: 0.6981580510992276
F1 Score: 0.7838559039359573
ROC Score:  0.8146299081501287
Confusion Matrix:
[[1780  140]
 [ 508 1175]]


['./models/random_forest.pkl']

In [3]:
# 5. XGB Classifier

from xgboost import XGBClassifier

# Gradient-boosted classifier with library-default hyperparameters.
clf_XGB = XGBClassifier()
clf_XGB.fit(X_train, y_train)

# Test-split labels (post-processed by the average() helper) and the
# column-1 probabilities used as the ranking score.
y_pred_XGB = average(clf_XGB.predict(X_test))
y_score_XGB = clf_XGB.predict_proba(X_test)[:, 1]

print_analysis(y_test, y_pred_XGB, y_score_XGB)

# Persist the fitted booster; it is reloaded later as a base model of the ensemble.
joblib.dump(clf_XGB, './models/xgb.pkl')

Accuracy: 0.8254232583957813
Precison: 0.8675034867503487
Recall: 0.7391562685680333
F1 Score: 0.7982034007058069
ROC Score:  0.8382798573975044
Confusion Matrix:
[[1730  190]
 [ 439 1244]]


['./models/xgb.pkl']

In [3]:
# Ensemble model. An ensemble of Random forest, Decision Tree and XG Boost

import numpy as np
import pandas as pd
import joblib

In [9]:

# Reload the three base models written by the training cells above, in the
# order decision tree -> random forest -> XGBoost. mmap_mode='r' requests
# read-only memory mapping of the stored arrays.
dt_model, rf_model, xgb_model = (
    joblib.load(pkl_path, mmap_mode='r')
    for pkl_path in (
        './models/decision_tree.pkl',
        './models/random_forest.pkl',
        './models/xgb.pkl',
    )
)

In [10]:
# Decision tree on the TRAINING split: labels go through the average()
# helper, scores are column 1 of predict_proba. pred_dt later becomes one
# column of the meta-model's training matrix.
pred_dt = average(dt_model.predict(X_train))
score_dt = dt_model.predict_proba(X_train)[:, 1]

print_analysis(y_train, pred_dt, score_dt)

Accuracy: 0.797154899894626
Precison: 0.8923788653518383
Recall: 0.6623893005602747
F1 Score: 0.7603734439834023
ROC Score:  0.8255984163337651
Confusion Matrix:
[[5413  442]
 [1868 3665]]


In [11]:
# Random forest on the TRAINING split: labels go through the average()
# helper, scores are column 1 of predict_proba. pred_rf later becomes one
# column of the meta-model's training matrix.
pred_rf = average(rf_model.predict(X_train))
score_rf = rf_model.predict_proba(X_train)[:, 1]

print_analysis(y_train, pred_rf, score_rf)

Accuracy: 0.8202493853178785
Precison: 0.905160390516039
Recall: 0.7037773359840954
F1 Score: 0.7918657854600916
ROC Score:  0.8632612059959164
Confusion Matrix:
[[5447  408]
 [1639 3894]]


In [12]:
# XGBoost on the TRAINING split: labels go through the average() helper,
# scores are column 1 of predict_proba. pred_xgb later becomes one column
# of the meta-model's training matrix.
pred_xgb = average(xgb_model.predict(X_train))
score_xgb = xgb_model.predict_proba(X_train)[:, 1]

print_analysis(y_train, pred_xgb, score_xgb)

Accuracy: 0.8276255707762558
Precison: 0.8861964517524881
Recall: 0.7402855593710465
F1 Score: 0.8066962087641556
ROC Score:  0.8879302864591813
Confusion Matrix:
[[5329  526]
 [1437 4096]]


In [22]:
# Meta-features for training: one column per base model (tree, forest, XGB),
# one row per training sample. The per-model predictions are 1-D, so joining
# them along the last axis yields an (n_samples, 3) matrix.
y_ensemble_train = np.stack([pred_dt, pred_rf, pred_xgb], axis=-1)
y_ensemble_train.shape

(11388, 3)

In [27]:
from sklearn.linear_model import LogisticRegression

# Stacking meta-model: a logistic regression over the three base-model
# prediction columns.
# NOTE(review): the base models were fitted on this same X_train, so these
# meta-features are in-sample predictions; out-of-fold predictions would give
# the meta-model a less optimistic view — confirm whether that matters here.
clf_LR_ensemble = LogisticRegression(C=0.2, max_iter=1000000, verbose=1)
clf_LR_ensemble.fit(y_ensemble_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [30]:
# Decision tree on the TEST split: labels go through the average() helper,
# scores are column 1 of predict_proba. test_pred_dt later becomes one
# column of the meta-model's test matrix.
test_pred_dt = average(dt_model.predict(X_test))
test_score_dt = dt_model.predict_proba(X_test)[:, 1]

print_analysis(y_test, test_pred_dt, test_score_dt)

Accuracy: 0.8157091312794893
Precison: 0.9027667984189723
Recall: 0.6785502079619726
F1 Score: 0.7747625508819539
ROC Score:  0.7893105379778175
Confusion Matrix:
[[1797  123]
 [ 541 1142]]


In [31]:
# Random forest on the TEST split: labels go through the average() helper,
# scores are column 1 of predict_proba. test_pred_rf later becomes one
# column of the meta-model's test matrix.
test_pred_rf = average(rf_model.predict(X_test))
test_score_rf = rf_model.predict_proba(X_test)[:, 1]

print_analysis(y_test, test_pred_rf, test_score_rf)

Accuracy: 0.8201498751040799
Precison: 0.8935361216730038
Recall: 0.6981580510992276
F1 Score: 0.7838559039359573
ROC Score:  0.8146299081501287
Confusion Matrix:
[[1780  140]
 [ 508 1175]]


In [32]:
# XGBoost on the TEST split: labels go through the average() helper, scores
# are column 1 of predict_proba. test_pred_xgb later becomes one column of
# the meta-model's test matrix.
test_pred_xgb = average(xgb_model.predict(X_test))
test_score_xgb = xgb_model.predict_proba(X_test)[:, 1]

print_analysis(y_test, test_pred_xgb, test_score_xgb)

Accuracy: 0.8254232583957813
Precison: 0.8675034867503487
Recall: 0.7391562685680333
F1 Score: 0.7982034007058069
ROC Score:  0.8382798573975044
Confusion Matrix:
[[1730  190]
 [ 439 1244]]


In [33]:
# Meta-features for evaluation, with the same column order as the training
# matrix (tree, forest, XGB). The per-model predictions are 1-D, so joining
# them along the last axis yields an (n_samples, 3) matrix.
y_ensemble_test = np.stack([test_pred_dt, test_pred_rf, test_pred_xgb], axis=-1)
y_ensemble_test.shape

(3603, 3)

In [34]:
# Evaluate the stacked meta-model on the held-out base-model predictions.
y_pred_ensemble = clf_LR_ensemble.predict(y_ensemble_test)

# Scores must come from the same meta-model as the labels. clf_LR is the
# standalone logistic regression fitted on the raw 8-column X_train, not on
# the 3 stacked prediction columns, so scoring with it reports another
# model's ROC (and fails on the feature-count mismatch after a clean
# top-to-bottom run). Re-run this cell to refresh the printed metrics.
y_score_ensemble = clf_LR_ensemble.predict_proba(y_ensemble_test)[:, 1]

print_analysis(y_test, y_pred_ensemble, y_score_ensemble)

Accuracy: 0.8226477935054122
Precison: 0.859504132231405
Recall: 0.7415329768270945
F1 Score: 0.7961722488038279
ROC Score:  0.8381565347098436
Confusion Matrix:
[[1716  204]
 [ 435 1248]]


In [35]:
# We will be using this ensemble for our model.
# Save the fitted stacking meta-model (clf_LR_ensemble) to disk. Only the
# logistic regression over the three base-model prediction columns is stored
# here; the base models live in their own .pkl files written by the earlier
# training cells, so all four files are needed to reproduce a prediction.
joblib.dump(clf_LR_ensemble, './models/ensemble_model.pkl')

['./models/ensemble_model.pkl']

In [None]:
# (scratch cell intentionally left empty — a stray bare name here would raise
# NameError under Restart & Run All)