# **Baseline Model Implementation**

#### Importing libraries

In [1]:
#importing required libraries for data analysis
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

#Import Data balancing libraries
import imblearn
from imblearn.over_sampling import SMOTE

# Import models from sklearn
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Import evaluation metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [2]:
# Read the training & test datasets from Part1-Preprocessing part

x_train=pd.read_csv('x_train.csv')
x_test=pd.read_csv('x_test.csv')

y_train=pd.read_csv('y_train.csv')
y_test=pd.read_csv('y_test.csv')

print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(5250, 86) (2250, 86) (5250, 1) (2250, 1)


### Data Balancing


In supervised learning, a common strategy to overcome the class imbalance problem is to resample the original training dataset to decrease the overall level of class imbalance. Resampling is done either by oversampling the minority (positive) class and/or under-sampling the majority (negative) class until the classes are approximately equally represented.

All resampling operations are applied to only training datasets. If upsampling is done before splitting the dataset into a train and validation set, then it could end up with the same observation in both datasets. As a result, a machine learning model will be able to perfectly predict the value for those observations when predicting on the validation set, hence inflating the accuracy and recall.

A. Under Sampling:

    1. Random undersampling --> rus =  RandomUnderSampler()
    2. Near Miss --> nm = NearMiss()

B. Over Sampling:
    
    3. Random oversampling  --> ros =  RandomOverSampler()
    4. SMOTE (Synthetic Minority Over-Sampling Technique)
        sm = SMOTE()
        x_train_smote, y_train_smote = sm.fit_sample(x_train, y_train)
        x_train_smote.shape,y_train_smote.shape
    5. ADASYN (Adaptive Synthetic Sampling) --> adasyn =  ADASYN()

C. Hybrid Sampling:

    6. SMOTE+ENN --> smtenn =  SMOTEENN()
    7. SMOTE+Tomek link --> smtom =  SMOTETomek()

In [3]:
#importing SMOTE to handle class imbalance
from imblearn.over_sampling import SMOTE

smote = SMOTE()

# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(x_train, y_train)

print('Original unbalanced dataset shape', len(y_train))
print('Resampled balanced dataset shape', len(y_smote))

Original unbalanced dataset shape 5250
Resampled balanced dataset shape 8242


In [4]:
# Saving final balanced files in csv format for future reference (for Part 3)

pd.DataFrame(x_smote).to_csv("x_smote.csv", index=None)
pd.DataFrame(y_smote).to_csv("y_smote.csv", index=None)

### 1. **Logistic Regression Model**

In [5]:
# Importing the Logistic Regression Model
logmodel = LogisticRegression(random_state=1)
logmodel.fit(x_smote,y_smote)

# predicting the y test observations
y_pred = logmodel.predict(x_test)
y_train_pred = logmodel.predict(x_smote)

In [6]:
#getting all scores for Logistic Regression
log_acctr = round(accuracy_score(y_train_pred,y_smote), 3)
log_acc = round(accuracy_score(y_pred,y_test), 3)
log_prec = round(precision_score(y_pred,y_test), 3)
log_rec = round(recall_score(y_pred,y_test), 3)
log_f1 = round(f1_score(y_pred,y_test), 3)
log_roc = round(roc_auc_score(y_pred,y_test), 3)

results = pd.DataFrame([['Logistic Regression', log_acctr, log_acc, log_prec, log_rec, log_f1, log_roc]],
               columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score','ROC'])
results

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1 Score,ROC
0,Logistic Regression,0.703,0.77,0.569,0.505,0.535,0.684


In [7]:
# Confusion Matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.83      0.85      1726
           1       0.51      0.57      0.54       524

    accuracy                           0.77      2250
   macro avg       0.68      0.70      0.69      2250
weighted avg       0.78      0.77      0.77      2250

[[1434  292]
 [ 226  298]]


### 2. **Naive Bayes**

In [8]:
# Importing the Naive Bayes Model
nb = GaussianNB()
nb.fit(x_smote,y_smote)

# predicting the y test observations
y_pred = nb.predict(x_test)
y_train_pred = nb.predict(x_smote)

In [9]:
#getting all scores for Naive Bayes Model
nb_acctr = round(accuracy_score(y_train_pred,y_smote), 3)
nb_acc = round(accuracy_score(y_pred,y_test), 3)
nb_prec = round(precision_score(y_pred,y_test), 3)
nb_rec = round(recall_score(y_pred,y_test), 3)
nb_f1 = round(f1_score(y_pred,y_test), 3)
nb_roc = round(roc_auc_score(y_pred,y_test), 3)

results = pd.DataFrame([['Naive Bayes', nb_acctr, nb_acc, nb_prec, nb_rec, nb_f1, nb_roc]],
               columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score','ROC'])
results

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1 Score,ROC
0,Naive Bayes,0.542,0.781,0.105,0.696,0.182,0.74


In [10]:
# Confusion Matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.99      0.87      1726
           1       0.70      0.10      0.18       524

    accuracy                           0.78      2250
   macro avg       0.74      0.55      0.53      2250
weighted avg       0.76      0.78      0.71      2250

[[1702   24]
 [ 469   55]]


### 3. **KNN classifier algorithm**

In [11]:
# Importing the KNN classifier
knn = KNeighborsClassifier()
knn.fit(x_smote,y_smote)

# predicting the y test observations
y_pred = knn.predict(x_test)
y_train_pred = knn.predict(x_smote)

In [12]:
#getting all scores for KNN classifier
knn_acctr = round(accuracy_score(y_train_pred,y_smote), 3)
knn_acc = round(accuracy_score(y_pred,y_test), 3)
knn_prec = round(precision_score(y_pred,y_test), 3)
knn_rec = round(recall_score(y_pred,y_test), 3)
knn_f1 = round(f1_score(y_pred,y_test), 3)
knn_roc = round(roc_auc_score(y_pred,y_test), 3)

results = pd.DataFrame([['KNN classifier', knn_acctr, knn_acc, knn_prec, knn_rec, knn_f1, knn_roc]],
               columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score','ROC'])
results

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1 Score,ROC
0,KNN classifier,0.866,0.659,0.605,0.361,0.453,0.605


In [13]:
# Confusion Matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.68      0.75      1726
           1       0.36      0.60      0.45       524

    accuracy                           0.66      2250
   macro avg       0.61      0.64      0.60      2250
weighted avg       0.74      0.66      0.68      2250

[[1166  560]
 [ 207  317]]


### 4. **Decision Tree Classification**

In [14]:
#fitting data into Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state=1)
dtc.fit(x_smote,y_smote)

# predicting the y test observations
y_pred = dtc.predict(x_test)
y_train_pred = dtc.predict(x_smote)

In [15]:
#getting all scores for Decision Tree Classifier
dtc_acctr = round(accuracy_score(y_train_pred,y_smote), 3)
dtc_acc = round(accuracy_score(y_pred,y_test), 3)
dtc_prec = round(precision_score(y_pred,y_test), 3)
dtc_rec = round(recall_score(y_pred,y_test), 3)
dtc_f1 = round(f1_score(y_pred,y_test), 3)
dtc_roc = round(roc_auc_score(y_pred,y_test), 3)

results = pd.DataFrame([['Decision Trees', dtc_acctr, dtc_acc, dtc_prec, dtc_rec, dtc_f1, dtc_roc]],
               columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score','ROC'])
results

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1 Score,ROC
0,Decision Trees,1.0,0.692,0.468,0.372,0.415,0.599


In [16]:
# Confusion Matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.76      0.79      1726
           1       0.37      0.47      0.41       524

    accuracy                           0.69      2250
   macro avg       0.60      0.61      0.60      2250
weighted avg       0.72      0.69      0.70      2250

[[1313  413]
 [ 279  245]]


### 5. **Random Forest Classification**

In [17]:
#fitting data into Random Forest Classifier
rfc = RandomForestClassifier()
rfc.fit(x_smote,y_smote)

# predicting the y test observations
y_pred = rfc.predict(x_test)
y_train_pred = rfc.predict(x_smote)

In [18]:
#getting all scores for Random Forest Classifier
rfc_acctr = round(accuracy_score(y_train_pred,y_smote), 3)
rfc_acc = round(accuracy_score(y_pred,y_test), 3)
rfc_prec = round(precision_score(y_pred,y_test), 3)
rfc_rec = round(recall_score(y_pred,y_test), 3)
rfc_f1 = round(f1_score(y_pred,y_test), 3)
rfc_roc = round(roc_auc_score(y_pred,y_test), 3)

results = pd.DataFrame([['Random Forest', rfc_acctr, rfc_acc, rfc_prec, rfc_rec, rfc_f1, rfc_roc]],
               columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score','ROC'])
results

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1 Score,ROC
0,Random Forest,1.0,0.796,0.454,0.579,0.509,0.712


In [19]:
# Confusion Matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1726
           1       0.58      0.45      0.51       524

    accuracy                           0.80      2250
   macro avg       0.71      0.68      0.69      2250
weighted avg       0.78      0.80      0.79      2250

[[1553  173]
 [ 286  238]]


### 6.  **Gradient Boosting**

In [20]:
#fitting data into Gradient Boosting Classifier
gbc = GradientBoostingClassifier()
gbc.fit(x_smote,y_smote)

# predicting the y test observations
y_pred = gbc.predict(x_test)
y_train_pred = gbc.predict(x_smote)

In [21]:
#getting all scores for Logistic Regression
gbc_acctr = round(accuracy_score(y_train_pred,y_smote), 3)
gbc_acc = round(accuracy_score(y_pred,y_test), 3)
gbc_prec = round(precision_score(y_pred,y_test), 3)
gbc_rec = round(recall_score(y_pred,y_test), 3)
gbc_f1 = round(f1_score(y_pred,y_test), 3)
gbc_roc = round(roc_auc_score(y_pred,y_test), 3)

results = pd.DataFrame([['Gradient Boosting', gbc_acctr, gbc_acc, gbc_prec, gbc_rec, gbc_f1, gbc_roc]],
               columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score','ROC'])
results

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1 Score,ROC
0,Gradient Boosting,0.834,0.793,0.435,0.573,0.495,0.707


In [22]:
# Confusion Matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1726
           1       0.57      0.44      0.49       524

    accuracy                           0.79      2250
   macro avg       0.71      0.67      0.68      2250
weighted avg       0.78      0.79      0.78      2250

[[1556  170]
 [ 296  228]]


### 7. **XG Boosting**

In [23]:
#fitting data into XG Boosting Classifier
xgb = XGBClassifier()
xgb.fit(x_smote,y_smote)

# predicting the y test observations
y_pred = xgb.predict(x_test)
y_train_pred = xgb.predict(x_smote)

In [24]:
#getting all scores for XG Boosting Classifier
xgb_acctr = round(accuracy_score(y_train_pred,y_smote), 3)
xgb_acc = round(accuracy_score(y_pred,y_test), 3)
xgb_prec = round(precision_score(y_pred,y_test), 3)
xgb_rec = round(recall_score(y_pred,y_test), 3)
xgb_f1 = round(f1_score(y_pred,y_test), 3)
xgb_roc = round(roc_auc_score(y_pred,y_test), 3)

results = pd.DataFrame([['XG Boosting', xgb_acctr, xgb_acc, xgb_prec, xgb_rec, xgb_f1, xgb_roc]],
               columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score','ROC'])
results

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1 Score,ROC
0,XG Boosting,0.987,0.791,0.389,0.576,0.465,0.704


In [25]:
# Confusion Matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1726
           1       0.58      0.39      0.46       524

    accuracy                           0.79      2250
   macro avg       0.70      0.65      0.67      2250
weighted avg       0.77      0.79      0.78      2250

[[1576  150]
 [ 320  204]]


### 8. **ADA Boosting**

In [26]:
#fitting data into Ada Boosting Classifier
ada = AdaBoostClassifier()
ada.fit(x_smote,y_smote)

# predicting the y test observations
y_pred = ada.predict(x_test)
y_train_pred = ada.predict(x_smote)

In [27]:
#getting all scores for Ada Boosting Classifier
ada_acctr = round(accuracy_score(y_train_pred,y_smote), 3)
ada_acc = round(accuracy_score(y_pred,y_test), 3)
ada_prec = round(precision_score(y_pred,y_test), 3)
ada_rec = round(recall_score(y_pred,y_test), 3)
ada_f1 = round(f1_score(y_pred,y_test), 3)
ada_roc = round(roc_auc_score(y_pred,y_test), 3)

results = pd.DataFrame([['ADA Boosting', ada_acctr, ada_acc, ada_prec, ada_rec, ada_f1, ada_roc]],
               columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score','ROC'])
results

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1 Score,ROC
0,ADA Boosting,0.781,0.764,0.475,0.492,0.483,0.667


In [28]:
# Confusion Matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.85      0.85      1726
           1       0.49      0.48      0.48       524

    accuracy                           0.76      2250
   macro avg       0.67      0.66      0.67      2250
weighted avg       0.76      0.76      0.76      2250

[[1469  257]
 [ 275  249]]


### 9. **Bagging Classifier**

In [29]:
#fitting data into Bagging Classifier
bag = BaggingClassifier()
bag.fit(x_smote,y_smote)

# predicting the y test observations
y_pred = bag.predict(x_test)
y_train_pred = bag.predict(x_smote)

In [30]:
#getting all scores for Bagging Classifier
bag_acctr = round(accuracy_score(y_train_pred,y_smote), 3)
bag_acc = round(accuracy_score(y_pred,y_test), 3)
bag_prec = round(precision_score(y_pred,y_test), 3)
bag_rec = round(recall_score(y_pred,y_test), 3)
bag_f1 = round(f1_score(y_pred,y_test), 3)
bag_roc = round(roc_auc_score(y_pred,y_test), 3)

results = pd.DataFrame([['Bagging Classifier', bag_acctr, bag_acc, bag_prec, bag_rec, bag_f1, bag_roc]],
               columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score','ROC'])
results

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1 Score,ROC
0,Bagging Classifier,0.993,0.776,0.422,0.525,0.468,0.68


In [31]:
# Confusion Matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1726
           1       0.52      0.42      0.47       524

    accuracy                           0.78      2250
   macro avg       0.68      0.65      0.66      2250
weighted avg       0.76      0.78      0.77      2250

[[1526  200]
 [ 303  221]]


### **Baseline Model Comparision**

In [32]:
all_classifiers = ['Logistic Regression', 'KNN Classifier', 'Naive Bayes', 'Decision Tree', 'Random Forest',
                  'Gradient Boosting', 'XG Boosting', 'Ada Boosting', 'Bagging algorithm']

all_train_accuracy =  [log_acctr, knn_acctr, nb_acctr, dtc_acctr, rfc_acctr, gbc_acctr, xgb_acctr, ada_acctr, bag_acctr]
all_test_accuracy =   [log_acc, knn_acc, nb_acc, dtc_acc, rfc_acc, gbc_acc, xgb_acc, ada_acc, bag_acc]
all_precision_score = [log_prec, knn_prec, nb_prec, dtc_prec, rfc_prec, gbc_prec, xgb_prec, ada_prec, bag_prec]
all_recall_score =    [log_rec, knn_rec, nb_rec, dtc_rec, rfc_rec, gbc_rec, xgb_rec, ada_rec, bag_rec]
all_f1_score =        [log_f1, knn_f1, nb_f1, dtc_f1, rfc_f1, gbc_f1, xgb_f1, ada_f1, bag_f1]
all_roc_score =       [log_roc, knn_roc, nb_roc, dtc_roc, rfc_roc, gbc_roc, xgb_roc, ada_roc, bag_roc]

In [33]:
compare_df = pd.DataFrame({'Classifier':all_classifiers, 'Train Accuracy': all_train_accuracy, 'Test Accuracy': all_test_accuracy, 'Precision': all_precision_score, 'Recall': all_recall_score, 'F1 Score': all_f1_score , 'AUC': all_roc_score})
compare_df

Unnamed: 0,Classifier,Train Accuracy,Test Accuracy,Precision,Recall,F1 Score,AUC
0,Logistic Regression,0.703,0.77,0.569,0.505,0.535,0.684
1,KNN Classifier,0.866,0.659,0.605,0.361,0.453,0.605
2,Naive Bayes,0.542,0.781,0.105,0.696,0.182,0.74
3,Decision Tree,1.0,0.692,0.468,0.372,0.415,0.599
4,Random Forest,1.0,0.796,0.454,0.579,0.509,0.712
5,Gradient Boosting,0.834,0.793,0.435,0.573,0.495,0.707
6,XG Boosting,0.987,0.791,0.389,0.576,0.465,0.704
7,Ada Boosting,0.781,0.764,0.475,0.492,0.483,0.667
8,Bagging algorithm,0.993,0.776,0.422,0.525,0.468,0.68


#### Conclusion from Baseline models

1. From all baseline model, XG Boosting classifier shows better test accuracy and F1 score and AUC.
2. Ensemble models shows better performance compated to base models (except for Logistic regression model)
3. Train accuracy - There is huge difference between Train and Test Accuracy in tree based models which shows OverFitting

We can try in the next part, Cross validation and Hyperparameter tuning techniques to reduce chances of overfitting and also increases performance of model.