In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

# import datetime class from datetime module

from datetime import datetime

In [2]:
# Load the HR data 

hrdata = pd.read_csv(r"C:\Users\Admin\Downloads\Raju Sir DLS\Ensemble Learning\HR_comma_sep.csv", header=0)

# Copy to back-up file

hrdata_bk = hrdata.copy()

# Display first 5 records

hrdata.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
# Display the dataset information

hrdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
# Display dataset columns

hrdata.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'Department', 'salary'],
      dtype='object')

In [6]:
# Create cols1 for crating a dummy variables & Cols 2 for scaling the data normilization

cols1 = ['Department', 'salary']
cols2 = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company']

In [7]:
# Create dummy variable for all cols1 columns

hrdata = pd.get_dummies(hrdata, columns=cols1)
hrdata.head().T

Unnamed: 0,0,1,2,3,4
satisfaction_level,0.38,0.8,0.11,0.72,0.37
last_evaluation,0.53,0.86,0.88,0.87,0.52
number_project,2.0,5.0,7.0,5.0,2.0
average_montly_hours,157.0,262.0,272.0,223.0,159.0
time_spend_company,3.0,6.0,4.0,5.0,3.0
Work_accident,0.0,0.0,0.0,0.0,0.0
left,1.0,1.0,1.0,1.0,1.0
promotion_last_5years,0.0,0.0,0.0,0.0,0.0
Department_IT,0.0,0.0,0.0,0.0,0.0
Department_RandD,0.0,0.0,0.0,0.0,0.0


In [8]:
# Identify the independent and Target variables

IndepVar = []
for col in hrdata.columns:
    if col != 'left':
        IndepVar.append(col)

TargetVar = 'left'

x = hrdata[IndepVar]
y = hrdata[TargetVar]

In [9]:
# Splitting the dataset into train and test 

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 42)
x_test_F1 = x_test.copy()
x_train.shape, x_test.shape, y_train.shape, y_test.shape 

((10499, 20), (4500, 20), (10499,), (4500,))

In [10]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train[cols2] = mmscaler.fit_transform(x_train[cols2])
x_train = pd.DataFrame(x_train)

x_test[cols2] = mmscaler.fit_transform(x_test[cols2])
x_test = pd.DataFrame(x_test)

In [12]:
# Load the result dataset

CSResults = pd.read_csv(r"C:\Users\Admin\Downloads\Raju Sir DLS\Ensemble Learning\CSResults.csv", header=0)
CSResults.head()

Unnamed: 0,Model Name,Accuracy,Precision,Recall,F1 Score,Specificity,MCC,ROC_AUC_Score,Balanced Accuracy


# AdaBoost with other classification models as base model

In [13]:
# AdaBoost Classifier
# Checking accuracy by changing base estimator - 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Create objects

modelLR = LogisticRegression()
modelRF = RandomForestClassifier(criterion='gini', n_estimators=100, random_state=0)
modelDT = DecisionTreeClassifier(criterion="entropy")
ModelET = ExtraTreesClassifier()
ModelGNB = GaussianNB()
modelSVMGaussian = SVC(kernel='rbf', random_state = None, class_weight=None,probability=True)

base_methods=[None, modelLR, modelRF, modelDT, ModelET, ModelGNB]
for bm in base_methods:
    print("Method: ", bm)
    
    from sklearn.ensemble import AdaBoostClassifier
    
    modelAda = AdaBoostClassifier(base_estimator=bm, n_estimators=50, learning_rate=1.0,
                                  algorithm='SAMME.R', random_state=None)
    
    # fit the model with train data
    
    modelAda.fit(x_train, y_train)
    
    # Predict the model
    
    y_pred = modelAda.predict(x_test)
    y_pred_prob = modelAda.predict_proba(x_test)
    
    # Evaluate the model performance by metrics
    # confusion matrix in sklearn

    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report

    # actual values

    actual = y_test

    # predicted values

    predicted = y_pred

    # confusion matrix

    matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
    print('Confusion matrix : \n', matrix)

    # outcome values order in sklearn

    tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
    print('Outcome values : \n', tp, fn, fp, tn)

    # classification report for precision, recall f1-score and accuracy

    matrix = classification_report(actual,predicted,labels=[1,0])

    print('Classification report : \n',matrix)

    # calculating the metrics

    sensitivity = round(tp/(tp+fn), 3)
    specificity = round(tn/(tn+fp), 3);
    accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
    balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
    precision = round(tp/(tp+fp), 3);
    f1Score = round((2*tp/(2*tp + fp + fn)), 3);

    # Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
    # A model with a score of +1 is a perfect model and -1 is a poor model

    from math import sqrt

    mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
    MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

    print('Accuracy :', round(accuracy*100, 2),'%')
    print('Precision :', round(precision*100, 2),'%')
    print('Recall :', round(sensitivity*100,2), '%')
    print('F1 Score :', f1Score)
    print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
    print('MCC :', MCC)

    # Area under ROC curve 

    from sklearn.metrics import roc_curve, roc_auc_score

    print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))
    print('-----------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------
    new_row = {'Model Name' : bm,
               'Accuracy' : accuracy,
               'Precision' : precision,
               'Recall' : sensitivity,
               'F1 Score' : f1Score,
               'Specificity' : specificity,
               'MCC':MCC,
               'ROC_AUC_Score':roc_auc_score(y_test, y_pred),
               'Balanced Accuracy':balanced_accuracy}
    CSResults = CSResults.append(new_row, ignore_index=True)
    #-------------------------------------------------------------------------------------

Method:  None
Confusion matrix : 
 [[ 959  113]
 [  72 3356]]
Outcome values : 
 959 113 72 3356
Classification report : 
               precision    recall  f1-score   support

           1       0.93      0.89      0.91      1072
           0       0.97      0.98      0.97      3428

    accuracy                           0.96      4500
   macro avg       0.95      0.94      0.94      4500
weighted avg       0.96      0.96      0.96      4500

Accuracy : 95.9 %
Precision : 93.0 %
Recall : 89.5 %
F1 Score : 0.912
Balanced Accuracy : 93.7 %
MCC : 0.886
roc_auc_score: 0.937
-----------------------------------------------------------------------
Method:  LogisticRegression()
Confusion matrix : 
 [[ 163  909]
 [  73 3355]]
Outcome values : 
 163 909 73 3355
Classification report : 
               precision    recall  f1-score   support

           1       0.69      0.15      0.25      1072
           0       0.79      0.98      0.87      3428

    accuracy                           0.78  

In [14]:
# Results with comparing the all the algorithms 

#CSResults.to_csv("D://000 DataScience//01-Internship//CSResults_07.csv")

CSResults.head(20)

Unnamed: 0,Model Name,Accuracy,Precision,Recall,F1 Score,Specificity,MCC,ROC_AUC_Score,Balanced Accuracy
0,,0.959,0.93,0.895,0.912,0.979,0.886,0.936793,0.937
1,LogisticRegression(),0.782,0.691,0.152,0.249,0.979,0.25,0.565379,0.566
2,RandomForestClassifier(random_state=0),0.987,0.991,0.952,0.971,0.997,0.963,0.9749,0.974
3,DecisionTreeClassifier(criterion='entropy'),0.974,0.937,0.954,0.945,0.98,0.928,0.967081,0.967
4,ExtraTreesClassifier(),0.982,0.979,0.945,0.962,0.994,0.95,0.969272,0.97
5,GaussianNB(),0.465,0.284,0.819,0.422,0.354,0.159,0.586586,0.586


# AdaBoost Classifier with Loan Data

In [15]:
# Load the Loan data

loans = pd.read_csv(r"C:\Users\Admin\Downloads\Raju Sir DLS\Ensemble Learning\loan_data.csv", header=0)
loans.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [16]:
# Change the name of variable

loans = loans.rename(columns = {'not.fully.paid': 'NFPaid'}, inplace = False)
loans.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,NFPaid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [17]:
cat_cols = ['purpose']

loans = pd.get_dummies(loans,columns=cat_cols)

loans=pd.DataFrame(loans)
loans.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,NFPaid,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,0,1,0,0,0,0,0


In [18]:
# Identify the independent and Target (dependent) variables

IndepVar = []
for col in loans.columns:
    if col != 'NFPaid':
        IndepVar.append(col)

TargetVar = 'NFPaid'

x = loans[IndepVar]
y = loans[TargetVar]

In [19]:
# Splitting the dataset into train and test 

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 42) 

In [20]:
# display the columns names

loans.columns

Index(['credit.policy', 'int.rate', 'installment', 'log.annual.inc', 'dti',
       'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'NFPaid',
       'purpose_all_other', 'purpose_credit_card',
       'purpose_debt_consolidation', 'purpose_educational',
       'purpose_home_improvement', 'purpose_major_purchase',
       'purpose_small_business'],
      dtype='object')

In [21]:
cols1 = ['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util']

In [22]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train[cols1] = mmscaler.fit_transform(x_train[cols1])
x_train = pd.DataFrame(x_train)

x_test[cols1] = mmscaler.fit_transform(x_test[cols1])
x_test = pd.DataFrame(x_test)

# AdaBoost with other classification models as base model - 2

In [23]:
# AdaBoost Classifier
# Checking accuracy by changing base estimator - 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB

# Create objects

modelLR = LogisticRegression()
modelRF = RandomForestClassifier(criterion='gini', n_estimators=100, random_state=0)
modelDT = DecisionTreeClassifier(criterion="entropy")
ModelET = ExtraTreesClassifier()
ModelGNB = GaussianNB()

base_methods=[None, modelLR, modelRF, modelDT, ModelET, ModelGNB]
for bm in base_methods:
    print("Method: ", bm)
    
    from sklearn.ensemble import AdaBoostClassifier
    
    modelAda = AdaBoostClassifier(base_estimator=bm, n_estimators=100, learning_rate=1.0,
                                  algorithm='SAMME.R', random_state=None)
    
    # fit the model with train data
    
    modelAda.fit(x_train, y_train)
    
    # Predict the model
    
    y_pred = modelAda.predict(x_test)
    y_pred_prob = modelAda.predict_proba(x_test)
    
    # Evaluate the model performance by metrics
    # confusion matrix in sklearn

    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report

    # actual values

    actual = y_test

    # predicted values

    predicted = y_pred

    # confusion matrix

    matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
    print('Confusion matrix : \n', matrix)

    # outcome values order in sklearn

    tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
    print('Outcome values : \n', tp, fn, fp, tn)

    # classification report for precision, recall f1-score and accuracy

    matrix = classification_report(actual,predicted,labels=[1,0])

    print('Classification report : \n',matrix)

    # calculating the metrics

    sensitivity = round(tp/(tp+fn), 3)
    specificity = round(tn/(tn+fp), 3);
    accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
    balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
    precision = round(tp/(tp+fp), 3);
    f1Score = round((2*tp/(2*tp + fp + fn)), 3);

    # Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
    # A model with a score of +1 is a perfect model and -1 is a poor model

    from math import sqrt

    mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
    MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

    print('Accuracy :', round(accuracy*100, 2),'%')
    print('Precision :', round(precision*100, 2),'%')
    print('Recall :', round(sensitivity*100,2), '%')
    print('F1 Score :', f1Score)
    print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
    print('MCC :', MCC)

    # Area under ROC curve 

    from sklearn.metrics import roc_curve, roc_auc_score

    print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))
    print('-----------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------
    new_row = {'Model Name' : bm,
               'Accuracy' : accuracy,
               'Precision' : precision,
               'Recall' : sensitivity,
               'F1 Score' : f1Score,
               'Specificity' : specificity,
               'MCC':MCC,
               'ROC_AUC_Score':roc_auc_score(y_test, y_pred),
               'Balanced Accuracy':balanced_accuracy}
    CSResults = CSResults.append(new_row, ignore_index=True)
    #-------------------------------------------------------------------------------------

Method:  None
Confusion matrix : 
 [[  23  443]
 [  49 2359]]
Outcome values : 
 23 443 49 2359
Classification report : 
               precision    recall  f1-score   support

           1       0.32      0.05      0.09       466
           0       0.84      0.98      0.91      2408

    accuracy                           0.83      2874
   macro avg       0.58      0.51      0.50      2874
weighted avg       0.76      0.83      0.77      2874

Accuracy : 82.9 %
Precision : 31.9 %
Recall : 4.9 %
F1 Score : 0.086
Balanced Accuracy : 51.4 %
MCC : 0.068
roc_auc_score: 0.515
-----------------------------------------------------------------------
Method:  LogisticRegression()
Confusion matrix : 
 [[   7  459]
 [   9 2399]]
Outcome values : 
 7 459 9 2399
Classification report : 
               precision    recall  f1-score   support

           1       0.44      0.02      0.03       466
           0       0.84      1.00      0.91      2408

    accuracy                           0.84      2

In [26]:
# Results with comparing the all the algorithms 

CSResults.to_csv("C://Users//Admin//Downloads//Raju Sir DLS//Ensemble Learning//CSResultsNew01.csv")

CSResults.head(20)

Unnamed: 0,Model Name,Accuracy,Precision,Recall,F1 Score,Specificity,MCC,ROC_AUC_Score,Balanced Accuracy
0,,0.959,0.93,0.895,0.912,0.979,0.886,0.936793,0.937
1,LogisticRegression(),0.782,0.691,0.152,0.249,0.979,0.25,0.565379,0.566
2,RandomForestClassifier(random_state=0),0.987,0.991,0.952,0.971,0.997,0.963,0.9749,0.974
3,DecisionTreeClassifier(criterion='entropy'),0.974,0.937,0.954,0.945,0.98,0.928,0.967081,0.967
4,ExtraTreesClassifier(),0.982,0.979,0.945,0.962,0.994,0.95,0.969272,0.97
5,GaussianNB(),0.465,0.284,0.819,0.422,0.354,0.159,0.586586,0.586
6,,0.829,0.319,0.049,0.086,0.98,0.068,0.514504,0.514
7,LogisticRegression(),0.837,0.438,0.015,0.029,0.996,0.056,0.505642,0.505
8,RandomForestClassifier(random_state=0),0.836,0.333,0.013,0.025,0.995,0.037,0.503946,0.504
9,DecisionTreeClassifier(criterion='entropy'),0.721,0.22,0.283,0.248,0.806,0.081,0.544455,0.544
