# Classification Models Evaluation Metrics
# Sanjay Gupta
# Date: 30-August-2021

# Objective

In [1]:
import pandas as pd
import io
import requests
url = "https://raw.githubusercontent.com/sanjaygupta1963/Pythoncoding/main/bank-names.txt"
read_data=requests.get(url).content.decode('utf-8')
print(read_data)

Citation Request:
  This dataset is public available for research. The details are described in [Moro et al., 2011]. 
  Please include this citation if you plan to use this database:

  [Moro et al., 2011] S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. 
  In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimarães, Portugal, October, 2011. EUROSIS.

  Available at: [pdf] http://hdl.handle.net/1822/14838
                [bib] http://www3.dsi.uminho.pt/pcortez/bib/2011-esm-1.txt

1. Title: Bank Marketing

2. Sources
   Created by: Paulo Cortez (Univ. Minho) and Sérgio Moro (ISCTE-IUL) @ 2012
   
3. Past Usage:

  The full dataset was described and analyzed in:

  S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. 
  In P. Novais et al. (Eds.), Proceedin

# Importing Libraries

In [2]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans

# Load the Dataset

In [3]:
#specify URL where data is located
urltrain = 'https://raw.githubusercontent.com/sanjaygupta1963/Pythoncoding/main/bank-full.csv'
urltest = 'https://raw.githubusercontent.com/sanjaygupta1963/Pythoncoding/main/bank.csv'

# Load Dataset from the Github URL
dftrain = pd.read_csv(urltrain,sep=';')
dftest = pd.read_csv(urltest,sep=';')

In [4]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [5]:
dftest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [6]:
# Rename the Column Names 'y' as 'tdeposit' for avoiding any confusion
dftrain=dftrain.rename(columns={"y": "tdeposit"})
dftest=dftest.rename(columns={"y": "tdeposit"})
# Replace 
dftrain=dftrain.replace({'no': 0, 'yes': 1})
dftest=dftest.replace({'no': 0, 'yes': 1})

In [7]:
dftrain.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,tdeposit
0,58,management,married,tertiary,0,2143,1,0,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,0,29,1,0,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,0,2,1,1,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,0,1506,1,0,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,0,1,0,0,unknown,5,may,198,1,-1,0,unknown,0
5,35,management,married,tertiary,0,231,1,0,unknown,5,may,139,1,-1,0,unknown,0
6,28,management,single,tertiary,0,447,1,1,unknown,5,may,217,1,-1,0,unknown,0
7,42,entrepreneur,divorced,tertiary,1,2,1,0,unknown,5,may,380,1,-1,0,unknown,0
8,58,retired,married,primary,0,121,1,0,unknown,5,may,50,1,-1,0,unknown,0
9,43,technician,single,secondary,0,593,1,0,unknown,5,may,55,1,-1,0,unknown,0


In [8]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  int64 
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  int64 
 7   loan       45211 non-null  int64 
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  tdeposit   45211 non-null  int64 
dtypes: int64(11), object(6)
memory usage: 5.9+ MB


In [9]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, classification_report

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

#remove columns that are not required
dftrain = dftrain.drop(['contact','pdays'], axis=1)

# Convert Negative Values from Balance and PDays
dftrain['balance'] = dftrain.balance.abs()

In [10]:
# Defining the categorical columns 
categoricalColumns = ['job', 'marital', 'education', 'month','poutcome']

print("Categorical columns : " )
print(categoricalColumns)

impute_categorical = SimpleImputer(strategy="most_frequent")
onehot_categorical =  OneHotEncoder(handle_unknown='ignore')

categorical_transformer = Pipeline(steps=[('impute',impute_categorical),('onehot',onehot_categorical)])

print(categorical_transformer)

Categorical columns : 
['job', 'marital', 'education', 'month', 'poutcome']
Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))])


# Naive Bayes Classification Model

In [11]:
dftrain.info() 
# Defining the numerical columns 
numericalColumns = dftrain.select_dtypes(include=[np.float,np.int]).columns

# Drop All object datatype columns before the regression model
# numericalColumns = dftrain.select_dtypes(include=['float64','int32','int64', 'uint8'])

print("Numerical columns : " )
print(numericalColumns)

scaler_numerical = StandardScaler()

numerical_transformer = Pipeline(steps=[('scale',scaler_numerical)])

# Column Transform
preprocessorForCategoricalColumns = ColumnTransformer(transformers=[('cat', categorical_transformer, categoricalColumns)],
                                            remainder="passthrough")
preprocessorForAllColumns = ColumnTransformer(transformers=[('cat', categorical_transformer, categoricalColumns),('num',numerical_transformer,numericalColumns)],
                                            remainder="passthrough")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  int64 
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  int64 
 7   loan       45211 non-null  int64 
 8   day        45211 non-null  int64 
 9   month      45211 non-null  object
 10  duration   45211 non-null  int64 
 11  campaign   45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  tdeposit   45211 non-null  int64 
dtypes: int64(10), object(5)
memory usage: 5.2+ MB
Numerical columns : 
Index([], dtype='object')


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  numericalColumns = dftrain.select_dtypes(include=[np.float,np.int]).columns
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  numericalColumns = dftrain.select_dtypes(include=[np.float,np.int]).columns


In [12]:
# Dropping the output variable price from the train dataset to the X for Model Building
X = dftrain
# Assigning the output variable price from the train dataset to the Y for Model prediction 
y = dftrain["tdeposit"]

# split into 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,previous,poutcome,tdeposit
13932,57,admin.,divorced,secondary,0,658,0,0,10,jul,724,1,0,unknown,1
9894,37,unknown,married,unknown,0,1699,0,0,9,jun,63,1,0,unknown,0
39946,35,technician,divorced,secondary,0,2823,1,0,2,jun,102,4,2,failure,0
9217,35,admin.,married,secondary,0,214,1,1,5,jun,247,1,0,unknown,0
4124,38,services,single,tertiary,0,323,1,0,19,may,138,1,0,unknown,0
30085,30,management,single,tertiary,0,57,1,0,4,feb,153,2,0,unknown,0
17266,39,blue-collar,married,primary,0,643,0,1,28,jul,24,14,0,unknown,0
34553,40,technician,single,tertiary,0,47,1,0,5,may,159,1,1,failure,0
5386,41,technician,married,tertiary,0,762,1,1,23,may,145,1,0,unknown,0
12146,52,technician,married,secondary,0,945,0,0,20,jun,16,8,0,unknown,0


In [13]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36168 entries, 13932 to 2732
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        36168 non-null  int64 
 1   job        36168 non-null  object
 2   marital    36168 non-null  object
 3   education  36168 non-null  object
 4   default    36168 non-null  int64 
 5   balance    36168 non-null  int64 
 6   housing    36168 non-null  int64 
 7   loan       36168 non-null  int64 
 8   day        36168 non-null  int64 
 9   month      36168 non-null  object
 10  duration   36168 non-null  int64 
 11  campaign   36168 non-null  int64 
 12  previous   36168 non-null  int64 
 13  poutcome   36168 non-null  object
 14  tdeposit   36168 non-null  int64 
dtypes: int64(10), object(5)
memory usage: 4.4+ MB


In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_name = 'Naive Bayes Classifier'

nbClassifier = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

nb_model = Pipeline(steps=[('preprocessor', preprocessorForCategoricalColumns),('classifier', nbClassifier)]) 

nb_model.fit(X_train,y_train)

y_pred_nb = nb_model.predict(X_test)

print('Actual vs Predicted data : ' +model_name + '. Accuracy : %.2f' % accuracy_score(y_test, y_pred_nb))

Actual vs Predicted data : Naive Bayes Classifier. Accuracy : 0.51


In [15]:
# Confusion Matrix
confusion_matrix(y_test, y_pred_nb)

array([[3857, 4123],
       [ 315,  748]], dtype=int64)

In [16]:
# Finding precision 
precision_score(y_test, y_pred_nb)

0.15356189694107986

In [17]:
# Finding recall
recall_score(y_test, y_pred_nb)

0.7036688617121355

# Logistic Regression Classification Model

In [18]:
from sklearn.linear_model import LogisticRegression

model_name = "Logistic Regression Classifier"

logisticRegressionClassifier = LogisticRegression(random_state=0,multi_class='auto',solver='lbfgs',max_iter=1000)

lrc_model = Pipeline(steps=[('preprocessor', preprocessorForCategoricalColumns),
                            ('classifier', logisticRegressionClassifier)]) 

lrc_model.fit(X_train,y_train)

y_pred_lrc = lrc_model.predict(X_test)

print('Actual vs Predicted data : ' +model_name + '. Accuracy : %.2f' % accuracy_score(y_test, y_pred_lrc))

Actual vs Predicted data : Logistic Regression Classifier. Accuracy : 1.00


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
# Confusion Matrix
confusion_matrix(y_test, y_pred_lrc)

array([[7980,    0],
       [   0, 1063]], dtype=int64)

In [20]:
# Finding precision 
precision_score(y_test, y_pred_lrc)

1.0

In [21]:
# Finding recall
recall_score(y_test, y_pred_lrc)

1.0

# K-Nearest Classification Model

In [22]:
from sklearn.neighbors import KNeighborsClassifier

model_name = "K-Nearest Neighbor Classifier"

knnClassifier = KNeighborsClassifier(n_neighbors = 5, metric='minkowski', p=2)

knn_model = Pipeline(steps=[('preprocessorAll',preprocessorForAllColumns),('classifier', knnClassifier)]) 

knn_model.fit(X_train,y_train)

y_pred_knn = knn_model.predict(X_test)

print('Actual vs Predicted data : ' +model_name + '. Accuracy : %.2f' % accuracy_score(y_test, y_pred_knn))

Actual vs Predicted data : K-Nearest Neighbor Classifier. Accuracy : 0.88


In [23]:
# Confusion Matrix
confusion_matrix(y_test, y_pred_knn)

array([[7709,  271],
       [ 845,  218]], dtype=int64)

In [24]:
# Finding precision 
precision_score(y_test, y_pred_knn)

0.4458077709611452

In [25]:
# Finding recall
recall_score(y_test, y_pred_knn)

0.2050799623706491

# Random Forest Classification Model

In [26]:
from sklearn.ensemble import RandomForestClassifier

model_name = "Random Forest Classifier"

randomForestClassifier = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)

rfc_model = Pipeline(steps=[('preprocessorAll',preprocessorForAllColumns),('classifier', randomForestClassifier)]) 

rfc_model.fit(X_train,y_train)

y_pred_rfc = rfc_model.predict(X_test)

print('Actual vs Predicted data : ' +model_name + '. Accuracy : %.2f' % accuracy_score(y_test, y_pred_rfc))

Actual vs Predicted data : Random Forest Classifier. Accuracy : 0.90


In [27]:
# Confusion Matrix
confusion_matrix(y_test, y_pred_rfc)

array([[7980,    0],
       [ 932,  131]], dtype=int64)

In [28]:
# Finding precision 
precision_score(y_test, y_pred_rfc)

1.0

In [29]:
# Finding recall
recall_score(y_test, y_pred_rfc)

0.12323612417685795

# Kernel SVM Classification Model

In [None]:
from sklearn.svm import SVC

model_name = 'Kernel SVM Classifier'

svmClassifier = SVC(kernel='rbf', gamma= 'auto')

svm_model = Pipeline(steps=[('preprocessorAll',preprocessorForAllColumns),('classifier', svmClassifier)]) 

svm_model.fit(X_train,y_train)

y_pred_svm = svm_model.predict(X_test)

print('Actual vs Predicted data : ' +model_name + '. Accuracy : %.2f' % accuracy_score(y_test, y_pred_svm))

In [None]:
# Confusion Matrix
confusion_matrix(y_test, y_pred_svm)

In [None]:
# Finding precision 
precision_score(y_test, y_pred_svm)

In [None]:
# Finding recall
recall_score(y_test, y_pred_svm)

# Notes on Confusion Matrix

# Notes on Precision

# Notes on Recall

# Notes on F1 Score