# TARGET MARKETING FOR PORTUGUESE BANK

*Problem: Has the client subscribed to a term deposit?*

***IMPORT LIBRARIES***

In [1]:
import pandas as pd
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

import warnings
warnings.filterwarnings("ignore")
#Please ignore the warnings with version change


***IMPORT TRAIN AND TEST DATASET***

In [2]:
trainfile = r'C:\Users\Shehjar Raina\Desktop\CIS 508 Data Mining\IA2\BankTrain.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe
testfile = r'C:\Users\Shehjar Raina\Desktop\CIS 508 Data Mining\IA2\BankTest.csv'
testData = pd.read_csv(testfile)

print(trainData.shape)
print(testData.shape)

(4521, 17)
(45211, 17)


***READ TRAIN DATA***

In [3]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


***READ TEST DATA***

In [4]:
testData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


***CHECK FOR MISSING VALUES***

In [5]:
trainData.isnull().sum().sort_values(ascending=False)

age          0
day          0
poutcome     0
previous     0
pdays        0
campaign     0
duration     0
month        0
contact      0
job          0
loan         0
housing      0
balance      0
default      0
education    0
marital      0
y            0
dtype: int64

***DESCRIBE***

In [6]:
trainData.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


In [7]:
TrainCols = list(trainData.columns.values)
TestCols = list(testData.columns.values)
print("Train Data Columns")
print(TrainCols)
print()
print("Test Data Columns")
print(TestCols)

Train Data Columns
['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']

Test Data Columns
['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


***SEPARATE TARGET COLUMN FROM TRAIN DATASET***

In [8]:
# Seperate Target column from Train Data
Xtrain = trainData[TrainCols[0:len(TrainCols)-1]].copy()
Ytrain = trainData[['y']].copy()
print("Train Set shape:")
print(Xtrain.shape)
print(Ytrain.shape)
Xtest = testData[TestCols[0:len(TestCols)-1]].copy()
Ytest = testData[['y']].copy()
print("Test Set shape:")
print(Xtest.shape)
print(Ytest.shape)

Train Set shape:
(4521, 16)
(4521, 1)
Test Set shape:
(45211, 16)
(45211, 1)


***LIST ALL CATEGORICAL FEATURES***

In [9]:
#List of Categorical Features
categoricalFeatures = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

***ONE HOT ENCODING ON TRAIN DATASET***

In [10]:
# OneHotEncoding on Train (fit & transform)
# OneHotEncoding is to be done on Categorical variables.
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
Xcat = pd.DataFrame(ohe.fit_transform(Xtrain[categoricalFeatures]),columns=ohe.get_feature_names(),index=Xtrain.index)
Xtrain = pd.concat([Xtrain,Xcat],axis=1)
Xtrain.drop(labels=categoricalFeatures,axis=1,inplace=True)
Xtrain.sample(5)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,x0_admin.,x0_blue-collar,x0_entrepreneur,...,x7_jun,x7_mar,x7_may,x7_nov,x7_oct,x7_sep,x8_failure,x8_other,x8_success,x8_unknown
3138,36,23,26,133,11,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1933,49,474,9,152,3,221,1,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3048,35,414,20,11,5,319,2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
237,57,206,5,216,3,-1,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2562,46,7378,18,466,1,-1,0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


***ONE HOT ENCODING ON TEST DATASET***

In [11]:
# OneHotEncoding on Test (only transform)
# OneHotEncoding is to be done on Categorical variables.
Xcat = pd.DataFrame(ohe.transform(Xtest[categoricalFeatures]),columns=ohe.get_feature_names(),index=Xtest.index)
Xtest = pd.concat([Xtest,Xcat],axis=1)
Xtest.drop(labels=categoricalFeatures,axis=1,inplace=True)
Xtest.sample(5)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,x0_admin.,x0_blue-collar,x0_entrepreneur,...,x7_jun,x7_mar,x7_may,x7_nov,x7_oct,x7_sep,x8_failure,x8_other,x8_success,x8_unknown
43492,21,546,17,94,1,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9255,49,1677,5,19,3,-1,0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
44566,34,-52,13,12,1,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
36768,39,102,12,293,2,-1,0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
23160,37,0,26,99,6,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


***INITIALIZE DECISION TREE CLASSIFIER***

In [12]:
dt = DecisionTreeClassifier()
dt.fit(Xtrain, Ytrain)

DecisionTreeClassifier()

In [13]:
X_Pred = dt.predict(Xtest)
XPred = dt.predict(Xtrain)
#Model Accuracy
print("Train Accuracy:", metrics.accuracy_score(Ytrain,XPred))
print("Test Accuracy:", metrics.accuracy_score(Ytest,X_Pred))
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(Ytest,X_Pred))
print("Max Depth",dt.get_depth())
print("Leaf",dt.get_n_leaves())
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(Ytest, X_Pred))
clf_cv_score = cross_val_score(dt, Xtrain, Ytrain, cv=10, scoring="accuracy")
print("Accuracy of Model with Cross Validation is:",clf_cv_score.mean() * 100)

Train Accuracy: 1.0
Test Accuracy: 0.8844750171418461
Confusion Matrix for Decision Tree:
[[37270  2652]
 [ 2571  2718]]
Max Depth 26
Leaf 382
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.94      0.93      0.93     39922
         yes       0.51      0.51      0.51      5289

    accuracy                           0.88     45211
   macro avg       0.72      0.72      0.72     45211
weighted avg       0.89      0.88      0.88     45211

Accuracy of Model with Cross Validation is: 87.50341870323702


***INITIALIZE RANDOM FOREST CLASSIFIER***

In [14]:
rf = RandomForestClassifier()
rf.fit(Xtrain, Ytrain)

RandomForestClassifier()

In [15]:
X_Pred1 = rf.predict(Xtest)
XPred1 = rf.predict(Xtrain)
#Model Accuracy
print("Train Accuracy:", metrics.accuracy_score(Ytrain,XPred1))
print("Test Accuracy:", metrics.accuracy_score(Ytest,X_Pred1))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(Ytest,X_Pred1))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(Ytest, X_Pred1))
clf_cv_score = cross_val_score(rf, Xtrain, Ytrain, cv=10, scoring="accuracy")
print("Accuracy of Model with Cross Validation is:",clf_cv_score.mean() * 100)

Train Accuracy: 1.0
Test Accuracy: 0.9115480745836191
Confusion Matrix for Random Forest:
[[39290   632]
 [ 3367  1922]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.92      0.98      0.95     39922
         yes       0.75      0.36      0.49      5289

    accuracy                           0.91     45211
   macro avg       0.84      0.67      0.72     45211
weighted avg       0.90      0.91      0.90     45211

Accuracy of Model with Cross Validation is: 89.75873722870148


# ***Decision Tree: Random & Grid Search***

***Hyperparameter tuning done for decision tree classifier***

***RANDOM SEARCH***

In [16]:
import time
start_time = time.time()

print("Randomized Search CV for Decision tree")
parameters={'min_samples_leaf' : range(10,300,10),'max_depth': 
            range(5,30,2),'criterion':['gini','entropy']}
dt_random = RandomizedSearchCV(dt,parameters,n_iter=25,cv=5)
dt_random.fit(Xtrain, Ytrain)
grid_parm=dt_random.best_params_
print(grid_parm)
print("accuracy Score for Decision Tree:{0:6f}".
      format(dt_random.score(Xtest,Ytest)))

print("--- %s seconds ---" % (time.time() - start_time))

Randomized Search CV for Decision tree
{'min_samples_leaf': 50, 'max_depth': 29, 'criterion': 'gini'}
accuracy Score for Decision Tree:0.895026
--- 2.298304319381714 seconds ---


***GRID SEARCH***

In [17]:
import time
start_time = time.time()

print("Grid Search CV for Decision tree")
dt_grid = GridSearchCV(dt,parameters)
dt_grid.fit(Xtrain, Ytrain)
grid_parm1=dt_grid.best_params_
print(grid_parm1)
print("accuracy Score for Decision Tree:{0:6f}".
      format(dt_grid.score(Xtest,Ytest)))

print("--- %s seconds ---" % (time.time() - start_time))

Grid Search CV for Decision tree
{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 30}
accuracy Score for Decision Tree:0.896220
--- 62.67777705192566 seconds ---


In [18]:
#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier 
dtRand = DecisionTreeClassifier(**grid_parm)
dtGrid = DecisionTreeClassifier(**grid_parm1)

dtRand.fit(Xtrain,Ytrain)
dtRand_predict = dtRand.predict(Xtest)
dtGrid.fit(Xtrain,Ytrain)
dtGrid_predict = dtGrid.predict(Xtest)

***Accuracy for Decision Tree using Random Search CV for Hyperparameter Tuning***

In [19]:
# Accuracy for Decision Tree using Random Search CV for Hyperparameter Tuning

print("Test Accuracy:", metrics.accuracy_score(Ytest,dtRand_predict))
print()
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(Ytest,dtRand_predict))
print()
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(Ytest, dtRand_predict))
clf_cv_score = cross_val_score(dtRand, Xtrain, Ytrain, cv=10, scoring="accuracy")
print("Accuracy of Model with Cross Validation is:",clf_cv_score.mean() * 100)

Test Accuracy: 0.895025546880184

Confusion Matrix for Decision Tree:
[[38578  1344]
 [ 3402  1887]]

Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.92      0.97      0.94     39922
         yes       0.58      0.36      0.44      5289

    accuracy                           0.90     45211
   macro avg       0.75      0.66      0.69     45211
weighted avg       0.88      0.90      0.88     45211

Accuracy of Model with Cross Validation is: 89.38292406571723


***Accuracy for Decision Tree using Grid Search for Hyperparameter Tuning***

In [20]:
# Accuracy for Decision Tree using Grid Search for Hyperparameter Tuning

print("Test Accuracy:", metrics.accuracy_score(Ytest,dtGrid_predict))
print()
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(Ytest,dtGrid_predict))
print()
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(Ytest, dtGrid_predict))
clf_cv_score = cross_val_score(dtGrid, Xtrain, Ytrain, cv=10, scoring="accuracy")
print("Accuracy of Model with Cross Validation is:",clf_cv_score.mean() * 100)

Test Accuracy: 0.8962199464732035

Confusion Matrix for Decision Tree:
[[38959   963]
 [ 3729  1560]]

Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.91      0.98      0.94     39922
         yes       0.62      0.29      0.40      5289

    accuracy                           0.90     45211
   macro avg       0.77      0.64      0.67     45211
weighted avg       0.88      0.90      0.88     45211

Accuracy of Model with Cross Validation is: 89.47151731817384


# ***Random Forest: Random & Grid Search***

***Hyperparameter tuning done for random forest classifier***

***RANDOM SEARCH***

In [21]:
import time
start_time = time.time()

print("Randomized Search CV for Random Forest")
rand_parameters={'min_samples_leaf' : range(10,100,10),'max_depth': 
            range(1,10,2),'max_features':[10,20,30],'n_estimators':[20,30,40]}
rf_random = RandomizedSearchCV(rf,rand_parameters,n_iter=25,cv=5)
rf_random.fit(Xtrain, Ytrain)
grid_parm=rf_random.best_params_
print(grid_parm)
print("Accuracy Score for Random Forest:{0:6f}".
      format(rf_random.score(Xtest,Ytest)))

print("--- %s seconds ---" % (time.time() - start_time))

Randomized Search CV for Random Forest
{'n_estimators': 30, 'min_samples_leaf': 50, 'max_features': 30, 'max_depth': 5}
Accuracy Score for Random Forest:0.900378
--- 10.703381299972534 seconds ---


***GRID SEARCH***

In [22]:
import time
start_time = time.time()

print("Grid Search CV for Random Forest")
rf_grid = GridSearchCV(rf,rand_parameters)
rf_grid.fit(Xtrain, Ytrain)
grid_parm1=rf_grid.best_params_
print(grid_parm1)
print("Accuracy Score for Random Forest:{0:6f}".
      format(rf_grid.score(Xtest,Ytest)))

print("--- %s seconds ---" % (time.time() - start_time))

Grid Search CV for Random Forest
{'max_depth': 7, 'max_features': 20, 'min_samples_leaf': 40, 'n_estimators': 20}
Accuracy Score for Random Forest:0.899073
--- 177.80277228355408 seconds ---


In [23]:
#Using the parameters obtained from HyperParameterTuning in the RandomForestClassifier 
rfRand = RandomForestClassifier(**grid_parm)
rfGrid = RandomForestClassifier(**grid_parm1)

rfRand.fit(Xtrain,Ytrain)
rfRand_predict = rfRand.predict(Xtest)
rfGrid.fit(Xtrain,Ytrain)
rfGrid_predict = rfGrid.predict(Xtest)

***Accuracy for Random Forest using Random Search CV for Hyperparameter Tuning***

In [24]:
# Accuracy for Random Forest using Random Search CV for Hyperparameter Tuning

print("Test Accuracy:", metrics.accuracy_score(Ytest,rfRand_predict))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(Ytest,rfRand_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(Ytest, rfRand_predict))
clf_cv_score = cross_val_score(rfRand, Xtrain, Ytrain, cv=10, scoring="accuracy")
print("Accuracy of Model with Cross Validation is:",clf_cv_score.mean() * 100)

Test Accuracy: 0.9011744929331358
Confusion Matrix for Random Forest:
[[38952   970]
 [ 3498  1791]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.92      0.98      0.95     39922
         yes       0.65      0.34      0.44      5289

    accuracy                           0.90     45211
   macro avg       0.78      0.66      0.70     45211
weighted avg       0.89      0.90      0.89     45211

Accuracy of Model with Cross Validation is: 89.91375100119166


***Accuracy for Random Forest using Grid Search for Hyperparameter Tuning***

In [25]:
# Accuracy for Random Forest using Grid Search for Hyperparameter Tuning

print("Test Accuracy:", metrics.accuracy_score(Ytest,rfGrid_predict))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(Ytest,rfGrid_predict))
print('Printing the precision and recall, among other metrics')
print(metrics.classification_report(Ytest, rfGrid_predict))
clf_cv_score = cross_val_score(rfGrid, Xtrain, Ytrain, cv=10, scoring="accuracy")
print("Accuracy of Model with Cross Validation is:",clf_cv_score.mean() * 100)

Test Accuracy: 0.9003339895158258
Confusion Matrix for Random Forest:
[[39229   693]
 [ 3813  1476]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.91      0.98      0.95     39922
         yes       0.68      0.28      0.40      5289

    accuracy                           0.90     45211
   macro avg       0.80      0.63      0.67     45211
weighted avg       0.88      0.90      0.88     45211

Accuracy of Model with Cross Validation is: 90.04649436402352
