In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier

import warnings
# Blanket filter: suppresses every warning category for the rest of the session,
# including any warnings raised by the estimators fitted later in the notebook.
warnings.filterwarnings("ignore")
# Please ignore the warnings caused by library version changes

from google.colab import drive
# Mount Google Drive at /gdrive so the CSV files read later are reachable.
# NOTE(review): google.colab is presumably only available inside a Colab runtime -- confirm
# before running this notebook elsewhere.
drive.mount('/gdrive')
# Change the current working directory to the mounted drive (IPython line magic)
%cd /gdrive



Mounted at /gdrive
/gdrive


In [None]:
# Read the training and test CSV files from the mounted Google Drive folder.
trainfile = r'/gdrive/MyDrive/CIS_508/IA_2/Portugese Bank Data - TRAIN.csv'
trainData = pd.read_csv(trainfile)  # training DataFrame
testfile = r'/gdrive/MyDrive/CIS_508/IA_2/Portugese Bank Data - TEST.csv'
testData = pd.read_csv(testfile)  # test DataFrame

# NOTE(review): in the recorded run TRAIN has 4521 rows and TEST has 45211 rows.
# Confirm that the TRAIN rows are not also contained in TEST; if they are, the
# test-set scores reported below are optimistic.
print(trainData.shape)
print(testData.shape)

# A cell only renders its LAST bare expression, so a bare `trainData.head()`
# followed by `testData.head()` silently drops the training preview.
# Display both previews explicitly instead.
display(trainData.head(), testData.head())

(4521, 17)
(45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [None]:
# Data description: print the column / non-null count / dtype summary of each
# frame, training set first and test set second.
for frame in (trainData, testData):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 

In [None]:
# Feature-only copies of both frames: keep every column except the last one,
# which holds the target.
trainData_Copy = trainData[trainData.columns[:-1]].copy()
testData_Copy = testData[testData.columns[:-1]].copy()

trainData_Copy.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown


In [None]:
# Column names of each frame as plain Python lists, printed one list per line
# (train first, then test) so the two schemas can be compared by eye.

TrainCols = trainData.columns.tolist()
TestCols = testData.columns.tolist()
print(TrainCols, TestCols, sep="\n")

['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


In [None]:
# Names of the categorical (string-valued) feature columns
categoricalFeatures = [
    "job", "marital", "education", "default", "housing",
    "loan", "contact", "month", "poutcome",
]

# Stack train (outer key 0) on top of test (outer key 1) and one-hot encode the
# categorical columns in a single pass, so both parts end up with exactly the
# same dummy columns.
combined_Data = pd.get_dummies(
    pd.concat([trainData_Copy, testData_Copy], keys=[0, 1]),
    columns=categoricalFeatures,
)

# Split the encoded frame back into its train / test parts via the outer key
X_Train, X_Test = (combined_Data.xs(part) for part in (0, 1))

X_Test.head()


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,contact_cellular,contact_telephone,contact_unknown,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [None]:
# Select the target (last column) and transform it into a numeric 0/1 vector
# ("no" -> 0, "yes" -> 1).
# label_binarize returns a column of shape (n_samples, 1) for a two-class
# problem; ravel() flattens it to the 1-D shape (n_samples,) that scikit-learn
# expects for a single target, instead of relying on implicit conversion of a
# column vector inside each estimator and metric.
Y_Train = label_binarize(trainData.iloc[:, -1], classes=["no", "yes"]).ravel()
Y_Test = label_binarize(testData.iloc[:, -1], classes=["no", "yes"]).ravel()


In [None]:
#Decision Tree Classifier ========================================================================
# Baseline: fit a decision tree with default hyperparameters and report its
# test-set accuracy and confusion matrix.
# random_state is fixed because the tree breaks ties between equally good
# splits at random; without a seed the scores change from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_Train, Y_Train)
clf_predict=clf.predict(X_Test)
print("accuracy Score (testset) for Decision Tree:{0:6f}".format(clf.score(X_Test,Y_Test)))
print("Confusion Matrix for Decision Tree")
# Rows are the true classes, columns the predicted classes
print(confusion_matrix(Y_Test,clf_predict))


accuracy Score (testset) for Decision Tree:0.882418
Confusion Matrix for Decision Tree
[[37180  2742]
 [ 2574  2715]]


In [None]:
#print(clf_random.cv_results_)
#print(clf_grid.cv_results_)


In [None]:
# Feature importances of the baseline tree. The raw array is unreadable on its
# own, so label every value with its encoded column name and list the most
# important features first. to_string() prints every row without truncation.
feature_importance = pd.Series(clf.feature_importances_, index=X_Train.columns)
print(feature_importance.sort_values(ascending=False).to_string())

[0.09027577 0.0903708  0.09933811 0.25685428 0.03417217 0.03793584
 0.01198483 0.00108844 0.00494456 0.00184891 0.00174173 0.00815249
 0.01096978 0.00044586 0.00466731 0.00661146 0.02005861 0.00369882
 0.00266167 0.006513   0.01105758 0.00638906 0.00272505 0.00900295
 0.01615418 0.00108469 0.0058561  0.00195245 0.00582779 0.01450987
 0.00297585 0.0050619  0.00754    0.00238358 0.01075664 0.01025353
 0.0074214  0.         0.01394778 0.00144626 0.00779016 0.01774798
 0.01069306 0.00809944 0.0059009  0.0191741  0.00438063 0.00343848
 0.00180782 0.09028633 0.        ]


In [None]:
#Hyperparameter tuning done for decision tree classifier
# Both searches rank candidates with the estimator's default score (accuracy).
#Random search
print("RandomizedSearchCV-Decision tree")
parameters={'min_samples_leaf' : range(10,100,10),'max_depth': range(5,30,5),'criterion':['gini','entropy']}
# Fresh, seeded template estimator: the searches clone it, so this cell does not
# depend on the tree fitted in an earlier cell, and split ties are broken the
# same way on every run.
tree_template = DecisionTreeClassifier(random_state=42)
# Randomly sample 15 of the 90 combinations; random_state makes the sample repeatable
clf_random = RandomizedSearchCV(tree_template,parameters,n_iter=15,cv=5,random_state=42)
clf_random.fit(X_Train, Y_Train)
rand_parm=clf_random.best_params_
print(rand_parm) # only a subset of the grid is tried, so this may differ from the grid-search optimum
#Now do grid search
print("GridSearchCV-Decision tree")
# Exhaustive search over the same space, with the same number of folds as above
clf_grid = GridSearchCV(tree_template,parameters,cv=5)
clf_grid.fit(X_Train, Y_Train)
grid_parm=clf_grid.best_params_
print(grid_parm)



RandomizedSearchCV-Decision tree
{'min_samples_leaf': 20, 'max_depth': 5, 'criterion': 'entropy'}
GridSearchCV-Decision tree
{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 30}


In [None]:
#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier
#Construct Decision Trees using the best parameters
# random_state is fixed so that refitting gives the same trees (and therefore
# the same test-set predictions) on every run.
clfr = DecisionTreeClassifier(random_state=42, **rand_parm)  # best of the random search
clfg = DecisionTreeClassifier(random_state=42, **grid_parm)  # best of the grid search

clfr.fit(X_Train,Y_Train)
clfr_predict = clfr.predict(X_Test)
clfg.fit(X_Train,Y_Train)
clfg_predict = clfg.predict(X_Test)


In [None]:
#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testset) after 1st hypertuning randomized search for Decision Tree:{0:6f}".format(clfr.score(X_Test,Y_Test)))
print("accuracy Score (testset) after 1st hypertuning grid search for Decision Tree:{0:6f}".format(clfg.score(X_Test,Y_Test)))

# The AUC promised in the header was never computed. It is calculated here from
# the predicted probability of the positive class (column 1 of predict_proba),
# which, unlike accuracy, is not dominated by the majority class.
y_test_flat = np.ravel(Y_Test)  # 1-D view of the labels, whatever shape Y_Test has

print("Confusion Matrix after hypertuning for Random Search Decision Tree")
print(confusion_matrix(Y_Test,clfr_predict))
print("=== 1st Random Search Classification Report ===")
print(classification_report(Y_Test,clfr_predict))
print("AUC (testset) after 1st hypertuning randomized search for Decision Tree:{0:6f}".format(
    roc_auc_score(y_test_flat, clfr.predict_proba(X_Test)[:, 1])))

print("Confusion Matrix after hypertuning for Grid Search Decision Tree")
print(confusion_matrix(Y_Test,clfg_predict))
print("=== 1st Grid Search Classification Report ===")
print(classification_report(Y_Test,clfg_predict))
print("AUC (testset) after 1st hypertuning grid search for Decision Tree:{0:6f}".format(
    roc_auc_score(y_test_flat, clfg.predict_proba(X_Test)[:, 1])))

# 10-fold cross-validated balanced accuracy of the random-search tree on the training data
clf_cv_score = cross_val_score(clfr, X_Train, Y_Train, cv=10, scoring="balanced_accuracy")
print(clf_cv_score)
print('\n')

accuracy Score (testset) after 1st hypertuning randomized search for Decision Tree:0.897724
accuracy Score (testset) after 1st hypertuning grid search for Decision Tree:0.896220
Confusion Matrix after hypertuning for Random Search Decision Tree
[[39063   859]
 [ 3765  1524]]
=== 1st Random Search Classification Report ===
              precision    recall  f1-score   support

           0       0.91      0.98      0.94     39922
           1       0.64      0.29      0.40      5289

    accuracy                           0.90     45211
   macro avg       0.78      0.63      0.67     45211
weighted avg       0.88      0.90      0.88     45211

Confusion Matrix after hypertuning for Grid Search Decision Tree
[[38959   963]
 [ 3729  1560]]
=== 1st Grid Search Classification Report ===
              precision    recall  f1-score   support

           0       0.91      0.98      0.94     39922
           1       0.62      0.29      0.40      5289

    accuracy                           0.90

In [None]:
#2nd Hyperparameter tuning done for decision tree classifier
# Same search space as the 1st round, but every candidate uses
# class_weight="balanced".
# NOTE(review): candidates are still ranked by the default score (accuracy);
# confirm that is the intended selection metric for a class-weighted model.
#Random search
print("RandomizedSearchCV-Decision tree")
parameters2={'min_samples_leaf' : range(10,100,10),'max_depth': range(5,30,5),'criterion':['gini','entropy'], 'class_weight':["balanced"]}
# Fresh, seeded template estimator: the searches clone it, so this cell does not
# depend on the tree fitted in an earlier cell and is repeatable.
tree_template2 = DecisionTreeClassifier(random_state=42)
# Randomly sample 15 of the 90 combinations; random_state makes the sample repeatable
clf_random2 = RandomizedSearchCV(tree_template2,parameters2,n_iter=15,cv=3,random_state=42)
clf_random2.fit(X_Train, Y_Train)
rand_parm2=clf_random2.best_params_
print(rand_parm2) # only a subset of the grid is tried, so this may differ from the grid-search optimum
#Now do grid search
print("GridSearchCV-Decision tree")
# NOTE(review): the random search above uses cv=3 while this grid search uses
# the library default number of folds -- confirm the difference is intended.
clf_grid2 = GridSearchCV(tree_template2,parameters2)
clf_grid2.fit(X_Train, Y_Train)
grid_parm2=clf_grid2.best_params_
print(grid_parm2)


RandomizedSearchCV-Decision tree
{'min_samples_leaf': 20, 'max_depth': 15, 'criterion': 'gini', 'class_weight': 'balanced'}
GridSearchCV-Decision tree
{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 25, 'min_samples_leaf': 10}


In [None]:
#2nd HyperParameterTuning
# Refit one tree per search with the best parameters of the 2nd round.
# random_state is fixed so that refitting gives the same trees (and therefore
# the same test-set predictions) on every run.
clfr2 = DecisionTreeClassifier(random_state=42, **rand_parm2)  # best of the random search
clfg2 = DecisionTreeClassifier(random_state=42, **grid_parm2)  # best of the grid search

clfr2.fit(X_Train,Y_Train)
clfr2_predict = clfr2.predict(X_Test)
clfg2.fit(X_Train,Y_Train)
clfg2_predict = clfg2.predict(X_Test)


In [None]:
# Test-set evaluation of the two trees from the 2nd tuning round:
# accuracies first, then a confusion matrix and classification report per tree.
print("==============2nd===============")
random_accuracy2 = clfr2.score(X_Test, Y_Test)
grid_accuracy2 = clfg2.score(X_Test, Y_Test)
print("accuracy Score (testset) after 2nd hypertuning randomized search for Decision Tree:{0:6f}".format(random_accuracy2))
print("accuracy Score (testset) after 2nd hypertuning grid search for Decision Tree:{0:6f}".format(grid_accuracy2))

# (confusion-matrix heading, report heading, predictions) for each search strategy
report_sections2 = (
    ("Confusion Matrix after hypertuning for Random Search Decision Tree",
     "=== 2nd Random Search Classification Report ===",
     clfr2_predict),
    ("Confusion Matrix after hypertuning for Grid Search Decision Tree",
     "=== 2nd Grid Search Classification Report ===",
     clfg2_predict),
)
for matrix_heading, report_heading, predictions in report_sections2:
    print(matrix_heading)
    print(confusion_matrix(Y_Test, predictions))
    print(report_heading)
    print(classification_report(Y_Test, predictions))


accuracy Score (testset) after 2nd hypertuning randomized search for Decision Tree:0.805866
accuracy Score (testset) after 2nd hypertuning grid search for Decision Tree:0.841189
Confusion Matrix after hypertuning for Random Search Decision Tree
[[32134  7788]
 [  989  4300]]
=== 2nd Random Search Classification Report ===
              precision    recall  f1-score   support

           0       0.97      0.80      0.88     39922
           1       0.36      0.81      0.49      5289

    accuracy                           0.81     45211
   macro avg       0.66      0.81      0.69     45211
weighted avg       0.90      0.81      0.83     45211

Confusion Matrix after hypertuning for Grid Search Decision Tree
[[34143  5779]
 [ 1401  3888]]
=== 2nd Grid Search Classification Report ===
              precision    recall  f1-score   support

           0       0.96      0.86      0.90     39922
           1       0.40      0.74      0.52      5289

    accuracy                           0.84

In [None]:
#3rd Hyperparameter tuning done for decision tree classifier
# This round explores smaller leaves (1-4 samples) and deeper trees (depth 10-50).
# Both searches rank candidates with the estimator's default score (accuracy).
#Random search
print("RandomizedSearchCV-Decision tree")
parameters3={'min_samples_leaf' : range(1,5,1),'max_depth': range(10,60,10),'criterion':['gini','entropy'] }
# Fresh, seeded template estimator: the searches clone it, so this cell does not
# depend on the tree fitted in an earlier cell and is repeatable.
tree_template3 = DecisionTreeClassifier(random_state=42)
# Randomly sample 15 of the 40 combinations; random_state makes the sample repeatable
clf_random3 = RandomizedSearchCV(tree_template3,parameters3,n_iter=15,cv=5,random_state=42)
clf_random3.fit(X_Train, Y_Train)
rand_parm3=clf_random3.best_params_
print(rand_parm3) # only a subset of the grid is tried, so this may differ from the grid-search optimum
#Now do grid search
print("GridSearchCV-Decision tree")
# Exhaustive search over the same space, with the same number of folds as above
clf_grid3 = GridSearchCV(tree_template3,parameters3,cv=5)
clf_grid3.fit(X_Train, Y_Train)
grid_parm3=clf_grid3.best_params_
print(grid_parm3)


RandomizedSearchCV-Decision tree
{'min_samples_leaf': 1, 'max_depth': 10, 'criterion': 'entropy'}
GridSearchCV-Decision tree
{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4}


In [None]:
#3rd HyperParameterTuning
# Refit one tree per search with the best parameters of the 3rd round.
# random_state is fixed so that refitting gives the same trees (and therefore
# the same test-set predictions) on every run.
clfr3 = DecisionTreeClassifier(random_state=42, **rand_parm3)  # best of the random search
clfg3 = DecisionTreeClassifier(random_state=42, **grid_parm3)  # best of the grid search

clfr3.fit(X_Train,Y_Train)
clfr3_predict = clfr3.predict(X_Test)
clfg3.fit(X_Train,Y_Train)
clfg3_predict = clfg3.predict(X_Test)

In [None]:
# Test-set evaluation of the two trees from the 3rd tuning round:
# accuracies first, then a confusion matrix and classification report per tree.
print("==============3rd===============")
random_accuracy3 = clfr3.score(X_Test, Y_Test)
grid_accuracy3 = clfg3.score(X_Test, Y_Test)
print("accuracy Score (testset) after 3rd hypertuning randomized search for Decision Tree:{0:6f}".format(random_accuracy3))
print("accuracy Score (testset) after 3rd hypertuning grid search for Decision Tree:{0:6f}".format(grid_accuracy3))

# (confusion-matrix heading, report heading, predictions) for each search strategy
report_sections3 = (
    ("Confusion Matrix after hypertuning for Random Search Decision Tree",
     "=== 3rd Random Search Classification Report ===",
     clfr3_predict),
    ("Confusion Matrix after hypertuning for Grid Search Decision Tree",
     "=== 3rd Grid Search Classification Report ===",
     clfg3_predict),
)
for matrix_heading, report_heading, predictions in report_sections3:
    print(matrix_heading)
    print(confusion_matrix(Y_Test, predictions))
    print(report_heading)
    print(classification_report(Y_Test, predictions))

accuracy Score (testset) after 3rd hypertuning randomized search for Decision Tree:0.895512
accuracy Score (testset) after 3rd hypertuning grid search for Decision Tree:0.898786
Confusion Matrix after hypertuning for Random Search Decision Tree
[[38416  1506]
 [ 3218  2071]]
=== 3rd Random Search Classification Report ===
              precision    recall  f1-score   support

           0       0.92      0.96      0.94     39922
           1       0.58      0.39      0.47      5289

    accuracy                           0.90     45211
   macro avg       0.75      0.68      0.70     45211
weighted avg       0.88      0.90      0.89     45211

Confusion Matrix after hypertuning for Grid Search Decision Tree
[[38524  1398]
 [ 3178  2111]]
=== 3rd Grid Search Classification Report ===
              precision    recall  f1-score   support

           0       0.92      0.96      0.94     39922
           1       0.60      0.40      0.48      5289

    accuracy                           0.90

In [None]:
#Normal randomforest==============================================================================
#=================================================================================================
# Search space for the random-forest hyperparameter search further down
rand_parameters={'min_samples_leaf' : range(10,100,10),'max_depth': range(1,10,2),'max_features':[2,3,4],'n_estimators':[20,30,40]}
# Baseline forest with default hyperparameters. random_state is fixed because
# bootstrap sampling and feature sub-sampling are random; without a seed the
# scores change from run to run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_Train, Y_Train)
rfc_predict=rfc.predict(X_Test)
print("accuracy Score (testset) for RandomForest:{0:6f}".format(rfc.score(X_Test,Y_Test)))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(Y_Test,rfc_predict))
#do random search with cross-validation
# 15 sampled combinations, 5 folds; random_state makes the sample repeatable.
# Candidates are ranked by the default score (accuracy).
rfc_random = RandomizedSearchCV(rfc,rand_parameters,n_iter=15,cv=5,random_state=42)
rfc_random.fit(X_Train, Y_Train)
grid_parm_rfc=rfc_random.best_params_  # best parameters of the RANDOM search, despite the name
print(grid_parm_rfc)
#create new classifier using the best parameters
# NOTE: `rfc` is rebound here from the baseline forest to the tuned forest.
rfc= RandomForestClassifier(random_state=42, **grid_parm_rfc)
rfc.fit(X_Train,Y_Train)
rfc_predict = rfc.predict(X_Test)
print("accuracy Score (testset) after hypertuning for Random Forest:{0:6f}".format(rfc.score(X_Test,Y_Test)))
print("Confusion Matrix after hypertuning for Random Forest:")
# NOTE(review): in the recorded run the tuned forest predicted class 0 for every
# test row (recall 0.00 for class 1). Presumably the shallow, few-feature search
# space combined with accuracy-based selection on an imbalanced target favours
# the majority class -- consider scoring="balanced_accuracy"; verify.
print(confusion_matrix(Y_Test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(Y_Test,rfc_predict))
# Cross-validated accuracy of the tuned forest on the training data (default folds)
rfc_cv_score = cross_val_score(rfc, X_Train, Y_Train)
print(rfc_cv_score)
print('\n')

accuracy Score (testset) for RandomForest:0.911968
Confusion Matrix for Random Forest:
[[39233   689]
 [ 3291  1998]]
{'n_estimators': 20, 'min_samples_leaf': 10, 'max_features': 3, 'max_depth': 3}
accuracy Score (testset) after hypertuning for Random Forest:0.883015
Confusion Matrix after hypertuning for Random Forest:
[[39922     0]
 [ 5289     0]]
=== Classification Report ===
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     39922
           1       0.00      0.00      0.00      5289

    accuracy                           0.88     45211
   macro avg       0.44      0.50      0.47     45211
weighted avg       0.78      0.88      0.83     45211

[0.8839779  0.88495575 0.88495575 0.88495575 0.88495575]




In [None]:
#2nd Normal randomforest==============================================================================
#=================================================================================================
# 2nd search space: deeper trees, more features per split, balanced class
# weights, and each tree trained on a bootstrap sample of 300 rows.
rand_parameters2={'min_samples_leaf' :range(10,100,5),'max_depth':range(20,100,20),'max_features':[5,10,15],'n_estimators':[30,40,50],'class_weight':["balanced"], 'max_samples':[300]}

#do random search with cross-validation
# Use a fresh, seeded template instead of the `rfc` left over from an earlier
# cell: any hyperparameter still set on that tuned object and not listed in the
# search space would silently leak into every candidate, while the final model
# below is rebuilt from best_params_ only.
rfc2_random = RandomizedSearchCV(RandomForestClassifier(random_state=42),rand_parameters2,n_iter=15,cv=5,random_state=42)
rfc2_random.fit(X_Train, Y_Train)
grid_parm_rfc2=rfc2_random.best_params_  # best parameters of the RANDOM search, despite the name
print(grid_parm_rfc2)
#create new classifier using the best parameters
# Same seed as the search template, so the refit matches what was evaluated
rfc2= RandomForestClassifier(random_state=42, **grid_parm_rfc2)
rfc2.fit(X_Train,Y_Train)
rfc2_predict = rfc2.predict(X_Test)
print("accuracy Score (testset) after hypertuning for Random Forest:{0:6f}".format(rfc2.score(X_Test,Y_Test)))
print("Confusion Matrix after hypertuning for Random Forest:")
print(confusion_matrix(Y_Test,rfc2_predict))
print("=== 2nd Classification Report ===")
print(classification_report(Y_Test,rfc2_predict))
# Cross-validated accuracy of the tuned forest on the training data (default folds)
rfc2_cv_score = cross_val_score(rfc2, X_Train, Y_Train)
print(rfc2_cv_score)
print('\n')

{'n_estimators': 50, 'min_samples_leaf': 10, 'max_samples': 300, 'max_features': 5, 'max_depth': 40, 'class_weight': 'balanced'}
accuracy Score (testset) after hypertuning for Random Forest:0.862357
Confusion Matrix after hypertuning for Random Forest:
[[36178  3744]
 [ 2479  2810]]
=== 2nd Classification Report ===
              precision    recall  f1-score   support

           0       0.94      0.91      0.92     39922
           1       0.43      0.53      0.47      5289

    accuracy                           0.86     45211
   macro avg       0.68      0.72      0.70     45211
weighted avg       0.88      0.86      0.87     45211

[0.84088398 0.82964602 0.87942478 0.86393805 0.81747788]




In [None]:
#3rd Normal randomforest==============================================================================
#=================================================================================================

# 3rd search space: very small forests (5-7 trees) with small leaves, entropy criterion.
# NOTE(review): 'max_features' is the three-value list [2, 15, 1]; it looks like
# it may have been meant as range(2, 15, 1) -- confirm. Left unchanged here.
rand_parameters3={'min_samples_leaf':range(1,5,1),'max_depth':range(2,40,5),'max_features':[2,15,1],'n_estimators':[5,6,7], 'criterion':['entropy']}

#do random search with cross-validation
# Use a fresh, seeded template instead of the `rfc` left over from an earlier
# cell: any hyperparameter still set on that tuned object and not listed in the
# search space would silently leak into every candidate, while the final model
# below is rebuilt from best_params_ only.
rfc3_random = RandomizedSearchCV(RandomForestClassifier(random_state=42),rand_parameters3,n_iter=15,cv=5,random_state=42)
rfc3_random.fit(X_Train, Y_Train)
grid_parm_rfc3=rfc3_random.best_params_  # best parameters of the RANDOM search, despite the name
print(grid_parm_rfc3)
#create new classifier using the best parameters
# Same seed as the search template, so the refit matches what was evaluated
rfc3= RandomForestClassifier(random_state=42, **grid_parm_rfc3)
rfc3.fit(X_Train,Y_Train)
rfc3_predict = rfc3.predict(X_Test)
print("accuracy Score (testset) after hypertuning for Random Forest:{0:6f}".format(rfc3.score(X_Test,Y_Test)))
print("Confusion Matrix after hypertuning for Random Forest:")
print(confusion_matrix(Y_Test,rfc3_predict))
# Heading fixed: this is the 3rd forest, the original label said "2nd" (copy-paste)
print("=== 3rd Classification Report ===")
print(classification_report(Y_Test,rfc3_predict))
# Cross-validated accuracy of the tuned forest on the training data (default folds)
rfc3_cv_score = cross_val_score(rfc3, X_Train, Y_Train)
print(rfc3_cv_score)
print('\n')

{'n_estimators': 5, 'min_samples_leaf': 1, 'max_features': 15, 'max_depth': 12, 'criterion': 'entropy'}
accuracy Score (testset) after hypertuning for Random Forest:0.900378
Confusion Matrix after hypertuning for Random Forest:
[[38775  1147]
 [ 3357  1932]]
=== 2nd Classification Report ===
              precision    recall  f1-score   support

           0       0.92      0.97      0.95     39922
           1       0.63      0.37      0.46      5289

    accuracy                           0.90     45211
   macro avg       0.77      0.67      0.70     45211
weighted avg       0.89      0.90      0.89     45211

[0.89060773 0.88274336 0.88274336 0.88716814 0.90376106]


