In [1]:
# import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

np.random.seed(1)

# Loading the data

In [2]:
df = pd.read_csv('UniversalBank.csv')
df.head(5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


# Exploring the dataset

In [3]:
# Explore the dataset
# read the first row of the dataset 
print(df.head())
print(df.columns)
print(df.describe())
print(df.info())

   ID  Age  Experience  Income  ZIP Code  Family  CCAvg  Education  Mortgage  \
0   1   25           1      49     91107       4    1.6          1         0   
1   2   45          19      34     90089       3    1.5          1         0   
2   3   39          15      11     94720       1    1.0          1         0   
3   4   35           9     100     94112       1    2.7          2         0   
4   5   35           8      45     91330       4    1.0          2         0   

   Personal Loan  Securities Account  CD Account  Online  CreditCard  
0              0                   1           0       0           0  
1              0                   1           0       0           0  
2              0                   0           0       0           0  
3              0                   0           0       0           0  
4              0                   0           0       0           1  
Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education'

# Clean/transform data (where necessary)

In [4]:
# based on findings from data exploration, we need to clean up colum names, as there are some leading whitespace characters
df.columns = [s.strip() for s in df.columns] 
df.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

# Drop the columns we are not using as predictors

In [5]:
df = df.drop(columns=['ID', 'ZIP Code'])

In [6]:
# translation education categories into dummy vars
df = df.join(pd.get_dummies(df['Education'], prefix='Edu', drop_first=True))
df.drop('Education', axis=1, inplace = True)

df.head(3)

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard,Edu_2,Edu_3
0,25,1,49,4,1.6,0,0,1,0,0,0,0,0
1,45,19,34,3,1.5,0,0,1,0,0,0,0,0
2,39,15,11,1,1.0,0,0,0,0,0,0,0,0


# Spliting data intro training and validation sets

In [7]:
# construct datasets for analysis
target = 'Personal Loan'
predictors = list(df.columns)
predictors.remove(target)
X = df[predictors]
y = df[target]

In [8]:
# create the training set and the test set 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=1)

# Prediction with RandomForest

In [9]:
rforest = RandomForestClassifier()

In [10]:
_ = rforest.fit(X_train, y_train)

In [11]:
y_pred = rforest.predict(X_test)

In [12]:
print(f"{'Model':^18}{'Score':^18}")
print("************************************")
print(f"{'>> Recall Score:':18}{recall_score(y_test, y_pred)}")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")

      Model             Score       
************************************
>> Recall Score:  0.8456375838926175
Accuracy Score:   0.9833333333333333
Precision Score:  0.984375
F1 Score:         0.9097472924187726


In [13]:
rforest_recall = recall_score(y_test, y_pred)

# Prediction with RandomForest using the random searchcv 

In [28]:
from sklearn.metrics import confusion_matrix
score_measure = "recall"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,55),  
    'min_samples_leaf': np.arange(1,55),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 200), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

rforest = RandomForestClassifier()
rand_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestRecallTree = rand_search.best_estimator_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best recall score is 0.9063319764812302
... with parameters: {'min_samples_split': 24, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0006000000000000001, 'max_leaf_nodes': 19, 'max_depth': 31, 'criterion': 'entropy'}


45 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\shanm\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\shanm\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "C:\Users\shanm\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 250, in fit
    raise ValueError(
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

 0.84898236 0.87919493 0.86413388 0.80660335 0.79769335 0.84898236
 0.592582

In [29]:
from sklearn.metrics import confusion_matrix
c_matrix = confusion_matrix(y_test, rand_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.9793333 Precision=0.9836066 Recall=0.8053691 F1=0.8856089


# Prediction with Decision Tree

In [14]:
dtree=DecisionTreeClassifier()

In [15]:
_ = dtree.fit(X_train, y_train)

In [16]:
y_pred = dtree.predict(X_test)

In [17]:
print(f"{'Model':^18}{'Score':^18}")
print("************************************")
print(f"{'>> Recall Score:':18}{recall_score(y_test, y_pred)}")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")

      Model             Score       
************************************
>> Recall Score:  0.8993288590604027
Accuracy Score:   0.986
Precision Score:  0.9571428571428572
F1 Score:         0.9273356401384083


In [18]:
dtree_recall = recall_score(y_test, y_pred)

# Prediction with ADABoost

In [19]:
from sklearn.ensemble import AdaBoostClassifier

In [20]:
aboost = AdaBoostClassifier()
_ = aboost.fit(X_train, y_train)
y_pred = aboost.predict(X_test)

In [21]:
print(f"{'Model':^18}{'Score':^18}")
print("************************************")
print(f"{'>> Recall Score:':18}{recall_score(y_test, y_pred)}")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")

      Model             Score       
************************************
>> Recall Score:  0.7248322147651006
Accuracy Score:   0.9626666666666667
Precision Score:  0.8780487804878049
F1 Score:         0.7941176470588235


In [22]:
aboost_recall = recall_score(y_test, y_pred)

# Prediction with GradientBoostingClassifier

In [23]:
from sklearn.ensemble import GradientBoostingClassifier

In [24]:
gboost = GradientBoostingClassifier()
_ = gboost.fit(X_train, y_train)
y_pred = gboost.predict(X_test)

In [25]:
print(f"{'Model':^18}{'Score':^18}")
print("************************************")
print(f"{'>> Recall Score:':18}{recall_score(y_test, y_pred)}")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")

      Model             Score       
************************************
>> Recall Score:  0.8657718120805369
Accuracy Score:   0.9826666666666667
Precision Score:  0.9555555555555556
F1 Score:         0.9084507042253522


In [26]:
gboost_recall = recall_score(y_test, y_pred)

# Summarizing the results¶

In [30]:
print("Recall scores...")
print(f"{'Decision Tree:':18}{dtree_recall}")
print(f"{'Random Forest:':18}{rforest_recall}")
print(f"{'Ada Boosted Tree:':18}{aboost_recall}")
print(f"{'Gradient Tree:':18}{gboost_recall}")
print(f"The best {score_measure} score using randomsearchcv on random forest {rand_search.best_score_}")

Recall scores...
Decision Tree:    0.8993288590604027
Random Forest:    0.8456375838926175
Ada Boosted Tree: 0.7248322147651006
Gradient Tree:    0.8657718120805369
The best recall score using randomsearchcv on random forest 0.9063319764812302


# By fitting the decision tree, random forest, ada boosted tree, gradient tree and random search cv on random forest, given in the question. according to my point of view random serach cv on random forest gives the best fit recall score= 0.9063319