In [5]:
import pandas as pd
import numpy as np
import requests
import io

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [33]:
url="https://raw.githubusercontent.com/timcsmith/MIS536-Public/master/Data/UniversalBank.csv"
s=requests.get(url).content
band_df=pd.read_csv(io.StringIO(s.decode('utf-8')))

In [34]:
band_df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [35]:
band_df.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [36]:
band_df=band_df.drop(['ID', 'Age', 'Experience', 'ZIP Code', 'Family', 
         'Securities Account',
       'CD Account'],axis=1)

In [37]:
band_df.head()

Unnamed: 0,Income,CCAvg,Education,Mortgage,Personal Loan,Online,CreditCard
0,49,1.6,1,0,0,0,0
1,34,1.5,1,0,0,0,0
2,11,1.0,1,0,0,0,0
3,100,2.7,2,0,0,0,0
4,45,1.0,2,0,0,0,1


In [39]:
X=band_df.drop(['Personal Loan'],axis=1)
y=band_df.iloc[:,4]
train_X, valid_X, train_y, valid_y = train_test_split(X,y, test_size=0.3, random_state=1)

In [43]:
dtree=DecisionTreeClassifier(random_state=1)
dtree.fit(train_X, train_y)

DecisionTreeClassifier(random_state=1)

In [44]:
validation_predictions = dtree.predict(valid_X)

print('Confusion Matrix: ', confusion_matrix(valid_y, validation_predictions))
print('Accuracy score', accuracy_score(valid_y, validation_predictions))
print('Precision score', precision_score(valid_y, validation_predictions))
print('Recall score', recall_score(valid_y, validation_predictions))

Confusion Matrix:  [[1314   37]
 [  39  110]]
Accuracy score 0.9493333333333334
Precision score 0.7482993197278912
Recall score 0.738255033557047


In [45]:
criterion = ['gini', 'entropy']
max_depth = [int(x) for x in np.linspace(5, 200, num = 40)]
max_depth.append(None)
min_samples_split = [1, 3, 5, 8, 10, 15]
min_samples_leaf = [1, 2, 3, 4]
max_leaf_nodes = [None]
min_impurity_decrease = [0.000, 0.0005, 0.001, 0.005, 0.01]
param_grid_random = { 'criterion': criterion,
                      'max_depth': max_depth,
                      'min_samples_split': min_samples_split,
                      'min_samples_leaf' : min_samples_leaf,
                      'max_leaf_nodes' : max_leaf_nodes,
                      'min_impurity_decrease' : min_impurity_decrease,
                     }

In [47]:
dtree_default = DecisionTreeClassifier()
randomSearch = RandomizedSearchCV(estimator = dtree_default, param_distributions = param_grid_random, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
randomSearch.fit(train_X, train_y)
bestRandomModel = randomSearch.best_estimator_
print('Best parameters found: ', randomSearch.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.0s


Best parameters found:  {'min_samples_split': 10, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.005, 'max_leaf_nodes': None, 'max_depth': 80, 'criterion': 'entropy'}


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    2.8s finished


In [48]:
validation_predictions = bestRandomModel.predict(valid_X)
print('Accuracy Score: ', accuracy_score(valid_y, validation_predictions))
print('Precision Score: ', precision_score(valid_y, validation_predictions))
print('Recall Score: ', recall_score(valid_y, validation_predictions))

Accuracy Score:  0.9653333333333334
Precision Score:  0.944954128440367
Recall Score:  0.6912751677852349


In [49]:
param_grid = {
              'min_samples_split': [1, 3, 5, 7, 9,10,12,15],  
              'min_samples_leaf': [1, 2, 3, 4, 5],
              'min_impurity_decrease': [0.0003, 0.0005, 0.0008, 0.001, 0.002],
              'max_leaf_nodes': [None], 
              'max_depth': [75,77,80,83,85],
              'criterion': ['entropy'],
              }

In [50]:
dtree_tuned = DecisionTreeClassifier()
gridSearch = GridSearchCV(estimator = dtree_tuned, param_grid=param_grid, cv = 3, verbose=2,  n_jobs = -1)
gridSearch.fit(train_X, train_y)
bestGridModel = gridSearch.best_estimator_
print('Best parameters found: ', gridSearch.best_params_)

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 1772 tasks      | elapsed:    6.1s


Best parameters found:  {'criterion': 'entropy', 'max_depth': 75, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.002, 'min_samples_leaf': 1, 'min_samples_split': 3}


[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:    8.6s finished


In [53]:
validation_predictions = bestGridModel.predict(valid_X)
print('accuracy score:',accuracy_score(valid_y, validation_predictions))
print('precision score:',precision_score(valid_y, validation_predictions))
print('recall:',recall_score(valid_y, validation_predictions))

accuracy score: 0.9653333333333334
precision score: 0.944954128440367
recall: 0.6912751677852349


In [54]:
#random forest

In [55]:
forest=RandomForestClassifier(random_state=1)
forest.fit(train_X, train_y)

RandomForestClassifier(random_state=1)

In [56]:
validation_predictions = forest.predict(valid_X)

print(confusion_matrix(valid_y, validation_predictions))
print(accuracy_score(valid_y, validation_predictions))
print(precision_score(valid_y, validation_predictions))
print(recall_score(valid_y, validation_predictions))

[[1339   12]
 [  44  105]]
0.9626666666666667
0.8974358974358975
0.7046979865771812


In [58]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
criterion = ['gini', 'entropy']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2,5,10]
min_samples_leaf = [1, 2,  4]
max_features = ['auto']
max_leaf_nodes = [None]
min_impurity_decrease = [0.000, 0.0005, 0.001, 0.005, 0.01]
bootstrap = [True]
param_grid_random = {'n_estimators': n_estimators,
                      'criterion': criterion,
                      'max_depth': max_depth,
                      'min_samples_split': min_samples_split,
                      'min_samples_leaf' : min_samples_leaf,
                      'max_features': max_features,
                      'max_leaf_nodes' : max_leaf_nodes,
                      'min_impurity_decrease' : min_impurity_decrease,
                      'bootstrap': bootstrap,
                     }

In [59]:
rf = RandomForestClassifier()
randomSearch = RandomizedSearchCV(estimator = rf, param_distributions = param_grid_random, n_iter = 300, cv = 3, verbose=2, random_state=42, n_jobs = -1)
randomSearch.fit(train_X, train_y)
bestRandomModel = randomSearch.best_estimator_
print('Best parameters found: ', randomSearch.best_params_)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   53.2s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  4.8min finished


Best parameters found:  {'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.0, 'max_leaf_nodes': None, 'max_features': 'auto', 'max_depth': 80, 'criterion': 'gini', 'bootstrap': True}


In [60]:
validation_predictions = bestRandomModel.predict(valid_X)
print('Accuracy Score: ', accuracy_score(valid_y, validation_predictions))
print('Precision Score: ', precision_score(valid_y, validation_predictions))
print('Recall Score: ', recall_score(valid_y, validation_predictions)) 

Accuracy Score:  0.964
Precision Score:  0.9357798165137615
Recall Score:  0.6845637583892618


In [61]:
param_grid = {'n_estimators': [700, 750, 800, 850, 900],
              'min_samples_split': [1, 2, 3, 4, 5],  
              'min_samples_leaf': [1, 2, 3],
              'min_impurity_decrease': [0.000, 0.0005, 0.001, 0.0015, 0.002],
              'max_leaf_nodes': [None], 
              'max_features': ['auto'], 
              'max_depth': [70, 75, 80, 85, 90],
              'criterion': ['gini'],
              'bootstrap': [True]}

In [62]:
rf = RandomForestClassifier()
gridSearch = GridSearchCV(estimator = rf, param_grid=param_grid, cv = 3, verbose=2,  n_jobs = -1)
gridSearch.fit(train_X, train_y)
bestGridModel = gridSearch.best_estimator_
print('Best parameters found: ', gridSearch.best_params_)

Fitting 3 folds for each of 1875 candidates, totalling 5625 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 17.0min
[Parallel(n_jobs=-1)]: Done 3273 tasks      | elapsed: 21.6min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 26.9min
[Parallel(n_jobs=-1)]: Done 4893 tasks      | elapsed: 32.0min
[Parallel(n_jobs=-1)]: Done 5625 out of 5625 | elapsed: 36.9min finished


Best parameters found:  {'bootstrap': True, 'criterion': 'gini', 'max_depth': 75, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 750}


In [63]:

validation_predictions = bestGridModel.predict(valid_X)
print(accuracy_score(valid_y, validation_predictions))
print(precision_score(valid_y, validation_predictions))
print(recall_score(valid_y, validation_predictions))

0.9646666666666667
0.9363636363636364
0.6912751677852349


In [64]:
#XGBoost

In [65]:
xgboost=XGBClassifier(random_state=1)

xgboost.fit(train_X, train_y)
validation_predictions = xgboost.predict(valid_X)

print(confusion_matrix(valid_y, validation_predictions))
print(accuracy_score(valid_y, validation_predictions))
print(precision_score(valid_y, validation_predictions))
print(recall_score(valid_y, validation_predictions))

[[1331   20]
 [  37  112]]
0.962
0.8484848484848485
0.7516778523489933


In [66]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
learning_rate = [0.1, 0.2, 0.3, 0.4, 0.5]
max_depth = [int(x) for x in np.linspace(1, 200, num = 199)]
max_depth.append(None)
param_grid_random = {'n_estimators': n_estimators,
                      'max_depth': max_depth,
                      'learning_rate': learning_rate,
                     }

In [67]:
xgb = XGBClassifier()
randomSearch = RandomizedSearchCV(estimator = xgb, param_distributions = param_grid_random, n_iter = 300, cv = 3, verbose=2, random_state=42, n_jobs = -1)
randomSearch.fit(train_X, train_y)
bestRandomModel = randomSearch.best_estimator_
print('Best parameters found: ', randomSearch.best_params_)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  2.0min


Best parameters found:  {'n_estimators': 400, 'max_depth': 1, 'learning_rate': 0.3}


[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  2.7min finished


In [68]:
validation_predictions = bestRandomModel.predict(valid_X)
print('Accuracy Score: ', accuracy_score(valid_y, validation_predictions))
print('Precision Score: ', precision_score(valid_y, validation_predictions))
print('Recall Score: ', recall_score(valid_y, validation_predictions))

Accuracy Score:  0.9646666666666667
Precision Score:  0.9363636363636364
Recall Score:  0.6912751677852349


In [69]:
param_grid =  {'n_estimators': [380, 390, 395, 400, 405, 410, 420],
                'max_depth': [1,2,3],
                'learning_rate': [0.2, 0.25, 0.3, 0.35, 0.4],
              }

In [70]:
xgb = XGBClassifier()
gridSearch = GridSearchCV(estimator = xgb, param_grid=param_grid, cv = 3, verbose=2,  n_jobs = -1)
gridSearch.fit(train_X, train_y)
bestGridModel = gridSearch.best_estimator_
print('Best parameters found: ', gridSearch.best_params_)

Fitting 3 folds for each of 105 candidates, totalling 315 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   11.5s


Best parameters found:  {'learning_rate': 0.35, 'max_depth': 1, 'n_estimators': 405}


[Parallel(n_jobs=-1)]: Done 315 out of 315 | elapsed:   24.3s finished


In [71]:
validation_predictions = bestGridModel.predict(valid_X)
print(accuracy_score(valid_y, validation_predictions))
print(precision_score(valid_y, validation_predictions))
print(recall_score(valid_y, validation_predictions))

0.9653333333333334
0.9369369369369369
0.697986577181208


In [72]:
#Decision tree
#accuracy score: 0.9653333333333334
#precision score: 0.944954128440367
#recall: 0.6912751677852349

#random forest
#accuracy: 0.9646666666666667
#precision: 0.9363636363636364
#recall: 0.6912751677852349

#XGBoost
#accuracy: 0.9653333333333334
#precision: 0.9369369369369369
#recall: 0.697986577181208


In [73]:
#It seems all three models even with parametre tuning I could get accuracy as much as 96.5%
#Precision max I can see is with decision tree model which is 94.49%
#Recall score  with XGboost is highest

In [74]:
#Looking at all these findings I can say which model bank can use
#If I were to only look at accuracy as parametre for model selection then it would be Decision Tree or XGBoost
#Based on precision Decision tree is best by small percent.
#So Overall I would recommend Decision Tree for bank to use.
#All of this I have obtained is based on predictors I chose based on previous assignments
#So these results can be improved if I include more predictors.
#It is also possible to get better model if I were to tune more but it takes lot longer with that.

#For now with models I trained and deployed for test set I can decision tree as best model for bank
#Also it is possible to train models for better precision then it might be different result but as 
# in class 8 notebook I chose accuracy to improve by model.

