In [1]:
# working for hyperparameters blog piece for dataquest - use UCI bank dataset

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import scipy as sp
from timeit import default_timer as timer
from sklearn.metrics import confusion_matrix

%matplotlib inline

In [2]:
bank = pd.read_csv('bank.csv')
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
bank = pd.get_dummies(bank, drop_first=True)

In [4]:
bank.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown,y_yes
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,33,2,5,76,1,-1,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
3,47,1506,5,92,1,-1,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [5]:
cutoff = bank.shape[0] * 0.8
train = bank.sample(frac = 0.8)
test = bank.loc[~bank.index.isin(train.index)]

In [6]:
X = train.drop('y_yes', axis = 1)
y = train.y_yes

X.shape, y.shape

((36169, 42), (36169,))

In [7]:
columns = train.columns.drop('y_yes')
columns

Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_married', 'marital_single', 'education_secondary',
       'education_tertiary', 'education_unknown', 'default_yes', 'housing_yes',
       'loan_yes', 'contact_telephone', 'contact_unknown', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_other', 'poutcome_success', 'poutcome_unknown'],
      dtype='object')

In [8]:
train['y_yes'].value_counts()

0    31945
1     4224
Name: y_yes, dtype: int64

In [9]:
test['y_yes'].value_counts()

0    7977
1    1065
Name: y_yes, dtype: int64

In [10]:
clf = RandomForestClassifier(max_depth=2, random_state=0, class_weight='balanced')
clf.fit(X,y)
predictions = clf.predict(test[columns])

In [11]:
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0

for i, val in enumerate(predictions):
    if val == 1 and test['y_yes'].iloc[i] ==1:
        true_pos += 1
    elif val == 1 and test['y_yes'].iloc[i] ==0:
        false_pos += 1
    elif val == 0 and test['y_yes'].iloc[i] ==0:
        true_neg += 1
    elif val == 0 and test['y_yes'].iloc[i] ==1:
        false_neg += 1  
        
print(true_pos)
print(false_pos)
print(true_neg)
print(false_neg)

802
1817
6160
263


In [12]:
report = classification_report(test['y_yes'], predictions, digits=5)
print(report)

             precision    recall  f1-score   support

          0    0.95905   0.77222   0.85556      7977
          1    0.30622   0.75305   0.43540      1065

avg / total    0.88216   0.76996   0.80607      9042



In [13]:
correct_count = 0
false_count = 0

for i, val in enumerate(predictions):
    if val == test['y_yes'].iloc[i]:
        correct_count += 1
    else:
        false_count += 1
        
print(correct_count)
print(false_count)

6962
2080


In [14]:
param_grid = {'n_estimators':[2, 5, 10, 15, 50, 100, 200], 'max_depth':[3, 5, 10, 15, 25, 40, 75]}

In [16]:
start = timer()
rndm_f = RandomForestClassifier(class_weight='balanced', random_state=2)
clf = GridSearchCV(rndm_f, param_grid, cv = 5)
clf.fit(X,y)
#predictions = clf.predict(test[columns])
predictions = (clf.predict_proba(test[columns])[:,1] >= 0.45).astype(bool)
end = timer()
print('Grid Search took', end - start, 'seconds')

Grid Search took 379.5351302070194 seconds


In [17]:
print (clf.best_params_)

{'max_depth': 25, 'n_estimators': 200}


In [18]:
report = classification_report(test['y_yes'], predictions, digits=5)
print(report)

             precision    recall  f1-score   support

          0    0.93413   0.95637   0.94512      7977
          1    0.60229   0.49484   0.54330      1065

avg / total    0.89504   0.90201   0.89779      9042



In [19]:
print (clf.best_score_)

0.9041720810638945


In [20]:
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0

for i, val in enumerate(predictions):
    if val == 1 and test['y_yes'].iloc[i] ==1:
        true_pos += 1
    elif val == 1 and test['y_yes'].iloc[i] ==0:
        false_pos += 1
    elif val == 0 and test['y_yes'].iloc[i] ==0:
        true_neg += 1
    else:
        false_neg += 1  
        
print('True positives:', true_pos)
print('')
print('False positives:', false_pos)
print('')
print('True negatives:', true_neg)
print('')
print('False negatives:', false_neg)

True positives: 527

False positives: 348

True negatives: 7629

False negatives: 538


In [21]:
correct_count = 0
false_count = 0

for i, val in enumerate(predictions):
    if val == test['y_yes'].iloc[i]:
        correct_count += 1
    else:
        false_count += 1
        
print('Correct predictions:',correct_count)
print('')
print('False predictions', false_count)

Correct predictions: 8156

False predictions 886


# Random search

In [22]:
param_dist = {'n_estimators':[2, 5, 10, 15, 50, 100, 200], 'max_depth':[3, 5, 10, 15, 25, 40, 75]}

In [23]:
start = timer()

rndm_f = RandomForestClassifier(random_state=2)
clf = RandomizedSearchCV(rndm_f, param_dist, cv = 5)
clf.fit(X,y)
#predictions = clf.predict(test[columns])

predictions = (clf.predict_proba(test[columns])[:,1] >= 0.45).astype(bool)

end = timer()
print('Random Search took', end - start, 'seconds')

Random Search took 70.40942063799594 seconds


In [24]:
print (clf.best_params_)

{'n_estimators': 50, 'max_depth': 25}


In [25]:
report = classification_report(test['y_yes'], predictions, digits=5)
print(report)

             precision    recall  f1-score   support

          0    0.93192   0.95926   0.94539      7977
          1    0.60890   0.47512   0.53376      1065

avg / total    0.89387   0.90223   0.89691      9042



In [26]:
print (clf.best_score_)

0.9053885924410406


In [27]:
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0

for i, val in enumerate(predictions):
    if val == 1 and test['y_yes'].iloc[i] ==1:
        true_pos += 1
    elif val == 1 and test['y_yes'].iloc[i] ==0:
        false_pos += 1
    elif val == 0 and test['y_yes'].iloc[i] ==0:
        true_neg += 1
    else:
        false_neg += 1  
        
print('True positives:', true_pos)
print('')
print('False positives:', false_pos)
print('')
print('True negatives:', true_neg)
print('')
print('False negatives:', false_neg)

True positives: 506

False positives: 325

True negatives: 7652

False negatives: 559


In [28]:
correct_count = 0
false_count = 0

for i, val in enumerate(predictions):
    if val == test['y_yes'].iloc[i]:
        correct_count += 1
    else:
        false_count += 1
        
print('Correct predictions:',correct_count)
print('')
print('False predictions', false_count)

Correct predictions: 8158

False predictions 884
