In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# The source CSV is semicolon-delimited, hence the explicit separator.
DATA_URL = 'https://raw.githubusercontent.com/albanda/CE888/master/lab3/bank-additional-full.csv'
bank_dataset = pd.read_csv(DATA_URL, sep=';')

# Preview the first rows to sanity-check the parse.
bank_dataset.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import SCORERS, accuracy_score as acc, f1_score as f1
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Replace the target column `y` with integer codes.
# The fitted encoder stays available as `le` so the codes can be mapped back
# to the original labels later.
le = LabelEncoder()
le.fit(bank_dataset['y'])
bank_dataset['y'] = le.transform(bank_dataset['y'])

In [None]:
# One-hot encode the object-dtype (categorical) columns of bank_dataset;
# numeric columns, including the already-encoded target `y`, are carried over as-is.
dummies = pd.get_dummies(bank_dataset)

In [None]:
# Build the model frame without `duration`.
# It is derived from bank_dataset rather than dropped in place, so re-running
# this cell gives the same result instead of raising KeyError on the second run.
# `duration` is numeric (not one-hot expanded), so removing it before encoding
# yields the same columns as removing it afterwards.
# NOTE(review): presumably `duration` is excluded because it is only known once
# the call has ended and would leak the outcome — confirm.
dummies = pd.get_dummies(bank_dataset.drop(columns='duration'))

In [None]:
# Print column dtypes and non-null counts of bank_dataset.
# When cells run top to bottom, `y` has already been label-encoded in place here.
bank_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [None]:
# Separate the encoded frame into the feature matrix and the target vector.
x = dummies.drop(columns='y')
y = dummies['y']

# Missing-value count per column. Kept as the cell's last expression so the
# result is actually rendered (a bare expression mid-cell is silently discarded).
dummies.isna().sum()

age                     0
campaign                0
pdays                   0
previous                0
emp.var.rate            0
                       ..
day_of_week_tue         0
day_of_week_wed         0
poutcome_failure        0
poutcome_nonexistent    0
poutcome_success        0
Length: 63, dtype: int64

In [None]:
# One seed for the split, the CV folds and the forest so that a
# Restart-and-Run-All reproduces the same numbers.
SEED = 50

# Stratified 70/30 hold-out split: class proportions of `y` are preserved.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=SEED
)

# Shared 10-fold stratified, shuffled splitter reused by the evaluation cells.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# Shallow baseline forest. It was previously unseeded, so every run produced a
# different model and different scores.
clf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED)
clf.fit(x_train, y_train)

In [None]:
# Cross-validated accuracy of the baseline forest on the training split.
scores = cross_val_score(clf, x_train, y_train, cv=cv, n_jobs=-1, scoring='accuracy', verbose=10)

# Report the scores explicitly: a bare `scores` in the middle of the cell was
# evaluated and thrown away, so the result never appeared in the output.
print(f'CV accuracy: {scores.mean():.4f} +/- {scores.std():.4f}')
print(scores)

# List the valid `scoring=` names.
# NOTE(review): `sklearn.metrics.SCORERS` is not available in newer scikit-learn
# releases; `sklearn.metrics.get_scorer_names()` is the replacement — confirm
# against the installed version before upgrading.
sorted(SCORERS.keys())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.2s finished


In [None]:
# Chance-level baseline: predicts labels at random following the class
# distribution seen in training. Seeded so the baseline is reproducible.
dummyClf = DummyClassifier(strategy='stratified', random_state=50)
dummyClf.fit(x_train, y_train)

# Score with the shared `cv` splitter (instead of a bare `cv=10`) so the
# baseline is evaluated on the same folds as the random-forest models.
dummy_score = cross_val_score(dummyClf, x_train, y=y_train, scoring='f1', cv=cv)
dummy_score.mean()

DummyClassifier(constant=None, random_state=None, strategy='stratified')

In [None]:
# Hold-out accuracy of the chance baseline next to the baseline forest.
# Both values are collected into one labelled Series: previously the first
# expression was discarded and the single number shown carried no label.
pd.Series(
    {
        'dummy (stratified)': acc(y_test, dummyClf.predict(x_test)),
        'random forest': acc(y_test, clf.predict(x_test)),
    },
    name='test_accuracy',
)

0.7983329287043781

In [None]:
# Hyper-parameter grid for the random forest.
# 'auto' was removed from max_features: for classifiers it is an alias of
# 'sqrt' (so a third of the fits were duplicates) and newer scikit-learn
# releases reject it. The grid is now 4 * 2 * 6 * 2 = 96 candidates.
params = {
    'n_estimators': [50, 100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8, 10],
    'criterion': ['gini', 'entropy'],
}

# The base estimator is seeded so that score differences between candidates
# come from the hyper-parameters rather than from random forest variation.
# No `cv` is passed, so GridSearchCV uses its default splitting strategy.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=60),
    param_grid=params,
    scoring='accuracy',
    n_jobs=-1,
    verbose=10,
)
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   57.0s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  4

In [None]:
# Re-score the best estimator found by the grid search on the shared `cv` folds.
scores_cv = cross_val_score(
    estimator=grid_search.best_estimator_,
    X=x_train,
    y=y_train,
    cv=cv,
    n_jobs=-1,
    scoring='accuracy',
    verbose=10,
)

# Mean CV accuracy of the winning candidate as measured by the grid search
# itself. Placed last so it is rendered; as the first line of the cell its
# value was silently discarded.
grid_search.best_score_

In [None]:
# Share of the less frequent target class; value_counts() orders by frequency,
# so position 1 is the minority class. Computed once with normalize=True
# instead of dividing two separate value_counts() calls.
minority_share = dummies['y'].value_counts(normalize=True).iloc[1]

# F1 of the chance baseline on the shared CV folds.
dummy_f1_score = cross_val_score(estimator=dummyClf, cv=cv, scoring='f1', X=x_train, y=y_train)

# Print every figure with a label: as bare mid-cell expressions only the last
# one was ever shown.
print('tuned RF accuracy per fold:', scores_cv)
print('minority-class share:', minority_share)
print('dummy mean CV F1:', dummy_f1_score.mean())
print('dummies shape:', dummies.shape)

In [None]:
# Compare forest sizes/depths by mean cross-validated F1.
# The two copy-pasted experiments are folded into one loop; the forests are
# seeded (they were not before) and every result is kept, whereas previously
# the first mean was evaluated and discarded.
candidate_settings = [
    {'n_estimators': 150, 'max_depth': 6},
    {'n_estimators': 100, 'max_depth': 3},
]

mean_f1_by_setting = {}
for setting in candidate_settings:
    # `rfclf` / `score_rf` end up bound to the last setting, as in the original cell.
    rfclf = RandomForestClassifier(random_state=60, **setting)
    score_rf = cross_val_score(rfclf, x_train, y=y_train, cv=cv, n_jobs=-1, scoring='f1')
    mean_f1_by_setting[(setting['n_estimators'], setting['max_depth'])] = score_rf.mean()

# One row per (n_estimators, max_depth) pair.
pd.Series(mean_f1_by_setting, name='mean_cv_f1').rename_axis(['n_estimators', 'max_depth'])

In [None]:
# Deeper, larger forest (seeded) scored by F1 on the shared CV folds.
rfclf = RandomForestClassifier(random_state=60, max_depth=10, n_estimators=200)
score_rf = cross_val_score(
    estimator=rfclf,
    X=x_train,
    y=y_train,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
)

# Mean F1 across the folds.
score_rf.mean()

0.32549833133979383

In [None]:
# Train a seeded forest on the full training split (fit returns the estimator).
rfclf = RandomForestClassifier(random_state=60, max_depth=6, n_estimators=100).fit(x_train, y_train)

# F1 on the held-out test split.
pred = rfclf.predict(x_test)
f1(y_true=y_test, y_pred=pred)

0.3105175292153589

In [None]:
# Fit a larger seeded forest and rank the features by its feature_importances_.
rfclf = RandomForestClassifier(n_estimators=300, max_depth=12, random_state=60)
rfclf.fit(x_train, y_train)
importances = rfclf.feature_importances_

# Standard deviation of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in rfclf.estimators_],
             axis=0)

# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Take the names from the frame the model was fitted on. The original loop
# referenced `columns`, which is never defined and raised NameError.
feature_names = x_train.columns
for rank, idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, feature_names[idx], importances[idx]))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=12, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=60, verbose=0,
                       warm_start=False)