# Random Forest

## Load Packages and Prep Data

In [10]:
# custom utils
import utils

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Interactive IPython help lookup: shows the classifier's init signature.
# Development aid only — nothing later in the notebook depends on this cell.
RandomForestClassifier?

Init signature:
RandomForestClassifier(
    n_estimators=100,
    *,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    ...  (output truncated)

In [12]:
# Load the modelling data through the project's utils module, then unpack the
# four returned objects into train/test features and labels.
train_test_parts = utils.load_data()
X_train, y_train, X_test, y_test = train_test_parts

(62889, 46)
(15723, 46)


## Model 1
- Default hyperparameters

In [16]:
# Fit the baseline Random Forest with default hyperparameters.
# random_state is pinned (same seed, 10, as the grid-search model) so the
# bootstrap samples and per-split feature subsets — and therefore the scores
# derived from this model — are reproducible under Restart & Run All.
rf_1 = RandomForestClassifier(random_state=10)

# fit() trains rf_1 itself and returns the same estimator, so no throwaway
# variable is needed; the trailing semicolon suppresses the estimator repr.
rf_1.fit(X_train, y_train);

In [17]:
# cross validation with f1 scoring
# Passes the default-parameter model rf_1 and the training data to the
# project's utils.f1_cv helper and keeps whatever it returns in `score`.
# NOTE(review): the per-fold values and the mean shown in the cell output are
# presumably printed by f1_cv itself (five folds, judging by the output) —
# confirm against utils.
score = utils.f1_cv(rf_1, X_train, y_train)

[0.7338 0.7198 0.7311 0.7132 0.71  ]
0.7216


## Model 2
- Hyperparameter grid search

In [18]:
# Hyperparameter search space for the forest (values each range expands to are
# listed in the trailing comments).
params = {
    'n_estimators': range(60, 91, 10),        # 60, 70, 80, 90
    'max_depth': range(7, 14, 2),             # 7, 9, 11, 13
    'min_samples_split': range(80, 121, 20),  # 80, 100, 120
    'min_samples_leaf': range(40, 61, 10),    # 40, 50, 60
}
# NOTE(review): in the recorded output the best max_depth (13) and
# n_estimators (90) sit at the top of their ranges, min_samples_leaf (40) at
# the bottom, and the best F1 (~0.66) is below the default model's CV F1
# (~0.72) — the grid may be too restrictive; consider widening it.

# Seeded base estimator that every grid candidate is cloned from.
rf_base = RandomForestClassifier(max_features='sqrt', random_state=10)

# Exhaustive 5-fold search scored on F1, parallelised over all cores.
gs = GridSearchCV(rf_base, params, cv=5, scoring='f1', n_jobs=-1)
gs.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated F1.
print(gs.best_params_, gs.best_score_, sep='\n')

# Keep the best estimator found by the search for the following cells.
rf_2 = gs.best_estimator_

{'max_depth': 13, 'min_samples_leaf': 40, 'min_samples_split': 100, 'n_estimators': 90}
0.6623578963934849


In [19]:
# cross validation with f1 scoring
# Passes the grid-search winner rf_2 and the training data to utils.f1_cv.
# This rebinds `score`, so the value stored for Model 1 is no longer available
# after this cell runs.
# NOTE(review): the mean in the recorded output (0.6624) equals gs.best_score_
# to four decimals, so this presumably repeats the same 5-fold evaluation the
# grid search already did — confirm f1_cv's CV settings in utils.
score = utils.f1_cv(rf_2, X_train, y_train)

[0.6633 0.6667 0.6808 0.6556 0.6454]
0.6624


## Test

In [20]:
# Held-out evaluation of the model chosen for testing (the tuned forest rf_2).
# NOTE(review): in the recorded outputs the default model rf_1 had the higher
# cross-validated F1 (~0.72 vs ~0.66 for rf_2) — confirm rf_2 is the intended
# selection.
selected_model = rf_2
y_pred = selected_model.predict(X_test)

# Test-set metrics via the project helper (the Accuracy / Precision / Recall /
# F1 lines in the output presumably come from here — confirm in utils).
utils.pred_metrics(y_test, y_pred)

# Confusion matrix for the same predictions; kept as the cell's last expression.
utils.cm_plot(y_test, y_pred)

Accuracy:	0.9577688736246264
Precision:	0.7751633986928105
Recall:		0.5465437788018433
F1:		0.6410810810810811
