## 45 - Modeling: Random Forest

> One of the actual modeling notebooks: a Random Forest classifier.


In [None]:
# import needed packages
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [None]:
# read the data
full_data = pd.read_csv('data/cumulative_data_fe.csv')
train_data = pd.read_csv('data/train_data_fe.csv')
test_data = pd.read_csv('data/test_data_fe.csv')

# The target column and the columns excluded from the feature matrix are
# declared once, so the train and test splits are always built the same way.
TARGET_COLUMN = 'stone_soil_enc'
NON_FEATURE_COLUMNS = ['id', 'img_id', 'stone_soil', TARGET_COLUMN]

# training split: target kept as a one-column DataFrame, features are the rest
y_train_data = train_data[[TARGET_COLUMN]]
x_train_data = train_data.drop(columns=NON_FEATURE_COLUMNS)

# test split: built with exactly the same column selection
y_test_data = test_data[[TARGET_COLUMN]]
x_test_data = test_data.drop(columns=NON_FEATURE_COLUMNS)

# a model fitted on x_train_data is later asked to predict on x_test_data,
# so both frames must expose the same features in the same order
assert list(x_train_data.columns) == list(x_test_data.columns), \
    "train and test feature columns differ"

In [None]:
# scoring helper shared by every evaluation cell
def pred_metrics(real, pred) -> None:
    """Print accuracy, precision, recall and F1 of `pred` against `real`.

    Each score is computed by the corresponding `metrics.*_score` function,
    called with `(real, pred)` and no other arguments, and written to stdout
    on its own line as a tab-aligned `label<TAB>value` pair.

    Args:
        real: ground-truth labels.
        pred: predicted labels, aligned with `real`.

    Returns:
        None; the report is printed, not returned.
    """
    report_rows = (
        ("Accuracy:\t", metrics.accuracy_score),
        ("Precision:\t", metrics.precision_score),
        ("Recall:\t\t", metrics.recall_score),
        ("F1:\t\t", metrics.f1_score),
    )
    for label, scorer in report_rows:
        print(f"{label}{scorer(real, pred)}")

In [None]:
# baseline forest: library-default hyper-parameters, out-of-bag scoring
# switched on, and a fixed random_state
rf = RandomForestClassifier(random_state=10, oob_score=True)
# the target is stored as a one-column DataFrame; flatten it to 1-D for fit()
rf.fit(X=x_train_data, y=y_train_data.to_numpy().ravel())

RandomForestClassifier(oob_score=True, random_state=10)

In [None]:
# score the baseline forest on the same rows it was fitted on
y_train_pred = rf.predict(X=x_train_data)
pred_metrics(real=y_train_data, pred=y_train_pred)

Accuracy:	0.9981555096199713
Precision:	0.9997560975609756
Recall:		0.9727100142382534
F1:		0.9860476305027664


In [None]:
# score the baseline forest on the held-out test split
y_test_pred = rf.predict(X=x_test_data)
pred_metrics(real=y_test_data, pred=y_test_pred)

Accuracy:	0.9600559725225798
Precision:	0.7464940668824164
Recall:		0.6377880184331797
F1:		0.687872763419483


In [None]:
# 10-fold cross-validated F1 of the baseline forest on the training split
cross_val_score(
    estimator=rf,
    X=x_train_data,
    y=y_train_data.to_numpy().ravel(),
    scoring='f1',
    cv=10,
    n_jobs=-1,
)

array([0.73006993, 0.74123989, 0.72455902, 0.72479564, 0.73170732,
       0.72409152, 0.71370421, 0.7107438 , 0.68156425, 0.74798928])

In [None]:
# exhaustive grid search over forest size and tree-shape limits,
# ranked by 5-fold cross-validated F1
params = {
    'n_estimators': range(60, 91, 10),         # 60, 70, 80, 90
    'max_depth': range(7, 14, 2),              # 7, 9, 11, 13
    'min_samples_split': range(80, 121, 20),   # 80, 100, 120
    'min_samples_leaf': range(40, 61, 10),     # 40, 50, 60
}
gs = GridSearchCV(
    estimator=RandomForestClassifier(max_features='sqrt', random_state=10),
    param_grid=params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
gs.fit(X=x_train_data, y=y_train_data.to_numpy().ravel())

# winning combination and its mean cross-validated F1
print(gs.best_params_)
print(gs.best_score_)

{'max_depth': 13, 'min_samples_leaf': 40, 'min_samples_split': 80, 'n_estimators': 60}
0.6633304946678612


In [None]:
# last version of the model
# The tuned hyper-parameters are taken straight from the grid search (`gs`)
# instead of being re-typed by hand, so this cell cannot go stale when the
# grid or the data changes. The recorded search selected max_depth=13,
# min_samples_leaf=40, min_samples_split=80, n_estimators=60.
rf_n = RandomForestClassifier(
    **gs.best_params_,
    max_features='sqrt',   # same fixed settings the search itself used
    random_state=10,
    n_jobs=-1,
)
rf_n.fit(x_train_data, y_train_data.values.ravel())

# calculate metrics of testing
y_test_pred = rf_n.predict(x_test_data)
pred_metrics(y_test_data, y_test_pred)

Accuracy:	0.9574481618114744
Precision:	0.7613065326633166
Recall:		0.5585253456221199
F1:		0.6443381180223285


In [None]:
# 10-fold cross-validated F1 of the tuned forest on the training split
cross_val_score(
    estimator=rf_n,
    X=x_train_data,
    y=y_train_data.to_numpy().ravel(),
    scoring='f1',
    cv=10,
    n_jobs=-1,
)

array([0.64201183, 0.68635724, 0.66666667, 0.67621777, 0.66378066,
       0.68994413, 0.67136812, 0.64723032, 0.62898551, 0.68465909])