# Random Forest Classification Model

> you must run the data-science.ipynb first in order to get the experimental model

# Import packages

In [2]:
# load data
# Add directory above current directory to path
import sys; sys.path.insert(0, '..')
#from submodules.load_data import load_data

# data manipulation
import numpy as np
import pandas as pd

# data splitting
from sklearn.model_selection import train_test_split

# data preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# model
from xgboost import XGBClassifier

# hyperparameter tuning
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV

# k-fold cross validation
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score

# saving models
import joblib

# performance
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import PrecisionRecallDisplay
import matplotlib.pyplot as plt

import sys
sys.path.append("../lib")
from getConfig import *
config = getConfig("../")
config.cleanup(config.tuned_path)


# Load the data

In [3]:

with open(config.traintest_path + "X_train_prepared.csv") as file_name:
    X_train_prepared = np.loadtxt(file_name, delimiter=",")

with open(config.traintest_path + "X_train_prepared_m.csv") as file_name:
    X_train_prepared_m = np.loadtxt(file_name, delimiter=",")

with open(config.traintest_path + "X_test_prepared.csv") as file_name:
    X_test_prepared = np.loadtxt(file_name, delimiter=",")
    
with open(config.traintest_path + "y_train.csv") as file_name:
    y_train = np.loadtxt(file_name, delimiter=",")
    
with open(config.traintest_path + "y_test.csv") as file_name:
    y_test = np.loadtxt(file_name, delimiter=",")



# Load the model

In [4]:
# load the model from disk
model = joblib.load(config.trained_path + "rfc_model.pkl")


# Fine tune Random Forest Classifier model using RandomizedSearchCV
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

In [5]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
# search across 100 different combinations, and use all available cores

# Number of trees in random forest
n_estimators = [200, 400, 600]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

rf_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 25, cv =  3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(X_train_prepared, y_train)


{'n_estimators': [200, 400, 600], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.9s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.9s
[CV] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.8s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.9s
[CV] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=   0.8s
[CV] END bootstrap=True, m

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=25,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600]},
                   random_state=42, verbose=2)

In [6]:
# print the best score
rf_random.best_score_

0.9449656035021888

In [7]:
# print the best combination of parameters
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 90,
 'bootstrap': True}

In [8]:
# print the best estimator directly
rf_random.best_estimator_

RandomForestClassifier(max_depth=90, max_features='sqrt', min_samples_leaf=4,
                       n_estimators=400)

## Load the model with best estimator

In [9]:
# load the model from disk
model_test = rf_random.best_estimator_

## Test the Model

In [10]:
model_test.fit(X_test_prepared,y_test)
print (f'Accuracy : {model_test.score(X_test_prepared,y_test):.3f}')
y_pred = model_test.predict(X_test_prepared)

f1score = str(f1_score(y_test, y_pred))
print("f1 score: " + f1score)

Accuracy : 0.955
f1 score: 0.6999999999999998


### Save the finalized model

In [11]:
# serialize the model
joblib.dump(model, config.tuned_path + "rfc_model.pkl", compress=('bz2', 3))

['../experiments/experiment_0/models/tuned/rfc_model.pkl']