# Training Random Forest 

## Code for training a random forest model using cross-validation

In [1]:
import os
import joblib
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, accuracy_score, f1_score, precision_recall_curve
from sklearn.model_selection import GridSearchCV,RepeatedStratifiedKFold, cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
"""Functions def"""
def read_data(raw_clinical_note):
    data = pd.read_csv(raw_clinical_note, header=0,na_filter=True)
    return data

def saving_model(model,model_name,logs_file):
    model_name = model_name + '.pkl'
    url_save = os.path.join(logs_file,str(model_name))
    joblib.dump(model, url_save)

In [3]:
"""Loading training sets"""
path = "C:/Users/Salvador/Modelo_COVID19/Libretas manuscrito/BCM Infectius diseases/BCM/"
x_train = pd.read_csv(path + "X_train.csv")
y_train = pd.read_csv(path + "y_train.csv")
print ("x_train:", x_train.shape, "y_train:",y_train.shape)

x_train: (9251, 35) y_train: (9251, 2)


In [4]:
x_train = x_train.drop(['Unnamed: 0'],axis=1).astype(int)
y_train = y_train.drop(['Unnamed: 0'],axis=1).astype(int)
print ("x_train:", x_train.shape, "y_train:",y_train.shape)

x_train: (9251, 34) y_train: (9251, 1)


In [5]:
estimator = RandomForestClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=442)
sample_weights = class_weight.compute_sample_weight('balanced', y_train)
parameters={"n_estimators":[200, 250, 300], 
            "max_features": ['log2'],
           "max_depth" : [5, 6],
            "min_impurity_decrease":[1e-4],
            "criterion":['gini'],
           "class_weight":['balanced'],
            "random_state": [422]}
grid = GridSearchCV(estimator=estimator, param_grid=parameters, cv = cv, scoring='roc_auc',refit = True)
grid.fit(x_train,y_train,sample_weight = sample_weights)

In [6]:
print('Best params achieved via GridSearch: \n',grid.best_params_)
print('Best score in training:', grid.best_score_)

Best params achieved via GridSearch: 
 {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 6, 'max_features': 'log2', 'min_impurity_decrease': 0.0001, 'n_estimators': 300, 'random_state': 422}
Best score in training: 0.8280599705260591


In [7]:
"""Visualing metrics from gridSearchCV"""
cv_results = pd.DataFrame.from_dict(grid.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_max_features,param_min_impurity_decrease,param_n_estimators,...,split93_test_score,split94_test_score,split95_test_score,split96_test_score,split97_test_score,split98_test_score,split99_test_score,mean_test_score,std_test_score,rank_test_score
0,0.600631,0.07418,0.037487,0.00976,balanced,gini,5,log2,0.0001,200,...,0.826841,0.81165,0.825302,0.781805,0.809153,0.850614,0.843609,0.827032,0.016921,5
1,0.7618,0.051689,0.043716,0.010125,balanced,gini,5,log2,0.0001,250,...,0.826222,0.810489,0.82596,0.782569,0.809269,0.849715,0.843416,0.826943,0.016969,6
2,0.970956,0.11088,0.054809,0.015781,balanced,gini,5,log2,0.0001,300,...,0.82626,0.810508,0.825907,0.781185,0.810914,0.849676,0.843232,0.82724,0.017013,4
3,0.639377,0.03972,0.036909,0.007214,balanced,gini,6,log2,0.0001,200,...,0.823764,0.814717,0.826778,0.779269,0.811476,0.85194,0.840726,0.827676,0.017089,3
4,0.798596,0.048365,0.045926,0.0091,balanced,gini,6,log2,0.0001,250,...,0.823019,0.813691,0.825636,0.780353,0.811988,0.851195,0.841442,0.827709,0.016995,2
5,0.985257,0.089494,0.053444,0.009062,balanced,gini,6,log2,0.0001,300,...,0.823309,0.813149,0.826942,0.78044,0.812066,0.851466,0.841548,0.82806,0.017067,1


In [8]:
"""Saving AUC (cross-validation)"""
cv_results = pd.DataFrame.from_dict(grid.cv_results_)
auc_training = cv_results.mean_test_score
auc_training.to_csv(path+"AUC_crossval_RF_0.csv")

In [9]:
""" Re-training the model with the best params"""
best_params = grid.best_params_
model = RandomForestClassifier(**best_params)
model = model.fit(x_train,y_train)

In [10]:
"""Saving model"""
saving_model(model,"RF_model_0",path)