# 0.0 Import libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics as mt

# 0.1 Load dataset

In [2]:
# Read the pre-split classification data from CSV, one (features, target)
# pair per split: training, validation and test.
X_train, y_train = (
    pd.read_csv('../../dataset/classification/X_training.csv'),
    pd.read_csv('../../dataset/classification/y_training.csv'),
)
X_val, y_val = (
    pd.read_csv('../../dataset/classification/X_validation.csv'),
    pd.read_csv('../../dataset/classification/y_validation.csv'),
)
X_test, y_test = (
    pd.read_csv('../../dataset/classification/X_test.csv'),
    pd.read_csv('../../dataset/classification/y_test.csv'),
)


# 1.0 Training Model

In [3]:
# Flatten the training/validation targets to 1-D arrays before fitting.
# np.ravel accepts both a DataFrame and an ndarray, so this cell can be
# re-run safely (the previous `.values.ravel()` form raised AttributeError on
# a second run because an ndarray has no `.values`).
y_train = np.ravel(y_train)
y_val = np.ravel(y_val)

## Dados de treino

In [4]:
# Sweep max_depth over 2..40 and record, for every depth, the metrics the
# forest achieves on the training data itself.
#
# NOTE(review): these are in-sample scores, so they will likely favour the
# deepest trees. acc_list is used further down to choose max_depth; consider
# scoring this sweep on X_val / y_val instead.
k_list = np.arange(2, 41, 1)
acc_list = []
pre_list = []
rec_list = []
f1_list = []

for depth in k_list:
    # definition -- fixed seed so the sweep yields the same numbers on every run
    model = RandomForestClassifier(n_estimators=50,
                                   max_depth=depth,
                                   random_state=42)
    # fit
    model.fit(X_train, y_train)

    # performance (predictions on the same data the model was fitted on)
    y_pred = model.predict(X_train)

    # one entry per depth, aligned index-for-index with k_list
    acc_list.append(mt.accuracy_score(y_train, y_pred))
    pre_list.append(mt.precision_score(y_train, y_pred))
    rec_list.append(mt.recall_score(y_train, y_pred))
    f1_list.append(mt.f1_score(y_train, y_pred))

# Best value of each metric across the sweep. The maxima are taken
# independently, so they may come from different depths.
max_acc = max(acc_list)
max_pre = max(pre_list)
max_rec = max(rec_list)
max_f1 = max(f1_list)

print('Max Accuracy: {}'.format(max_acc))
print( 'Max Precision = {}'.format( max_pre ))
print( 'Max Recall = {}'.format( max_rec ))
print( 'Max f1-score = {}'.format( max_f1 ))

Max Accuracy: 0.999972419499414
Max Precision = 1.0
Max Recall = 0.9999363624793178
Max f1-score = 0.9999681802271931


## Dados de validação

In [6]:
# Refit on the training data with the max_depth that scored the highest
# accuracy in the depth sweep, then evaluate that model on the validation split.
#
# NOTE(review): acc_list holds training-set accuracies, so the depth picked
# here is the one that fits the training data best, not the one that
# generalises best -- confirm this is the intended selection criterion.
m = np.arange( 2, 41, 1)  # depth grid; must match the grid used to build acc_list
best_m = acc_list.index( max( acc_list ) )  # position of the first maximum

# model definition (fixed seed for reproducible validation scores)
model = RandomForestClassifier(n_estimators=50,
                               max_depth=m[best_m],
                               random_state=42)

# model training
model.fit( X_train, y_train )

# model performance
yhat_val = model.predict( X_val )

# accuracy
acc_val = mt.accuracy_score( y_val, yhat_val )
print( 'Accuracy = {}'.format ( acc_val ))

# precision
precision_val = mt.precision_score( y_val, yhat_val )
print( 'Precision = {}'.format( precision_val ))

# recall
recall_val = mt.recall_score( y_val, yhat_val )
print( 'Recall = {}'.format( recall_val ))

# f1-score
f1_score_val = mt.f1_score( y_val, yhat_val )
print( 'F1-score = {}'.format( f1_score_val ))

Accuracy = 0.9641558608706844
Precision = 0.9736256996089857
Recall = 0.9428316875788848
F1-score = 0.9579812914906457


## Dados de teste

In [8]:
# Final model: refit on training + validation data combined, using the
# max_depth that scored the highest accuracy in the depth sweep, then evaluate
# once on the held-out test split.
#
# NOTE(review): acc_list holds training-set accuracies, so this depth was not
# selected on held-out data -- confirm this is the intended criterion.
best_m = acc_list.index( max( acc_list ) )  # position of the first maximum

# model definition (fixed seed for reproducible test scores).
# k_list is the grid acc_list was built from, so the index lines up directly.
model = RandomForestClassifier(n_estimators=50,
                               max_depth=k_list[best_m],
                               random_state=42)

# model training
# pd.concat keeps the feature matrix a DataFrame with its column names, so the
# model is fitted with the same feature names it later sees in X_test
# (np.concatenate returned a bare array and dropped them).
model.fit(pd.concat( [X_train, X_val], ignore_index=True ),
          np.concatenate( (y_train, y_val) ) )

# model performance
y_pred = model.predict( X_test )

# accuracy
acc_test = mt.accuracy_score( y_test, y_pred )
print( 'Accuracy = {}'.format ( acc_test ))

# precision
precision_test = mt.precision_score( y_test, y_pred )
print( 'Precision = {}'.format( precision_test ))

# recall
recall_test = mt.recall_score( y_test, y_pred )
print( 'Recall = {}'.format( recall_test ))

# f1-score
f1_score_test = mt.f1_score( y_test, y_pred )
print( 'F1-score = {}'.format( f1_score_test ))



Accuracy = 0.9649326072683737
Precision = 0.9740683652189682
Recall = 0.9452705675318962
F1-score = 0.9594534250245601
