# Machine Learning - Random Forest (Floresta Aleatória)

#### Importação das bibliotecas necessárias

In [2]:
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict  # Fix: replaced 'sklearn.cross_validation' with 'sklearn.model_selection'
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### Carregando a base de dados.

In [3]:
df_edu = pd.read_csv('xAPI-Edu-Data.csv')

In [4]:
df_edu.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M


#### Verificando as distribuições de classes.

In [5]:
df_edu['Class'].value_counts()

M    211
H    142
L    127
Name: Class, dtype: int64

#### Verificando os registros nulos

In [6]:
df_edu.isnull().sum()

gender                      0
NationalITy                 0
PlaceofBirth                0
StageID                     0
GradeID                     0
SectionID                   0
Topic                       0
Semester                    0
Relation                    0
raisedhands                 0
VisITedResources            0
AnnouncementsView           0
Discussion                  0
ParentAnsweringSurvey       0
ParentschoolSatisfaction    0
StudentAbsenceDays          0
Class                       0
dtype: int64

#### Codificando os atributos numéricos.

In [7]:
Features = df_edu
Cat_Colums = Features.dtypes.pipe(lambda Features: Features[Features=='object']).index
for col in Cat_Colums:
    label = LabelEncoder()
    Features[col] = label.fit_transform(Features[col])

In [8]:
Features.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,4,4,2,1,0,7,0,0,15,16,2,20,1,1,1,2
1,1,4,4,2,1,0,7,0,0,20,20,3,25,1,1,1,2
2,1,4,4,2,1,0,7,0,0,10,7,0,30,0,0,0,1
3,1,4,4,2,1,0,7,0,0,30,25,5,35,0,0,0,1
4,1,4,4,2,1,0,7,0,0,40,50,12,50,0,0,0,2


#### Separando os dados e classes

In [9]:
dataset = df_edu.drop('Class',axis=1)

In [10]:
classes = df_edu['Class']

# Random Forest vs Árvore de Decisão

#### Resultados Random Forest

In [11]:
random_clf = RandomForestClassifier(random_state=1,n_estimators=100)

In [12]:
resultados_random = cross_val_predict(random_clf, dataset, classes, cv=5)

In [13]:
print(classification_report(classes,resultados_random))

              precision    recall  f1-score   support

           0       0.65      0.64      0.65       142
           1       0.77      0.78      0.77       127
           2       0.63      0.63      0.63       211

    accuracy                           0.67       480
   macro avg       0.68      0.68      0.68       480
weighted avg       0.67      0.67      0.67       480



#### Resultados Decision Tree

In [14]:
tree_clf = DecisionTreeClassifier(random_state=1)

In [15]:
resultados_tree = cross_val_predict(tree_clf,dataset,classes,cv=5)

In [16]:
print(classification_report(classes,resultados_tree))

              precision    recall  f1-score   support

           0       0.50      0.61      0.55       142
           1       0.74      0.68      0.70       127
           2       0.54      0.49      0.52       211

    accuracy                           0.57       480
   macro avg       0.59      0.59      0.59       480
weighted avg       0.58      0.57      0.58       480



#### Verificando Overfitting

In [17]:
X_train, X_test, y_train, y_test = train_test_split(df_edu.drop('Class',axis=1),df_edu['Class'],test_size=0.3,random_state=1)

In [18]:
def compara_modelos_random_forest(maxdepth):
    if maxdepth == 0:
        rf = RandomForestClassifier(n_estimators=100,random_state=1)
    else: 
        rf = RandomForestClassifier(n_estimators=100,random_state=1, max_depth=maxdepth)
    rf.fit(X_train, y_train)
    train_score = rf.score(X_train, y_train)
    test_score = rf.score(X_test, y_test)
    return train_score,test_score

In [19]:
print('{:10} {:20} {:20}'.format('depth', 'Training score','Testing score'))
print('{:10} {:20} {:20}'.format('-----', '--------------','-------------'))
print('{:1}         {} '.format(2,str(compara_modelos_random_forest(2))))
print('{:1}         {} '.format(3,str(compara_modelos_random_forest(3))))
print('{:1}         {} '.format(4,str(compara_modelos_random_forest(4))))
print('{:1}         {} '.format(10,str(compara_modelos_random_forest(10))))
print('{:1}         {} '.format(15,str(compara_modelos_random_forest(15))))
print('{:1}         {} '.format('Full',str(compara_modelos_random_forest(0))))

depth      Training score       Testing score       
-----      --------------       -------------       
2         (0.75, 0.6180555555555556) 
3         (0.8244047619047619, 0.6805555555555556) 
4         (0.8720238095238095, 0.7152777777777778) 
10         (1.0, 0.7569444444444444) 
15         (1.0, 0.7986111111111112) 
Full         (1.0, 0.7986111111111112) 


In [20]:
def compara_modelos_decision_tree(maxdepth):
    if maxdepth == 0:
        df = DecisionTreeClassifier(random_state=1)
    else: 
        df = DecisionTreeClassifier(random_state=1, max_depth=maxdepth)
    df.fit(X_train, y_train)
    train_score = df.score(X_train, y_train)
    test_score = df.score(X_test, y_test)
    return train_score,test_score

In [21]:
print('{:10} {:20} {:20}'.format('depth', 'Training score','Testing score'))
print('{:10} {:20} {:20}'.format('-----', '--------------','-------------'))
print('{:1}         {} '.format(2,str(compara_modelos_decision_tree(2))))
print('{:1}         {} '.format(3,str(compara_modelos_decision_tree(3))))
print('{:1}         {} '.format(4,str(compara_modelos_decision_tree(4))))
print('{:1}         {} '.format(10,str(compara_modelos_decision_tree(10))))
print('{:1}         {} '.format(15,str(compara_modelos_decision_tree(15))))
print('{:1}         {} '.format('Full',str(compara_modelos_decision_tree(0))))

depth      Training score       Testing score       
-----      --------------       -------------       
2         (0.6398809523809523, 0.6805555555555556) 
3         (0.7321428571428571, 0.7013888888888888) 
4         (0.7916666666666666, 0.7430555555555556) 
10         (0.9910714285714286, 0.6875) 
15         (1.0, 0.6944444444444444) 
Full         (1.0, 0.6944444444444444) 


# Tunning do Modelo para Garantir o Melhor Desempenho

#### Como encontrar os melhores valores para os parametros do modelo?

RandomForestClassifier(
n_estimators=?,
criterion='gini' ou 'entropy',
max_depth=?,
min_samples_split=?,
min_samples_leaf=?
) ...

#### GridSearchCV para testes de Hyperparametros

In [22]:
from sklearn.model_selection import GridSearchCV

#### Lista de possíveis valores de estimators ou quantidade de árvores da floresta.

In [23]:
valores_estimators = [10, 20, 50, 100, 150]

#### Lista de possíveis valores para o critério de divisão.

In [24]:
valores_criterion = ['gini','entropy']

#### Lista de possíveis valores para a profundidade máxima de cada árvore

In [25]:
valores_max_depth = [10, 20, 50, 100]

#### Lista de possíveis valores para os parametros min_samples_split e min_samples_leaf.

In [26]:
valores_min_samples_split = [2, 5, 10,15]
valores_min_samples_leaf = [1, 5, 10,15]

#### Define um dicionário que recebe as listas de parâmetros e valores.

In [27]:
parametros_grid = dict(n_estimators=valores_estimators,
                       criterion=valores_criterion,
                       max_depth=valores_max_depth,
                       min_samples_split=valores_min_samples_split,
                       min_samples_leaf=valores_min_samples_leaf 
                      )

#### Dicionário com os parametros que serão utilizados no grid.

In [28]:
parametros_grid

{'n_estimators': [10, 20, 50, 100, 150],
 'criterion': ['gini', 'entropy'],
 'max_depth': [10, 20, 50, 100],
 'min_samples_split': [2, 5, 10, 15],
 'min_samples_leaf': [1, 5, 10, 15]}

#### Instancia o GridSearch com o modelo a ser utilizado, parametros, número de folds e scoring.

In [29]:
rf = RandomForestClassifier()

In [30]:
grid = GridSearchCV(rf, parametros_grid, cv=5, scoring='accuracy')

#### Aplica o GridSearch passando as features e classes

In [31]:
grid.fit(df_edu.drop('Class',axis=1),df_edu['Class'])

#### Imprime os scores por combinações.

In [33]:
grid.cv_results_


{'mean_fit_time': array([0.01282816, 0.02592082, 0.05583439, 0.10479078, 0.15608883,
        0.01268258, 0.02612619, 0.05276866, 0.1008903 , 0.15879536,
        0.00625939, 0.02411432, 0.05326266, 0.10616689, 0.15594702,
        0.01404262, 0.01901755, 0.04786553, 0.11245074, 0.16351433,
        0.0108439 , 0.02327809, 0.05396833, 0.10568361, 0.15532451,
        0.00830779, 0.02341566, 0.05723634, 0.1094327 , 0.15603609,
        0.01279526, 0.02308359, 0.05898595, 0.10500665, 0.14816604,
        0.01022921, 0.02270575, 0.05076303, 0.10010157, 0.1544333 ,
        0.01332631, 0.02064099, 0.05137239, 0.09801702, 0.1461472 ,
        0.01009979, 0.01645613, 0.04855633, 0.09688749, 0.14702916,
        0.01201468, 0.0198451 , 0.05549502, 0.09869876, 0.14940991,
        0.01244345, 0.02335067, 0.050419  , 0.10195193, 0.14875264,
        0.00966897, 0.02004862, 0.04738336, 0.09976954, 0.15170264,
        0.01003542, 0.02014146, 0.04712644, 0.09402514, 0.14551744,
        0.00965633, 0.01992831,

#### Verificando os melhores parâmetros.

In [34]:
grid.best_params_

{'criterion': 'entropy',
 'max_depth': 100,
 'min_samples_leaf': 15,
 'min_samples_split': 5,
 'n_estimators': 20}

#### Verificando o melhor score.

In [35]:
grid.best_score_

0.7291666666666666