# Algoritmo Random Forest

In [2]:
# Importando as libs que serão usadas
import pandas as pd
import numpy as np

In [3]:
# Importando as funções que serão usadas no código

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from yellowbrick.classifier import ConfusionMatrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV

In [84]:
# Load the datasets and separate the input features from the target column

X = pd.read_csv('train.csv', sep=',')

# "Survived" is the label; every other column stays as a candidate feature
y_train = X["Survived"]
X_train = X.drop(columns="Survived")

# Test features and their expected labels live in two separate files
X_test = pd.read_csv('test.csv', sep=',')
y_test = pd.read_csv('classification.csv', sep=',')["Survived"]


In [57]:
# Preview the first six training rows to inspect the column types

X_train.head(n=6)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [85]:
# Drop the columns that will not be fed to the model

# Name, Ticket, Fare, Cabin: judged not useful for building the model.
# Age: contains missing values.
# NOTE(review): PassengerId is still kept as a feature — confirm this is intended.
dropped_cols = ["Name", "Ticket", "Fare", "Cabin", "Age"]

X_train = X_train.drop(columns=dropped_cols)
X_test = X_test.drop(columns=dropped_cols)

# Preview the training frame after the columns were removed
X_train.head()


Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Embarked
0,1,3,male,1,0,S
1,2,1,female,1,0,C
2,3,3,female,0,0,S
3,4,1,female,1,0,S
4,5,3,male,0,0,S


In [86]:
# Encode the categorical columns (one-hot), passing the remaining columns through

categorical_cols = ["Sex", "Embarked"]
transformer = ColumnTransformer(
    # handle_unknown="ignore": a category seen only in the test set is encoded
    # as all zeros instead of raising at transform time.
    transformers=[("OneHot", OneHotEncoder(handle_unknown="ignore"), categorical_cols)],
    remainder="passthrough"
)

# Fit the encoder on the training data ONLY, then reuse the fitted encoder on
# the test data. Re-fitting on the test set (fit_transform) would learn a
# separate category set, so the two matrices could end up with a different
# number or order of columns and the model would see misaligned features.
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

X_train



array([[0., 1., 0., ..., 3., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       [1., 0., 0., ..., 3., 0., 0.],
       ...,
       [1., 0., 0., ..., 3., 1., 2.],
       [0., 1., 1., ..., 1., 0., 0.],
       [0., 1., 0., ..., 3., 0., 0.]])

### Antes de gerar o modelo, iremos buscar os melhores hiperparâmetros utilizando o RandomizedSearchCV

In [90]:
# Helper that returns the best hyperparameters found by a randomized search

def hypertuning_rscv(est, p_distr, nbr_iter, X, y, cv=9, random_state=None):
    """Run a RandomizedSearchCV over ``p_distr`` and return the best result.

    Parameters
    ----------
    est : estimator
        Estimator to tune; passed as-is to ``RandomizedSearchCV``.
    p_distr : dict
        Parameter names mapped to the lists/distributions to sample from.
    nbr_iter : int
        Number of parameter settings sampled (``n_iter``).
    X, y : array-like
        Training features and target used to fit the search.
    cv : int or CV splitter, default=9
        Cross-validation strategy. Per the scikit-learn documentation, an
        integer selects stratified k-fold when ``est`` is a classifier and
        ``y`` is binary or multiclass.
    random_state : int or None, default=None
        Seed for the parameter sampling; pass an int to make the search
        reproducible. ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    tuple
        ``(best_params_, best_score_)`` of the fitted search.
    """
    rdmsearch = RandomizedSearchCV(
        est,
        param_distributions=p_distr,
        n_jobs=-1,
        n_iter=nbr_iter,
        cv=cv,
        random_state=random_state,
    )
    rdmsearch.fit(X, y)
    return rdmsearch.best_params_, rdmsearch.best_score_

In [91]:
# Hyperparameter search space that the randomized search samples from

params = {
    'max_depth': [3, 5, 10, None],
    # 10, then 100..1000 in steps of 100
    'n_estimators': [10, *range(100, 1001, 100)],
    'max_features': [None, 'sqrt', 'log2', 0.2, 0.4, 0.6, 0.8],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    # 1..10 inclusive
    'min_samples_leaf': list(range(1, 11)),
}

In [92]:
# Search for the best hyperparameters (40 sampled configurations)

est = RandomForestClassifier(n_jobs=-1)
rf_parameters, rf_ht_score = hypertuning_rscv(
    est=est,
    p_distr=params,
    nbr_iter=40,
    X=X_train,
    y=y_train,
)

# Best parameter set on the first line, its score on the second
print(rf_parameters, rf_ht_score, sep="\n")

{'n_estimators': 400, 'min_samples_leaf': 5, 'max_features': 0.4, 'max_depth': 3, 'criterion': 'entropy', 'bootstrap': False}
0.8058361391694725


In [96]:
# Build the model from the tuned hyperparameters and train it

# Use the parameters returned by the randomized search directly instead of
# hand-copied values, which silently go stale whenever the search is re-run
# (the previously hard-coded values did not match the search result printed
# in this notebook).
model = RandomForestClassifier(n_jobs=-1, **rf_parameters)
model.fit(X_train, y_train)


In [None]:
# Evaluate the trained model on the test set

pred = model.predict(X_test)

# Each label and its value share one line; print("\n") leaves blank lines between sections
print("Prediction: ", pred, sep="")
print("\n")
print("Expected result: ", np.array(y_test), sep="")
print("\n")
print("Acurracy: ", accuracy_score(y_test, pred), sep="")

In [None]:
# Confusion matrix: plain-text counts first, then the Yellowbrick visualizer

counts = confusion_matrix(y_test, pred)
# The extra "\n" argument reproduces the blank lines printed after the matrix
print(counts, "\n", sep="\n")

cm = ConfusionMatrix(model)
cm.fit(X_train, y_train)
# Last expression of the cell: its return value is shown as the cell output
cm.score(X_test, y_test)

In [None]:
# Classification report for the test-set predictions

report = classification_report(y_test, pred)
print(report)