In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [2]:
df_wred_sup = pd.read_csv(r'datasets\winequality-red.csv')

df_wred_sup.info()

#Como não há nenhuma coluna do tipo texto, não será necessário utilizar o LabelEncoder para alterar para inteiro

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [3]:
X = df_wred_sup.drop('quality', axis = 1).values
y = df_wred_sup['quality'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

rf = RandomForestClassifier()
KNN = KNeighborsClassifier(n_neighbors = 5)

rf.fit(X_train, y_train)
KNN.fit(X_train, y_train)

rf_previsao = rf.predict(X_test)
KNN_previsao = KNN.predict(X_test)

rf_acc = accuracy_score(y_test, rf_previsao)
KNN_acc = accuracy_score(y_test, KNN_previsao)

print('Assertividade RandomForest: {}'.format(np.round(rf_acc * 100, 2)), '%')
print('Assertividade KNeighbors: {}'.format(np.round(KNN_acc * 100, 2)), '%')

Assertividade RandomForest: 65.83 %
Assertividade KNeighbors: 48.54 %


Utilizando metódo de Ajuste de Hyperparâmetro com GridSearchCV para tentar melhorar o modelo RandomForest:

Os Hiperparâmetros são ajustes que controlam o comportamento do modelo. O RandomForest possui vários destes hiperparâmetros que são importantes, como número de árvores, profundidade máxima das árvores, número mínimo de amostras para dividir um nó, etc. Podemos usar validação cruzada para encontrar a melhor configuração para cada conjunto de dados.

In [4]:
#Importando o GridSearchCV

from sklearn.model_selection import GridSearchCV, KFold

#Carregando o conjunto de dados

df_grid = df_wred_sup
X_g = df_grid.drop('quality', axis = 1).values
y_g = df_grid['quality'].values

#Criando dados de treino e teste
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(X_g, y_g, test_size = 0.3, random_state = 42)

#Definindo Hiperparametros

params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]}

#Criando o modelo RandomForest
rf_grid = RandomForestClassifier()

#Criando objeto KFold para variação com 5 folds

kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

#Realizando busca em grade com validação cruzada
grid_search = GridSearchCV(rf_grid, param_grid = params, cv = kf)
grid_search.fit(X_train_g, y_train_g)

#Obetendo os melhores hiperparametros encontrados
best_params = grid_search.best_params_

In [5]:
#Treinando modelo com melhores parâmetros
best_rf_grid = RandomForestClassifier(**best_params)
best_rf_grid.fit(X_train_g, y_train_g)

#Fazendo a previsão
previsao_grid = best_rf_grid.predict(X_test_g)

#Avaliando o modelo
acc_grid = accuracy_score(previsao_grid, y_test_g)

print('Assertividade RandomForest depois do GridSearch: {}'.format(np.round(acc_grid * 100, 2)), '%')

Assertividade RandomForest depois do GridSearch: 67.29 %


Após aplicar o metódo GridSearchCV vemos que o metódo RandomForest teve um aumento em aproximadamente 2% na assertividade.

In [6]:
#Prevendo novos valores
new_wine_red_rf = pd.read_excel(r'datasets\winequality-red_new.xlsx')

new_X = new_wine_red_rf

In [7]:
#Prevendo a qualidade da nova base
new_wine_rf_previsao = rf.predict(new_X)
new_wine_best_rf_previsao = best_rf_grid.predict(new_X)

new_wine_red_rf['previsao_rf'] = new_wine_rf_previsao
new_wine_red_rf['previsao_best_rf'] = new_wine_best_rf_previsao

new_wine_red_rf.head()



Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,previsao_rf,previsao_best_rf
0,12.316663,203.252499,0.11089,4.991177,1.161658,61.077033,166.315292,64942.828795,4.121691,2.046822,9.506718,5,5
1,14.458159,322.673695,1.525682,3.535888,1.416264,14.518429,55.177394,32952.313943,4.982958,2.153916,9.479864,5,6
2,10.948115,226.709451,1.709013,15.709482,1.523201,60.376718,189.457848,35479.193989,3.153437,2.35894,11.729912,6,6
3,13.377812,1079.074033,0.479684,11.215556,1.061894,14.167554,53.508704,10896.055317,4.211033,1.575347,13.376391,6,6
4,7.780856,407.958954,0.406692,8.553842,1.078871,36.720235,215.574131,60511.173695,3.845229,2.532875,10.670422,5,5


Agora utilizando a Padronização de Caracteristicas com StandardScaler vamos tentar melhorar o modelo KNeighbors:

Uma vez que o modelo KNeighbors é sensível à escala das caracteristicas, é importante padroniza-las antes de treinar o modelo.

In [8]:
from sklearn.preprocessing import StandardScaler

new_wine_red_knn = pd.read_excel(r'datasets\winequality-red_new.xlsx')

new_X = new_wine_red_knn

df_grid = df_wred_sup
X_ss = df_grid.drop('quality', axis = 1).values
y_ss = df_grid['quality'].values

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_ss, y_ss, test_size = 0.3, random_state = 42)

#Criando instância do Scaler e padronizando caracteristicas
scaler = StandardScaler()
X_train_s_scaled = scaler.fit_transform(X_train_s)
X_test_s_scaled = scaler.transform(X_test_s)

#Criando modelo KNN
KNN_scaled = KNeighborsClassifier(n_neighbors = 7)

#Treinando modelo com dados escalados
KNN_scaled.fit(X_train_s_scaled, y_train_s)

KNN_scaled_previsao = KNN_scaled.predict(X_test_s)

acc_scaled = accuracy_score(KNN_scaled_previsao, y_test_s)

print('Assertividade KNeighbors com StandarScaler: {}'.format(np.round(acc_scaled * 100, 2)), '%')

Assertividade KNeighbors com StandarScaler: 40.62 %


Neste caso vemos que a padrozinação dos dados não foi interessante para esta base de dados, uma vez que a Assetividade do modelo caiu cerca de 5%

In [9]:
new_wine_knn = KNN.predict(new_wine_red_knn)
new_wine_knn_scaled_previsao = KNN_scaled.predict(new_wine_red_knn)

new_wine_red_knn['previsao_rf'] = new_wine_knn
new_wine_red_knn['previsao_best_rf'] = new_wine_knn_scaled_previsao

new_wine_red_knn.head()



Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,previsao_rf,previsao_best_rf
0,12.316663,203.252499,0.11089,4.991177,1.161658,61.077033,166.315292,64942.828795,4.121691,2.046822,9.506718,5,5
1,14.458159,322.673695,1.525682,3.535888,1.416264,14.518429,55.177394,32952.313943,4.982958,2.153916,9.479864,5,5
2,10.948115,226.709451,1.709013,15.709482,1.523201,60.376718,189.457848,35479.193989,3.153437,2.35894,11.729912,5,5
3,13.377812,1079.074033,0.479684,11.215556,1.061894,14.167554,53.508704,10896.055317,4.211033,1.575347,13.376391,5,5
4,7.780856,407.958954,0.406692,8.553842,1.078871,36.720235,215.574131,60511.173695,3.845229,2.532875,10.670422,5,5
