## 1 - Dataset

In [19]:
import pandas as pd
import kagglehub

from pathlib import Path

# Download the merged red/white wine-quality dataset; `dataset_download` returns the
# local directory that holds the files.
path = kagglehub.dataset_download("ehsanesmaeili/red-and-white-wine-quality-merged")

# Build the CSV location with pathlib instead of string concatenation so the path
# separator is always right. `decimal=","` makes pandas treat ',' as the decimal mark.
dados = pd.read_csv(Path(path) / 'wine_quality_merged.csv', decimal=",")

Using Colab cache for faster access to the 'red-and-white-wine-quality-merged' dataset.


In [None]:
!pip install mlflow
!pip install shap
!pip install optuna



In [None]:
# Preview the first five rows of the raw dataset.
dados.head(n=5)

## 2 - Preparação e Normalização

Normalização

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Scaler for the numeric feature matrix and label encoder for the `type` column.
scaler = MinMaxScaler()
encoder = LabelEncoder()

In [None]:
# Feature matrix: the eleven physico-chemical measurements plus `quality`.
x_dados = dados[[
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
]]

# Target: the `type` column as a raw array.
y_dados = dados['type'].values

# Min-max scale the features and integer-encode the labels.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the train/test
# split performed later, so test rows influence the scaling — confirm this is acceptable.
x_dados_norm = scaler.fit_transform(x_dados)
y_dados_norm = encoder.fit_transform(y_dados)

In [None]:
# Scaled features under their original column names, with the encoded label
# attached as a `target` column; show the first rows as a sanity check.
df = pd.DataFrame(x_dados_norm, columns=x_dados.columns).assign(target=y_dados_norm)
display(df.head())

## 3 - Learning

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

In [None]:
# Stratified 80/20 hold-out split. `random_state` is fixed so the partition, and
# therefore every metric computed on it, is identical on each run of the notebook.
dados_treino, dados_teste, rotulos_treino, rotulos_teste = train_test_split(
    x_dados_norm,
    y_dados_norm,
    test_size=0.2,
    stratify=y_dados_norm,
    random_state=42,
)

In [None]:
# Baseline classifiers with hand-picked hyperparameters (tuned versions come later).
clf_svm = SVC(kernel='rbf', C=1.0, gamma='scale')
clf_rf = RandomForestClassifier(
    random_state=42,
    max_depth=10,
    n_estimators=100,
)

In [None]:
# Fit both baselines on the training split; `fit` returns the estimator itself,
# so the names keep pointing at the same (now fitted) objects.
clf_svm = clf_svm.fit(X=dados_treino, y=rotulos_treino)
clf_rf = clf_rf.fit(X=dados_treino, y=rotulos_treino)

# MLflow

In [None]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

In [None]:
# Ignore warnings
warnings.filterwarnings("ignore", message="`artifact_path` is deprecated.*")
warnings.filterwarnings("ignore", message="Model logged without a signature and input example.*")

# Generate Seed
np.random.seed(40)

# Classification metrics shared by every evaluation cell below
from sklearn.metrics import f1_score, accuracy_score, precision_score

def eval_metrics(actual, pred):
    """Score predictions against the true labels.

    Args:
        actual: Ground-truth labels.
        pred: Predicted labels, aligned element-wise with ``actual``.

    Returns:
        Tuple ``(f1, accuracy, precision)``, each computed with the scikit-learn
        function's default settings.
    """
    return (
        f1_score(actual, pred),
        accuracy_score(actual, pred),
        precision_score(actual, pred),
    )

# Column-oriented store for the collected metrics: one (initially empty) list per column.
metrics_data = {
    column: [] for column in ('Model', 'F1 Score', 'Accuracy', 'Precision')
}

def _log_model_run(model, label, artifact_name, features, labels, store):
    """Evaluate a fitted model on a hold-out set, log the run to MLflow and record its metrics.

    Replaces the two copy-pasted per-model blocks with a single code path.

    Args:
        model: Fitted classifier exposing ``predict`` and ``score``.
        label: Short model name. Used in the printed header (``"<label> model:"``),
            in the MLflow run name (``"<label> Model"``) and as the ``'Model'``
            entry appended to ``store``.
        artifact_name: Name passed to ``mlflow.sklearn.log_model`` for the model.
        features: Hold-out feature matrix.
        labels: True labels for ``features``.
        store: Dict with list values under the keys ``'Model'``, ``'F1 Score'``,
            ``'Accuracy'`` and ``'Precision'``; one entry is appended to each.
    """
    with mlflow.start_run(run_name="%s Model" % label):
        predicted_rotulos = model.predict(features)
        f1, accuracy, precision = eval_metrics(labels, predicted_rotulos)
        model_score = model.score(features, labels)

        print("%s model:" % label)
        print("  F1 Score: %s" % f1)
        print("  Accuracy: %s" % accuracy)
        print("  Precision: %s" % precision)
        print("  Model Score: %s" % model_score)

        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("model_score", model_score)

        mlflow.sklearn.log_model(model, artifact_name)

        # Keep a local copy of the metrics for the comparison table/plots.
        store['Model'].append(label)
        store['F1 Score'].append(f1)
        store['Accuracy'].append(accuracy)
        store['Precision'].append(precision)


# Log the SVM model
_log_model_run(clf_svm, "SVM", "svm_model", dados_teste, rotulos_teste, metrics_data)

# Log the RandomForest model
_log_model_run(clf_rf, "RandomForest", "random_forest_model", dados_teste, rotulos_teste, metrics_data)

# Dump the raw metrics store: a blank line, the header, then the dictionary.
print("\nCollected Metrics:", metrics_data, sep="\n")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# One row per model, one column per metric, built from the `metrics_data`
# dictionary filled in by the metric-logging cell.
df_metrics = pd.DataFrame(metrics_data).set_index('Model')

metrics_to_plot = ['F1 Score', 'Accuracy', 'Precision']
models = df_metrics.index
bar_width = 0.2
x = np.arange(len(models))  # bar positions, one per model
colors = plt.cm.viridis(np.linspace(0, 1, len(models)))  # one colour per model

# 2x2 grid flattened to a 1-D array of axes; only the first three panels are used.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

for panel, metric in enumerate(metrics_to_plot):
    ax = axes[panel]

    # One bar per model, always drawn in that model's colour.
    for position, model, color in zip(x, models, colors):
        ax.bar(position, df_metrics.loc[model, metric], bar_width, label=model, color=color)

    ax.set_title(f'Comparison of Model {metric}')
    ax.set_xlabel('Model')
    ax.set_ylabel(metric)
    ax.set_xticks(x)
    ax.set_xticklabels(models)
    ax.tick_params(axis='x', rotation=0)
    ax.grid(axis='y')

    # The legend is identical for every panel, so show it only once.
    if panel == 0:
        ax.legend()

# Blank out the unused fourth panel.
axes[len(metrics_to_plot)].axis('off')

plt.tight_layout()
plt.show()

# Hiperparâmetros

In [None]:
import shap

In [None]:
# Column names to show on the summary plot.
feature_names = x_dados.columns.tolist()

# Explainer built from a callable that wraps the baseline SVM's `predict`, with the
# full normalised feature matrix passed as the second argument.
explainer = shap.Explainer(lambda batch: clf_svm.predict(batch), x_dados_norm)

# Explain only the first 100 rows to keep the run time manageable.
shap_values = explainer(x_dados_norm[:100])

# Summarise the effect of every feature.
shap.summary_plot(shap_values, feature_names=feature_names)

In [None]:
import optuna
from sklearn.model_selection import cross_val_score

def objective_svm(trial):
    """Optuna objective for the SVM search.

    Samples ``C`` and ``gamma`` on a log scale plus the kernel type, then scores
    an ``SVC`` with those settings by 3-fold cross-validation on the training split.

    Args:
        trial: Optuna trial used to draw the hyperparameters.

    Returns:
        Mean of the three cross-validation scores.
    """
    params = {
        'C': trial.suggest_float('C', 0.1, 100, log=True),
        'gamma': trial.suggest_float('gamma', 0.001, 10, log=True),
        'kernel': trial.suggest_categorical('kernel', ['rbf', 'linear']),
    }
    fold_scores = cross_val_score(SVC(**params), dados_treino, rotulos_treino, n_jobs=-1, cv=3)
    return fold_scores.mean()

# Create the SVM study. The sampler is seeded so the search proposes the same
# sequence of trials on every run of the notebook (an unseeded sampler makes the
# "best" hyperparameters change from run to run).
study_svm = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_svm.optimize(objective_svm, n_trials=50)

print("Best hyperparameters for SVM: ", study_svm.best_params)

# Refit on the whole training split with the best parameters found.
clf_svm_tuned = SVC(**study_svm.best_params)
clf_svm_tuned.fit(dados_treino, rotulos_treino)

In [None]:
def objective_rf(trial):
    """Optuna objective for the RandomForest search.

    Samples the forest size, tree depth, split/leaf minimums (as floats, which
    scikit-learn reads as fractions of the sample count) and the split criterion,
    then scores the forest by 3-fold cross-validation on the training split.

    Args:
        trial: Optuna trial used to draw the hyperparameters.

    Returns:
        Mean of the three cross-validation scores.
    """
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 150),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_samples_split': trial.suggest_float('min_samples_split', 0.05, 0.5),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.02, 0.2),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
    }
    forest = RandomForestClassifier(**search_space, random_state=42, n_jobs=-1)
    fold_scores = cross_val_score(forest, dados_treino, rotulos_treino, n_jobs=-1, cv=3)
    return fold_scores.mean()

# Create the RandomForest study. The sampler is seeded so the search proposes the
# same sequence of trials on every run of the notebook (an unseeded sampler makes
# the "best" hyperparameters change from run to run).
study_rf = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective_rf, n_trials=50)

print("Best hyperparameters for RandomForest: ", study_rf.best_params)

# Refit on the whole training split with the best parameters found.
clf_rf_tuned = RandomForestClassifier(**study_rf.best_params, random_state=42, n_jobs=-1)
clf_rf_tuned.fit(dados_treino, rotulos_treino)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of both tuned estimators.
# NOTE(review): the folds are taken from the *test* split, so `cross_val_score`
# refits copies of each estimator on parts of the hold-out data — confirm that this,
# rather than cross-validating on the training split, is what is intended.
svm_cv_scores = cross_val_score(clf_svm_tuned, dados_teste, rotulos_teste, cv=5)
rf_cv_scores = cross_val_score(clf_rf_tuned, dados_teste, rotulos_teste, cv=5)

# Report the per-fold scores and their mean for each model.
print("Cross-validation scores for tuned SVM model:", svm_cv_scores)
print("Mean cross-validation score for tuned SVM model:", svm_cv_scores.mean())

print("\nCross-validation scores for tuned RandomForest model:", rf_cv_scores)
print("Mean cross-validation score for tuned RandomForest model:", rf_cv_scores.mean())

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score

def _upsert_metrics(store, label, f1, accuracy, precision):
    """Insert, or overwrite in place, the metrics row for ``label`` in ``store``.

    Args:
        store: Dict with list values under the keys ``'Model'``, ``'F1 Score'``,
            ``'Accuracy'`` and ``'Precision'``.
        label: Model name identifying the row.
        f1: F1 score to record.
        accuracy: Accuracy to record.
        precision: Precision to record.
    """
    row = {'F1 Score': f1, 'Accuracy': accuracy, 'Precision': precision}
    if label in store['Model']:
        # Row already present (the cell was run before): refresh it instead of
        # appending a duplicate.
        position = store['Model'].index(label)
        for column, value in row.items():
            store[column][position] = value
    else:
        store['Model'].append(label)
        for column, value in row.items():
            store[column].append(value)


# Evaluate tuned SVM model on the hold-out split
predicted_rotulos_svm_tuned = clf_svm_tuned.predict(dados_teste)
f1_svm_tuned, accuracy_svm_tuned, precision_svm_tuned = eval_metrics(rotulos_teste, predicted_rotulos_svm_tuned)

# Evaluate tuned RandomForest model on the hold-out split
predicted_rotulos_rf_tuned = clf_rf_tuned.predict(dados_teste)
f1_rf_tuned, accuracy_rf_tuned, precision_rf_tuned = eval_metrics(rotulos_teste, predicted_rotulos_rf_tuned)

# Record the tuned metrics next to the baseline ones. An upsert (rather than a blind
# `extend`) keeps this cell idempotent: re-running it no longer duplicates the rows.
_upsert_metrics(metrics_data, 'SVM_tuned', f1_svm_tuned, accuracy_svm_tuned, precision_svm_tuned)
_upsert_metrics(metrics_data, 'RandomForest_tuned', f1_rf_tuned, accuracy_rf_tuned, precision_rf_tuned)

# Baseline vs tuned comparison table, indexed by model name.
df_metrics_comparison = pd.DataFrame(metrics_data).set_index('Model')

display(df_metrics_comparison)

In [None]:
# Score the tuned models on the data they were fitted on, to compare against the
# hold-out metrics computed in the previous cell (a large gap would suggest overfitting).
predicted_rotulos_svm_tuned_train = clf_svm_tuned.predict(dados_treino)
predicted_rotulos_rf_tuned_train = clf_rf_tuned.predict(dados_treino)

f1_svm_tuned_train, accuracy_svm_tuned_train, precision_svm_tuned_train = eval_metrics(
    rotulos_treino, predicted_rotulos_svm_tuned_train
)
f1_rf_tuned_train, accuracy_rf_tuned_train, precision_rf_tuned_train = eval_metrics(
    rotulos_treino, predicted_rotulos_rf_tuned_train
)

# One (label, f1, accuracy, precision) record per model/split combination.
_comparison_rows = [
    ('SVM_tuned (Train)', f1_svm_tuned_train, accuracy_svm_tuned_train, precision_svm_tuned_train),
    ('SVM_tuned (Test)', f1_svm_tuned, accuracy_svm_tuned, precision_svm_tuned),
    ('RandomForest_tuned (Train)', f1_rf_tuned_train, accuracy_rf_tuned_train, precision_rf_tuned_train),
    ('RandomForest_tuned (Test)', f1_rf_tuned, accuracy_rf_tuned, precision_rf_tuned),
]

# Pivot the records into the column-oriented layout expected by the DataFrame constructor.
comparison_metrics = {
    'Model': [row[0] for row in _comparison_rows],
    'F1 Score': [row[1] for row in _comparison_rows],
    'Accuracy': [row[2] for row in _comparison_rows],
    'Precision': [row[3] for row in _comparison_rows],
}

df_comparison = pd.DataFrame(comparison_metrics).set_index('Model')
display(df_comparison)

# Conclusão

Inicialmente, os dados foram baixados, normalizados e, em seguida, divididos em conjuntos de treino e teste para a avaliação dos modelos. Para otimizar o desempenho, foi realizado um ajuste fino de hiperparâmetros utilizando a biblioteca Optuna, que buscou a melhor combinação para cada algoritmo.

Ao final, ambos os modelos apresentaram um ótimo resultado com os dados de teste, que não foram vistos durante a fase de treinamento. O modelo vencedor foi o SVM, embora com uma pequena margem de diferença em relação ao Random Forest.

Consideramos que o desempenho alcançado é suficiente para o objetivo desta classificação. Desse modo, não serão necessários novos ajustes nos hiperparâmetros nem a exploração de outros modelos.