# T1 - Aprendizado de Máquina

Este arquivo contém o código, bem como as explicações necessárias, para o T1 da disciplina de Aprendizado de Máquina, compreendendo as etapas de download do dataset escolhido, pré-processamento dos dados, treinamento dos modelos de **kNN, Naïve Bayes e Árvores de Decisão** e, por fim, suas avaliações, interpretações e comparações.

## Download do dataset

O dataset escolhido para a tarefa de classificação possui dados sobre características físico-químicas da água e classifica as amostras em potável ou não. Nesta seção, fazemos o download do dataset do Kaggle e o convertemos para um objeto `pandas.DataFrame`.

In [None]:
import kagglehub as kh
import pandas    as pd
import os

# Download the Kaggle dataset and load its CSV into a DataFrame. The value
# returned by `dataset_download` is used as the directory holding the file.
dataset_file = "water_potability.csv"
dataset_path = kh.dataset_download("uom190346a/water-quality-and-potability")
dataset_abs_path = os.path.join(dataset_path, dataset_file)

df = pd.read_csv(dataset_abs_path)

# Sanity check of what was loaded: first rows, then (rows, columns).
preview = df.head()
print(preview)
print("Shape:", df.shape)

## Pré-processamento

Para o pré-processamento, serão aplicados:
1. Tratamento de valores ausentes (NaNs) preenchendo-os com a mediana da coluna;
2. Verificação de outliers e padronização dos dados (para o kNN);
3. Separação de atributos (features) e rótulos (labels).

Não será realizada a separação em conjuntos de treino e teste, pois será utilizada **validação cruzada**.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Preprocessing: report and impute missing values, inspect outliers with
# boxplots, split features from the label, and standardize the features.

print("Valores NaN (pré-tratamento):")
print(df.isna().sum())

# Fill NaNs in every feature column with that column's median rather than
# hard-coding the affected column names, so any feature with missing values
# is handled. The label column ('Potability') is deliberately left untouched.
feature_columns = df.columns.drop('Potability')
feature_medians = df[feature_columns].median(numeric_only=True)
df[feature_columns] = df[feature_columns].fillna(feature_medians)

# One boxplot per numeric column. The number of grid rows follows the column
# count (ceil division) so the figure still works with more than 12 columns.
numeric_columns = df.select_dtypes(include='number').columns
n_plot_cols = 4
n_plot_rows = max(1, -(-len(numeric_columns) // n_plot_cols))
plt.figure(figsize=(10, 5))
for i, col in enumerate(numeric_columns, start=1):
    plt.subplot(n_plot_rows, n_plot_cols, i)
    sns.boxplot(data=df, y=col)
    plt.title(col)
# Compute the layout once, after all subplots exist, instead of per iteration.
plt.tight_layout()
plt.suptitle("Boxplots para detecção de outliers", y=1.02)
plt.show()

# Features are every column except the label.
X = df.drop(columns=['Potability'])
y = df['Potability']

# Standardization (motivated by the outliers) for the kNN model.
# NOTE(review): the medians above and this scaler are fitted on the full
# dataset before the cross-validated grid searches, so statistics from the
# validation folds leak into training; consider moving both steps into a
# scikit-learn Pipeline evaluated inside the CV.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

print("Dataset tratado:")
print(X.describe().round(2))

## Treinamento dos modelos (com validação cruzada)

Nesta seção é feito o treinamento dos modelos, utilizando grid search com validação cruzada para encontrar os hiperparâmetros que produzem os melhores valores de acurácia.

### Árvore de Decisão

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Exhaustive hyperparameter search for the decision tree, scored by accuracy
# over 3 stratified, unshuffled folds.
param_grid = {
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [2, 5, 10, 15],
    # 'log_loss' is omitted: scikit-learn documents it as the same Shannon
    # information-gain criterion as 'entropy', so it only duplicated fits.
    'criterion': ['gini', 'entropy'],
    'splitter': ['best'],
    'class_weight': [None, 'balanced'],
    'max_features': [None, 'sqrt', 'log2'],
    'ccp_alpha': [0.0, 0.001, 0.01, 0.1],
}

kf = StratifiedKFold(n_splits=3, shuffle=False)
dt_grid = GridSearchCV(
    # A fixed seed makes the search reproducible: feature subsampling
    # ('sqrt'/'log2') and tie-breaking between splits are otherwise random.
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1
)
dt_grid.fit(X, y)

print("Best score (DT):", dt_grid.best_score_)
print("Best params (DT):", dt_grid.best_params_)

### kNN

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameter search for kNN, scored by accuracy over 3 stratified,
# unshuffled folds. Only the neighbourhood size varies; votes are
# distance-weighted and neighbours are found by brute force with cosine
# distance.
param_grid = {
    'n_neighbors': [21, 35, 49, 71, 83, 105],
    'weights': ['distance'],
    'algorithm': ['brute'],
    'metric': ['cosine'],
    # 'leaf_size' and 'p' were removed from the grid: 'leaf_size' only affects
    # the BallTree/KDTree algorithms (not 'brute') and 'p' only affects the
    # Minkowski metric (not 'cosine'), so they multiplied the number of fits
    # by 10 without ever changing the model. To actually compare Manhattan
    # and Euclidean distances, add a second grid with metric='minkowski' and
    # p in [1, 2].
}

kf = StratifiedKFold(n_splits=3, shuffle=False)
knn_grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1
)
knn_grid.fit(X, y)

print("Best score (kNN):", knn_grid.best_score_)
print("Best params (kNN):", knn_grid.best_params_)

### Naïve Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import numpy as np

# Hyperparameter search for Gaussian Naive Bayes, scored by accuracy.
# The only tuned parameter is the variance smoothing term; the logspace
# below yields the three candidates 1e-11, 1e-9 and 1e-7.
param_grid = {
    'var_smoothing': np.logspace(-11, -7, 3)
}

# Use the same deterministic, unshuffled stratified folds as the decision
# tree and kNN searches. Shuffling without a fixed seed made this score
# change between runs and evaluated NB on different splits than the other
# models, which undermines the comparison.
kf = StratifiedKFold(n_splits=3, shuffle=False)
nb_grid = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1
)
nb_grid.fit(X, y)

print("Best accuracy (NB):", nb_grid.best_score_)
print("Best params (NB):", nb_grid.best_params_)

## Interpretação dos modelos

### Árvore de Decisão

In [None]:
import numpy as np
import shap
from lime.lime_tabular import LimeTabularExplainer
import matplotlib.pyplot as plt

#################################### SHAP ####################################
# Global explanation: SHAP values of the best decision tree over the whole
# dataset, summarized in a single plot.
shap.initjs()

explainer = shap.TreeExplainer(dt_grid.best_estimator_)
shap_values = explainer.shap_values(X)

plt.figure()
shap.summary_plot(shap_values, X, feature_names=X.columns)

##################################### LIME ####################################
# Local explanation of a single instance. X is always a DataFrame here (its
# `.columns` is already used above), so no array fallback is needed.
X_np = X.to_numpy()
feature_names = list(X.columns)
class_names = np.unique(y).astype(str)

lime_explainer = LimeTabularExplainer(
    training_data=X_np,
    feature_names=feature_names,
    class_names=class_names,
    mode="classification",
    # LIME samples random perturbations; a fixed seed keeps the explanation
    # identical between runs.
    random_state=42
)

instance_idx = 0
exp = lime_explainer.explain_instance(X_np[instance_idx], dt_grid.predict_proba)

exp.show_in_notebook(show_table=True)

# `as_pyplot_figure` opens its own figure, so no `plt.figure()` call precedes
# it (that call only produced an extra blank figure in the output).
exp.as_pyplot_figure()
plt.show()

### kNN

In [None]:
import shap
from lime.lime_tabular import LimeTabularExplainer
import numpy as np
import matplotlib.pyplot as plt

#################################### SHAP ####################################
# Model-agnostic SHAP for kNN: the background set is X summarized by k-means
# (300 centroids) and only the first 50 rows are explained.
shap.initjs()

X_background = shap.kmeans(X, 300)

explainer = shap.KernelExplainer(knn_grid.best_estimator_.predict_proba, X_background)
shap_values = explainer.shap_values(X.iloc[:50])
plt.figure()
shap.summary_plot(shap_values, X.iloc[:50])

##################################### LIME ####################################
# Local explanation of a single instance. X is always a DataFrame here (its
# `.iloc` is already used above), so no array fallback is needed.
X_np = X.to_numpy()
feature_names = list(X.columns)
class_names = np.unique(y).astype(str)

lime_explainer = LimeTabularExplainer(
    training_data=X_np,
    feature_names=feature_names,
    class_names=class_names,
    mode="classification",
    # LIME samples random perturbations; a fixed seed keeps the explanation
    # identical between runs.
    random_state=42
)

instance_idx = 0
exp = lime_explainer.explain_instance(X_np[instance_idx], knn_grid.predict_proba)

exp.show_in_notebook(show_table=True)
# `as_pyplot_figure` opens its own figure, so no `plt.figure()` call precedes
# it (that call only produced an extra blank figure in the output).
exp.as_pyplot_figure()
plt.show()

### Naïve Bayes

In [None]:
import shap
from lime.lime_tabular import LimeTabularExplainer
import numpy as np
import matplotlib.pyplot as plt

##################################### SHAP ####################################
# Model-agnostic SHAP for Naive Bayes: the background set is a 300-row sample
# of X and only the first 50 rows are explained.
shap.initjs()

X_background = shap.sample(X, 300)
explainer = shap.KernelExplainer(nb_grid.best_estimator_.predict_proba, X_background)
shap_values = explainer.shap_values(X.iloc[:50])

plt.figure()
shap.summary_plot(shap_values, X.iloc[:50])

##################################### LIME ####################################
# Local explanation of a single instance. X is always a DataFrame here (its
# `.iloc` is already used above), so no array fallback is needed.
X_np = X.to_numpy()
feature_names = list(X.columns)
class_names = np.unique(y).astype(str)

lime_explainer = LimeTabularExplainer(
    training_data=X_np,
    feature_names=feature_names,
    class_names=class_names,
    mode="classification",
    # LIME samples random perturbations; a fixed seed keeps the explanation
    # identical between runs.
    random_state=42
)

instance_idx = 0
exp = lime_explainer.explain_instance(X_np[instance_idx], nb_grid.predict_proba)

exp.show_in_notebook(show_table=True)
# `as_pyplot_figure` opens its own figure, so no `plt.figure()` call precedes
# it (that call only produced an extra blank figure in the output).
exp.as_pyplot_figure()
plt.show()

## Comparações de interpretabilidade e limitações dos modelos