
# Capítulo 4 — Classificação (end‑to‑end)
**Objetivo:** treinar e comparar classificadores em um problema real (Breast Cancer).
- Modelos: Regressão Logística (LogReg), KNN, Árvore de Decisão, SVM
- *Pipeline* com `StandardScaler`
- Validação cruzada (`StratifiedKFold`)
- `GridSearchCV` para ajuste de hiperparâmetros
- Matriz de confusão, ROC AUC e curva ROC


In [None]:

import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score, 
                             RocCurveDisplay, ConfusionMatrixDisplay, accuracy_score)

# Load the scikit-learn breast cancer dataset as pandas objects
# (as_frame=True): X holds the features, y the class labels.
ds = load_breast_cancer(as_frame=True)
X = ds.data
y = ds.target

# Hold out 20% of the samples for the final evaluation. stratify=y keeps the
# class proportions equal in both splits; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Baseline candidates with default hyperparameters. Every model except the
# tree is preceded by a StandardScaler inside the pipeline, so the scaler is
# fitted on the training data only.
models = {
    "logreg": Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression(max_iter=1000))]),
    "knn": Pipeline([("scaler", StandardScaler()), ("clf", KNeighborsClassifier())]),
    "tree": Pipeline([("clf", DecisionTreeClassifier(random_state=42))]),
    # probability=True fits an internal cross-validated calibration that
    # shuffles the data; random_state makes predict_proba (and therefore the
    # ROC AUC below) reproducible from run to run.
    "svm": Pipeline([("scaler", StandardScaler()), ("clf", SVC(probability=True, random_state=42))]),
}

# Fit each candidate on the training split and score it on the test split.
results = {}
for name, pipe in models.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # Column 1 of predict_proba is the probability of class label 1; it is
    # only requested when the final estimator exposes predict_proba.
    y_proba = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe[-1], "predict_proba") else None
    acc = accuracy_score(y_test, y_pred)
    # Without probability scores the ROC AUC is recorded as NaN.
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else np.nan
    results[name] = {"accuracy": acc, "roc_auc": auc}
    print(f"\n== {name} ==")
    print("Accuracy:", acc, "ROC AUC:", auc)
    print(classification_report(y_test, y_pred))

# Summary table: one row per model, one column per metric (rendered as the
# cell output when this is the last expression of a notebook cell).
pd.DataFrame(results).T


In [None]:

# Grid search on the most promising candidate (here, the SVM).
# The "clf__" prefix routes each parameter to the "clf" step of the pipeline.
param_grid = {
    "clf__C": [0.1, 1, 10],
    "clf__gamma": ["scale", 0.01, 0.001],
    "clf__kernel": ["rbf"]
}
# random_state makes the probability calibration triggered by
# probability=True reproducible, so predict_proba below is stable across runs.
svm_pipe = Pipeline([("scaler", StandardScaler()), ("clf", SVC(probability=True, random_state=42))])
# Shuffled, seeded 5-fold stratified CV; candidates are ranked by ROC AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gs = GridSearchCV(svm_pipe, param_grid, cv=cv, scoring="roc_auc", n_jobs=-1)
gs.fit(X_train, y_train)
print("Melhor:", gs.best_params_, "Score:", gs.best_score_)

# Evaluate the best pipeline found by the search on the held-out test split.
best = gs.best_estimator_
y_pred = best.predict(X_test)
y_proba = best.predict_proba(X_test)[:, 1]  # probability of class label 1

# Confusion matrix, built with the same from_predictions style as the ROC
# plot below and labelled with the dataset's class names instead of 0/1.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=ds.target_names)
plt.title("Matriz de confusão (SVM otimizado)")
plt.show()

# ROC curve
RocCurveDisplay.from_predictions(y_test, y_proba)
plt.title("Curva ROC (SVM otimizado)")
plt.show()
