
# Capítulo 5 — Regressão (end‑to‑end)
**Objetivo:** prever um alvo contínuo (conjunto de dados Diabetes do scikit-learn).
- Baseline (média)
- Regressão Linear, Ridge/Lasso, RandomForest
- Pipeline + `GridSearchCV`
- Métricas: MAE, MSE, RMSE, R²
- Importância de atributos (RF) e `permutation_importance`


In [None]:

import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

# Load the scikit-learn diabetes dataset into labelled pandas containers.
ds = load_diabetes()
X = pd.DataFrame(data=ds.data, columns=ds.feature_names)
y = pd.Series(data=ds.target, name="target")

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: predict the training-set mean for every test row.
baseline_pred = np.full(len(y_test), y_train.mean())
def report(y_true, y_pred, label):
    """Print MAE, MSE, RMSE and R2 for a set of predictions on one line.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        label: Text printed before the metrics to identify the model.

    Returns:
        None. The metrics are written to standard output only.
    """
    # RMSE is derived from MSE, so compute the squared error first.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae, r2 = mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred)
    print(f"{label}: MAE={mae:.3f} MSE={mse:.1f} RMSE={rmse:.3f} R2={r2:.3f}")

# Score the mean-prediction baseline as a reference for the models below.
report(y_test, baseline_pred, "Baseline (média)")

# Candidate models. The linear models sit behind a StandardScaler; the random
# forest is a single-step pipeline that uses the same "mdl" step name, which
# the tuning grid below addresses.
candidates = {
    "linreg": Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("mdl", LinearRegression()),
    ]),
    "ridge": Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("mdl", Ridge()),
    ]),
    "lasso": Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("mdl", Lasso(max_iter=5000)),
    ]),
    "rf": Pipeline(steps=[
        ("mdl", RandomForestRegressor(random_state=42)),
    ]),
}

# Fit each candidate on the training split and report its test-set metrics.
for name, pipe in candidates.items():
    pred = pipe.fit(X_train, y_train).predict(X_test)
    report(y_test, pred, name)

# Random forest tuning: the "mdl__" prefix routes each parameter to the
# pipeline's "mdl" step.
grid = {
    "mdl__n_estimators": [100, 300],
    "mdl__max_depth": [None, 10, 20],
    "mdl__min_samples_split": [2, 5],
}
gs = GridSearchCV(
    estimator=candidates["rf"],
    param_grid=grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
gs.fit(X_train, y_train)
print("Best RF:", gs.best_params_)

# Evaluate the best estimator found by the search on the held-out test split.
best_rf = gs.best_estimator_
pred = best_rf.predict(X_test)
report(y_test, pred, "RF otimizado")

# Permutation importance of the tuned forest, computed on the test split and
# plotted in descending order of mean importance.
r = permutation_importance(best_rf, X_test, y_test, n_repeats=10, random_state=42)
imp = pd.Series(data=r.importances_mean, index=X.columns)
imp = imp.sort_values(ascending=False)
imp.head(10).plot(kind="bar")
plt.title("Importância (Permutation) - Top 10")
plt.show()
