
# Regularização em PyTorch: Baseline vs L2 vs L1

Este notebook treina uma MLP simples em um dataset sintético de classificação binária
e compara três cenários:
1. **Baseline** (sem regularização)
2. **L2** (*weight decay* no otimizador)
3. **L1** (penalização manual na loss)

São geradas curvas de **loss** e **acurácia** de treino/validação para cada cenário.
> Observações:
> - Somente **matplotlib** é utilizada (sem seaborn).
> - Cada gráfico é plotado em **uma figura** (sem subplots).


In [None]:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Fix the seeds for reproducibility
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)


def _pick_device():
    """Return the preferred torch device: CUDA, then Apple MPS, then CPU.

    MPS availability is queried through ``torch.backends.mps``; the lookup is
    guarded with ``getattr`` so that builds without that backend module fall
    back to the CPU instead of raising ``AttributeError``.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")


device = _pick_device()
device


## 1) Dataset sintético

In [None]:
# Synthetic binary classification problem: 2000 samples and 20 features
# (6 informative, 4 redundant), two clusters per class, 60/40 class weights
# and 2% of the labels flipped at random.
X, y = make_classification(
    n_samples=2000,
    n_features=20,
    n_informative=6,
    n_redundant=4,
    n_repeated=0,
    n_clusters_per_class=2,
    weights=[0.6, 0.4],
    class_sep=1.2,
    flip_y=0.02,
    random_state=SEED,
)

# Stratified 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

# Standardize with statistics estimated on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Convert every array to a float32 tensor living on the selected device
X_train, y_train, X_val, y_val = (
    torch.tensor(array, dtype=torch.float32).to(device)
    for array in (X_train, y_train, X_val, y_val)
)

X_train.shape, X_val.shape


## 2) MLP simples

In [None]:
class MLP(nn.Module):
    """Perceptron with two ReLU hidden layers that emits one logit per row.

    Args:
        n_in: Number of input features.
        n_hidden: Width of each of the two hidden layers.
    """

    def __init__(self, n_in, n_hidden=64):
        super().__init__()
        layers = [
            nn.Linear(n_in, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw logits with the trailing size-1 dimension removed."""
        logits = self.net(x)
        return logits.squeeze(-1)

# Number of input features, read from the second dimension of the training matrix
n_features = X_train.size(1)
n_features


## 3) Funções de treino/avaliação

In [None]:
def batch_iter(X, y, batch_size=128, shuffle=True):
    """Yield ``(X_batch, y_batch)`` pairs covering every row exactly once.

    Args:
        X: Tensor indexed along its first dimension.
        y: Tensor indexed with the same row indices as ``X``.
        batch_size: Maximum number of rows per batch; the last batch may be
            smaller.
        shuffle: When true, rows are visited in a random permutation drawn
            with ``torch.randperm``; otherwise in their original order.
    """
    n_rows = X.shape[0]
    order = torch.randperm(n_rows) if shuffle else torch.arange(n_rows)
    for start in range(0, n_rows, batch_size):
        rows = order[start:start + batch_size]
        yield X[rows], y[rows]

def accuracy_from_logits(logits, y_true):
    """Return, as a Python float, the fraction of predictions equal to ``y_true``.

    A prediction is 1.0 when ``sigmoid(logit) >= 0.5`` and 0.0 otherwise.
    """
    predicted = torch.sigmoid(logits).ge(0.5).float()
    hits = predicted.eq(y_true).float()
    return hits.mean().item()

def _l1_penalty(model):
    """Return the sum of absolute values of every parameter of ``model`` as a 0-dim tensor."""
    total = torch.tensor(0.0, device=device)
    for param in model.parameters():
        total = total + param.abs().sum()
    return total


def train_variant(variant="baseline", l2_lambda=0.0, l1_lambda=0.0, epochs=50, lr=1e-3):
    """Train a fresh MLP on the module-level training split and log per-epoch metrics.

    Args:
        variant: Label of the scenario. It is not read by the training logic;
            it only keeps call sites self-describing.
        l2_lambda: Value passed to Adam as ``weight_decay``.
        l1_lambda: Coefficient of the L1 term (sum of absolute values of all
            parameters, biases included) added to the loss. ``0.0`` disables it.
        epochs: Number of passes over the training split.
        lr: Adam learning rate.

    Returns:
        ``(model, hist)`` where ``hist`` maps ``"train_loss"``, ``"val_loss"``,
        ``"train_acc"`` and ``"val_acc"`` to lists with one value per epoch.
        When ``l1_lambda > 0`` both losses include the L1 term, so the two
        curves stay comparable.
    """
    model = MLP(n_features).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=l2_lambda)
    use_l1 = l1_lambda > 0.0

    hist = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}

    for _ in range(epochs):
        model.train()
        loss_sum = 0.0
        acc_sum = 0.0
        n_seen = 0

        for xb, yb in batch_iter(X_train, y_train, batch_size=128, shuffle=True):
            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)

            # Manual L1 penalty (when enabled)
            if use_l1:
                loss = loss + l1_lambda * _l1_penalty(model)

            loss.backward()
            optimizer.step()

            # The last mini-batch can be smaller than the others, so each
            # per-batch mean is weighted by its size to get a true per-sample
            # average for the epoch.
            n_batch = xb.shape[0]
            loss_sum += loss.item() * n_batch
            acc_sum += accuracy_from_logits(logits, yb) * n_batch
            n_seen += n_batch

        # Validation metrics on the full validation split
        model.eval()
        with torch.no_grad():
            val_logits = model(X_val)
            val_loss = criterion(val_logits, y_val).item()
            if use_l1:
                val_loss = val_loss + l1_lambda * _l1_penalty(model).item()
            val_acc = accuracy_from_logits(val_logits, y_val)

        # max(1, ...) guards against division by zero on an empty training split
        hist["train_loss"].append(loss_sum / max(1, n_seen))
        hist["train_acc"].append(acc_sum / max(1, n_seen))
        hist["val_loss"].append(val_loss)
        hist["val_acc"].append(val_acc)

    return model, hist

def plot_curves(hist, title_prefix="Baseline", dpi=180, save_prefix=None):
    """Draw the loss and accuracy curves of one run, each in its own figure.

    Args:
        hist: Mapping with the per-epoch lists ``"train_loss"``, ``"val_loss"``,
            ``"train_acc"`` and ``"val_acc"``.
        title_prefix: Text placed at the start of each figure title.
        dpi: Resolution forwarded to ``plt.savefig``.
        save_prefix: When truthy, the figures are also written to
            ``{save_prefix}_loss.png`` and ``{save_prefix}_accuracy.png``.
    """
    epochs = np.arange(1, len(hist["train_loss"]) + 1)

    # (history key suffix, legend tag, y-axis label, output file suffix)
    panels = (
        ("loss", "loss", "Loss", "loss"),
        ("acc", "acurácia", "Acurácia", "accuracy"),
    )

    for key, tag, ylabel, file_suffix in panels:
        plt.figure(figsize=(7.2, 4.2))
        plt.plot(epochs, hist[f"train_{key}"], label=f"Treino ({tag})")
        plt.plot(epochs, hist[f"val_{key}"], label=f"Validação ({tag})", linestyle="--")
        plt.xlabel("Épocas")
        plt.ylabel(ylabel)
        plt.title(f"{title_prefix} — Curva de {ylabel}")
        plt.legend()
        plt.grid(alpha=0.25)
        if save_prefix:
            plt.savefig(f"{save_prefix}_{file_suffix}.png", dpi=dpi, bbox_inches="tight")
        plt.show()


## 4) Treinos: Baseline, L2 e L1

In [None]:
# Hyperparameters shared by the three runs
EPOCHS, LR, DPI = 60, 1e-3, 180

# 1) Baseline: no regularization at all
model_base, hist_base = train_variant(
    variant="baseline", l2_lambda=0.0, l1_lambda=0.0, epochs=EPOCHS, lr=LR
)
plot_curves(
    hist_base, title_prefix="Baseline", dpi=DPI, save_prefix="regularization_baseline"
)

# 2) L2: weight decay applied by the optimizer
model_l2, hist_l2 = train_variant(
    variant="l2", l2_lambda=1e-3, l1_lambda=0.0, epochs=EPOCHS, lr=LR
)
plot_curves(
    hist_l2, title_prefix="L2 (weight decay)", dpi=DPI, save_prefix="regularization_l2"
)

# 3) L1: penalty added manually to the loss
model_l1, hist_l1 = train_variant(
    variant="l1", l2_lambda=0.0, l1_lambda=1e-6, epochs=EPOCHS, lr=LR
)
plot_curves(
    hist_l1, title_prefix="L1", dpi=DPI, save_prefix="regularization_l1"
)



## 5) Observações

- **L2** tende a estabilizar os pesos e reduzir oscilações, geralmente ajudando a manter a
  curva de validação mais próxima da de treino.
- **L1** induz esparsidade, podendo simplificar a hipótese aprendida; a escolha do `l1_lambda`
  é sensível — valores muito altos podem prejudicar o aprendizado.
- Ajuste a constante `DPI` (repassada a `plot_curves` como `dpi` e usada em `plt.savefig`) para controlar a resolução das figuras salvas.
