# 06 — Neural Networks (ANN, PyTorch) — CPU Friendly

Objectives:
- Build a small MLP in PyTorch for tabular classification (Breast Cancer dataset)
- Practice scaling/encoding, train/val/test splits, early stopping, regularization (dropout/weight decay)
- Evaluate with accuracy and ROC-AUC; compare to tree-based baselines conceptually

Assumptions:
- Nonlinear relationships may exist that linear or simple rules miss
- Enough data and proper regularization to avoid overfitting

Cautions / Data Prep:
- Always scale/normalize numeric features for neural networks
- One-hot encode categoricals (not needed here)
- Watch class imbalance; consider `pos_weight` or rebalancing
- Split train/val/test to prevent information leakage


In [None]:
%matplotlib inline
import warnings; warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's whitegrid theme with notebook-sized elements to all plots below.
sns.set(style='whitegrid', context='notebook')
# Seed NumPy's global RNG so NumPy-driven randomness is repeatable across runs.
np.random.seed(42)

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from tqdm import tqdm

device = torch.device('cpu')  # CPU-only per course constraints
# Seed PyTorch's global RNG for reproducible runs.
torch.manual_seed(42)


## 1) Load dataset and split: train/val/test
Use a validation set for early stopping. Keep test set untouched for final evaluation.

In [None]:
# Load the Breast Cancer dataset into a feature frame and a 0/1 label series.
data = load_breast_cancer()
X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.Series(data["target"], name="target")

# Stage 1: hold out 20% of all rows as the test set, stratified on the label.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
# Stage 2: hold out 20% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    stratify=y_train_full,
    test_size=0.2,
    random_state=42,
)
# Cell output: overall dataset shape and class proportions.
X.shape, y.value_counts(normalize=True).round(3)

Fit the scaler on the training split only, then apply the same transform to val/test to avoid leakage. Convert to tensors and build loaders.

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to every split.
scaler = StandardScaler().fit(X_train)
X_train_s, X_val_s, X_test_s = (
    scaler.transform(frame) for frame in (X_train, X_val, X_test)
)

# Features become float32 tensors; labels become float32 column vectors (N, 1).
X_train_t, X_val_t, X_test_t = (
    torch.tensor(arr, dtype=torch.float32)
    for arr in (X_train_s, X_val_s, X_test_s)
)
y_train_t, y_val_t, y_test_t = (
    torch.tensor(labels.values.reshape(-1, 1), dtype=torch.float32)
    for labels in (y_train, y_val, y_test)
)

# Pair each split's features with its labels.
train_ds, val_ds, test_ds = (
    TensorDataset(features, labels)
    for features, labels in (
        (X_train_t, y_train_t),
        (X_val_t, y_val_t),
        (X_test_t, y_test_t),
    )
)

# Only the training loader shuffles; val/test use larger, ordered batches.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=64)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=256)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=256)
# Cell output: shapes of the training feature and label tensors.
X_train_t.shape, y_train_t.shape

## 2) Define a small MLP with dropout
Binary classification via `BCEWithLogitsLoss` (logits out; apply sigmoid at evaluation).

In [None]:
class MLP(nn.Module):
    """Two-hidden-layer perceptron that emits one raw logit per input row.

    Layer stack, in order:
    Linear(in_dim, hidden) -> ReLU -> Dropout(p_drop) ->
    Linear(hidden, hidden // 2) -> ReLU -> Dropout(p_drop) ->
    Linear(hidden // 2, 1).

    No sigmoid is applied inside the network; the output is a logit.
    """

    def __init__(self, in_dim, hidden=64, p_drop=0.2):
        super().__init__()
        narrow = hidden // 2
        stack = [
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Dropout(p_drop),
            nn.Linear(hidden, narrow),
            nn.ReLU(),
            nn.Dropout(p_drop),
            nn.Linear(narrow, 1),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the logits."""
        return self.net(x)

# Size the input layer from the number of feature columns in the training tensor.
in_dim = X_train_t.size(1)
model = MLP(in_dim, hidden=64, p_drop=0.2)
model = model.to(device)

# Logit-based binary cross-entropy; Adam with weight decay acting as L2 regularisation.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001, weight_decay=0.0001)


Helper functions: train one epoch, evaluate on a loader (loss, accuracy, ROC-AUC).

In [None]:
@torch.no_grad()
def eval_model(model, loader):
    """Evaluate ``model`` on every batch of ``loader`` without tracking gradients.

    The model is switched to eval mode and left there. The loss is computed
    with the module-level ``criterion`` and tensors are moved to the
    module-level ``device``.

    Args:
        model: Module returning one logit per row.
        loader: Iterable of ``(features, targets)`` batches.

    Returns:
        Tuple ``(loss, acc, auc)``:
        - ``loss``: mean loss over all samples. Each batch loss is weighted by
          its batch size, which assumes ``criterion`` returns a per-batch mean
          (the default reduction of ``BCEWithLogitsLoss``).
        - ``acc``: accuracy of ``sigmoid(logit) >= 0.5`` predictions.
        - ``auc``: ROC-AUC of the sigmoid probabilities, or NaN when
          ``roc_auc_score`` raises ``ValueError`` (e.g. only one class is
          present in the targets).
    """
    model.eval()
    batch_losses, batch_sizes, probs, targets = [], [], [], []
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        batch_losses.append(criterion(logits, yb).item())
        batch_sizes.append(len(yb))
        probs.append(torch.sigmoid(logits).cpu().numpy().ravel())
        targets.append(yb.cpu().numpy().ravel())
    probs = np.concatenate(probs)
    targets = np.concatenate(targets)
    preds = (probs >= 0.5).astype(int)
    acc = accuracy_score(targets, preds)
    try:
        auc = roc_auc_score(targets, probs)
    except ValueError:
        auc = np.nan
    # A plain mean of per-batch means over-weights a short final batch;
    # weighting by batch size gives the true per-sample mean.
    mean_loss = np.average(batch_losses, weights=batch_sizes)
    return mean_loss, acc, auc

def train_one_epoch(model, loader):
    """Run one optimisation pass over ``loader`` and return the mean training loss.

    Uses the module-level ``criterion``, ``optimizer`` and ``device``. The
    model is switched to train mode (dropout active) and left there.

    Args:
        model: Module returning one logit per row.
        loader: Iterable of ``(features, targets)`` batches.

    Returns:
        Mean loss over all samples seen in the epoch. Each batch loss is
        weighted by its batch size, which assumes ``criterion`` returns a
        per-batch mean. NaN if the loader yields no batches.
    """
    model.train()
    batch_losses, batch_sizes = [], []
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
        batch_sizes.append(len(yb))
    if not batch_losses:
        # Same result np.mean([]) gave for an empty loader.
        return np.nan
    # A plain mean of per-batch means over-weights a short final batch;
    # weighting by batch size gives the true per-sample mean.
    return np.average(batch_losses, weights=batch_sizes)


## 3) Train with early stopping (by validation AUC)
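Keep the epoch budget small for CPU and stop once validation AUC has failed to improve (by more than 1e-4) for `patience` consecutive epochs; the best weights are restored afterwards.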

In [None]:
epochs = 50      # upper bound on training epochs
patience = 5     # epochs without improvement tolerated before stopping
best_auc = -np.inf
best_state = None  # stays None if no epoch ever yields a comparable (non-NaN) AUC
pat = 0          # consecutive epochs without improvement
hist = {'train_loss': [], 'val_loss': [], 'val_acc': [], 'val_auc': []}

for ep in range(1, epochs+1):
    tr_loss = train_one_epoch(model, train_loader)
    vl_loss, vl_acc, vl_auc = eval_model(model, val_loader)
    hist['train_loss'].append(tr_loss)
    hist['val_loss'].append(vl_loss)
    hist['val_acc'].append(vl_acc)
    hist['val_auc'].append(vl_auc)
    print(f"Epoch {ep:02d} | train_loss={tr_loss:.4f} val_loss={vl_loss:.4f} val_acc={vl_acc:.3f} val_auc={vl_auc:.3f}")
    # Require a minimum gain of 1e-4 so tiny fluctuations do not reset patience.
    if vl_auc > best_auc + 1e-4:
        best_auc = vl_auc
        # Snapshot a detached copy; state_dict() tensors alias the live parameters.
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        pat = 0
    else:
        pat += 1
        if pat >= patience:
            print(f"Early stopping at epoch {ep} (best val AUC={best_auc:.3f})")
            break

# Restore best weights (skipped when no snapshot was ever taken, e.g. AUC was NaN every epoch)
if best_state is not None:
    model.load_state_dict(best_state)
else:
    print("No validation AUC improvement was recorded; keeping the final-epoch weights.")
best_auc

In [None]:
# Side-by-side training curves: loss on the left, validation AUC on the right.
fig, (ax_loss, ax_auc) = plt.subplots(1, 2, figsize=(10, 3))

ax_loss.plot(hist['train_loss'], label='train')
ax_loss.plot(hist['val_loss'], label='val')
ax_loss.set_title('Loss')
ax_loss.legend()

ax_auc.plot(hist['val_auc'], label='val AUC')
ax_auc.set_title('Validation AUC')
ax_auc.legend()

fig.tight_layout()
plt.show()

## 4) Test evaluation
Report accuracy, ROC-AUC, and classification report on the test set (held out entirely).

In [None]:
# Held-out metrics from the shared evaluation helper: (loss, accuracy, ROC-AUC).
tl, ta, tu = eval_model(model, test_loader)
print({
    'test_loss': round(tl, 4),
    'test_acc': round(ta, 3),
    'test_auc': round(tu, 3),
})

# Classification report
# One full-batch forward pass in eval mode, thresholding probabilities at 0.5.
model.eval()
with torch.no_grad():
    logits = model(X_test_t.to(device))
    probs = logits.sigmoid().cpu().numpy().ravel()
preds = np.greater_equal(probs, 0.5).astype(int)
print(classification_report(y_test.values, preds, digits=3))

Note: On tabular data, tree ensembles (Random Forest / XGBoost) often outperform simple MLPs. ANNs shine with extensive feature engineering, large data, or when strong nonlinearity exists.

## Exercises
Instructor solution cells are hidden/collapsed.
1. Scaling ablation: Retrain the model without scaling (use raw `X_train.values`)—observe convergence and test AUC differences.
2. Regularization: Increase dropout to 0.5 and/or weight_decay to 1e-3. Does validation AUC improve or degrade?
3. Class imbalance: Compute `pos_weight = (N_neg/N_pos)` from training labels and pass it to `BCEWithLogitsLoss(pos_weight=...)`. Compare metrics.


In [None]:
# Exercise 1: Remove scaling
# TODO: Build new tensors/loaders from raw X_train/X_val/X_test without StandardScaler and retrain a small model (10-20 epochs).
# Compare test AUC.
...

In [None]:
# Solution 1 (hidden)
# Build tensors straight from the raw (unscaled) feature frames.
Xtr_raw = torch.tensor(X_train.values, dtype=torch.float32)
Xva_raw = torch.tensor(X_val.values, dtype=torch.float32)
Xte_raw = torch.tensor(X_test.values, dtype=torch.float32)
ytr = torch.tensor(y_train.values.reshape(-1,1), dtype=torch.float32)
yva = torch.tensor(y_val.values.reshape(-1,1), dtype=torch.float32)
yte = torch.tensor(y_test.values.reshape(-1,1), dtype=torch.float32)

trL = DataLoader(TensorDataset(Xtr_raw, ytr), batch_size=64, shuffle=True)
vaL = DataLoader(TensorDataset(Xva_raw, yva), batch_size=256)
teL = DataLoader(TensorDataset(Xte_raw, yte), batch_size=256)

# Fresh model/optimizer/loss with the same hyperparameters as the scaled run.
m2 = MLP(in_dim=Xtr_raw.shape[1], hidden=64, p_drop=0.2)
opt2 = torch.optim.Adam(m2.parameters(), lr=1e-3, weight_decay=1e-4)
crit2 = nn.BCEWithLogitsLoss()

best = -np.inf   # best validation AUC so far
state = None     # weights at the best epoch; stays None if AUC never beats -inf (e.g. NaN)
pat = 0          # consecutive epochs without improvement
for ep in range(15):
    m2.train()
    for xb, yb in trL:
        opt2.zero_grad()
        out = m2(xb)
        loss = crit2(out, yb)
        loss.backward()
        opt2.step()
    # eval_model returns (loss, acc, auc); only the AUC drives early stopping.
    vl = eval_model(m2, vaL)[-1]
    if vl > best:
        best = vl
        state = {k: v.clone() for k, v in m2.state_dict().items()}
        pat = 0
    else:
        pat += 1
        if pat >= 3:
            break
# Restore the best snapshot when one exists; otherwise keep the final weights.
if state is not None:
    m2.load_state_dict(state)
test_auc_raw = eval_model(m2, teL)[-1]
round(test_auc_raw,3)

In [None]:
# Exercise 2: Regularization sweep
# TODO: Try dropout=0.5 and/or weight_decay=1e-3 in the optimizer; record val/test AUC.
...

In [None]:
# Solution 2 (hidden)
def run_conf(pdrop=0.5, wd=1e-3, epochs=20, patience=4):
    """Train a fresh MLP with the given regularisation and return its test ROC-AUC.

    Uses the module-level ``in_dim``, ``criterion``, ``train_loader``,
    ``val_loader`` and ``test_loader``. Training stops early once validation
    AUC has not improved for ``patience`` consecutive epochs, and the weights
    from the best validation epoch are restored before the test evaluation.

    Args:
        pdrop: Dropout probability passed to ``MLP``.
        wd: Weight decay passed to Adam.
        epochs: Maximum number of training epochs.
        patience: Non-improving epochs tolerated before stopping.

    Returns:
        ROC-AUC on the test loader (last element of ``eval_model``'s result).
    """
    m = MLP(in_dim=in_dim, hidden=64, p_drop=pdrop)
    opt = torch.optim.Adam(m.parameters(), lr=1e-3, weight_decay=wd)
    best = -np.inf
    st = None  # stays None if validation AUC never beats -inf (e.g. NaN)
    pat = 0
    for _ in range(epochs):
        m.train()
        for xb, yb in train_loader:
            opt.zero_grad()
            loss = criterion(m(xb), yb)
            loss.backward()
            opt.step()
        va_auc = eval_model(m, val_loader)[-1]
        if va_auc > best:
            best = va_auc
            st = {k: v.clone() for k, v in m.state_dict().items()}
            pat = 0
        else:
            pat += 1
            if pat >= patience:
                break
    # Restore the best snapshot when one exists; otherwise keep the final weights.
    if st is not None:
        m.load_state_dict(st)
    return eval_model(m, test_loader)[-1]

# Train the three configurations in a fixed order; each call builds a fresh model.
out_a, out_b, out_c = (
    run_conf(drop_p, decay)
    for drop_p, decay in ((0.5, 1e-3), (0.2, 1e-3), (0.5, 1e-4))
)
# Cell output: test ROC-AUC per configuration, rounded to three decimals.
dict(zip(
    ('drop0.5_wd1e-3', 'drop0.2_wd1e-3', 'drop0.5_wd1e-4'),
    (round(out_a, 3), round(out_b, 3), round(out_c, 3)),
))

In [None]:
# Exercise 3: Class imbalance with pos_weight
# TODO: Compute pos_weight = N_neg/N_pos on the training labels and pass it to BCEWithLogitsLoss.
# Retrain briefly (10-15 epochs) and compare test AUC.
...

In [None]:
# Solution 3 (hidden)
# Class counts on the training labels only; pos_weight = N_neg / N_pos.
pos = (y_train.values==1).sum()
neg = (y_train.values==0).sum()
pw = torch.tensor([neg/pos], dtype=torch.float32)

m3 = MLP(in_dim=in_dim, hidden=64, p_drop=0.2)
crit3 = nn.BCEWithLogitsLoss(pos_weight=pw)  # weighted loss is used for training only
opt3 = torch.optim.Adam(m3.parameters(), lr=1e-3, weight_decay=1e-4)

best = -np.inf   # best validation AUC so far
st = None        # weights at the best epoch; stays None if AUC never beats -inf (e.g. NaN)
pat = 0          # consecutive epochs without improvement
for ep in range(15):
    m3.train()
    for xb, yb in train_loader:
        opt3.zero_grad()
        out = m3(xb)
        loss = crit3(out, yb)
        loss.backward()
        opt3.step()
    # eval_model returns (loss, acc, auc); only the AUC is read here.
    val_auc = eval_model(m3, val_loader)[-1]
    if val_auc > best:
        best = val_auc
        st = {k: v.clone() for k, v in m3.state_dict().items()}
        pat = 0
    else:
        pat += 1
        if pat >= 3:
            break
# Restore the best snapshot when one exists; otherwise keep the final weights.
if st is not None:
    m3.load_state_dict(st)
auc3 = eval_model(m3, test_loader)[-1]
round(auc3,3)

## Wrap-up checklist
- [ ] Scale numeric features and split train/val/test
- [ ] Use early stopping to limit overfitting
- [ ] Try dropout and weight decay; tune hidden sizes
- [ ] Consider class imbalance via pos_weight or sampling
- [ ] Track both accuracy and ROC-AUC for classification
