# 07 — CNNs (PyTorch) + Optional Transfer Learning (CPU-friendly)

Objectives:
- Build and train a simple CNN on Fashion-MNIST (CPU-friendly)
- Use normalization and light augmentation; track accuracy and loss
- Discuss overfitting and mitigations (augmentation, dropout)
- Optional: Demonstrate transfer learning with a pretrained ResNet-18 as a frozen feature extractor (tiny subset, CPU-safe)

Assumptions:
- Local connectivity and weight sharing suit images: convolutions are translation-equivariant, and pooling adds approximate translation invariance
- Inputs are tensors with standardized shape and normalized pixel values

Cautions/Data Prep:
- Normalize pixel values (e.g., mean/std) and keep consistent pre-processing
- Data augmentation can help generalization but must be reasonable
- Training deep models from scratch on CPU is slow; keep models small and epochs few
- Transfer learning on CPU is feasible for small subsets; keep it optional


In [None]:
%matplotlib inline
import warnings; warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', context='notebook')
np.random.seed(42)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms, models
from sklearn.metrics import classification_report, confusion_matrix

device = torch.device('cpu')  # CPU-only per course constraints
torch.manual_seed(42)


## 1) Dataset and transforms (Fashion-MNIST)
Normalize with mean=0.5 and std=0.5, which maps pixel values from [0, 1] to [-1, 1] (dataset-specific statistics also work). Add light augmentation for the training set only.

In [None]:
# Shared normalisation statistics; Normalize computes (x - mean) / std per channel.
mean, std = (0.5,), (0.5,)

# Evaluation pipeline: tensor conversion + normalisation, no randomness.
test_tfms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
# Training pipeline: random horizontal flip and padded random crop, then the
# same conversion/normalisation as the evaluation pipeline.
train_tfms = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomCrop(28, padding=3),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

root = './_data'
train_ds = datasets.FashionMNIST(root, train=True, download=True, transform=train_tfms)
test_ds = datasets.FashionMNIST(root, train=False, download=True, transform=test_tfms)

# Create a small validation split from the training set (e.g., 5k val)
val_size = 5000
indices = torch.randperm(len(train_ds)).tolist()
val_idx = indices[:val_size]
tr_idx = indices[val_size:]

# The validation subset indexes a second view of the training split that uses
# the evaluation transforms, so held-out images are never augmented.
train_eval_view = datasets.FashionMNIST(root, train=True, download=False, transform=test_tfms)
val_ds = Subset(train_eval_view, val_idx)
tr_ds = Subset(train_ds, tr_idx)

batch_size = 128


def _make_loader(dataset, shuffle):
    """Wrap ``dataset`` in a DataLoader using the notebook batch size and 2 workers."""
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=2)


tr_loader = _make_loader(tr_ds, shuffle=True)
val_loader = _make_loader(val_ds, shuffle=False)
te_loader = _make_loader(test_ds, shuffle=False)
len(tr_ds), len(val_ds), len(test_ds)

Visualize a few samples to confirm transforms and labels look reasonable.

In [None]:
# Human-readable names for the 10 label ids, in label order.
classes = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]
xb, yb = next(iter(tr_loader))
# Undo Normalize (x * std + mean) on a copy of the first 16 images and clip
# to [0, 1] so imshow renders them correctly.
grid = torch.clamp(xb[:16].clone() * std[0] + mean[0], 0, 1)
fig, axes = plt.subplots(4, 4, figsize=(6, 6))
for i, ax in enumerate(axes.flat):
    ax.imshow(grid[i, 0].numpy(), cmap='gray')  # channel 0 of the i-th image
    ax.set_title(classes[yb[i].item()], fontsize=8)
    ax.axis('off')
plt.tight_layout()
plt.show()


## 2) Simple CNN model
Small architecture suitable for CPU training within a few epochs.

In [None]:
class SmallCNN(nn.Module):
    """Two conv/pool stages followed by a dropout-regularised two-layer classifier.

    The layer sizes assume 1-channel 28x28 inputs: each 2x2 max-pool halves the
    spatial size (28 -> 14 -> 7), which fixes the first linear layer at
    64*7*7 input features.

    Args:
        num_classes: Number of output scores produced per input.
        p_drop: Dropout probability applied before each linear layer.
    """

    @staticmethod
    def _conv_stage(in_ch, out_ch):
        """Return the layers of one stage: 3x3 same-padded conv, ReLU, 2x2 max-pool."""
        return [
            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
        ]

    def __init__(self, num_classes=10, p_drop=0.25):
        super().__init__()
        # Stage 1 ends at 14x14, stage 2 at 7x7 (for 28x28 inputs).
        stages = self._conv_stage(1, 32) + self._conv_stage(32, 64)
        self.features = nn.Sequential(*stages)

        head = [
            nn.Flatten(),
            nn.Dropout(p_drop),
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(p_drop),
            nn.Linear(128, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Map an input batch to per-class scores."""
        return self.classifier(self.features(x))

# Instantiate the network with its default arguments and place it on `device`.
model = SmallCNN().to(device)
# Cross-entropy loss applied to the raw class scores returned by the model.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters; the small weight decay (1e-4) adds
# regularisation on top of the model's dropout layers.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)


Helper functions for training and evaluation.

In [None]:
def run_epoch(model, loader, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``optimizer`` is given, the model is switched to training mode and
    updated after every batch. Otherwise it is switched to eval mode and only
    scored, with autograd disabled so no graph is built during evaluation.
    Relies on the notebook-level ``criterion`` and ``device``.

    Args:
        model: Module mapping an input batch to per-class scores; the argmax
            over dim 1 is taken as the prediction.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer to step after each batch; ``None`` selects
            evaluation.

    Returns:
        Tuple ``(loss, acc)``: the unweighted mean of the per-batch losses and
        the fraction of correctly predicted samples.
    """
    is_train = optimizer is not None
    model.train(is_train)  # train(False) is equivalent to eval()
    losses, correct, total = [], 0, 0
    # Gradients are only needed when we are going to call backward().
    with torch.set_grad_enabled(is_train):
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            if is_train:
                optimizer.zero_grad()
            out = model(xb)
            loss = criterion(out, yb)
            if is_train:
                loss.backward()
                optimizer.step()
            losses.append(loss.item())
            preds = out.argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    return np.mean(losses), correct / total

def evaluate(model, loader):
    """Collect ground-truth labels and argmax predictions over ``loader``.

    The model is put in eval mode and run without gradient tracking. Inputs
    are moved to the notebook-level ``device``; predictions are moved back to
    the CPU before being gathered.

    Args:
        model: Module mapping an input batch to per-class scores.
        loader: Iterable yielding ``(inputs, targets)`` batches.

    Returns:
        Tuple ``(y_true, y_pred)`` of NumPy arrays concatenated over all batches.
    """
    model.eval()
    pred_chunks = []
    label_chunks = []
    with torch.no_grad():
        for inputs, labels in loader:
            scores = model(inputs.to(device))
            pred_chunks.append(scores.argmax(dim=1).cpu())
            label_chunks.append(labels)
    y_pred = torch.cat(pred_chunks).numpy()
    y_true = torch.cat(label_chunks).numpy()
    return y_true, y_pred


## 3) Train a few epochs (CPU-friendly)
Keep epochs small; use validation to track generalization and stop early if needed.

In [None]:
epochs = 5
patience, pat = 2, 0                      # stop after `patience` epochs without improvement
best_val, best_state = -np.inf, None      # best validation accuracy and its weights
hist = {'tr_loss': [], 'tr_acc': [], 'val_loss': [], 'val_acc': []}

for ep in range(1, epochs + 1):
    tl, ta = run_epoch(model, tr_loader, optimizer)
    vl, va = run_epoch(model, val_loader)
    for key, value in (('tr_loss', tl), ('tr_acc', ta), ('val_loss', vl), ('val_acc', va)):
        hist[key].append(value)
    print(f"Epoch {ep:02d} | train_loss={tl:.4f} acc={ta:.3f} | val_loss={vl:.4f} acc={va:.3f}")

    if va > best_val + 1e-4:
        # Improvement beyond the 1e-4 tolerance: snapshot the weights and reset the counter.
        best_val, pat = va, 0
        best_state = {name: tensor.cpu().clone() for name, tensor in model.state_dict().items()}
        continue

    pat += 1
    if pat >= patience:
        print(f"Early stopping at epoch {ep} (best val acc={best_val:.3f})")
        break

# Restore the weights from the best validation epoch (if any epoch ran).
if best_state is not None:
    model.load_state_dict(best_state)

# Side-by-side learning curves: loss on the left, accuracy on the right.
plt.figure(figsize=(9, 3))
panels = [('tr_loss', 'val_loss', 'Loss'), ('tr_acc', 'val_acc', 'Accuracy')]
for slot, (tr_key, val_key, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(hist[tr_key], label='train')
    plt.plot(hist[val_key], label='val')
    plt.title(title)
    plt.legend()
plt.tight_layout()
plt.show()


Evaluate on the test set and show a confusion matrix + classification report.

In [None]:
y_true, y_pred = evaluate(model, te_loader)
# Pin the label order so report rows and confusion-matrix rows/columns always
# line up with `classes`, independent of which labels occur in the data.
label_ids = list(range(len(classes)))
print(classification_report(y_true, y_pred, labels=label_ids, target_names=classes, digits=3))
cm = confusion_matrix(y_true, y_pred, labels=label_ids)
plt.figure(figsize=(6, 5))
# Put the class names on both axes so individual confusions can be read off the plot.
sns.heatmap(cm, annot=False, cmap='Blues', xticklabels=classes, yticklabels=classes)
plt.title('Confusion Matrix (Test)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()  # keep the longer tick labels inside the figure
plt.show()


## 4) Augmentation ablation (brief)
Compare with no augmentation for a couple of epochs to see effect on validation accuracy (may vary by seed).

In [None]:
# Exercise: Remove augmentation and compare
# TODO: Build a new training set with transforms.ToTensor()+Normalize only, train 2 epochs, and compare val acc.
...

In [None]:
# Solution (hidden)
# Re-open the training split with the evaluation transforms (ToTensor + Normalize only),
# then restrict it to the same training indices used by the augmented run.
train_noaug = datasets.FashionMNIST(root, train=True, download=False, transform=test_tfms)
train_noaug = Subset(train_noaug, tr_idx)
loader_noaug = DataLoader(train_noaug, batch_size=batch_size, shuffle=True, num_workers=2)
# Fresh model and optimizer with the same hyper-parameters as the main model.
m2 = SmallCNN().to(device)
opt2 = torch.optim.Adam(m2.parameters(), lr=1e-3, weight_decay=1e-4)
for ep in range(2):
    tl, ta = run_epoch(m2, loader_noaug, opt2)
    vl, va = run_epoch(m2, val_loader)
# Cell output: validation accuracy after the final (2nd) epoch.
va

## 5) Optional: Transfer learning with ResNet-18 (feature extractor)
Use an ImageNet-pretrained ResNet-18 as a frozen feature extractor. We adapt Fashion-MNIST (1 channel) to 3 channels and resize to 224. For CPU, we only use a small subset for training to keep runtime short. Accuracy is illustrative, not state-of-the-art.

Warning: Downloading the pretrained weights requires internet access, and feature extraction takes a few minutes on CPU even for small subsets. Keep sample sizes small.

In [None]:
# Pre-processing for the ImageNet-pretrained backbone: upscale to 224,
# replicate the single channel to 3, and normalise with the ImageNet statistics.
opt_tfms = transforms.Compose([
    transforms.Resize(224),
    transforms.Grayscale(num_output_channels=3),  # expand to 3 channels
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
train_opt = datasets.FashionMNIST(root, train=True, download=False, transform=opt_tfms)
test_opt  = datasets.FashionMNIST(root, train=False, download=False, transform=opt_tfms)

# tiny subset to keep CPU runtime small
# Draw ONE permutation of the training indices and slice disjoint ranges from
# it. Slicing two independent permutations would let the same image land in
# both the training and the validation subset (data leakage).
train_perm = torch.randperm(len(train_opt)).tolist()
train_small_idx = train_perm[:1500]
val_small_idx   = train_perm[1500:2000]
test_small_idx  = torch.randperm(len(test_opt))[:2000].tolist()

train_small = Subset(train_opt, train_small_idx)
val_small   = Subset(train_opt, val_small_idx)
test_small  = Subset(test_opt,  test_small_idx)

trL = DataLoader(train_small, batch_size=64, shuffle=True, num_workers=2)
vaL = DataLoader(val_small, batch_size=128, shuffle=False, num_workers=2)
teL = DataLoader(test_small, batch_size=128, shuffle=False, num_workers=2)

resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
# Freeze every pretrained weight; only the replacement head created below is trainable.
# NOTE(review): requires_grad=False freezes weights only. run_epoch puts the whole
# model in train mode, so BatchNorm running statistics presumably still adapt
# during head training — confirm this is acceptable for a "frozen" extractor.
for p in resnet.parameters():
    p.requires_grad = False
resnet.fc = nn.Linear(resnet.fc.in_features, 10)  # classifier head
resnet = resnet.to(device)

# Optimise the new head's parameters only.
opt = torch.optim.Adam(resnet.fc.parameters(), lr=1e-3)

def train_head(model, trL, vaL, epochs=2, optimizer=None):  # keep very small
    """Train ``model`` for a few epochs and restore the best-validation weights.

    Args:
        model: Model to train; its state dict is snapshotted whenever the
            validation accuracy improves.
        trL: Training loader passed to ``run_epoch`` together with the optimizer.
        vaL: Validation loader passed to ``run_epoch`` without an optimizer.
        epochs: Number of passes over ``trL``.
        optimizer: Optimizer to step. Defaults to the notebook-level ``opt``,
            which preserves the original behaviour; pass one explicitly when
            training a different model or parameter set.

    Returns:
        Best validation accuracy seen (``-inf`` if ``epochs`` is 0).
    """
    if optimizer is None:
        optimizer = opt  # notebook-level optimizer built for the classifier head
    best, st = -np.inf, None
    for ep in range(1, epochs + 1):
        tl, ta = run_epoch(model, trL, optimizer)
        vl, va = run_epoch(model, vaL)
        print(f"[ResNet head] epoch {ep} | train_acc={ta:.3f} | val_acc={va:.3f}")
        if va > best:
            best = va
            # Clone so later optimizer steps cannot modify the snapshot.
            st = {k: v.cpu().clone() for k, v in model.state_dict().items()}
    if st is not None:
        model.load_state_dict(st)
    return best

# Train the classifier head for 2 epochs; train_head reloads the
# best-validation weights into `resnet` before returning that accuracy.
best_va = train_head(resnet, trL, vaL, epochs=2)
# Score the resulting model on the small test subset and print per-class metrics.
y_true_r, y_pred_r = evaluate(resnet, teL)
print(classification_report(y_true_r, y_pred_r, target_names=classes, digits=3))


## Exercises
Instructor solution cells are hidden/collapsed.
1. Dropout sweep: Try `p_drop` in [0.0, 0.25, 0.5]; train 3 epochs; compare validation accuracy and test report.
2. Augmentation variants: Replace RandomCrop with RandomAffine (small rotations) and compare.
3. Optional ResNet: Unfreeze the last block (layer4) and train for 1 extra epoch with very small LR (e.g., 1e-4). Does val acc improve?


In [None]:
# Exercise 1: Dropout sweep
# TODO: Rebuild SmallCNN with p_drop in [0.0, 0.25, 0.5] and record val acc after 3 epochs.
...

In [None]:
# Solution 1 (hidden)
# For each dropout rate: train a fresh SmallCNN for 3 epochs and keep the best
# validation accuracy seen across those epochs.
vals = {}
for p in (0.0, 0.25, 0.5):
    m = SmallCNN(p_drop=p).to(device)
    optm = torch.optim.Adam(m.parameters(), lr=1e-3, weight_decay=1e-4)
    best, st = -np.inf, None
    for _ in range(3):
        run_epoch(m, tr_loader, optm)
        _, v = run_epoch(m, val_loader)  # keep the accuracy, drop the loss
        if v > best:
            best = v
            st = {name: tensor.cpu().clone() for name, tensor in m.state_dict().items()}
    # Put the best-epoch weights back into this model.
    if st:
        m.load_state_dict(st)
    vals[p] = best
vals

In [None]:
# Exercise 2: Augmentation variant
# TODO: Replace RandomCrop with transforms.RandomAffine(degrees=10, translate=(0.05,0.05)) and compare val acc after 3 epochs.
...

In [None]:
# Solution 2 (hidden)
# Same pipeline as the main training transforms, with RandomCrop swapped for a
# small random affine (up to 10 degrees rotation, up to 5% translation per axis).
aug2 = transforms.Compose([
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomAffine(degrees=10, translate=(0.05,0.05)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])
# Re-open the training split with the new augmentation and keep the same training indices.
train_aug2 = datasets.FashionMNIST(root, train=True, download=False, transform=aug2)
train_aug2 = Subset(train_aug2, tr_idx)
loader_aug2 = DataLoader(train_aug2, batch_size=batch_size, shuffle=True, num_workers=2)
# Fresh model/optimizer with the same hyper-parameters as the main model.
m = SmallCNN().to(device)
optm = torch.optim.Adam(m.parameters(), lr=1e-3, weight_decay=1e-4)
best=-np.inf
for _ in range(3):
    run_epoch(m, loader_aug2, optm)
    # Track the best validation accuracy over the 3 epochs.
    best = max(best, run_epoch(m, val_loader)[1])
# Cell output: best validation accuracy.
best

In [None]:
# Exercise 3 (Optional): Unfreeze part of ResNet
# TODO: Set requires_grad=True for layer4 parameters, use LR=1e-4, and train 1 epoch on the small subset; check val acc.
...

In [None]:
# Solution 3 (hidden, optional)
try:
    # Only the name lookups live in the try body, so a NameError raised anywhere
    # else (e.g. inside run_epoch) is not misreported as "section not run".
    ft_model, ft_train_loader, ft_val_loader = resnet, trL, vaL
except NameError:
    print('ResNet section was not run. Execute the optional cell above first.')
else:
    # Unfreeze the last residual block.
    for p in ft_model.layer4.parameters():
        p.requires_grad = True
    # Optimise every parameter that is currently trainable, with a very small LR.
    trainable = [p for p in ft_model.parameters() if p.requires_grad]
    opt_small = torch.optim.Adam(trainable, lr=1e-4)
    # quick 1 epoch fine-tune
    run_epoch(ft_model, ft_train_loader, opt_small)
    va_acc = run_epoch(ft_model, ft_val_loader)[1]
    # Print explicitly: Jupyter only echoes a cell's last top-level expression,
    # so a bare `va_acc` nested inside this block would display nothing.
    print(f"[ResNet fine-tune] val_acc={va_acc:.3f}")


## Wrap-up checklist
- [ ] Normalize inputs consistently across train/val/test
- [ ] Keep CNN architectures small for CPU; limit epochs
- [ ] Use augmentation and dropout to reduce overfitting
- [ ] Consider frozen backbones for transfer learning on small datasets
- [ ] Inspect errors via confusion matrix per class
