In [None]:
import os, random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.datasets import ImageFolder
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms, models
from tqdm.auto import tqdm

from typing import List, Tuple

In [None]:

# ---- Compute device ----
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# ---- Dataset ----
DATA_ROOT = "/kaggle/input/asl-alphabet/asl_alphabet_train/asl_alphabet_train"
TEST_ROOT = "/kaggle/input/asl-alphabet/asl_alphabet_test/asl_alphabet_test"
NUM_CLASSES_LETTERS = 29  # ASL alphabet
NUM_CLASSES_WORDS = 2000  # WLASL (if used) - not implemented in this notebook
IMAGE_SIZE = 224  # images are resized to IMAGE_SIZE x IMAGE_SIZE

# ---- Training / loading hyper-parameters ----
# NOTE(review): the training cell further down passes lr=3e-4 and loops over
# 15 epochs directly, so LEARNING_RATE and EPOCHS are not read there.
LEARNING_RATE = 1e-4
EPOCHS = 20
BATCH_SIZE = 64
NUM_WORKERS = 2  # worker processes per DataLoader

# Enhanced Data Augmentation

To improve generalization and prevent overfitting, we applied an enhanced data-augmentation pipeline.  
The goal of augmentation is to artificially increase dataset diversity by creating varied versions of the same image. Although our presented data set is quite diverse, this additional step prevents the model from memorizing training examples and encourages learning more robust, invariant features.

### **Transformations Used** 
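- Random affine transforms (rotations of up to ±10°, small translations, and 0.9–1.1× rescaling)  
- Color jitter (brightness, contrast, saturation, and hue variations), applied with probability 0.8  
- Resizing every image to a fixed 224×224 input  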
- Normalization with ImageNet mean and std  

### **Why This Helps**
Deep CNNs such as ResNet can easily overfit when trained on datasets with limited intra-class variation. Augmentation simulates real-world variability and forces the network to learn spatially invariant, shape-based representations rather than memorizing texture patterns.

### **Observed Improvements**
- Reduced gap between training and validation accuracy  
- Smoother training curves with less volatility  
- Higher validation accuracy due to improved generalization  


In [None]:
# ImageNet channel statistics, used to normalize inputs for both pipelines.
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]
_TARGET_SIZE = (IMAGE_SIZE, IMAGE_SIZE)

# Photometric augmentation, applied with probability 0.8.
_color_jitter = transforms.RandomApply(
    [transforms.ColorJitter(brightness=0.25, contrast=0.25, saturation=0.15, hue=0.02)],
    p=0.8,
)

# Geometric augmentation: rotation within +/-10 degrees, shifts of up to 8% of
# the image size, and rescaling between 0.9x and 1.1x.
_affine_jitter = transforms.RandomAffine(
    degrees=10,
    translate=(0.08, 0.08),
    scale=(0.9, 1.1),
)

# Training pipeline: resize -> random augmentation -> tensor -> normalize.
train_transform = transforms.Compose([
    transforms.Resize(_TARGET_SIZE),
    _color_jitter,
    _affine_jitter,
    transforms.ToTensor(),
    transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
])

# Validation pipeline: the same preprocessing with no random steps.
val_transform = transforms.Compose([
    transforms.Resize(_TARGET_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
])

## Dataset Splitting (Train / Validation)

We load the dataset twice using `ImageFolder`, applying **different transforms** for training and validation.  
This allows us to use **data augmentation** during training (e.g., color jitter and small random affine transforms) while keeping the validation data deterministic and representative of real inference conditions.

To ensure **reproducibility**, we generate a random permutation of dataset indices using a fixed random seed (`42`).  
This guarantees that the same images are assigned to the training and validation sets across runs.

We then split the dataset as follows:
- **90%** of the data is used for training  
- **10%** of the data is reserved for validation  

The split is implemented using `Subset`, which indexes into the original dataset without duplicating data.

Both subsets index into the same image folder, but they use **disjoint index sets** and **different transforms**, ensuring:
- No data leakage between training and validation
- Consistent experimental results
- Proper separation of augmented training data from clean validation data


In [None]:

# DATASET SPLITTING
# Two views over the same folder: one with the augmenting transform (training)
# and one with the deterministic transform (validation).
full_train_ds, full_val_ds = (
    ImageFolder(DATA_ROOT, transform=tfm)
    for tfm in (train_transform, val_transform)
)

val_frac = 0.1
n = len(full_train_ds)

# Reproducible split: a dedicated generator seeded with 42 yields the same
# permutation of sample indices on every run.
_split_rng = torch.Generator()
_split_rng.manual_seed(42)
indices = torch.randperm(n, generator=_split_rng).tolist()

# The first `val_size` shuffled indices go to validation, the rest to training,
# so the two index lists never overlap.
val_size = int(n * val_frac)
val_idx, train_idx = indices[:val_size], indices[val_size:]

train_ds = Subset(full_train_ds, train_idx)
val_ds = Subset(full_val_ds, val_idx)

print("Total:", n)
print("Train:", len(train_ds), "Val:", len(val_ds))
print("Classes:", len(full_train_ds.classes), full_train_ds.classes[:10])


In [None]:
# Options shared by both loaders.
# `persistent_workers` and `prefetch_factor` are only accepted by DataLoader
# when worker processes exist, so they are switched off if NUM_WORKERS is 0
# (otherwise DataLoader raises ValueError). Pinned host memory only helps
# host-to-GPU copies, so it is requested only when running on CUDA.
_has_workers = NUM_WORKERS > 0
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=(DEVICE == "cuda"),
    persistent_workers=_has_workers,
    prefetch_factor=2 if _has_workers else None,
)

# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **_loader_kwargs)

# Sanity check: pull one training batch and show its shape and dtype.
x, y = next(iter(train_loader))
print("Batch:", x.shape, y.shape, "dtype:", x.dtype)


## Model Architecture (ResNet-18)

We use **ResNet-18**, a deep convolutional neural network pretrained on the ImageNet dataset, as the backbone of our model.  
Leveraging a pretrained model allows us to benefit from learned low-level and mid-level visual features, improving convergence speed and performance—especially with limited training data.

### Model Construction
- The model is initialized with **ImageNet pretrained weights**.
- We extract the number of input features to the final fully connected layer (`model.fc.in_features`).
- The original classification head is replaced with a custom head consisting of:
  - `Dropout(p = 0.3)` to reduce overfitting
  - A `Linear` layer mapping to `num_classes`, matching the target classification task

### Why This Design?
- **Transfer learning**: Reuses robust visual features learned from large-scale data.
- **Regularization**: Dropout improves generalization.
- **Flexibility**: The function allows easy reuse for different numbers of output classes.

The resulting model is well-suited for fine-tuning on a custom image classification dataset.


In [None]:
import torch.nn as nn
from torchvision import models

# Using the resNet18 model architecture
def build_model(num_classes, *, dropout=0.3, pretrained=True):
    """Build a ResNet-18 whose classification head outputs `num_classes` logits.

    The backbone's final fully connected layer is replaced by
    ``Dropout(dropout) -> Linear(in_features, num_classes)``.

    Args:
        num_classes: Number of output classes for the new head.
        dropout: Dropout probability applied before the final linear layer.
        pretrained: When True (the default), initialize the backbone from the
            ImageNet (IMAGENET1K_V1) weights. Pass False to build the same
            architecture without fetching weights, e.g. right before loading
            a saved ``state_dict``.

    Returns:
        The modified ``torchvision`` ResNet-18 module.
    """
    weights = models.ResNet18_Weights.IMAGENET1K_V1 if pretrained else None
    model = models.resnet18(weights=weights)

    # Keep the backbone's feature width; only the head is swapped out.
    in_features = model.fc.in_features
    model.fc = nn.Sequential(
        nn.Dropout(dropout),
        nn.Linear(in_features, num_classes),
    )
    return model

## Training + Evaluation Loop (Mixed Precision)

This section sets up the full training pipeline: model initialization, loss/optimizer/scheduler configuration, and the per-epoch **train -> validate** loop.

### Loss Function
We use **Cross Entropy Loss** with **label smoothing**:
- `nn.CrossEntropyLoss(label_smoothing=0.05)`
Label smoothing slightly softens the target labels, which can reduce overconfidence and improve generalization.

### Optimizer + LR Scheduler
- **AdamW** is used for optimization (`lr=3e-4`, `weight_decay=1e-4`), which typically works well for fine-tuning CNNs.
- A **Cosine Annealing** learning rate schedule is applied over `T_max=15` epochs, gradually reducing the learning rate in a smooth cosine curve.
---

## 🔁 `train_one_epoch`
During training we:
1. Set the model to training mode with `model.train()`.
2. Iterate over batches with a `tqdm` progress bar.
3. Move inputs/labels to the target device (with `non_blocking=True` for faster transfers).
4. Zero gradients using `optimizer.zero_grad(set_to_none=True)` (more memory-efficient).
5. Forward pass + loss computation under `autocast(...)`.
6. Backprop using scaled gradients (`scaler.scale(loss).backward()`).
7. Clip gradients to stabilize training (`clip_grad_norm_`).
8. Update model weights with `scaler.step(optimizer)` and update the scaler.

We track and return the **average training loss** over the epoch.

---

## `eval_one_epoch` (Validation)
Validation is run after each epoch to measure generalization performance.

Key differences from training:
- `@torch.no_grad()` disables gradient tracking (faster + lower memory).
- `model.eval()` switches off training-only behaviors (e.g., Dropout, BatchNorm updates).
- We compute:
  - average validation loss
  - validation accuracy (`correct / total`)

Even though gradients are disabled, we still use `autocast(...)` on CUDA for faster inference.

---

## Training Loop + Checkpointing
For each epoch (1 to 15), we:
1. Train for one epoch
2. Evaluate on validation data
3. Step the LR scheduler
4. Store loss/accuracy history for plotting later

### Saving the Best Model
We track `best_val_loss` and save a checkpoint whenever validation loss improves:
- model weights (`model_state`)
- class labels (`classes`)
- image size (`image_size`)
- best validation loss and epoch

Checkpoint file:
- `asl_resnet18_best.pth`

This ensures we keep the model that generalizes best, not just the one from the final epoch.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import autocast, GradScaler  # <-- new API

# Same device selection as the configuration cell.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Mixed precision is used only on CUDA.
_amp_enabled = DEVICE == "cuda"

# Letter classifier, moved onto the selected device.
model = build_model(NUM_CLASSES_LETTERS)
model = model.to(DEVICE)

# Cross entropy with mild label smoothing.
criterion = nn.CrossEntropyLoss(label_smoothing=0.05)

# NOTE(review): lr and T_max are literals here; the LEARNING_RATE and EPOCHS
# constants from the configuration cell are not used.
optimizer = optim.AdamW(params=model.parameters(), lr=3e-4, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=15)

# Loss scaler for mixed-precision training; disabled when running on CPU.
scaler = GradScaler(enabled=_amp_enabled)

def train_one_epoch(model, loader, epoch):
    """Run one training pass over `loader` and return the mean batch loss.

    Uses the module-level `criterion`, `optimizer`, `scaler` and `DEVICE`.
    The forward pass runs under autocast when DEVICE is "cuda".

    Args:
        model: Network to train; switched to training mode here.
        loader: Iterable yielding `(inputs, labels)` batches.
        epoch: Epoch number, used only for the progress-bar label.

    Returns:
        Sum of per-batch losses divided by `len(loader)`.
    """
    model.train()
    total_loss = 0.0
    use_amp = DEVICE == "cuda"

    # The context manager closes the progress bar even if a batch raises.
    with tqdm(loader, desc=f"Epoch {epoch:02d} [train]", leave=False) as pbar:
        for x, y in pbar:
            x, y = x.to(DEVICE, non_blocking=True), y.to(DEVICE, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)

            with autocast(device_type="cuda", enabled=use_amp):
                out = model(x)
                loss = criterion(out, y)

            scaler.scale(loss).backward()

            # Gradients are still multiplied by the loss scale at this point.
            # Unscale them first so the clipping threshold of 1.0 applies to
            # the true gradient norm; scaler.step() knows they were unscaled
            # and will not unscale them a second time.
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            scaler.step(optimizer)
            scaler.update()

            batch_loss = loss.item()
            total_loss += batch_loss
            pbar.set_postfix(loss=f"{batch_loss:.4f}")

    return total_loss / len(loader)
# Validation pass, run after each training epoch. Gradient tracking is turned
# off because nothing is back-propagated here.
@torch.no_grad()
def eval_one_epoch(model, loader):
    """Evaluate `model` on `loader`; return `(mean batch loss, accuracy)`.

    Uses the module-level `criterion` and `DEVICE`. The forward pass runs
    under autocast when DEVICE is "cuda".

    Args:
        model: Network to evaluate; switched to eval mode here.
        loader: Iterable yielding `(inputs, labels)` batches.

    Returns:
        A tuple of the summed per-batch loss divided by `len(loader)` and the
        fraction of samples whose arg-max prediction equals the label.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    amp_on = DEVICE == "cuda"

    # The context manager closes the progress bar even if a batch raises.
    with tqdm(loader, desc="           [val]", leave=False) as bar:
        for inputs, targets in bar:
            inputs = inputs.to(DEVICE, non_blocking=True)
            targets = targets.to(DEVICE, non_blocking=True)

            with autocast(device_type="cuda", enabled=amp_on):
                logits = model(inputs)
                loss = criterion(logits, targets)

            batch_loss = loss.item()
            loss_sum += batch_loss

            # Predicted class = index of the largest logit per sample.
            predictions = logits.argmax(1)
            n_correct += (predictions == targets).sum().item()
            n_seen += targets.size(0)

            running_acc = n_correct / n_seen
            bar.set_postfix(loss=f"{batch_loss:.4f}", acc=f"{running_acc:.3f}")

    return loss_sum / len(loader), n_correct / n_seen



# Best validation loss seen so far, plus the per-epoch history.
best_val_loss = float("inf")
train_losses, val_losses, val_accs = [], [], []

# NOTE(review): the epoch count (15) is a literal here and must be kept equal
# to the scheduler's T_max by hand; the EPOCHS constant is not used.
for epoch in range(1, 16):
    print(f"\n=== Epoch {epoch:02d} ===")

    # Train, validate, then advance the learning-rate schedule by one epoch.
    tr_loss = train_one_epoch(model, train_loader, epoch)
    va_loss, va_acc = eval_one_epoch(model, val_loader)
    scheduler.step()

    summary = " | ".join([
        f"Epoch {epoch:02d}",
        f"train loss {tr_loss:.4f}",
        f"val loss {va_loss:.4f}",
        f"val acc {va_acc:.4f}",
    ])
    print(summary)

    train_losses.append(tr_loss)
    val_losses.append(va_loss)
    val_accs.append(va_acc)

    # Only checkpoint when validation loss strictly improves.
    if not va_loss < best_val_loss:
        continue

    best_val_loss = va_loss
    checkpoint = {
        "model_state": model.state_dict(),
        "classes": full_train_ds.classes,
        "image_size": IMAGE_SIZE,
        "best_val_loss": best_val_loss,
        "epoch": epoch,
    }
    torch.save(checkpoint, "asl_resnet18_best.pth")
    print(f"saved new best val loss: {best_val_loss:.4f}")



## Training Curves (Loss & Accuracy)

To analyze model performance over time, we visualize **training loss**, **validation loss**, and **validation accuracy** across epochs.

### Loss Curves
- **Training loss** shows how well the model fits the training data.
- **Validation loss** reflects how well the model generalizes to unseen data.
- Plotting both together helps diagnose:
  - Overfitting (training loss decreases while validation loss increases)
  - Underfitting (both losses remain high)
  - Proper convergence (both decrease and stabilize)

### Accuracy Curve
- Validation accuracy measures classification performance on unseen data.
- Tracking accuracy over epochs allows us to confirm that improvements in loss translate to better predictive performance.

### Why This Matters
Visualizing these metrics helps:
- Validate training stability
- Select the best epoch (in conjunction with checkpointing)
- Debug learning rate or regularization issues

These plots provide an intuitive summary of the model's learning behavior throughout training.


In [None]:
import matplotlib.pyplot as plt

def _show_curves(x_values, curves, y_label, title):
    """Draw each `(label, values)` pair in `curves` on a new figure and show it."""
    plt.figure()
    for label, values in curves:
        plt.plot(x_values, values, label=label)
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.show()


# One x tick per recorded epoch, starting at 1.
epochs = range(1, len(train_losses) + 1)

# ---- Loss plot ----
_show_curves(
    epochs,
    [("Train Loss", train_losses), ("Val Loss", val_losses)],
    y_label="Loss",
    title="Training vs Validation Loss",
)

# ---- Accuracy plot ----
_show_curves(
    epochs,
    [("Val Accuracy", val_accs)],
    y_label="Accuracy",
    title="Validation Accuracy",
)
