# Q2: CNNs and Transformers [6 points]
1. [2.5 points] Set up a modular codebase for training a CNN (LeNet) on the task of handwritten digit recognition.
You should have clear functional separation between the data (dataset and dataloader), model (nn.Module),
and trainer (train/test epoch loops). Implement logging: using Weights & Biases is highly recommended,
alternatively, create your own plots using other plotting libraries. Log the training and evaluation losses and
accuracies at every epoch, show the plots for at least one training and evaluation run.

    `Note` 1: Seed random numbers for reproducibility (running the notebook again should give you the same results!).

In [None]:
# Root directory for the MNIST data; meant to be passed to `load_data` as
# its `dataset_path` argument.
DATASET_PATH = "MNIST"

# Debug switch. NOTE(review): not referenced by any code visible in this
# notebook -- confirm whether it is still needed.
DEBUG = False

In [None]:
import cv2
import numpy as np
from sklearn.cluster import KMeans #, MiniBatchKMeans
from sklearn import svm
# from sklearn.model_selection import train_test_split
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import wandb
from tqdm.notebook import tqdm

In [None]:
# Seed every random number generator used in this notebook so that re-running
# it from a fresh kernel reproduces the same results.
SEED = 42

torch.manual_seed(SEED)
np.random.seed(SEED)

# cuDNN may otherwise select non-deterministic convolution algorithms on the
# GPU; force deterministic kernels and disable the auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# DataLoaders

In [None]:
def load_data(dataset_path, batch_size=128):
    """Build the MNIST train and test dataloaders.

    Args:
        dataset_path: Root directory handed to ``datasets.MNIST``; the data
            is downloaded there if it is not already present.
        batch_size: Number of samples per batch for both loaders.

    Returns:
        A ``(train_dataloader, test_dataloader)`` tuple. The training loader
        reshuffles its samples; the test loader iterates in dataset order.
    """
    transform = transforms.Compose([
        # MNIST images are 28x28; LeNet's fully connected head is sized for
        # 32x32 inputs, so the images are resized.
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        # Presumably the MNIST pixel mean / std -- TODO confirm.
        transforms.Normalize((0.1307,), (0.3081,)),
    ])

    train_dataset = datasets.MNIST(dataset_path, train=True, download=True, transform=transform)
    test_dataset = datasets.MNIST(dataset_path, train=False, download=True, transform=transform)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Sample order is irrelevant for evaluation metrics, so keep a fixed order
    # to make every evaluation pass see identical batches.
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_dataloader, test_dataloader

# LeNet

In [None]:
class LeNet(nn.Module):
    """LeNet-style CNN producing 10 class logits from single-channel images.

    Expects inputs of shape ``(N, 1, 32, 32)``. Each unpadded 5x5 convolution
    followed by a 2x2 max-pool shrinks the spatial size 32 -> 28 -> 14 and
    14 -> 10 -> 5, which yields the ``16 * 5 * 5`` features ``fc1`` consumes.
    """

    def __init__(self) -> None:
        super().__init__()
        # MNIST images are 28x28, so they must be brought to 32x32 before
        # reaching the network (the data pipeline resizes them).
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=0)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1, padding=0)

        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return raw class logits of shape ``(N, 10)`` for a batch ``x``."""
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten everything except the batch dimension. Unlike
        # ``view(-1, 16*5*5)`` this never mixes samples together: a wrongly
        # sized input fails in ``fc1`` instead of silently changing the batch.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        # No need to apply softmax as we're using CrossEntropyLoss
        return x

<!-- insert Image here -->

![LeNet5 Architecture](LeNetArchitecture.jpg)

In [None]:
# # PyTorch implementation of the LeNet-5 architecture as described in the above image table

# class LeNet5(nn.Module):
#     def __init__(self):
#         super(LeNet5, self).__init__()
#         # According to the table, using tanh activations
#         self.c1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0)  # C1: Convolutional layer
#         self.s2 = nn.AvgPool2d(kernel_size=2, stride=2)                # S2: Average pooling layer
#         self.c3 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0) # C3: Convolutional layer
#         self.s4 = nn.AvgPool2d(kernel_size=2, stride=2)                # S4: Average pooling layer
#         self.c5 = nn.Conv2d(16, 120, kernel_size=5, stride=1, padding=0) # C5: Convolutional layer, output is 1x1

#         self.f6 = nn.Linear(120, 84)                                   # F6: Fully connected layer
#         self.output = nn.Linear(84, 10)                                 # Output: Fully connected layer, 10 classes for digits 0-9

#     def forward(self, x):
#         # Implementing the forward pass with tanh activations and softmax at the output layer
#         x = torch.tanh(self.c1(x))
#         x = self.s2(x)
#         x = torch.tanh(self.c3(x))
#         x = self.s4(x)
#         x = torch.tanh(self.c5(x))

#         # Flatten the tensor for the fully connected layer
#         x = x.view(x.size(0), -1)

#         x = torch.tanh(self.f6(x))
#         x = self.output(x)
#         # No need to apply softmax as we're using CrossEntropyLoss
#         return x

# # Instantiate the model
# lenet5 = LeNet5()

# # Print the model structure
# print(lenet5)


In [None]:
def train(model, device, train_dataloader, optimizer, epoch) -> tuple:
    """Run one training epoch and log its metrics to Weights & Biases.

    Args:
        model: Network to optimise; switched to training mode.
        device: Device the batches are moved to before the forward pass.
        train_dataloader: Iterable yielding ``(data, labels)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        epoch: Epoch number, used only for logging.

    Returns:
        ``(average_loss, accuracy)`` where ``average_loss`` is the mean
        per-sample cross-entropy over the epoch and ``accuracy`` is a
        percentage in ``[0, 100]``.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()

    loss_sum = 0.0
    correct = 0
    seen = 0

    for data, labels in tqdm(train_dataloader, total=len(train_dataloader), desc="Training"):
        data, labels = data.to(device), labels.to(device)  # Move data and labels to device

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        batch_count = labels.size(0)
        # The criterion returns the batch *mean*; weight it by the batch size
        # so that dividing by the sample count below gives a true per-sample
        # average that is comparable across batch sizes.
        loss_sum += loss.item() * batch_count
        correct += (output.argmax(dim=1) == labels).sum().item()
        seen += batch_count

    denominator = max(seen, 1)  # avoid ZeroDivisionError on an empty loader
    total_loss = loss_sum / denominator
    accuracy = 100. * correct / denominator
    wandb.log({"Train Loss": total_loss, "Train Accuracy": accuracy, "Epoch": epoch})
    print(f"Train Epoch: {epoch} \t Loss: {total_loss:.4f} \t Accuracy: {correct}/{seen} ({accuracy:.0f}%)\n")

    return total_loss, accuracy


def test(model, device, test_dataloader, epoch) -> tuple:
    """Evaluate ``model`` on ``test_dataloader`` and log the metrics to wandb.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        device: Device the batches are moved to before the forward pass.
        test_dataloader: Iterable yielding ``(data, labels)`` batches.
        epoch: Epoch number, used only for logging.

    Returns:
        ``(average_loss, accuracy)`` where ``average_loss`` is the mean
        per-sample cross-entropy and ``accuracy`` is a percentage in
        ``[0, 100]``.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()

    loss_sum = 0.0
    correct = 0
    seen = 0

    # inference_mode already disables gradient tracking, so no additional
    # no_grad context is needed.
    with torch.inference_mode():
        for data, labels in tqdm(test_dataloader, total=len(test_dataloader), desc="Testing"):
            data, labels = data.to(device), labels.to(device)

            output = model(data)

            batch_count = labels.size(0)
            # The criterion returns the batch *mean*; weight it by the batch
            # size so the final division yields a per-sample average.
            loss_sum += criterion(output, labels).item() * batch_count
            correct += (output.argmax(dim=1) == labels).sum().item()
            seen += batch_count

    denominator = max(seen, 1)  # avoid ZeroDivisionError on an empty loader
    test_loss = loss_sum / denominator
    accuracy = 100. * correct / denominator
    wandb.log({"Test Loss": test_loss, "Test Accuracy": accuracy, "Epoch": epoch})
    print(f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{seen} ({accuracy:.0f}%)\n")

    return test_loss, accuracy

def get_optimizer(opt, model, learning_rate) -> optim.Optimizer:
    """Create the optimizer called ``opt`` for ``model``'s parameters.

    Args:
        opt: Optimizer name, matched case-insensitively; one of ``"adam"``,
            ``"sgd"``, ``"rmsprop"`` or ``"adagrad"``.
        model: Module whose ``parameters()`` will be optimised.
        learning_rate: Learning rate passed to the optimizer as ``lr``.

    Returns:
        The constructed ``torch.optim`` optimizer.

    Raises:
        ValueError: If ``opt`` does not name a supported optimizer.
    """
    optimizer_classes = {
        "adam": optim.Adam,
        "sgd": optim.SGD,
        "rmsprop": optim.RMSprop,
        "adagrad": optim.Adagrad,
    }
    # Normalise the name so spellings such as "Adam" or "RMSprop" are accepted.
    key = str(opt).strip().lower()
    try:
        optimizer_class = optimizer_classes[key]
    except KeyError:
        raise ValueError(f"Invalid optimizer: {opt!r}") from None
    return optimizer_class(model.parameters(), lr=learning_rate)

def plot_metrics(train_loss_list, test_loss_list, train_accuracy_list, test_accuracy_list, title):
    """Show a 2x2 figure with the loss and accuracy curves of one run.

    The top row holds the train and test loss, the bottom row the train and
    test accuracy. Each series is plotted against its index, and ``title``
    is used as the figure's super-title.
    """
    figure, axes = plt.subplots(2, 2, figsize=(15, 10))
    figure.suptitle(title, fontsize=16)

    # (grid position, series, caption used for title and legend, y-axis label)
    panels = (
        ((0, 0), train_loss_list, "Train Loss", "Loss"),
        ((0, 1), test_loss_list, "Test Loss", "Loss"),
        ((1, 0), train_accuracy_list, "Train Accuracy", "Accuracy"),
        ((1, 1), test_accuracy_list, "Test Accuracy", "Accuracy"),
    )

    for position, series, caption, y_label in panels:
        panel = axes[position]
        panel.plot(series, label=caption)
        panel.set_title(caption)
        panel.set_xlabel("Epochs")
        panel.set_ylabel(y_label)
        panel.legend()

    plt.show()

# def main():

#     epochs = 10
#     learning_rate = 0.001
#     batch_size = 64
#     optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#     train_dataloader, test_loader = load_data(dataset_path=DATASET_PATH, batch_size=batch_size)

#     model = LeNet().to(device)



#     wandb.init(project="lenet-mnist",     config={
#         "learning_rate": learning_rate,
#         "architecture": "CNN-Lenet5",
#         "dataset": "MNIST",
#         "Optimizer": type(optimizer).__name__,
#         "batch_size": batch_size,
#         "epochs": epochs
#         })

#     for epoch in range(1, epochs+1):
#         wandb.config.step = epoch
#         train(model, device, train_dataloader, optimizer, epoch)
#         test(model, device, test_loader, epoch)
# main()


In [None]:
def main(epochs_list, learning_rates, batch_sizes, optimizers):
    """Train and evaluate LeNet for every hyperparameter combination.

    For each ``(epochs, lr, batch_size, optimizer)`` combination a fresh
    model is trained, logged as a separate wandb run, and its per-epoch
    train/test curves are plotted once the run has finished.

    Args:
        epochs_list: Epoch counts to try.
        learning_rates: Learning rates to try.
        batch_sizes: Batch sizes to try.
        optimizers: Optimizer names understood by ``get_optimizer``.
    """
    from itertools import product

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # product() iterates in the same order as the equivalent nested loops
    # (epochs outermost, optimizer innermost).
    for epochs, lr, batch_size, opt in product(epochs_list, learning_rates, batch_sizes, optimizers):
        # Use the notebook-wide dataset location instead of a placeholder path.
        train_dataloader, test_loader = load_data(dataset_path=DATASET_PATH, batch_size=batch_size)
        model = LeNet().to(device)

        optimizer = get_optimizer(opt, model, lr)

        # Initialize wandb run
        wandb.init(project="lenet-mnist", config={
            "learning_rate": lr,
            "architecture": "CNN-LeNet5",
            "dataset": "MNIST",
            "Optimizer": opt,
            "batch_size": batch_size,
            "epochs": epochs
        })
        train_loss_list = []
        train_accuracy_list = []
        test_loss_list = []
        test_accuracy_list = []

        try:
            for epoch in range(1, epochs + 1):
                train_loss, train_accuracy = train(model, device, train_dataloader, optimizer, epoch)
                test_loss, test_accuracy = test(model, device, test_loader, epoch)

                train_loss_list.append(train_loss)
                train_accuracy_list.append(train_accuracy)
                test_loss_list.append(test_loss)
                test_accuracy_list.append(test_accuracy)
        finally:
            # Always close the wandb run, even if an epoch raised, so the
            # next configuration does not log into a stale run.
            wandb.finish()

        plot_metrics(train_loss_list, test_loss_list, train_accuracy_list, test_accuracy_list, f"LeNet-5 with {opt} optimizer, lr={lr}, batch_size={batch_size}, epochs={epochs}")

# Hyperparameter grid: every combination is trained (4 * 3 * 3 * 4 = 144 runs).
epochs = [5, 10, 15, 20]
learning_rates = [0.001, 0.0005, 0.0001]
batch_sizes = [32, 64, 128]
# Names are lowercase to match the keys that get_optimizer compares against.
optimizers = ['adam', 'sgd', 'rmsprop', 'adagrad']

main(epochs, learning_rates, batch_sizes, optimizers)


# Wandb Published Report

https://api.wandb.ai/links/prakhar-jain/689hnt5x

---



2. [1 point] Show the results for 6 different settings of hyperparameters. You may want to change the batch size,
learning rate, and optimizer. Explain the trends in classification accuracy that you observe. Which
hyperparameters are most important?


*`Answer`: In decreasing order of importance, the hyperparameters are the optimizer, the batch size, and the learning rate, as shown by the `Parameter Importance Panel` in the wandb report linked above.*

---

3. [0.5 points] Compare the best performing CNN (from above) against the SIFT-BoVW-SVM approach. Explain
the differences.


*`Answer` : The best performing CNN model has `batch_size = 32`, `epochs = 10`, `learning_rate = 0.0005`, `optimizer = rmsprop`*

- **Feature Extraction**: *CNNs automatically learn hierarchical features directly from raw images, while `SIFT-BoVW-SVM` relies on manually engineered features (`SIFT`) and a quantization step (`BoVW`) to create image descriptors for SVM classification.*

- **Computational Complexity**: *`CNNs` require substantial computational resources for training but offer fast inference once trained. `SIFT-BoVW-SVM` has significant upfront computational costs and very large training times (because of `KMeans`) for feature extraction and quantization but generally involves simpler models (`SVM`) with fewer parameters.*

- **Training Process**: *`CNN` training involves multiple epochs of forward and backward passes to adjust weights, highly dependent on hyperparameters. In contrast, `SIFT-BoVW-SVM` focuses on extracting and quantizing features before training a potentially simpler `SVM` model.*

- **Application Suitability**: *`CNNs` excel in complex image classification tasks with large datasets, benefiting from deep learning advancements. `SIFT-BoVW-SVM` can be effective in scenarios where specific, interpretable features are crucial or when dealing with smaller datasets.*

---

4. [0.5 points] How does the performance change if you double the number of convolutional layers?

---

5. [0.5 points] How does the performance change as you increase the number of training samples: [0.6K, 1.8K, 6K,
18K, 60K]? Explain the trends in classification accuracy that you observe.

    `Note` 1: Make sure that all classes are represented equally within different subsets of the training sets.


---

6. [1 point] Replace the CNN model with a 2 layer TransformerEncoder. Using a ViT style prediction scheme,
evaluate classification accuracy when training with 6K and 60K images. How do the results compare against
CNNs? Explain the trends.