In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms
from sklearn.model_selection import train_test_split
import numpy as np

seed = 42
torch.manual_seed(seed)

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

data = np.load("permuted_mnist.npz")
permuted_x_train = data["train_images"]
y_train = data["train_labels"]
permuted_x_test = data["test_images"]
y_test = data["test_labels"]

train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(permuted_x_train), torch.from_numpy(y_train))
test_dataset = torch.utils.data.TensorDataset(torch.from_numpy(permuted_x_test), torch.from_numpy(y_test))

# train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
# test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True)

train_dataset, val_dataset = train_test_split(train_dataset, test_size=0.2, random_state=seed)

batch_size = 64
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms

class MLPModel(nn.Module):
    def __init__(self, input_size, hidden_sizes, num_classes):
        super(MLPModel, self).__init__()
        layers = []
        layers.append(nn.Flatten())
        for i in range(len(hidden_sizes)):
            layers.append(nn.Linear(input_size if i == 0 else hidden_sizes[i - 1], hidden_sizes[i]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_sizes[-1], num_classes))
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x)

input_size = 28 * 28  # Input size is the flattened image
hidden_sizes = [128, 64]  # Example hidden layer sizes
output_size = 10  # Number of classes (digits)

model = MLPModel(input_size, hidden_sizes, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for images, labels in train_loader:
        optimizer.zero_grad()
        images = images.view(images.size(0), -1)
        # Convert the images tensor from Byte to Float
        images = images.float()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

In [8]:
def evaluate(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.view(images.size(0), -1)
            images = images.float()
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = correct / total
    return accuracy

train_accuracy = evaluate(model, train_loader)
print(f"Training Accuracy: {train_accuracy:.2f}")

val_accuracy = evaluate(model, val_loader)
print(f"Validation Accuracy: {val_accuracy:.2f}")

Training Accuracy: 0.99
Validation Accuracy: 0.97


In [10]:
hidden_layers_list = [1, 2, 3]  # Vary the number of hidden layers
neurons_list = [64, 128, 256]  # Vary the number of neurons in each layer

best_accuracy = 0
best_model = None

for num_hidden_layers in hidden_layers_list:
    for num_neurons in neurons_list:
        # Create the model with the current hyperparameters
        hidden_sizes = [num_neurons] * num_hidden_layers
        model = MLPModel(input_size, hidden_sizes, output_size)
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        # Train the model
        for epoch in range(num_epochs):
            model.train()
            for images, labels in train_loader:
                optimizer.zero_grad()
                images = images.view(images.size(0), -1)
                images = images.float()
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

        # Evaluate on the validation set
        val_accuracy = evaluate(model, val_loader)

        # Check if the current model is the best
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_model = model

In [11]:
# Report the best model's hyperparameters
print(f"Best Model: {best_model}")
print(f"Best Hyperparameters - Hidden Layers: {num_hidden_layers}, Neurons: {num_neurons}")

# Evaluate the best model on the test set
test_accuracy = evaluate(best_model, test_loader)
print(f"Test Accuracy of the Best Model: {test_accuracy:.2f}")

Best Model: MLPModel(
  (model): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=784, out_features=256, bias=True)
    (2): ReLU()
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): ReLU()
    (5): Linear(in_features=256, out_features=256, bias=True)
    (6): ReLU()
    (7): Linear(in_features=256, out_features=10, bias=True)
  )
)
Best Hyperparameters - Hidden Layers: 3, Neurons: 256
Test Accuracy of the Best Model: 0.97


In [13]:
# Evaluate on the test set
test_accuracy = evaluate(model, test_loader)
print(f"Test Accuracy: {test_accuracy:.2f}")

Test Accuracy: 0.97


### CNN ON PERMUTED_MNIST

In [44]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms
from sklearn.model_selection import train_test_split

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.pool(nn.functional.relu(self.conv1(x)))
        x = self.pool(nn.functional.relu(self.conv2(x)))
        x = x.view(-1, 64 * 7 * 7)
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x
    
    def setKernelsize(self, kernelsize):
        self.kernelsize = kernelsize
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=kernelsize, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=kernelsize, stride=1, padding=1)

    def setStride(self, stride):
        self.stride = stride
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=self.kernelsize, stride=stride, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=self.kernelsize, stride=stride, padding=1)

    def setDropout(self, dropout):
        self.dropout = nn.Dropout(dropout)

Model = CNN()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(Model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    Model.train()
    for images, labels in train_loader:
        optimizer.zero_grad()
        images = images.float()
        outputs = Model(images.unsqueeze(1))  # Add channel dimension
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

Model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in train_loader:
        images = images.float()
        outputs = Model(images.unsqueeze(1))
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

train_accuracy = 100 * correct / total
print(f"Training Accuracy: {train_accuracy:.2f}%")

Training Accuracy: 98.66%


In [46]:
def evaluate(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.float()
            outputs = model(images.unsqueeze(1))
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = correct / total
    accuracy *= 100.0
    return accuracy

val_accuracy = evaluate(Model, val_loader)
print(f"Validation Accuracy: {val_accuracy:.2f}")

test_accuracy = evaluate(Model, test_loader)
print(f"Test Accuracy: {test_accuracy:.2f}")

Validation Accuracy: 94.70
Test Accuracy: 95.13


In [49]:
# perform hyperparameter tuning by varying the kernel size, dropout rate and learning rate in a few tuples
# kernel size, dropout rate, learning rate

hyperparameter_tuples = [(3, 0.3, 0.001), (3, 0.5, 0.0005), (3, 0.7, 0.001), (5, 0.3, 0.001), (7, 0.5, 0.0005)]

best_accuracy = 0
best_model = None

for kernelsize, dropout, learning_rate in hyperparameter_tuples:
    Model = CNN()
    Model.setKernelsize(kernelsize)
    Model.setDropout(dropout)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(Model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        Model.train()
        for images, labels in train_loader:
            optimizer.zero_grad()
            images = images.float()
            outputs = Model(images.unsqueeze(1))
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    val_accuracy = evaluate(Model, val_loader)
    print("Validation Accuracy: ", val_accuracy)
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_model = Model

Validation Accuracy:  94.27499999999999
Validation Accuracy:  95.6
Validation Accuracy:  95.15833333333333


In [55]:
print("Best Accuracy (validation dataset): ", best_accuracy)
print("Best Model: ", best_model)

Best Accuracy (validation dataset):  95.6
Best Model:  CNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


## Analysis:

Detailed analysis in Part-5 Report.

1. Observed Differences and Challenges:

Training and Evaluation Times: CNNs are generally more computationally intensive during training due to their convolutional layers. This results in longer training times compared to MLPs.

Data Augmentation: For both datasets, data augmentation techniques like rotation, scaling, and translation can significantly benefit CNNs by increasing the variety of learned features. MLPs may not benefit as much from data augmentation.

Overfitting: CNNs have more parameters and are prone to overfitting when there's limited data. Overfitting can be mitigated with techniques like dropout and weight decay. MLPs, having fewer parameters, might be less prone to overfitting.

1. Potential for Overfitting and Continual Learning:

Overfitting in CNN: CNNs have a higher potential for overfitting, especially on smaller datasets, due to their larger number of learnable parameters. Regularization techniques like dropout and weight decay are essential to prevent overfitting.

Overfitting in MLP: While MLPs are less prone to overfitting due to fewer parameters, they can still overfit, particularly when the model is too complex or the dataset is small. Regularization is also useful for MLPs.

Continual Learning: Continual learning, or lifelong learning, is the ability of a model to learn from a sequence of tasks without forgetting what it learned previously. Both CNNs and MLPs can suffer from catastrophic forgetting when not specifically designed for continual learning. Techniques like Elastic Weight Consolidation (EWC) and Progressive Neural Networks (PNN) can be applied to mitigate this issue.

In summary, CNNs outperform MLPs on image datasets like MN2IST due to their ability to capture spatial features efficiently. However, for the Permuted MNIST dataset, both models face challenges in recognizing digits without spatial information. Continual learning techniques are crucial when dealing with a sequence of tasks, ensuring that previously learned knowledge is retained while adapting to new tasks. Proper regularization and data augmentation are key factors in preventing overfitting in both CNNs and MLPs. The choice of model depends on the nature of the dataset and the specific task requirements.