### Exercise 1)
L1:

$L=\dfrac{1}{2}(Y-y)^2+\lambda\sum_{i} |W_i|$

L2:

$L=\dfrac{1}{2}(Y-y)^2+\lambda\sum_{i} W_i^2$

L1 penalizes the absolute value of each weight, so it pushes many weights to exactly zero, thus making the weights sparse.

L2 encourages smaller and more distributed weights across all features, reducing the impact of any individual feature. L2 doesn't make the weights exactly zero, but rather very small and close to it.

### Exercise 2)
In this part, you will implement a simple multi-layer perceptron neural network using PyTorch
to solve a clothing classification problem. You have to work with the Fashion MNIST dataset,
which consists of 10 classes with 60,000 examples in the training set and 10,000 examples in
the test set.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms

### data preparation:

In [2]:
# Preprocessing: convert images to tensors, then normalize with mean 0.5 and std 0.5
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

# Fashion MNIST train / test splits (root directory ./data, download enabled)
train_dataset = datasets.FashionMNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)
test_dataset = datasets.FashionMNIST(
    root='./data',
    train=False,
    transform=transform,
    download=True,
)

# Mini-batch loaders; only the training loader shuffles its data
batch_size = 64
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

### model definition:

In [3]:
class MLP(nn.Module):
    """Fully connected network: Linear layers with a ReLU between consecutive layers.

    No activation is applied after the last Linear layer, so the network
    returns raw scores of size ``output_size``.

    Args:
        input_size: Number of features per example after flattening.
        output_size: Number of output units.
        hidden_sizes: Iterable with the width of each hidden layer, in order.
            Any iterable is accepted (list, tuple, ...); an empty one yields a
            single Linear layer from input to output.
    """

    def __init__(self, input_size, output_size, hidden_sizes):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        # Materialize once so tuples and other iterables work, and so the
        # caller's list is not aliased by the module.
        self.hidden_sizes = list(hidden_sizes)

        # nn.ModuleList registers every contained module, so their parameters
        # are visible to model.parameters() and therefore to the optimizer.
        self.layers = nn.ModuleList()
        sizes = [input_size, *self.hidden_sizes, output_size]
        last_linear = len(sizes) - 2
        for i, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
            self.layers.append(nn.Linear(n_in, n_out))
            # ReLU after every Linear layer except the output layer
            if i != last_linear:
                self.layers.append(nn.ReLU())

    def forward(self, x):
        """Flatten every example to one feature vector and apply the layers in order.

        Args:
            x: Tensor whose first dimension is the batch dimension.

        Returns:
            Tensor of shape (batch, output_size).
        """
        # reshape (unlike view) also works when x is not contiguous in memory
        x = x.reshape(x.size(0), -1)
        for layer in self.layers:
            x = layer(x)
        return x

# Build the network: flattened 28x28 inputs, 10 output scores
input_size = 28 * 28
output_size = 10
hidden_sizes = [256]  # one entry per hidden layer; vary the sizes here

model = MLP(
    input_size=input_size,
    output_size=output_size,
    hidden_sizes=hidden_sizes,
)
print(model)

MLP(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=10, bias=True)
  )
)


### training loop:

In [4]:
# Define the loss function and optimizer
criterion = nn.MSELoss()
learning_rate = 0.01
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training loop
num_epochs = 10

# Row k of the identity matrix is the one-hot vector of class k. Build the
# matrix once instead of on every batch; indexing it with the label tensor
# yields a one-hot target of shape (batch_size, num_classes) for the MSE loss.
num_classes = 10
one_hot = torch.eye(num_classes)

# Put the model in training mode explicitly, so re-running this cell after an
# evaluation cell (which calls model.eval()) still trains in training mode.
model.train()

for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, one_hot[targets])

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print progress: loss of the last mini-batch of the epoch
        if (batch_idx + 1) == len(train_loader):
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

Epoch [1/10], Step [938/938], Loss: 0.0433
Epoch [2/10], Step [938/938], Loss: 0.0318
Epoch [3/10], Step [938/938], Loss: 0.0282
Epoch [4/10], Step [938/938], Loss: 0.0345
Epoch [5/10], Step [938/938], Loss: 0.0302
Epoch [6/10], Step [938/938], Loss: 0.0212
Epoch [7/10], Step [938/938], Loss: 0.0306
Epoch [8/10], Step [938/938], Loss: 0.0312
Epoch [9/10], Step [938/938], Loss: 0.0306
Epoch [10/10], Step [938/938], Loss: 0.0252


### evaluation:

In [5]:
# Evaluation
model.eval()
correct = 0
total = 0

# torch.no_grad() disables gradient calculation; gradients are not needed
# during evaluation, so this speeds it up and reduces memory consumption.
with torch.no_grad():
    for data, targets in test_loader:
        outputs = model(data)
        # torch.max over dim 1 returns (values, indices); the index of the
        # highest score in each row is the predicted class label.
        predicted = torch.max(outputs, dim=1).indices
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')

Test Accuracy: 82.19%


We ran the experiment with 1 hidden layer and achieved a test accuracy of 82.19 percent. Now, we are going to increase the number of hidden layers to study the effect of depth:

In [6]:
# Initialize the model
input_size = 28 * 28
output_size = 10
hidden_sizes = [256, 256]  # Vary the hidden layer sizes here

model = MLP(input_size, output_size, hidden_sizes)
print(model)

# Define the loss function and optimizer
criterion = nn.MSELoss()
learning_rate = 0.01
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training loop
num_epochs = 10

# Row k of the identity matrix is the one-hot vector of class k. Build the
# matrix once instead of on every batch; indexing it with the label tensor
# yields a one-hot target of shape (batch_size, output_size) for the MSE loss.
one_hot = torch.eye(output_size)

# Explicit training mode (mirrors the model.eval() call before evaluation)
model.train()

for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, one_hot[targets])

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print progress: loss of the last mini-batch of the epoch
        if (batch_idx + 1) == len(train_loader):
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

# Evaluation
model.eval()
correct = 0
total = 0

# torch.no_grad() disables gradient calculation; gradients are not needed
# during evaluation, so this speeds it up and reduces memory consumption.
with torch.no_grad():
    for data, targets in test_loader:
        outputs = model(data)
        # torch.max over dim 1 returns (values, indices); the index of the
        # highest score in each row is the predicted class label.
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')

MLP(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=10, bias=True)
  )
)
Epoch [1/10], Step [938/938], Loss: 0.0542
Epoch [2/10], Step [938/938], Loss: 0.0530
Epoch [3/10], Step [938/938], Loss: 0.0418
Epoch [4/10], Step [938/938], Loss: 0.0517
Epoch [5/10], Step [938/938], Loss: 0.0446
Epoch [6/10], Step [938/938], Loss: 0.0342
Epoch [7/10], Step [938/938], Loss: 0.0387
Epoch [8/10], Step [938/938], Loss: 0.0306
Epoch [9/10], Step [938/938], Loss: 0.0283
Epoch [10/10], Step [938/938], Loss: 0.0236
Test Accuracy: 80.53%


In [7]:
# Initialize the model
input_size = 28 * 28
output_size = 10
hidden_sizes = [256, 256, 256]  # Vary the hidden layer sizes here

model = MLP(input_size, output_size, hidden_sizes)
print(model)

# Define the loss function and optimizer
criterion = nn.MSELoss()
learning_rate = 0.01
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training loop
num_epochs = 10

# Row k of the identity matrix is the one-hot vector of class k. Build the
# matrix once instead of on every batch; indexing it with the label tensor
# yields a one-hot target of shape (batch_size, output_size) for the MSE loss.
one_hot = torch.eye(output_size)

# Explicit training mode (mirrors the model.eval() call before evaluation)
model.train()

for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, one_hot[targets])

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print progress: loss of the last mini-batch of the epoch
        if (batch_idx + 1) == len(train_loader):
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

# Evaluation
model.eval()
correct = 0
total = 0

# torch.no_grad() disables gradient calculation; gradients are not needed
# during evaluation, so this speeds it up and reduces memory consumption.
with torch.no_grad():
    for data, targets in test_loader:
        outputs = model(data)
        # torch.max over dim 1 returns (values, indices); the index of the
        # highest score in each row is the predicted class label.
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')

MLP(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=256, bias=True)
    (5): ReLU()
    (6): Linear(in_features=256, out_features=10, bias=True)
  )
)
Epoch [1/10], Step [938/938], Loss: 0.0664
Epoch [2/10], Step [938/938], Loss: 0.0527
Epoch [3/10], Step [938/938], Loss: 0.0542
Epoch [4/10], Step [938/938], Loss: 0.0482
Epoch [5/10], Step [938/938], Loss: 0.0382
Epoch [6/10], Step [938/938], Loss: 0.0367
Epoch [7/10], Step [938/938], Loss: 0.0381
Epoch [8/10], Step [938/938], Loss: 0.0406
Epoch [9/10], Step [938/938], Loss: 0.0313
Epoch [10/10], Step [938/938], Loss: 0.0234
Test Accuracy: 78.92%


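We can conclude that, in this setup, the more hidden layers we add (i.e. the deeper the network), the less accurate our model gets (82.19% → 80.53% → 78.92% test accuracy). This can be due to vanishing/exploding gradients, or simply to the fact that a deeper network has more weights, which are harder to train in the same number of epochs and therefore produce more error.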

### dropout:

In [9]:
class MLP_d(nn.Module):
    """Fully connected network with ReLU and Dropout after every hidden Linear layer.

    No activation or dropout is applied after the last Linear layer.

    Args:
        sizes: Iterable of layer widths in order: input size, hidden sizes...,
            output size. Consecutive pairs define the Linear layers.
        dropout_rate: Probability ``p`` passed to every nn.Dropout layer.
    """

    def __init__(self, sizes, dropout_rate):
        super().__init__()
        # Materialize once so any iterable (not only indexable sequences) works
        sizes = list(sizes)
        self.layers = nn.ModuleList()
        last_linear = len(sizes) - 2
        for i, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
            self.layers.append(nn.Linear(n_in, n_out))
            # Hidden layers get ReLU followed by Dropout; the output layer gets neither
            if i != last_linear:
                self.layers.append(nn.ReLU())
                self.layers.append(nn.Dropout(p=dropout_rate))

    def forward(self, x):
        """Flatten every example to one feature vector and apply the layers in order.

        Args:
            x: Tensor whose first dimension is the batch dimension.

        Returns:
            Tensor of shape (batch, sizes[-1]).
        """
        # reshape (unlike view) also works when x is not contiguous in memory
        x = x.reshape(x.size(0), -1)
        for layer in self.layers:
            x = layer(x)
        return x

In [12]:
# Initialize the model
sizes = [28 * 28, 256, 10]

model = MLP_d(sizes = sizes, dropout_rate = 0.4)
print(model)

# Define the loss function and optimizer
criterion = nn.MSELoss()
learning_rate = 0.01
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training loop
num_epochs = 10

# Row k of the identity matrix is the one-hot vector of class k. Build the
# matrix once instead of on every batch; indexing it with the label tensor
# yields a one-hot target of shape (batch_size, num_classes) for the MSE loss.
one_hot = torch.eye(sizes[-1])

# Dropout is only active in training mode, so select it explicitly
model.train()

for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, one_hot[targets])

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print progress: loss of the last mini-batch of the epoch
        if (batch_idx + 1) == len(train_loader):
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

# Evaluation (eval mode turns dropout off)
model.eval()
correct = 0
total = 0

# torch.no_grad() disables gradient calculation; gradients are not needed
# during evaluation, so this speeds it up and reduces memory consumption.
with torch.no_grad():
    for data, targets in test_loader:
        outputs = model(data)
        # torch.max over dim 1 returns (values, indices); the index of the
        # highest score in each row is the predicted class label.
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')

MLP_d(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.4, inplace=False)
    (3): Linear(in_features=256, out_features=10, bias=True)
  )
)
Epoch [1/10], Step [938/938], Loss: 0.0561
Epoch [2/10], Step [938/938], Loss: 0.0451
Epoch [3/10], Step [938/938], Loss: 0.0383
Epoch [4/10], Step [938/938], Loss: 0.0431
Epoch [5/10], Step [938/938], Loss: 0.0546
Epoch [6/10], Step [938/938], Loss: 0.0424
Epoch [7/10], Step [938/938], Loss: 0.0436
Epoch [8/10], Step [938/938], Loss: 0.0349
Epoch [9/10], Step [938/938], Loss: 0.0275
Epoch [10/10], Step [938/938], Loss: 0.0339
Test Accuracy: 80.98%


### early stopping:

In [None]:
num_epochs = 50
early_stopping_patience = 10
best_loss = float('inf')
best_model = None
epochs_without_improvement = 0

# Hold out part of the training set for validation so that the test set is
# never used to decide when to stop. The split is seeded to be reproducible.
validation_size = 10_000
split_generator = torch.Generator().manual_seed(0)
train_subset, validation_subset = torch.utils.data.random_split(
    train_dataset,
    [len(train_dataset) - validation_size, validation_size],
    generator=split_generator,
)
early_stop_train_loader = torch.utils.data.DataLoader(
    train_subset, batch_size=batch_size, shuffle=True
)
validation_loader = torch.utils.data.DataLoader(
    validation_subset, batch_size=batch_size, shuffle=False
)

# Start from a fresh model and optimizer: the model left over from the dropout
# experiment was trained on the full training set, including the examples that
# are now held out, which would make the validation loss meaningless.
model = MLP_d(sizes=sizes, dropout_rate=0.4)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Row k of the identity matrix is the one-hot vector of class k
one_hot = torch.eye(sizes[-1])

# Training loop
for epoch in range(num_epochs):
    # Re-enable training mode every epoch: the validation pass below switches
    # the model to eval mode, which would otherwise leave dropout disabled.
    model.train()
    for inputs, labels in early_stop_train_loader:
        # Forward pass
        outputs = model(inputs)

        # Compute loss
        loss = criterion(outputs, one_hot[labels])

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation: mean of the per-batch losses, using the same one-hot targets
    # as training so the two losses are comparable
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_inputs, val_labels in validation_loader:
            val_outputs = model(val_inputs)
            val_loss += criterion(val_outputs, one_hot[val_labels]).item()
    val_loss /= len(validation_loader)

    # Check if validation loss has improved
    if val_loss < best_loss:
        best_loss = val_loss
        # state_dict() returns references to the live parameters; clone them so
        # later optimizer steps do not overwrite the saved snapshot.
        best_model = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

    # Check if early stopping criteria is met
    if epochs_without_improvement >= early_stopping_patience:
        print(f"Early stopping triggered! No improvement in {early_stopping_patience} epochs.")
        break

    # Print progress (training loss of the last mini-batch of the epoch)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Load the best model (skipped if no epoch produced a finite improvement)
if best_model is not None:
    model.load_state_dict(best_model)