# Optimizer

## Initialize the dataset

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import tqdm

def load_mnist_data(root_path='./data', batch_size=4):
    """Build the MNIST train and test data loaders.

    Both splits are requested with ``download=True`` under ``root_path`` and
    share the same transform: conversion to a tensor followed by
    normalization with mean 0.5 and standard deviation 0.5.

    Args:
        root_path: Directory passed to the dataset as its ``root``.
        batch_size: Number of samples per batch for both loaders.

    Returns:
        A ``(trainloader, testloader)`` tuple. Only the train loader
        shuffles its samples.
    """
    # Mean/std are given as one-element tuples (one entry per channel).
    # Note that ``(0.5)`` without the trailing comma is just the float 0.5.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])

    def _make_loader(train):
        # The train split is shuffled, the test split keeps its order.
        dataset = torchvision.datasets.MNIST(
            root=root_path, train=train, download=True, transform=transform
        )
        return torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, shuffle=train, num_workers=2
        )

    trainloader = _make_loader(train=True)
    testloader = _make_loader(train=False)

    return trainloader, testloader

## Define the neural network structure

In [None]:
import torch.nn as nn

# The layers are listed in forward order and then unpacked into nn.Sequential.
layers = [
    # input layer (do not change the in_features size of this layer - we need it later)
    nn.Linear(784, 32),
    # nn.ReLU(),
    nn.Linear(32, 32),
    # nn.ReLU(),
    # your layers
    # you can change the in_features of this layer, but keep the out_features
    # at size 10 here - we need it later
    nn.Linear(32, 10),
]
model = nn.Sequential(*layers)

## Training loop

In [None]:
from typing import Callable
from torch.optim import Optimizer

def train_model(
    model: nn.Module, loss_fn: Callable, optimizer: Optimizer,
    batch_size: int = 4, epochs: int = 10
):
    """Train ``model`` on the MNIST training split and record a loss curve.

    Each batch of images is flattened to ``(batch, -1)`` before it is fed to
    the model. After every epoch the mean batch loss and the accuracy over
    all samples seen in that epoch are printed.

    Args:
        model: Network to train; it is moved to CUDA when available,
            otherwise to the CPU.
        loss_fn: Called as ``loss_fn(outputs, targets)``; the result must
            support ``backward()`` and ``item()``.
        optimizer: Optimizer used for ``zero_grad()`` / ``step()``. It must
            already be constructed over the parameters of ``model``.
        batch_size: Batch size forwarded to ``load_mnist_data``.
        epochs: Number of passes over the training data.

    Returns:
        A list holding the loss of every 50th training iteration, where
        iterations are counted continuously across epochs.
    """
    # we only consider the mnist train data for this example
    train_loader, _ = load_mnist_data(batch_size=batch_size)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device=device)
    criterion = loss_fn

    losses = []
    iteration = 0

    for epoch in range(epochs):
        running_loss = 0.0
        # Accuracy is tracked as correct/seen sample counts so that a smaller
        # final batch is weighted by its real size instead of counting as
        # much as a full batch.
        correct = 0
        seen = 0
        for imgs, targets in tqdm.tqdm(train_loader, desc=f'Training iteration {epoch + 1}'):
            iteration += 1
            imgs, targets = imgs.to(device=device), targets.to(device=device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(imgs.reshape(imgs.shape[0], -1))
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Read the scalar once; it is reused for both statistics below.
            batch_loss = loss.item()
            running_loss += batch_loss

            if iteration % 50 == 0:
                losses.append(batch_loss)

            # Accuracy: how many of all samples are correctly classified?
            predictions = torch.max(outputs.detach(), dim=1).indices
            matches = predictions == targets
            correct += matches.sum().item()
            seen += matches.numel()

        # Guard the divisions so an empty loader does not raise.
        mean_loss = running_loss / max(len(train_loader), 1)
        epoch_accuracy = correct / max(seen, 1)
        print(f'Epoch {epoch + 1} finished with loss: {mean_loss:.3f} and accuracy {epoch_accuracy:.3f}')

    return losses

## Define the optimizers

In [None]:
from torch.optim import SGD, Adagrad, RMSprop, Adam

# Your code here
# Learning rate shared by every optimizer created below.
learning_rate = 0.01


def get_optimizers(model):
    """Create one instance of each compared optimizer for ``model``.

    Every optimizer is built over ``model.parameters()`` with the
    module-level ``learning_rate``, so all of them reference the same
    parameters of the same model.

    Args:
        model: Module whose parameters the optimizers should update.

    Returns:
        A dict mapping the display name to the optimizer instance, in the
        order SGD, SGD with Momentum, Adagrad, RMSprop, Adam.
    """
    # (display name, optimizer class, extra keyword arguments)
    specs = (
        ('SGD', SGD, {}),
        ('SGD with Momentum', SGD, {'momentum': 0.9}),
        ('Adagrad', Adagrad, {}),
        ('RMSprop', RMSprop, {}),
        ('Adam', Adam, {}),
    )
    return {
        label: optimizer_cls(model.parameters(), lr=learning_rate, **extra)
        for label, optimizer_cls, extra in specs
    }

## Run the training loop with different optimizers

In [None]:
# Store the average loss of every 50th iteration in some iterable structure, e.g. dictionaries
# Create a mapping between the optimizer and the loss, so that you know which losses were achieved for which optimizer
# Update the method train_model accordingly to save the losses

# Update the 'train_model' method above and run the method with different optimizers


# Your code here

import copy

# Settings shared by every training run.
loss_fn = nn.CrossEntropyLoss()
batch_size = 64
epochs = 5

# Maps the optimizer name to the list of losses returned by train_model.
optimizer_losses = {}

# Template network: every run trains its own deep copy, so all optimizers
# start from the same initial weights.
original_model = nn.Sequential(
    nn.Linear(784, 32),
    nn.Linear(32, 32),
    nn.Linear(32, 10)
)

# Factories instead of ready-made instances, because each optimizer has to be
# bound to the parameters of the model copy it is going to train.
optimizer_factories = {
    'SGD': lambda m: SGD(m.parameters(), lr=learning_rate),
    'SGD with Momentum': lambda m: SGD(m.parameters(), lr=learning_rate, momentum=0.9),
    'Adagrad': lambda m: Adagrad(m.parameters(), lr=learning_rate),
    'RMSprop': lambda m: RMSprop(m.parameters(), lr=learning_rate),
    'Adam': lambda m: Adam(m.parameters(), lr=learning_rate),
}

separator = '=' * 50
run_settings = dict(loss_fn=loss_fn, batch_size=batch_size, epochs=epochs)

for name, optimizer_fn in optimizer_factories.items():
    print(f"\n{separator}")
    print(f"Training with {name}")
    print(separator)

    model_copy = copy.deepcopy(original_model)
    optimizer = optimizer_fn(model_copy)

    losses = train_model(model=model_copy, optimizer=optimizer, **run_settings)
    optimizer_losses[name] = losses

print(f"\n{separator}")
print("Training completed for all optimizers!")
print(separator)

## Visualize the results of the optimizers / losses

In [None]:
# Plot the results with matplotlib and show the difference in convergence speed with different optimizers.
# Plot the decreasing loss of each model with each optimizer

# Your code here
import matplotlib.pyplot as plt

# One shared figure; every optimizer contributes a single loss curve.
fig, ax = plt.subplots(figsize=(12, 6))

for optimizer_name, losses in optimizer_losses.items():
    # The k-th stored loss (1-based) is placed at iteration 50 * k.
    iterations = list(range(50, 50 * len(losses) + 1, 50))
    ax.plot(iterations, losses, label=optimizer_name, linewidth=2)

ax.set_xlabel('Iteration', fontsize=12)
ax.set_ylabel('Loss', fontsize=12)
ax.set_title('Optimizer Comparison: Loss Over Iterations', fontsize=14, fontweight='bold')
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Summary of the behavior that is typically visible in the plot.
observations = (
    "- Adam typically converges fastest due to adaptive learning rates and momentum",
    "- RMSprop also shows fast convergence with adaptive learning rates",
    "- SGD with Momentum converges faster than vanilla SGD by accumulating velocity",
    "- Adagrad may slow down over time as it accumulates squared gradients",
    "- Vanilla SGD is usually the slowest but most stable",
)

print("\nObservations:")
for observation in observations:
    print(observation)