In [171]:
%load_ext autoreload
%autoreload 2

import torch
import torch.optim as optim
import torch.nn as nn
from torch.backends import cudnn

from statistics import mean

import sys
sys.path.append('../data/cifar100/')  
from cifar100_loader import load_cifar100
from models.model import LeNet5 #import the model

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Hyperparameters

In [172]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(DEVICE)

# Momentum and batch size are fixed here; they have not been tuned.
# Larger batches tolerate larger learning rates: an empirical heuristic is to scale
# the learning rate by the same factor as the batch size to get comparable results.
BATCH_SIZE = 64
# SGD momentum; keep this at 0.9 when using SGD.
MOMENTUM = 0.9
# Total number of training epochs (full passes over the dataset).
NUM_EPOCHS = 100

cuda


In [173]:
# Hold out 25% of the training data for validation.
# The batch size is taken from the BATCH_SIZE hyperparameter so the configuration cell
# stays the single source of truth (a literal 32 was hard-coded here before, silently
# overriding the declared value).
trainloader, validloader, testloader = load_cifar100(batch_size=BATCH_SIZE, validation_split=0.25)
# load_cifar100 has been verified to preserve the class distribution
# in both the training and the validation split.

Files already downloaded and verified
Files already downloaded and verified


# Prepare training

In [174]:
# LeNet5 variant adapted to CIFAR-100 (the reference paper is cited in model.py).
model = LeNet5()
model = model.to(DEVICE)  # move the parameters to the selected device
# Loss function: per the original note the model applies the softmax itself, so
# nn.CrossEntropyLoss (which would apply log-softmax a second time) is not used.
# NOTE(review): nn.NLLLoss expects log-probabilities -- confirm LeNet5 ends in log_softmax.
criterion = nn.NLLLoss()

# Schedulers

In [175]:
def get_scheduler_factory(num_epochs):
    """
    Return a set of predefined learning rate scheduler factories with reasonable parameters.

    Each factory takes an optimizer and returns a freshly constructed scheduler, so
    the same list can be reused across independent training runs.

    Args:
        num_epochs (int): Total number of epochs. Used to derive the step size of
            StepLR and the half-period (T_max) of CosineAnnealingLR.

    Returns:
        list: List of ``(name, factory)`` tuples, where ``name`` is a human-readable
        label and ``factory`` is a callable ``optimizer -> scheduler``.
    """
    lr_sched = torch.optim.lr_scheduler

    # Integer division yields 0 for very small epoch budgets, and a zero period makes
    # StepLR / CosineAnnealingLR divide by zero when stepped, so clamp to at least 1.
    third_of_run = max(1, num_epochs // 3)
    fifth_of_run = max(1, num_epochs // 5)

    schedulers = [
        # StepLR
        ("StepLR (step_size=num_epochs//3, gamma=0.1)",
         lambda optimizer: lr_sched.StepLR(optimizer, step_size=third_of_run, gamma=0.1)),
        ("StepLR (step_size=num_epochs//5, gamma=0.5)",
         lambda optimizer: lr_sched.StepLR(optimizer, step_size=fifth_of_run, gamma=0.5)),

        # CosineAnnealingLR
        # The label previously advertised T_max=num_epochs//2 while the scheduler was
        # built with num_epochs//3; the label now reports the value actually used.
        ("CosineAnnealingLR (T_max=num_epochs//3, eta_min=1e-4)",
         lambda optimizer: lr_sched.CosineAnnealingLR(optimizer, T_max=third_of_run, eta_min=1e-4)),

        # ExponentialLR
        ("ExponentialLR (gamma=0.95)",
         lambda optimizer: lr_sched.ExponentialLR(optimizer, gamma=0.95)),
        ("ExponentialLR (gamma=0.9)",
         lambda optimizer: lr_sched.ExponentialLR(optimizer, gamma=0.9)),

        # ReduceLROnPlateau (its step() must be given the monitored metric)
        ("ReduceLROnPlateau (patience=5, factor=0.5)",
         lambda optimizer: lr_sched.ReduceLROnPlateau(optimizer, patience=5, factor=0.5))
    ]

    return schedulers


# Validation

In [176]:
def evaluate(model, dataloader):
    """
    Evaluate the model on the given dataset.

    Uses the notebook-level ``criterion`` and ``DEVICE``. The model is switched to
    evaluation mode and is left in that mode when the function returns.

    Args:
        model: Network to evaluate; called as ``model(data)``.
        dataloader: Iterable yielding ``(data, targets)`` batches.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is the fraction of correctly
        classified samples among those actually yielded by ``dataloader`` and
        ``mean_loss`` is the unweighted mean of the per-batch losses.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()  # Set Network to evaluation mode
    correct = 0
    seen = 0
    losses = []
    with torch.no_grad():
        for data, targets in dataloader:
            data = data.to(DEVICE)        # Move the data to the evaluation device
            targets = targets.to(DEVICE)  # Move the targets to the evaluation device
            # Forward pass
            outputs = model(data)
            losses.append(criterion(outputs, targets).item())
            # Predicted class = index of the largest output along the class dimension
            preds = outputs.argmax(dim=1)
            correct += (preds == targets).sum().item()
            # Count the samples really seen: len(dataloader.dataset) is the size of the
            # underlying dataset and over-counts when the loader iterates only a subset
            # of it (e.g. through a sampler) or drops the last incomplete batch.
            seen += targets.size(0)

    if seen == 0:
        raise ValueError("evaluate() received a dataloader that yielded no samples")

    return correct / float(seen), mean(losses)

In [177]:
def train(num_epochs, scheduler,optimizer):
    """
    Train the notebook-level ``model`` and restore the weights that scored best on validation.

    Relies on the notebook-level ``model``, ``trainloader``, ``validloader``,
    ``criterion`` and ``DEVICE``, and on ``evaluate`` for the per-epoch metrics.

    Args:
        num_epochs (int): Number of passes over ``trainloader``.
        scheduler: Learning rate scheduler, stepped once per epoch. A
            ``ReduceLROnPlateau`` instance is stepped with the validation loss.
        optimizer: Optimizer bound to the parameters of ``model``.

    Returns:
        tuple: ``(train_accuracies, train_losses, val_accuracies, val_losses, model)``;
        the four lists hold one entry per epoch and ``model`` carries the weights of
        the epoch with the highest validation accuracy.
    """
    import copy  # local import: only needed to snapshot the best weights

    val_accuracies = []
    val_losses = []
    train_accuracies = []
    train_losses = []
    # The flag has to be assigned; merely referencing the attribute has no effect.
    cudnn.benchmark = True

    # Start below any reachable accuracy so the first epoch always yields a snapshot.
    best_val_acc = float('-inf')
    best_model_state = None  # Weights of the model with the best validation accuracy

    # ReduceLROnPlateau.step() requires the monitored metric; the other schedulers take none.
    needs_metric = isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)

    # Training loop
    for _ in range(num_epochs):
        model.train()                     # evaluate() leaves the network in eval mode
        for data, targets in trainloader:
            data = data.to(DEVICE)        # Move the data to the training device
            targets = targets.to(DEVICE)  # Move the targets to the training device
            optimizer.zero_grad()         # Zero the gradients
            outputs = model(data)         # Pass data through the model
            loss = criterion(outputs, targets)  # Compute loss
            loss.backward()               # Backpropagation
            optimizer.step()              # Update model parameters

        # Evaluate on the training set
        train_acc, train_loss = evaluate(model, trainloader)
        train_accuracies.append(train_acc)
        train_losses.append(train_loss)

        # Evaluate on the validation set
        val_acc, val_loss = evaluate(model, validloader)
        val_accuracies.append(val_acc)
        val_losses.append(val_loss)

        # Update the best model if validation accuracy improves
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # state_dict() returns references to the live parameter tensors, which later
            # optimizer steps overwrite in place; deep-copy to freeze the current state.
            best_model_state = copy.deepcopy(model.state_dict())

        # Step the scheduler
        if needs_metric:
            scheduler.step(val_loss)
        else:
            scheduler.step()

    # At the end of training, restore the best weights (none exist if no epoch ran)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return train_accuracies, train_losses, val_accuracies, val_losses, model


# Test

In [178]:
def test(model, testloader):
    """
    Score the model on the held-out test set.

    Delegates to ``evaluate`` and returns its ``(accuracy, loss)`` pair.
    """
    test_accuracy, test_loss = evaluate(model, testloader)
    return test_accuracy, test_loss

# Hyperparameters tuning

In [None]:
num_epochs = 20  # low value for parameter tuning
learning_rates = [0.05, 0.01, 0.005, 0.001]
weight_decays = [1e-5, 5e-5, 1e-4]
scheduler_factories = get_scheduler_factory(num_epochs)
results = []  # one record per (learning rate, weight decay, scheduler) configuration
best_validation_accuracy_overall = 0.0
best_setting = None

# Exhaustive grid search over learning rate x weight decay x scheduler.
for lr in learning_rates:
    for wd in weight_decays:
        for scheduler_name, scheduler_factory in scheduler_factories:
            # Reset the model so every configuration starts from fresh weights
            model = LeNet5().to(DEVICE)
            # Create the optimizer; momentum comes from the MOMENTUM hyperparameter
            # rather than a duplicated literal
            optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=MOMENTUM, weight_decay=wd)
            # Create the scheduler bound to this optimizer
            scheduler = scheduler_factory(optimizer)
            # Execute training
            train_accuracies, train_losses, val_accuracies, val_losses, model = train(num_epochs, scheduler,optimizer)
            # Track the best configuration seen so far and print this run's best validation accuracy
            best_val_accuracy = max(val_accuracies)
            if best_val_accuracy > best_validation_accuracy_overall:
                best_validation_accuracy_overall = best_val_accuracy
                best_setting = (lr, wd, scheduler_name)
            print(f'Learning Rate: {lr}, Weight Decay: {wd}, Scheduler: {scheduler_name}, Best Validation Accuracy: {best_val_accuracy*100:.2f}%')

            results.append({
                'learning_rate': lr,
                'weight_decay': wd,
                'scheduler_name': scheduler_name,
                'train_accuracies': train_accuracies,
                'train_losses': train_losses,
                'val_accuracies': val_accuracies,
                'val_losses': val_losses,
            })


# Plotting and saving results

In [None]:
import matplotlib.pyplot as plt
import os

def plot_results(results, save_dir='./plots_centralized'):
    """
    Save plots comparing training accuracy and validation accuracy per epoch for each combination of hyperparameters.

    One PNG file is written per entry of ``results``, directly inside ``save_dir``
    (which is created if it does not exist).

    Args:
        results (list): List of dictionaries, where each dictionary contains:
                        - 'learning_rate': Learning rate used.
                        - 'weight_decay': Weight decay used.
                        - 'scheduler_name': Name of the scheduler.
                        - 'train_accuracies': List of training accuracies.
                        - 'val_accuracies': List of validation accuracies.
        save_dir (str): Directory where the plots will be saved.
    """
    os.makedirs(save_dir, exist_ok=True)

    for res in results:
        # Extract hyperparameter values
        lr = res['learning_rate']
        wd = res['weight_decay']
        scheduler_name = res['scheduler_name']

        # Scheduler labels may contain path separators (e.g. "num_epochs//3"); left in
        # place they would make os.path.join point into a sub-directory that does not
        # exist and savefig would fail, so they are neutralised for the file name only.
        safe_scheduler = scheduler_name.replace(' ', '_')
        for separator in ('/', '\\'):
            safe_scheduler = safe_scheduler.replace(separator, '-')

        # Generate a unique filename prefix for each configuration
        file_prefix = f"LR_{lr}_WD_{wd}_Scheduler_{safe_scheduler}"

        # Plot training and validation accuracy per epoch
        plt.figure(figsize=(12, 6))
        plt.plot(res['train_accuracies'], label='Training Accuracy')
        plt.plot(res['val_accuracies'], label='Validation Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        # The title keeps the original, unsanitised scheduler label
        plt.title(f"Training vs Validation Accuracy (LR={lr}, WD={wd}, Scheduler={scheduler_name})")
        plt.legend()
        accuracy_plot_path = os.path.join(save_dir, f"{file_prefix}_training_vs_validation_accuracy.png")
        plt.savefig(accuracy_plot_path)
        plt.close()  # release the figure so repeated calls do not accumulate memory

    print(f"Plots saved to directory: {save_dir}")
# Save one training-vs-validation accuracy plot per configuration collected in `results`
# (written to the function's default output directory).
plot_results(results)