In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.backends import cudnn

from copy import deepcopy
import numpy as np

import sys
sys.path.append('../data/cifar100/')  
from cifar100_loader import CIFAR100DataLoader
from models.model import LeNet5 #import the model
from utils.utils import evaluate
from utils.plotting_utils import plot_metrics,test,save_data,load_data

# Constants

In [None]:
# Pick the compute device: use the GPU when one is available, otherwise the CPU
DEVICE = 'cpu'
if torch.cuda.is_available():
    DEVICE = 'cuda'
print(DEVICE)

# Momentum and batch size have not been tuned.
# Larger batch sizes allow for larger learning rates. An empirical heuristic suggests
# that, when the batch size changes, the learning rate should change by the same
# factor to obtain comparable results.
BATCH_SIZE = 64
# SGD momentum; keep this at 0.9 when using SGD
MOMENTUM = 0.9
# Period (in epochs) of the progress messages printed during training
LOG_FREQUENCY = 10

# Data loading

In [None]:
# Hold out 10% of the training data for validation
data_loader = CIFAR100DataLoader(
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    download=True,
    num_workers=4,
    pin_memory=True,
)
# Expose the three loaders under the short names used by the rest of the notebook
trainloader = data_loader.train_loader
validloader = data_loader.val_loader
testloader = data_loader.test_loader
# Note from the authors: the loader above has been verified to preserve the class
# distribution in both the training and the validation split

# Prepare training

In [6]:
# LeNet5 variant adapted to CIFAR100 (the paper is cited in model.py)
model = LeNet5()
model = model.to(DEVICE)  # Move the parameters to the selected device
# The model ends with log_softmax, so the matching loss is the negative log-likelihood
criterion = nn.NLLLoss()

# Schedulers
The learning rate schedulers compared in the experiments.

In [5]:
def get_scheduler_factory(num_epochs):
    """
    Return a set of predefined learning rate scheduler factories with reasonable parameters.

    Every factory takes an optimizer and builds a brand-new scheduler bound to it,
    so the same list can be reused for many independent training runs.

    Args:
        num_epochs (int): Total number of epochs.

    Returns:
        list: List of (name, factory) tuples, where `factory(optimizer)` returns
              a `torch.optim.lr_scheduler` instance.
    """
    # Clamp the periods to at least 1 so that very short runs stay valid:
    # with num_epochs < 3 the integer division would give step_size == 0, and
    # StepLR then fails with ZeroDivisionError the first time it is stepped.
    step_size = max(1, num_epochs // 3)
    t_max = max(1, num_epochs)

    # NOTE: the labels are used as identifiers in the stored results; keep them stable.
    schedulers = [
        # StepLR: multiply the learning rate by gamma every `step_size` epochs
        ("StepLR (step_size=num_epochs//3, gamma=0.1)",
         lambda optimizer: torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=0.1)),

        # CosineAnnealingLR: cosine decay spread over the whole run
        ("CosineAnnealingLR",
         lambda optimizer: torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, t_max)),

        # ExponentialLR: multiply the learning rate by gamma every epoch
        ("ExponentialLR (gamma=0.9)",
         lambda optimizer: torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)),
    ]
    return schedulers


# Training loop

In [10]:
def train(num_epochs, scheduler, optimizer, model):
    """
    Train `model` and return the per-epoch metrics together with the best checkpoint.

    The function relies on notebook-level names: `trainloader`, `validloader`,
    `criterion`, `DEVICE`, `LOG_FREQUENCY` and the imported `evaluate` helper.

    Args:
        num_epochs (int): Number of passes over `trainloader`.
        scheduler: Learning rate scheduler, stepped once at the end of every epoch.
        optimizer: Optimizer that updates the parameters of `model`.
        model (torch.nn.Module): Network to train; it is modified in place.

    Returns:
        tuple: (train_accuracies, train_losses, val_accuracies, val_losses, model).
               The four lists hold one value per epoch (accuracies in percent for
               training). `model` is the same object that was passed in, reloaded
               with the weights of the epoch with the highest validation accuracy
               when such an epoch exists.
    """
    val_accuracies = []
    val_losses = []
    train_accuracies = []
    train_losses = []
    # The flag has to be assigned: merely reading `cudnn.benchmark` has no effect.
    cudnn.benchmark = True

    best_val_acc = 0.0
    best_model_state = None  # Weights of the epoch with the best validation accuracy

    for epoch in range(num_epochs):
        epoch_train_loss = 0.0
        correct = 0  # Number of correct predictions
        total = 0  # Total number of examples seen in this epoch

        # Re-enter train mode at the start of every epoch, since the validation
        # step at the end of the previous epoch may have switched the mode.
        model.train()
        for data, targets in trainloader:
            data = data.to(DEVICE)
            targets = targets.to(DEVICE)

            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # criterion returns a per-batch value; weight it by the batch size so
            # that the epoch figure below is a per-example average
            epoch_train_loss += loss.item() * data.size(0)
            _, predicted = outputs.max(1)
            correct += predicted.eq(targets).sum().item()
            total += targets.size(0)

        # Average training loss and accuracy (in percent) for this epoch
        train_loss = epoch_train_loss / total
        train_acc = (correct / total) * 100
        train_losses.append(train_loss)
        train_accuracies.append(train_acc)

        # Evaluate on the validation set after every epoch
        val_acc, val_loss = evaluate(model, validloader)
        val_accuracies.append(val_acc)
        val_losses.append(val_loss)

        # Keep a private copy of the weights whenever validation accuracy improves
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_state = deepcopy(model.state_dict())

        # Parentheses are required: `epoch+1%LOG_FREQUENCY` parses as
        # `epoch + (1 % LOG_FREQUENCY)`, which never equals 0.
        if (epoch + 1) % LOG_FREQUENCY == 0:
            print(f"--> epoch: {epoch+1}, training accuracy: {train_acc:.2f}, validation accuracy: {val_acc:.2f}")

        # Step the scheduler once per epoch
        scheduler.step()

    # Restore the best weights. No checkpoint exists if validation accuracy never
    # rose above 0 (or no epoch ran); in that case the model is returned as is.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return train_accuracies, train_losses, val_accuracies, val_losses, model


# Hyperparameters tuning

In [None]:
# Three logarithmically spaced learning rates (lr) covering 1e-3 .. 1e-1
learning_rates = np.logspace(start=-3, stop=-1, num=3)

# Four logarithmically spaced weight decays (wd) covering 1e-4 .. 1e-1
weight_decays = np.logspace(start=-4, stop=-1, num=4)

# Show the grid that the tuning loop is going to explore
for grid_label, grid_values in (
    ("Learning Rate Values (log-uniform):", learning_rates),
    ("Weight Decay Values (log-uniform):", weight_decays),
):
    print(grid_label, grid_values)

In [None]:
num_epochs = 20  # low value for parameter tuning

scheduler_factories = get_scheduler_factory(num_epochs)
results = []  # One dictionary of settings and per-epoch metrics for every run
best_validation_accuracy_overall = 0.0
best_setting = None  # (learning rate, weight decay, scheduler name) of the best run
print('Starting the parameter tuning loop...')
for lr in learning_rates:
    # The grids hold NumPy scalars; plain floats keep the stored settings and the
    # printed best-setting tuple easy to read
    lr = float(lr)
    for wd in weight_decays:
        wd = float(wd)
        for scheduler_name, scheduler_factory in scheduler_factories:
            # Start every run from a freshly initialised model
            model = LeNet5().to(DEVICE)
            # Use the shared MOMENTUM constant so that tuning and the final
            # training run cannot drift apart
            optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=MOMENTUM, weight_decay=wd)
            # Build a new scheduler bound to this optimizer
            scheduler = scheduler_factory(optimizer)
            # Execute training
            train_accuracies, train_losses, val_accuracies, val_losses, model = train(num_epochs, scheduler, optimizer, model)
            # Track the best validation accuracy of this run and overall
            best_val_accuracy = max(val_accuracies)
            if best_val_accuracy > best_validation_accuracy_overall:
                best_validation_accuracy_overall = best_val_accuracy
                best_setting = (lr, wd, scheduler_name)
            print(f'Learning Rate: {lr}, Weight Decay: {wd}, Scheduler: {scheduler_name}, Best Validation Accuracy: {best_val_accuracy:.2f}%')

            results.append({
                'learning_rate': lr,
                'weight_decay': wd,
                'scheduler_name': scheduler_name,
                'train_accuracies': train_accuracies,
                'train_losses': train_losses,
                'val_accuracies': val_accuracies,
                'val_losses': val_losses,
            })
print("Finished training loop.")
print(f'Best validation accuracy overall: {best_validation_accuracy_overall:.2f}%')
print(f'Best setting: {best_setting}')

# Plotting and saving results

In [None]:
import matplotlib.pyplot as plt
import os
import re  # Imported module for regular expressions

def plot_results(results, save_dir='./plots_centralized'):
    """
    Write one "training vs validation accuracy" figure per hyperparameter combination.

    Args:
        results (list): List of dictionaries; the keys read from each one are:
                        - 'learning_rate': Learning rate used.
                        - 'weight_decay': Weight decay used.
                        - 'scheduler_name': Name of the scheduler.
                        - 'train_accuracies': List of training accuracies.
                        - 'val_accuracies': List of validation accuracies.
        save_dir (str): Directory where the PNG files are written; it is created
                        when it does not exist yet.
    """
    os.makedirs(save_dir, exist_ok=True)

    for entry in results:
        # Hyperparameters identifying this run
        lr, wd = entry['learning_rate'], entry['weight_decay']
        scheduler_name = entry['scheduler_name']

        # Replace every non-alphanumeric character with '_' so that the scheduler
        # name can be used inside a filename
        safe_scheduler_name = re.sub(r"[^a-zA-Z0-9]", "_", scheduler_name)

        # Filename that is unique for each configuration
        file_prefix = f"LR_{lr}_WD_{wd}_Scheduler_{safe_scheduler_name}"
        accuracy_plot_path = os.path.join(save_dir, f"{file_prefix}_training_vs_validation_accuracy.png")

        # Draw both accuracy curves against the epoch index on a single axes
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(entry['train_accuracies'], label='Training Accuracy')
        ax.plot(entry['val_accuracies'], label='Validation Accuracy')
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Accuracy')
        ax.set_title(f"Training vs Validation Accuracy (LR={lr}, WD={wd}, Scheduler={scheduler_name})")
        ax.legend()
        fig.savefig(accuracy_plot_path)
        # Release the figure so that long result lists do not accumulate open figures
        plt.close(fig)

    print(f"Plots saved to directory: {save_dir}")

# Plot only the best result.
# The run is selected through `best_setting`, the (learning rate, weight decay,
# scheduler name) tuple recorded by the tuning loop. A hard-coded scheduler label
# that is not one of the names produced by get_scheduler_factory would match
# nothing and silently produce no plot.
filtered_results = [
    res for res in results
    if (res['learning_rate'], res['weight_decay'], res['scheduler_name']) == best_setting
]
plot_results(filtered_results)

# Final model
Based on the observations made in the previous sections and on the analysis of all the plots, the following configuration gave the best results. More details are reported in Report.md.

In [11]:
# Length of the final training run, in epochs
NUM_EPOCHS = 200
# Learning rate and weight decay that gave the best accuracy in the tuning step
LR = 1e-2
WD = 1e-3

In [None]:
# Train a fresh model with the selected hyperparameters
model = LeNet5().to(DEVICE)
optimizer = optim.SGD(
    model.parameters(),
    lr=LR,
    momentum=MOMENTUM,
    weight_decay=WD,
)
# Cosine annealing spread over the whole run
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, NUM_EPOCHS)
train_accuracies, train_losses, val_accuracies, val_losses, model = train(
    NUM_EPOCHS, scheduler, optimizer, model
)

# Score the model returned by train() on the held-out test set
test_accuracy = test(model, testloader)

# Persist the learning curves and the trained model
plot_metrics(train_accuracies, train_losses, val_accuracies, val_losses, "CentralizedCifar.png")
save_data(model, val_accuracies, val_losses, train_accuracies, train_losses, None, "CentralizedCifar.pth")
print(f'Test Accuracy: {test_accuracy:.2f}%')