In [1]:
import sys
import torch
import torch.nn as nn
from Server import Server
from Client import Client
from Individual import Individual
from shakespeare_model import CharLSTM

In [2]:
# Constants for FL training
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)
NUM_CLIENTS = 1129  # Total number of clients in the federation
FRACTION_CLIENTS = 0.1  # Fraction of clients selected per round (C)
LOCAL_STEPS = 4  # Number of local steps (J)
GLOBAL_ROUNDS = 2000  # Total number of communication rounds

BATCH_SIZE = 100 # Batch size for local training

MOMENTUM = 0  # Momentum for SGD optimizer
CHECKPOINT_DIR = '/content/drive/MyDrive/colab_checkpoints/'
LOG_FREQUENCY = 10 # Frequency of logging training progress

cuda


In [3]:
import torch
from statistics import mean
import torch.nn as nn

"""
Utility function used both in the centralized and federated learning
Computes the accuracy and the loss on the validation/test set depending on the dataloader passed
"""
def evaluate(model, dataloader, criterion, DEVICE):
    model.eval()  # Set the model to evaluation mode
    running_corrects = 0
    total_samples = 0  # Total samples counter
    losses = []

    with torch.no_grad():
        for data, targets in dataloader:
            data = data.to(DEVICE)
            targets = targets.to(DEVICE)
            hidden = model.init_hidden(data.size(0))
            hidden = (hidden[0].to(DEVICE), hidden[1].to(DEVICE))
            outputs, _ = model(data, hidden)
            outputs_flat = outputs.view(-1, model.vocab_size)
            targets_flat = targets.view(-1)

            loss = criterion(outputs_flat, targets_flat)
            losses.append(loss.item())

            _, preds = outputs_flat.max(1)
            #running_corrects += torch.sum(preds == targets_flat).item()
            running_corrects += (preds == targets_flat).sum().item()
            total_samples += targets_flat.size(0)

    accuracy = (running_corrects / total_samples) * 100
    return accuracy, sum(losses) / len(losses)

# Checkpointing

In [4]:
import os
import torch
import json

# Ensure the checkpoint directory exists, creating it if necessary
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

def save_checkpoint(model, optimizer, epoch, hyperparameters, subfolder="", checkpoint_data=None):
    """
    Saves the model checkpoint and removes the previous one if it exists.

    Arguments:
    model -- The model whose state is to be saved.
    optimizer -- The optimizer whose state is to be saved (can be None).
    epoch -- The current epoch of the training process.
    hyperparameters -- A string representing the model's hyperparameters for file naming.
    subfolder -- Optional subfolder within the checkpoint directory to save the checkpoint.
    checkpoint_data -- Data to save in a JSON file (e.g., training logs).
    """
    # Define the path for the subfolder where checkpoints will be stored
    subfolder_path = os.path.join(CHECKPOINT_DIR, subfolder)
    # Create the subfolder if it doesn't exist
    os.makedirs(subfolder_path, exist_ok=True)

    # Construct filenames for both the model checkpoint and the associated JSON file
    filename = f"model_epoch_{epoch}_params_{hyperparameters}.pth"
    filepath = os.path.join(subfolder_path, filename)
    filename_json = f"model_epoch_{epoch}_params_{hyperparameters}.json"
    filepath_json = os.path.join(subfolder_path, filename_json)

    # Define the filenames for the previous checkpoint files, to remove them if necessary
    previous_filepath = os.path.join(subfolder_path, f"model_epoch_{epoch - 1}_params_{hyperparameters}.pth")
    previous_filepath_json = os.path.join(subfolder_path, f"model_epoch_{epoch - 1}_params_{hyperparameters}.json")

    # Remove the previous checkpoint if it exists, but only for epochs greater than 1
    if epoch > 1 and os.path.exists(previous_filepath):
        os.remove(previous_filepath)
        os.remove(previous_filepath_json)

    # Prepare the checkpoint data dictionary
    checkpoint = {'model_state_dict': model.state_dict(), 'epoch': epoch}
    # If an optimizer is provided, save its state as well
    if optimizer is not None:
        checkpoint['optimizer_state_dict'] = optimizer.state_dict()

    # Save the model and optimizer (if provided) state dictionary to the checkpoint file
    torch.save(checkpoint, filepath)
    print(f"Checkpoint saved: {filepath}")

    # If additional data (e.g., training logs) is provided, save it to a JSON file
    if checkpoint_data:
        with open(filepath_json, 'w') as json_file:
            json.dump(checkpoint_data, json_file, indent=4)

def load_checkpoint(model, optimizer, hyperparameters, subfolder=""):
    """
    Loads the latest checkpoint available based on the specified hyperparameters.

    Arguments:
    model -- The model whose state will be updated from the checkpoint.
    optimizer -- The optimizer whose state will be updated from the checkpoint (can be None).
    hyperparameters -- A string representing the model's hyperparameters for file naming.
    subfolder -- Optional subfolder within the checkpoint directory to look for checkpoints.

    Returns:
    The next epoch to resume from and the associated JSON data if available.
    """
    # Define the path to the subfolder where checkpoints are stored
    subfolder_path = os.path.join(CHECKPOINT_DIR, subfolder)

    # If the subfolder doesn't exist, print a message and start from epoch 1
    if not os.path.exists(subfolder_path):
        print("No checkpoint found, starting from epoch 1.")
        return 1, None  # Epoch starts from 1

    # Search for checkpoint files in the subfolder that match the hyperparameters
    files = [f for f in os.listdir(subfolder_path) if f"params_{hyperparameters}" in f and f.endswith('.pth')]

    # If checkpoint files are found, load the one with the highest epoch number
    if files:
        latest_file = max(files, key=lambda x: int(x.split('_')[2]))  # Find the latest epoch file
        filepath = os.path.join(subfolder_path, latest_file)
        checkpoint = torch.load(filepath, weights_only=True)

        # Load the model state from the checkpoint
        model.load_state_dict(checkpoint['model_state_dict'])
        # If an optimizer is provided, load its state as well
        if optimizer:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        # Try to load the associated JSON file if available
        json_filepath = os.path.join(subfolder_path, latest_file.replace('.pth', '.json'))
        json_data = None
        if os.path.exists(json_filepath):
            # If the JSON file exists, load its contents
            with open(json_filepath, 'r') as json_file:
                json_data = json.load(json_file)
            print("Data loaded!")
        else:
            # If no JSON file exists, print a message
            print("No data found")

        # Print the epoch from which the model is resuming
        print(f"Checkpoint found: Resuming from epoch {checkpoint['epoch'] + 1}\n\n")
        return checkpoint['epoch'] + 1, json_data

    # If no checkpoint is found, print a message and start from epoch 1
    print("No checkpoint found, starting from epoch 1..\n\n")
    return 1, None  # Epoch starts from 1

def delete_existing_checkpoints(subfolder=""):
    """
    Deletes all existing checkpoints in the specified subfolder.

    Arguments:
    subfolder -- Optional subfolder within the checkpoint directory to delete checkpoints from.
    """
    subfolder_path = os.path.join(CHECKPOINT_DIR, subfolder)
    if os.path.exists(subfolder_path):
        for file_name in os.listdir(subfolder_path):
            file_path = os.path.join(subfolder_path, file_name)
            if os.path.isfile(file_path):
                os.remove(file_path)
        print(f"All existing checkpoints in {subfolder_path} have been deleted.")
    else:
        print(f"No checkpoint folder found at {subfolder_path}.")

# DataLoading

In [7]:
from google.colab import files
uploaded = files.upload()

Saving all_data_iid_089_06_train_8.json to all_data_iid_089_06_train_8.json


In [8]:
from google.colab import files
uploaded2 = files.upload()

Saving all_data_iid_089_06_test_8.json to all_data_iid_089_06_test_8.json


In [61]:
import io
import json

data = json.load(io.BytesIO(uploaded['all_data_iid_089_06_train_8.json']))

In [62]:
test_data  = json.load(io.BytesIO(uploaded2['all_data_iid_089_06_test_8.json']))

In [63]:
import json
import torch
from torch.utils.data import DataLoader, TensorDataset

#Load the Json file
with open('all_data_iid_089_06_train_8.json', 'r') as file:
    data = json.load(file)

In [64]:
with open('all_data_iid_089_06_test_8.json', 'r') as f:
    test_data = json.load(f)

In [65]:
num_clients = len(data['users'])
print("Number of clients:", num_clients)
NUM_CLIENTS = num_clients

Number of clients: 100


In [66]:
users = data['users']
num_samples = data['num_samples']
user_data = data['user_data']

In [67]:
all_texts = ''.join([''.join(seq) for user in users for seq in user_data[user]['x']])
chars = sorted(set(all_texts))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}

# Add the padding character
char_to_idx['<pad>'] = len(char_to_idx)
idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}

## Convert data into indices

In [68]:
inputs = [[char_to_idx[char] for char in user_data[user]['x'][0]] for user in users]
targets = [[char_to_idx[char] for char in user_data[user]['y'][0]] for user in users]

## Creation of TensorDataset and DataLoader

In [69]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset


input_tensors = [torch.tensor(seq) for seq in inputs]
target_tensors = [torch.tensor([seq]) for seq in targets]

chars = sorted(set(all_texts))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
char_to_idx['<pad>'] = len(char_to_idx)
idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}

padded_inputs = pad_sequence(input_tensors, batch_first=True, padding_value=char_to_idx['<pad>'])

target_tensors = torch.cat(target_tensors, dim=0)

dataset = TensorDataset(padded_inputs, target_tensors)
batch_size = 4
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [70]:
def tensor_to_string(tensor, idx_to_char):
    """Converte un tensore di indici in una stringa di caratteri."""
    return ''.join(idx_to_char[idx.item()] for idx in tensor)

In [71]:
# Function to convert character values into indices
from torch.nn.utils.rnn import pad_sequence
# def char_to_tensor(characters):
#     indices = [char_to_idx[char] for char in characters]
#     return torch.tensor(indices, dtype=torch.long)

# Function to convert character values into indices
from torch.nn.utils.rnn import pad_sequence
def char_to_tensor(characters):
    indices = [char_to_idx.get(char, char_to_idx['<pad>']) for char in characters] # Get the index for the character. If not found, use the index for padding.
    return torch.tensor(indices, dtype=torch.long)

# Prepare the training data_loader
# Prepara i dati di test
input_tensors = []
target_tensors = []
for user in data['users']:
    for entry, target in zip(data['user_data'][user]['x'], data['user_data'][user]['y']):
        input_tensors.append(char_to_tensor(entry))  # Use the full sequence of x
        target_tensors.append(char_to_tensor(target))  # Directly use the corresponding y as target

# Padding e creazione di DataLoader
padded_inputs = pad_sequence(input_tensors, batch_first=True, padding_value=char_to_idx['<pad>'])
targets = torch.cat(target_tensors)
dataset = TensorDataset(padded_inputs, targets)
for elem1, elem2 in dataset:
  elem2 = elem2.unsqueeze(0)

data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

In [73]:
# Prepare the test loader:
# Prepare the training data_loader

input_tensors = []
target_tensors = []
for user in test_data['users']:
    for entry, target in zip(test_data['user_data'][user]['x'], test_data['user_data'][user]['y']):
        input_tensors.append(char_to_tensor(entry))  # Use the full sequence of x
        target_tensors.append(char_to_tensor(target))  # Directly use the corresponding y as target

# Padding e creazione di DataLoader
padded_inputs = pad_sequence(input_tensors, batch_first=True, padding_value=char_to_idx['<pad>'])
targets = torch.cat(target_tensors)
dataset = TensorDataset(padded_inputs, targets)
for elem1, elem2 in dataset:
  elem2 = elem2.unsqueeze(0)

test_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

## Model definition

In [74]:
global_model = CharLSTM(vocab_size=len(char_to_idx))
criterion = nn.CrossEntropyLoss()

## Utility functions

In [77]:
import torch
import numpy as np
import os
import matplotlib.pyplot as plt

DIR = '/content/drive/MyDrive/colab_plots/'

def plot_client_selection(client_selection_count, file_name):
    """
    Bar plot to visualize the frequency of client selections in a federated learning simulation.

    Args:
        client_selection_count (list): list containing the number of times each client was selected.
        file_name (str): name of the file to save the plot.
    """
    # Fixed base directory
    directory = DIR +  'plots_federated/'
    # Ensure the base directory exists
    os.makedirs(directory, exist_ok=True)

    # Complete path for the file
    file_path = os.path.join(directory, file_name)

    num_clients = len(client_selection_count)
    plt.figure(figsize=(10, 6))
    plt.bar(range(num_clients), client_selection_count, alpha=0.7, edgecolor='black')
    plt.xlabel("Client ID", fontsize=14)
    plt.ylabel("Selection Count", fontsize=14)
    plt.title("Client Selection Frequency", fontsize=16)
    plt.xticks(range(num_clients), fontsize=10, rotation=90 if num_clients > 20 else 0)
    plt.tight_layout()
    plt.savefig(file_path, format="png", dpi=300)
    plt.close()

def test(global_model, test_loader, criterion, DEVICE):
    """
    Evaluate the global model on the test dataset.

    Args:
        global_model (nn.Module): The global model to be evaluated.
        test_loader (DataLoader): DataLoader for the test dataset.

    Returns:
        float: The accuracy of the model on the test dataset.
        float: The loss of the model on the test dataset.
    """
    test_accuracy, loss = evaluate(global_model, test_loader, criterion, DEVICE)
    return test_accuracy, loss

def plot_metrics(train_accuracies, train_losses, file_name):
    """
    Plot the training metrics for a federated learning simulation.

    Args:
        train_accuracies (list): List of training accuracies.
        train_losses (list): List of training losses.
        file_name (str): Name of the file to save the plot.
    """
    # Fixed base directory
    directory = DIR + '/plots_federated/'
    # Ensure the base directory exists
    os.makedirs(directory, exist_ok=True)

    # Complete path for the file
    file_path = os.path.join(directory, file_name)

    # Create a list of epochs for the x-axis
    epochs = list(range(1, len(train_losses) + 1))

    # Plot the training loss
    plt.figure(figsize=(10, 6))
    plt.plot(epochs, train_losses, label='Train Loss', color='blue')
    plt.xlabel('Rounds', fontsize=14)
    plt.ylabel('Loss', fontsize=14)
    plt.title('Training Loss', fontsize=16)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(file_path.replace('.png', '_loss.png'), format='png', dpi=300)
    plt.close()

    # Plot the training accuracy
    plt.figure(figsize=(10, 6))
    plt.plot(epochs, train_accuracies, label='Train Accuracy', color='blue')
    plt.xlabel('Rounds', fontsize=14)
    plt.ylabel('Accuracy', fontsize=14)
    plt.title('Training Accuracy', fontsize=16)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(file_path.replace('.png', '_accuracy.png'), format='png', dpi=300)
    plt.close()


def save_data(global_model, train_accuracies, train_losses,client_count, file_name):
    """
    Save the global model, train_accuracies,train_losses and client_count to a file.

    Args:
        global_model (nn.Module): The global model to be saved.
        train_accuracies (list): List of training accuracies.
        train_losses (list): List of training losses.
        file_name (str): Name of the file to save the data.
    """
    # Fixed base directory
    directory = DIR + '/trained_models/'
    # Ensure the base directory exists
    os.makedirs(directory, exist_ok=True)

    # Complete path for the file
    file_path = os.path.join(directory, file_name)

    # Save all data (model state and metrics) into a dictionary
    save_dict = {
        'model_state': global_model.state_dict(),
        'train_accuracies': train_accuracies,
        'train_losses': train_losses,
        'client_count': client_count
    }

    # Save the dictionary to the specified file
    torch.save(save_dict, file_path)
    print(f"Data saved successfully to {file_path}")

def load_data(model, file_name):
    """
    Load the model weights and metrics from a file.

    Args:
        model (nn.Module): The model to load the weights into.
        file_name (str): Name of the file to load the data from.

    Returns:
        tuple: A tuple containing the model, train_accuracies train_losses and client_count.
    """
    # Fixed base directory
    directory = DIR+ 'trained_models/'
    # Complete path for the file
    file_path = os.path.join(directory, file_name)

    # Load the saved data from the specified file
    save_dict = torch.load(file_path)

    # Load the model state
    model.load_state_dict(save_dict['model_state'])

    # Extract the metrics
    train_accuracies = save_dict['train_accuracies']
    train_losses = save_dict['train_losses']
    client_count = save_dict['client_count']

    print(f"Data loaded successfully from {file_path}")

    return model, train_accuracies, train_losses,client_count

# Evolutionary algorithm

In [79]:
import random
from copy import deepcopy
import os

import torch
import torch.nn as nn

#constants
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
#CRITERION = nn.NLLLoss()
#MOMENTUM = 0.9
#BATCHSIZE = 64
CHECKPOINTING_PATH = '../checkpoints/'

def tournament_selection_weakest(population, tau=2, p_diver=0.05):
    """
    Perform tournament selection to choose parents.
    Randomly select tau individuals and choose the weakest one.
    Fitness hole to introduce a 5% probability of choosing the fittest individual.


    :param population: List of Individuals.
    :param tau: Number of individuals to select.
    :param p_diver: Probability of choosing the worst individual in the tournament, done for the fitness hole.
    :return: Selected Individual.
    """
    participants = random.sample(population, tau)
    if random.random() < p_diver:
        winner = max(participants, key=lambda ind: ind.fitness)
    else:
      winner = min(participants, key=lambda ind: ind.fitness)
    return deepcopy(winner)

def tournament_selection_fittest(population, tau=2, p_diver=0.05):
    """
    Perform tournament selection to choose parents.
    Randomly select tau individuals and choose the best one.
    Fitness hole to introduce a 5% probability of choosing the weakest individual.


    :param population: List of Individuals.
    :param tau: Number of individuals to select.
    :param p_diver: Probability of choosing the worst individual in the tournament, done for the fitness hole.
    :return: Selected Individual.
    """
    participants = random.sample(population, tau)
    if random.random() < p_diver:
        winner = min(participants, key=lambda ind: ind.fitness)
    else:
      winner = max(participants, key=lambda ind: ind.fitness)
    return deepcopy(winner)


def client_size(individual, client_sizes):
    """
    Computes the number of total samples for individual
    """
    val = 0
    for client in individual.genome:
        val += client_sizes[client]
    return val


def EA_algorithm(generations, population_size, num_clients, num_classes, crossover_probability, dataset, lr, wd, criterion):
    """
    Perform the Evolutionary Algorithm (EA) to optimize the selection of clients.
    The EA consists of the following steps:
    1. Initialization: Create a population of individuals.
    2. Evaluation: Compute the fitness of each individual.
    3. Selection: Choose parents based on their fitness.
    4. Offspring to create the new population (generational model).
    6. Repeat from step 2 maximum iterations.

    :param generations: Number of generations to run the algorithm.
    :param population_size: Number of individuals in the population.
    :param num_clients: clients selected by each individual.
    :param num_classes: Number of classes for each client (iid or non-iid).
    :param crossover_probability: Probability of crossover for each individual.
    :param dataset: The dataset to be used for training.
    :param lr: The learning rate to be used for training.
    :param wd: The weight decay to be used for training.


    :return global_model: The global model obtained after the EA.
    :return training_accuracies: The training loss of the global model at each generation.
    :return training_losses: The training accuracy of the global model at each generation.
    :return client_selection_count: The number of times each client was selected in the population.
    """

    #Check if the checkpointing directory exists
    os.makedirs(CHECKPOINTING_PATH, exist_ok=True)

    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    client_selection_count = [0]*100
    best_model_state = None
    best_train_loss = float('inf')


    # Initialize the population
    # Shuffle clients before assigning them
    all_clients = list(range(100))
    random.shuffle(all_clients)

    #No individual, at the beginning, will select a client twice
    population = [
        Individual(genome=all_clients[i * num_clients:(i + 1) * num_clients])
        for i in range(population_size)
    ]
    #population = [Individual(genome=random.sample(range(100), k=num_clients)) for _ in range(population_size)]
    model = CharLSTM(vocab_size=len(char_to_idx))

    #load checkpoint if it exists
    checkpoint_start_step, data_to_load = load_checkpoint(model=model,optimizer=None,hyperparameters=f"LR{lr}_WD{wd}",subfolder="personal_contribution")
    if data_to_load is not None:
        train_accuracies = data_to_load['train_accuracies']
        train_losses = data_to_load['train_losses']
        client_selection_count = data_to_load['client_selection_count']
        population = [Individual.from_dict(ind) for ind in data_to_load['population']]
    # Create the Server instance:
    server = Server(model,DEVICE, char_to_idx,CHECKPOINTING_PATH )

    shards = server.sharding(dataset)
    client_sizes = [len(shard) for shard in shards]

    for gen in range(checkpoint_start_step,generations):
    #for gen in range(generations):
        # For each of them apply the fed_avg algorithm:
        param_list = []
        averages_acc = []
        average_loss = []
        for individual in population:
            #Update the client selection count
            for client in individual.genome:
                client_selection_count[client] += 1

            resulting_model, acc_res, loss_res = server.train_federated(criterion, lr, MOMENTUM, BATCH_SIZE, wd, individual, shards)
            param_list.append(resulting_model)
            averages_acc.append(acc_res)
            average_loss.append(loss_res)


        #Here we should average all the models to obtain the global model...
        averaged_model,  train_loss, train_accuracy = server.fedavg_aggregate(param_list, [client_size(i, client_sizes) for i in population], average_loss, averages_acc)

        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        # Update the model with the result of the average:
        model.load_state_dict(averaged_model)
        #Just to be sure:
        server.global_model.load_state_dict(averaged_model)

        # DO NOT HAVE A VALIDATION SPLIT, SO...
        # # Then evaluate the validation accuracy of the global model
        # acc, loss = evaluate(model, train_loader)
        # if acc > best_val_acc:
        #         best_val_acc = acc
        #         best_model_state = deepcopy(model.state_dict())

        # val_accuracies.append(acc)
        # val_losses.append(loss)

        if train_loss < best_train_loss:
            best_train_loss = train_loss
            best_model_state = deepcopy(model.state_dict())

        offspring = []
        #Offspring-> offspring size is the same as population size
        elite = sorted(population, key=lambda ind: ind.fitness, reverse=True)[0]
        offspring.append(elite) #Keep the best individual
        for j in range(population_size-1):
            # Crossover
            if random.random() < crossover_probability:
                parent1 = tournament_selection_fittest(population)
                parent2 = tournament_selection_fittest(population)
                offspring.append(Individual.crossover(parent1, parent2))
            else:
                #Mutation
                parent = tournament_selection_weakest(population)
                parent.point_mutation()
                offspring.append(parent)

        # Replace the population with the new offspring
        population = offspring

        #Checkpointing every 10 generations
        if((gen+1)%10==0):
            print(f"Generation {gen+1}, loss {train_loss}")
            checkpoint_data = {
                'train_accuracies': train_accuracies,
                'train_losses': train_losses,
                'client_selection_count': client_selection_count,
                'population': [individual.to_dict() for individual in population]
            }
            save_checkpoint(model, None, gen+1, f"LR{lr}_WD{wd}", subfolder="personal_contribution", checkpoint_data=checkpoint_data)

    model.load_state_dict(best_model_state)
    return model, train_accuracies, train_losses, client_selection_count


In [80]:
# Parameters
lr = 1.0
wd = 0.0001
generations = 200
population_size = 10
num_clients = 2
num_classes = 100
crossover_probability = 0.5

In [81]:
print(BATCH_SIZE)

100


In [84]:
#Best lr and wd found for iid federated baseline: lr=0.1, wd=0.001
global_model = CharLSTM(vocab_size=len(char_to_idx))
global_model,train_accuracies,train_losses,client_selection_count=EA_algorithm(generations=generations,population_size=population_size,num_clients=num_clients,num_classes = num_classes, crossover_probability = crossover_probability, dataset= data, lr =lr , wd = wd, criterion = criterion)
test_accuracy, test_loss = test(global_model, test_loader, criterion, DEVICE)
print("Test accuracy: ",test_accuracy)
plot_metrics(train_accuracies,train_losses,"EA_iid.png")
#plot_client_selection(client_selection_count,"EA_iid_client_selection.png")
#save_data(global_model,val_accuracies,val_losses,train_accuracies,train_losses,client_selection_count,"EA_iid.pth")

No checkpoint found, starting from epoch 1.
Generation 10, loss 3.221942177414894
Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/personal_contribution/model_epoch_10_params_LR1.0_WD0.0001.pth
Generation 20, loss 3.162678799033165
Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/personal_contribution/model_epoch_20_params_LR1.0_WD0.0001.pth
Generation 30, loss 3.1417674511671065
Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/personal_contribution/model_epoch_30_params_LR1.0_WD0.0001.pth
Generation 40, loss 3.123864418268204
Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/personal_contribution/model_epoch_40_params_LR1.0_WD0.0001.pth
Generation 50, loss 3.078569161891937
Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/personal_contribution/model_epoch_50_params_LR1.0_WD0.0001.pth
Generation 60, loss 3.0075629144906997
Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/personal_contribution/model_epoch_60_params_LR1.0_WD0.000

In [53]:
num_clients = len(data['users'])
print(num_clients)

100


In [None]:
# Constants
LOCAL_STEPS_VALUES = [4, 8, 16]  # Values for J (number of local steps)
NUM_RUNDS = {4: 200, 8: 100, 16:50}
lr = 0.01
wd = 0.0001
'''
These hyperparameters are taken from:
Acar, Durmus Alp Emre, et al. "Federated learning based on dynamic regularization." arXiv preprint arXiv:2111.04263 (2021).

Notice infact that the leaf version of the Shakespeare dataset doesn't come with a linked validation dataset to
choose the most accurate hyperparameters.
'''

# Function to perform the training and testing for a given configuration
def run_experiment(local_steps, plot_suffix):
    print(f"Running experiment: local_steps={local_steps}")
    global_model = CharLSTM(vocab_size=len(char_to_idx))
    server = Server(global_model, DEVICE, char_to_idx, CHECKPOINT_DIR)

    #tuning_rounds = int(NUM_RUNDS[local_steps]/20)
    #best_lr, best_wd = to be manually set

    global_model, train_accuracies, train_losses, client_selection_count = server.train_federated(
        criterion, data_loader,
        num_clients=NUM_CLIENTS,
        rounds=NUM_RUNDS[local_steps], lr=lr, momentum=MOMENTUM,
        batchsize=BATCH_SIZE, wd=wd, C=FRACTION_CLIENTS,
        local_steps=local_steps, log_freq=100,
        detailed_print=True, gamma=None  # No skewed sampling for this experiment
    )

    # Testing and plotting
    test_accuracy = test(global_model, test_loader)
    plot_metrics(train_accuracies, train_losses, f"Federated_scaled_{plot_suffix}_LR_{lr}_WD_{wd}.png")
    print(f"Test accuracy for local_steps={local_steps}: {test_accuracy}")

    # Save data for future analysis
    save_data(global_model, train_accuracies, train_losses, client_selection_count, f"Federated_{plot_suffix}_LR_{lr}_WD_{wd}.pth")


local_steps = 16# and 16
print(NUM_CLIENTS)
plot_suffix = f"local_steps_{local_steps}"
run_experiment(local_steps, plot_suffix)

In [29]:
# Constants
LOCAL_STEPS_VALUES = [4, 8, 16]  # Values for J (number of local steps)
NUM_RUNDS = {4: 200, 8: 100, 16:50}
lr = 0.01
wd = 0.0001
'''
These hyperparameters are taken from:
Acar, Durmus Alp Emre, et al. "Federated learning based on dynamic regularization." arXiv preprint arXiv:2111.04263 (2021).

Notice infact that the leaf version of the Shakespeare dataset doesn't come with a linked validation dataset to
choose the most accurate hyperparameters.
'''

# Function to perform the training and testing for a given configuration
def run_experiment(local_steps, plot_suffix):
    print(f"Running experiment: local_steps={local_steps}")
    global_model = CharLSTM(vocab_size=len(char_to_idx))
    server = Server(global_model, DEVICE, char_to_idx, CHECKPOINT_DIR)

    tuning_rounds = int(NUM_RUNDS[local_steps]/20)
    #best_lr, best_wd = to be manually set

    global_model, train_accuracies, train_losses, client_selection_count = server.train_federated(
        criterion, data_loader,
        num_clients=NUM_CLIENTS,
        rounds=NUM_RUNDS[local_steps], lr=lr, momentum=MOMENTUM,
        batchsize=BATCH_SIZE, wd=wd, C=FRACTION_CLIENTS,
        local_steps=local_steps, log_freq=100,
        detailed_print=False, gamma=None  # No skewed sampling for this experiment
    )

    # Testing and plotting
    test_accuracy = test(global_model, test_loader)
    plot_metrics(train_accuracies, train_losses, f"Federated_{plot_suffix}_LR_{lr}_WD_{wd}.png")
    print(f"Test accuracy for local_steps={local_steps}: {test_accuracy}")

    # Save data for future analysis
    save_data(global_model, train_accuracies, train_losses, client_selection_count, f"Federated_{plot_suffix}_LR_{lr}_WD_{wd}.pth")


local_steps = 8# and 16
plot_suffix = f"num_classes_{num_classes}_local_steps_{local_steps}"
run_experiment(local_steps, plot_suffix)


NameError: name 'num_classes' is not defined