## Importing libraries

In [1]:
# Standard library imports
import os
import sys
import json
import random
from collections import OrderedDict

# Third-party library imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader, Subset
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, models
from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


## Save Checkpoint

In [2]:

import os
import torch
# Seed every RNG this notebook draws from (Python, NumPy, PyTorch) so that
# sharding, client selection and weight initialisation are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Dataset download location and root folder for training checkpoints.
DIR_DATA = "./data"
CHECKPOINT_DIR = "./checkpoints"

# Create the checkpoint root once; exist_ok keeps re-running the cell harmless.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

def save_checkpoint(model, optimizer, epoch, hyperparameters, subfolder="", data_to_save=None):
    """Save a checkpoint and its JSON sidecar, then delete the superseded pair.

    Both files are written to ``CHECKPOINT_DIR/subfolder`` and are named
    ``model_epoch_{epoch}_params_{hyperparameters}`` with the extensions
    ``.pth`` (weights) and ``.json`` (``data_to_save``).

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored alongside the
            model, or ``None`` to store only the model state and the epoch.
        epoch: Epoch/round number recorded in the file name and the payload.
        hyperparameters: Text embedded in the file name to tell runs apart.
        subfolder: Sub-directory of ``CHECKPOINT_DIR`` to write into.
        data_to_save: JSON-serialisable object dumped to the sidecar file.
    """
    subfolder_path = os.path.join(CHECKPOINT_DIR, subfolder)
    os.makedirs(subfolder_path, exist_ok=True)

    def _paths_for(at_epoch):
        # Return the (.pth, .json) pair belonging to one epoch of this run.
        stem = f"model_epoch_{at_epoch}_params_{hyperparameters}"
        return (
            os.path.join(subfolder_path, stem + ".pth"),
            os.path.join(subfolder_path, stem + ".json"),
        )

    filepath, filepath_json = _paths_for(epoch)
    # The superseded checkpoint is looked up exactly 10 epochs back.
    previous_paths = _paths_for(epoch - 10)

    # Write the new checkpoint first: if anything below raises, the previous
    # checkpoint is still on disk and training can resume from it.
    state = {'model_state_dict': model.state_dict()}
    if optimizer is not None:
        state['optimizer_state_dict'] = optimizer.state_dict()
    state['epoch'] = epoch
    torch.save(state, filepath)
    print(f"Checkpoint salvato: {filepath}")

    with open(filepath_json, 'w') as json_file:
        json.dump(data_to_save, json_file, indent=4)

    # Only now drop the old pair. Each file is removed on its own so that a
    # stray .pth or .json left behind by an interrupted run is cleaned up too.
    if epoch > 1:
        for stale_path in previous_paths:
            if os.path.exists(stale_path):
                os.remove(stale_path)


def load_checkpoint(model, optimizer, hyperparameters, subfolder=""):
    """Restore the most recent checkpoint saved for ``hyperparameters``.

    Looks in ``CHECKPOINT_DIR/subfolder`` for files named
    ``model_epoch_{N}_params_{hyperparameters}.pth`` and loads the one with
    the highest ``N`` into ``model`` (and into ``optimizer`` when one is given
    and the checkpoint contains optimizer state).

    Args:
        model: Module that receives the stored ``model_state_dict``.
        optimizer: Optimizer to restore, or ``None`` to skip optimizer state.
        hyperparameters: Text identifying the run; must match the file name
            exactly, not merely as a substring.
        subfolder: Sub-directory of ``CHECKPOINT_DIR`` to search.

    Returns:
        tuple: ``(next_epoch, json_data)`` where ``next_epoch`` is the stored
        epoch plus one and ``json_data`` is the content of the JSON sidecar
        (``None`` if it is missing). ``(1, None)`` when no checkpoint exists.
    """
    subfolder_path = os.path.join(CHECKPOINT_DIR, subfolder)
    if not os.path.isdir(subfolder_path):
        print("No checkpoint found, Starting now...")
        return 1, None  # epochs are numbered from 1

    # Match the whole file name rather than a substring, so that e.g. a run
    # tagged "..._ALPHA0.5" does not pick up files of "..._ALPHA0.55".
    prefix = "model_epoch_"
    suffix = f"_params_{hyperparameters}.pth"
    files_by_epoch = {}
    for name in os.listdir(subfolder_path):
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        epoch_text = name[len(prefix):len(name) - len(suffix)]
        if epoch_text.isdigit():
            files_by_epoch[int(epoch_text)] = name

    if not files_by_epoch:
        print("No checkpoint found, Starting now...")
        return 1, None  # epochs are numbered from 1

    latest_file = files_by_epoch[max(files_by_epoch)]
    filepath = os.path.join(subfolder_path, latest_file)
    # Load onto the CPU so a checkpoint written on a GPU machine can be
    # resumed anywhere; load_state_dict copies the values onto the device the
    # model/optimizer already lives on.
    checkpoint = torch.load(filepath, map_location="cpu")

    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer is not None:
        if 'optimizer_state_dict' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        else:
            # Checkpoints written with optimizer=None carry no optimizer state.
            print(f"No optimizer state found in: {latest_file}")

    # Locate and load the JSON sidecar; swap only the trailing extension.
    json_filename = latest_file[:-len('.pth')] + '.json'
    json_filepath = os.path.join(subfolder_path, json_filename)
    json_data = None
    if os.path.exists(json_filepath):
        with open(json_filepath, 'r') as json_file:
            json_data = json.load(json_file)
        print(f"JSON data loaded: {json_filepath}")
    else:
        print(f"No JSON file found for: {latest_file}")

    print(f"Checkpoint found: Resume epoch {checkpoint['epoch'] + 1}")
    return checkpoint['epoch'] + 1, json_data



In [6]:
import random

def generate_skewed_array(target_sum, num_elements, skew_factor=0):
    """
    Split ``target_sum`` into ``num_elements`` random integer parts.

    The proportions are drawn from a symmetric Dirichlet distribution whose
    concentration parameter is ``1 + abs(skew_factor)`` for every element, so
    only the magnitude of ``skew_factor`` matters; its sign has no effect.
    The real-valued shares are floored and the remaining units are handed out
    to the elements with the largest fractional parts.

    Args:
        target_sum (int): Value the returned integers must add up to.
        num_elements (int): Number of integers to return.
        skew_factor (float): Offset added (in absolute value) to the base
                             Dirichlet concentration of 1.

    Returns:
        list: ``num_elements`` integers whose sum is ``target_sum``.

    Raises:
        ValueError: If ``num_elements`` is not positive.
    """
    if num_elements <= 0:
        raise ValueError("num_elements must be greater than 0")

    # Same concentration for every element; negative factors are mirrored.
    concentration = 1 + skew_factor if skew_factor >= 0 else 1 - skew_factor
    proportions = np.random.dirichlet([concentration] * num_elements)

    # Real-valued shares, their integer floors, and what flooring cut off.
    exact_shares = proportions * target_sum
    counts = np.floor(exact_shares).astype(int)
    remainders = exact_shares - counts

    # Units still missing (or in excess) after flooring.
    shortfall = target_sum - np.sum(counts)
    step = 1 if shortfall > 0 else -1

    # Correct one unit at a time, largest fractional part first.
    by_remainder = np.argsort(-remainders)
    for position in range(abs(shortfall)):
        counts[by_remainder[position]] += step

    return counts.tolist()

def generate_vector(target_sum, n):
    """
    Build a list of ``n`` integers, each at least 1, that add up to ``target_sum``.
    Used by the non-IID sharding to split a client's samples across its classes.

    Parameters:
    - target_sum: Total the elements must reach.
    - n: Number of elements to produce.

    Returns:
    - A list of ``n`` integers >= 1 whose sum is ``target_sum``.

    Raises:
    - ValueError: If ``target_sum`` is smaller than ``n``.
    """
    if target_sum < n:
        raise ValueError("Target sum must be at least equal to the number of elements to ensure all elements are positive.")

    # Random proportions: uniform draws rescaled so they add up to 1.
    draws = [random.random() for _ in range(n)]
    draw_total = sum(draws)

    # Truncate each share to an integer, never letting an element drop below 1.
    vector = [max(1, int(target_sum * (draw / draw_total))) for draw in draws]

    # Truncation and the floor of 1 leave the total slightly off; nudge random
    # elements by one unit until the total is exact.
    running_total = sum(vector)
    while running_total != target_sum:
        slot = random.randint(0, n - 1)
        if running_total < target_sum:
            vector[slot] += 1
            running_total += 1
        elif vector[slot] > 1:  # only shrink elements that stay >= 1
            vector[slot] -= 1
            running_total -= 1

    return vector



# CIFAR100 Dataset

In [7]:
class CIFAR100Dataset(Dataset):
    """CIFAR-100 held in a DataFrame, with optional per-client sharding.

    ``self.data`` has one row per sample with the columns ``image`` (PIL image)
    and ``label``; after sharding it also has ``client_id``.
    """

    # Per-channel statistics used by the default normalisation.
    _MEAN = [0.5071, 0.4865, 0.4409]
    _STD = [0.2673, 0.2564, 0.2762]

    def __init__(self, root, split='train', transform=None, sharding=None, K=10, Nc=2, skewed_factor = None):
        """
        CIFAR-100 Dataset with IID and non-IID sharding.

        Args:
        - root (str): Directory to store the dataset.
        - split (str): 'train' or 'test'.
        - transform (callable): Transformations applied to the images; when
          None a default pipeline is chosen based on ``split``.
        - sharding (str): 'iid' or 'niid'; only applied to the 'train' split.
        - K (int): Number of clients for the sharding.
        - Nc (int): Number of classes per client (used for non-iid sharding).
        - skewed_factor (float): Skew factor forwarded to
          ``generate_skewed_array`` when sizing the non-IID clients; None
          means 0.
        """
        self.root = root
        self.split = split
        self.transform = transform
        self.sharding = sharding
        self.K = K
        self.Nc = Nc
        self.skewed_factor = skewed_factor

        # Default transformations if none are provided
        if self.transform is None:
            self.transform = self._default_transform(self.split)

        dataset = datasets.CIFAR100(
            root=self.root,
            train=(self.split == 'train'),
            download=True
        )

        # Read every sample exactly once; each dataset[i] access builds a new
        # PIL image, so fetching image and label separately doubles the work.
        samples = [dataset[i] for i in range(len(dataset))]
        self.data = pd.DataFrame({
            "image": [image for image, _ in samples],
            "label": [label for _, label in samples]
        })

        if self.split == 'train' and self.sharding:
            self.data = self._apply_sharding()

    @classmethod
    def _default_transform(cls, split):
        """Return the default pipeline: augmentation for 'train', plain otherwise."""
        steps = []
        if split == 'train':
            steps.append(transforms.RandomHorizontalFlip())  # random horizontal flip
            steps.append(transforms.RandomRotation(10))
        steps.append(transforms.ToTensor())  # PIL image -> PyTorch tensor
        steps.append(transforms.Normalize(mean=cls._MEAN, std=cls._STD))
        return transforms.Compose(steps)

    def _apply_sharding(self):
        """Apply IID or non-IID sharding to the training data."""
        if self.sharding == 'iid':
            return self._iid_sharding()
        elif self.sharding == 'niid':
            return self._non_iid_sharding()
        else:
            raise ValueError("Sharding must be 'iid' or 'niid'.")

    def _iid_sharding(self):
        """Split data IID: uniformly distribute samples across K clients."""
        data_split = []
        indices = self.data.index.tolist()
        random.shuffle(indices)

        # Deal the shuffled indices round-robin so client sizes differ by at most one.
        client_indices = [indices[i::self.K] for i in range(self.K)]

        for client_id, idxs in enumerate(client_indices):
            client_data = self.data.loc[idxs].copy()
            client_data['client_id'] = client_id
            data_split.append(client_data)

        return pd.concat(data_split, ignore_index=True)

    def _non_iid_sharding(self):
        """Non-IID sharding with a fixed number of classes per client.

        Every client gets ``Nc`` randomly chosen classes and a randomly sized
        share of the data. Rows are drawn WITH replacement, so the same image
        can appear several times, both within one client and across clients;
        the result is not a partition of the original training set.
        """
        labels = self.data['label'].unique()
        skew = 0 if self.skewed_factor is None else self.skewed_factor
        samples_per_client = generate_skewed_array(len(self.data), self.K, skew)

        min_samples_per_class = 5  # lower bound on the samples drawn per class
        data_split = []
        for client_id in range(self.K):
            # Pick Nc distinct classes for this client.
            client_classes = np.random.choice(labels, size=self.Nc, replace=False)

            # Raise the client's budget if it cannot cover the per-class minimum.
            total_samples = max(samples_per_client[client_id], min_samples_per_class * len(client_classes))

            # Split the budget across the chosen classes.
            samples_per_class = generate_vector(total_samples, len(client_classes))

            # replace=True is required: a client's share of one class can
            # exceed the number of images that class has.
            class_samples = [
                self.data[self.data['label'] == class_].sample(n=count, replace=True)
                for class_, count in zip(client_classes, samples_per_class)
            ]
            # Concatenate once per client instead of growing a frame row-block by row-block.
            client_data = pd.concat(class_samples)
            client_data['client_id'] = client_id
            data_split.append(client_data)

        # Stack all clients into a single frame with a fresh 0..N-1 index.
        return pd.concat(data_split, ignore_index=True)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the row at position ``index``, transform applied."""
        row = self.data.iloc[index]
        image, label = row['image'], row['label']

        if self.transform:
            image = self.transform(image)
        return image, label

    def __len__(self):
        """Return the number of rows currently held in ``self.data``."""
        return len(self.data)


## LeNet-5

In [8]:
class LeNet5(nn.Module):
    """LeNet-5 style CNN: two conv/pool stages followed by three linear layers.

    ``forward`` returns log-probabilities (``log_softmax`` over the class axis).
    """

    def __init__(self,num_classes=100):
        super().__init__()

        # Feature extractor: (5x5 conv -> ReLU -> 2x2 max-pool) twice.
        feature_stages = [
            nn.Conv2d(3, 64, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 64, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]
        self.conv_layer = nn.Sequential(*feature_stages)

        # For a 3x32x32 input the feature map is 64x5x5
        # (32 -> 28 -> 14 -> 10 -> 5 along each spatial axis).
        flat_features = 64 * 5 * 5
        classifier_stages = [
            nn.Linear(flat_features, 384),
            nn.ReLU(),
            nn.Linear(384, 192),
            nn.ReLU(),
            nn.Linear(192, num_classes),  # one output per class
        ]
        self.fc_layer = nn.Sequential(*classifier_stages)

    def forward(self, x):
        """Map a batch of images to per-class log-probabilities."""
        features = self.conv_layer(x)
        # Collapse channel and spatial axes, keeping the batch axis.
        flat = features.view(features.size(0), -1)
        logits = self.fc_layer(flat)
        return F.log_softmax(logits, dim=1)

In [9]:
def evaluate_model(model, validation_loader, criterion, device):
    """Evaluate ``model`` on every batch of ``validation_loader``.

    The model is switched to eval mode and gradients are disabled.

    Returns:
        tuple: ``(mean_batch_loss, accuracy)`` where the loss is the average
        of the per-batch criterion values (each batch weighs the same) and
        accuracy is the fraction of correctly classified samples.
    """
    model.eval()
    batch_losses = []
    hits = 0
    seen = 0

    with torch.no_grad():
        for inputs, targets in validation_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            batch_losses.append(criterion(outputs, targets).item())
            # Index of the highest score along the class axis is the prediction.
            predicted = outputs.max(1)[1]
            hits += (predicted == targets).sum().item()
            seen += targets.size(0)

    mean_loss = sum(batch_losses) / len(validation_loader)
    return mean_loss, hits / seen

## FL Baselines

In [10]:
def plot_selected_clients_distribution(selected_clients_per_round, num_clients, hyperparameters):
    """Plot, save and show how many times each client was selected.

    ``selected_clients_per_round`` is an iterable of per-round collections of
    client indices; the chart is written to
    ``CIFAR100_Client_distribution_{hyperparameters}.png``.
    """
    # Tally selections over all rounds.
    selection_counts = np.zeros(num_clients)
    all_picks = [client for round_picks in selected_clients_per_round for client in round_picks]
    for client in all_picks:
        selection_counts[client] += 1

    client_ids = range(num_clients)
    output_name = f"CIFAR100_Client_distribution_{hyperparameters}.png"

    plt.figure(figsize=(10, 6))
    plt.bar(client_ids, selection_counts, color='skyblue', edgecolor='black')
    plt.title("Distribuzione dei Client Selezionati Durante il Federated Averaging")
    plt.xlabel("Client ID")
    plt.ylabel("Frequenza di Selezione")
    plt.grid(axis='y')
    plt.savefig(output_name)
    plt.show()


class Client:
    """A federated-learning participant that trains a model on its own data."""

    def __init__(self, model, client_id, data, optimizer_params):
        """
        Args:
            model: Module trained by this client (may be shared with other
                clients and the server; see ``train``).
            client_id: Identifier reported back to the server.
            data: Dataset of ``(input, target)`` pairs owned by this client.
            optimizer_params: Dict with the SGD settings ``'lr'``,
                ``'momentum'`` and ``'weight_decay'``.
        """
        self.client_id = client_id
        self.data = data
        self.model = model
        self.optimizer_params = optimizer_params

    def _snapshot_weights(self):
        """Return a copy of the model weights that is detached from the live model."""
        return {name: tensor.detach().clone() for name, tensor in self.model.state_dict().items()}

    def train(self, global_weights, local_steps, batch_size):
        """Run ``local_steps`` SGD mini-batch updates starting from ``global_weights``.

        Args:
            global_weights: State dict loaded into the model before training.
            local_steps: Number of optimizer steps to take; the data is
                cycled through again if one pass has fewer batches.
            batch_size: Mini-batch size for the local DataLoader.

        Returns:
            dict: An independent copy of the trained weights. ``state_dict()``
            itself only hands out references to the live parameters, which
            would silently change when the same model object is trained again
            (e.g. by the next client sharing it).
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(device)
        self.model.load_state_dict(global_weights)

        # With no data the loop below would never advance `steps` and spin
        # forever; hand back the unchanged global weights instead.
        if local_steps <= 0 or len(self.data) == 0:
            return self._snapshot_weights()

        # Evaluation leaves the model in eval mode; switch back for training.
        self.model.train()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(
            self.model.parameters(),
            lr=self.optimizer_params['lr'],
            momentum=self.optimizer_params['momentum'],
            weight_decay=self.optimizer_params['weight_decay']
        )
        trainloader = DataLoader(self.data, batch_size=batch_size, shuffle=True, pin_memory=True)

        steps = 0
        while steps < local_steps:
            for inputs, targets in trainloader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                steps += 1
                if steps >= local_steps:
                    break
        return self._snapshot_weights()




In [11]:
class Server:
    """Federated-averaging coordinator: selects clients, aggregates, evaluates."""

    def __init__(self, model, clients, test_data, val_data):
        """
        Args:
            model: Global model updated after every round.
            clients: Sequence of client objects exposing ``client_id``,
                ``data`` and ``train(global_weights, local_steps, batch_size)``.
            test_data: Dataset used for the final test evaluation.
            val_data: Dataset used for the periodic validation.
        """
        self.model = model
        self.clients = clients
        self.val_data = val_data
        self.test_data = test_data
        self.round_losses = []
        self.round_accuracies = []
        self.selected_clients_per_round = []  # client ids picked in each round

    def client_selection(self, num_clients, fraction, probabilities=None):
        """Draw a subset of client indices without replacement.

        Args:
            num_clients: Size of the client pool.
            fraction: Share of the pool to select per round.
            probabilities: Per-client selection probabilities, or ``None``
                for a uniform draw.

        Returns:
            numpy.ndarray: Selected indices. At least one client is chosen
            whenever the pool is non-empty, because a round with no clients
            would aggregate nothing.
        """
        num_clients_to_select = min(num_clients, max(1, int(num_clients * fraction)))
        return np.random.choice(num_clients, num_clients_to_select, replace=False, p=probabilities)

    @staticmethod
    def _label_entropy(counts):
        """Return the Shannon entropy (natural log) of a label histogram; 0 if empty."""
        total = counts.sum()
        if total == 0:
            return 0.0
        # Only non-empty bins contribute; this keeps the value finite and >= 0.
        shares = counts[counts > 0] / total
        return max(0.0, float(-np.sum(shares * np.log(shares))))

    @staticmethod
    def _normalize(values):
        """Scale non-negative values to sum to 1; fall back to uniform if they sum to 0."""
        values = np.asarray(values, dtype=float)
        total = values.sum()
        if values.size == 0 or total <= 0:
            return np.full(values.shape, 1.0 / max(values.size, 1))
        return values / total

    def evaluate_clients(self, alpha):
        """
        Score clients by label diversity and dataset size.

        The score is ``alpha * normalised_entropy + (1 - alpha) * normalised_size``.
        Label histograms use at least 100 bins (``minlength=100``).

        NOTE: labels are obtained by iterating ``client.data``, which loads
        every sample of every client once.

        Returns:
            numpy.ndarray: Selection probabilities (non-negative, summing to 1)
            favouring clients with diverse data and larger datasets.
        """
        # Count how often each label occurs in every client's data.
        label_counts = [
            np.bincount(np.asarray([label for _, label in client.data], dtype=np.int64), minlength=100)
            for client in self.clients
        ]
        normalized_entropies = self._normalize([self._label_entropy(counts) for counts in label_counts])
        normalized_data_sizes = self._normalize([len(client.data) for client in self.clients])

        combined_scores = alpha * normalized_entropies + (1 - alpha) * normalized_data_sizes
        return self._normalize(combined_scores)

    def federated_averaging(self, local_steps, batch_size, num_rounds, fraction_fit, alpha = None, hyperparameters = None):
        """Run FedAvg, resuming from the latest checkpoint when one exists.

        Args:
            local_steps: Optimizer steps each selected client performs per round.
            batch_size: Batch size for client training and for evaluation.
            num_rounds: Index of the last round to run (rounds start at 1).
            fraction_fit: Share of clients selected in every round.
            alpha: Entropy/size trade-off passed to ``evaluate_clients``;
                ``None`` selects clients uniformly.
            hyperparameters: Text identifying the run in checkpoint and
                figure file names.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(device)

        # Validation and checkpointing happen every `eval_every` rounds; this
        # equals the 10-epoch offset save_checkpoint uses to find the
        # checkpoint it supersedes.
        eval_every = 10
        subfolder = "Federated_Uniform/"

        # Resume from a checkpoint if one exists.
        start_epoch, data_to_load = load_checkpoint(self.model, optimizer=None, hyperparameters=hyperparameters, subfolder=subfolder)
        if data_to_load is not None:
            self.round_losses = data_to_load['round_losses']
            self.round_accuracies = data_to_load['round_accuracies']
            self.selected_clients_per_round = data_to_load['selected_clients_per_round']

        # Entropy/size based probabilities, or None for a uniform draw.
        probabilities = self.evaluate_clients(alpha) if alpha is not None else None

        criterion = nn.CrossEntropyLoss()
        val_loader = DataLoader(self.val_data, batch_size=batch_size, shuffle=False, pin_memory=True)
        test_loader = DataLoader(self.test_data, batch_size=batch_size, shuffle=False, pin_memory=True)

        for round_idx in range(start_epoch, num_rounds + 1):
            selected_clients = self.client_selection(len(self.clients), fraction_fit, probabilities)
            self.selected_clients_per_round.append([self.clients[client_idx].client_id for client_idx in selected_clients])

            # state_dict() returns references to the live parameters. Clients
            # may train the very same model object, so take a real copy;
            # otherwise each client would start from the previous client's
            # result instead of from the global model.
            global_weights = {key: value.detach().clone() for key, value in self.model.state_dict().items()}

            total_data_size = sum(len(self.clients[client_idx].data) for client_idx in selected_clients)
            if total_data_size > 0:
                new_global_weights = {key: torch.zeros_like(value, dtype=torch.float32) for key, value in global_weights.items()}

                # Simulate parallel clients: train one after the other, each
                # from the same global snapshot, and fold the result into the
                # weighted average immediately, before the next client can
                # overwrite a shared model.
                for client_idx in selected_clients:
                    client = self.clients[client_idx]
                    if len(client.data) == 0:
                        continue  # contributes with weight 0
                    trained_weights = client.train(global_weights, local_steps, batch_size)
                    scaling_factor = len(client.data) / total_data_size
                    for key, value in trained_weights.items():
                        new_global_weights[key] += scaling_factor * value

                # Update global model weights
                self.model.load_state_dict(new_global_weights)

            # Validate and checkpoint the global model every `eval_every` rounds.
            if round_idx % eval_every == 0:
                loss, accuracy = evaluate_model(self.model, val_loader, criterion, device)

                self.round_losses.append(loss)
                self.round_accuracies.append(accuracy)
                print(f"Round {round_idx}/{num_rounds} - Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

                data_to_save = {
                    'round_losses': self.round_losses,
                    'round_accuracies': self.round_accuracies,
                    # only the client ids are serialised
                    'selected_clients_per_round': [list(round_clients) for round_clients in self.selected_clients_per_round],
                }

                save_checkpoint(self.model, None, round_idx, hyperparameters, subfolder, data_to_save)

        print("Evaluation on test set...")
        loss, accuracy = evaluate_model(self.model, test_loader, criterion, device)
        print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

        # Metrics were recorded at rounds eval_every, 2*eval_every, ...; build
        # the x axis from what was actually recorded so it always matches the
        # history length, even when num_rounds is not a multiple of eval_every.
        evaluated_rounds = [eval_every * (i + 1) for i in range(len(self.round_losses))]

        plt.figure(figsize=(16, 10))

        # Validation loss
        plt.subplot(2, 2, 1)
        plt.plot(evaluated_rounds, self.round_losses, label='Validation Loss')
        plt.xlabel('Round')
        plt.ylabel('Loss')
        plt.title('Validation Loss per Round')
        plt.legend()

        # Validation accuracy
        plt.subplot(2, 2, 2)
        plt.plot(evaluated_rounds, self.round_accuracies, label='Validation Accuracy')
        plt.xlabel('Round')
        plt.ylabel('Accuracy')
        plt.title('Validation Accuracy per Round')
        plt.legend()

        plt.tight_layout()
        file_name = f"CIFAR100_fedavg_uniform_{hyperparameters}.jpg"
        plt.savefig(file_name)
        plt.show()

        plot_selected_clients_distribution(self.selected_clients_per_round, len(self.clients), hyperparameters)




# Run the contrib experiments
Hyperparameters can be changed here. Run this cell to start the experiments.

In [None]:
# Experiment configuration. K and C are marked as fixed by the author.
K = 100 # number of clients (fixed)
LOCAL_STEPS = 4 # J: optimizer steps each client runs per round
ROUNDS = 2000
C = 0.1 # fraction of clients selected per round (fixed)
BATCH_SIZE = 64
LR = 0.001
MOMENTUM = 0
WEIGHT_DECAY = 1e-4
ALPHA=0.5  # weight of the entropy term in Server.evaluate_clients

# SGD settings handed to every Client.
optimizer_params = {
      "lr": LR,
      "momentum": MOMENTUM,
      "weight_decay": WEIGHT_DECAY
  }

model_cifar = LeNet5(100)

# Non-IID sharded training data (5 classes per client) and the plain test split.
train_dataset_big = CIFAR100Dataset(DIR_DATA, split="train", sharding="niid", K=K, Nc=5)
test_dataset = CIFAR100Dataset(DIR_DATA, split="test")

# Stratified 80/20 training/validation split over the rows of the sharded frame.
# NOTE(review): the 'niid' sharding samples rows with replacement, so copies of
# the same image can end up on both sides of this split.
train_indices, validation_indices = train_test_split(
    range(len(train_dataset_big)), test_size=0.2, random_state=42,stratify=train_dataset_big.data["label"]
)

# Training and validation subsets.
# NOTE(review): both subsets wrap train_dataset_big, so validation samples go
# through the same (training) transform as the training samples.
train_dataset = Subset(train_dataset_big, train_indices)
validation_dataset = Subset(train_dataset_big, validation_indices)

# Map a row position in train_dataset_big to its position inside train_dataset.
original_to_subset = {original_idx: subset_idx for subset_idx, original_idx in enumerate(train_indices)}

# Build the clients.
clients = []
for i in range(K):
    # Rows of the full sharded frame that belong to client i.
    client_original_indices = train_dataset_big.data[
        train_dataset_big.data["client_id"] == i
    ].index

    # Keep only rows that fell into the training split, translated to subset positions.
    client_subset_indices = [original_to_subset[idx] for idx in client_original_indices if idx in original_to_subset]

    # Client-specific view of the training subset. Every client is given the
    # same model_cifar object, which is also the server's global model.
    client_data = Subset(train_dataset, client_subset_indices)
    clients.append(Client(model_cifar, i, client_data, optimizer_params))



server_uniform = Server(model_cifar, clients, test_dataset, validation_dataset)
# Tag embedded in checkpoint and figure file names for this configuration.
hyperparameters = f"ENTROPY_CONTRIB_BS{BATCH_SIZE}_LR{LR}_M{MOMENTUM}_WD{WEIGHT_DECAY}_J{LOCAL_STEPS}_C{C}_ALPHA{ALPHA}"
server_uniform.federated_averaging(local_steps=LOCAL_STEPS, batch_size=BATCH_SIZE, num_rounds=ROUNDS, alpha=ALPHA, fraction_fit=C,hyperparameters=hyperparameters)
