In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np
from tqdm import tqdm


# Define the neural network architecture
class Net(nn.Module):
    """Small CNN classifier: three strided convolutions plus a two-layer head.

    Each convolution uses a 3x3 kernel with stride 2 and padding 1, so it
    roughly halves the spatial size while growing the channel count
    (1 -> 32 -> 64 -> 128). ``fc1`` expects 2048 flattened features, i.e. a
    128 x 4 x 4 feature map, which is what a 1 x 28 x 28 input produces.

    ``forward`` returns log-probabilities over 10 classes, so the matching
    loss is ``F.nll_loss``.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: every layer downsamples by 2 via its stride.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
        # Classifier head: 2048 flattened features -> 128 hidden -> 10 classes.
        self.fc1 = nn.Linear(2048, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Map a batch of images to per-class log-probabilities."""
        # Convolution + ReLU, applied three times in sequence.
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))
        # Collapse everything except the batch dimension before the head.
        hidden = F.relu(self.fc1(x.flatten(1)))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)


# Function to update the client's model using local training data
def client_update(client_model, optimizer, train_loader, epoch=5):
    """Train one client's model on its local data.

    Args:
        client_model: The ``nn.Module`` to train in place. Its forward pass
            must return log-probabilities, because the loss is ``F.nll_loss``.
        optimizer: Optimizer bound to ``client_model``'s parameters.
        train_loader: Iterable yielding ``(data, target)`` batches.
        epoch: Number of full passes over ``train_loader``.

    Returns:
        float: Loss of the last processed batch, or ``0.0`` when no batch was
        processed (``epoch <= 0`` or an empty ``train_loader``).
    """
    # Put the model that is actually being trained into training mode.
    # (Previously this toggled whatever global happened to be named `model`.)
    client_model.train()
    loss = None  # stays None if the loops below never run
    for _ in range(epoch):
        for data, target in train_loader:
            optimizer.zero_grad()  # gradients accumulate by default; clear them
            output = client_model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
    # Only the final batch's loss is reported, not an epoch average.
    return loss.item() if loss is not None else 0.0


# Function to aggregate the models from all clients into the global model
def server_aggregate(global_model, client_models):
    """Average the clients' weights into the global model and re-sync clients.

    Every entry of the state dict is replaced by the unweighted element-wise
    mean across ``client_models``. The result is loaded into ``global_model``
    and then copied back into every client model, so all models hold the same
    state when the function returns.

    Args:
        global_model: ``nn.Module`` that receives the averaged state.
        client_models: Sequence of ``nn.Module`` objects with the same
            architecture (identical state-dict keys and shapes).
    """
    global_dict = global_model.state_dict()
    # Snapshot each client's state once instead of rebuilding it for every key.
    client_states = [client.state_dict() for client in client_models]

    for key in global_dict.keys():
        stacked = torch.stack([state[key] for state in client_states], 0)
        if stacked.is_floating_point() or stacked.is_complex():
            global_dict[key] = stacked.mean(0)
        else:
            # torch.mean rejects integer tensors (e.g. BatchNorm's
            # num_batches_tracked), so average in floating point and cast
            # back to the original dtype (fractional part is truncated).
            global_dict[key] = stacked.to(torch.float32).mean(0).to(stacked.dtype)

    global_model.load_state_dict(global_dict)

    # Broadcast the aggregated state so the next round starts from it.
    for client in client_models:
        client.load_state_dict(global_model.state_dict())


# Function to evaluate the global model on the test data
def test(global_model, test_loader):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0
    correct = 0
    with torch.no_grad():  # Disable gradient computation for evaluation
        for data, target in test_loader:  # Iterate over the test data
            data, target = data, target  # No need to move to CUDA as we're using CPU
            output = global_model(data)  # Forward pass through the model
            # Accumulate the total loss across all test batches
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            # Get the index of the max log-probability (predicted class)
            pred = output.argmax(dim=1, keepdim=True)
            # Count the number of correct predictions
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)  # Average loss over the dataset
    acc = correct / len(test_loader.dataset)  # Calculate accuracy

    return test_loss, acc  # Return the test loss and accuracy


In [6]:
# IID (Independent and Identically Distributed) case:
# The training set is split uniformly at random, so each client's shard is a
# random sample of the whole dataset rather than being restricted to
# particular classes.

# Hyperparameters
num_clients = 100  # Total number of clients (= number of data shards)
num_selected = 10  # Number of clients selected per round
num_rounds = 5     # Number of federated learning rounds
epochs = 5         # Number of local training epochs for each client
batch_size = 32    # Batch size for training (also reused for the test loader)

# Creating decentralized datasets
# Load the MNIST training set; images are converted to tensors and normalized
# with mean 0.1307 and std 0.3081.
traindata = datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
                       )
# Randomly split the training data into num_clients equally sized shards.
# random_split requires the lengths to sum to the dataset size, so this relies
# on the number of samples being divisible by num_clients.
traindata_split = torch.utils.data.random_split(traindata, 
                      [int(traindata.data.shape[0] / num_clients) for _ in range(num_clients)])

# One shuffling data loader per client shard (index = client id)
train_loader = [torch.utils.data.DataLoader(x, batch_size=batch_size, shuffle=True) for x in traindata_split]

# Data loader over the MNIST test split, used to evaluate the global model.
# It applies the same normalization as the training data.
test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
        ), batch_size=batch_size, shuffle=True)

# Instantiate models and optimizers
# Global model that receives the averaged client weights (kept on CPU)
global_model = Net()  # No .cuda() since you're using CPU
# Only num_selected model instances are created: they are reusable "slots"
# for whichever clients are sampled in a round, not one model per client.
client_models = [Net() for _ in range(num_selected)]  # All models stay on CPU

# Start every client slot from the global model's initial weights
for model in client_models:
    model.load_state_dict(global_model.state_dict())

# One plain SGD optimizer (lr=0.1) per client slot
opt = [optim.SGD(model.parameters(), lr=0.1) for model in client_models]

# Running Federated Learning (FL)
for r in range(num_rounds):  # One iteration per communication round
    # Sample num_selected distinct client ids for this round
    client_idx = np.random.permutation(num_clients)[:num_selected]  # Randomly permute client indices and select

    # Client update phase
    loss = 0  # Sum of the clients' reported losses, averaged when printing
    for i in range(num_selected):  # Iterate over the selected clients
        # Slot i trains on the shard of the i-th sampled client.
        # client_update returns the loss of the last batch it processed.
        loss += client_update(client_models[i], opt[i], train_loader[client_idx[i]], epoch=epochs)
    
    # Server aggregation phase
    # Average the slot models into the global model; server_aggregate also
    # copies the averaged weights back into every slot for the next round.
    server_aggregate(global_model, client_models)
    # Evaluate the aggregated global model on the test set
    test_loss, acc = test(global_model, test_loader)
    
    # Report this round's metrics
    print('%d-th round' % r)
    print('average train loss %0.3g | test loss %0.3g | test acc: %0.3f' % (loss / num_selected, test_loss, acc))


0-th round
average train loss 0.486 | test loss 0.439 | test acc: 0.880
1-th round
average train loss 0.151 | test loss 0.253 | test acc: 0.924
2-th round
average train loss 0.126 | test loss 0.205 | test acc: 0.943
3-th round
average train loss 0.0292 | test loss 0.175 | test acc: 0.953
4-th round
average train loss 0.0113 | test loss 0.157 | test acc: 0.956


In [9]:
# NON-IID (Non-Independent and Identically Distributed) case:
# Each client has images of only two categories from the pairs
# [0, 1], [2, 3], [4, 5], [6, 7], or [8, 9].

# Hyperparameters
num_clients = 100  # Nominal number of clients; sets the shard size below
num_selected = 5   # Number of clients selected per round
num_rounds = 10    # Number of federated learning rounds
epochs = 5         # Number of local training epochs for each client
batch_size = 32    # Batch size for training (also reused for the test loader)

# Creating decentralized datasets
# Load the MNIST training set; images are converted to tensors and normalized
# with mean 0.1307 and std 0.3081.
traindata = datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
                       )

# Boolean mask of shape (10, num_samples): row i marks the samples of class i
target_labels = torch.stack([traindata.targets == i for i in range(10)])

# Target shard size, derived from the dataset instead of a hard-coded 60000
shard_size = len(traindata) // num_clients

# Build the shards: for each class pair, collect the indices of all samples
# belonging to either class and cut them into chunks of shard_size.
target_labels_split = []
for i in range(5):  # Loop over the 5 class pairs
    pair_indices = torch.where(target_labels[(2 * i):(2 * (i + 1))].sum(0))[0]
    # torch.split keeps a shorter trailing chunk when the pair's sample count
    # is not a multiple of shard_size, so the total number of shards can end
    # up larger than num_clients.
    target_labels_split += torch.split(pair_indices, shard_size)

# One Subset per shard, each restricted to a single class pair
traindata_split = [torch.utils.data.Subset(traindata, tl) for tl in target_labels_split]

# One shuffling data loader per shard (index = client id)
train_loader = [torch.utils.data.DataLoader(x, batch_size=batch_size, shuffle=True) for x in traindata_split]

# Data loader over the MNIST test split, used to evaluate the global model
test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
        ), batch_size=batch_size, shuffle=True)

# Instantiate models and optimizers
# Global model that receives the averaged client weights (kept on CPU)
global_model = Net()
# Only num_selected model instances are created: they are reusable "slots"
# for whichever clients are sampled in a round, not one model per client.
client_models = [Net() for _ in range(num_selected)]

# Start every client slot from the global model's initial weights
for model in client_models:
    model.load_state_dict(global_model.state_dict())

# One plain SGD optimizer (lr=0.1) per client slot
opt = [optim.SGD(model.parameters(), lr=0.1) for model in client_models]

# Running Federated Learning (FL)
for r in range(num_rounds):  # One iteration per communication round
    # Sample from the shards that actually exist. Sampling from
    # range(num_clients) would never pick the shards past index
    # num_clients - 1, which all come from the last class pair, so that data
    # would silently be excluded from training.
    client_idx = np.random.permutation(len(train_loader))[:num_selected]

    # Client update phase
    loss = 0  # Sum of the clients' reported losses, averaged when printing
    for i in range(num_selected):  # Iterate over the selected clients
        # Slot i trains on the shard of the i-th sampled client.
        # client_update returns the loss of the last batch it processed.
        loss += client_update(client_models[i], opt[i], train_loader[client_idx[i]], epoch=epochs)

    # Server aggregation phase
    # Average the slot models into the global model; server_aggregate also
    # copies the averaged weights back into every slot for the next round.
    server_aggregate(global_model, client_models)
    # Evaluate the aggregated global model on the test set
    test_loss, acc = test(global_model, test_loader)

    # Report this round's metrics
    print('%d-th round' % r)
    print('average train loss %0.3g | test loss %0.3g | test acc: %0.3f' % (loss / num_selected, test_loss, acc))


0-th round
average train loss 0.0141 | test loss 5.76 | test acc: 0.197
1-th round
average train loss 0.0489 | test loss 2.77 | test acc: 0.198
2-th round
average train loss 0.000319 | test loss 4.26 | test acc: 0.198
3-th round
average train loss 0.094 | test loss 1.86 | test acc: 0.268
4-th round
average train loss 0.0109 | test loss 4.86 | test acc: 0.185
5-th round
average train loss 0.00609 | test loss 1.2 | test acc: 0.750
6-th round
average train loss 0.00885 | test loss 1.17 | test acc: 0.621
