# Import

In [4]:
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import io
import json
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from torch.backends import cudnn
import time
import numpy as np
from statistics import mean
import torch.optim as optim
from copy import deepcopy
import logging
from torch.utils.data import Subset
import tkinter as tk
from tkinter import filedialog
from Client import Client
from Server import Server
from Model import CharLSTM


# Parameters

In [5]:
# Configuration shared by the federated-learning experiments
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(DEVICE)

FRACTION_CLIENTS = 0.1  # C: fraction of clients selected per round
BATCH_SIZE = 100        # batch size for local training
MOMENTUM = 0.0          # momentum for the SGD optimizer
LOG_FREQUENCY = 10      # frequency of logging training progress
criterion = nn.CrossEntropyLoss()

cpu


# DataLoading

We must import the dataset manually since it is taken from the LEAF project.

For now, the procedure is to go to the Shakespeare data folder of LEAF and run:
1. `./get_data.sh` inside the `preprocess` folder
2. `./data_to_json.sh`
3. `cd ..`
4. `./preprocess.sh -s niid --sf 0.2 -k 0 -t sample -tf 0.8` (adjust the flags to your preferences)

Please upload the training dataset provided by LEAF here.

In [6]:
# Ask the user for the LEAF training split through a file-selection dialog.
root = tk.Tk()
#root.withdraw()  # kept disabled as in the original cell; enabling it would hide the empty root window

try:
    file_path = filedialog.askopenfilename(filetypes=[("JSON files", "*.json")])

    # A cancelled dialog yields an empty path. Fail right here instead of leaving
    # `data` undefined (or stale from a previous run) for the cells below.
    if not file_path:
        raise FileNotFoundError("No training JSON file was selected.")

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
finally:
    # Always tear down the Tk root, even if reading or parsing the file failed.
    root.destroy()

Please upload the test dataset provided by LEAF.

In [7]:
# Ask the user for the LEAF test split through a file-selection dialog.
root = tk.Tk()
#root.withdraw()  # kept disabled as in the original cell; enabling it would hide the empty root window

try:
    file_path = filedialog.askopenfilename(filetypes=[("JSON files", "*.json")])

    # A cancelled dialog yields an empty path. Fail right here instead of leaving
    # `test_data` undefined (or stale from a previous run) for the cells below.
    if not file_path:
        raise FileNotFoundError("No test JSON file was selected.")

    with open(file_path, 'r', encoding='utf-8') as file:
        test_data = json.load(file)
finally:
    # Always tear down the Tk root, even if reading or parsing the file failed.
    root.destroy()

In [8]:
# One federated client per user listed in the training JSON
NUM_CLIENTS = len(data['users'])
num_clients = NUM_CLIENTS
print("Number of clients:", num_clients)

Number of clients: 100


In [9]:
# Bind the three top-level entries of the training JSON to their own names
users, num_samples, user_data = (
    data[key] for key in ('users', 'num_samples', 'user_data')
)

In [10]:
# Concatenate every input sequence ('x') of every user into one string
all_texts = ''.join(
    ''.join(sequence)
    for user in users
    for sequence in user_data[user]['x']
)

# Vocabulary: the distinct characters in sorted order, each mapped to its position
chars = sorted(set(all_texts))
char_to_idx = dict(zip(chars, range(len(chars))))

# The padding token takes the next free index, right after the real characters
char_to_idx['<pad>'] = len(char_to_idx)

# Reverse lookup: index -> character
idx_to_char = {index: symbol for symbol, index in char_to_idx.items()}

## Convert data into indices

In [11]:
# Encode only the first sample ([0]) of each user: its 'x' sequence and its 'y' target
inputs = [
    list(map(char_to_idx.__getitem__, user_data[user]['x'][0]))
    for user in users
]
targets = [
    list(map(char_to_idx.__getitem__, user_data[user]['y'][0]))
    for user in users
]

## Creation of TensorDataset and DataLoader

In [12]:
# Rebuild the vocabulary from the concatenated text (same construction as in the earlier cell)
chars = sorted(set(all_texts))
char_to_idx = dict(zip(chars, range(len(chars))))
char_to_idx['<pad>'] = len(char_to_idx)
idx_to_char = {index: symbol for symbol, index in char_to_idx.items()}

# One tensor per encoded input sequence, right-padded to a common length with the <pad> index
input_tensors = [torch.tensor(seq) for seq in inputs]
padded_inputs = pad_sequence(
    input_tensors,
    batch_first=True,
    padding_value=char_to_idx['<pad>'],
)

# Each target list becomes one row; the rows are stacked along dim 0
target_tensors = torch.cat([torch.tensor([seq]) for seq in targets], dim=0)

dataset = TensorDataset(padded_inputs, target_tensors)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)


In [13]:
# Debugging helper (for testing purposes)
def tensor_to_string(tensor, idx_to_char):
    """Decode a tensor of vocabulary indices back into the string it represents.

    Args:
        tensor: Tensor whose elements are iterated one by one; each element is
            converted to a Python scalar with ``.item()``.
        idx_to_char (dict): Mapping from index to character.

    Returns:
        str: The characters looked up for each index, concatenated in order.
    """
    symbols = []
    for index in tensor:
        symbols.append(idx_to_char[index.item()])
    return ''.join(symbols)

In [14]:
# Function to convert character values into indices
def char_to_tensor(characters, vocab=None):
    """Encode an iterable of characters as a 1-D tensor of vocabulary indices.

    Args:
        characters: Iterable of characters (e.g. a string) to encode.
        vocab (dict, optional): Mapping from character to index; it must contain
            a '<pad>' entry. Defaults to the notebook-level ``char_to_idx``, so
            existing single-argument calls behave as before.

    Returns:
        torch.Tensor: ``torch.long`` tensor holding one index per character.
        Characters missing from the vocabulary are mapped to the '<pad>' index.
    """
    if vocab is None:
        # Fall back to the vocabulary built earlier in the notebook.
        vocab = char_to_idx
    pad_index = vocab['<pad>']  # looked up once instead of once per character
    indices = [vocab.get(char, pad_index) for char in characters]
    return torch.tensor(indices, dtype=torch.long)

# Build the training DataLoader from every (x, y) pair of every user
input_tensors, target_tensors = [], []
for user in data['users']:
    samples = data['user_data'][user]
    for entry, target in zip(samples['x'], samples['y']):
        input_tensors.append(char_to_tensor(entry))    # the full x sequence is the model input
        target_tensors.append(char_to_tensor(target))  # the matching y is used directly as target

# Pad the inputs to a common length, concatenate the targets and wrap both in a DataLoader
padded_inputs = pad_sequence(
    input_tensors,
    batch_first=True,
    padding_value=char_to_idx['<pad>'],
)
targets = torch.cat(target_tensors)
dataset = TensorDataset(padded_inputs, targets)

data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

In [15]:
# Number of samples in the training split
train_split_size = len(data_loader.dataset)
print(train_split_size)

253569


In [16]:
# Build the test DataLoader from every (x, y) pair of every user in the test JSON
input_tensors, target_tensors = [], []
for user in test_data['users']:
    samples = test_data['user_data'][user]
    for entry, target in zip(samples['x'], samples['y']):
        input_tensors.append(char_to_tensor(entry))    # the full x sequence is the model input
        target_tensors.append(char_to_tensor(target))  # the matching y is used directly as target

# Pad the inputs to a common length, concatenate the targets and wrap both in a DataLoader
padded_inputs = pad_sequence(
    input_tensors,
    batch_first=True,
    padding_value=char_to_idx['<pad>'],
)
targets = torch.cat(target_tensors)
dataset = TensorDataset(padded_inputs, targets)

test_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

In [17]:
# Number of samples in the test split
test_split_size = len(test_loader.dataset)
print(test_split_size)

42869


# Model initialization

In [18]:
# Loss function, plus a fresh global model sized to the vocabulary (the <pad> entry included)
criterion = nn.CrossEntropyLoss()
global_model = CharLSTM(vocab_size=len(char_to_idx))

# Utility functions

In [19]:
# Base directory for saved artefacts. The default is the original Colab/Drive
# location; set the FL_OUTPUT_DIR environment variable to redirect it when the
# notebook runs on another machine.
DIR = os.environ.get('FL_OUTPUT_DIR', '/content/drive/MyDrive/colab_plots/')

def evaluate(model, dataloader, criterion, DEVICE):
    """
    Utility function used both in the centralized and federated learning.
    Computes the accuracy and the loss on the validation/test set depending on
    the dataloader passed.

    Args:
        model (nn.Module): Model to evaluate. It must expose ``vocab_size`` and
            ``init_hidden(batch_size)`` and return ``(outputs, hidden)`` when called.
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable applied to the flattened outputs and targets.
        DEVICE: Device the batches and the hidden state are moved to.

    Returns:
        tuple: ``(accuracy, loss)`` where ``accuracy`` is the percentage of
        correct predictions and ``loss`` is the mean of the per-batch losses.

    Raises:
        ValueError: If the dataloader yields no samples, since neither metric
            is defined in that case.
    """
    model.eval()  # Set the model to evaluation mode
    running_corrects = 0
    total_samples = 0  # Total samples counter
    losses = []

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)

            # Fresh hidden state for every batch, placed on the same device as the data
            hidden = model.init_hidden(inputs.size(0))
            hidden = (hidden[0].to(DEVICE), hidden[1].to(DEVICE))
            outputs, _ = model(inputs, hidden)

            # reshape (rather than view) also accepts non-contiguous tensors
            outputs_flat = outputs.reshape(-1, model.vocab_size)
            targets_flat = targets.reshape(-1)

            loss = criterion(outputs_flat, targets_flat)
            losses.append(loss.item())

            # The predicted class is the index of the largest score
            _, preds = outputs_flat.max(1)
            running_corrects += (preds == targets_flat).sum().item()
            total_samples += targets_flat.size(0)

    if not losses or total_samples == 0:
        raise ValueError("evaluate() received an empty dataloader: no samples to evaluate.")

    accuracy = (running_corrects / total_samples) * 100
    return accuracy, sum(losses) / len(losses)


def test(global_model, test_loader, criterion, DEVICE):
    """
    Measure how the global model performs on the test dataset.

    Thin wrapper that forwards all arguments to ``evaluate``.

    Args:
        global_model (nn.Module): The global model to be evaluated.
        test_loader (DataLoader): DataLoader for the test dataset.
        criterion: Loss function forwarded to ``evaluate``.
        DEVICE: Device forwarded to ``evaluate``.

    Returns:
        float: The accuracy of the model on the test dataset.
        float: The loss of the model on the test dataset.
    """
    accuracy, avg_loss = evaluate(
        model=global_model,
        dataloader=test_loader,
        criterion=criterion,
        DEVICE=DEVICE,
    )
    return accuracy, avg_loss



def save_data(global_model, train_accuracies, train_losses,client_count, file_name):
    """
    Save the global model, train_accuracies, train_losses and client_count to a file.

    The file is written to the ``trained_models`` sub-directory of ``DIR``,
    which is created if it does not exist yet.

    Args:
        global_model (nn.Module): The global model to be saved (its state dict is stored).
        train_accuracies (list): List of training accuracies.
        train_losses (list): List of training losses.
        client_count: Client count data stored alongside the metrics.
        file_name (str): Name of the file to save the data.
    """
    # Build the path with os.path.join so the result is the same whether or not
    # DIR ends with a separator (and matches the path used by load_data).
    directory = os.path.join(DIR, 'trained_models')
    # Ensure the base directory exists
    os.makedirs(directory, exist_ok=True)

    # Complete path for the file
    file_path = os.path.join(directory, file_name)

    # Save all data (model state and metrics) into a dictionary
    save_dict = {
        'model_state': global_model.state_dict(),
        'train_accuracies': train_accuracies,
        'train_losses': train_losses,
        'client_count': client_count
    }

    # Save the dictionary to the specified file
    torch.save(save_dict, file_path)
    print(f"Data saved successfully to {file_path}")

def load_data(model, file_name):
    """
    Load the model weights and metrics from a file.

    The file is read from the ``trained_models`` sub-directory of ``DIR``.

    Args:
        model (nn.Module): The model to load the weights into.
        file_name (str): Name of the file to load the data from.

    Returns:
        tuple: A tuple containing the model, train_accuracies, train_losses and client_count.
    """
    # Build the path with os.path.join so the result is the same whether or not
    # DIR ends with a separator (and matches the path used by save_data).
    directory = os.path.join(DIR, 'trained_models')
    # Complete path for the file
    file_path = os.path.join(directory, file_name)

    # Load the saved data from the specified file.
    # map_location='cpu' lets a checkpoint saved on a GPU be opened on a
    # CPU-only machine; load_state_dict below copies the values into the
    # model's existing parameters.
    # NOTE(review): torch.load unpickles the file, so only open trusted checkpoints.
    save_dict = torch.load(file_path, map_location='cpu')

    # Load the model state
    model.load_state_dict(save_dict['model_state'])

    # Extract the metrics
    train_accuracies = save_dict['train_accuracies']
    train_losses = save_dict['train_losses']
    client_count = save_dict['client_count']

    print(f"Data loaded successfully from {file_path}")

    return model, train_accuracies, train_losses,client_count

# Training cycle and testing results

In [21]:
# These hyperparameters are taken from:
# Acar, Durmus Alp Emre, et al. "Federated learning based on dynamic regularization."
# arXiv preprint arXiv:2111.04263 (2021).
#
# Note that the LEAF version of the Shakespeare dataset doesn't come with a linked
# validation dataset to choose the most accurate hyperparameters.
lr = 1.0
wd = 0.0001

LOCAL_STEPS_VALUES = [4, 8, 16]  # Values for J (number of local steps)
NUM_RUNDS = {4: 600, 8: 300, 16: 150}  # rounds per J; J * rounds is 2400 in every entry
local_steps = 4
print(f"Running experiment: local_steps={local_steps}")

# Start from a freshly initialised global model owned by a new server
global_model = CharLSTM(vocab_size=len(char_to_idx))
server = Server(global_model, DEVICE, char_to_idx)

# tuning_rounds = int(NUM_RUNDS[local_steps]/20)
# best_lr, best_wd = to be manually set

federated_kwargs = dict(
    num_clients=NUM_CLIENTS,
    rounds=NUM_RUNDS[local_steps],
    lr=lr,
    momentum=MOMENTUM,
    batchsize=BATCH_SIZE,
    wd=wd,
    C=FRACTION_CLIENTS,
    local_steps=local_steps,
    log_freq=100,
    detailed_print=True,
    gamma=None,  # No skewed sampling for this experiment
)
global_model, train_accuracies, train_losses, client_selection_count = server.train_federated(
    criterion, data, **federated_kwargs
)

# Testing and plotting
test_accuracy, test_loss = test(global_model, test_loader, criterion, DEVICE)
# plot_metrics(train_accuracies, train_losses, f"Federated_scaled_LR_{lr}_WD_{wd}.png")
print(f"Test accuracy for local_steps={local_steps}: {test_accuracy}")

# Save data for future analysis
# save_data(global_model, train_accuracies, train_losses, client_selection_count, f"Federated_LR_{lr}_WD_{wd}_j_{local_steps}.pth")

Running experiment: local_steps=4


KeyboardInterrupt: 