# Import

In [1]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from collections import Counter
import warnings
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import itertools
from copy import deepcopy
import collections
from sklearn.model_selection import train_test_split
import json
import random
import re
import torch.optim as optim
from collections import defaultdict
import kagglehub
import io
import json
from torch.utils.data import DataLoader, TensorDataset

# Import the Dataset

We must import the dataset manually since it is taken by the LEAF project.

So far the project is to go on the data folder of shakespeare and:
1. ./get_data.sh
2. ./data_to_json.sh
3. ././preprocess.sh -s niid --sf 0.2 -k 0 -t sample -tf 0.8 [depending on the preferencies]

## Upload the training dataset

In [2]:
from google.colab import files
uploaded = files.upload()

Saving all_data_niid_06_train_8.json to all_data_niid_06_train_8.json


## Upload the testing dataset

In [3]:
from google.colab import files
uploaded2 = files.upload()

Saving all_data_niid_06_test_8.json to all_data_niid_06_test_8.json


In [4]:
# Testing print to verify the name of the uploaded files for testing (in general no need to
# use this print since we already have the name of the uploaded file).
print(list(uploaded2.keys()))

['all_data_niid_06_test_8.json']


In [5]:
data = json.load(io.BytesIO(uploaded['all_data_niid_06_train_8.json']))

In [6]:
test_data  = json.load(io.BytesIO(uploaded2['all_data_niid_06_test_8.json']))

## Upload the json file (the leaf dataset for Shakespeare)

In [7]:
with open('all_data_niid_06_train_8.json', 'r') as file:
    data = json.load(file)

In [8]:
with open('all_data_niid_06_test_8.json', 'r') as f:  # Cambia il percorso con quello corretto
    test_data = json.load(f)

## Statistics of the dataset
Just for testing porpouses we can print some statistics about the uploaded dataset.

The values used for the train/test split and the number of his samples are inspired by:
Acar, Durmus Alp Emre, et al. "Federated learning based on dynamic regularization." arXiv preprint arXiv:2111.04263 (2021).

In [9]:
total_samples = sum(data['num_samples'])
print(f"Total number of train samples: {total_samples}")

Total number of train samples: 198981


In [10]:
total_samples = sum(test_data['num_samples'])
print(f"Total number of test samples: {total_samples}")

Total number of test samples: 49773


In [11]:
users = data['users']
num_samples = data['num_samples']
user_data = data['user_data']

# Parameters

In [12]:
'''
The batch size has been inspired by:
Sashank Reddi, Zachary Charles, Manzil Zaheer, Zachary Garrett, Keith Rush,
Jakub Konečný, Sanjiv Kumar, H. Brendan McMahan; Adaptive Federated Optimization, 2021.
'''
BATCH_SIZE = 4

## Vocab creation

In [14]:
all_texts = ''.join([''.join(seq) for user in users for seq in user_data[user]['x']])
chars = sorted(set(all_texts))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}

# Add the padding character
char_to_idx['<pad>'] = len(char_to_idx)
idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}

## Covert data into indices

In [18]:
inputs = [[char_to_idx[char] for char in user_data[user]['x'][0]] for user in users]
targets = [[char_to_idx[char] for char in user_data[user]['y'][0]] for user in users]

## Creation of TensorDataset and DataLoader

In [19]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset


input_tensors = [torch.tensor(seq) for seq in inputs]
target_tensors = [torch.tensor([seq]) for seq in targets]

chars = sorted(set(all_texts))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
char_to_idx['<pad>'] = len(char_to_idx)
idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}

padded_inputs = pad_sequence(input_tensors, batch_first=True, padding_value=char_to_idx['<pad>'])

target_tensors = torch.cat(target_tensors, dim=0)

dataset = TensorDataset(padded_inputs, target_tensors)
batch_size = 4
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [20]:
def tensor_to_string(tensor, idx_to_char):
    """Converte un tensore di indici in una stringa di caratteri."""
    return ''.join(idx_to_char[idx.item()] for idx in tensor)

# Model

In [21]:
import torch.nn.functional as F

class CharLSTM(nn.Module):
    """
    Character-level LSTM model for text processing tasks.
    Includes embedding, LSTM, and a fully connected output layer.
    We use:
    - embedding size equal to 8;
    - 2 LSTM layers, each with 256 nodes;
    - densely connected softmax output layer.

    We can avoid to use explicitly the softmax function in the model and
    keep a cross entropy loss function as a loss function.

    as mentioned in paper [2] (Sashank Reddi, Zachary Charles, Manzil Zaheer, Zachary Garrett, Keith Rush,
    Jakub Konečný, Sanjiv Kumar, H. Brendan McMahan; Adaptive Federated Optimization, 2021)
    """
    def __init__(self, vocab_size = 70, embedding_size = 8, lstm_hidden_dim = 256, seq_length=80):
        super(CharLSTM, self).__init__()
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.lstm_hidden_dim = lstm_hidden_dim
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.lstm1 = nn.LSTM(input_size=embedding_size, hidden_size=lstm_hidden_dim, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=lstm_hidden_dim, hidden_size=lstm_hidden_dim, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """
        Forward pass through the model.
        """
        # Layer 1: Embedding
        x = self.embedding(x)  # Output shape: (batch_size, seq_length, embedding_dim)

        # Layer 2: First LSTM
        x, _ = self.lstm1(x)  # Output shape: (batch_size, seq_length, lstm_hidden_dim)

        # Layer 3: Second LSTM
        x, hidden = self.lstm2(x)  # Output shape: (batch_size, seq_length, lstm_hidden_dim)

        # Layer 4: Fully Connected Layer
        x = self.fc(x)  # Output shape: (batch_size, seq_length, vocab_size)

        # Softmax Activation
        #x = self.softmax(x)  # Output shape: (batch_size, seq_length, vocab_size)
        return x[:, -1, :], hidden

    def init_hidden(self, batch_size):
        """Initializes hidden and cell states for the LSTM."""
        return (torch.zeros(2, batch_size, self.lstm_hidden_dim),
            torch.zeros(2, batch_size, self.lstm_hidden_dim))
        #2 is equal to the number of lstm layers!



# Training

In [22]:
# Function to convert character in indices:
def char_to_tensor(characters):
    indices = [char_to_idx[char] for char in characters]
    return torch.tensor(indices, dtype=torch.long)

# Prepare the test samples:
'''
The leaf dataset is structured in the following way:
Users: Each dataset in LEAF is distributed across a simulated set of users (playing actor). The data for
each user is stored separately to mimic real-world scenarios where data is distributed
across devices or clients.
Data Format: For each user, the data include:
    x: sentences declared by the "user"/"device".
    y: Labels or outputs associated with the inputs.
'''
input_tensors = []
target_tensors = []
for user in data['users']:
    for entry, target in zip(data['user_data'][user]['x'], data['user_data'][user]['y']):
        input_tensors.append(char_to_tensor(entry))  # Use the full sequence of x
        target_tensors.append(char_to_tensor(target))  # Directly use the corresponding y as target

# Padding and creation ofDataLoader
padded_inputs = pad_sequence(input_tensors, batch_first=True, padding_value=char_to_idx['<pad>'])
targets = torch.cat(target_tensors)
dataset = TensorDataset(padded_inputs, targets)
for elem1, elem2 in dataset:
  elem2 = elem2.unsqueeze(0)

loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

# Checkpointing management

In [28]:
import os
import torch
import json


from google.colab import drive
drive.mount('/content/drive')
CHECKPOINT_DIR = '/content/drive/MyDrive/colab_checkpoints/'


# Ensure the checkpoint directory exists, creating it if necessary
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

def save_checkpoint(model, optimizer, epoch, hyperparameters, subfolder="", checkpoint_data=None):
    """
    Saves the model checkpoint and removes the previous one if it exists.

    Arguments:
    model -- The model whose state is to be saved.
    optimizer -- The optimizer whose state is to be saved (can be None).
    epoch -- The current epoch of the training process.
    hyperparameters -- A string representing the model's hyperparameters for file naming.
    subfolder -- Optional subfolder within the checkpoint directory to save the checkpoint.
    checkpoint_data -- Data to save in a JSON file (e.g., training logs).
    """
    # Define the path for the subfolder where checkpoints will be stored
    subfolder_path = os.path.join(CHECKPOINT_DIR, subfolder)
    # Create the subfolder if it doesn't exist
    os.makedirs(subfolder_path, exist_ok=True)

    # Construct filenames for both the model checkpoint and the associated JSON file
    filename = f"model_epoch_{epoch}_params_{hyperparameters}.pth"
    filepath = os.path.join(subfolder_path, filename)
    filename_json = f"model_epoch_{epoch}_params_{hyperparameters}.json"
    filepath_json = os.path.join(subfolder_path, filename_json)

    # Define the filenames for the previous checkpoint files, to remove them if necessary
    previous_filepath = os.path.join(subfolder_path, f"model_epoch_{epoch - 1}_params_{hyperparameters}.pth")
    previous_filepath_json = os.path.join(subfolder_path, f"model_epoch_{epoch - 1}_params_{hyperparameters}.json")

    # Remove the previous checkpoint if it exists, but only for epochs greater than 1
    # if epoch > 1 and os.path.exists(previous_filepath):
    #     os.remove(previous_filepath)
    #     os.remove(previous_filepath_json)

      # Remove the previous checkpoint if it exists, but only for epochs greater than 1
    if epoch >= 1:
        if os.path.exists(previous_filepath):
            os.remove(previous_filepath)
        if os.path.exists(previous_filepath_json):
            os.remove(previous_filepath_json)


    # Prepare the checkpoint data dictionary
    checkpoint = {'model_state_dict': model.state_dict(), 'epoch': epoch}
    # If an optimizer is provided, save its state as well
    if optimizer is not None:
        checkpoint['optimizer_state_dict'] = optimizer.state_dict()

    # Save the model and optimizer (if provided) state dictionary to the checkpoint file
    torch.save(checkpoint, filepath)
    print(f"Checkpoint saved: {filepath}")

    # If additional data (e.g., training logs) is provided, save it to a JSON file
    if checkpoint_data:
        with open(filepath_json, 'w') as json_file:
            json.dump(checkpoint_data, json_file, indent=4)

def load_checkpoint(model, optimizer, hyperparameters, subfolder=""):
    """
    Loads the latest checkpoint available based on the specified hyperparameters.

    Arguments:
    model -- The model whose state will be updated from the checkpoint.
    optimizer -- The optimizer whose state will be updated from the checkpoint (can be None).
    hyperparameters -- A string representing the model's hyperparameters for file naming.
    subfolder -- Optional subfolder within the checkpoint directory to look for checkpoints.

    Returns:
    The next epoch to resume from and the associated JSON data if available.
    """
    # Define the path to the subfolder where checkpoints are stored
    subfolder_path = os.path.join(CHECKPOINT_DIR, subfolder)

    # If the subfolder doesn't exist, print a message and start from epoch 1
    if not os.path.exists(subfolder_path):
        print("No checkpoint found, starting from epoch 1.")
        return 1, None  # Epoch starts from 1

    # Search for checkpoint files in the subfolder that match the hyperparameters
    files = [f for f in os.listdir(subfolder_path) if f"params_{hyperparameters}" in f and f.endswith('.pth')]

    # If checkpoint files are found, load the one with the highest epoch number
    if files:
        latest_file = max(files, key=lambda x: int(x.split('_')[2]))  # Find the latest epoch file
        filepath = os.path.join(subfolder_path, latest_file)
        checkpoint = torch.load(filepath, weights_only=True)

        # Load the model state from the checkpoint
        model.load_state_dict(checkpoint['model_state_dict'])
        # If an optimizer is provided, load its state as well
        if optimizer:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        # Try to load the associated JSON file if available
        json_filepath = os.path.join(subfolder_path, latest_file.replace('.pth', '.json'))
        json_data = None
        if os.path.exists(json_filepath):
            # If the JSON file exists, load its contents
            with open(json_filepath, 'r') as json_file:
                json_data = json.load(json_file)
            print("Data loaded!")
        else:
            # If no JSON file exists, print a message
            print("No data found")

        # Print the epoch from which the model is resuming
        print(f"Checkpoint found: Resuming from epoch {checkpoint['epoch'] + 1}\n\n")
        return checkpoint['epoch'] + 1, json_data

    # If no checkpoint is found, print a message and start from epoch 1
    print("No checkpoint found, starting from epoch 1..\n\n")
    return 1, None  # Epoch starts from 1

def delete_existing_checkpoints(subfolder=""):
    """
    Deletes all existing checkpoints in the specified subfolder.

    Arguments:
    subfolder -- Optional subfolder within the checkpoint directory to delete checkpoints from.
    """
    subfolder_path = os.path.join(CHECKPOINT_DIR, subfolder)
    if os.path.exists(subfolder_path):
        for file_name in os.listdir(subfolder_path):
            file_path = os.path.join(subfolder_path, file_name)
            if os.path.isfile(file_path):
                os.remove(file_path)
        print(f"All existing checkpoints in {subfolder_path} have been deleted.")
    else:
        print(f"No checkpoint folder found at {subfolder_path}.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the Model to use
model = CharLSTM(vocab_size=len(char_to_idx))
model.train()  # set the model in training mode

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(DEVICE)  # Move the entire model to the right device

# Definition (manual) of loss and optimizer (with related hyperparameters)
criterion = nn.CrossEntropyLoss()
lr = 0.001
optimizer = optim.Adam(model.parameters(), lr=lr)

# Training function
def train_model(model, dataloader, criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        total_loss = 0
        total_correct = 0
        total_samples = 0

        for inputs, targets in dataloader:
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)

            # Reset the existing gradients (if any)
            optimizer.zero_grad()

            # ininzialize the hidden state:
            hidden = model.init_hidden(inputs.size(0))
            hidden = (hidden[0].to(DEVICE), hidden[1].to(DEVICE))

            # Forward pass
            outputs, _ = model(inputs, hidden)

            # loss
            outputs_flat = outputs.view(-1, len(char_to_idx))
            targets_flat = targets.view(-1)
            loss = criterion(outputs_flat, targets_flat)

            # Backward pass
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_samples += targets_flat.size(0)

            # Accuracy
            _, predicted = outputs_flat.max(1)
            total_correct += (predicted == targets_flat).sum().item()

        # loss and accuracy for epoch
        avg_loss = total_loss / len(dataloader)
        accuracy = total_correct / total_samples

        # Save the checkpoint:
        save_checkpoint(model,optimizer=None, epoch=epoch, hyperparameters=f"LR{lr}", subfolder="Federated/")

        # print statistics
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, %')

# Execute the model:
train_model(model, loader, criterion, optimizer, num_epochs=200)


Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/Federated/model_epoch_0_params_LR0.001.pth
Epoch 1/200, Loss: 1.9230, Accuracy: 0.4476
Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/Federated/model_epoch_1_params_LR0.001.pth
Epoch 2/200, Loss: 1.6513, Accuracy: 0.5136
Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/Federated/model_epoch_2_params_LR0.001.pth
Epoch 3/200, Loss: 1.5715, Accuracy: 0.5316
Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/Federated/model_epoch_3_params_LR0.001.pth
Epoch 4/200, Loss: 1.5226, Accuracy: 0.5437
Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/Federated/model_epoch_4_params_LR0.001.pth
Epoch 5/200, Loss: 1.4987, Accuracy: 0.5488
Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/Federated/model_epoch_5_params_LR0.001.pth
Epoch 6/200, Loss: 1.4810, Accuracy: 0.5529
Checkpoint saved: /content/drive/MyDrive/colab_checkpoints/Federated/model_epoch_6_params_LR0.001.pth
Epoch 7/200, Loss: 1.4

# Testing

In [None]:
def char_to_tensor(characters):
    indices = [char_to_idx[char] for char in characters]
    return torch.tensor(indices, dtype=torch.long)

input_tensors = []
target_tensors = []
for user in test_data['users']:
    for entry in test_data['user_data'][user]['x']:
        input_tensors.append(char_to_tensor(entry[:-1]))  # Tutti i caratteri tranne l'ultimo
        target_tensors.append(char_to_tensor(entry[-1]))  # L'ultimo carattere come target

# Padding e creazione di DataLoader
padded_inputs = pad_sequence(input_tensors, batch_first=True, padding_value=char_to_idx[' '])  # Usa spazio per padding se non hai '<pad>'
targets = torch.cat(target_tensors)
test_dataset = TensorDataset(padded_inputs, targets)
for elem1, elem2 in test_dataset:
  elem2 = elem2.unsqueeze(0)

test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


# Testa il modello
correct = 0
total = 0
for inputs, targets in test_loader:
    inputs = inputs.to(DEVICE)  # Sposta gli input sul dispositivo corretto
    targets = targets.to(DEVICE)  # Sposta i target sul dispositivo corretto

    # Inizializza lo stato nascosto
    hidden = model.init_hidden(inputs.size(0))
    hidden = (hidden[0].to(DEVICE), hidden[1].to(DEVICE))  # Sposta lo stato nascosto sul dispositivo corretto

    outputs, _ = model(inputs, hidden)
    outputs_flat = outputs.view(-1, len(char_to_idx))
    targets_flat = targets.view(-1)
    _, predicted = outputs_flat.max(1)
    total += targets.size(0)
    correct += (predicted == targets_flat).sum().item()

print(f'Test Accuracy: {100 * correct / total}%')