# Import

In [9]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from collections import Counter
import warnings
import string
import itertools
from copy import deepcopy
import collections
from sklearn.model_selection import train_test_split
import json
import random
import re
import torch.optim as optim
from collections import defaultdict
import io
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import tkinter as tk
from tkinter import filedialog
from Model import CharLSTM

# Parameters

In [10]:
# Mini-batch size used when building the DataLoaders.
BATCH_SIZE = 4

# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Loss function handed to the training routine.
criterion = nn.CrossEntropyLoss()

# SGD hyper-parameters: learning rate, weight decay and momentum.
lr = 0.001
wd = 0.0001
momentum = 0.9

# Import the Dataset

We must import the dataset manually, since it is taken from the LEAF project.

So far, the procedure is to go to the `shakespeare` data folder of LEAF and run:
1. `./get_data.sh` inside the `preprocess` folder
2. `./data_to_json.sh`
3. `cd ..`
4. `./preprocess.sh -s niid --sf 0.2 -k 0 -t sample -tf 0.8` (adjust the options to your preferences)

## Upload the training and the testing dataset

Please upload the training dataset provided by LEAF here.

In [11]:
# Ask the user for the LEAF *training* JSON through a native file picker.
# The Tk root is left visible on purpose (withdraw() was disabled originally).
root = tk.Tk()
try:
    file_path = filedialog.askopenfilename(filetypes=[("JSON files", "*.json")])
finally:
    # Always tear the Tk root down, even if the dialog itself raises.
    root.destroy()

# A cancelled dialog returns an empty value. Fail loudly here instead of
# leaving `data` undefined (or stale from an earlier run) for later cells.
if not file_path:
    raise FileNotFoundError("No training dataset selected: please choose the LEAF training JSON file.")

with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

Please upload the test dataset provided by LEAF.

In [12]:
# Ask the user for the LEAF *test* JSON through a native file picker.
# The Tk root is left visible on purpose (withdraw() was disabled originally).
root = tk.Tk()
try:
    file_path = filedialog.askopenfilename(filetypes=[("JSON files", "*.json")])
finally:
    # Always tear the Tk root down, even if the dialog itself raises.
    root.destroy()

# A cancelled dialog returns an empty value. Fail loudly here instead of
# leaving `test_data` undefined (or stale from an earlier run) for later cells.
if not file_path:
    raise FileNotFoundError("No test dataset selected: please choose the LEAF test JSON file.")

with open(file_path, 'r', encoding='utf-8') as file:
    test_data = json.load(file)

## Statistics of the dataset
Just for testing purposes, we can print some statistics about the uploaded dataset.

The values used for the train/test split and the number of its samples are inspired by:
Acar, Durmus Alp Emre, et al. "Federated learning based on dynamic regularization." arXiv preprint arXiv:2111.04263 (2021).

In [13]:
# Add up the per-user sample counts of the training split and report the total.
train_counts = data['num_samples']
total_samples = sum(train_counts)
print(f"Total number of train samples: {total_samples}")

# Same for the test split (`total_samples` is deliberately reused).
test_counts = test_data['num_samples']
total_samples = sum(test_counts)
print(f"Total number of test samples: {total_samples}")

Total number of train samples: 253569
Total number of test samples: 50769


In [14]:
# Pull the three top-level fields of the training JSON into module-level names.
users, num_samples, user_data = (
    data[field] for field in ('users', 'num_samples', 'user_data')
)

In [15]:
# The client count is simply the number of entries in the `users` list.
number_of_clients = len(users)
print(f"Number of clients: {number_of_clients}")

Number of clients: 100


## Vocab creation

In [16]:
# Concatenate every input sequence of every user into one long string.
all_texts = ''.join(
    ''.join(seq)
    for user in users
    for seq in user_data[user]['x']
)

# Sorting the distinct characters makes the character -> index mapping
# deterministic across runs.
chars = sorted(set(all_texts))
char_to_idx = dict(zip(chars, range(len(chars))))

# Add the padding character as the last index of the vocabulary.
char_to_idx['<pad>'] = len(char_to_idx)

# Reverse lookup: index -> character (including '<pad>').
idx_to_char = {index: symbol for symbol, index in char_to_idx.items()}

## Convert data into indices

In [17]:
# Encode only the FIRST sample (index 0) of each user as lists of vocabulary
# indices. A character missing from the vocabulary raises KeyError here.
_encode = char_to_idx.__getitem__
inputs = [list(map(_encode, user_data[user]['x'][0])) for user in users]
targets = [list(map(_encode, user_data[user]['y'][0])) for user in users]

## Creation of TensorDataset and DataLoader

In [18]:
# One 1-D tensor of indices per encoded input sequence.
input_tensors = [torch.tensor(seq) for seq in inputs]
# Wrap each target list once more so every target becomes a single row;
# the rows are stacked along dim 0 below.
target_tensors = [torch.tensor([seq]) for seq in targets]

# The vocabulary (`chars`, `char_to_idx`, `idx_to_char`) is already built in
# the "Vocab creation" cell, so it is not rebuilt here.

# Right-pad the inputs to a common length with the '<pad>' index.
padded_inputs = pad_sequence(input_tensors, batch_first=True, padding_value=char_to_idx['<pad>'])

target_tensors = torch.cat(target_tensors, dim=0)

dataset = TensorDataset(padded_inputs, target_tensors)
# Reuse the notebook-wide batch size instead of a second hard-coded literal.
batch_size = BATCH_SIZE
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [19]:
def tensor_to_string(tensor, idx_to_char):
    """Convert a tensor of indices into the corresponding string of characters.

    Args:
        tensor: Iterable tensor whose elements each hold a single index.
        idx_to_char: Mapping from integer index to its character/token.

    Returns:
        str: The looked-up characters joined in iteration order.
    """
    decoded = []
    for index in tensor:
        decoded.append(idx_to_char[index.item()])
    return ''.join(decoded)

In [20]:
# Function to convert characters into vocabulary indices.
def char_to_tensor(characters, vocab=None):
    """Encode an iterable of characters as a 1-D ``torch.long`` tensor.

    Args:
        characters: String (or iterable of single characters) to encode.
        vocab: Optional character -> index mapping that contains a '<pad>'
            entry. Defaults to the notebook-level ``char_to_idx``.

    Returns:
        torch.Tensor: Indices of dtype ``torch.long``, one per character.
        Characters missing from the vocabulary are mapped to the '<pad>' index.

    Raises:
        KeyError: If the vocabulary has no '<pad>' entry.
    """
    if vocab is None:
        vocab = char_to_idx
    # Look the fallback index up once instead of once per character.
    pad_idx = vocab['<pad>']
    indices = [vocab.get(char, pad_idx) for char in characters]
    return torch.tensor(indices, dtype=torch.long)

# Prepare the training samples.
#
# The LEAF dataset is structured in the following way:
# Users: each dataset in LEAF is distributed across a simulated set of users
# (playing actors). The data for each user is stored separately to mimic
# real-world scenarios where data is distributed across devices or clients.
# Data format: for each user, the data include:
#     x: sentences declared by the "user"/"device".
#     y: labels or outputs associated with the inputs.
input_tensors = []
target_tensors = []
for user in data['users']:
    user_samples = data['user_data'][user]
    for entry, target in zip(user_samples['x'], user_samples['y']):
        input_tensors.append(char_to_tensor(entry))  # Use the full sequence of x
        target_tensors.append(char_to_tensor(target))  # Directly use the corresponding y as target

# Padding and creation of the DataLoader
padded_inputs = pad_sequence(input_tensors, batch_first=True, padding_value=char_to_idx['<pad>'])
# Targets are concatenated into one flat tensor; TensorDataset requires it to
# have as many entries as there are input rows.
targets = torch.cat(target_tensors)
dataset = TensorDataset(padded_inputs, targets)

# NOTE(review): shuffle=False feeds the samples user by user in file order;
# consider shuffle=True for SGD training. Left unchanged here.
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

# Training

In [None]:
# `torch` and `torch.nn.functional` are already imported in the first cell;
# only the scheduler class is new here.
from torch.optim.lr_scheduler import CosineAnnealingLR

# Length (in epochs) of one cosine annealing cycle. Keep it equal to the
# `num_epochs` passed to `train_model` so the schedule spans the whole run.
NUM_EPOCHS = 200

# Define the Model to use
model = CharLSTM(vocab_size=len(char_to_idx))
model.train()  # Set the model in training mode
model = model.to(DEVICE)  # Move the entire model to the right device

# Plain SGD with momentum and weight decay, using the notebook-level hyper-parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=wd)
scheduler = CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS)  # T_max is the number of epochs

# Training function
def train_model(model, dataloader, criterion, optimizer, scheduler, num_epochs=10, device=None):
    """Train ``model`` for ``num_epochs`` passes over ``dataloader``.

    Args:
        model: Network exposing ``init_hidden(batch_size)`` (returning a pair of
            tensors) and a forward call ``model(inputs, hidden)`` that returns
            ``(outputs, new_hidden)``.
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss called as ``criterion(outputs_flat, targets_flat)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per epoch.
        num_epochs: Number of passes over ``dataloader``.
        device: Device the batches and hidden state are moved to. Defaults to
            the notebook-level ``DEVICE``.

    Returns:
        tuple[list[float], list[float]]: Per-epoch average batch loss and
        per-epoch accuracy (fraction of correct predictions, in [0, 1]).
    """
    if device is None:
        device = DEVICE

    train_accuracies = []
    train_losses = []
    for epoch in range(num_epochs):
        # Re-assert training mode each epoch in case an evaluation ran in between.
        model.train()

        total_loss = 0.0
        total_correct = 0
        total_samples = 0
        num_batches = 0

        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Reset the existing gradients (if any)
            optimizer.zero_grad()

            # Fresh hidden state for every batch, sized to the actual batch
            # (the last batch may be smaller).
            hidden = model.init_hidden(inputs.size(0))
            hidden = (hidden[0].to(device), hidden[1].to(device))

            # Forward pass
            outputs, _ = model(inputs, hidden)

            # Flatten to (N, num_classes) / (N,). The class dimension is taken
            # from the model output itself — assumes it is the last dimension.
            outputs_flat = outputs.view(-1, outputs.size(-1))
            targets_flat = targets.view(-1)
            loss = criterion(outputs_flat, targets_flat)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_samples += targets_flat.size(0)
            num_batches += 1

            # Calculate accuracy
            _, predicted = outputs_flat.max(1)
            total_correct += (predicted == targets_flat).sum().item()

        # Adjust learning rate based on the scheduler
        scheduler.step()

        # Epoch statistics; max(..., 1) avoids a ZeroDivisionError on an empty loader.
        avg_loss = total_loss / max(num_batches, 1)
        accuracy = total_correct / max(total_samples, 1)
        train_losses.append(avg_loss)
        train_accuracies.append(accuracy)

        # TODO: save a checkpoint here (not implemented).

        # Print statistics (accuracy shown as a percentage).
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {100 * accuracy:.2f}%')

    return train_losses, train_accuracies

# Execute the training. The return value is bound to a name so that it is
# neither discarded nor echoed as the cell's output.
training_history = train_model(model, loader, criterion, optimizer, scheduler, num_epochs=200)


# Testing

In [None]:
# Encode every (x, y) pair of every test user with the training vocabulary.
input_tensors = []
target_tensors = []
for user in test_data['users']:
    user_samples = test_data['user_data'][user]
    for entry, target in zip(user_samples['x'], user_samples['y']):
        input_tensors.append(char_to_tensor(entry))  # Use the full sequence of x
        target_tensors.append(char_to_tensor(target))  # Directly use the corresponding y as target

# Right-pad the inputs to a common length with the '<pad>' index.
padded_inputs = pad_sequence(input_tensors, batch_first=True, padding_value=char_to_idx['<pad>'])
# Targets are concatenated into one flat tensor; TensorDataset requires it to
# have as many entries as there are input rows.
targets = torch.cat(target_tensors)
test_dataset = TensorDataset(padded_inputs, targets)

test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


# Test the model
# Switch to evaluation mode so training-only behaviour (e.g. dropout, if
# CharLSTM uses any) is disabled while measuring accuracy.
model.eval()

correct = 0
total = 0
# No gradients are needed for evaluation; skipping autograd saves memory and time.
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs = inputs.to(DEVICE)  # move input to the correct device
        targets = targets.to(DEVICE)  # move target to the correct device

        # Initialize the hidden state for this batch
        hidden = model.init_hidden(inputs.size(0))
        hidden = (hidden[0].to(DEVICE), hidden[1].to(DEVICE))  # Move the hidden state to the correct device

        outputs, _ = model(inputs, hidden)
        outputs_flat = outputs.view(-1, len(char_to_idx))
        targets_flat = targets.view(-1)
        _, predicted = outputs_flat.max(1)
        # Count the same flattened targets that the predictions are compared against.
        total += targets_flat.size(0)
        correct += (predicted == targets_flat).sum().item()

print(f'Test Accuracy: {100 * correct / total}%')