In [2]:
!pip install scikit-optimize

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
from hyperband_petro import hyperband
from skopt.space import Real, Integer

In [4]:
'''
Most of the cells below are just taken from the sent_140 tutorial, for data loading
'''

# Don't need to run this after the directory has been created once

'''

%%capture preprocess_output

# Download and preprocess the data
% cd ..
!git clone https://github.com/TalwalkarLab/leaf.git
%cd leaf/data/sent140
# !./preprocess.sh --sf 0.01 -s niid -t 'user' --tf 0.90 -k 1 --spltseed 1

# change preprocess option (-t) so each user's data gets split into train/test
!./preprocess.sh --sf 0.01 -s niid -t 'sample' --tf 0.90 -k 2 --spltseed 1

'''

%cd ../leaf/data/sent140

/workspace/leaf/data/sent140


In [5]:
import os

USE_CUDA = True
LOCAL_BATCH_SIZE = 32
MAX_SEQ_LEN = 25

# suppress large outputs
VERBOSE = False


def _first_data_file(directory):
    """Return ``"<directory>/<name>"`` for the first visible entry of ``directory``.

    Entries are sorted so the choice is deterministic, and dot-files are skipped
    (as a plain ``ls`` would). Raises ``FileNotFoundError`` when the directory is
    missing or holds no visible entries, instead of producing a bogus path.
    """
    names = sorted(name for name in os.listdir(directory) if not name.startswith("."))
    if not names:
        raise FileNotFoundError(
            f"No data files found in {directory!r}; has the preprocessing step been run?"
        )
    return f"{directory}/{names[0]}"


# Paths are relative to the current working directory.
TRAIN_DATA = _first_data_file("data/train")
TEST_DATA = _first_data_file("data/test")

TRAIN_DATA, TEST_DATA

('data/train/all_data_niid_01_keep_2_train_9.json',
 'data/test/all_data_niid_01_keep_2_test_9.json')

In [6]:
import json
import numpy as np

# Load the training data. The encoding is pinned to UTF-8 (the JSON interchange
# encoding) so that decoding the tweets does not depend on the platform locale.
with open(TRAIN_DATA, "r", encoding="utf-8") as train_file:
    training_data = json.load(train_file)

# how samples are distributed across users
n_samples = training_data['num_samples']
print(f"""\nNumber of samples per user:
  min={np.min(n_samples)}, 
  max={np.max(n_samples)}, 
  median={np.median(n_samples)}, 
  mean={np.mean(n_samples):.2f}, 
  std={np.std(n_samples):.2f}
  """)


Number of samples per user:
  min=1, 
  max=123, 
  median=2.0, 
  mean=3.64, 
  std=6.37
  


In [7]:
# Pick the first id in the "users" list as a sample user.
EXAMPLE_USER = training_data["users"][0]
# Last expression of the cell, so the notebook displays that user's raw
# {'x': [...], 'y': [...]} record.
training_data["user_data"][EXAMPLE_USER]

{'x': [['2003370575',
   'Tue Jun 02 06:24:38 PDT 2009',
   'NO_QUERY',
   'bricaligirl',
   'Doin my hair for school... Sooo tired ',
   'training']],
 'y': [0]}

In [8]:
import itertools
import re
import string
import unicodedata

import torch
from torch.utils.data import Dataset


# 1. The Sent140Dataset will store the tweets and corresponding sentiment for each user.

class Sent140Dataset(Dataset):
    """Per-user view of a LEAF Sent140 JSON split.

    The JSON file is expected to have a ``"user_data"`` mapping of
    ``user_id -> {"x": [...], "y": [...]}``. For every user the tweets are
    encoded as fixed-length rows of character indices and the labels as ints.

    Items are addressed by user id (a string), not by integer position:
    ``dataset[user_id]`` returns ``(LongTensor of encoded tweets, list of labels)``.

    Attributes:
        all_letters: maps each character of ``string.printable`` to its index.
        num_letters: size of that vocabulary.
        UNK: index used for characters outside the vocabulary. The same index
            is also used for padding, so the two cannot be told apart downstream.
        dataset: the raw parsed JSON.
        data / targets: ``user_id ->`` encoded tweets / labels.
        num_classes: 2 (binary sentiment classification).
    """

    def __init__(self, data_root, max_seq_len):
        """Load the JSON file at ``data_root`` and encode every user's samples.

        Args:
            data_root: path of the JSON split to read.
            max_seq_len: number of character indices per encoded tweet.
        """
        self.data_root = data_root
        self.max_seq_len = max_seq_len
        self.all_letters = {c: i for i, c in enumerate(string.printable)}
        self.num_letters = len(self.all_letters)
        self.UNK = self.num_letters

        # Read-only access is all that is needed (the file is never written),
        # and the encoding is pinned so decoding does not depend on the locale.
        with open(data_root, "r", encoding="utf-8") as f:
            self.dataset = json.load(f)

        self.data = {}
        self.targets = {}
        self.num_classes = 2  # binary sentiment classification

        # Populate self.data and self.targets
        for user_id, user_data in self.dataset["user_data"].items():
            self.data[user_id] = self.process_x(list(user_data["x"]))
            self.targets[user_id] = self.process_y(list(user_data["y"]))

    def __len__(self):
        """Number of users (not number of tweets)."""
        return len(self.data)

    def __iter__(self):
        """Yield ``(encoded tweets, labels)`` for every user, in insertion order."""
        for user_id in self.data.keys():
            yield self.__getitem__(user_id)

    def __getitem__(self, user_id: str):
        """Return ``(encoded tweets, labels)`` for ``user_id``.

        Raises:
            IndexError: if the user is unknown.
        """
        if user_id not in self.data or user_id not in self.targets:
            raise IndexError(f"User {user_id} is not in dataset")
        return self.data[user_id], self.targets[user_id]

    def unicodeToAscii(self, s):
        """Drop combining marks (after NFD normalisation) and any character
        that is not in ``all_letters``. Not called by the other methods of
        this class.
        """
        return "".join(
            c for c in unicodedata.normalize("NFD", s)
            if unicodedata.category(c) != "Mn" and c in self.all_letters
        )

    def line_to_indices(self, line: str, max_seq_len: int):
        """Encode ``line`` as exactly ``max_seq_len`` character indices.

        The line is tokenised with ``split_line`` (which discards whitespace
        and unmatched symbols), the tokens' characters are concatenated,
        truncated to ``max_seq_len`` and right-padded with ``UNK``.
        """
        # Iterating a str yields its characters, so the tokens flatten directly.
        chars = self.flatten_list(self.split_line(line))
        # max(..., 0) keeps a non-positive max_seq_len meaning "no characters".
        kept = chars[: max(max_seq_len, 0)]
        indices = [self.all_letters.get(letter, self.UNK) for letter in kept]
        # Add padding
        indices += [self.UNK] * (max_seq_len - len(indices))
        return indices

    def process_x(self, raw_x_batch):
        """Encode a user's raw samples into a ``LongTensor`` of index rows."""
        tweets = [e[4] for e in raw_x_batch]  # e[4] contains the actual tweet
        encoded = [self.line_to_indices(tweet, self.max_seq_len) for tweet in tweets]
        return torch.LongTensor(encoded)

    def process_y(self, raw_y_batch):
        """Convert a user's raw labels to a list of ints."""
        return [int(e) for e in raw_y_batch]

    def split_line(self, line):
        """
        Split given line/phrase (str) into list of words (List[str])
        """
        return re.findall(r"[\w']+|[.,!?;]", line)

    def flatten_list(self, nested_list):
        """Concatenate the elements of every iterable in ``nested_list`` into one list."""
        return list(itertools.chain.from_iterable(nested_list))

In [9]:
# 2. Build the train and test datasets (train first, then test), both encoded
#    with the same maximum sequence length.
train_dataset, test_dataset = (
    Sent140Dataset(data_root=split_path, max_seq_len=MAX_SEQ_LEN)
    for split_path in (TRAIN_DATA, TEST_DATA)
)

In [10]:
from flsim.utils.example_utils import LEAFDataLoader, LEAFDataProvider

# 3. Batchify training, eval, and test data. Note that train_dataset is already sharded.
# NOTE(review): test_dataset is passed twice, presumably as both the eval and
# the test split (argument order assumed from the comment above; confirm against
# LEAFDataLoader). If so, any model selection done on "eval" is done on test data.
dataloader = LEAFDataLoader(
    train_dataset,
    test_dataset,
    test_dataset,
    batch_size=LOCAL_BATCH_SIZE,
    drop_last=False,
)

# 4. Wrap the data loader with a data provider.
data_provider = LEAFDataProvider(dataloader)
# Report how many users the provider exposes.
print(f"\nClients in total: {data_provider.num_users()}")

Creating FL User: 2493user [00:00, 26797.06user/s]
Creating FL User: 2493user [00:00, 32796.88user/s]
Creating FL User: 2493user [00:00, 33634.41user/s]


Clients in total: 2493





In [11]:
from torch import nn

class CharLSTM(nn.Module):
    """Character-level 2-layer LSTM classifier.

    Pipeline: embedding -> 2-layer LSTM (batch-first) -> pick one time step per
    sequence -> dropout -> linear layer producing ``num_classes`` logits.

    The index ``num_embeddings - 1`` is treated as padding when working out
    which time step to read (see ``forward``).

    Args:
        num_classes: number of output logits.
        n_hidden: LSTM hidden size.
        num_embeddings: size of the embedding table.
        embedding_dim: size of each embedding vector.
        max_seq_len: stored on the module; not otherwise used by this class.
        dropout_rate: dropout probability, used both between the LSTM layers
            and before the final linear layer.
    """

    def __init__(
        self,
        num_classes,
        n_hidden,
        num_embeddings,
        embedding_dim,
        max_seq_len,
        dropout_rate,
    ):
        super().__init__()
        self.dropout_rate = dropout_rate
        self.n_hidden = n_hidden
        self.num_classes = num_classes
        self.max_seq_len = max_seq_len
        self.num_embeddings = num_embeddings

        self.embedding = nn.Embedding(
            num_embeddings=self.num_embeddings, embedding_dim=embedding_dim
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=self.n_hidden,
            num_layers=2,
            batch_first=True,
            dropout=self.dropout_rate,
        )
        self.fc = nn.Linear(self.n_hidden, self.num_classes)
        self.dropout = nn.Dropout(p=self.dropout_rate)

    def forward(self, x):
        """Return logits of shape [B, C] for a batch of index sequences [B, S].

        For each row, the LSTM output is read at position
        ``(number of non-padding tokens) - 1``.

        NOTE: a row made up entirely of padding gives position -1, which
        indexes the last time step. Tokens equal to the padding index are not
        counted wherever they occur, so a padding-valued token in the middle
        of a sequence moves the read position earlier.
        """
        pad_index = self.num_embeddings - 1
        seq_lens = torch.sum(x != pad_index, 1) - 1
        x = self.embedding(x)  # [B, S] -> [B, S, E]
        out, _ = self.lstm(x)  # [B, S, E] -> [B, S, H]
        # Build the row index on the same device as the activations.
        batch_idx = torch.arange(out.size(0), device=out.device)
        out = out[batch_idx, seq_lens]  # [B, S, H] -> [B, H]
        out = self.fc(self.dropout(out))  # [B, H] -> [B, C]
        return out

In [20]:
from flsim.utils.example_utils import FLModel
import flsim.configs
from flsim.utils.config_utils import fl_config_from_json
from flsim.interfaces.metrics_reporter import Channel
from flsim.utils.example_utils import MetricsReporter
from omegaconf import OmegaConf
import math
from hydra.utils import instantiate
import copy


# TODO:
# - Put all this into a function
# - Another dimension for momentum
# - Properly adjust resource parameters for a good run that doesn't take too long, and get results
#     - 'resources' in FLSim_Adam_objective()
#         - Currently treating resources and epochs as equal, communication_rounds will need to be calculated
#     - USERS_PER_ROUND (users_per_round in json_config)

# Search space: a single dimension, the server learning rate "lr".
# The range spans three orders of magnitude, so it is sampled log-uniformly;
# with a uniform prior about 90% of the draws would fall in [1e-2, 1e-1].
dimensions = [Real(1e-4, 1e-1, prior="log-uniform", name="lr")]

# Create a metric reporter.
metrics_reporter = MetricsReporter([Channel.TENSORBOARD, Channel.STDOUT])

USERS_PER_ROUND = 100  # needs adjusting

def _build_fed_adam_config(lr, epochs, users_per_round):
    """Return the FLSim sync-trainer config dict with a FedAdam server optimizer."""
    return {
        "trainer": {
            "_base_": "base_sync_trainer",
            "server": {
                "_base_": "base_sync_server",
                "server_optimizer": {
                    # There are different types of server optimizers, e.g.
                    # "base_fed_avg_with_lr" (takes "lr", e.g. 0.7, and
                    # "momentum", e.g. 0.9), whereas e.g. fed_avg takes no lr.
                    #
                    # Federated ADAM (with weight decay)
                    # Server Defaults:
                    #   lr: float = 0.001
                    #   weight_decay: float = 0.00001
                    #   beta1: float = 0.9
                    #   beta2: float = 0.999
                    #   eps: float = 1e-8
                    "_base_": "base_fed_adam",
                    "lr": lr,
                },
                # aggregate client models into a single model by taking their weighted sum
                "aggregation_type": "WEIGHTED_AVERAGE",
                # type of user selection sampling
                "active_user_selector": {
                    "_base_": "base_uniformly_random_active_user_selector"
                },
            },
            "client": {
                # number of client's local epochs (the tutorial example used 1)
                "epochs": 10,
                "optimizer": {
                    # client's optimizer
                    "_base_": "base_optimizer_sgd",
                    # client's local learning rate (the tutorial example used 1)
                    "lr": 0.1,
                    # client's local momentum
                    "momentum": 0,
                },
            },
            # number of users per round for aggregation
            "users_per_round": users_per_round,
            # total number of global epochs
            # total #rounds = ceil(total_users / users_per_round) * epochs
            #   <---- THIS IS THE MAIN COMMUNICATION COST METRIC
            #   total_users = ~2500, data_provider.num_users()
            "epochs": epochs,
            # frequency of reporting train metrics
            "train_metrics_reported_per_epoch": 1,
            # False: the trained model is NOT always kept; per the tutorial's
            # description of this flag it is then kept only when it performs
            # better than the previous model on eval.
            "always_keep_trained_model": False,
            # frequency of evaluation per epoch
            "eval_epoch_frequency": 1,
            "do_eval": True,
            # should we report train metrics after global aggregation
            "report_train_metrics_after_aggregation": True,
        }
    }


def FLSim_Adam_objective(resources, checkpoint, **hyperparameters):
    '''
    Hyperband objective: train a CharLSTM with FLSim (FedAdam server) and
    return its negated test accuracy.

    resources: training budget for this call. It is currently used directly as
        the trainer's number of global epochs, NOT as a number of communication
        rounds (rounds = ceil(total_users / users_per_round) * epochs, which
        would be epochs = floor(resources / ceil(num_users / USERS_PER_ROUND))
        if resources were rounds).
    checkpoint: a model state dict to resume from (as returned by an earlier
        call), or None to start from a freshly initialised model.
    hyperparameters: are sourced from the named dimensions; only "lr" (the
        server learning rate) is read.

    Returns (-accuracy, checkpoint): accuracy is negated since the hyperband
    function aims to minimize; checkpoint is a deep copy of the model's
    state dict after training.

    Relies on the notebook-level names train_dataset, data_provider,
    metrics_reporter, MAX_SEQ_LEN, USE_CUDA and USERS_PER_ROUND.
    '''
    lr = hyperparameters["lr"]
    epochs = resources

    # 1. Build the model and optionally resume from a previous state.
    model = CharLSTM(
        num_classes=train_dataset.num_classes,
        n_hidden=100,
        num_embeddings=train_dataset.num_letters + 1,
        embedding_dim=100,
        max_seq_len=MAX_SEQ_LEN,
        dropout_rate=0.1,
    )
    if checkpoint is not None:
        model.load_state_dict(checkpoint)

    # 2. Choose where the model will be allocated.
    cuda_enabled = torch.cuda.is_available() and USE_CUDA
    device = torch.device("cuda:0" if cuda_enabled else "cpu")

    # 3. Wrap the model with FLModel.
    global_model = FLModel(model, device)  # model gets updated as global_model is trained
    # The wrapper must hold this very module: its state dict is read back below.
    assert global_model.fl_get_module() is model

    # 4. Move the model to GPU and enable CUDA if desired.
    if cuda_enabled:
        global_model.fl_cuda()

    json_config = _build_fed_adam_config(lr, epochs, USERS_PER_ROUND)
    cfg = fl_config_from_json(json_config)
    # if VERBOSE: print(OmegaConf.to_yaml(cfg))

    trainer = instantiate(cfg.trainer, model=global_model, cuda_enabled=cuda_enabled)

    # Launch FL training. The return value (model, eval score) is not used:
    # the eval score was observed to come back as None, so accuracy is taken
    # from an explicit test pass instead.
    trainer.train(
        data_provider=data_provider,
        metric_reporter=metrics_reporter,
        num_total_users=data_provider.num_users(),
        distributed_world_size=1,
    )

    acc = trainer.test(
        data_iter=data_provider.test_data(),
        metric_reporter=MetricsReporter([Channel.STDOUT]),
    )["Accuracy"]

    # Deep copy so the returned checkpoint is independent of this model object.
    ret_checkpt = copy.deepcopy(model.state_dict())
    return (-acc), ret_checkpt  # -acc since the hyperband functions aims to minimize



In [None]:
# Run the search. NOTE(review): the objective returns the negated accuracy, so
# the values named `accuracies` here are presumably negated as well; confirm
# against hyperband's return contract.
accuracies, hps = hyperband(
    objective=FLSim_Adam_objective,
    dimensions=dimensions,
    downsample=2,
    max_resources_per_model=32,  # 2**5; this parameter needs adjusting
)

# One line per evaluated configuration: score followed by its hyperparameters.
for acc, hp in zip(accuracies, hps):
    print(acc, hp)