# Workflow Interface 104: Synthetic Data with Fedcurv implementation
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/openfl/blob/develop/openfl-tutorials/experimental/Workflow_Interface_104_Synthetic_data_with_fedcurv.ipynb)

In this OpenFL workflow interface tutorial, we'll learn how to implement the FedCurv aggregation algorithm using a synthetic dataset. For a comparison of the various aggregation algorithms, see the FedProx tutorial (link to be added).

# Getting Started

First, we install the dependencies required by the workflow interface.

In [None]:
# !pip install git+https://github.com/intel/openfl.git
# !pip install -r https://raw.githubusercontent.com/intel/openfl/develop/openfl-tutorials/experimental/requirements_workflow_interface.txt

# Uncomment this if running in Google Colab
#import os
#os.environ["USERNAME"] = "colab"

In [None]:
import torch
import torch.utils.data as data
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

import random
import collections

import warnings
warnings.filterwarnings("ignore")

Now we'll generate a synthetic dataset and define the `SyntheticFederatedDataset` class for our experiment.

In [None]:
# Seed used by Net.__init__ and by the optional set_seed(RANDOM_SEED) call below.
RANDOM_SEED = 10
# Mini-batch size used for the collaborators' train/test DataLoaders.
batch_size = 10

def set_seed(seed):
    """Seed every RNG used in this notebook and switch torch to deterministic mode.

    Args:
        seed: Value passed to the `random`, `numpy.random` and `torch` seeders.
    """
    # Pseudo-random number generators (Python, NumPy, torch CPU and CUDA).
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Determinism switches: deterministic algorithms on, cuDNN disabled and
    # its benchmark mode turned off.
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Uncomment the line below for setting seed.
# set_seed(RANDOM_SEED)


def one_hot(labels, classes):
    """Return the one-hot encoding of `labels` over `classes` categories.

    Args:
        labels: An integer index or an integer array/sequence of indices.
        classes: Number of categories (length of each one-hot row).

    Returns:
        The rows of a `classes` x `classes` identity matrix selected by `labels`.
    """
    identity = np.eye(classes)
    return identity[labels]


def softmax(x):
    """Return the softmax of `x`, normalised over all of its elements.

    The maximum is subtracted before exponentiating so that large logits do
    not overflow to ``inf`` (which would make the result ``nan``); this shift
    leaves the mathematical result unchanged.

    Args:
        x: Array-like of logits.

    Returns:
        Array of the same shape as `x` whose elements sum to 1. An empty
        input yields an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(x)
    ex = np.exp(x - np.max(x))
    return ex / np.sum(ex)


def generate_synthetic(alpha, beta, iid, num_collaborators, num_classes):
    """Generate a synthetic classification dataset split across collaborators.

    Every collaborator receives its own number of samples, its own feature
    distribution and, unless ``iid == 1``, its own linear labelling model.
    A sample ``x`` is labelled ``argmax(x @ W + b)``. All random draws come
    from the global ``np.random`` state, so seed it beforehand for
    reproducible output.

    Args:
        alpha: Standard deviation of the normal draw that sets each
            collaborator's mean for ``W`` (the same means are used for ``b``).
        beta: Standard deviation of the normal draw that sets each
            collaborator's feature-mean offset ``B[i]``.
        iid: When equal to ``1`` all collaborators share one global ``W``/``b``
            and every feature mean of collaborator ``i`` equals ``B[i]``;
            otherwise ``W``, ``b`` and the feature means are drawn per
            collaborator.
        num_collaborators: Number of collaborators to generate data for.
        num_classes: Number of output classes.

    Returns:
        ``(X_split, y_split)``: two lists of length ``num_collaborators``.
        ``X_split[i]`` is a list of 60-element feature rows and ``y_split[i]``
        is the list of matching class indices stored as floats.
    """
    dimension = 60

    # Log-normal sample counts, truncated to int, with at least 50 each.
    samples_per_user = (
        np.random.lognormal(4, 2, num_collaborators).astype(int) + 50
    )

    # Per-collaborator priors.
    mean_W = np.random.normal(0, alpha, num_collaborators)
    mean_b = mean_W
    B = np.random.normal(0, beta, num_collaborators)

    # Diagonal feature covariance with entries (j + 1) ** -1.2.
    cov_x = np.diag([np.power(j + 1, -1.2) for j in range(dimension)])

    mean_x = np.zeros((num_collaborators, dimension))
    for i in range(num_collaborators):
        if iid == 1:
            mean_x[i] = B[i]  # every feature mean equals B[i]
        else:
            mean_x[i] = np.random.normal(B[i], 1, dimension)

    if iid == 1:
        W_global = np.random.normal(0, 1, (dimension, num_classes))
        b_global = np.random.normal(0, 1, num_classes)

    X_split, y_split = [], []
    for i in range(num_collaborators):
        # Drawn even when iid == 1 (and then discarded) so the sequence of
        # random numbers consumed stays the same as before.
        W = np.random.normal(mean_W[i], 1, (dimension, num_classes))
        b = np.random.normal(mean_b[i], 1, num_classes)
        if iid == 1:
            W, b = W_global, b_global

        xx = np.random.multivariate_normal(
            mean_x[i], cov_x, samples_per_user[i])

        # Softmax is monotonic, so the arg-max of the logits is the arg-max
        # of the class probabilities; one matrix product labels all samples.
        yy = np.argmax(xx @ W + b, axis=1).astype(float)

        X_split.append(xx.tolist())
        y_split.append(yy.tolist())

    return X_split, y_split


class SyntheticFederatedDataset:
    """Synthetic dataset partitioned per collaborator with a 90/10 train/valid split.

    Attributes:
        batch_size: Batch size used for the loaders built by `split`.
        X_train_all, X_valid_all: 1-D object arrays; element ``i`` is the
            float32 feature array of collaborator ``i``.
        y_train_all, y_valid_all: 1-D object arrays; element ``i`` is the
            one-hot label array of collaborator ``i``.
    """

    def __init__(self, num_collaborators, batch_size=1, num_classes=10, **kwargs):
        """Generate the data and split each collaborator's share 90/10.

        Args:
            num_collaborators: Number of collaborators to generate data for.
            batch_size: Batch size later used by `split`.
            num_classes: Number of classes (width of the one-hot labels).
            **kwargs: Accepted but not used.
                NOTE(review): the notebook passes ``seed=...`` here and it is
                silently ignored; seed ``np.random`` beforehand if
                reproducible data is needed.
        """
        self.batch_size = batch_size
        X, y = generate_synthetic(0.0, 0.0, 0, num_collaborators, num_classes)
        features = [np.asarray(col, dtype=np.float32) for col in X]
        targets = [one_hot(np.asarray(col, dtype=int), num_classes) for col in y]

        # First 90% of each collaborator's samples train, the rest validate.
        self.X_train_all = self._ragged([col[:int(0.9 * len(col))] for col in features])
        self.X_valid_all = self._ragged([col[int(0.9 * len(col)):] for col in features])
        self.y_train_all = self._ragged([col[:int(0.9 * len(col))] for col in targets])
        self.y_valid_all = self._ragged([col[int(0.9 * len(col)):] for col in targets])

    @staticmethod
    def _ragged(parts):
        """Pack per-collaborator arrays into a 1-D object array.

        Filling element by element keeps the result 1-D even when every
        collaborator happens to hold the same number of samples; in that case
        ``np.array(parts, dtype=object)`` would build an N-D object array that
        `torch.from_numpy` cannot consume.
        """
        packed = np.empty(len(parts), dtype=object)
        for idx, part in enumerate(parts):
            packed[idx] = part
        return packed

    def _make_loader(self, features, targets):
        """Build a shuffled DataLoader over one collaborator's arrays."""
        dataset = data.TensorDataset(
            torch.from_numpy(features),
            torch.from_numpy(targets),
        )
        # Use the instance's batch size rather than a module-level global.
        return data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

    def split(self, collaborators):
        """Attach train/test loaders to each collaborator's private attributes.

        Args:
            collaborators: Sequence of objects; the ``i``-th one receives a
                ``private_attributes`` dict with ``"train_loader"`` and
                ``"test_loader"`` built from the ``i``-th data partition.
        """
        for i, collaborator in enumerate(collaborators):
            collaborator.private_attributes = {
                "train_loader": self._make_loader(
                    self.X_train_all[i], self.y_train_all[i]),
                "test_loader": self._make_loader(
                    self.X_valid_all[i], self.y_valid_all[i]),
            }

Let's now define the model, optimizer and some helper functions.

In [None]:
class Net(nn.Module):
    """Two stacked linear layers mapping 60 input features to 10 class scores."""

    def __init__(self):
        # Re-seed torch with RANDOM_SEED so that every Net() is created with
        # the same initial weights.
        generator = torch.manual_seed(RANDOM_SEED)
        torch.set_rng_state(generator.get_state())
        super().__init__()
        self.linear1 = nn.Linear(60, 100)
        self.linear2 = nn.Linear(100, 10)

    def forward(self, x):
        """Apply `linear1` followed by `linear2` and return the result."""
        hidden = self.linear1(x)
        return self.linear2(hidden)
    
def cross_entropy(output, target, size_average=None):
    """Cross-entropy loss between model outputs and one-hot style targets.

    Args:
        output: Model outputs, passed to `F.cross_entropy` as its input.
        target: Tensor whose per-row maximum along dim 1 marks the class.
        size_average: Forwarded unchanged to `F.cross_entropy`.

    Returns:
        The value returned by `F.cross_entropy`.
    """
    # Turn each target row into the index of its largest entry.
    _, class_indices = torch.max(target, 1)
    return F.cross_entropy(output, class_indices, size_average=size_average)


def compute_loss_and_acc(network, dataloader):
    """Evaluate `network` on `dataloader` and report accuracy and mean loss.

    The loss is accumulated as a per-sample sum and divided by the number of
    samples, so it is the true mean cross-entropy regardless of batch size
    (summing per-batch means and dividing by the sample count would shrink
    it by roughly the batch size).

    Args:
        network: Model to evaluate; it is switched to eval mode.
        dataloader: `torch.utils.data.DataLoader` yielding
            ``(features, one_hot_targets)`` batches.

    Returns:
        A 3-tuple ``(accuracy, loss, correct)``: the fraction of correctly
        classified samples, the mean per-sample cross-entropy, and the number
        of correctly classified samples.
    """
    network.eval()

    # Evaluate on whichever device holds the model's parameters, so a model
    # that was moved to an accelerator during training still works here.
    first_param = next(network.parameters(), None)
    device = first_param.device if first_param is not None else None

    total_loss = 0.0
    correct = 0
    with torch.no_grad():
        for features, target in dataloader:
            if device is not None:
                features = features.to(device)
                target = target.to(device)
            output = network(features)
            labels = target.argmax(dim=1)
            total_loss += F.cross_entropy(output, labels, reduction="sum").item()
            correct += (output.argmax(dim=1) == labels).sum().item()

    dataloader_size = len(dataloader.dataset)
    test_loss = total_loss / dataloader_size
    accuracy = correct / dataloader_size
    return accuracy, test_loss, correct

Next we import the `FLSpec`, `LocalRuntime`, and placement decorators.

- `FLSpec` – Defines the flow specification. User defined flows are subclasses of this.
- `Runtime` – Defines where the flow runs, infrastructure for task transitions (how information gets sent). The `LocalRuntime` runs the flow on a single node.
- `aggregator/collaborator` - placement decorators that define where the task will be assigned

In addition to these, we also import the `FedCurv` utility along with the `fedcurv_weighted_average` aggregation function.

In [None]:
from copy import deepcopy

from openfl.experimental.interface import FLSpec, Aggregator, Collaborator
from openfl.experimental.runtime import LocalRuntime
from openfl.experimental.placement import aggregator, collaborator

from openfl.experimental.interface.aggregation_functions.fedcurv_weighted_average import fedcurv_weighted_average
from openfl.experimental.utilities.fedcurv import FedCurv

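Let us now define the workflow for our experiment. We follow the methodology from the quickstart and define a workflow consisting of the following steps:

1. `start` – the aggregator prints the round banner and sends the model to every collaborator.
2. `compute_loss_and_accuracy` – each collaborator evaluates the current model on its train and test loaders.
3. `gather_results_and_take_weighted_average` – the aggregator computes the dataset-size-weighted training loss and test accuracy.
4. `select_collaborators` – the aggregator randomly picks `n_selected_collaborators` collaborators.
5. `train_selected_collaborators` – each selected collaborator trains locally with the FedCurv penalty added to the loss.
6. `join` – the aggregator averages the trained models, weighted by training-set size.
7. `end` – the aggregator advances the round counter or reports the end of the flow.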


In [None]:
class FederatedFlow(FLSpec):
    """FedCurv flow: evaluate on all collaborators, train on a subset, aggregate.

    Step order (each step hands over with ``self.next``):
    ``start`` -> ``compute_loss_and_accuracy`` (every collaborator) ->
    ``gather_results_and_take_weighted_average`` -> ``select_collaborators``
    -> ``train_selected_collaborators`` (selected collaborators) -> ``join``
    -> ``end``.
    """

    def __init__(
        self,
        model=None,
        optimizer=None,
        agg_method=None,
        n_selected_collaborators=10,
        total_rounds=10,
        **kwargs,
    ):
        """Store the model, optimizer and aggregation helper for the flow.

        Args:
            model: Model to train. When ``None``, a fresh ``Net``, an SGD
                optimizer and a ``FedCurv`` helper are created and the
                `optimizer` / `agg_method` arguments are not used.
            optimizer: Optimizer stored alongside a user-supplied `model`.
            agg_method: Aggregation helper stored alongside a user-supplied
                `model`.
            n_selected_collaborators: Number of collaborators trained per run.
            total_rounds: Upper bound used by `end` when advancing
                ``round_number``.
            **kwargs: Forwarded to ``FLSpec.__init__``.
        """
        super().__init__(**kwargs)
        self.n_selected_collaborators = n_selected_collaborators
        self.total_rounds = total_rounds
        self.round_number = 0
        if model is not None:
            self.model = model
            self.optimizer = optimizer
            self.agg_method = agg_method
        else:
            self.model = Net()
            # `optim`, `learning_rate` and `momentum` are module-level names
            # that must be defined before the flow is instantiated.
            self.optimizer = optim.SGD(self.model.parameters(), lr=learning_rate,
                                       momentum=momentum)
            self.agg_method = FedCurv(self.model, importance=1e4)
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = 'cuda:0'

    @aggregator
    def start(self):
        """Print the round banner and fan evaluation out to every collaborator."""
        print('Performing initialization for model')
        print(20*"#")
        print(f"Round {self.round_number}")
        print(20*"#")
        self.collaborators = self.runtime.collaborators
        self.next(self.compute_loss_and_accuracy, foreach='collaborators')

    @collaborator
    def compute_loss_and_accuracy(self):
        """
        Evaluate the current model on this collaborator's train and test
        loaders, record the dataset sizes and print the result.
        """
        # Compute Train Loss and Train Acc
        self.training_accuracy, self.training_loss, _ = compute_loss_and_acc(
            self.model, self.train_loader)

        # Compute Test Loss and Test Acc
        self.agg_validation_score, self.agg_validation_loss, test_correct = compute_loss_and_acc(
            self.model, self.test_loader)

        self.train_dataset_length = len(self.train_loader.dataset)
        self.test_dataset_length = len(self.test_loader.dataset)

        print(
            "<Collab: {:<5}> | Train Round: {:<5} : Train Loss {:<.6f}, Test Acc: {:<.6f} [{}/{}]".format(
                self.input,
                self.round_number,
                self.training_loss,
                self.agg_validation_score,
                test_correct,
                self.test_dataset_length
            )
        )

        self.next(self.gather_results_and_take_weighted_average)

    @aggregator
    def gather_results_and_take_weighted_average(self, inputs):
        """
        Gather the results computed by all collaborators in the previous step.
        Compute train and test weights from the dataset sizes, then the
        weighted average of the training losses and of the test accuracies.
        """
        # Dataset totals are computed once instead of once per collaborator.
        total_train = sum(input_.train_dataset_length for input_ in inputs)
        total_test = sum(input_.test_dataset_length for input_ in inputs)

        self.train_weights = [
            input_.train_dataset_length / total_train for input_ in inputs
        ]
        self.test_weights = [
            input_.test_dataset_length / total_test for input_ in inputs
        ]

        aggregated_model_loss_list = [input_.training_loss for input_ in inputs]
        aggregated_model_accuracy_list = [
            input_.agg_validation_score for input_ in inputs
        ]

        # Weighted average of training loss
        self.aggregated_model_training_loss = fedcurv_weighted_average(
            aggregated_model_loss_list, self.train_weights)

        # Weighted average of aggregated model accuracy
        self.aggregated_model_test_accuracy = fedcurv_weighted_average(
            aggregated_model_accuracy_list, self.test_weights)

        print(
            "<Agg> | Train Round: {:<5} : Agg Train Loss {:<.6f}, Agg Test Acc: {:<.6f}".format(
                self.round_number,
                self.aggregated_model_training_loss,
                self.aggregated_model_test_accuracy
            )
        )

        self.next(self.select_collaborators)

    @aggregator
    def select_collaborators(self):
        """
        Randomly select n_selected_collaborators collaborators, without
        replacement. The NumPy RNG is re-seeded with the round number first,
        so the selection depends only on the round number.
        """
        np.random.seed(self.round_number)
        self.selected_collaborator_indices = np.random.choice(
            range(len(self.collaborators)),
            self.n_selected_collaborators,
            replace=False,
        )
        self.selected_collaborators = [
            self.collaborators[idx] for idx in self.selected_collaborator_indices
        ]

        self.next(self.train_selected_collaborators, foreach="selected_collaborators")

    @collaborator
    def train_selected_collaborators(self):
        """
        Train this selected collaborator for `local_epoch` epochs with SGD,
        adding the FedCurv penalty to the cross-entropy loss.
        """
        self.train_dataset_length = len(self.train_loader.dataset)

        # A fresh optimizer is created for every local training run.
        self.optimizer = optim.SGD(self.model.parameters(), lr=learning_rate,
                                   momentum=momentum)

        self.agg_method.on_train_begin(self.model)
        self.model = self.model.to(self.device)
        self.model.train(mode=True)

        for epoch in range(local_epoch):
            train_loss = []
            correct = 0
            for data, target in self.train_loader:
                data = data.to(self.device)
                target = target.to(self.device)
                self.optimizer.zero_grad()
                output = self.model(data)
                loss = cross_entropy(output, target) + self.agg_method.get_penalty(self.model, self.device)
                loss.backward()
                self.optimizer.step()
                pred = output.argmax(dim=1, keepdim=True)
                tar = target.argmax(dim=1, keepdim=True)
                correct += pred.eq(tar).sum().item()
                train_loss.append(loss.item())
            training_accuracy = float(correct / self.train_dataset_length)
            training_loss = np.mean(train_loss)
            print(
                "<Collab: {:<5}> | Train Round: {:<5} | Local Epoch: {:<3}: FedCurv Optimization Train Loss {:<.6f}, Train Acc: {:<.6f} [{}/{}]".format(
                    self.input,
                    self.round_number,
                    epoch,
                    training_loss,
                    training_accuracy,
                    correct,
                    self.train_dataset_length
                )
            )
            # NOTE(review): on_train_end runs after every local epoch here,
            # not once after the whole loop -- confirm this is intended.
            self.agg_method.on_train_end(self.model, self.train_loader, self.device, 'cross_entropy')
        self.next(self.join)

    @aggregator
    def join(self, inputs):
        """
        Average the trained collaborators' model state dicts, weighted by
        training-set size, and load the result into the aggregator's model.
        """
        train_datasize = sum(input_.train_dataset_length for input_ in inputs)

        train_weights, model_state_dict_list = [], []
        for input_ in inputs:
            train_weights.append(input_.train_dataset_length / train_datasize)
            model_state_dict_list.append(input_.model.state_dict())
        fedcurv_model_dict = fedcurv_weighted_average(model_state_dict_list, train_weights)
        self.model.load_state_dict(fedcurv_model_dict)
        self.next(self.end)

    @aggregator
    def end(self):
        """
        Finish one run of the flow: announce the end once round_number has
        reached total_rounds - 1, otherwise advance round_number by one.
        """
        if self.round_number == self.total_rounds - 1:
            print('This is the end of the flow')
        else:
            self.round_number += 1

**Federation Setup**

We'll now set up the federation by defining the number of collaborators, initializing the dataset, and creating the runtime.

In [None]:
num_collaborators = 30

# Setup aggregator.
# The instance is deliberately NOT named `aggregator`: that name is the
# placement decorator imported from openfl.experimental.placement, and
# rebinding it would break re-running the FederatedFlow cell afterwards.
aggregator_instance = Aggregator()
aggregator_instance.private_attributes = {}

# Setup collaborators with private attributes
collaborator_names = [f"col{i}" for i in range(num_collaborators)]

collaborators = [Collaborator(name=name) for name in collaborator_names]

# NOTE(review): `seed` is accepted by SyntheticFederatedDataset through
# **kwargs but is not used there; call set_seed(RANDOM_SEED) first if the
# generated data must be reproducible.
synthetic_federated_dataset = SyntheticFederatedDataset(
    batch_size=batch_size, num_classes=10, num_collaborators=len(collaborators), seed=RANDOM_SEED)
synthetic_federated_dataset.split(collaborators)

local_runtime = LocalRuntime(
    aggregator=aggregator_instance, collaborators=collaborators, backend="single_process")

In [None]:
# Per-algorithm history of aggregated metrics; each algorithm gets its own
# pair of independent lists.
loss_and_acc = {
    algorithm: {"Train Loss": [], "Test Accuracy": []}
    for algorithm in ("Fedcurv", "FedAvg")
}

Now that we have our flow and runtime defined, let's run the experiment! 

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Experiment settings.
n_selected_collaborators = 10  # collaborators trained in each run of the flow
n_epochs = 200                 # number of times the flow is run
local_epoch = 20               # local training epochs per selected collaborator
total_rounds = 5               # bound used by FederatedFlow.end for round_number

# Optimizer hyper-parameters read by FederatedFlow.
learning_rate = 0.01
momentum = 0.5
log_interval = 10  # not referenced by any other code in this notebook

flflow = FederatedFlow(
    total_rounds=total_rounds,
    n_selected_collaborators=n_selected_collaborators,
)
flflow.runtime = local_runtime

# Run the flow repeatedly and record the aggregated metrics after every run.
for i in range(n_epochs):
    flflow.run()
    aggregated_model_training_loss, aggregated_model_test_accuracy = (
        flflow.aggregated_model_training_loss,
        flflow.aggregated_model_test_accuracy,
    )

    loss_and_acc["Fedcurv"]["Train Loss"].append(aggregated_model_training_loss)
    loss_and_acc["Fedcurv"]["Test Accuracy"].append(aggregated_model_test_accuracy)


**Comparison of aggregation algorithms**

Now that we have demonstrated FedCurv on a synthetic dataset, let's run through the FedProx tutorial to see how FedCurv compares to FedAvg and FedProx.