# Federated Kvasir with Director example
## Using low-level Python API

# Long-Living entities update

* We now may have director running on another machine.
* We use Federation API to communicate with Director.
* Federation object should hold a Director's client (for user service)
* Keeping in mind that several API instances may be connacted to one Director.


* We do not think for now how we start a Director.
* But it knows the data shape and target shape for the DataScience problem in the Federation.
* Director holds the list of connected envoys, we do not need to specify it anymore.
* Director and Envoys are responsible for encrypting connections, we do not need to worry about certs.


* Yet we MUST have a cert to communicate to the Director.
* We MUST know the FQDN of a Director.
* Director communicates data and target shape to the Federation interface object.


* Experiment API may use this info to construct a dummy dataset and a `shard descriptor` stub.

In [1]:
# Install dependencies if not already installed
!pip install torchvision==0.8.1

You should consider upgrading via the '/home/itrushkin/.virtualenvs/openfl_research/bin/python -m pip install --upgrade pip' command.[0m


# Connect to the Federation

In [2]:
# Create a federation
from openfl.interface.interactive_api.federation import Federation

# please use the same identificator that was used in signed certificate
cliend_id = 'frontend'

# 1) Run with API layer - Director mTLS 
# If the user wants to enable mTLS their must provide CA root chain, and signed key pair to the federation interface
# cert_chain = 'cert/root_ca.crt'
# API_certificate = 'cert/frontend.crt'
# API_private_key = 'cert/frontend.key'

# federation = Federation(client_id='frontend', director_node_fqdn='localhost', director_port='50051',
#                        cert_chain=cert_chain, api_cert=API_certificate, api_private_key=API_private_key)

# --------------------------------------------------------------------------------------------------------------------

# 2) Run with TLS disabled (trusted environment)
# Federation can also determine local fqdn automatically
federation = Federation(client_id='frontend', director_node_fqdn='localhost', director_port='50052', tls=False)


In [3]:
shard_registry = federation.get_shard_registry()
shard_registry

{'env_one': {'shard_info': node_info {
    name: "env_one"
  }
  shard_description: "Kvasir dataset, shard number 1 out of 90"
  n_samples: 12
  sample_shape: "300"
  sample_shape: "400"
  sample_shape: "3"
  target_shape: "300"
  target_shape: "400",
  'is_online': True,
  'is_experiment_running': False,
  'last_updated': '2021-10-12 09:36:16',
  'current_time': '2021-10-12 09:36:32',
  'valid_duration': seconds: 120}}

In [4]:
federation.target_shape

['300', '400']

In [5]:
# First, request a dummy_shard_desc that holds information about the federated dataset 
dummy_shard_desc = federation.get_dummy_shard_descriptor(size=10)
sample, target = dummy_shard_desc[0]

## Creating a FL experiment using Interactive API

In [6]:
from openfl.interface.interactive_api.experiment import TaskInterface, DataInterface, ModelInterface, AggregationFunctionInterface, FLExperiment

### Register dataset

We extract User dataset class implementation.
Is it convinient?
What if the dataset is not a class?

In [7]:
import os
import PIL
import numpy as np
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchvision import transforms as tsf

# Now you can implement you data loaders using dummy_shard_desc
class KvasirSD(DataInterface, Dataset):

    def __init__(self, validation_fraction=1/8, **kwargs):
        super().__init__(**kwargs)
        
        self.validation_fraction = validation_fraction
        
        # Prepare transforms
        self.img_trans = tsf.Compose([
            tsf.ToPILImage(),
            tsf.Resize((332, 332)),
            tsf.ToTensor(),
            tsf.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])
        self.mask_trans = tsf.Compose([
            tsf.ToPILImage(),
            tsf.Resize((332, 332), interpolation=PIL.Image.NEAREST),
            tsf.ToTensor()])
        
    @property
    def shard_descriptor(self):
        return self._shard_descriptor
        
    @shard_descriptor.setter
    def shard_descriptor(self, shard_descriptor):
        """
        Describe per-collaborator procedures or sharding.

        This method will be called during a collaborator initialization.
        Local shard_descriptor  will be set by Envoy.
        """
        self._shard_descriptor = shard_descriptor
        
        validation_size = max(1, int(len(self.shard_descriptor) * self.validation_fraction))
        
        self.train_indeces = np.arange(len(self.shard_descriptor) - validation_size)
        self.val_indeces = np.arange(len(self.shard_descriptor) - validation_size, len(self.shard_descriptor))
        

    def __getitem__(self, index):
        img, mask = self.shard_descriptor[index]
        img = self.img_trans(img).numpy()
        mask = self.mask_trans(mask).numpy()
        return img, mask

    def __len__(self):
        return len(self.shard_descriptor)
    
    
    def get_train_loader(self, **kwargs):
        """
        Output of this method will be provided to tasks with optimizer in contract
        """
        train_sampler = SubsetRandomSampler(self.train_indeces)
        return DataLoader(
            self, num_workers=8, batch_size=self.kwargs['train_bs'], sampler=train_sampler
            )

    def get_valid_loader(self, **kwargs):
        """
        Output of this method will be provided to tasks without optimizer in contract
        """
        val_sampler = SubsetRandomSampler(self.val_indeces)
        return DataLoader(self, num_workers=8, batch_size=self.kwargs['valid_bs'], sampler=val_sampler)

    def get_train_data_size(self):
        """
        Information for aggregation
        """
        return len(self.train_indeces)

    def get_valid_data_size(self):
        """
        Information for aggregation
        """
        return len(self.val_indeces)

In [8]:
fed_dataset = KvasirSD(train_bs=4, valid_bs=8)
fed_dataset.shard_descriptor = dummy_shard_desc
for i, (sample, target) in enumerate(fed_dataset.get_train_loader()):
    print(sample.shape)

torch.Size([4, 3, 332, 332])
torch.Size([4, 3, 332, 332])
torch.Size([1, 3, 332, 332])


### Describe a model and optimizer

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

In [10]:
"""
UNet model definition
"""
from layers import soft_dice_coef, soft_dice_loss, DoubleConv, Down, Up


class UNet(nn.Module):
    def __init__(self, n_channels=3, n_classes=1):
        super().__init__()
        self.inc = DoubleConv(n_channels, 64)
        self.down1 = Down(64, 128)
        self.down2 = Down(128, 256)
        self.down3 = Down(256, 512)
        self.up1 = Up(512, 256)
        self.up2 = Up(256, 128)
        self.up3 = Up(128, 64)
        self.outc = nn.Conv2d(64, n_classes, 1)

    def forward(self, x):
        x1 = self.inc(x)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x = self.up1(x4, x3)
        x = self.up2(x, x2)
        x = self.up3(x, x1)
        x = self.outc(x)
        x = torch.sigmoid(x)
        return x
    
model_unet = UNet()

In [11]:
optimizer_adam = optim.Adam(model_unet.parameters(), lr=1e-4)

#### Register model

In [12]:
from copy import deepcopy

framework_adapter = 'openfl.plugins.frameworks_adapters.pytorch_adapter.FrameworkAdapterPlugin'
MI = ModelInterface(model=model_unet, optimizer=optimizer_adam, framework_plugin=framework_adapter)

# Save the initial model state
initial_model = deepcopy(model_unet)

### Define and register FL tasks

In [13]:
TI = TaskInterface()
AFI = AggregationFunctionInterface()
import torch

import tqdm
from openfl.component.aggregation_functions import Median

# The Interactive API supports registering functions definied in main module or imported.
def function_defined_in_notebook(some_parameter):
    print(f'Also I accept a parameter and it is {some_parameter}')

#The Interactive API supports overriding of the aggregation function
aggregation_function = Median()

# Task interface currently supports only standalone functions.
@TI.add_kwargs(**{'some_parameter': 42})
@TI.register_fl_task(model='unet_model', data_loader='train_loader', \
                     device='device', optimizer='optimizer')     
@AFI.set_aggregation_function(aggregation_function)
def train(unet_model, train_loader, optimizer, device, loss_fn=soft_dice_loss, some_parameter=None):
    if not torch.cuda.is_available():
        device = 'cpu'
    else:
        device = 'cuda'
    
    function_defined_in_notebook(some_parameter)
    
    train_loader = tqdm.tqdm(train_loader, desc="train")
    
    unet_model.train()
    unet_model.to(device)

    losses = []

    for data, target in train_loader:
        data, target = torch.tensor(data).to(device), torch.tensor(
            target).to(device, dtype=torch.float32)
        optimizer.zero_grad()
        output = unet_model(data)
        loss = loss_fn(output=output, target=target)
        loss.backward()
        optimizer.step()
        losses.append(loss.detach().cpu().numpy())
        
    return {'train_loss': np.mean(losses),}


@TI.register_fl_task(model='unet_model', data_loader='val_loader', device='device')     
def validate(unet_model, val_loader, device):
    if not torch.cuda.is_available():
        device = 'cpu'
    else:
        device = 'cuda'
        
    unet_model.eval()
    unet_model.to(device)
    
    val_loader = tqdm.tqdm(val_loader, desc="validate")

    val_score = 0
    total_samples = 0

    with torch.no_grad():
        for data, target in val_loader:
            samples = target.shape[0]
            total_samples += samples
            data, target = torch.tensor(data).to(device), \
                torch.tensor(target).to(device, dtype=torch.int64)
            output = unet_model(data)
            val = soft_dice_coef(output, target)
            val_score += val.sum().cpu().numpy()
            
    return {'dice_coef': val_score / total_samples,}

## Time to start a federated learning experiment

In [14]:
# create an experimnet in federation
experiment_name = 'kvasir_test_experiment'
fl_experiment = FLExperiment(federation=federation, experiment_name=experiment_name)

In [15]:
# If I use autoreload I got a pickling error

# The following command zips the workspace and python requirements to be transfered to collaborator nodes
fl_experiment.start(model_provider=MI, 
                    task_keeper=TI,
                    data_loader=fed_dataset,
                    aggregation_function_interface=AFI,
                    rounds_to_train=2,
                    opt_treatment='CONTINUE_GLOBAL',
                    )


In [16]:
# If user want to stop IPython session, then reconnect and check how experiment is going 
# fl_experiment.restore_experiment_state(MI)

fl_experiment.stream_metrics()

## Now we validate the best model!

In [17]:
best_model = fl_experiment.get_best_model()

  new_state[k] = pt.from_numpy(tensor_dict.pop(k)).to(device)


In [18]:
# We remove exremove_experiment_datamove_experiment_datamove_experiment_datariment data from director
fl_experiment.remove_experiment_data()

In [19]:
best_model.inc.conv[0].weight
# model_unet.inc.conv[0].weight

Parameter containing:
tensor([[[[-0.0737, -0.1915, -0.0344],
          [-0.0166,  0.1024, -0.1584],
          [ 0.0114, -0.0081,  0.1731]],

         [[-0.1079, -0.0270, -0.1239],
          [-0.1247, -0.1552, -0.1031],
          [ 0.1333,  0.0950, -0.1716]],

         [[-0.0861, -0.1247, -0.1340],
          [-0.0999,  0.1094, -0.1346],
          [-0.1167,  0.0042, -0.0011]]],


        [[[ 0.0277, -0.0771, -0.0197],
          [ 0.0077, -0.1779,  0.0551],
          [-0.1139, -0.0079,  0.1924]],

         [[-0.0055, -0.0722,  0.1574],
          [ 0.1661, -0.0146, -0.0411],
          [ 0.0908,  0.1240,  0.0143]],

         [[-0.0833,  0.0672,  0.0888],
          [-0.0406, -0.1838, -0.1030],
          [-0.0129, -0.0773, -0.1052]]],


        [[[-0.1422,  0.0150, -0.1097],
          [-0.0584,  0.0551, -0.0829],
          [ 0.1244,  0.1618,  0.1791]],

         [[ 0.1426,  0.1541, -0.1197],
          [ 0.1415, -0.0238,  0.0541],
          [ 0.1625,  0.1682,  0.1133]],

         [[-0.1785, -0

In [20]:
# Validating initial model
validate(initial_model, fed_dataset.get_valid_loader(), 'cpu')

  data, target = torch.tensor(data).to(device), \
  torch.tensor(target).to(device, dtype=torch.int64)
validate: 100%|██████████| 1/1 [00:00<00:00,  1.14it/s]


{'dice_coef': 3.748830931726843e-05}

In [21]:
# Validating trained model
validate(best_model, fed_dataset.get_valid_loader(), 'cpu')

  data, target = torch.tensor(data).to(device), \
  torch.tensor(target).to(device, dtype=torch.int64)
validate: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s]


{'dice_coef': 3.6274592275731266e-05}

## We can tune model further!

In [22]:
MI = ModelInterface(model=best_model, optimizer=optimizer_adam, framework_plugin=framework_adapter)
fl_experiment.start(model_provider=MI, task_keeper=TI, data_loader=fed_dataset, 
                    aggregation_function_interface=AFI, rounds_to_train=4, \
                              opt_treatment='CONTINUE_GLOBAL')

In [23]:
best_model = fl_experiment.get_best_model()
# Validating trained model
validate(best_model, fed_dataset.get_valid_loader(), 'cpu')

  data, target = torch.tensor(data).to(device), \
  torch.tensor(target).to(device, dtype=torch.int64)
validate: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]


{'dice_coef': 3.6274592275731266e-05}

In [24]:
!pip install ~/repos/openfl-fork

Processing /home/itrushkin/repos/openfl-fork




Building wheels for collected packages: openfl
  Building wheel for openfl (setup.py) ... [?25ldone
[?25h  Created wheel for openfl: filename=openfl-1.1-py3-none-any.whl size=260071753 sha256=2b68f805bae4cd8a18e0f8054d6f96cbd37b5ba1ce88bb7a268c60216d12244f
  Stored in directory: /home/itrushkin/.cache/pip/wheels/ed/ad/82/f26baafab4be32d052fb76a6f56c9a76ce6903cc3eed7bb394
Successfully built openfl
Installing collected packages: openfl
  Attempting uninstall: openfl
    Found existing installation: openfl 1.1
    Uninstalling openfl-1.1:
      Successfully uninstalled openfl-1.1
Successfully installed openfl-1.1
You should consider upgrading via the '/home/itrushkin/.virtualenvs/openfl_research/bin/python -m pip install --upgrade pip' command.[0m
