# Setup

In [None]:
# Mount the user's Google Drive inside the Colab VM. A later cell copies the zipped
# training results to a folder under /content/drive/MyDrive/.
# NOTE(review): `files` is imported but not used in the visible cells.
from google.colab import drive, files
drive.mount('/content/drive')

Mounted at /content/drive


## Install required libraries

In [None]:
!pip3 install 'torch'
!pip3 install 'torchvision'
!pip3 install 'pillow'
!pip3 install 'tqdm'
!pip3 install 'matplotlib'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Clone the GitHub repository 

In [None]:
import os

if not os.path.isdir('fedmd'):
  !git clone https://github.com/sergiuabed/fedmd
  %cd fedmd/
  !git checkout experiment
  %cd ..
else:
  %cd fedmd/
  !git pull origin
  %cd ..

if not os.path.isdir('fedmd/baselines_public_cifar10'):
  !mkdir fedmd/baselines_public_cifar10

Cloning into 'fedmd'...
remote: Enumerating objects: 2472, done.[K
remote: Counting objects: 100% (347/347), done.[K
remote: Compressing objects: 100% (322/322), done.[K
remote: Total 2472 (delta 26), reused 344 (delta 23), pack-reused 2125[K
Receiving objects: 100% (2472/2472), 403.74 MiB | 24.53 MiB/s, done.
Resolving deltas: 100% (790/790), done.
/content/fedmd
Updating files: 100% (416/416), done.
Branch 'experiment' set up to track remote branch 'experiment' from 'origin'.
Switched to a new branch 'experiment'
/content


## Import libraries

In [None]:
from fedmd.models_implementations.resnet import ResNet
from fedmd.models_implementations.densenet import DenseNet
from fedmd.models_implementations.shufflenetv2 import ShuffleNetV2
from fedmd.models_implementations.utils import model_size, plot_stats
from fedmd.models_implementations.train_on_cifar import train_on_cifar
from fedmd.data_utils import read_data_splits
from fedmd.client.private_dataloader import ClientPrivateDataset
from torch.utils.data import DataLoader
from fedmd.client.client import Client

# Train each client on their respective private data
Each client's private dataset is a subset of CIFAR100. The subsets are generated with a Dirichlet distribution parameterized by ALPHA. For very low ALPHA, the subsets are non-IID.

## Load CIFAR100 locally

In [None]:
# Prepare CIFAR100 locally using the repository's setup script.
# The script is invoked through a relative path, so the notebook first moves into
# fedmd/data and returns to the starting directory afterwards.
%cd fedmd/data
# Make sure the script is executable before running it.
!chmod +x setup_datasets.sh
!./setup_datasets.sh
# Go back up two levels so that later relative paths ('fedmd/...') resolve again.
%cd ../..

/content/fedmd/data
Downloading CIFAR100...
--2023-05-10 23:42:49--  https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 169001437 (161M) [application/x-gzip]
Saving to: ‘cifar-100-python.tar.gz’


2023-05-10 23:42:51 (79.5 MB/s) - ‘cifar-100-python.tar.gz’ saved [169001437/169001437]

cifar-100-python/
cifar-100-python/file.txt~
cifar-100-python/train
cifar-100-python/test
cifar-100-python/meta
Extracting images for pickle files...
#### Setting up CIFAR100 ####
Saving train images...
50000it [00:31, 1578.11it/s]
Saving test images...
10000it [00:07, 1382.98it/s]
/content


## Define hyperparameters
This cell defines the hyperparameters for each architecture

In [None]:
# ---- Optimisation hyperparameters shared by all architectures ----
LR = 1e-1
MOMENTUM = 0.9
WEIGHT_DECAY = 1e-4
NUM_EPOCHS = 30
# The networks are built with 10 outputs because they are first initialised with the
# weights obtained on the public dataset (CIFAR10).
NUM_CLASSES = 10
GROUPS = 2

# Root folder of the baselines trained on the public dataset.
BASELINES_DIR = os.path.join(os.getcwd(), 'fedmd', 'baselines_public_cifar10')

# ---- ResNet ----
LAYERS = [3, 3, 3]
RESNET_FILE_PATH = os.path.join(BASELINES_DIR, 'resnet20')

# ---- DenseNet ----
K = 12
DENSENET_FILE_PATH = os.path.join(BASELINES_DIR, 'densenet')

# ---- ShuffleNet ----
SHUFFLENET_FILE_PATH = os.path.join(BASELINES_DIR, 'shufflenetv2')
STAGES_REPEATS_BIG = [4, 8, 4]
STAGES_OUT_CHANNELS_BIG = [32, 64, 128, 256, 512]
STAGES_REPEATS_SMALL = [2, 4, 2]
STAGES_OUT_CHANNELS_SMALL = [16, 32, 64, 128, 256]

# The original cell assigned FILE_PATH three times in a row, so the ResNet and DenseNet
# paths were silently overwritten. The per-architecture constants above keep all three;
# FILE_PATH retains its previous final value for any code that still reads it.
FILE_PATH = SHUFFLENET_FILE_PATH

# Positional constructor arguments for each architecture variant.
RESNET20_BN_PARAMS = [LAYERS, NUM_CLASSES]                # ResNet(layers=LAYERS, num_classes=NUM_CLASSES)
RESNET20_GN_PARAMS = [LAYERS, NUM_CLASSES, "gn", GROUPS]  # ResNet(layers=LAYERS, num_classes=NUM_CLASSES, norm_layer="gn", groups=GROUPS)

DENSENET10_PARAMS = [K, 10, 1, NUM_CLASSES, False, GROUPS]  # DenseNet(K, 10, 1, NUM_CLASSES, False, GROUPS)
DENSENET20_PARAMS = [K, 20, 1, NUM_CLASSES, False, GROUPS]  # DenseNet(K, 20, 1, NUM_CLASSES, False, GROUPS)

SHUFFLENET_BIG = [STAGES_REPEATS_BIG, STAGES_OUT_CHANNELS_BIG, NUM_CLASSES, GROUPS]        # ShuffleNetV2(stages_repeats, stages_out_channels, NUM_CLASSES, GROUPS)
SHUFFLENET_SMALL = [STAGES_REPEATS_SMALL, STAGES_OUT_CHANNELS_SMALL, NUM_CLASSES, GROUPS]  # ShuffleNetV2(stages_repeats, stages_out_channels, NUM_CLASSES, GROUPS)

# Maps an architecture key to (model class, positional constructor arguments).
# Each key doubles as the sub-path of that architecture's folder inside
# fedmd/baselines_public_cifar10.
archs_dir = {
    "/resnet20/resnet20_bn": (ResNet, RESNET20_BN_PARAMS),
    "/resnet20/resnet20_gn": (ResNet, RESNET20_GN_PARAMS),
    "/densenet/densenet10": (DenseNet, DENSENET10_PARAMS),
    "/densenet/densenet20": (DenseNet, DENSENET20_PARAMS),
    "/shufflenetv2/shufflenetbig": (ShuffleNetV2, SHUFFLENET_BIG),
    "/shufflenetv2/shufflenetsmall": (ShuffleNetV2, SHUFFLENET_SMALL),
}


## Load dictionary of architectures
"client_architectures.csv" contains entries of the form "client_id,architecture"

In this cell, we create a dictionary "client_archs" having as key the id of the client and as value the network instantiated using the hyperparameters defined above. The network parameters are initialized with the parameters obtained after training on the public dataset (CIFAR10).

In [None]:
import csv
import torch.nn as nn
from fedmd.models_implementations.utils import load_model

filename ="fedmd/client/client_architectures.csv"
# this csv has records with attributes "client_id"(0,1,2,3,...) and "architecture" (the keys used in "archs_dir" above)

client_archs = {} #key=client_id (string, as read from the csv), val=model instance

# newline='' is the mode the csv module expects for files passed to csv.reader.
# The handle is named csv_file so it is not rebound inside the loop (the original reused
# the name `data` for both the open file and the loaded checkpoint).
with open(filename, 'r', newline='') as csv_file:
  for row in csv.reader(csv_file):
    # Skip blank rows (csv.reader yields [] for them) and the header row.
    if not row or row[0] == 'client_id':
      continue

    client_id, architecture = row[0], row[1]
    net_class, params = archs_dir[architecture]

    # instantiate model for the client
    net = net_class(*params)

    # load parameters of best model on the public dataset (CIFAR10)
    checkpoint = load_model('fedmd/baselines_public_cifar10'+architecture+'/best_model.pth')
    net.load_state_dict(checkpoint["weights"])

    # replace the last FC layer with a freshly initialised 100-output one for CIFAR100;
    # this must happen after load_state_dict, which restores the 10-output CIFAR10 head
    net.fc = nn.Linear(in_features=net.fc.in_features, out_features=100)

    client_archs[client_id] = net

print(len(client_archs))


100


The next cell will create the directory "independent_train" and a subdirectory for each client where the checkpoints and the stats will be stored for each client

In [None]:
import os

# Root folder for the per-client outputs of the independent training.
PATH = os.getcwd() + '/independent_train'

# One sub-directory per client: client0 ... client99.
c_ids = [str(i) for i in range(100)]

# exist_ok=True makes the cell idempotent: re-running it (or running it after a partial
# earlier run) creates whatever is missing instead of skipping everything as soon as
# 'independent_train' exists.
os.makedirs(PATH, exist_ok=True)
for c_id in c_ids:
  os.makedirs(PATH + '/client' + c_id, exist_ok=True)

In [None]:
from fedmd.models_implementations.train_on_cifar import _training
from fedmd.client.private_dataloader import ClientPrivateDataset
from fedmd.data_utils import read_data_splits
from torchvision.datasets import CIFAR100
from torchvision import transforms

PATH = os.getcwd() + '/independent_train'

LR = 1e-1     
MOMENTUM = 0.9       
WEIGHT_DECAY = 1e-4  
#NUM_EPOCHS = 150
NUM_CLASSES = 100
GROUPS = 2
LAYERS = [3, 3, 3]

PRIVATE_TRAIN_DATA_DIR = os.path.join('.', 'fedmd', 'data', 'cifar100', 'data', 'train') # location of json files storing the data splits
PRIVATE_TEST_DATA_DIR = os.path.join('.', 'fedmd', 'data', 'cifar100', 'data', 'test')
ALPHA = 1000.00
#ALPHA = 0.00    # non-IID
#ALPHA = 0.50    # non-IID

BATCH_SIZE = 128
NUM_WORKERS = 1

client_ids, train_data, test_data = read_data_splits(PRIVATE_TRAIN_DATA_DIR, PRIVATE_TEST_DATA_DIR, ALPHA)
# train_clients: list of client ids 
# train_data: dictionary with key=client_id and value=(dictionary storing the data of the client)
# test_data: dictionary storing the data for validation. It is not a dictionary of dictionaries. It is used by all clients

eval_transform = transforms.Compose(
    [
        transforms.ToTensor(),  # Turn PIL Image to torch.Tensor
        transforms.Normalize(
            (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)
        ),  # Normalizes tensor with mean and standard deviation
    ]
)

private_test_dataset = CIFAR100(root=".", train=True, transform=eval_transform, download=True)   # the same test dataset for all clients
private_test_dataloader = DataLoader(private_test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

private_train_dataloaders = {}
for cycle in range(4,10):
  c_ids = [str(i) for i in range(cycle*10, (cycle+1)*10)]
  for c_id in c_ids:
    client_path = PATH + '/client' + c_id# + '_' + str(ALPHA)

    private_train_dataset = ClientPrivateDataset(train_data[c_id], train=True)
    private_train_dataloaders[c_id] = DataLoader(private_train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)# , drop_last=True)

    num_epochs = 10 # in case we'll analyse ALPHA=0.50
    if ALPHA == 0.00:
      num_epochs = 5
    elif ALPHA == 1000.00:
      num_epochs = 30

    print()
    print(f"####################################### Beginning training on client{c_id} on it private data: ALPHA={ALPHA} ############################################")
    print()
    _training(net=client_archs[c_id], tr_set=private_train_dataloaders[c_id], val_set=private_test_dataloader, num_epochs=num_epochs, lr=LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY,
              file_path=client_path)

    os.rename('independent_train/client'+c_id+'/best_model.pth', 'independent_train/client'+c_id+'/best_model_'+str(ALPHA)+'.pth')
    os.rename('independent_train/client'+c_id+'/checkpoint.pth', 'independent_train/client'+c_id+'/checkpoint_'+str(ALPHA)+'.pth')
    os.rename('independent_train/client'+c_id+'/stats.csv', 'independent_train/client'+c_id+'/stats_'+str(ALPHA)+'.csv')

  zip_name = f"/content/independent_train_{cycle*10}_{(cycle+1)*10}_ALPHA_{ALPHA}.zip"
  !zip -r {zip_name} /content/independent_train/
  !cp {zip_name} /content/drive/MyDrive/understanding_federated_learning_project/independent_train



Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./cifar-100-python.tar.gz


100%|██████████| 169001437/169001437 [00:01<00:00, 106110803.14it/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Current Val Accuracy = 0.16308343989769822
Starting epoch 18/30, LR = 0.1
Current Avg Loss = 2.034829616546631
Current Val Accuracy = 0.1578484654731458
Starting epoch 19/30, LR = 0.1
Current Avg Loss = 1.726812720298767
Current Val Accuracy = 0.16825847186700768
Starting epoch 20/30, LR = 0.1
Current Avg Loss = 1.6916913986206055
Current Val Accuracy = 0.16861812659846548
Starting epoch 21/30, LR = 0.1
Current Avg Loss = 1.601304531097412
Current Val Accuracy = 0.16156489769820973
Starting epoch 22/30, LR = 0.1
Current Avg Loss = 1.4701106548309326
Current Val Accuracy = 0.15403212915601022
Starting epoch 23/30, LR = 0.1
Current Avg Loss = 1.5155078172683716
Current Val Accuracy = 0.17413283248081843
Starting epoch 24/30, LR = 0.1
Current Avg Loss = 1.545712947845459
Current Val Accuracy = 0.1698968989769821
Starting epoch 25/30, LR = 0.1
Current Avg Loss = 1.343461036682129
Current Val Accuracy = 0.1716951726342711
Star

In [None]:
  # Manually archive the whole /content/independent_train folder and copy the archive
  # to Google Drive. Requires ALPHA to be defined by the training cell.
  # NOTE(review): the "0_7" in the archive name is hard-coded; presumably it labels the
  # range covered by a particular run — confirm or derive it from the loop bounds.
  zip_name = f"/content/independent_train_0_7_ALPHA_{ALPHA}.zip"
  !zip -r {zip_name} /content/independent_train/
  !cp {zip_name} /content/drive/MyDrive/understanding_federated_learning_project/independent_train