##### *EAS 595 - Fundamentals of AI*
# **Project 2**
Federated learning using Logistic Regression

*Pysyft installation*

In [0]:
# !pip3 install syft
# !pip3 install tf-encrypted
# !pip3 install pandas

*Importing required libraries*

In [0]:

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import torch

from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional

import syft

from syft.frameworks.torch import fl


*Mounting GDrive to access wdbc.data file*

In [0]:

from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


*The following codeblock has the functions that are provided in syft.frameworks.torch.federated.utils.py - the operation performed by each function is provided within the functions.*

*The main aim to use these functions is to generate a much improved version of the model than the previous one* (Refer: [https://towardsdatascience.com/federated-learning-3097547f8ca3](https://)) 

In [0]:

def add_model(model1, model2):
  '''
  This function adds the parameters of 2 models
  '''

  param1 = model2.named_parameters()
  param2 = model1.named_parameters()

  dict_param2 = dict(param2)

  with torch.no_grad():
    for n1, p1 in param1:
      if n1 in dict_param2:
        dict_param2[n1].set_(p1.data + dict_param2[n1].data)

  return model1

def scale_model(model, scale):
  '''
  This function scales the parameters of a model
  '''

  param = model.named_parameters()

  dict_param = dict(param)

  with torch.no_grad():
    for n, p in dict_param.items():
      dict_param[n].set_(dict_param[n].data * scale)

  return model

def federated_avg(model_dict):
  '''
  This function calculates the federated average of the list of models
  '''

  no_of_models = len(model_dict)

  model_list = list(model_dict.values())

  init_model = model_list[0]

  for i in range(no_of_models):
    init_model = add_model(init_model, model_list[i])
  
  init_model = scale_model(model, 1.0/no_of_models)

  return init_model


*If the dataset is in a different location, change the following variable*

In [0]:

class data_specs:
  wdbc_data= "/content/gdrive/My Drive/wdbc.data"


*The properties of the model can be altered by changing the variables in this class*

In [0]:

class model_specs:
  batch_size= 30
  learn_rate= 0.001
  iterations= 100


*This class creates a Logistic Regression model with a 1-layer NN with 30 features (as in wdbc.data)*

In [0]:

class lr_model(torch.nn.Module):
  def __init__(self, features, output):
    super(lr_model, self).__init__()
    self.linear= torch.nn.Linear(features, output)
    self.sigmoid= torch.nn.Sigmoid()

  def forward(self, x):
    y_pred= self.sigmoid(self.linear(x))

    return y_pred


#### **Step 1: Extract features values and Image Ids from the data**

*Reading wdbc.data file*

In [0]:

pd.set_option("display.max_colwidth", 100)
wdbc_data= pd.read_csv(data_specs.wdbc_data, delimiter= ",", encoding= "utf-8", header= None)

print(wdbc_data.head())


         0  1      2      3       4   ...      27      28      29      30       31
0    842302  M  17.99  10.38  122.80  ...  0.6656  0.7119  0.2654  0.4601  0.11890
1    842517  M  20.57  17.77  132.90  ...  0.1866  0.2416  0.1860  0.2750  0.08902
2  84300903  M  19.69  21.25  130.00  ...  0.4245  0.4504  0.2430  0.3613  0.08758
3  84348301  M  11.42  20.38   77.58  ...  0.8663  0.6869  0.2575  0.6638  0.17300
4  84358402  M  20.29  14.34  135.10  ...  0.2050  0.4000  0.1625  0.2364  0.07678

[5 rows x 32 columns]


*Prepparing data for reading*

In [0]:

wdbc_data.rename(columns= {
    0: "id",
    1: "diagnosis"
}, inplace= True)

target = wdbc_data["diagnosis"]
y = np.array([1 if i == "M" else 0 for i in target])

del wdbc_data["id"]
del wdbc_data["diagnosis"]

print(wdbc_data.head())


      2      3       4       5   ...      28      29      30       31
0  17.99  10.38  122.80  1001.0  ...  0.7119  0.2654  0.4601  0.11890
1  20.57  17.77  132.90  1326.0  ...  0.2416  0.1860  0.2750  0.08902
2  19.69  21.25  130.00  1203.0  ...  0.4504  0.2430  0.3613  0.08758
3  11.42  20.38   77.58   386.1  ...  0.6869  0.2575  0.6638  0.17300
4  20.29  14.34  135.10  1297.0  ...  0.4000  0.1625  0.2364  0.07678

[5 rows x 30 columns]


#### **Step 2: Apply feature scaling technique**

*Scaling data to a common magnitude*

In [0]:

scaler = StandardScaler()
x = scaler.fit_transform(wdbc_data)

print(x)


[[ 1.09706398 -2.07333501  1.26993369 ...  2.29607613  2.75062224
   1.93701461]
 [ 1.82982061 -0.35363241  1.68595471 ...  1.0870843  -0.24388967
   0.28118999]
 [ 1.57988811  0.45618695  1.56650313 ...  1.95500035  1.152255
   0.20139121]
 ...
 [ 0.70228425  2.0455738   0.67267578 ...  0.41406869 -1.10454895
  -0.31840916]
 [ 1.83834103  2.33645719  1.98252415 ...  2.28998549  1.91908301
   2.21963528]
 [-1.80840125  1.22179204 -1.81438851 ... -1.74506282 -0.04813821
  -0.75120669]]


#### **Step 3: Data Partitioning**

*Splitting available data for training and testing purposes*

In [0]:

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2)


#### **Step 4: Create Logisitic Regression Architecture using Pytorch library**

*Prepparing traing data for Pytorch*

In [0]:

x_train_torch = torch.from_numpy(x_train).float()
y_train_torch = torch.from_numpy(y_train)

train_data = TensorDataset(x_train_torch, y_train_torch)

train_load = DataLoader(train_data, batch_size= model_specs.batch_size, shuffle= False)


*Prepparing testing data for Pytorch*

In [0]:

x_test_torch = torch.from_numpy(x_test).float()
y_test_torch = torch.from_numpy(y_test)

test_data = TensorDataset(x_test_torch, y_test_torch)

test_load = DataLoader(test_data, batch_size= model_specs.batch_size, shuffle= False)


*Initializing a Logistic Regression model, criterion & optimizer*

In [0]:

no_of_features = x_train_torch.shape[1]

model = lr_model(features= no_of_features, output= 1)

criterion = torch.nn.BCELoss(reduction= "sum")
optimizer = torch.optim.SGD(model.parameters(), lr= model_specs.learn_rate)

print(model)


lr_model(
  (linear): Linear(in_features=30, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


*Training the model*

In [0]:

for epochs in range(model_specs.iterations):
    y_pred = model(x_train_torch)
    loss = criterion(y_pred, y_train_torch.view(-1, 1).float())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


*Predicting output for test data (20% of the whole dataset)*

In [0]:

model_pred = model.forward(x_test_torch) > 0.5

predicted_values = model_pred.numpy().reshape(model_pred.size()[0],)
true_values = y_test_torch.numpy()

print("The confusion matrix for the Logistic Regerssion is ")
print(confusion_matrix(true_values, predicted_values))
print("\nThe classification report is ")
print(classification_report(true_values, predicted_values))
print("\nThe accuracy score is "+str(accuracy_score(true_values, predicted_values)*100)+"%")


The confusion matrix for the Logistic Regerssion is 
[[69  0]
 [ 2 43]]

The classification report is 
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        69
           1       1.00      0.96      0.98        45

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114


The accuracy score is 98.24561403508771%


#### **Step 5: Connect to the workers of the hospitals for training**

*Creating a hook and 2 virtual workers (ECMC & BuffaloGeneral)*

In [0]:

hook = syft.TorchHook(torch)

ECMC = syft.VirtualWorker(hook, id= "ECMC")
BuffaloGeneral = syft.VirtualWorker(hook, id= "BuffaloGeneral")

workers = [ECMC, BuffaloGeneral]

print(ECMC)
print(BuffaloGeneral)


<VirtualWorker id:ECMC #objects:0>
<VirtualWorker id:BuffaloGeneral #objects:0>


#### **Step 6: Send the data to the workers of the hospitals for training**

*Sending every second batch data to the respective workers and thus creating a local worker dataset*

In [0]:

worker_dataset = ([], [])
train_distributed_dataset = []

for each_batch, (data, target) in enumerate(train_load):
  data = data.send(workers[each_batch % len(workers)])
  target = target.send(workers[each_batch % len(workers)])

  worker_dataset[each_batch % len(workers)].append((data, target))


#### **Step 7: Train and test the federated logistic regression model**

*Creating two model instances and optimizers for each worker*

In [0]:

fl_model = lr_model(no_of_features, 1)

ECMC_model = lr_model(no_of_features, 1)
ECMC_optimizer = torch.optim.SGD(ECMC_model.parameters(), lr= model_specs.learn_rate)

BuffaloGeneral_model = lr_model(no_of_features, 1)
BuffaloGeneral_optimizer = torch.optim.SGD(BuffaloGeneral_model.parameters(), lr= model_specs.learn_rate)

worker_models = [ECMC_model, BuffaloGeneral_model]
worker_optimizers = [ECMC_optimizer, BuffaloGeneral_optimizer]

print(worker_models)
print(worker_optimizers)


[lr_model(
  (linear): Linear(in_features=30, out_features=1, bias=True)
  (sigmoid): Sigmoid()
), lr_model(
  (linear): Linear(in_features=30, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)]
[SGD (
Parameter Group 0
    dampening: 0
    lr: 0.001
    momentum: 0
    nesterov: False
    weight_decay: 0
), SGD (
Parameter Group 0
    dampening: 0
    lr: 0.001
    momentum: 0
    nesterov: False
    weight_decay: 0
)]


*Function to train the models of each worker*

In [0]:

def fl_train():
  for each_data in range(len(worker_dataset[0])-1):
    for each_worker in range(len(workers)):
      data, target = worker_dataset[each_worker][each_data]

      worker_models[each_worker].send(data.location)
      worker_optimizers[each_worker].zero_grad()
      worker_pred = worker_models[each_worker](data)
      loss = functional.mse_loss(worker_pred, target.view(-1,1).float(), reduction= "sum")
      loss.backward()
      worker_optimizers[each_worker].step()

      '''
      To see the decrease in loss in models of each model, uncomment the follwing line.

      *Commented to provide visual clarity when printed*
      '''
      # print("Loss at "+str(workers[each_worker].id)+" worker is "+str(loss.get().data))

    for each_model in worker_models:
      each_model.get()

    avg = {"ECMC": worker_models[0], "BuffloGeneral": worker_models[1]}

    return avg


*Function to test the models*

In [0]:

def fl_test(model):
  model.eval()
  loss = 0

  correct_priv = 0
  tot = 0

  for data, target in test_load:
    model_pred = model(data)
    loss = loss + functional.mse_loss(model_pred.view(-1), target, reduction= "sum").item()
    final_pred = model_pred.data.max(1, keepdim= True)[1]

    print("The confusion matrix")
    print(confusion_matrix(target.data, final_pred.data))
    print(classification_report(target.data, final_pred.data))
    print(accuracy_score(target.data, final_pred.data))


#### **Step 8: Print results: Your code should print Accuracy, Precision, Recall and Confusion matrix resulting from federated logistic regression model. Your report should describe the results.**

The models of each workers will be trained & tested over the given number of iterations

In [0]:

for each_iteration in range(model_specs.iterations):
  trained_model_dict = fl_train()

  trained_model = federated_avg(trained_model_dict)

  fl_test(trained_model)


The confusion matrix
[[17  0]
 [13  0]]
              precision    recall  f1-score   support

           0       0.57      1.00      0.72        17
           1       0.00      0.00      0.00        13

    accuracy                           0.57        30
   macro avg       0.28      0.50      0.36        30
weighted avg       0.32      0.57      0.41        30

0.5666666666666667
The confusion matrix
[[17  0]
 [13  0]]
              precision    recall  f1-score   support

           0       0.57      1.00      0.72        17
           1       0.00      0.00      0.00        13

    accuracy                           0.57        30
   macro avg       0.28      0.50      0.36        30
weighted avg       0.32      0.57      0.41        30

0.5666666666666667
The confusion matrix
[[18  0]
 [12  0]]
              precision    recall  f1-score   support

           0       0.60      1.00      0.75        18
           1       0.00      0.00      0.00        12

    accuracy            

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.57      1.00      0.72        17
           1       0.00      0.00      0.00        13

    accuracy                           0.57        30
   macro avg       0.28      0.50      0.36        30
weighted avg       0.32      0.57      0.41        30

0.5666666666666667
The confusion matrix
[[17  0]
 [13  0]]
              precision    recall  f1-score   support

           0       0.57      1.00      0.72        17
           1       0.00      0.00      0.00        13

    accuracy                           0.57        30
   macro avg       0.28      0.50      0.36        30
weighted avg       0.32      0.57      0.41        30

0.5666666666666667
The confusion matrix
[[18  0]
 [12  0]]
              precision    recall  f1-score   support

           0       0.60      1.00      0.75        18
           1       0.00      0.00      0.00        12

    accuracy                           0.60        30
   macro a