In [None]:
!pip install flwr[simulation] --quiet

In [37]:
from collections import OrderedDict
from typing import List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader, random_split, TensorDataset
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split

import flwr as fl
from flwr.common import Metrics, Context

In [3]:
# Prefer the GPU when one is present, but fall back to the CPU so the notebook
# still runs end-to-end on machines without CUDA.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# !mkdir /root/.kaggle

# with open("/root/.kaggle/kaggle.json", "w+") as f:
#     f.write('{"username":"rajaxarcmu","key":"68d40c5e38e1c786ab57736bc5c9b2cb"}')
    
# !chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d 'danofer/compass'
!unzip -qo /kaggle/working/compass.zip -d '/kaggle/content'

In [None]:
# Sanity check: list what was extracted into /kaggle/content by the unzip step.
!ls /kaggle/content

In [None]:
# Read the ProPublica COMPAS table from the extracted archive, report its
# dimensions, and preview the first rows.
csv_path = '/kaggle/content/propublicaCompassRecividism_data_fairml.csv/propublica_data_for_fairml.csv'
df = pd.read_csv(csv_path)
print(df.shape)
df.head()

In [93]:
# Number of simulated federated clients; each one represents a siloed organization.
NUM_CLIENTS = 10

In [None]:
# Settings for this cell. SEED makes the splits reproducible across re-runs.
SEED = 42
# Every DataLoader below needs a batch size, but the notebook never defined one.
# NOTE(review): 32 is an assumed value - adjust if a different size was intended.
batch_size = 32

feature_columns = ['Number_of_Priors', 'score_factor','Age_Above_FourtyFive', 'Age_Below_TwentyFive', 'Misdemeanor']
label_column = 'Two_yr_Recidivism'


def _to_loader(features, labels, shuffle=False):
    """Wrap feature/label arrays in a float32 TensorDataset and return its DataLoader."""
    dataset = TensorDataset(
        torch.as_tensor(features, dtype=torch.float32),
        torch.as_tensor(labels, dtype=torch.float32),
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# Hold out 20% of the rows as the global test set.
trainset, testset = train_test_split(df, test_size=0.2, random_state=SEED)

# Trim the training frame to a multiple of NUM_CLIENTS so that every client
# receives exactly `part_size` rows.
part_size = len(trainset) // NUM_CLIENTS
if part_size == 0:
    raise ValueError(
        f"Not enough training rows ({len(trainset)}) to give each of the {NUM_CLIENTS} clients at least one"
    )
lengths = [part_size] * NUM_CLIENTS
trainset = trainset.sample(sum(lengths), random_state=SEED)

df_shuffled = trainset.sample(frac=1, random_state=SEED).reset_index(drop=True)
# Positional slices instead of np.array_split, which on a DataFrame goes through
# the deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas.
datasets = [
    df_shuffled.iloc[start:start + part_size]
    for start in range(0, sum(lengths), part_size)
]

train_loaders = []
val_loaders = []

for ds in datasets:
    # Use this client's own shard. Reading from `trainset` here would hand every
    # client the complete training set and defeat the partitioning.
    client_x = ds[feature_columns].values
    client_y = ds[label_column].values

    # Per-client 75/25 train/validation split, stratified on the label.
    train_x, val_x, train_y, val_y = train_test_split(
        client_x, client_y, test_size=0.25, shuffle=True, stratify=client_y, random_state=SEED
    )

    train_loaders.append(_to_loader(train_x, train_y, shuffle=True))
    val_loaders.append(_to_loader(val_x, val_y))

assert len(train_loaders) == len(val_loaders) == NUM_CLIENTS

# Global held-out test loader shared by the centralized and federated experiments.
test_x = testset[feature_columns].values
test_y = testset[label_column].values
test_loader = _to_loader(test_x, test_y)
test_dataset = test_loader.dataset

In [89]:
class BaselineNN(nn.Module):
    """Small fully-connected binary classifier.

    Three linear layers (in_features -> 16 -> 8 -> 1) with ReLU activations after
    the first two and a sigmoid after the last, so ``forward`` yields one value
    in [0, 1] per input row.

    Args:
        in_features: Number of input features per sample. Defaults to 5, the
            width the network was previously hard-coded to, so ``BaselineNN()``
            builds the same architecture as before.
    """
    def __init__(self, in_features=5):
        super().__init__()
        # Layers are created in the same order as before so that seeded
        # initialisation and state-dict key order are unchanged.
        self.fc1 = nn.Linear(in_features, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)

    def forward(self, x):
        """Map a (batch, in_features) tensor to (batch, 1) sigmoid outputs."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

def train(net, trainloader, epochs, verbose=True):
    """
    Train Network on Training Set

    Optimises `net` in place with Adam (default hyper-parameters) and binary
    cross-entropy for `epochs` passes over `trainloader`.

    Args:
        net: Model whose forward pass returns probabilities of shape (batch, 1).
        trainloader: Iterable of (inputs, labels) batches; labels are reshaped
            to (batch, 1) to match the model output.
        epochs: Number of passes over `trainloader`.
        verbose: When True, print the mean loss and accuracy after each epoch.

    Returns:
        None. A fresh optimizer is created on every call, so optimizer state
        does not carry over between calls.
    """
    # Follow the model's own device rather than a notebook-level global, so the
    # batches can never land on a different device than the weights.
    device = next(net.parameters()).device
    criterion = nn.BCELoss()
    optimizer = optim.Adam(net.parameters())
    net.train()
    for epoch in range(epochs):
        correct, total, epoch_loss = 0, 0, 0.0
        for inputs, labels in trainloader:
            inputs = inputs.to(device)
            labels = labels.to(device).view(-1, 1)  # Reshape labels to match outputs
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # criterion returns the batch mean; scale back to a per-sample sum.
            epoch_loss += loss.item() * inputs.size(0)
            predicted = (outputs >= 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        # Normalise by the samples actually seen; an empty loader reports 0.0
        # instead of raising ZeroDivisionError.
        epoch_loss = epoch_loss / total if total else 0.0
        epoch_acc = correct / total if total else 0.0
        if verbose:
            print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f} - Acc: {epoch_acc:.4f}")

def test(net, testloader):
    """
    Test Network on Test Set
    """
    criterion = nn.BCELoss()
    net.eval()
    correct, total, loss = 0, 0, 0.0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = net(inputs)
            labels = labels.view(-1, 1)  # Reshape labels to match outputs
            loss += criterion(outputs, labels).item() * inputs.size(0)
            predicted = (outputs >= 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    loss /= len(testloader.dataset)
    acc = correct / total
    print(f"Test Loss: {loss:.4f} - Acc: {acc:.4f}")
    return loss, acc

# Centralized Learning

In [97]:
# Fresh, untrained network for the centralized baseline (moved to DEVICE in the next cell).
model = BaselineNN()

In [None]:
# Centralized baseline: pool every client's shard so the model trains and
# validates on all of the data at once. Using only train_loaders[0] would
# measure a single silo, not centralized learning.
pooled_train = torch.utils.data.ConcatDataset([loader.dataset for loader in train_loaders])
pooled_val = torch.utils.data.ConcatDataset([loader.dataset for loader in val_loaders])
train_loader = DataLoader(pooled_train, batch_size=train_loaders[0].batch_size, shuffle=True)
val_loader = DataLoader(pooled_val, batch_size=val_loaders[0].batch_size)
model = model.to(DEVICE)
epochs = 10

# One training epoch per iteration so validation can be reported after each pass.
# Note that train() builds a new optimizer on every call.
for epoch in range(epochs):
    train(model, train_loader, 1, verbose=False)
    loss, acc = test(model, val_loader)
    print(f"\nEpoch {epoch+1}/{epochs} - Loss: {loss:.4f} - Acc: {acc:.4f}")

# Final score on the held-out test set.
loss, acc = test(model, test_loader)
print(f"Final Performance - Loss: {loss:.4f} - Acc: {acc:.4f}")

# Federated Learning with Flower

In [102]:
def get_params(net):
    """Return every state-dict tensor of `net` as a NumPy array, in state-dict order."""
    arrays = []
    for tensor in net.state_dict().values():
        arrays.append(tensor.cpu().numpy())
    return arrays

def set_params(net, params):
    """Load a list of NumPy arrays into `net`, pairing them with state-dict keys by position.

    Args:
        net: Module whose state dict is overwritten.
        params: One array per state-dict entry, in state-dict order (the layout
            produced by ``get_params``).

    Raises:
        ValueError: If the number of arrays differs from the number of
            state-dict entries (zip would otherwise silently drop extras).
    """
    keys = list(net.state_dict().keys())
    if len(params) != len(keys):
        raise ValueError(
            f"Expected {len(keys)} parameter arrays but received {len(params)}"
        )
    # torch.tensor keeps each array's dtype and accepts 0-d arrays (e.g. integer
    # counters in a state dict), whereas torch.Tensor(v) always builds float32.
    state_dict = OrderedDict((k, torch.tensor(v)) for k, v in zip(keys, params))
    net.load_state_dict(state_dict, strict=True)
    
class FlowerClient(fl.client.NumPyClient):
    """Flower client that trains and evaluates one model on one client's data.

    Args:
        net: The local model; its weights are overwritten by the server's
            parameters at the start of every `fit` / `evaluate` call.
        trainloader: DataLoader over this client's training split.
        valloader: DataLoader over this client's validation split.
    """
    def __init__(self, net, trainloader, valloader):
        self.net = net
        self.trainloader = trainloader
        self.valloader = valloader

    def get_parameters(self, config):
        """Return the current local weights as a list of NumPy arrays."""
        return get_params(self.net)

    def fit(self, parameters, config):
        """Load the given weights, train for one local epoch, and return the result.

        Returns:
            (updated weights, number of training examples, empty metrics dict).
        """
        set_params(self.net, parameters)
        train(self.net, self.trainloader, epochs=1)
        # The second element is the number of examples used, which the server
        # uses as the aggregation weight. len(loader) would be the batch count.
        return get_params(self.net), len(self.trainloader.dataset), {}

    def evaluate(self, parameters, config):
        """Load the given weights and evaluate them on the local validation split.

        Returns:
            (loss, number of validation examples, {'accuracy': acc}).
        """
        set_params(self.net, parameters)
        loss, acc = test(self.net, self.valloader)
        # Report examples, not batches, for the same reason as in fit().
        return float(loss), len(self.valloader.dataset), {'accuracy': float(acc)}


In [109]:
def client_fn(cid):
    """Build the Flower client for the partition identified by `cid`.

    Args:
        cid: Client id; converted with ``int`` and used as the index into
            `train_loaders` and `val_loaders`.

    Returns:
        A Flower ``Client`` wrapping a fresh `BaselineNN` on DEVICE together
        with that partition's train and validation loaders.
    """
    partition = int(cid)
    net = BaselineNN().to(DEVICE)
    client = FlowerClient(net, train_loaders[partition], val_loaders[partition])
    # Flower expects client_fn to return a `Client`; handing back the
    # NumPyClient itself is deprecated, so convert it explicitly.
    return client.to_client()

In [None]:
def weighted_average(metrics):
    """Combine per-client accuracies into one accuracy weighted by example count.

    Args:
        metrics: List of ``(num_examples, {'accuracy': value})`` tuples, one
            per client that took part in the evaluation round.

    Returns:
        ``{'accuracy': weighted_mean}``; 0.0 when no examples were reported.
    """
    total_examples = sum(num_examples for num_examples, _ in metrics)
    if total_examples == 0:
        return {'accuracy': 0.0}
    weighted_sum = sum(num_examples * m['accuracy'] for num_examples, m in metrics)
    return {'accuracy': weighted_sum / total_examples}


# Tie the client minimums to NUM_CLIENTS so changing it in one place cannot
# leave the server waiting for clients that will never exist.
strategy = fl.server.strategy.FedAvg(
    fraction_fit=1.0,
    fraction_evaluate=0.5,
    min_fit_clients=NUM_CLIENTS,
    min_evaluate_clients=max(1, NUM_CLIENTS // 2),
    min_available_clients=NUM_CLIENTS,
    # Without this the 'accuracy' returned by each client's evaluate() is dropped.
    evaluate_metrics_aggregation_fn=weighted_average,
)

# Share the GPUs that actually exist across the clients. Reserving 2 whole GPUs
# per client cannot be scheduled on a host with fewer than two and wastes them
# on a model this small. With no GPU the request is 0 and clients run on CPU.
gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
client_resources = {'num_gpus': gpu_count / NUM_CLIENTS, 'num_cpus': 1}

# Keep the returned history so the per-round losses and metrics stay available.
history = fl.simulation.start_simulation(
    client_fn=client_fn,
    num_clients=NUM_CLIENTS,
    config=fl.server.ServerConfig(num_rounds=5),
    strategy=strategy,
    client_resources=client_resources
)
history