In [None]:
# Make sure you're on Python > 3.8
# !pip install -r requirements.txt --quiet

In [None]:
import logging
from collections import OrderedDict
from pathlib import Path

import numpy as np
import pandas as pd
import yaml

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split

import flwr as fl
from flwr.simulation import run_simulation
from flwr.client import Client, ClientApp, NumPyClient
from flwr.common import Context
from flwr.server import ServerApp, ServerConfig, ServerAppComponents

from datasets import Dataset
from flwr_datasets.partitioner import DirichletPartitioner

from flwr_datasets.visualization import plot_label_distributions

In [None]:
# Device targeted by the `.to(DEVICE)` calls in this notebook; fixed to CPU.
DEVICE = torch.device('cpu')

# Config

In [None]:
# Ablation settings. The values listed after each entry are the alternatives
# used across ablation runs.
config = dict(
    name="ablation_0",               # ablation name (also names the YAML/log files)
    sensitive_cat="caucasian",       # sensitive feature column in X
    outcome="Two_yr_Recidivism",     # target y: Two_yr_Recidivism or Age_Below_TwentyFive
    alpha=5,                         # data heterogeneity level: 500, 5, .5
    beta=0.1,                        # weight on the fairness metric: .05, 0.1, .15
    gamma=0,                         # weight on individual vs. group fairness: 0, .25, .5
    num_clients=5,                   # number of federated clients
    local_epochs=2,                  # local training epochs per client
)


In [None]:
# Persist the ablation settings as config/<name>.yaml.
config_path = Path("config") / "{}.yaml".format(config["name"])
# open(..., "w") raises FileNotFoundError when the directory is missing
# (e.g. on a fresh checkout), so create it first.
config_path.parent.mkdir(parents=True, exist_ok=True)
with open(config_path, "w") as file:
    yaml.dump(config, file, default_flow_style=False)


# Logging

In [None]:
# Configure logging: one log file per ablation, ablation_log/<name>.log.
log_path = Path("ablation_log") / "{}.log".format(config["name"])
# The file handler cannot create missing parent directories itself.
log_path.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    filename=str(log_path),   # Log file name
    level=logging.INFO,       # Logging level
    format="%(asctime)s - %(levelname)s - %(message)s",  # Log format
    # Without force=True, basicConfig does nothing once the root logger has a
    # handler, so re-running this cell after changing config["name"] would keep
    # writing to the previous ablation's log file.
    force=True,
)

# Create a logger (the root logger, which basicConfig just configured)
logger = logging.getLogger()

logger.info("init ablation")

## Load Data

In [None]:
# !mkdir '.kaggle'
# !mkdir '.kaggle/data'

# NOTE(review): a real Kaggle username/API key was hardcoded in this cell and has
# been redacted here. Treat that key as leaked and rotate it. Provide credentials
# through your own .kaggle/kaggle.json (or the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables) and never commit them.
# with open(".kaggle/kaggle.json", 'a+') as f:
#     f.write('{"username":"<KAGGLE_USERNAME>","key":"<KAGGLE_KEY>"}')
    
# !chmod 600 '.kaggle/kaggle.json'
# !kaggle datasets download -d 'danofer/compass'
# !unzip -qo compass.zip -d '.kaggle/data'

In [None]:
!ls .kaggle/data

In [None]:
# Load the data here so this cell works on a fresh kernel: previously `df` was
# only created in a later cell, so Restart & Run All failed with a NameError.
df = pd.read_csv('.kaggle/data/propublicaCompassRecividism_data_fairml.csv/propublica_data_for_fairml.csv')
# Binary sensitive attribute: 1 when none of the other race indicator columns is set.
df['caucasian'] = ((df['African_American'] + df['Asian'] + df['Hispanic'] + df['Native_American'] + df['Other']) == 0).astype(int)

# Base rate of the configured outcome. The rate used to be computed from the
# hard-coded 'Age_Below_TwentyFive' column while being logged under the name of
# config["outcome"], so the logged number did not match its label.
outcome = config["outcome"]
baseline_lik = round(df[outcome].sum()/len(df), 3)
logger.info(f"Baseline likelihood of {outcome} {baseline_lik}")

## Visualize Client Data

In [None]:
# Load the COMPAS table and derive the binary sensitive attribute:
# 1 when none of the other race indicator columns is set.
df = pd.read_csv('.kaggle/data/propublicaCompassRecividism_data_fairml.csv/propublica_data_for_fairml.csv')
df['caucasian'] = ((df['African_American'] + df['Asian'] + df['Hispanic'] + df['Native_American'] + df['Other']) == 0).astype(int)

# Base rate of the configured outcome. It was previously computed from the
# hard-coded 'Age_Below_TwentyFive' column, which contradicted the log label.
outcome = config["outcome"]
baseline_lik = round(df[outcome].sum()/len(df), 3)
logger.info(f"Baseline likelihood of {outcome} {baseline_lik}")

# Seed the split so the partitions visualised below are the same on every run.
SPLIT_SEED = 42
trainset, testset = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

ds = Dataset.from_pandas(trainset)

# Lower bound per client: half of an equal share of the training rows.
min_partition_size = (len(trainset) // (2*config["num_clients"]))
logger.info(f"min_partition_size {min_partition_size}")

# Dirichlet split over the sensitive attribute; alpha is the heterogeneity
# level from the config.
partitioner = DirichletPartitioner(
    num_partitions=config["num_clients"],
    partition_by=config["sensitive_cat"],
    alpha=config["alpha"],
    min_partition_size=min_partition_size, 
    self_balancing=True,
    shuffle=True
)

partitioner.dataset = ds

In [None]:
# Plot the per-partition distribution of the attribute the partitioner splits on.
# label_name follows config["sensitive_cat"] (it used to be hard-coded to
# "caucasian") so the figure stays consistent with `partition_by` when the
# configured sensitive attribute changes.
fig, ax, df_viz = plot_label_distributions(
    partitioner,
    label_name=config["sensitive_cat"],
    plot_type="bar",
    size_unit="absolute",
    partition_id_axis="x",
    legend=True,
    verbose_labels=True,
    title="Per Partition Labels Distribution",
)

In [None]:
from custom_flwr.task import load_data

In [None]:
# trainset, testset = train_test_split(df, test_size=0.2)
# batch_size = 32

# ds = Dataset.from_pandas(trainset)

# partitioner = DirichletPartitioner(
#     num_partitions=config["num_clients"],
#     partition_by=config["sensitive_cat"],
#     alpha=config["alpha"],
#     min_partition_size=(len(trainset) // (2 * config["num_clients"])),
#     self_balancing=True,
#     shuffle=True
# )

# partitioner.dataset = ds
# datasets = []
# for i in range(config["num_clients"]):
#     curr_partition = partitioner.load_partition(i)
#     datasets.append(curr_partition.to_pandas())

# train_loaders = []
# val_loaders = []

# feature_columns = ['Number_of_Priors', 'score_factor','Age_Above_FourtyFive', 'Age_Below_TwentyFive', 'Misdemeanor']

# for ds in datasets:
#     train_x = ds[feature_columns].values
#     train_y = ds['Two_yr_Recidivism'].values
#     sensitive_feature = ds['caucasian'].values

#     train_x, val_x, train_y, val_y, sensitive_train, sensitive_val = train_test_split(
#         train_x, train_y, sensitive_feature, test_size=0.25, shuffle=True, stratify=train_y, random_state=42
#     )
    
#     train_x_tensor = torch.from_numpy(train_x).float()
#     train_y_tensor = torch.from_numpy(train_y).float()
#     sensitive_train_tensor = torch.from_numpy(sensitive_train).float()

#     valid_x_tensor = torch.from_numpy(val_x).float()
#     valid_y_tensor = torch.from_numpy(val_y).float()
#     sensitive_val_tensor = torch.from_numpy(sensitive_val).float()

#     # Create TensorDataset and DataLoader, including the sensitive attribute
#     train_dataset = TensorDataset(train_x_tensor, train_y_tensor, sensitive_train_tensor)
#     valid_dataset = TensorDataset(valid_x_tensor, valid_y_tensor, sensitive_val_tensor)

#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     val_loader = DataLoader(valid_dataset, batch_size=batch_size)

#     train_loaders.append(train_loader)
#     val_loaders.append(val_loader)

# # For test data
# test_x = testset[feature_columns].values
# test_y = testset['Two_yr_Recidivism'].values
# sensitive_test = testset['caucasian'].values

# test_x_tensor = torch.from_numpy(test_x).float()
# test_y_tensor = torch.from_numpy(test_y).float()
# sensitive_test_tensor = torch.from_numpy(sensitive_test).float()

# test_dataset = TensorDataset(test_x_tensor, test_y_tensor, sensitive_test_tensor)
# test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Build the data loaders with the project helper custom_flwr.task.load_data
# (its internals are not shown in this notebook). Downstream cells index
# train_loaders / val_loaders by client id and use test_loader for every client.
train_loaders, val_loaders, test_loader = load_data(config=config)

## Isolated Client Model

In [None]:
class BaselineNN(nn.Module):
    """Small fully connected binary classifier with fairness-metric helpers.

    Architecture: 5 input features -> 16 -> 8 -> 1 with ReLU hidden activations
    and a sigmoid output, so ``forward`` returns probabilities of shape
    ``(batch, 1)``.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(5, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)

    def forward(self, x):
        """Return sigmoid probabilities for a batch of 5-feature inputs."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

    @staticmethod
    def _batch_device(net):
        """Return the device holding ``net``'s parameters (CPU if it has none)."""
        first_param = next(net.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    def compute_indfair(self, preds, labels):
        """Average within-class variance of the predictions.

        Predictions are grouped by true label (0 and 1). For each group the mean
        squared deviation from the group mean is computed, and the two values
        are averaged.

        Args:
            preds: Tensor of predictions, or a list of tensors to concatenate.
            labels: Tensor of 0/1 labels, or a list of tensors to concatenate.

        Returns:
            float: the averaged variance; NaN if either label class is absent.
        """
        # Convert lists to tensors if they aren't already
        if isinstance(preds, list):
            preds = torch.cat(preds)
        if isinstance(labels, list):
            labels = torch.cat(labels)

        # Flatten so (N,) and (N, 1) inputs can be mixed without indexing errors.
        preds = preds.reshape(-1)
        labels = labels.reshape(-1)

        per_class_variance = []
        for label_value in (0, 1):
            group_preds = preds[labels == label_value]
            group_mean = group_preds.mean().item()
            per_class_variance.append(((group_preds - group_mean) ** 2).mean().item())

        return (per_class_variance[0] + per_class_variance[1]) / 2

    def compute_eod(self, preds, labels, sensitive_feature):
        """Signed equal-opportunity difference between the two sensitive groups.

        Among samples whose true label is 1, computes the fraction predicted
        positive (probability >= 0.5) for ``sensitive_feature == 0`` minus the
        same fraction for ``sensitive_feature == 1``.

        Args:
            preds: Predicted probabilities, shape ``(N,)`` or ``(N, 1)``.
            labels: True 0/1 labels, shape ``(N,)`` or ``(N, 1)``.
            sensitive_feature: 0/1 group membership, shape ``(N,)`` or ``(N, 1)``.

        Returns:
            float: ``rate(group 0) - rate(group 1)``; NaN if either group has no
            positive-label samples.
        """
        # Flatten every input: combining an (N,) mask with an (N, 1) tensor would
        # broadcast to (N, N) and make the boolean indexing below fail.
        preds_binary = (preds.reshape(-1) >= 0.5).float()
        y_true_mask = labels.reshape(-1) == 1
        group = sensitive_feature.reshape(-1)

        p_a0 = preds_binary[y_true_mask & (group == 0)].mean().item()
        p_a1 = preds_binary[y_true_mask & (group == 1)].mean().item()

        return p_a0 - p_a1

    def model_train(self, net, trainloader, epochs, verbose=True):
        """Train ``net`` on ``trainloader`` with BCE loss and Adam.

        A new Adam optimizer (default hyper-parameters) is created on every
        call, so optimizer state does not carry over between calls.

        Args:
            net: Network to train; batches are moved to the device of its parameters.
            trainloader: Iterable yielding ``(inputs, labels, sensitive_features)``.
            epochs: Number of passes over ``trainloader``.
            verbose: When True, print loss, accuracy and EOD after each epoch.

        Returns:
            None.
        """
        criterion = nn.BCELoss()
        optimizer = optim.Adam(net.parameters())
        # Follow the network's own device instead of a notebook-level global.
        device = self._batch_device(net)
        net.train()
        for epoch in range(epochs):
            correct, total, epoch_loss = 0, 0, 0.0
            all_preds, all_labels, all_sensitives = [], [], []

            for inputs, labels, sensitive_features in trainloader:
                inputs = inputs.to(device)
                # BCELoss needs targets shaped like the (batch, 1) outputs.
                labels = labels.to(device).view(-1, 1)
                sensitive_features = sensitive_features.to(device)

                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # criterion returns the batch mean; weight by batch size so the
                # epoch loss is a per-sample average.
                epoch_loss += loss.item() * inputs.size(0)
                predicted = (outputs >= 0.5).float()
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                # Keep predictions and sensitive data for the epoch-level EOD
                all_preds.append(outputs.detach().cpu())
                all_labels.append(labels.detach().cpu())
                all_sensitives.append(sensitive_features.cpu())

            # Compute EOD at the end of the epoch
            eod = self.compute_eod(
                torch.cat(all_preds), torch.cat(all_labels), torch.cat(all_sensitives)
            )

            epoch_loss /= len(trainloader.dataset)
            epoch_acc = correct / total
            if verbose:
                print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f} - Acc: {epoch_acc:.4f} - EOD: {eod:.4f}")

    def model_test(self, net, testloader, verbose=True):
        """Evaluate ``net`` on ``testloader`` without gradient tracking.

        Args:
            net: Network to evaluate; batches are moved to the device of its parameters.
            testloader: Iterable yielding ``(inputs, labels, sensitive_features)``.
            verbose: When True, print the resulting metrics.

        Returns:
            tuple: ``(loss, acc, eod)`` where loss is the per-sample BCE, acc the
            accuracy at a 0.5 threshold, and eod the value of ``compute_eod``.
        """
        criterion = nn.BCELoss()
        device = self._batch_device(net)
        net.eval()
        correct, total, loss = 0, 0, 0.0
        all_preds, all_labels, all_sensitives = [], [], []

        with torch.no_grad():
            for inputs, labels, sensitive_features in testloader:
                inputs = inputs.to(device)
                labels = labels.to(device).view(-1, 1)
                sensitive_features = sensitive_features.to(device)

                outputs = net(inputs)
                loss += criterion(outputs, labels).item() * inputs.size(0)
                predicted = (outputs >= 0.5).float()
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                # Keep predictions and sensitive data for the EOD computation
                all_preds.append(outputs.detach().cpu())
                all_labels.append(labels.detach().cpu())
                all_sensitives.append(sensitive_features.cpu())

        # Compute EOD at the end of testing
        eod = self.compute_eod(
            torch.cat(all_preds), torch.cat(all_labels), torch.cat(all_sensitives)
        )

        loss /= len(testloader.dataset)
        acc = correct / total
        if verbose:
            print(f"Test Loss: {loss:.4f} - Acc: {acc:.4f} - EOD: {eod:.4f}")
        return loss, acc, eod

# Centralized Learning - Isolated Training

In [None]:
# Instantiate an untrained BaselineNN.
model = BaselineNN()

In [None]:
# Isolated baseline: each client trains its own model on its local partition
# only and is then scored on the shared test set.
num_clients = config["num_clients"]
epochs = 10

# Running sums for the averages
curr_avg = 0
curr_eod = 0

# Worst-case trackers. EOD is signed (group 0 rate minus group 1 rate), so
# "max" is the largest signed value, not the largest magnitude.
max_eod = -1
min_acc = 1

for i in range(num_clients):
    train_loader = train_loaders[i]
    val_loader = val_loaders[i]

    # Start every client from freshly initialised weights. Reusing one model
    # across iterations meant client i was evaluated after also being trained
    # on the data of clients 0..i-1, which is not isolated training.
    model = BaselineNN().to(DEVICE)

    for epoch in range(epochs):
        # NOTE: model_train creates a new Adam optimizer on every call, so
        # optimizer state restarts each epoch here.
        model.model_train(model, train_loader, 1, verbose=False)
        # Validation metrics are computed every epoch but not used further.
        loss, acc, eod = model.model_test(model, val_loader, verbose=False)

    loss, acc, eod = model.model_test(model, test_loader, verbose=False)

    print(f"Client {i} - Test Loss: {loss:.4f} - Acc: {acc:.4f} - EOD: {eod:.4f}")

    curr_eod += eod
    curr_avg += acc

    # A NaN EOD (a group without positive samples) never replaces max_eod,
    # but it does turn the average below into NaN.
    if eod > max_eod:
        max_eod = eod
    if acc < min_acc:
        min_acc = acc

print("---")
print(f"Average EOD: {curr_eod / num_clients:.4f}")
print(f"Average Accuracy: {curr_avg / num_clients:.4f}")

print("---")

print(f"Max EOD: {max_eod:.4f}")
print(f"Min Accuracy: {min_acc:.4f}")

In [None]:
# for i in range(config["num_clients"]):
#     train_loader = train_loaders[i]
#     val_loader = val_loaders[i]
#     model = model.to(DEVICE)
#     epochs = 10

#     for epoch in range(epochs):
#         model.model_train(model, train_loader, 1, verbose=False)
#         loss, acc, eod = model.model_test(model, val_loader, verbose=False)

#     loss, acc, eod = model.model_test(model, test_loader, verbose=False)
#     print(f"Client {i} - Test Loss: {loss:.4f} - Acc: {acc:.4f} - EOD: {eod:.4f}")

# Federated Learning with Flower

In [None]:
import flwr as fl
from flwr.simulation import run_simulation
from flwr.client import Client, ClientApp, NumPyClient
from flwr.common import Context
from flwr.server import ServerApp, ServerConfig, ServerAppComponents

from collections import OrderedDict
import numpy as np
import pandas as pd
import torch

from custom_flwr.server_app import server_fn as server_fn_custom
from custom_flwr.client_app import client_fn as client_fn_custom

# Re-declared for the federated section; same value as the DEVICE set near the
# top of the notebook.
DEVICE = torch.device('cpu')

def server_fn(context: Context):
    """Replace the run config with the notebook's settings, then delegate.

    ``context.run_config`` is overwritten (not merged) before the context is
    handed to the project's ``server_fn`` (imported as ``server_fn_custom``),
    whose return value is returned unchanged.
    """
    # Fixed simulation settings
    run_settings = {
        'num-server-rounds' : 2,
        'fraction-fit': 1,
        'fraction-evaluate': 1,
        # 'local-epochs': 2,
        'server-device': str(DEVICE),
        'use-wandb': False,
    }
    # Ablation parameters taken from the notebook-level config; the whole
    # config dict is forwarded as well.
    run_settings.update(beta=config["beta"], gamma=config["gamma"], config=config)

    context.run_config = run_settings
    return server_fn_custom(context)

def client_fn(context: Context):
    """Delegate to the project's ``client_fn`` (imported as ``client_fn_custom``).

    The context is passed through unmodified and the result is returned as-is.
    """
    return client_fn_custom(context)



In [None]:
# Wrap the notebook's client_fn in a Flower ClientApp for the simulation.
client = ClientApp(client_fn=client_fn)

In [None]:
# Wrap the notebook's server_fn in a Flower ServerApp for the simulation.
server = ServerApp(server_fn=server_fn)

In [None]:
# No explicit per-client resource request is passed to the simulation backend.
backend_config = {"client_resources": None}

# One simulated node per configured client. A stray `s` after the
# num_supernodes argument used to make this cell a SyntaxError.
run_simulation(
    server_app=server,
    client_app=client,
    num_supernodes=config["num_clients"],
    backend_config=backend_config,
)