In [1]:
from collections import OrderedDict

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split

import flwr as fl
from flwr.simulation import run_simulation
from flwr.client import Client, ClientApp, NumPyClient
from flwr.common import Context
from flwr.server import ServerApp, ServerConfig, ServerAppComponents

from datasets import Dataset
from flwr_datasets.partitioner import DirichletPartitioner

# Torch device constant, pinned to CPU.
# NOTE(review): not referenced in the cells shown here — presumably consumed by the
# model/training cells further down; confirm before changing it.
DEVICE = torch.device('cpu')

  from .autonotebook import tqdm as notebook_tqdm
2024-12-06 11:57:35,108	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## Config

In [2]:
# Central place for the knobs a reader may want to tune.
config = dict(
    # flower config: number of simulated clients (also the number of data partitions)
    num_clients=10,
    # train data: intended held-out fraction for the global train/test split
    train_test_split=.2,
    # client model: mini-batch size used when building the DataLoaders
    batch_size=32,
)

## Load and Preprocess Data

In [3]:
!mkdir '.kaggle'
!mkdir '.kaggle/data'

with open(".kaggle/kaggle.json", 'a+') as f:
    f.write('{"username":"rajaxarcmu","key":"68d40c5e38e1c786ab57736bc5c9b2cb"}')
    
!chmod 600 '.kaggle/kaggle.json'
!kaggle datasets download -d 'danofer/compass'
!unzip -qo compass.zip -d '.kaggle/data'

!ls .kaggle/data

mkdir: cannot create directory ‘.kaggle’: File exists
mkdir: cannot create directory ‘.kaggle/data’: File exists
compass.zip: Skipping, found more recently modified local copy (use --force to force download)
compas-scores-raw.csv	cox-violent-parsed_filt.csv
cox-violent-parsed.csv	propublicaCompassRecividism_data_fairml.csv


In [4]:
# Read the ProPublica COMPAS extract. The archive unpacks into a directory whose
# name itself ends in ".csv", hence the unusual-looking path.
csv_path = '.kaggle/data/propublicaCompassRecividism_data_fairml.csv/propublica_data_for_fairml.csv'
df = pd.read_csv(csv_path)

# Quick sanity check on (rows, columns).
print(df.shape)

(6172, 12)


In [5]:
# Derive a binary `caucasian` flag: 1 when the other race indicator columns sum to
# zero for that row, 0 otherwise.
other_race_columns = ['African_American', 'Asian', 'Hispanic', 'Native_American', 'Other']
# skipna=False keeps missing values propagating exactly like chained `+` would.
df['caucasian'] = df[other_race_columns].sum(axis=1, skipna=False).eq(0).astype(int)

## Data

In [6]:
# Preview the first rows of the frame, including the derived `caucasian` column.
df.head()

Unnamed: 0,Two_yr_Recidivism,Number_of_Priors,score_factor,Age_Above_FourtyFive,Age_Below_TwentyFive,African_American,Asian,Hispanic,Native_American,Other,Female,Misdemeanor,caucasian
0,0,0,0,1,0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,1,0,0,0,0,0,0,0
2,1,4,0,0,1,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,1,0
4,1,14,1,0,0,0,0,0,0,0,0,0,1


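## Configure Data Loaders
- Includes data heterogeneity across clients: the training data is split with a Dirichlet partitioner on the `caucasian` attribute.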

In [7]:
# Global train/test split.
# The test fraction is read from `config` so there is a single source of truth,
# and the split is seeded so the notebook reproduces under Restart & Run All
# (same seed value as the per-client train/validation split).
trainset, testset = train_test_split(
    df,
    test_size=config["train_test_split"],
    random_state=42,
)

# Wrap the training rows in a Hugging Face `Dataset`, the input type the
# Flower partitioner is given below.
ds = Dataset.from_pandas(trainset)

# Flower partitioner: distributes training rows over the clients with a
# Dirichlet-driven skew on the `caucasian` attribute (`alpha` sets the skew;
# see the flwr_datasets documentation).
partitioner = DirichletPartitioner(
    num_partitions=config["num_clients"],
    partition_by="caucasian",
    alpha=0.5,
    # lower bound: a quarter of an even per-client share
    min_partition_size=(len(trainset) // (4 * config["num_clients"])),
    self_balancing=True,
    shuffle=True)

# Materialise every client's partition as a pandas DataFrame.
partitioner.dataset = ds
datasets = [
    partitioner.load_partition(partition_id).to_pandas()
    for partition_id in range(config["num_clients"])
]



In [None]:
# Build one (train, validation) DataLoader pair per client partition.
feature_columns = [
    'Number_of_Priors',
    'score_factor',
    'Age_Above_FourtyFive',
    'Age_Below_TwentyFive',
    'Misdemeanor',
]

train_loaders, val_loaders = [], []

for ds in datasets:
    # Model inputs, binary target and the sensitive attribute of this partition.
    train_x = ds[feature_columns].to_numpy()
    train_y = ds['Two_yr_Recidivism'].to_numpy()
    sensitive_feature = ds['caucasian'].to_numpy()

    # Split all three arrays in a single call so rows stay aligned; 25% of the
    # partition is held out for validation, stratified on the target.
    (train_x, val_x,
     train_y, val_y,
     sensitive_train, sensitive_val) = train_test_split(
        train_x, train_y, sensitive_feature,
        test_size=0.25,
        shuffle=True,
        stratify=train_y,
        random_state=42,
    )

    # Convert every array to a float tensor.
    train_x_tensor, train_y_tensor, sensitive_train_tensor = (
        torch.from_numpy(arr).float() for arr in (train_x, train_y, sensitive_train)
    )
    valid_x_tensor, valid_y_tensor, sensitive_val_tensor = (
        torch.from_numpy(arr).float() for arr in (val_x, val_y, sensitive_val)
    )

    # Each sample is a (features, target, sensitive attribute) triple.
    train_dataset = TensorDataset(train_x_tensor, train_y_tensor, sensitive_train_tensor)
    valid_dataset = TensorDataset(valid_x_tensor, valid_y_tensor, sensitive_val_tensor)

    # Only the training loader reshuffles its samples.
    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    val_loader = DataLoader(valid_dataset, batch_size=config["batch_size"])

    train_loaders.append(train_loader)
    val_loaders.append(val_loader)


In [None]:
# Assemble the DataLoader for the held-out test set (same columns as the client loaders).
test_x = testset[feature_columns].to_numpy()
test_y = testset['Two_yr_Recidivism'].to_numpy()
sensitive_test = testset['caucasian'].to_numpy()

# Convert features, target and sensitive attribute to float tensors.
test_x_tensor, test_y_tensor, sensitive_test_tensor = (
    torch.from_numpy(arr).float() for arr in (test_x, test_y, sensitive_test)
)

# Samples are (features, target, sensitive attribute) triples; no shuffling is requested.
test_dataset = TensorDataset(test_x_tensor, test_y_tensor, sensitive_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=config["batch_size"])

## Client Model Architecture