In [None]:
# Cell 1: Install required stable packages (run once)
!pip install -q torch pandas numpy scikit-learn scipy


In [None]:
# Cell 2: Imports, seed, and device
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_selection import mutual_info_classif
from scipy.stats import entropy
from collections import Counter

# Reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
# Cell 3: Load and basic cleaning
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'income'
]

data = pd.read_csv(url, header=None, names=columns, na_values='?', skipinitialspace=True)
print("Original dataset shape:", data.shape)

# Drop missing rows
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
print("After cleaning, dataset shape:", data.shape)

# Define sensitive attribute and target
SENSITIVE_ATTR = 'sex'
TARGET = 'income'

# Binarize
data[SENSITIVE_ATTR] = data[SENSITIVE_ATTR].apply(lambda x: 1 if x == 'Male' else 0)
data[TARGET] = data[TARGET].apply(lambda x: 1 if x == '>50K' else 0)

features = data.drop(columns=[TARGET, SENSITIVE_ATTR])
sensitive_attrs = data[SENSITIVE_ATTR]
labels = data[TARGET]

print("Features shape:", features.shape)
print("Sensitive attribute counts:\n", sensitive_attrs.value_counts())
print("Target counts:\n", labels.value_counts())


Original dataset shape: (32561, 15)
After cleaning, dataset shape: (30162, 15)
Features shape: (30162, 13)
Sensitive attribute counts:
 sex
1    20380
0     9782
Name: count, dtype: int64
Target counts:
 income
0    22654
1     7508
Name: count, dtype: int64


In [None]:
# Cell 4: Split and preprocessing (fit on train only)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np

categorical_features = features.select_dtypes(include=['object']).columns.tolist()
numerical_features = features.select_dtypes(include=np.number).columns.tolist()

print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)

numerical_transformer = StandardScaler()

# âœ… Updated for sklearn >= 1.4
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Split (stratify by label)
X_train, X_test, y_train, y_test, A_train, A_test = train_test_split(
    features, labels, sensitive_attrs, test_size=0.2, random_state=RANDOM_SEED, stratify=labels
)

# Fit preprocessor on train and transform both
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
X_train_processed = pipeline.fit_transform(X_train)
X_test_processed = pipeline.transform(X_test)

print("Processed X_train shape:", X_train_processed.shape)
print("Processed X_test shape:", X_test_processed.shape)


Categorical features: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']
Numerical features: ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Processed X_train shape: (24129, 102)
Processed X_test shape: (6033, 102)


In [None]:
# Cell 5: Torch Dataset and DataLoaders
class AdultDataset(Dataset):
    def __init__(self, features, labels, sensitive_attrs):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels.values, dtype=torch.float32).unsqueeze(1)
        self.sensitive_attrs = torch.tensor(sensitive_attrs.values, dtype=torch.int64)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx], self.sensitive_attrs[idx]

BATCH_SIZE = 256
train_dataset = AdultDataset(X_train_processed, y_train, A_train)
test_dataset = AdultDataset(X_test_processed, y_test, A_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("Train batches:", len(train_loader), "Test batches:", len(test_loader))


Train batches: 95 Test batches: 24


In [None]:
# Cell 6: MLP model
class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)  # output is a logit
        )

    def forward(self, x):
        return self.layers(x)

input_dim = X_train_processed.shape[1]
print("Input dim:", input_dim)
# instantiate example (you will instantiate again before training to ensure fresh weights)
_example_model = MLP(input_dim)
print(_example_model)
del _example_model


Input dim: 102
MLP(
  (layers): Sequential(
    (0): Linear(in_features=102, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=1, bias=True)
  )
)


In [None]:
# Cell 7: PID approximation function + PIDLoss module
def calculate_pid_components(logits, labels, sensitive_attrs):
    """
    Approximate PID-like components using mutual information as a proxy.
    logits, labels, sensitive_attrs are torch tensors (possibly on GPU).
    This function moves them to CPU and returns simple approximations.
    """
    # Safe flatten + detach + cpu -> numpy
    preds = (torch.sigmoid(logits).view(-1) > 0.5).long().detach().cpu().numpy()
    labels_np = labels.view(-1).long().detach().cpu().numpy()
    sensitive_np = sensitive_attrs.view(-1).long().detach().cpu().numpy()

    # mutual information estimates (labels vs sensitive) and (preds vs sensitive)
    # mutual_info_classif expects (n_samples, n_features) for X
    try:
        mi_y_s = float(mutual_info_classif(labels_np.reshape(-1, 1), sensitive_np, discrete_features=True)[0])
        mi_pred_s = float(mutual_info_classif(preds.reshape(-1, 1), sensitive_np, discrete_features=True)[0])
    except Exception:
        mi_y_s = 0.0
        mi_pred_s = 0.0

    # A simple proxy:
    # - 'red_YA' (redundant) ~ mi_y_s
    # - 'unq_A_if_Y' (unique from A about Y_hat given Y) ~ max(mi_pred_s - mi_y_s, 0)
    # - 'syn_YA' left as 0 (not estimated here)
    unq = max(mi_pred_s - mi_y_s, 0.0)

    return {
        'unq_A_if_Y': unq,
        'red_YA': mi_y_s,
        'syn_YA': 0.0
    }

class PIDLoss(nn.Module):
    def __init__(self, component_to_penalize='unq_A_if_Y'):
        super(PIDLoss, self).__init__()
        self.component_to_penalize = component_to_penalize

    def forward(self, logits, labels, sensitive_attrs):
        pid_components = calculate_pid_components(logits, labels, sensitive_attrs)
        loss_value = float(pid_components.get(self.component_to_penalize, 0.0))
        # Return as a CPU scalar tensor (no gradient)
        return torch.tensor(loss_value, dtype=torch.float32, requires_grad=False)


In [None]:
# Cell 8: Training and evaluation helper functions
def train_model(model, train_loader, criterion, pid_regularizer, optimizer, epochs, lam, device):
    model.to(device)
    model.train()
    for epoch in range(epochs):
        total_bce_loss = 0.0
        total_pid_loss = 0.0
        total_samples = 0

        for features, labels, sensitive_attrs in train_loader:
            features = features.to(device)
            labels = labels.to(device)
            sensitive_attrs = sensitive_attrs.to(device)

            optimizer.zero_grad()
            logits = model(features)  # shape (batch, 1)
            bce_loss = criterion(logits, labels)

            # pid_regularizer returns a CPU tensor scalar; move to device for arithmetic
            pid_loss = pid_regularizer(logits, labels, sensitive_attrs).to(device)
            loss = bce_loss + lam * pid_loss

            loss.backward()
            optimizer.step()

            batch_size = features.size(0)
            total_bce_loss += bce_loss.item() * batch_size
            total_pid_loss += pid_loss.item() * batch_size
            total_samples += batch_size

        avg_bce = total_bce_loss / total_samples
        avg_pid = total_pid_loss / total_samples
        print(f"Epoch {epoch+1}/{epochs} | BCE: {avg_bce:.4f} | PID({pid_regularizer.component_to_penalize}): {avg_pid:.6f}")

def evaluate_model(model, dataloader, device):
    model.to(device)
    model.eval()
    all_preds = []
    all_trues = []
    with torch.no_grad():
        for X_batch, y_batch, _ in dataloader:
            X_batch = X_batch.to(device)
            logits = model(X_batch)
            preds = (torch.sigmoid(logits) > 0.5).long().cpu().numpy().flatten()
            all_preds.extend(preds)
            all_trues.extend(y_batch.cpu().numpy().flatten().astype(int))

    acc = accuracy_score(all_trues, all_preds)
    f1 = f1_score(all_trues, all_preds)
    print(f"Eval -> Accuracy: {acc:.4f} | F1: {f1:.4f}")
    return np.array(all_preds), np.array(all_trues)


In [None]:
# Cell 9: Train baseline (lambda=0) and PID-regularized (lambda>0) models
LEARNING_RATE = 1e-3
EPOCHS = 5
LAMBDA = 1.0

criterion = nn.BCEWithLogitsLoss()
pid_regularizer = PIDLoss(component_to_penalize='unq_A_if_Y')

# Baseline model (no PID penalty)
print("\n--- Baseline training (lambda=0) ---")
baseline_model = MLP(input_dim)
baseline_optimizer = optim.Adam(baseline_model.parameters(), lr=LEARNING_RATE)
train_model(baseline_model, train_loader, criterion, pid_regularizer, baseline_optimizer, EPOCHS, 0.0, device)
baseline_preds, baseline_trues = evaluate_model(baseline_model, test_loader, device)

# PID-regularized model
print("\n--- PID-regularized training (lambda=1.0) ---")
pid_model = MLP(input_dim)
pid_optimizer = optim.Adam(pid_model.parameters(), lr=LEARNING_RATE)
train_model(pid_model, train_loader, criterion, pid_regularizer, pid_optimizer, EPOCHS, LAMBDA, device)
pid_preds, pid_trues = evaluate_model(pid_model, test_loader, device)



--- Baseline training (lambda=0) ---
Epoch 1/5 | BCE: 0.4303 | PID(unq_A_if_Y): 0.006571
Epoch 2/5 | BCE: 0.3252 | PID(unq_A_if_Y): 0.005031
Epoch 3/5 | BCE: 0.3199 | PID(unq_A_if_Y): 0.003216
Epoch 4/5 | BCE: 0.3153 | PID(unq_A_if_Y): 0.004446
Epoch 5/5 | BCE: 0.3136 | PID(unq_A_if_Y): 0.003019
Eval -> Accuracy: 0.8497 | F1: 0.6739

--- PID-regularized training (lambda=1.0) ---
Epoch 1/5 | BCE: 0.4280 | PID(unq_A_if_Y): 0.006407
Epoch 2/5 | BCE: 0.3267 | PID(unq_A_if_Y): 0.004461
Epoch 3/5 | BCE: 0.3187 | PID(unq_A_if_Y): 0.003796
Epoch 4/5 | BCE: 0.3160 | PID(unq_A_if_Y): 0.003869
Epoch 5/5 | BCE: 0.3117 | PID(unq_A_if_Y): 0.004073
Eval -> Accuracy: 0.8536 | F1: 0.6790


In [None]:
# Cell 10: Fairness metrics on test set
def get_preds_trues_attrs(model, dataloader, device):
    model.to(device)
    model.eval()
    preds, trues, attrs = [], [], []
    with torch.no_grad():
        for X_batch, y_batch, a_batch in dataloader:
            X_batch = X_batch.to(device)
            logits = model(X_batch)
            batch_preds = (torch.sigmoid(logits) > 0.5).long().cpu().numpy().flatten()
            preds.extend(batch_preds)
            trues.extend(y_batch.cpu().numpy().flatten().astype(int))
            attrs.extend(a_batch.cpu().numpy().flatten().astype(int))
    return np.array(preds), np.array(trues), np.array(attrs)

def demographic_parity_difference(preds, attrs):
    # P(Yhat=1 | A=1) - P(Yhat=1 | A=0)
    if (attrs==1).sum() == 0 or (attrs==0).sum() == 0:
        return 0.0
    p1 = preds[attrs==1].mean()
    p0 = preds[attrs==0].mean()
    return float(p1 - p0)

def equal_opportunity_difference(preds, trues, attrs):
    # TPR difference: TPR(A=1) - TPR(A=0)
    mask_pos = trues == 1
    if mask_pos.sum() == 0:
        return 0.0
    denom1 = ((mask_pos) & (attrs==1)).sum()
    denom0 = ((mask_pos) & (attrs==0)).sum()
    tpr1 = ((preds==1) & (trues==1) & (attrs==1)).sum() / denom1 if denom1 > 0 else 0.0
    tpr0 = ((preds==1) & (trues==1) & (attrs==0)).sum() / denom0 if denom0 > 0 else 0.0
    return float(tpr1 - tpr0)

# Baseline fairness
b_preds, b_trues, b_attrs = get_preds_trues_attrs(baseline_model, test_loader, device)
dp_diff_baseline = demographic_parity_difference(b_preds, b_attrs)
eo_diff_baseline = equal_opportunity_difference(b_preds, b_trues, b_attrs)
print("Baseline fairness:")
print(" Demographic Parity Difference (A=1 - A=0):", dp_diff_baseline)
print(" Equal Opportunity Difference (TPR diff):", eo_diff_baseline)

# PID model fairness
p_preds, p_trues, p_attrs = get_preds_trues_attrs(pid_model, test_loader, device)
dp_diff_pid = demographic_parity_difference(p_preds, p_attrs)
eo_diff_pid = equal_opportunity_difference(p_preds, p_trues, p_attrs)
print("\nPID-regularized model fairness:")
print(" Demographic Parity Difference (A=1 - A=0):", dp_diff_pid)
print(" Equal Opportunity Difference (TPR diff):", eo_diff_pid)


Baseline fairness:
 Demographic Parity Difference (A=1 - A=0): 0.1653189031890319
 Equal Opportunity Difference (TPR diff): 0.033300307309946486

PID-regularized model fairness:
 Demographic Parity Difference (A=1 - A=0): 0.1639718897188972
 Equal Opportunity Difference (TPR diff): 0.05111084988832726
