## Implementation 

### For COMPASS Datasets

In [84]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv("compas-scores-two-years.csv")

# Keep standard COMPAS filtering (as in literature)
df = df[
    (df['days_b_screening_arrest'] <= 30) &
    (df['days_b_screening_arrest'] >= -30) &
    (df['is_recid'] != -1) &
    (df['c_charge_degree'] != 'O') &
    (df['score_text'] != 'N/A')
]

# Label: recidivism
df["label"] = df["two_year_recid"]

# Sensitive attribute: race (Black vs Non-Black)
df["race"] = (df["race"] == "African-American").astype(int)

# Select features (standard choice)
features = [
    'age',
    'priors_count',
    'juv_fel_count',
    'juv_misd_count',
    'juv_other_count',
    'c_charge_degree'
]

df = df[features + ["label", "race"]]

# One-hot encode categorical features
df = pd.get_dummies(df, drop_first=True)

# Normalize numerical features
scaler = StandardScaler()
num_cols = [
    'age', 'priors_count',
    'juv_fel_count', 'juv_misd_count', 'juv_other_count'
]
df[num_cols] = scaler.fit_transform(df[num_cols])

# Train / Val / Test split
train, temp = train_test_split(df, test_size=0.3, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)

train.to_csv("compas_train.csv", index=False)
val.to_csv("compas_val.csv", index=False)
test.to_csv("compas_test.csv", index=False)

print("COMPAS preprocessing done.")

COMPAS preprocessing done.


### Train RNF for COMPASS

In [85]:
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch import optim
from tqdm import tqdm
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference
from mlp import Net, FC

# Reproducibility
torch.manual_seed(1)
np.random.seed(1)
random.seed(1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Hyperparameters
EPOCHS_FIRST = 9
EPOCHS_SECOND = 4     # RNF head training only
BATCH_SIZE = 64
HIDDEN_DIM = 50

ALPHA = 0.1      # RNF regularization
CE_WEIGHT = 0.4       # anti-collapse supervision
TEMPERATURE = 5.0

# Dataset


class Dataset(data.Dataset):
    def __init__(self, x, y, s):
        self.x = x
        self.y = y
        self.s = s

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x[idx], int(self.y[idx]), int(self.s[idx])


def load_compas(batch_size):
    def load(path):
        df = pd.read_csv(path)
        X = df.drop(columns=["label", "race"]).astype("float32").values
        y = df["label"].astype(int).values
        s = df["race"].astype(int).values
        return Dataset(X, y, s)

    train = data.DataLoader(load("compas_train.csv"), batch_size, shuffle=True)
    val = data.DataLoader(load("compas_val.csv"), batch_size)
    test = data.DataLoader(load("compas_test.csv"), batch_size)
    return train, val, test


# RNF Feature Neutralization
def feature_neutralization(r, p, y, s):
    groups = {(0, 0): [], (0, 1): [], (1, 0): [], (1, 1): []}
    for i in range(len(y)):
        groups[(int(y[i]), int(s[i]))].append((r[i], p[i]))

    def sample(y, s):
        alt = groups[(y, 1-s)]
        return random.choice(alt if len(alt) > 0 else groups[(y, s)])

    reps = []
    for w in [0.5, 0.6, 0.7, 0.8, 0.9]:
        r_new = torch.zeros_like(r)
        for i in range(len(y)):
            r_alt, _ = sample(int(y[i]), int(s[i]))
            r_new[i] = w * r[i] + (1 - w) * r_alt
        reps.append(r_new.to(device))

    p_new = torch.zeros_like(p)
    for i in range(len(y)):
        _, p_alt = sample(int(y[i]), int(s[i]))
        p_new[i] = 0.5 * p[i] + 0.5 * p_alt

    return reps, p_new.to(device)


# Training
def train_epoch(model, head, loader, ce_loss, kd_loss,
                opt_enc, opt_head, epoch):

    model.train()
    head.train()
    softmax = nn.Softmax(dim=1)

    total_loss = 0
    y_true, y_pred = [], []

    for x, y, s in loader:
        x = x.float().to(device)
        y = y.long().to(device)
        s = s.long().to(device)

        logits, r = model(x)

        # Stage 1: standard supervised training
        if epoch < EPOCHS_FIRST:
            logp = F.log_softmax(logits, dim=1)
            loss = ce_loss(logp, y)

            opt_enc.zero_grad()
            loss.backward()
            opt_enc.step()

            preds = logp.argmax(dim=1)

        # Stage 2: RNF (encoder frozen)
        else:
            for p_enc in model.parameters():
                p_enc.requires_grad = False

            with torch.no_grad():
                p = softmax(logits / TEMPERATURE)

            reps, p_neu = feature_neutralization(r, p, y, s)

            pred_neu = softmax(head(reps[0]))
            kd = kd_loss(pred_neu, p_neu)

            ce = ce_loss(F.log_softmax(head(r), dim=1), y)

            reg = 0
            for rep in reps[1:]:
                reg += torch.abs(softmax(head(rep)) - pred_neu).mean()

            loss = kd + CE_WEIGHT * ce + ALPHA * reg

            opt_head.zero_grad()
            loss.backward()
            opt_head.step()

            preds = head(r).argmax(dim=1)

        y_true.extend(y.cpu().tolist())
        y_pred.extend(preds.cpu().tolist())
        total_loss += loss.item()

    acc = np.mean(np.array(y_true) == np.array(y_pred))
    return total_loss / len(loader), acc


# Evaluation
def evaluate(model, head, loader):
    model.eval()
    head.eval()

    y_true, y_pred, s_all = [], [], []

    with torch.no_grad():
        for x, y, s in loader:
            x = x.float().to(device)
            _, r = model(x)
            pred = head(r).argmax(dim=1)

            y_true.extend(y.tolist())
            y_pred.extend(pred.cpu().tolist())
            s_all.extend(s.tolist())

    return y_true, y_pred, s_all


# Main
train_iter, val_iter, test_iter = load_compas(BATCH_SIZE)
INPUT_DIM = next(iter(train_iter))[0].shape[1]

model = Net(INPUT_DIM, HIDDEN_DIM, 2).to(device)
head = FC(HIDDEN_DIM, HIDDEN_DIM, 2).to(device)

opt_enc = optim.Adam(model.parameters(), lr=1e-3)
opt_head = optim.Adam(head.parameters(), lr=1e-3)

ce_loss = nn.NLLLoss()
kd_loss = nn.MSELoss()

print("\nTraining RNF on COMPAS...")
for epoch in range(EPOCHS_FIRST + EPOCHS_SECOND):
    loss, acc = train_epoch(
        model, head, train_iter,
        ce_loss, kd_loss,
        opt_enc, opt_head,
        epoch
    )
    print(f"Epoch {epoch+1}: loss={loss:.4f}, acc={acc*100:.2f}")

# Final evaluation
y_true, y_pred, s = evaluate(model, head, test_iter)

print("\nFinal Results (RNF)")
print("Accuracy:", np.mean(np.array(y_true) == np.array(y_pred)))
print("DP:", demographic_parity_difference(
    y_true, y_pred, sensitive_features=s))
print("EO:", equalized_odds_difference(y_true, y_pred, sensitive_features=s))
print("Positive rate:", np.mean(y_pred))

Using device: cpu

Training RNF on COMPAS...
Epoch 1: loss=0.6582, acc=60.74
Epoch 2: loss=0.6141, acc=67.01
Epoch 3: loss=0.6128, acc=67.29
Epoch 4: loss=0.6068, acc=67.85
Epoch 5: loss=0.6042, acc=68.59
Epoch 6: loss=0.6059, acc=67.87
Epoch 7: loss=0.6037, acc=68.12
Epoch 8: loss=0.6020, acc=68.31
Epoch 9: loss=0.6032, acc=68.10
Epoch 10: loss=0.2768, acc=63.43
Epoch 11: loss=0.2731, acc=65.46
Epoch 12: loss=0.2724, acc=65.88
Epoch 13: loss=0.2714, acc=66.02

Final Results (RNF)
Accuracy: 0.6457883369330454
DP: 0.1677908740809896
EO: 0.22042601749714721
Positive rate: 0.22354211663066956


### Baseline model (Stage 1 of RNF)

In [86]:
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch import optim
from tqdm import tqdm
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference
from mlp import Net

# Reproducibility
torch.manual_seed(1)
np.random.seed(1)
random.seed(1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

 
# Hyperparameters (match RNF stage 1)
EPOCHS = 9
BATCH_SIZE = 64
HIDDEN_DIM = 50
LR = 1e-3

# Dataset
class Dataset(data.Dataset):
    def __init__(self, x, y, s):
        self.x = x
        self.y = y
        self.s = s

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x[idx], int(self.y[idx]), int(self.s[idx])


def load_compas(batch_size):
    def load(path):
        df = pd.read_csv(path)
        X = df.drop(columns=["label", "race"]).astype("float32").values
        y = df["label"].astype(int).values
        s = df["race"].astype(int).values
        return Dataset(X, y, s)

    train = data.DataLoader(load("compas_train.csv"), batch_size, shuffle=True)
    test = data.DataLoader(load("compas_test.csv"), batch_size)
    return train, test


 
# Training
def train_epoch(model, loader, optimizer, loss_fn):
    model.train()
    y_true, y_pred = [], []

    for x, y, _ in loader:
        x = x.float().to(device)
        y = y.long().to(device)

        logits, _ = model(x)
        logp = F.log_softmax(logits, dim=1)
        loss = loss_fn(logp, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        y_true.extend(y.cpu().tolist())
        y_pred.extend(logp.argmax(dim=1).cpu().tolist())

    return np.mean(np.array(y_true) == np.array(y_pred))


# Evaluation
def evaluate(model, loader):
    model.eval()
    y_true, y_pred, s_all = [], [], []

    with torch.no_grad():
        for x, y, s in loader:
            x = x.float().to(device)
            logits, _ = model(x)
            pred = logits.argmax(dim=1)

            y_true.extend(y.tolist())
            y_pred.extend(pred.cpu().tolist())
            s_all.extend(s.tolist())

    return y_true, y_pred, s_all


# Main
train_iter, test_iter = load_compas(BATCH_SIZE)
INPUT_DIM = next(iter(train_iter))[0].shape[1]

model = Net(INPUT_DIM, HIDDEN_DIM, 2).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.NLLLoss()

print("\nTraining baseline MLP...")
for epoch in range(EPOCHS):
    acc = train_epoch(model, train_iter, optimizer, loss_fn)
    print(f"Epoch {epoch+1}: acc={acc*100:.2f}")



y_true, y_pred, s = evaluate(model, test_iter)

print("\nFinal Results (Baseline)")
print("Accuracy:", np.mean(np.array(y_true) == np.array(y_pred)))
print("DP:", demographic_parity_difference(
    y_true, y_pred, sensitive_features=s))
print("EO:", equalized_odds_difference(y_true, y_pred, sensitive_features=s))
print("Positive rate:", np.mean(y_pred))

Using device: cpu

Training baseline MLP...
Epoch 1: acc=61.00
Epoch 2: acc=67.69
Epoch 3: acc=67.50
Epoch 4: acc=67.78
Epoch 5: acc=68.06
Epoch 6: acc=68.19
Epoch 7: acc=67.59
Epoch 8: acc=68.06
Epoch 9: acc=68.50

Final Results (Baseline)
Accuracy: 0.6760259179265659
DP: 0.23107947251721322
EO: 0.2453879802206162
Positive rate: 0.4200863930885529
