In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# --- Synthetic loan-approval data ------------------------------------------
# The draws from `rng` happen in a fixed order (gender, income, noise);
# reordering them would change every simulated value.
rng = np.random.default_rng(42)
n = 500

# Binary group code drawn from {0, 1} (upper bound 2 is exclusive).
gender = rng.integers(0, 2, size=n)
is_group0 = gender == 0

# Income is normal with sd 8,000; the group-0 mean is shifted up by 5,000.
income_mean = 50_000 + 5_000 * is_group0
income = rng.normal(income_mean, 8_000, size=n)

# Thresholded income: depends on gender only through the income shift above.
proxy = (income > 52_000).astype(int)

noise = rng.normal(0, 1, size=n)

# Approval score = income term + group-0 bonus + unit-variance noise.
# NOTE(review): with an income sd of 8,000 the income term has sd ~160, so it
# dwarfs both the 0.5 bonus and the noise — confirm the 0.02 scale is intended.
approval_score = 0.02 * (income - 50_000) + 0.5 * is_group0 + noise
approved = (approval_score > 0.3).astype(int)

df = pd.DataFrame(
    {
        "income": income,
        "proxy": proxy,
        "approved": approved,
        "gender": gender,
    }
)

# Model inputs: gender is held out of the features and kept only for the
# per-group evaluation that follows.
feature_cols = ["income", "proxy"]
X, y, g = df[feature_cols], df["approved"], df["gender"]

# A single train_test_split call yields a train/test pair for each of the
# three inputs, i.e. six outputs in total.
(
    X_tr, X_te,
    y_tr, y_te,
    g_tr, g_te,
) = train_test_split(X, y, g, test_size=0.3, random_state=0)

# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
preds = model.predict(X_te)

# Per-group error rates on the held-out set, keyed by gender code:
#   FPR = FP / (FP + TN)   share of true negatives predicted positive
#   FNR = FN / (FN + TP)   share of true positives predicted negative
metrics = {}
for gv in [0, 1]:
    mask = (g_te == gv)
    # labels=[0, 1] pins the matrix to 2x2. Without it, a group whose labels
    # and predictions contain a single class yields a 1x1 matrix and the
    # four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(
        y_te[mask], preds[mask], labels=[0, 1]
    ).ravel()
    negatives = fp + tn
    positives = fn + tp
    metrics[gv] = {
        # A rate is undefined when the group has no samples of that class;
        # report NaN explicitly instead of relying on a NumPy 0/0 warning.
        # float() keeps the printed dict free of NumPy scalar reprs.
        "FPR": float(fp / negatives) if negatives else float("nan"),
        "FNR": float(fn / positives) if positives else float("nan"),
    }

# Pearson correlation between the proxy feature and gender, computed over the
# full data set; the off-diagonal entry of the 2x2 matrix is the coefficient.
corr_matrix = np.corrcoef(df["proxy"], df["gender"])
proxy_corr = corr_matrix[0, 1]

# Report the per-group error rates and the rounded correlation.
print("By-group metrics:", metrics)
print("Proxy correlation (proxy~gender):", round(proxy_corr, 3))



By-group metrics: {0: {'FPR': 0.0, 'FNR': 0.0}, 1: {'FPR': 0.0, 'FNR': 0.0}}
Proxy correlation (proxy~gender): -0.227
