## Bias Mitigation using AIF360 - Heart Failure Prediction Dataset (Source: https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction/data)
Model Training and Evaluation

In [16]:
#load preprocessed data 
import pandas as pd
train_df = pd.read_csv("./data_subsets/train_25M_75F.csv")

X_test = pd.read_csv("./data_splits/X_test.csv")
y_test = pd.read_csv("./data_splits/y_test.csv")

#check out the data
train_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,33,0,3,100.0,246.0,0,0,150.0,1,1.0,1,1
1,48,0,1,120.0,284.0,0,0,120.0,0,0.0,2,0
2,49,0,3,130.0,269.0,0,0,163.0,0,0.0,2,0
3,62,0,3,140.0,268.0,0,2,160.0,0,3.6,0,1
4,38,0,3,105.0,236.0,1,0,166.0,0,2.8,2,1


In [17]:
TARGET = "HeartDisease"
SENSITIVE = "Sex"   # 1 = Male, 0 = Female

categorical_cols = ['Sex','ChestPainType','FastingBS','RestingECG','ExerciseAngina','ST_Slope']
continuous_cols  = ['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']

In [18]:
# Split train into X / y and keep sensitive feature for fairness evaluation
X_train = train_df.drop(columns=[TARGET])
y_train = train_df[TARGET]

In [19]:
# scale numeric features only, fit on train, transform test
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_num_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[continuous_cols]),
    columns=continuous_cols, index=X_train.index
)
X_test_num_scaled = pd.DataFrame(
    scaler.transform(X_test[continuous_cols]),
    columns=continuous_cols, index=X_test.index
)

In [20]:
#one-hot encode categoricals; numeric are kept as is 
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown="ignore", drop="if_binary", sparse_output=False)
ohe.fit(X_train[categorical_cols])

X_train_cat = pd.DataFrame(
    ohe.transform(X_train[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=X_train.index
)
X_test_cat = pd.DataFrame(
    ohe.transform(X_test[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=X_test.index
)

In [21]:
# Assemble final matrices
X_train_ready = pd.concat([X_train_cat, X_train_num_scaled], axis=1)
X_test_ready  = pd.concat([X_test_cat,  X_test_num_scaled],  axis=1)

print("Final feature shapes:", X_train_ready.shape, X_test_ready.shape)

Final feature shapes: (600, 18) (184, 18)


In [22]:
# create sensitive attribute arrays - after creating X_train_ready and X_test_ready
A_train = X_train["Sex"].astype(int).to_numpy().ravel()  # 1=Male, 0=Female
A_test  = X_test["Sex"].astype(int).to_numpy().ravel()

### Traditional ML Models - Baseline: K-Nearest Neighbors (KNN) & Decision Tree (DT)

In [23]:
#import required libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

#define a function 
def evaluate_model(y_true, y_pred, model_name):
    print(f"=== {model_name} Evaluation ===")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, average='binary'))
    print("Recall   :", recall_score(y_true, y_pred, average='binary'))
    print("F1 Score :", f1_score(y_true, y_pred, average='binary'))
    print("\nClassification Report:\n", classification_report(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\n" + "="*40 + "\n")

### PCA-KNN

In [24]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import numpy as np

#1) PCA + KNN pipeline (on one-hot encoded + scaled features)
pca_knn = Pipeline([
    ('pca', PCA(n_components=0.95, random_state=42)),  # keep 95% variance
    ('knn', KNeighborsClassifier(
        n_neighbors=15, metric='manhattan', weights='distance'
    ))
])

pca_knn.fit(X_train_ready, y_train)

# Inspect PCA details
n_comp = pca_knn.named_steps['pca'].n_components_
expl_var = pca_knn.named_steps['pca'].explained_variance_ratio_.sum()
print(f"PCA components: {n_comp} | Explained variance retained: {expl_var:.3f}")

# 2) Evaluate 
y_pred_pca_knn = pca_knn.predict(X_test_ready)
probs_pca_knn = pca_knn.predict_proba(X_test_ready)[:, 1]
  
evaluate_model(y_test, y_pred_pca_knn, "KNN (best params)")

PCA components: 11 | Explained variance retained: 0.952
=== KNN (best params) Evaluation ===
Accuracy : 0.8315217391304348
Precision: 0.9080459770114943
Recall   : 0.7745098039215687
F1 Score : 0.8359788359788359

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.90      0.83        82
           1       0.91      0.77      0.84       102

    accuracy                           0.83       184
   macro avg       0.84      0.84      0.83       184
weighted avg       0.84      0.83      0.83       184

Confusion Matrix:
 [[74  8]
 [23 79]]




### Alternative Tuned & Pruned DT (Recall-focused)

In [25]:
# Alternative DT tuning focused on higher recall
# Changes vs previous:
#  - Remove calibration (predict uses raw tree probs at 0.5)
#  - Tune class_weight (heavier positive weights allowed)
#  - Broaden depth a bit but keep regularization via min_samples_* and tiny impurity decrease
#  - Prune only with very small ccp_alphas to avoid killing recall

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import numpy as np

# Simpler-but-expressive trees + tuned class weights
base_dt = DecisionTreeClassifier(random_state=42)

param_grid_simple = {
    "criterion": ["gini", "entropy"],                  # add "log_loss" if your sklearn supports it
    "max_depth": [4, 5, 6, 7, 8, 9, 10],               # a bit deeper to help recall
    "min_samples_split": [5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 6],
    "min_impurity_decrease": [0.0, 1e-4, 1e-3],
    "class_weight": ["balanced", {0:1,1:2}, {0:1,1:3}, {0:1,1:4}],  # stronger push toward positives
}

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)

grid_simple = GridSearchCV(
    estimator=base_dt,
    param_grid=param_grid_simple,
    cv=cv,
    scoring="recall",      # prioritize sensitivity for class 1
    n_jobs=-1,
    verbose=0,
    refit=True
)
grid_simple.fit(X_train_ready, y_train)

best_params = grid_simple.best_params_
print("Stage A — Best DT params:", best_params)
print("Stage A — Best CV Recall:", round(grid_simple.best_score_, 4))

# Train a zero-pruned model with best params to get the pruning path
dt0 = DecisionTreeClassifier(random_state=42, **best_params, ccp_alpha=0.0).fit(X_train_ready, y_train)


# Stage B — Gentle cost-complexity pruning (favor small alphas)
path = dt0.cost_complexity_pruning_path(X_train_ready, y_train)
ccp_alphas = path.ccp_alphas

# Focus on tiny alphas only + 0.0 to avoid big recall loss
small_slice = ccp_alphas[: min(30, len(ccp_alphas))]  # first 30 values are typically the smallest
candidate_alphas = np.unique(np.r_[0.0, small_slice])

cv_scores = []
for alpha in candidate_alphas:
    dt_alpha = DecisionTreeClassifier(random_state=42, **best_params, ccp_alpha=alpha)
    rec = cross_val_score(dt_alpha, X_train_ready, y_train, cv=cv, scoring="recall", n_jobs=-1).mean()
    cv_scores.append((alpha, rec))

best_alpha, best_cv_recall = max(cv_scores, key=lambda x: x[1])
print(f"Stage B — Best ccp_alpha: {best_alpha:.6f} | CV Recall: {best_cv_recall:.4f}")

alt_best_dt = DecisionTreeClassifier(random_state=42, **best_params, ccp_alpha=best_alpha).fit(X_train_ready, y_train)


# Evaluation
y_pred = alt_best_dt.predict(X_test_ready)               
y_prob = alt_best_dt.predict_proba(X_test_ready)[:, 1]   

evaluate_model(y_test, y_pred, "Alternative Tuned & Pruned Decision Tree")

Stage A — Best DT params: {'class_weight': {0: 1, 1: 4}, 'criterion': 'entropy', 'max_depth': 4, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 5}
Stage A — Best CV Recall: 0.9733
Stage B — Best ccp_alpha: 0.000000 | CV Recall: 0.9733
=== Alternative Tuned & Pruned Decision Tree Evaluation ===
Accuracy : 0.8260869565217391
Precision: 0.7966101694915254
Recall   : 0.9215686274509803
F1 Score : 0.8545454545454545

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.71      0.78        82
           1       0.80      0.92      0.85       102

    accuracy                           0.83       184
   macro avg       0.84      0.81      0.82       184
weighted avg       0.83      0.83      0.82       184

Confusion Matrix:
 [[58 24]
 [ 8 94]]




### Ensemble Model - Tuned Random Forest (RF)

In [26]:
# Random Forest: hyperparameter tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV


# 1) GridSearchCV over impactful RF params
rf = RandomForestClassifier(random_state=42)

param_grid = {
    "n_estimators": [200, 400, 600],
    "max_depth": [None, 8, 12, 16],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", 0.8],  # 0.8 = 80% of features
    "class_weight": [None, "balanced"]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=cv,
    scoring="recall",          
    n_jobs=-1,
    verbose=1,
    refit=True
)

grid.fit(X_train_ready, y_train)
best_rf = grid.best_estimator_
print("Best RF params:", grid.best_params_)
print("Best CV F1:", grid.best_score_)

# 2) Evaluate best RF 
y_pred = best_rf.predict(X_test_ready)
y_prob = best_rf.predict_proba(X_test_ready)[:, 1]
evaluate_model(y_test, y_pred, "Tuned Random Forest")

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Best RF params: {'class_weight': None, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Best CV F1: 0.9566666666666667
=== Tuned Random Forest Evaluation ===
Accuracy : 0.842391304347826
Precision: 0.9010989010989011
Recall   : 0.803921568627451
F1 Score : 0.8497409326424871

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.89      0.83        82
           1       0.90      0.80      0.85       102

    accuracy                           0.84       184
   macro avg       0.84      0.85      0.84       184
weighted avg       0.85      0.84      0.84       184

Confusion Matrix:
 [[73  9]
 [20 82]]




### Deep Learning - Multi-layer Perceptron

In [27]:
#import required library 
from sklearn.neural_network import MLPClassifier

### Adam MLP + Early Stopping

In [28]:
#Adam + Early Stopping 
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

adammlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),   # slightly smaller/deeper can help
    activation='relu',
    solver='adam',
    learning_rate_init=1e-3,       # smaller step can stabilize
    alpha=1e-3,                    # L2 regularization to reduce overfitting
    batch_size=32,
    max_iter=1000,                 # increased max_iter
    early_stopping=True,           # use a validation split internally
    validation_fraction=0.15,
    n_iter_no_change=25,          
    tol=1e-4,
    random_state=42
)

adammlp.fit(X_train_ready, y_train)  
y_pred_mlp = adammlp.predict(X_test_ready)                     
y_prob_mlp = adammlp.predict_proba(X_test_ready)[:, 1]         

evaluate_model(y_test, y_pred_mlp, "(Adam + EarlyStopping)")

=== (Adam + EarlyStopping) Evaluation ===
Accuracy : 0.8206521739130435
Precision: 0.8709677419354839
Recall   : 0.7941176470588235
F1 Score : 0.8307692307692308

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.85      0.81        82
           1       0.87      0.79      0.83       102

    accuracy                           0.82       184
   macro avg       0.82      0.82      0.82       184
weighted avg       0.83      0.82      0.82       184

Confusion Matrix:
 [[70 12]
 [21 81]]




### Bias Mitigation AIF360

In [29]:
# Setup: install AIF360
# Uncomment the next line if running locally for the first time
!pip install aif360




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\patri\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [30]:
import aif360
print("AIF360 version:", aif360.__version__)

AIF360 version: 0.6.1


In [31]:
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
from sklearn.base import clone
from IPython.display import display 

from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing import Reweighing
from aif360.algorithms.postprocessing import EqOddsPostprocessing
from aif360.metrics import ClassificationMetric

# Config 
protected_attr = "Sex"  # 1=Male, 0=Female
PRIV_VALUE = 1          # privileged = Male
label_name = "label"
favorable_label, unfavorable_label = 1, 0
privileged_groups   = [{protected_attr: PRIV_VALUE}]
unprivileged_groups = [{protected_attr: 1 - PRIV_VALUE}]

# Ensure 1-D ints for targets
y_train = np.asarray(y_train).astype(int).ravel()
y_test  = np.asarray(y_test).astype(int).ravel()

# Sensitive attribute arrays
A_train = X_train["Sex"].astype(int).to_numpy().ravel()
A_test  = X_test["Sex"].astype(int).to_numpy().ravel()

def _to_bld(y, A):
    y = (y.values if hasattr(y,'values') else np.asarray(y)).ravel()
    A = (A.values if hasattr(A,'values') else np.asarray(A)).ravel()
    df = pd.DataFrame({"dummy": np.zeros(len(y)), label_name: y, protected_attr: A})
    return BinaryLabelDataset(df=df,
                              label_names=[label_name],
                              protected_attribute_names=[protected_attr],
                              favorable_label=favorable_label,
                              unfavorable_label=unfavorable_label)

def fair_metrics(y_true, y_pred, A, y_scores=None, absolute=True):
    t = _to_bld(y_true, A)
    p = _to_bld(y_pred, A)
    if y_scores is not None:
        p.scores = np.asarray(y_scores).reshape(-1,1)
    cm = ClassificationMetric(t, p,
        privileged_groups=privileged_groups,
        unprivileged_groups=unprivileged_groups)
    dp = cm.statistical_parity_difference()
    eo = cm.equal_opportunity_difference()
    return (abs(dp), abs(eo)) if absolute else (dp, eo)

def get_scores(model, X):
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:,1]
    if hasattr(model, "decision_function"):
        z = model.decision_function(X)
        return (z - z.min())/(z.max()-z.min()+1e-12)
    return model.predict(X).astype(float)

def selection_rate(y_pred, positive=1):
    y_pred = np.asarray(y_pred).ravel()
    return np.mean(y_pred == positive)

def per_group_table(y_true, y_pred, A, positive=1, group_name="Sex"):
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    A = np.asarray(A).ravel()
    rows = []
    for g in np.unique(A):
        idx = (A == g)
        yt, yp = y_true[idx], y_pred[idx]
        # tn, fp, fn, tp with fixed label order [0,1]
        tn, fp, fn, tp = confusion_matrix(yt, yp, labels=[0,1]).ravel()
        tpr = tp / (tp + fn) if (tp + fn) else 0.0
        fpr = fp / (fp + tn) if (fp + tn) else 0.0
        rec = recall_score(yt, yp, pos_label=positive)   # equals TPR for binary
        sr  = selection_rate(yp, positive=positive)
        acc = accuracy_score(yt, yp)
        rows.append({group_name: g, "TPR": tpr, "FPR": fpr,
                     "Recall": rec, "SelectionRate": sr, "Accuracy": acc})
    return pd.DataFrame(rows).set_index(group_name)

def aif_diffs(y_true, y_pred, A, *, abs_vals=True):
    t = _to_bld(y_true, A)
    p = _to_bld(y_pred, A)
    cm = ClassificationMetric(t, p,
        privileged_groups=privileged_groups,
        unprivileged_groups=unprivileged_groups)
    dp = cm.statistical_parity_difference()
    eo = cm.average_odds_difference()   # Equalized odds gap (avg of TPR/FPR diffs)
    if abs_vals:
        dp, eo = abs(dp), abs(eo)
    return dp, eo

def print_row(title, acc, dp, eo, note=""):
    print(f"{title:>24s} | Acc {acc:.4f} | DP {dp:.4f} | EO {eo:.4f} {('|' if note else '')} {note}")

#to print a model cleanly
def report_model(name, y_true, y_pred, A, scores=None, note=""):
    acc = accuracy_score(y_true, y_pred)
    dp, eo = fair_metrics(y_true, y_pred, A, y_scores=scores, absolute=True)
    tbl = per_group_table(y_true, y_pred, A, positive=favorable_label, group_name="Sex").round(6)
    
    print(f"\n=== {name} ===")
    display(tbl)
    print(f"Overall -> Accuracy: {acc:.4f} | DP diff: {dp:.4f} | EO diff: {eo:.4f}"
          + (f" | {note}" if note else ""))
    
    return {"Model": name, "Accuracy": acc, "DP diff": dp, "EO diff": eo}


# Pre: compute reweighing weights ONCE on TRAIN
_bld_train = BinaryLabelDataset(
    df=pd.concat([pd.DataFrame(X_train_ready),
                  pd.Series(y_train, name=label_name),
                  pd.Series(A_train, name=protected_attr)], axis=1),
    label_names=[label_name],
    protected_attribute_names=[protected_attr],
    favorable_label=favorable_label,
    unfavorable_label=unfavorable_label
)
_rw = Reweighing(unprivileged_groups=unprivileged_groups,
                 privileged_groups=privileged_groups).fit(_bld_train)
_rw_weights = _rw.transform(_bld_train).instance_weights.ravel()

# Turn weights into a resampled training set
def resample_by_weights(X, y, A, weights, n_samples=None, random_state=42):
    rng = np.random.default_rng(random_state)
    Xn = np.asarray(X); yn = np.asarray(y).ravel(); An = np.asarray(A).ravel()
    w = np.clip(np.asarray(weights, dtype=float), 1e-12, None)
    p = w / w.sum()
    n = n_samples or len(yn)
    idx = rng.choice(len(yn), size=n, replace=True, p=p)
    return Xn[idx], yn[idx], An[idx]

Xrw, yrw, Arw = resample_by_weights(X_train_ready, y_train, A_train, _rw_weights,
                                    n_samples=len(y_train), random_state=42)

# Post: make a small TRAIN-based calibration split (no test leakage)
trn_X, cal_X, trn_y, cal_y, trn_A, cal_A = train_test_split(
    X_train_ready, y_train, A_train, test_size=0.12, stratify=y_train, random_state=42
)

#Make types consistent to avoid the PCA warning 
X_test_np = np.asarray(X_test_ready)
trn_X_np  = np.asarray(trn_X)
cal_X_np  = np.asarray(cal_X)

  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


In [32]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

In [33]:
# Baseline
knn_base = clone(pca_knn).fit(trn_X, trn_y)
yhat_base   = knn_base.predict(X_test_np)
scores_base = get_scores(knn_base, X_test_np)
summ_base = report_model("KNN baseline", y_test, yhat_base, A_test, scores=scores_base)

# Pre (Reweighing)
knn_pre     = clone(pca_knn).fit(Xrw, yrw)
yhat_pre    = knn_pre.predict(X_test_np)
scores_pre  = get_scores(knn_pre, X_test_np)
summ_pre = report_model("KNN pre: Reweigh", y_test, yhat_pre, A_test, scores=scores_pre,
                        note="resampled by AIF360 weights")

# Post (Equalized Odds)
cal_scores  = get_scores(knn_base, cal_X_np)
post = EqOddsPostprocessing(privileged_groups=privileged_groups,
                            unprivileged_groups=unprivileged_groups)
post.fit(_to_bld(cal_y, cal_A),
         _to_bld((cal_scores >= 0.5).astype(int), cal_A))

pred_post_bld = post.predict(_to_bld((scores_base >= 0.5).astype(int), A_test))
yhat_post     = pred_post_bld.labels.ravel().astype(int)

summ_post = report_model("KNN post: EqOdds", y_test, yhat_post, A_test, scores=scores_base,
                         note="calibrated on held-out TRAIN")


=== KNN baseline ===


Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.666667,0.0625,0.666667,0.157895,0.894737
1,0.78125,0.16,0.78125,0.568493,0.80137


Overall -> Accuracy: 0.8207 | DP diff: 0.4106 | EO diff: 0.1146

=== KNN pre: Reweigh ===


Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.833333,0.125,0.833333,0.236842,0.868421
1,0.8125,0.18,0.8125,0.59589,0.815068


Overall -> Accuracy: 0.8261 | DP diff: 0.3590 | EO diff: 0.0208 | resampled by AIF360 weights

=== KNN post: EqOdds ===


Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.666667,0.21875,0.666667,0.289474,0.763158
1,0.78125,0.16,0.78125,0.568493,0.80137


Overall -> Accuracy: 0.7935 | DP diff: 0.2790 | EO diff: 0.1146 | calibrated on held-out TRAIN


# KNN + AIF360 

## Results overview
| Variant             | Accuracy | DP diff | EO diff (TPR gap) | DP+EO |
|---------------------|---------:|--------:|-------------------:|------:|
| Baseline            | 0.8207   | 0.4106  | 0.1146             | 0.5252 |
| Pre: Reweigh        | 0.8261   | 0.3590  | 0.0208             | **0.3798** |
| Post: EqualizedOdds | 0.7935   | **0.2790** | 0.1146         | 0.3936 |

---

## Per-group behavior (Female → 0, Male → 1)

### Baseline
- **Selection rate:** 0: **0.158**, 1: **0.568** → large selection gap (DP 0.41).
- **TPR (Recall):** 0: **0.667**, 1: **0.781** → men recalled better (EO 0.115).
- **FPR:** 0: **0.063**, 1: **0.160**.
- **Note:** Lowest fairness overall; accuracy mid-range.

### Pre-processing: Reweigh
- **Selection rate:** 0: **0.237**, 1: **0.596** → DP improves to **0.359**.
- **TPR (Recall):** 0: **0.833**, 1: **0.813** → **near-parity recall** (EO **0.021**, best).
- **FPR:** 0: **0.125** (↑), 1: **0.180** (↑).
- **Note:** Slightly **higher accuracy** than baseline and **best combined fairness (DP+EO)**.

### Post-processing: Equalized Odds
- **Selection rate:** 0: **0.289**, 1: **0.568** → **best DP** (**0.279**) by lifting female selection.
- **TPR (Recall):** 0: **0.667**, 1: **0.781** → recall gap returns to baseline (EO **0.115**).
- **FPR:** 0: **0.219** (↑ sharply), 1: **0.160** (≈).
- **Note:** **Lowest accuracy**; DP improves but driven by a big rise in female false positives.

---

## Implications
- **Most fair overall:** **Pre: Reweigh** (smallest DP+EO with accuracy ≈ baseline).
- **Best DP alone:** **Post: EqOdds**, but it reduces accuracy and increases female FPR markedly.
- **Baseline:** retains the largest selection disparity and a moderate TPR gap.

---

In [34]:
#Decision Tree (DT)
# Baseline
dt_base = clone(alt_best_dt).fit(trn_X_np, trn_y)
yhat_dt = dt_base.predict(X_test_np)
scores_dt = get_scores(dt_base, X_test_np)
_ = report_model("DT baseline", y_test, yhat_dt, A_test, scores=scores_dt)

# Pre (Reweighing)
dt_pre = clone(alt_best_dt).fit(Xrw, yrw)
yhat_dt_pre = dt_pre.predict(X_test_np)
scores_dt_pre = get_scores(dt_pre, X_test_np)
_ = report_model("DT pre: Reweigh", y_test, yhat_dt_pre, A_test, scores=scores_dt_pre,
                 note="resampled by AIF360 weights")

# Post (Equalized Odds)
cal_scores_dt = get_scores(dt_base, cal_X_np)
post_dt = EqOddsPostprocessing(privileged_groups=privileged_groups,
                               unprivileged_groups=unprivileged_groups)
post_dt.fit(_to_bld(cal_y, cal_A),
            _to_bld((cal_scores_dt >= 0.5).astype(int), cal_A))
yhat_dt_post = post_dt.predict(_to_bld((scores_dt >= 0.5).astype(int), A_test)).labels.ravel().astype(int)
_ = report_model("DT post: EqOdds", y_test, yhat_dt_post, A_test, scores=scores_dt,
                 note="calibrated on held-out TRAIN")


=== DT baseline ===


Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.833333,0.28125,0.833333,0.368421,0.736842
1,0.9375,0.28,0.9375,0.712329,0.863014


Overall -> Accuracy: 0.8370 | DP diff: 0.3439 | EO diff: 0.1042

=== DT pre: Reweigh ===


Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.666667,0.15625,0.666667,0.236842,0.815789
1,0.927083,0.26,0.927083,0.69863,0.863014


Overall -> Accuracy: 0.8533 | DP diff: 0.4618 | EO diff: 0.2604 | resampled by AIF360 weights

=== DT post: EqOdds ===


Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.833333,0.3125,0.833333,0.394737,0.710526
1,0.9375,0.28,0.9375,0.712329,0.863014


Overall -> Accuracy: 0.8315 | DP diff: 0.3176 | EO diff: 0.1042 | calibrated on held-out TRAIN


# DT + AIF360 

## Results overview
| Variant             | Accuracy | DP diff | EO diff (TPR gap) | DP+EO |
|---------------------|---------:|--------:|-------------------:|------:|
| Baseline            | 0.8370   | 0.3439  | 0.1042             | 0.4481 |
| Pre: Reweigh        | 0.8533   | 0.4618  | 0.2604             | 0.7222 |
| Post: EqualizedOdds | 0.8315   | **0.3176** | **0.1042**      | **0.4218** |

---

## Per-group behavior (Female → 0, Male → 1)

### Baseline
- **Selection rate:** 0 **0.368**, 1 **0.712** → males flagged ~**1.93×** more.
- **TPR (Recall):** 0 **0.833**, 1 **0.938** (EO **0.104**).
- **FPR:** 0 **0.281**, 1 **0.280** (similar).
- **Note:** Moderate accuracy; moderate DP; small–moderate recall gap favoring men.

### Pre-processing: Reweigh
- **Accuracy** highest (**0.853**), but **DP** worsens (**0.462**) and **EO** widens (**0.260**).
- **Selection rate:** 0 **0.237**, 1 **0.699** → ~**2.95×** gap (larger than baseline).
- **TPR:** 0 **0.667** (↓), 1 **0.927** → bigger recall disparity.
- **Note:** Not helpful for fairness here.

### Post-processing: Equalized Odds
- **Accuracy** **0.832** (slightly below baseline).
- **DP** improves to **0.318**; **EO** unchanged vs baseline (**0.104**).
- **Selection rate:** 0 **0.395**, 1 **0.712** → ~**1.81×** gap (best of the three).
- **Side effect:** Female **FPR** rises to **0.313** and female accuracy drops to **0.711**.

---

## Implications
- **Fairest overall:** **Post: EqOdds** (smallest combined disparity **DP+EO ≈ 0.422**), trading a small accuracy drop—baseline is a close second with slightly higher DP.
- **Least fair:** **Pre: Reweigh** (both DP and EO worsen despite higher accuracy).

---

In [None]:
# Random Forest (RF) - Baseline
rf_base = clone(best_rf).fit(trn_X_np, trn_y)
yhat_rf = rf_base.predict(X_test_np)
scores_rf = get_scores(rf_base, X_test_np)
_ = report_model("RF baseline", y_test, yhat_rf, A_test, scores=scores_rf)

# Pre (Reweighing)
rf_pre = clone(best_rf).fit(Xrw, yrw)
yhat_rf_pre = rf_pre.predict(X_test_np)
scores_rf_pre = get_scores(rf_pre, X_test_np)
_ = report_model("RF pre: Reweigh", y_test, yhat_rf_pre, A_test, scores=scores_rf_pre,
                 note="resampled by AIF360 weights")

# Post (Equalized Odds)
cal_scores_rf = get_scores(rf_base, cal_X_np)
post_rf = EqOddsPostprocessing(privileged_groups=privileged_groups,
                               unprivileged_groups=unprivileged_groups)
post_rf.fit(_to_bld(cal_y, cal_A),
            _to_bld((cal_scores_rf >= 0.5).astype(int), cal_A))
yhat_rf_post = post_rf.predict(_to_bld((scores_rf >= 0.5).astype(int), A_test)).labels.ravel().astype(int)
_ = report_model("RF post: EqOdds", y_test, yhat_rf_post, A_test, scores=scores_rf,
                 note="calibrated on held-out TRAIN")


=== RF baseline ===


Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.833333,0.0,0.833333,0.131579,0.973684
1,0.78125,0.14,0.78125,0.561644,0.808219


Overall -> Accuracy: 0.8424 | DP diff: 0.4301 | EO diff: 0.0521

=== RF pre: Reweigh ===


Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.833333,0.0625,0.833333,0.184211,0.921053
1,0.822917,0.14,0.822917,0.589041,0.835616


Overall -> Accuracy: 0.8533 | DP diff: 0.4048 | EO diff: 0.0104 | resampled by AIF360 weights

=== RF post: EqOdds ===


Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.833333,0.1875,0.833333,0.289474,0.815789
1,0.78125,0.14,0.78125,0.561644,0.808219


Overall -> Accuracy: 0.8098 | DP diff: 0.2722 | EO diff: 0.0521 | calibrated on held-out TRAIN


## RF + AIF360

### Results overview
| Variant             | Accuracy | DP diff | EO diff (TPR gap) | DP+EO |
|---------------------|---------:|--------:|-------------------:|------:|
| Baseline            | 0.8424   | 0.4301  | 0.0521             | 0.4822 |
| Pre: Reweigh        | 0.8533   | 0.4048  | 0.0104             | 0.4152 |
| Post: EqualizedOdds | 0.8098   | **0.2722** | 0.0521          | **0.3243** |

---

### Per-group behavior (Female → 0, Male → 1)

#### Baseline
- **Selection rate:** 0 **0.132**, 1 **0.562** → large DP gap.
- **TPR (Recall):** 0 **0.833**, 1 **0.781** → small recall gap (EO **0.052**).
- **FPR:** 0 **0.000**, 1 **0.140** (note: zero FPR for females likely reflects small-sample effects).
- **Note:** Good accuracy; biggest selection disparity.

#### Pre-processing: Reweigh
- **Accuracy** highest (**0.8533**).
- **DP diff** improves to **0.4048**; **EO diff** becomes **very small** (**0.0104**).
- **Per-group:** SR 0 **0.184**, 1 **0.589**; TPR 0 **0.833**, 1 **0.823**; FPR 0 **0.0625**, 1 **0.140**.
- **Note:** Best **accuracy–fairness trade-off**; recall nearly equalized, DP modestly reduced.

#### Post-processing: Equalized Odds
- **Accuracy** lowest (**0.8098**).
- **DP diff** **best** (**0.2722**) by raising female selection (SR 0 **0.289**, 1 **0.562**).
- **EO diff** same as baseline (**0.0521**).
- **Per-group:** Female **FPR increases** to **0.1875** (↑) while male FPR **0.140** (≈).
- **Note:** Strongest reduction in DP but driven by higher female false positives and lower accuracy.

---

### Implications
- **Fairest by DP+EO:** **Post: EqOdds** (≈ **0.324**) but with the largest accuracy drop and higher female FPR.
- **Best balance of accuracy and fairness:** **Pre: Reweigh** (Accuracy **0.8533**, DP+EO ≈ **0.415**), achieving near-parity recall with a moderate DP reduction.
- **Baseline** remains least fair on selection disparity despite reasonable accuracy.

---

In [36]:
#MLP - Baseline
mlp_base = clone(adammlp).fit(trn_X_np, trn_y)
yhat_mlp = mlp_base.predict(X_test_np)
scores_mlp = get_scores(mlp_base, X_test_np)  # works with predict_proba or decision_function
_ = report_model("MLP baseline", y_test, yhat_mlp, A_test, scores=scores_mlp)

# Pre (Reweighing)
mlp_pre = clone(adammlp).fit(Xrw, yrw)
yhat_mlp_pre = mlp_pre.predict(X_test_np)
scores_mlp_pre = get_scores(mlp_pre, X_test_np)
_ = report_model("MLP pre: Reweigh", y_test, yhat_mlp_pre, A_test, scores=scores_mlp_pre,
                 note="resampled by AIF360 weights")

# Post (Equalized Odds)
cal_scores_mlp = get_scores(mlp_base, cal_X_np)
post_mlp = EqOddsPostprocessing(privileged_groups=privileged_groups,
                                unprivileged_groups=unprivileged_groups)
post_mlp.fit(_to_bld(cal_y, cal_A),
             _to_bld((cal_scores_mlp >= 0.5).astype(int), cal_A))
yhat_mlp_post = post_mlp.predict(_to_bld((scores_mlp >= 0.5).astype(int), A_test)).labels.ravel().astype(int)
_ = report_model("MLP post: EqOdds", y_test, yhat_mlp_post, A_test, scores=scores_mlp,
                 note="calibrated on held-out TRAIN")


=== MLP baseline ===


Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.666667,0.03125,0.666667,0.131579,0.921053
1,0.822917,0.2,0.822917,0.609589,0.815068


Overall -> Accuracy: 0.8370 | DP diff: 0.4780 | EO diff: 0.1562

=== MLP pre: Reweigh ===


Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.666667,0.09375,0.666667,0.184211,0.868421
1,0.802083,0.26,0.802083,0.616438,0.780822


Overall -> Accuracy: 0.7989 | DP diff: 0.4322 | EO diff: 0.1354 | resampled by AIF360 weights

=== MLP post: EqOdds ===


Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.833333,0.1875,0.833333,0.289474,0.815789
1,0.822917,0.2,0.822917,0.609589,0.815068


Overall -> Accuracy: 0.8152 | DP diff: 0.3201 | EO diff: 0.0104 | calibrated on held-out TRAIN


# MLP + AIF360  

## Results overview
| Variant             | Accuracy | DP diff | EO diff (TPR gap) | DP+EO |
|---------------------|---------:|--------:|-------------------:|------:|
| Baseline            | 0.8370   | 0.4780  | 0.1562             | 0.6342 |
| Pre: Reweigh        | 0.7989   | 0.4322  | 0.1354             | 0.5676 |
| Post: EqualizedOdds | 0.8152   | **0.3201** | **0.0104**      | **0.3305** |

---

## Per-group behavior (Female → 0, Male → 1)

### Baseline
- **Selection rate:** 0 **0.132**, 1 **0.610** → very large selection gap (DP **0.478**).
- **TPR (Recall):** 0 **0.667**, 1 **0.823** → males recalled better (EO **0.156**).
- **FPR:** 0 **0.031**, 1 **0.200**.
- **Note:** Best accuracy, worst fairness.

### Pre-processing: Reweigh
- **Selection rate:** 0 **0.184**, 1 **0.616** → DP improves to **0.432**.
- **TPR (Recall):** 0 **0.667**, 1 **0.802** → EO improves to **0.135**.
- **FPR:** rises for females (**0.094**) and males (**0.260**).
- **Note:** Modest fairness gain, but accuracy drops to **0.799**.

### Post-processing: Equalized Odds
- **Selection rate:** 0 **0.289**, 1 **0.610** → **best DP** (**0.320**), driven by higher female selection.
- **TPR (Recall):** 0 **0.833**, 1 **0.823** → **near-perfect recall parity** (EO **0.010**).
- **FPR:** female **0.188** (↑ vs baseline), male **0.200** (≈).
- **Note:** Strong fairness improvement with a moderate accuracy cost (**0.815**).

---

## Implications
- **Most fair overall:** **Post: Equalized Odds** (smallest DP and EO, lowest DP+EO ≈ **0.331**), at the cost of increased female FPR and a modest accuracy drop.
- **Baseline** maximizes accuracy but exhibits the largest selection and recall disparities.
- **Pre: Reweigh** offers limited fairness gains and the lowest accuracy among the three.

---

First fairness mitigation: pre- and post-processing was performed on the designated best performing models (KNN, DT, RF, MLP) for CVD prediction.  In addition, these results are compared to a fairness-aware in-processing model - Adversarial Debiasing offered by AIF360.

In [37]:
#Adversarial Debiasing - In-processing by AIF360
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

try:
    import tensorflow as tf
    from aif360.algorithms.inprocessing import AdversarialDebiasing

    # TF1 graph mode - required by AIF360's implementation 
    tf.compat.v1.disable_eager_execution()
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    # Build AIF360 datasets with FEATURES + label + sensitive attribute
    bld_tr = BinaryLabelDataset(
        df=pd.concat([
            pd.DataFrame(X_train_ready).reset_index(drop=True),
            pd.Series(y_train, name=label_name),
            pd.Series(A_train, name=protected_attr)
        ], axis=1),
        label_names=[label_name],
        protected_attribute_names=[protected_attr],
        favorable_label=favorable_label,
        unfavorable_label=unfavorable_label
    )

    bld_te = BinaryLabelDataset(
        df=pd.concat([
            pd.DataFrame(X_test_ready).reset_index(drop=True),
            pd.Series(y_test, name=label_name),
            pd.Series(A_test, name=protected_attr)
        ], axis=1),
        label_names=[label_name],
        protected_attribute_names=[protected_attr],
        favorable_label=favorable_label,
        unfavorable_label=unfavorable_label
    )

    # Train + predict inside a TF1 session
    sess = tf.compat.v1.Session()
    with sess.as_default():
        adv = AdversarialDebiasing(
            privileged_groups=privileged_groups,
            unprivileged_groups=unprivileged_groups,
            scope_name="adv_debias",
            debias=True,
            sess=sess
        )
        adv.fit(bld_tr)
        pred_te = adv.predict(bld_te)

        # Extract labels and (if available) scores
        yhat_adv = pred_te.labels.ravel().astype(int)
        scores_adv = getattr(pred_te, "scores", None)
        if scores_adv is None:
            scores_adv = yhat_adv.astype(float)

    # Clean up TF graph
    tf.compat.v1.reset_default_graph()
    sess.close()

    # Same structured output as other models
    _ = report_model(
        "ADV in-proc (AIF360)",
        y_test, yhat_adv, A_test,
        scores=scores_adv,
        note="trained on X_train_ready"
    )

except Exception as e:
    print("AdversarialDebiasing skipped:", type(e).__name__, e)



epoch 0; iter: 0; batch classifier loss: 0.670832; batch adversarial loss: 0.753568
epoch 1; iter: 0; batch classifier loss: 0.624208; batch adversarial loss: 0.765601
epoch 2; iter: 0; batch classifier loss: 0.582585; batch adversarial loss: 0.748832
epoch 3; iter: 0; batch classifier loss: 0.551592; batch adversarial loss: 0.724308
epoch 4; iter: 0; batch classifier loss: 0.520707; batch adversarial loss: 0.766118
epoch 5; iter: 0; batch classifier loss: 0.514107; batch adversarial loss: 0.749066
epoch 6; iter: 0; batch classifier loss: 0.522766; batch adversarial loss: 0.779790
epoch 7; iter: 0; batch classifier loss: 0.467463; batch adversarial loss: 0.768849
epoch 8; iter: 0; batch classifier loss: 0.516543; batch adversarial loss: 0.736276
epoch 9; iter: 0; batch classifier loss: 0.495456; batch adversarial loss: 0.739579
epoch 10; iter: 0; batch classifier loss: 0.467715; batch adversarial loss: 0.774186
epoch 11; iter: 0; batch classifier loss: 0.464730; batch adversarial los

Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.833333,0.09375,0.833333,0.210526,0.894737
1,0.8125,0.22,0.8125,0.609589,0.80137


Overall -> Accuracy: 0.8207 | DP diff: 0.3991 | EO diff: 0.0208 | trained on X_train_ready


## Adversarial Debiasing   

### Results overview
| Variant                 | Accuracy | DP diff | EO diff (TPR gap) | DP+EO |
|-------------------------|---------:|--------:|-------------------:|------:|
| ADV in-proc (AIF360)    | 0.8207   | 0.3991  | 0.0208             | 0.4199 |

---

### Per-group reading (Female → 0, Male → 1)
- **Selection rate:** 0 **0.2105** vs 1 **0.6096** → males are flagged **~2.9×** as often (**DP ≈ 0.399**).
- **TPR (Recall):** 0 **0.8333** vs 1 **0.8125** → **near parity** (**EO ≈ 0.021**).
- **FPR:** 0 **0.0938** vs 1 **0.2200** → higher false positives for males, contributing to the selection gap.
- **Per-group accuracy:** 0 **0.8947**, 1 **0.8014** → ~0.09 absolute difference.

---

### Implications
- The adversarial model **achieves strong recall parity** (very small EO gap) while maintaining **moderate overall accuracy** (~0.82).
- **Selection disparity remains large** (high DP) driven by higher male FPR and selection rates.

---

In [38]:
# Grid-tune AIF360 AdversarialDebiasing for better DP/EO balance and print with report_model
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import tensorflow as tf
from aif360.algorithms.inprocessing import AdversarialDebiasing

# small search over key knobs; widen if needed
ADV_GRID = dict(
    adversary_loss_weight=[0.02, 0.05, 0.1, 0.2, 0.3],
    num_epochs=[40, 60, 80],
    batch_size=[64, 128],
    classifier_num_hidden_units=[32, 64]  # size of main net
)

def run_adv(loss_w=0.1, epochs=50, bs=128, hidden=64, seed=42):
    tf.compat.v1.reset_default_graph()
    tf.compat.v1.disable_eager_execution()
    tf.compat.v1.set_random_seed(seed)
    sess = tf.compat.v1.Session()
    with sess.as_default():
        adv = AdversarialDebiasing(
            privileged_groups=privileged_groups,
            unprivileged_groups=unprivileged_groups,
            debias=True,
            scope_name=f"adv_w{loss_w}_e{epochs}_b{bs}_h{hidden}",
            adversary_loss_weight=loss_w,
            num_epochs=epochs,
            batch_size=bs,
            classifier_num_hidden_units=hidden,
            sess=sess
        )
        adv.fit(bld_tr)
        pred_te = adv.predict(bld_te)
        yhat = pred_te.labels.ravel().astype(int)
        scores = getattr(pred_te, "scores", None)
        if scores is None:
            scores = yhat.astype(float)
    sess.close()
    tf.compat.v1.reset_default_graph()
    return yhat, scores

# Build once (as you did)
bld_tr = BinaryLabelDataset(
    df=pd.concat([pd.DataFrame(X_train_ready).reset_index(drop=True),
                  pd.Series(y_train, name=label_name),
                  pd.Series(A_train, name=protected_attr)], axis=1),
    label_names=[label_name], protected_attribute_names=[protected_attr],
    favorable_label=favorable_label, unfavorable_label=unfavorable_label
)
bld_te = BinaryLabelDataset(
    df=pd.concat([pd.DataFrame(X_test_ready).reset_index(drop=True),
                  pd.Series(y_test, name=label_name),
                  pd.Series(A_test, name=protected_attr)], axis=1),
    label_names=[label_name], protected_attribute_names=[protected_attr],
    favorable_label=favorable_label, unfavorable_label=unfavorable_label
)

# Search & pick the best by minimizing (DP + EO) with an accuracy floor
best = None
acc_floor = 0.86  # keep close to your current accuracy; adjust as you like
results = []
for w in ADV_GRID["adversary_loss_weight"]:
    for e in ADV_GRID["num_epochs"]:
        for bs in ADV_GRID["batch_size"]:
            for h in ADV_GRID["classifier_num_hidden_units"]:
                yhat, scores = run_adv(w, e, bs, h)
                acc = accuracy_score(y_test, yhat)
                dp, eo = fair_metrics(y_test, yhat, A_test, scores, absolute=True)
                obj = dp + eo
                results.append((obj, acc, dp, eo, w, e, bs, h, yhat, scores))
                if (best is None or obj < best[0]) and acc >= acc_floor:
                    best = (obj, acc, dp, eo, w, e, bs, h, yhat, scores)

# Report best and (optionally) a few runners-up
if best is None:
    # fallback: take global best even if below floor
    best = sorted(results, key=lambda t: t[0])[0]

obj, acc, dp, eo, w, e, bs, h, yhat_best, scores_best = best
_ = report_model(
    f"ADV in-proc (best) w={w}, e={e}, b={bs}, h={h}",
    y_test, yhat_best, A_test, scores=scores_best,
    note=f"combined gap (DP+EO)={obj:.4f}; acc={acc:.4f}"
)

epoch 0; iter: 0; batch classifier loss: 0.732277; batch adversarial loss: 0.591770
epoch 1; iter: 0; batch classifier loss: 0.647776; batch adversarial loss: 0.603194
epoch 2; iter: 0; batch classifier loss: 0.675193; batch adversarial loss: 0.553318
epoch 3; iter: 0; batch classifier loss: 0.554265; batch adversarial loss: 0.601640
epoch 4; iter: 0; batch classifier loss: 0.600965; batch adversarial loss: 0.611448
epoch 5; iter: 0; batch classifier loss: 0.588793; batch adversarial loss: 0.545440
epoch 6; iter: 0; batch classifier loss: 0.508106; batch adversarial loss: 0.618392
epoch 7; iter: 0; batch classifier loss: 0.473747; batch adversarial loss: 0.609448
epoch 8; iter: 0; batch classifier loss: 0.511311; batch adversarial loss: 0.609689
epoch 9; iter: 0; batch classifier loss: 0.508317; batch adversarial loss: 0.590390
epoch 10; iter: 0; batch classifier loss: 0.500133; batch adversarial loss: 0.562038
epoch 11; iter: 0; batch classifier loss: 0.476059; batch adversarial loss:

Unnamed: 0_level_0,TPR,FPR,Recall,SelectionRate,Accuracy
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.833333,0.125,0.833333,0.236842,0.868421
1,0.854167,0.1,0.854167,0.59589,0.869863


Overall -> Accuracy: 0.8696 | DP diff: 0.3590 | EO diff: 0.0208 | combined gap (DP+EO)=0.3799; acc=0.8696


# Adversarial Debiasing (tuned best: w=0.3, epochs=60, batch=128, hidden=32)  

## Results overview
| Model                        | Accuracy | DP diff | EO diff (TPR gap) | DP+EO |
|-----------------------------|---------:|--------:|-------------------:|------:|
| ADV in-proc (tuned best)    | 0.8696   | 0.3590  | 0.0208             | 0.3799 |

---

## Per-group behavior (Female → 0, Male → 1)
- **Selection rate:** 0 **0.237** vs 1 **0.596** → males flagged ~**2.5×** as often (**DP ≈ 0.359**).
- **TPR (Recall):** 0 **0.833** vs 1 **0.854** → **near-perfect recall parity** (**EO ≈ 0.021**).
- **FPR:** 0 **0.125** vs 1 **0.100** → slightly higher female precision (lower FPR for males would further raise selection for men).
- **Per-group accuracy:** 0 **0.868**, 1 **0.870** → very similar.

---

## Interpretation
- Tuning substantially **improves accuracy** and **keeps recall gaps minimal**, while **reducing DP** versus the untuned ADV runs (now ~0.36).  
- The remaining issue is **selection-rate disparity** (male SR still ~2.5× female), driven by score/threshold differences and small FPR asymmetry.  

---