In [5]:
import pandas as pd
import numpy as np

DATA_PATH = "data/student-mat.csv"
PASS_THRESHOLD = 10


In [6]:
students = pd.read_csv(DATA_PATH, sep=';')
students.head()


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [7]:
students["sex_bin"] = (students["sex"] == "M").astype(int)
students["age_bin"] = (students["age"] >= 18).astype(int)

students["G1_pass"] = (students["G1"] >= PASS_THRESHOLD).astype(int)
students["G3_pass"] = (students["G3"] >= PASS_THRESHOLD).astype(int)

students[["sex","sex_bin","age","age_bin","G1","G1_pass","G3","G3_pass"]].head()


Unnamed: 0,sex,sex_bin,age,age_bin,G1,G1_pass,G3,G3_pass
0,F,0,18,1,5,0,6,0
1,F,0,17,0,5,0,6,0
2,F,0,15,0,7,0,10,1
3,F,0,15,0,15,1,15,1
4,F,0,16,0,6,0,10,1


In [8]:
def positive_rate(data, outcome_col, group_mask, weight_col=None):
    group = data[group_mask]
    if group.empty:
        return np.nan
    if weight_col is None:
        return group[outcome_col].mean()
    weights = group[weight_col].to_numpy()
    values = group[outcome_col].to_numpy()
    if weights.sum() == 0:
        return np.nan
    return (weights * values).sum() / weights.sum()

def calculate_fairness_metrics(data, outcome_col, protected_col, unpriv_value, priv_value, weight_col=None):
    unpriv_mask = data[protected_col] == unpriv_value
    priv_mask = data[protected_col] == priv_value

    p_unpriv = positive_rate(data, outcome_col, unpriv_mask, weight_col)
    p_priv = positive_rate(data, outcome_col, priv_mask, weight_col)

    spd = p_unpriv - p_priv
    if p_priv == 0 or np.isnan(p_priv):
        di = np.nan
    else:
        di = p_unpriv / p_priv

    return {
        "protected_attr": protected_col,
        "outcome": outcome_col,
        "SPD": spd,
        "DI": di
    }


In [9]:
original_results = []

for outcome in ["G1_pass", "G3_pass"]:
    original_results.append(calculate_fairness_metrics(students, outcome, "sex_bin", 0, 1))
    original_results.append(calculate_fairness_metrics(students, outcome, "age_bin", 0, 1))

original_fairness = pd.DataFrame(original_results)
original_fairness


Unnamed: 0,protected_attr,outcome,SPD,DI
0,sex_bin,G1_pass,-0.073375,0.891959
1,age_bin,G1_pass,0.076386,1.130444
2,sex_bin,G3_pass,-0.066459,0.905849
3,age_bin,G3_pass,0.16876,1.307088


In [10]:
def compute_reweighing_weights(data, outcome_col, protected_cols):
    n = len(data)

    p_s = data.groupby(protected_cols).size().div(n).rename("p_s")
    p_y = data.groupby(outcome_col).size().div(n).rename("p_y")
    p_sy = data.groupby(protected_cols + [outcome_col]).size().div(n).rename("p_sy")

    p_s_df = p_s.reset_index()
    p_y_df = p_y.reset_index()
    p_sy_df = p_sy.reset_index()

    merged = p_sy_df.merge(p_s_df, on=protected_cols, how="left")
    merged = merged.merge(p_y_df, on=outcome_col, how="left")

    merged["weight"] = (merged["p_s"] * merged["p_y"]) / merged["p_sy"]

    data_with_weights = data.merge(
        merged[protected_cols + [outcome_col, "weight"]],
        on=protected_cols + [outcome_col],
        how="left"
    )

    weight_col_name = f"rw_weight_{outcome_col}"
    data_with_weights = data_with_weights.rename(columns={"weight": weight_col_name})
    return data_with_weights, weight_col_name


In [11]:
students_rw, rw_col = compute_reweighing_weights(students, "G3_pass", ["sex_bin", "age_bin"])
students_rw.head()


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,health,absences,G1,G2,G3,sex_bin,age_bin,G1_pass,G3_pass,rw_weight_G3_pass
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,6,5,6,6,0,1,0,0,0.795359
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,4,5,5,6,0,0,0,0,0.967982
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,10,7,8,10,0,0,0,1,1.016494
3,GP,F,15,U,GT3,T,4,2,health,services,...,5,2,15,14,15,0,0,1,1,1.016494
4,GP,F,16,U,GT3,T,3,3,other,other,...,5,4,6,10,10,0,0,0,1,1.016494


In [12]:
reweighted_results = []

for outcome in ["G1_pass", "G3_pass"]:
    reweighted_results.append(
        calculate_fairness_metrics(students_rw, outcome, "sex_bin", 0, 1, weight_col=rw_col)
    )
    reweighted_results.append(
        calculate_fairness_metrics(students_rw, outcome, "age_bin", 0, 1, weight_col=rw_col)
    )

reweighted_fairness = pd.DataFrame(reweighted_results)
reweighted_fairness


Unnamed: 0,protected_attr,outcome,SPD,DI
0,sex_bin,G1_pass,-0.03323322,0.949596
1,age_bin,G1_pass,-0.04999854,0.926232
2,sex_bin,G3_pass,0.0,1.0
3,age_bin,G3_pass,-2.220446e-16,1.0
