In [1]:
import os
import sys

src_path = r"D:\SEM 4\CS516\516 Project\src"
if src_path not in sys.path:
    sys.path.append(src_path)

In [2]:
import pandas as pd
from data_loader import load_dataset
from modeling import train_logistic_regression, evaluate_model, train_random_forest
from fairness_metrics import print_group_rates, disparate_impact, equal_opportunity
from preprocess import apply_reweighing

  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


In [3]:
# Load data
df = load_dataset("../data/cleaned_dataset.csv")

# Define features and target
features = ['Gender_encoded', 'Age_encoded', 'EdLevel', 'YearsCode', 'YearsCodePro', 'ComputerSkills', 'PreviousSalary']
X = pd.get_dummies(df[features], drop_first=True)
y = df['Employment'].astype(int)

# Train and evaluate Random Forest model
model, X_test, y_test, y_pred = train_random_forest(X, y)
evaluate_model(y_test, y_pred)

# Fairness metrics
print_group_rates(df, 'Gender')
print_group_rates(df, 'Age')
print_group_rates(df, 'EdLevel')

disparate_impact(df, 'Woman', 'Man', 'Gender')
disparate_impact(df, '>35', '<35', 'Age')

equal_opportunity(y_test.reset_index(drop=True), pd.Series(y_pred), ['Man', 'Woman', 'NonBinary'], 'Gender', df.reset_index(drop=True))

Using a preprocessing mitigation strategy - AIF360’s Reweighing

In [4]:
# Apply AIF360 Reweighing
df = apply_reweighing(df, protected_attr='Gender', label_col='Employment')

# Define features and target
features = ['Gender_encoded', 'Age_encoded', 'EdLevel', 'YearsCode', 'YearsCodePro', 'ComputerSkills', 'PreviousSalary']
X = pd.get_dummies(df[features], drop_first=True)
y = df['Employment'].astype(int)
sample_weights = df['instance_weight']

# Train and evaluate model with sample weights from AIF360
model, X_test, y_test, y_pred = train_random_forest(X, y, sample_weights)
evaluate_model(y_test, y_pred)

# Fairness metrics
print_group_rates(df, 'Gender')
print_group_rates(df, 'Age')
print_group_rates(df, 'EdLevel')

disparate_impact(df, 'Woman', 'Man', 'Gender')
disparate_impact(df, '>35', '<35', 'Age')

equal_opportunity(
    y_test.reset_index(drop=True),
    pd.Series(y_pred),
    ['Man', 'Woman', 'NonBinary'],
    'Gender',
    df.reset_index(drop=True)
)

Accuracy: 0.8777621489178274
Classification Report:
               precision    recall  f1-score   support

           0       0.28      0.03      0.05      2571
           1       0.89      0.99      0.93     19468

    accuracy                           0.88     22039
   macro avg       0.58      0.51      0.49     22039
weighted avg       0.81      0.88      0.83     22039


Selection Rates by Gender:
0: 0.88
2: 0.91
1: 0.87

Selection Rates by Age:
<35: 0.90
>35: 0.85

Selection Rates by EdLevel:
Master: 0.88
Undergraduate: 0.90
PhD: 0.90
Other: 0.84
NoHigherEd: 0.80

Disparate Impact (Woman/Man): 0.00

Disparate Impact (>35/<35): 0.95

Equal Opportunity by group:
Man: TPR = 0.00
Woman: TPR = 0.00
NonBinary: TPR = 0.00
