## Load dataset

In [8]:
"""Load dataset"""
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Fetch OpenML dataset 1590 as pandas objects (feature frame + target series).
data = fetch_openml(data_id=1590, as_frame=True)

# Keep the untouched feature frame alongside its one-hot encoded counterpart.
X_raw = data.data
X = pd.get_dummies(X_raw)

# Binary target: 1 where the label equals '>50K', 0 otherwise.
y_true = data.target.eq('>50K') * 1

# To inspect the first rows while exploring, display X.head(), X_raw.head()
# or y_true.head() in a separate cell.

In [7]:
"""Treating the sex of each individual as a sensitive feature"""
# Only the `sex` column is read from the raw features here; nothing is dropped,
# so any columns derived from `sex` in the encoded feature matrix stay in place.
sex = data.data['sex']
# Last expression of the cell: displays how many rows fall into each group.
sex.value_counts()

Male      32650
Female    16192
Name: sex, dtype: int64

## Evaluating fairness-related metrics

In [33]:
from fairlearn.metrics import MetricFrame
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
import time
# Fit an unmitigated decision tree and time fit + predict together.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
classifier = DecisionTreeClassifier(min_samples_leaf=100, max_depth=40)
classifier.fit(X, y_true)
# Predictions are made on the same rows the tree was fitted on, so the
# accuracies reported below are in-sample figures.
y_pred = classifier.predict(X)
finish_time = time.perf_counter()

# Accuracy overall and broken down by the groups of the sensitive feature.
# NOTE(review): newer fairlearn releases appear to expect keyword arguments
# (metrics=, y_true=, y_pred=) here — confirm against the installed version.
gm = MetricFrame(accuracy_score, y_true, y_pred, sensitive_features=sex)
print(gm.overall)
print(gm.by_group)
# Select the groups by label instead of by position: integer keys on a
# label-indexed Series rely on pandas' deprecated positional fallback.
print('disparity:', gm.by_group['Female'] - gm.by_group['Male'])
print('time taken:', finish_time-start_time)

0.86368289586831
sex
Female    0.931015
Male      0.830291
Name: accuracy_score, dtype: object
disparity: 0.10072435142758562
time taken: 0.35779690742492676


## Mitigating disparity

In [35]:
# Using ExponentiatedGradient as the mitigation approach
from fairlearn.reductions import ExponentiatedGradient, DemographicParity, EqualizedOdds
np.random.seed(0)  # set seed for consistent results with ExponentiatedGradient
constraint = EqualizedOdds()

# Time the whole mitigation run (fit + predict). perf_counter() is a monotonic
# clock meant for measuring elapsed time, unlike the wall-clock time.time().
start_time = time.perf_counter()
classifier = DecisionTreeClassifier(min_samples_leaf=100, max_depth=40)
mitigator = ExponentiatedGradient(classifier, constraint)
mitigator.fit(X, y_true, sensitive_features=sex)
# Predictions are made on the same rows used for fitting (in-sample).
y_pred_mitigated = mitigator.predict(X)
finish_time = time.perf_counter()

# NOTE(review): the constraint is EqualizedOdds, but only per-group accuracy is
# reported here; a TPR/FPR-based metric would presumably be needed to check
# whether the constraint itself is met — confirm.
# NOTE(review): newer fairlearn releases appear to expect keyword arguments
# (metrics=, y_true=, y_pred=) for MetricFrame — confirm against the installed version.
sr_mitigated = MetricFrame(accuracy_score, y_true, y_pred_mitigated, sensitive_features=sex)
print(sr_mitigated.overall)
print(sr_mitigated.by_group)
# Select the groups by label instead of by position: integer keys on a
# label-indexed Series rely on pandas' deprecated positional fallback.
print('disparity:', sr_mitigated.by_group['Female'] - sr_mitigated.by_group['Male'])
print('time taken:', finish_time-start_time)

0.8594447401826297
sex
Female    0.917861
Male      0.830475
Name: accuracy_score, dtype: object
disparity: 0.08738593993063337
time taken: 68.59321928024292


In [None]:
# Mitigation algorithms and constraint classes available in fairlearn.reductions
from fairlearn.reductions import GridSearch, ExponentiatedGradient
from fairlearn.reductions import AbsoluteLoss
from fairlearn.reductions import Moment, ClassificationMoment, UtilityParity, DemographicParity, EqualizedOdds, \
    TruePositiveRateParity, FalsePositiveRateParity, ErrorRateParity, ErrorRate, BoundedGroupLoss, LossMoment, SquareLoss, ZeroOneLoss

## Observations
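1. Mitigation is time-consuming: the ExponentiatedGradient run took about 68.6 s, versus about 0.36 s for the unmitigated decision tree.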

## Questions
1. After mitigation, how can we verify that the constraint is actually satisfied?
(What role does the constraint play in the mitigation?)
2. Is there any other mitigation approach?
