In [1]:
import numpy as np
import pandas as pd

from aif360.datasets import (
                            CompasDataset,
                            GermanDataset
)
from aif360.metrics import (
                            BinaryLabelDatasetMetric,
                            ClassificationMetric
)
from aif360.algorithms.preprocessing import DisparateImpactRemover
from sklearn.linear_model import LogisticRegression 

pip install 'aif360[LFR]'


In [2]:
# Load the COMPAS dataset and partition it into train/test with a 0.7 split.
data = CompasDataset()

# Fix the shuffle seed so the partition -- and every metric computed from it
# further down -- is reproducible under Restart Kernel -> Run All.
SPLIT_SEED = 42
data_train, data_test = data.split([0.7], shuffle=True, seed=SPLIT_SEED)

# Protected attribute under study and the group encodings that are passed to
# the AIF360 metric classes below.
protected = 'sex'

privileged_groups = [{protected: 1}]
unprivileged_groups = [{protected: 0}]



In [3]:
# Display the dataset's protected-attribute array. The output shows two
# columns per row (presumably one per protected attribute -- confirm with
# data.protected_attribute_names).
data.protected_attributes

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [1., 0.],
       [1., 0.]])

In [3]:
# Dataset-level fairness metrics for the training split, using the group
# definitions from the setup cell.
metric_orig_train = BinaryLabelDatasetMetric(
    data_train,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups,
)

print("#### Original training dataset")
print(
    "Difference in mean outcomes between unprivileged and privileged groups = %f"
    % metric_orig_train.mean_difference()
)

# One value per line: the unformatted mean difference, then the base rate
# overall (privileged=None), for the privileged group, and for the
# unprivileged group.
print(
    metric_orig_train.mean_difference(),
    metric_orig_train.base_rate(privileged=None),
    metric_orig_train.base_rate(privileged=True),
    metric_orig_train.base_rate(privileged=False),
    sep="\n",
)

#### Original training dataset
Difference in mean outcomes between unprivileged and privileged groups = -0.107636
-0.10763593145089434
0.5465708989805376
0.6339066339066339
0.5262707024557396


In [4]:
# Display the training labels flattened to a 1-D array.
data_train.labels.ravel()

array([1., 0., 1., ..., 1., 1., 0.])

In [5]:
# Feature matrices for both splits, plus the training labels flattened to 1-D
# for scikit-learn.
X_tr, X_te = data_train.features, data_test.features
y_tr = data_train.labels.ravel()

# Class-balanced logistic regression baseline. fit() returns the estimator
# itself, so chaining it leaves `lmod` bound to the fitted model.
lmod = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
).fit(X_tr, y_tr)

# Trailing expression so the cell still displays the fitted estimator.
lmod

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
# Copy the test split and replace its labels with the model's predictions.
data_test_pred = data_test.copy()
# predict() returns a 1-D array, whereas the dataset's own labels are stored
# as a 2-D column (hence the .ravel() needed when building y_tr). Reshape to
# (n, 1) so the predicted dataset keeps the same label shape as data_test.
data_test_pred.labels = lmod.predict(X_te).reshape(-1, 1)

# Group definitions for the metrics (same encoding as privileged_groups /
# unprivileged_groups); the short names are reused by the following cell.
p = [{protected: 1}]
u = [{protected: 0}]

# Dataset-level metrics computed on the *predicted* labels.
cm = BinaryLabelDatasetMetric(data_test_pred, privileged_groups=p, unprivileged_groups=u)
print(cm.mean_difference())
# Base rate overall, then for the privileged and the unprivileged group.
print(cm.base_rate(privileged=None))
print(cm.base_rate(privileged=True))
print(cm.base_rate(privileged=False))
print(cm.disparate_impact())

-0.2794215388291874
0.5407887628309023
0.766016713091922
0.4865951742627346
0.6352278820375336


In [7]:
# Classification metrics comparing the true test labels (data_test) with the
# predicted ones (data_test_pred) for the same privileged/unprivileged groups.
classifier_metric = ClassificationMetric(
    data_test,
    data_test_pred,
    unprivileged_groups=u,
    privileged_groups=p,
)

# One value per line: mean difference, base rate (overall, privileged,
# unprivileged), and disparate impact.
print(
    classifier_metric.mean_difference(),
    classifier_metric.base_rate(privileged=None),
    classifier_metric.base_rate(privileged=True),
    classifier_metric.base_rate(privileged=False),
    classifier_metric.disparate_impact(),
    sep="\n",
)

-0.2794215388291874
0.539708265802269
0.6796657381615598
0.5060321715817694
0.6352278820375336
