In [1]:
import numpy as np
import sklearn, sklearn.linear_model

import loader
import postprocess
import utils

data_dir = "data/adult"

## Attribute-aware

In [2]:
## Load the UCI Adult dataset

(inputs, labels, label_names, groups,
 group_names) = loader.load_adult(data_dir, remove_sensitive_attr=False)

n_classes = len(label_names)
n_groups = len(group_names)

# Normalize data
scaler = sklearn.preprocessing.StandardScaler()
inputs[:] = scaler.fit_transform(inputs)

display(loader.dataset_stats(labels, label_names, groups, group_names))

Group,Female,Male
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,14423,22732
>50K,1769,9918


In [3]:
## Split data by 0.35/0.35/0.3 for pre-training, post-training, and testing

(inputs_train, inputs_test, labels_train, labels_test, groups_train,
 groups_test) = sklearn.model_selection.train_test_split(
     inputs,
     labels,
     groups,
     test_size=0.3,
 )

(inputs_pretrain, inputs_postproc, labels_pretrain, labels_postproc,
 groups_pretrain, groups_postproc) = sklearn.model_selection.train_test_split(
     inputs_train,
     labels_train,
     groups_train,
     test_size=0.5,
 )

In [4]:
## Train a predictor for Y given X

predictor_y = sklearn.linear_model.LogisticRegression()
predictor_y.fit(inputs_pretrain, labels_pretrain)

# Define predict functions
predict_y = lambda x: predictor_y.predict_proba(x)
predict_a = lambda x: np.where(x[['Sex_Female', 'Sex_Male']].values > 0, 1, 0)  # attribute-aware
predict_ay = lambda x: np.einsum('ij,ik->ijk', predict_a(x), predict_y(x)
                                ).reshape(-1, n_groups, n_classes)

In [5]:
## Post-process for statistical parity

postprocessor = postprocess.PostProcessor(
    n_classes,
    n_groups,
    pred_y_fn=predict_y,
    pred_a_fn=predict_a,
    criterion='sp',
    alpha=0.001,
)

# Using cvxpy's default solver rather than Gurobi.
# Okay for small scale datasets/problems, but can be very slow for larger ones.
postprocessor.fit(inputs_postproc, solver=None)

# Evaluate
preds = np.argmax(predict_y(inputs_test), axis=1)
preds_fair = postprocessor.predict(inputs_test)

print(
    f"Attribute-aware post-processing result for statistical parity on UCI Adult:\n"
    f"  Original: accuracy={1-utils.error_rate(labels_test, preds):.4f}, delta_sp={utils.delta_sp(preds, groups_test, n_classes, n_groups):.4f}\n"
    f"  Post-processed: accuracy={1-utils.error_rate(labels_test, preds_fair):.4f}, delta_sp={utils.delta_sp(preds_fair, groups_test, n_classes, n_groups):.4f}"
)

Attribute-aware post-processing result for statistical parity on UCI Adult:
  Original: accuracy=0.8490, delta_sp=0.1761
  Post-processed: accuracy=0.8311, delta_sp=0.0043


In [6]:
## Post-process for binary equal opportunity

postprocessor = postprocess.PostProcessor(
    n_classes,
    n_groups,
    pred_ay_fn=predict_ay,
    criterion='eopp',
    alpha=0.001,
)
postprocessor.fit(inputs_postproc, solver=None)

# Evaluate
preds = np.argmax(predict_y(inputs_test), axis=1)
preds_fair = postprocessor.predict(inputs_test)

print(
    f"Attribute-aware post-processing result for binary equal opportunity on UCI Adult:\n"
    f"  Original: accuracy={1-utils.error_rate(labels_test, preds):.4f}, delta_eopp={utils.delta_eopp(labels_test, preds, groups_test, n_classes, n_groups):.4f}\n"
    f"  Post-processed: accuracy={1-utils.error_rate(labels_test, preds_fair):.4f}, delta_eopp={utils.delta_eopp(labels_test, preds_fair, groups_test, n_classes, n_groups):.4f}"
)

Attribute-aware post-processing result for binary equal opportunity on UCI Adult:
  Original: accuracy=0.8490, delta_eopp=0.0805
  Post-processed: accuracy=0.8486, delta_eopp=0.0212


In [7]:
## Post-process for equalized odds

postprocessor = postprocess.PostProcessor(
    n_classes,
    n_groups,
    pred_ay_fn=predict_ay,
    criterion='eo',
    alpha=0.001,
)
postprocessor.fit(inputs_postproc, solver=None)

# Evaluate
preds = np.argmax(predict_y(inputs_test), axis=1)
preds_fair = postprocessor.predict(inputs_test)

print(
    f"Attribute-aware post-processing result for equalized odds on UCI Adult:\n"
    f"  Original: accuracy={1-utils.error_rate(labels_test, preds):.4f}, delta_eo={utils.delta_eo(labels_test, preds, groups_test, n_classes, n_groups):.4f}\n"
    f"  Post-processed: accuracy={1-utils.error_rate(labels_test, preds_fair):.4f}, delta_eo={utils.delta_eo(labels_test, preds_fair, groups_test, n_classes, n_groups):.4f}"
)

Attribute-aware post-processing result for equalized odds on UCI Adult:
  Original: accuracy=0.8490, delta_eo=0.0805
  Post-processed: accuracy=0.8401, delta_eo=0.0510


## Attribute-blind

In [8]:
## Load the UCI Adult dataset

(inputs, labels, label_names, groups,
 group_names) = loader.load_adult(data_dir, remove_sensitive_attr=True)

n_classes = len(label_names)
n_groups = len(group_names)


# Normalize data
scaler = sklearn.preprocessing.StandardScaler()
inputs[:] = scaler.fit_transform(inputs)

display(loader.dataset_stats(labels, label_names, groups, group_names))

Group,Female,Male
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,14423,22732
>50K,1769,9918


In [9]:
## Split data by 0.35/0.35/0.3 for pre-training, post-training, and testing

(inputs_train, inputs_test, labels_train, labels_test, groups_train,
 groups_test) = sklearn.model_selection.train_test_split(
     inputs,
     labels,
     groups,
     test_size=0.3,
 )

(inputs_pretrain, inputs_postproc, labels_pretrain, labels_postproc,
 groups_pretrain, groups_postproc) = sklearn.model_selection.train_test_split(
     inputs_train,
     labels_train,
     groups_train,
     test_size=0.5,
 )

In [10]:
## Train predictors for Y given X, A given X, and (A, Y) given X

predictor_y = sklearn.linear_model.LogisticRegression()
predictor_y.fit(inputs_pretrain, labels_pretrain)

predictor_a = sklearn.linear_model.LogisticRegression()
predictor_a.fit(inputs_pretrain, groups_pretrain)

predictor_ay = sklearn.linear_model.LogisticRegression()
predictor_ay.fit(inputs_pretrain, groups_pretrain * n_classes + labels_pretrain)

# Define predict functions
predict_y = lambda x: predictor_y.predict_proba(x)
predict_a = lambda x: predictor_a.predict_proba(x)
predict_ay = lambda x: predictor_ay.predict_proba(x)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
## Post-process for statistical parity

postprocessor = postprocess.PostProcessor(
    n_classes,
    n_groups,
    pred_y_fn=predict_y,
    pred_a_fn=predict_a,
    criterion='sp',
    alpha=0.001,
)
postprocessor.fit(inputs_postproc, solver=None)

# Evaluate
preds = np.argmax(predict_y(inputs_test), axis=1)
preds_fair = postprocessor.predict(inputs_test)

print(
    f"Attribute-blind post-processing result for statistical parity on UCI Adult:\n"
    f"  Original: accuracy={1-utils.error_rate(labels_test, preds):.4f}, delta_sp={utils.delta_sp(preds, groups_test, n_classes, n_groups):.4f}\n"
    f"  Post-processed: accuracy={1-utils.error_rate(labels_test, preds_fair):.4f}, delta_sp={utils.delta_sp(preds_fair, groups_test, n_classes, n_groups):.4f}"
)

Attribute-blind post-processing result for statistical parity on UCI Adult:
  Original: accuracy=0.8482, delta_sp=0.1781
  Post-processed: accuracy=0.8269, delta_sp=0.0046


In [12]:
## Post-process for binary equal opportunity

postprocessor = postprocess.PostProcessor(
    n_classes,
    n_groups,
    pred_ay_fn=predict_ay,
    criterion='eopp',
    alpha=0.001,
)
postprocessor.fit(inputs_postproc, solver=None)

# Evaluate
preds = np.argmax(predict_y(inputs_test), axis=1)
preds_fair = postprocessor.predict(inputs_test)

print(
    f"Attribute-blind post-processing result for binary equal opportunity on UCI Adult:\n"
    f"  Original: accuracy={1-utils.error_rate(labels_test, preds):.4f}, delta_eopp={utils.delta_eopp(labels_test, preds, groups_test, n_classes, n_groups):.4f}\n"
    f"  Post-processed: accuracy={1-utils.error_rate(labels_test, preds_fair):.4f}, delta_eopp={utils.delta_eopp(labels_test, preds_fair, groups_test, n_classes, n_groups):.4f}"
)

Attribute-blind post-processing result for binary equal opportunity on UCI Adult:
  Original: accuracy=0.8482, delta_eopp=0.1077
  Post-processed: accuracy=0.8465, delta_eopp=0.0523


In [13]:
## Post-process for equalized odds

postprocessor = postprocess.PostProcessor(
    n_classes,
    n_groups,
    pred_ay_fn=predict_ay,
    criterion='eo',
    alpha=0.001,
)
postprocessor.fit(inputs_postproc, solver=None)

# Evaluate
preds = np.argmax(predict_y(inputs_test), axis=1)
preds_fair = postprocessor.predict(inputs_test)

print(
    f"Attribute-blind post-processing result for equalized odds on UCI Adult:\n"
    f"  Original: accuracy={1-utils.error_rate(labels_test, preds):.4f}, delta_eo={utils.delta_eo(labels_test, preds, groups_test, n_classes, n_groups):.4f}\n"
    f"  Post-processed: accuracy={1-utils.error_rate(labels_test, preds_fair):.4f}, delta_eo={utils.delta_eo(labels_test, preds_fair, groups_test, n_classes, n_groups):.4f}"
)

Attribute-blind post-processing result for equalized odds on UCI Adult:
  Original: accuracy=0.8482, delta_eo=0.1077
  Post-processed: accuracy=0.8374, delta_eo=0.0167
