In [1]:
from json import dumps
import os
import urllib.request

from IPython.display import display
import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model, model_selection, preprocessing

from projection_simplex_vectorized import projection_simplex
import postprocess

# Fraction of the training rows passed as `test_size` when splitting the
# training set into a predictor-fitting part and a post-processing part.
split_ratio_for_postprocessing = 0.5

# A single seed drives both the random generator and the data splits.
seed = 33
rng = np.random.default_rng(seed)


def noise_fn(shape):
  """Draw zero-mean Laplace noise (scale 0.2 / 2) of the given shape from `rng`."""
  return rng.laplace(loc=0.0, scale=0.2 / 2, size=shape)

## Download and pre-process UCI Adult dataset

In [2]:
# Column names for the header-less CSV files.
# NOTE(review): "Martial Status" looks like a typo for "Marital Status"; it is
# kept verbatim because it is a runtime column name.
features = [
    "Age", "Workclass", "fnlwgt", "Education", "Education-Num",
    "Martial Status", "Occupation", "Relationship", "Race", "Sex",
    "Capital Gain", "Capital Loss", "Hours per week", "Country", "Target"
]

train_path = "data/adult/adult.data"
test_path = "data/adult/adult.test"

# Both files are (re-)downloaded as soon as either one is missing locally.
if not (os.path.exists(train_path) and os.path.exists(test_path)):
  os.makedirs("data/adult", exist_ok=True)
  for url, destination in (
      ("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
       train_path),
      ("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
       test_path),
  ):
    urllib.request.urlretrieve(url, destination)

# Shared parsing options: the separator regex swallows the whitespace around
# each comma, and "?" entries are read as missing values.
csv_options = dict(names=features,
                   sep=r"\s*,\s*",
                   engine="python",
                   na_values="?")
original_train = pd.read_csv(train_path, **csv_options)
# The first line of the test file is skipped.
original_test = pd.read_csv(test_path, skiprows=1, **csv_options)

# Add a binary age bucket to both splits.
for frame in (original_train, original_test):
  frame["Age-Binned"] = np.where(frame["Age"] <= 50, "<=50", ">50")


In [3]:
def data_transform(df):
  """One-hot encode `df` and standardize every resulting column.

  Args:
    df: DataFrame of raw features.

  Returns:
    A DataFrame with the columns produced by `pd.get_dummies(df)`, each
    passed through a `StandardScaler` fitted on `df` itself, carrying the
    same index as `df`.
  """
  dummies = pd.get_dummies(df)
  standardized = preprocessing.StandardScaler().fit_transform(dummies)
  return pd.DataFrame(standardized, columns=dummies.columns, index=df.index)


# Stack train and test rows so that both go through the same one-hot encoding
# and scaling; the first `n_train` rows are the training split.
original = pd.concat([original_train, original_test])
n_train = len(original_train)

# Some label strings carry a trailing period; map them onto the plain
# spelling so that only one name per class remains.
labels_original = original[["Target"
                           ]].replace("<=50K.",
                                      "<=50K").replace(">50K.", ">50K")
original = original.drop(columns="Target")

# Pass a 1-D array to np.unique: with a 2-D input (such as the one-column
# DataFrame) NumPy >= 2.0 returns an inverse of the same 2-D shape, which
# would make `labels` (n, 1) and break the later `pd.get_dummies` call.
label_names, labels = np.unique(labels_original["Target"].to_numpy(),
                                return_inverse=True)
n_labels = len(label_names)

data = data_transform(original)
# Positional slicing: the concatenated index contains duplicates.
train_data = data.iloc[:n_train]
train_labels = labels[:n_train]
test_data = data.iloc[n_train:]
test_labels = labels[n_train:]

In [4]:
def get_group_labels_and_print_statistics(sensitive_attributes):
  """Encode demographic groups and display per-group label statistics.

  Reads the notebook-level `original`, `labels_original` and `n_train`.

  Args:
    sensitive_attributes: List of column names of `original`; every distinct
      combination of their values (compared as strings) forms one group.

  Returns:
    A tuple `(train_groups, test_groups, n_groups)`: the integer group codes
    of the first `n_train` rows, the codes of the remaining rows, and the
    number of columns of the Target-by-Group count table.
  """
  group_names, groups = np.unique(
      original[sensitive_attributes].to_numpy().astype(str),
      return_inverse=True,
      axis=0)
  # NumPy 2.0.0 returned a 2-D inverse when `axis` is given; flattening keeps
  # the group codes 1-D on every NumPy version.
  groups = groups.reshape(-1)
  print("Demographic groups:",
        ', '.join(["'" + ', '.join(n) + "'" for n in group_names]))
  train_groups = groups[:n_train]
  test_groups = groups[n_train:]

  # Compute dataset statistics
  df = original.copy()
  df["Target"] = labels_original
  # Build a single "attr1, attr2, ..." string per row to group on.
  group_column = original[sensitive_attributes[0]]
  for attribute in sensitive_attributes[1:]:
    group_column = group_column + ", " + original[attribute]
  df["Group"] = group_column
  # Rows are labels, columns are groups, entries are row counts.
  grouped = df.groupby(["Target", "Group"]).size().unstack()
  n_classes = len(grouped.index)
  n_groups = len(grouped.columns)
  counts = grouped.sum(axis=0)
  # Label frequencies within each group, shape (n_groups, n_classes);
  # missing label/group combinations are treated as frequency 0.
  normalized = np.nan_to_num((grouped.to_numpy() / counts.to_numpy())).T
  # Absolute frequency differences for every pair of groups and every label.
  diff = np.abs(normalized[:, None, :] - normalized[None, :, :])
  # One one-hot row per (group, label) pair, in the same group-major order as
  # `normalized.flatten()`.
  # NOTE(review): the third argument is presumably a per-row weight and
  # `score_` the resulting objective -- confirm against postprocess.PostProcessor.
  postprocessor = postprocess.PostProcessor()
  postprocessor.fit(
      np.concatenate([np.eye(n_classes) for _ in range(n_groups)], axis=0),
      np.repeat(np.arange(n_groups), n_classes), normalized.flatten())
  res = {
      "balanced_accuracy": {
          "perfect_postprocessed": (n_groups - postprocessor.score_) / n_groups
      },
      "dp_gap_linf_max": {
          "perfect_predictor": np.max(diff)
      },
      "dp_gap_l1_max": {
          "perfect_predictor": np.max(1 / 2 * np.sum(diff, axis=2))
      },
      "dp_gap_l1_avg": {
          # Average over unordered pairs of distinct groups only.
          "perfect_predictor":
              np.mean(1 / 2 * np.sum(diff, axis=2)[np.triu_indices(n_groups, 1)]
                     )
      },
  }

  display(pd.DataFrame(res))
  display(grouped / counts)
  display(pd.DataFrame(counts, columns=["Count"]).T)

  return train_groups, test_groups, n_groups

## Sensitive attribute "Sex"

In [5]:
sensitive_attributes = ["Sex"]
train_groups, test_groups, n_groups = get_group_labels_and_print_statistics(
    sensitive_attributes)

# Split the training rows into a part for fitting the predictor ("pre") and a
# part for fitting the post-processor ("post"). `model_selection` is imported
# explicitly rather than reached through the `sklearn` package attribute,
# which is only set once some other import has loaded the submodule.
(train_data_pre, train_data_post,
 train_labels_pre, train_labels_post,
 train_groups_pre, train_groups_post) = model_selection.train_test_split(
     train_data,
     train_labels,
     train_groups,
     test_size=split_ratio_for_postprocessing,
     random_state=seed)
# One-hot targets for the regression-based predictor.
train_labels_pre_one_hot = pd.get_dummies(train_labels_pre)

Demographic groups: 'Female', 'Male'


Unnamed: 0,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
perfect_postprocessed,0.902742,,,
perfect_predictor,,0.194516,0.194516,0.194516


Group,Female,Male
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,0.890749,0.696233
>50K,0.109251,0.303767


Group,Female,Male
Count,16192,32650


### Linear regression via OLS

In [6]:
# Fit OLS on the one-hot labels of the "pre" split.
predictor = linear_model.LinearRegression()
predictor.fit(train_data_pre, train_labels_pre_one_hot)


def predict_fn(X):
  """Return the OLS outputs for X passed through `projection_simplex` (axis 1)."""
  return projection_simplex(predictor.predict(X), axis=1)


# Fit the post-processor on the "post" split, then evaluate on the test split.
postprocessor = postprocess.postprocess(
    predict_fn, train_data_post, train_groups_post)
res = postprocess.evaluate(
    predict_fn, postprocessor, test_data, test_labels, test_groups,
    n_labels, n_groups)
display(pd.DataFrame(res))

Unnamed: 0,accuracy,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
predictor,0.843929,0.863517,0.155611,0.155611,0.155611
postprocessor,0.820957,0.843434,9e-05,9e-05,9e-05


In [7]:
# Same OLS predictor as before, refitted so this cell is self-contained.
predictor = linear_model.LinearRegression()
predictor.fit(train_data_pre, train_labels_pre_one_hot)


def predict_fn(X):
  """Return the OLS outputs for X passed through `projection_simplex` (axis 1)."""
  return projection_simplex(predictor.predict(X), axis=1)


# This time `noise_fn` is supplied: 20 perturbations when fitting the
# post-processor and 1000 when evaluating.
postprocessor = postprocess.postprocess(
    predict_fn, train_data_post, train_groups_post,
    noise_fn=noise_fn, n_perturbations=20)
res = postprocess.evaluate(
    predict_fn, postprocessor, test_data, test_labels, test_groups,
    n_labels, n_groups,
    noise_fn=noise_fn, n_perturbations=1000)
print("With Laplace smoothing:")
display(pd.DataFrame(res))

With Laplace smoothing:


Unnamed: 0,accuracy,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
predictor,0.829925,0.850955,0.162475,0.162475,0.162475
postprocessor,0.813439,0.83422,7.2e-05,7.2e-05,7.2e-05


### Logistic regression

In [8]:
# Logistic regression is trained on the integer labels; its class
# probabilities are used directly as the prediction function.
predictor = linear_model.LogisticRegression()
predictor.fit(train_data_pre, train_labels_pre)
predict_fn = predictor.predict_proba

# Fit the post-processor on the "post" split, then evaluate on the test split.
postprocessor = postprocess.postprocess(
    predict_fn, train_data_post, train_groups_post)
res = postprocess.evaluate(
    predict_fn, postprocessor, test_data, test_labels, test_groups,
    n_labels, n_groups)
display(pd.DataFrame(res))

Unnamed: 0,accuracy,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
predictor,0.853203,0.872179,0.181922,0.181922,0.181922
postprocessor,0.830784,0.850986,0.004358,0.004358,0.004358


## Sensitive attribute "Race"

In [9]:
sensitive_attributes = ["Race"]
train_groups, test_groups, n_groups = get_group_labels_and_print_statistics(
    sensitive_attributes)

# Split the training rows into a part for fitting the predictor ("pre") and a
# part for fitting the post-processor ("post"). `model_selection` is imported
# explicitly rather than reached through the `sklearn` package attribute,
# which is only set once some other import has loaded the submodule.
(train_data_pre, train_data_post,
 train_labels_pre, train_labels_post,
 train_groups_pre, train_groups_post) = model_selection.train_test_split(
     train_data,
     train_labels,
     train_groups,
     test_size=split_ratio_for_postprocessing,
     random_state=seed)
# One-hot targets for the regression-based predictor.
train_labels_pre_one_hot = pd.get_dummies(train_labels_pre)

Demographic groups: 'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'


Unnamed: 0,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
perfect_postprocessed,0.942918,,,
perfect_predictor,,0.152235,0.152235,0.087529


Group,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<=50K,0.882979,0.730744,0.879189,0.876847,0.746013
>50K,0.117021,0.269256,0.120811,0.123153,0.253987


Group,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
Count,470,1519,4685,406,41762


### Linear regression via OLS

In [10]:
# Fit OLS on the one-hot labels of the "pre" split.
predictor = linear_model.LinearRegression()
predictor.fit(train_data_pre, train_labels_pre_one_hot)


def predict_fn(X):
  """Return the OLS outputs for X passed through `projection_simplex` (axis 1)."""
  return projection_simplex(predictor.predict(X), axis=1)


# Fit the post-processor on the "post" split, then evaluate on the test split.
postprocessor = postprocess.postprocess(
    predict_fn, train_data_post, train_groups_post)
res = postprocess.evaluate(
    predict_fn, postprocessor, test_data, test_labels, test_groups,
    n_labels, n_groups)
display(pd.DataFrame(res))

Unnamed: 0,accuracy,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
predictor,0.843929,0.869696,0.166392,0.166392,0.088722
postprocessor,0.825379,0.855494,0.023611,0.023611,0.010113


In [11]:
# Same OLS predictor as before, refitted so this cell is self-contained.
predictor = linear_model.LinearRegression()
predictor.fit(train_data_pre, train_labels_pre_one_hot)


def predict_fn(X):
  """Return the OLS outputs for X passed through `projection_simplex` (axis 1)."""
  return projection_simplex(predictor.predict(X), axis=1)


# This time `noise_fn` is supplied: 20 perturbations when fitting the
# post-processor and 1000 when evaluating.
postprocessor = postprocess.postprocess(
    predict_fn, train_data_post, train_groups_post,
    noise_fn=noise_fn, n_perturbations=20)
res = postprocess.evaluate(
    predict_fn, postprocessor, test_data, test_labels, test_groups,
    n_labels, n_groups,
    noise_fn=noise_fn, n_perturbations=1000)
print("With Laplace smoothing:")
display(pd.DataFrame(res))

With Laplace smoothing:


Unnamed: 0,accuracy,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
predictor,0.829963,0.858885,0.146994,0.146994,0.08096
postprocessor,0.82162,0.850897,0.028707,0.028707,0.013827


### Logistic regression

In [12]:
# Logistic regression is trained on the integer labels; its class
# probabilities are used directly as the prediction function.
predictor = linear_model.LogisticRegression()
predictor.fit(train_data_pre, train_labels_pre)
predict_fn = predictor.predict_proba

# Fit the post-processor on the "post" split, then evaluate on the test split.
postprocessor = postprocess.postprocess(
    predict_fn, train_data_post, train_groups_post)
res = postprocess.evaluate(
    predict_fn, postprocessor, test_data, test_labels, test_groups,
    n_labels, n_groups)
display(pd.DataFrame(res))

Unnamed: 0,accuracy,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
predictor,0.853203,0.876868,0.166234,0.166234,0.093397
postprocessor,0.840305,0.868155,0.01755,0.01755,0.008864


## Sensitive attribute "Sex" + "Race"

In [13]:
sensitive_attributes = ["Sex", "Race"]
train_groups, test_groups, n_groups = get_group_labels_and_print_statistics(
    sensitive_attributes)

# Split the training rows into a part for fitting the predictor ("pre") and a
# part for fitting the post-processor ("post"). `model_selection` is imported
# explicitly rather than reached through the `sklearn` package attribute,
# which is only set once some other import has loaded the submodule.
(train_data_pre, train_data_post,
 train_labels_pre, train_labels_post,
 train_groups_pre, train_groups_post) = model_selection.train_test_split(
     train_data,
     train_labels,
     train_groups,
     test_size=split_ratio_for_postprocessing,
     random_state=seed)
# One-hot targets for the regression-based predictor.
train_labels_pre_one_hot = pd.get_dummies(train_labels_pre)

Demographic groups: 'Female, Amer-Indian-Eskimo', 'Female, Asian-Pac-Islander', 'Female, Black', 'Female, Other', 'Female, White', 'Male, Amer-Indian-Eskimo', 'Male, Asian-Pac-Islander', 'Male, Black', 'Male, Other', 'Male, White'


Unnamed: 0,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
perfect_postprocessed,0.932797,,,
perfect_predictor,,0.282129,0.282129,0.108358


Group,"Female, Amer-Indian-Eskimo","Female, Asian-Pac-Islander","Female, Black","Female, Other","Female, White","Male, Amer-Indian-Eskimo","Male, Asian-Pac-Islander","Male, Black","Male, Other","Male, White"
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
<=50K,0.918919,0.866538,0.942808,0.929032,0.88163,0.859649,0.660679,0.817417,0.844622,0.684531
>50K,0.081081,0.133462,0.057192,0.070968,0.11837,0.140351,0.339321,0.182583,0.155378,0.315469


Group,"Female, Amer-Indian-Eskimo","Female, Asian-Pac-Islander","Female, Black","Female, Other","Female, White","Male, Amer-Indian-Eskimo","Male, Asian-Pac-Islander","Male, Black","Male, Other","Male, White"
Count,185,517,2308,155,13027,285,1002,2377,251,28735


### Linear regression via OLS

In [14]:
# Fit OLS on the one-hot labels of the "pre" split.
predictor = linear_model.LinearRegression()
predictor.fit(train_data_pre, train_labels_pre_one_hot)


def predict_fn(X):
  """Return the OLS outputs for X passed through `projection_simplex` (axis 1)."""
  return projection_simplex(predictor.predict(X), axis=1)


# Fit the post-processor on the "post" split, then evaluate on the test split.
postprocessor = postprocess.postprocess(
    predict_fn, train_data_post, train_groups_post)
res = postprocess.evaluate(
    predict_fn, postprocessor, test_data, test_labels, test_groups,
    n_labels, n_groups)
display(pd.DataFrame(res))

Unnamed: 0,accuracy,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
predictor,0.843929,0.882178,0.275081,0.275081,0.090627
postprocessor,0.810024,0.864512,0.059331,0.059331,0.020043


In [15]:
# Same OLS predictor as before, refitted so this cell is self-contained.
predictor = linear_model.LinearRegression()
predictor.fit(train_data_pre, train_labels_pre_one_hot)


def predict_fn(X):
  """Return the OLS outputs for X passed through `projection_simplex` (axis 1)."""
  return projection_simplex(predictor.predict(X), axis=1)


# This time `noise_fn` is supplied: 20 perturbations when fitting the
# post-processor and 1000 when evaluating.
postprocessor = postprocess.postprocess(
    predict_fn, train_data_post, train_groups_post,
    noise_fn=noise_fn, n_perturbations=20)
res = postprocess.evaluate(
    predict_fn, postprocessor, test_data, test_labels, test_groups,
    n_labels, n_groups,
    noise_fn=noise_fn, n_perturbations=1000)
print("With Laplace smoothing:")
display(pd.DataFrame(res))

With Laplace smoothing:


Unnamed: 0,accuracy,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
predictor,0.830006,0.87226,0.258626,0.258626,0.089999
postprocessor,0.807317,0.853878,0.066963,0.066963,0.022788


### Logistic regression

In [16]:
# Logistic regression is trained on the integer labels; its class
# probabilities are used directly as the prediction function.
predictor = linear_model.LogisticRegression()
predictor.fit(train_data_pre, train_labels_pre)
predict_fn = predictor.predict_proba

# Fit the post-processor on the "post" split, then evaluate on the test split.
postprocessor = postprocess.postprocess(
    predict_fn, train_data_post, train_groups_post)
res = postprocess.evaluate(
    predict_fn, postprocessor, test_data, test_labels, test_groups,
    n_labels, n_groups)
display(pd.DataFrame(res))

Unnamed: 0,accuracy,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
predictor,0.853203,0.889084,0.285819,0.285819,0.103625
postprocessor,0.820158,0.867761,0.056353,0.056353,0.01868
