In [1]:
from json import dumps
import os, urllib.request

from IPython.display import display
import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model, model_selection, preprocessing

from projection_simplex_vectorized import projection_simplex
import postprocess

# Share of each training fold that is held out from the base predictor and
# handed to the post-processing step instead.
split_ratio_for_postprocessing = 0.5

# A single seed is reused for the data splits and for the noise generator.
seed = 33
rng = np.random.default_rng(seed)


def noise_fn(shape):
  """Return zero-mean Laplace noise of the given shape, drawn from `rng`."""
  return rng.laplace(loc=0.0, scale=0.2 / 5, size=shape)

## Download and pre-process UCI Communities and Crime dataset

In [2]:
# Column names for the raw data file, in column order. They are passed as
# `names=` to `pd.read_csv` when the file is loaded.
features = """
    state county community communityname fold population householdsize
    racepctblack racePctWhite racePctAsian racePctHisp
    agePct12t21 agePct12t29 agePct16t24 agePct65up
    numbUrban pctUrban medIncome pctWWage pctWFarmSelf
    pctWInvInc pctWSocSec pctWPubAsst pctWRetire medFamInc
    perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap
    OtherPerCap HispPerCap NumUnderPov PctPopUnderPov
    PctLess9thGrade PctNotHSGrad PctBSorMore PctUnemployed
    PctEmploy PctEmplManu PctEmplProfServ PctOccupManu
    PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv
    TotalPctDiv PersPerFam PctFam2Par PctKids2Par
    PctYoungKids2Par PctTeen2Par PctWorkMomYoungKids PctWorkMom
    NumIlleg PctIlleg NumImmig PctImmigRecent PctImmigRec5
    PctImmigRec8 PctImmigRec10 PctRecentImmig PctRecImmig5
    PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly PctNotSpeakEnglWell
    PctLargHouseFam PctLargHouseOccup PersPerOccupHous
    PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup
    PctPersDenseHous PctHousLess3BR MedNumBR HousVacant
    PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos
    MedYrHousBuilt PctHousNoPhone PctWOFullPlumb OwnOccLowQuart
    OwnOccMedVal OwnOccHiQuart RentLowQ RentMedian RentHighQ
    MedRent MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg
    NumInShelters NumStreet PctForeignBorn PctBornSameState
    PctSameHouse85 PctSameCity85 PctSameState85 LemasSwornFT
    LemasSwFTPerPop LemasSwFTFieldOps LemasSwFTFieldPerPop
    LemasTotalReq LemasTotReqPerPop PolicReqPerOffic PolicPerPop
    RacialMatchCommPol PctPolicWhite PctPolicBlack PctPolicHisp
    PctPolicAsian PctPolicMinor OfficAssgnDrugUnits
    NumKindsDrugsSeiz PolicAveOTWorked LandArea PopDens
    PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr
    LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop
    ViolentCrimesPerPop
""".split()

# Local cache for the raw data file.
# NOTE(review): the cache file is named "adult.data" although the URL below
# fetches communities.data; the name is kept so existing caches stay valid.
data_path = "data/communities/adult.data"
if not os.path.exists(data_path):
  os.makedirs("data/communities", exist_ok=True)
  # Download under a temporary name and rename only on success, so that an
  # interrupted transfer cannot leave a truncated file that later runs would
  # mistake for a complete cache.
  partial_path = data_path + ".part"
  urllib.request.urlretrieve(
      "https://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data",
      partial_path)
  os.replace(partial_path, data_path)

# "?" entries are parsed as missing values.
original = pd.read_csv(data_path,
                       names=features,
                       sep=r",",
                       engine="python",
                       na_values="?")

# Drop community name, state, and county, and columns with missing values
original = original.drop(["communityname", "state", "county"],
                         axis=1).dropna(axis=1)

In [3]:
def data_transform(df):
  """One-hot encode non-numeric columns, then standardize every column.

  Args:
    df: input DataFrame.

  Returns:
    A DataFrame with the same index as `df`, whose columns are those produced
    by `pd.get_dummies(df)` after scaling with a `StandardScaler` fitted on
    `df` itself.
  """
  encoded = pd.get_dummies(df)
  standardized = preprocessing.StandardScaler().fit_transform(encoded)
  return pd.DataFrame(standardized, columns=encoded.columns, index=df.index)


# Discretize the ViolentCrimesPerPop column into 5 equal-width bins
labels_original = pd.cut(original["ViolentCrimesPerPop"], 5)

# Sensitive attribute: the minority column with the largest value in each row,
# overridden by "white" wherever racePctWhite exceeds 0.95
minority_columns = ["racePctHisp", "racePctAsian", "racepctblack"]
minority_names = np.array(["hispanic", "asian", "black"])
largest_minority = original[minority_columns].to_numpy().argmax(axis=1)
minority_presence = minority_names[largest_minority]
minority_presence[original["racePctWhite"] > 0.95] = "white"
original["MinorityPresence"] = minority_presence

# Integer class ids (positions in the sorted array of distinct bins)
label_names, labels = np.unique(labels_original, return_inverse=True)
n_labels = len(label_names)

# Model inputs: everything except the target and the fold id, standardized
data = data_transform(original.drop(["ViolentCrimesPerPop", "fold"], axis=1))

In [4]:
def get_group_labels_and_print_statistics(sensitive_attributes):
  """Encode demographic groups as integer ids and display dataset statistics.

  Reads the module-level `original` frame and `labels_original` series.
  Prints the group names and displays three tables: summary metrics computed
  from the per-group label distributions, the per-group label distributions
  themselves, and the number of rows in each group.

  Args:
    sensitive_attributes: list of column names of `original`; each distinct
      combination of their (stringified) values is one demographic group.

  Returns:
    A tuple `(groups, n_groups)`. `groups` is a 1-D integer array with one
    entry per row of `original`, indexing into the sorted distinct attribute
    combinations. `n_groups` is the number of group columns in the
    label-by-group count table.
  """
  group_names, groups = np.unique(
      original[sensitive_attributes].to_numpy().astype(str),
      return_inverse=True,
      axis=0)
  # NumPy 2.0.0 returned a non-1-D inverse when `axis` is given; flattening
  # keeps `groups` usable as a per-row index on every NumPy version.
  groups = groups.reshape(-1)
  print("Demographic groups:",
        ', '.join(["'" + ', '.join(n) + "'" for n in group_names]))

  # Compute dataset statistics
  df = original.copy()
  df["Target"] = labels_original
  # Build the group label from stringified values, matching how the group ids
  # above are derived, so non-string attributes can be combined as well.
  group_column = original[sensitive_attributes[0]].astype(str)
  for attribute in sensitive_attributes[1:]:
    group_column = group_column + ", " + original[attribute].astype(str)
  df["Group"] = group_column
  # "Target" is categorical; `observed=False` spells out the behavior this
  # code has always relied on (all bins kept) instead of the deprecated
  # implicit default.
  grouped = df.groupby(["Target", "Group"], observed=False).size().unstack()
  n_labels = len(grouped.index)
  n_groups = len(grouped.columns)
  counts = grouped.sum(axis=0)
  # Rows: groups; columns: label frequencies within that group.
  normalized = np.nan_to_num((grouped.to_numpy() / counts.to_numpy())).T
  # Absolute per-label differences between every pair of groups.
  diff = np.abs(normalized[:, None, :] - normalized[None, :, :])
  # Half the L1 distance between the label distributions of each group pair.
  l1_gaps = 1 / 2 * np.sum(diff, axis=2)
  # NOTE(review): the meaning of `PostProcessor.fit`'s arguments and of
  # `score_` is defined in the `postprocess` module, not visible here.
  postprocessor = postprocess.PostProcessor()
  postprocessor.fit(
      np.concatenate([np.eye(n_labels) for _ in range(n_groups)], axis=0),
      np.repeat(np.arange(n_groups), n_labels), normalized.flatten())
  res = {
      "balanced_accuracy": {
          "perfect_postprocessed": (n_groups - postprocessor.score_) / n_groups
      },
      "dp_gap_linf_max": {
          "perfect_predictor": np.max(diff)
      },
      "dp_gap_l1_max": {
          "perfect_predictor": np.max(l1_gaps)
      },
      "dp_gap_l1_avg": {
          # Average over distinct group pairs only (strict upper triangle).
          "perfect_predictor": np.mean(l1_gaps[np.triu_indices(n_groups, 1)])
      },
  }

  display(pd.DataFrame(res))
  display(grouped / counts)
  display(pd.DataFrame(counts, columns=["Count"]).T)

  return groups, n_groups

In [5]:
# Use the derived "MinorityPresence" column as the only sensitive attribute;
# the call prints the group names and displays the dataset statistics.
sensitive_attributes = ["MinorityPresence"]
groups, n_groups = get_group_labels_and_print_statistics(sensitive_attributes)

Demographic groups: 'asian', 'black', 'hispanic', 'white'


Unnamed: 0,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
perfect_postprocessed,0.766781,,,
perfect_predictor,,0.581516,0.581516,0.363946


Group,asian,black,hispanic,white
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-0.001, 0.2]",0.768683,0.340625,0.417323,0.922141
"(0.2, 0.4]",0.156584,0.279687,0.367454,0.060827
"(0.4, 0.6]",0.032028,0.185938,0.131234,0.012165
"(0.6, 0.8]",0.02847,0.092188,0.049869,0.002433
"(0.8, 1.0]",0.014235,0.101562,0.034121,0.002433


Group,asian,black,hispanic,white
Count,562,640,381,411


## Post-process

### Linear regression via OLS

In [6]:
all_res = []

# Cross-validate over the folds given by the dataset's "fold" column
print("Fold...", end="", flush=True)
for fold in original["fold"].unique():
  # Progress indicator, printed without a newline
  print(f"{fold} ", end="", flush=True)

  # Compute the held-out mask once; it selects rows of the feature frame and
  # of both per-row arrays.
  is_test = (original["fold"] == fold).to_numpy()
  train_data, test_data = data[~is_test], data[is_test]
  train_labels, test_labels = labels[~is_test], labels[is_test]
  train_groups, test_groups = groups[~is_test], groups[is_test]

  # Split the training fold: one part fits the predictor, the other part is
  # passed to the post-processing step
  (train_data_pre, train_data_post, train_labels_pre, train_labels_post,
   train_groups_pre,
   train_groups_post) = sklearn.model_selection.train_test_split(
       train_data,
       train_labels,
       train_groups,
       test_size=split_ratio_for_postprocessing,
       random_state=seed)

  # Fixed-width one-hot targets: the regressor always outputs n_labels scores,
  # even if some class does not occur in this particular split.
  train_labels_pre_one_hot = np.eye(n_labels)[train_labels_pre]

  predictor = sklearn.linear_model.LinearRegression()
  predictor.fit(train_data_pre, train_labels_pre_one_hot)
  # Project the raw regression outputs onto the probability simplex
  predict_fn = lambda X: projection_simplex(predictor.predict(X), axis=1)
  postprocessor = postprocess.postprocess(predict_fn, train_data_post,
                                          train_groups_post)
  all_res.append(
      postprocess.evaluate(predict_fn, postprocessor, test_data, test_labels,
                           test_groups, n_labels, n_groups))
print()

# Average every metric over the folds
res = {
    metric: {
        model: np.mean([fold_res[metric][model] for fold_res in all_res])
        for model in all_res[0][metric]
    } for metric in all_res[0]
}
display(pd.DataFrame(res))

Fold...1 2 3 4 5 6 7 8 9 10 


Unnamed: 0,accuracy,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
predictor,0.66151,0.673644,0.532059,0.532059,0.338814
postprocessor,0.597317,0.607205,0.12533,0.140592,0.088431


In [7]:
all_res = []
print("With Laplace smoothing:")

# Cross-validate over the folds given by the dataset's "fold" column
print("Fold...", end="", flush=True)
for fold in original["fold"].unique():
  # Progress indicator, printed without a newline
  print(f"{fold} ", end="", flush=True)

  # Compute the held-out mask once; it selects rows of the feature frame and
  # of both per-row arrays.
  is_test = (original["fold"] == fold).to_numpy()
  train_data, test_data = data[~is_test], data[is_test]
  train_labels, test_labels = labels[~is_test], labels[is_test]
  train_groups, test_groups = groups[~is_test], groups[is_test]

  # Split the training fold: one part fits the predictor, the other part is
  # passed to the post-processing step
  (train_data_pre, train_data_post, train_labels_pre, train_labels_post,
   train_groups_pre,
   train_groups_post) = sklearn.model_selection.train_test_split(
       train_data,
       train_labels,
       train_groups,
       test_size=split_ratio_for_postprocessing,
       random_state=seed)

  # Fixed-width one-hot targets: the regressor always outputs n_labels scores,
  # even if some class does not occur in this particular split.
  train_labels_pre_one_hot = np.eye(n_labels)[train_labels_pre]

  predictor = sklearn.linear_model.LinearRegression()
  predictor.fit(train_data_pre, train_labels_pre_one_hot)
  # Project the raw regression outputs onto the probability simplex
  predict_fn = lambda X: projection_simplex(predictor.predict(X), axis=1)
  # Same pipeline as without noise, but `noise_fn` is supplied both when
  # fitting the post-processor and when evaluating it.
  postprocessor = postprocess.postprocess(predict_fn,
                                          train_data_post,
                                          train_groups_post,
                                          noise_fn=noise_fn,
                                          n_perturbations=10)
  all_res.append(
      postprocess.evaluate(predict_fn,
                           postprocessor,
                           test_data,
                           test_labels,
                           test_groups,
                           n_labels,
                           n_groups,
                           noise_fn=noise_fn,
                           n_perturbations=1000))
print()

# Average every metric over the folds
res = {
    metric: {
        model: np.mean([fold_res[metric][model] for fold_res in all_res])
        for model in all_res[0][metric]
    } for metric in all_res[0]
}
display(pd.DataFrame(res))

With Laplace smoothing:
Fold...1 2 3 4 5 6 7 8 9 10 


Unnamed: 0,accuracy,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
predictor,0.661948,0.674289,0.532238,0.532238,0.340846
postprocessor,0.598172,0.608269,0.108782,0.119111,0.074183


### Logistic regression

In [8]:
all_res = []

# Cross-validate over the folds given by the dataset's "fold" column
print("Fold...", end="", flush=True)
for fold in original["fold"].unique():
  # Progress indicator, printed without a newline
  print(f"{fold} ", end="", flush=True)

  # Compute the held-out mask once; it selects rows of the feature frame and
  # of both per-row arrays.
  is_test = (original["fold"] == fold).to_numpy()
  train_data, test_data = data[~is_test], data[is_test]
  train_labels, test_labels = labels[~is_test], labels[is_test]
  train_groups, test_groups = groups[~is_test], groups[is_test]

  # Split the training fold with the same configured ratio as the other
  # experiments (previously a hard-coded 0.5): one part fits the predictor,
  # the other part is passed to the post-processing step
  (train_data_pre, train_data_post, train_labels_pre, train_labels_post,
   train_groups_pre,
   train_groups_post) = sklearn.model_selection.train_test_split(
       train_data,
       train_labels,
       train_groups,
       test_size=split_ratio_for_postprocessing,
       random_state=seed)

  # Logistic regression is fitted on the integer class ids directly and
  # already yields class probabilities, so no one-hot targets are needed.
  predictor = sklearn.linear_model.LogisticRegression(max_iter=500)
  predictor.fit(train_data_pre, train_labels_pre)
  predict_fn = predictor.predict_proba
  postprocessor = postprocess.postprocess(predict_fn, train_data_post,
                                          train_groups_post)
  all_res.append(
      postprocess.evaluate(predict_fn, postprocessor, test_data, test_labels,
                           test_groups, n_labels, n_groups))
print()

# Average every metric over the folds
res = {
    metric: {
        model: np.mean([fold_res[metric][model] for fold_res in all_res])
        for model in all_res[0][metric]
    } for metric in all_res[0]
}
display(pd.DataFrame(res))

Fold...1 2 3 4 5 6 7 8 9 10 


Unnamed: 0,accuracy,balanced_accuracy,dp_gap_linf_max,dp_gap_l1_max,dp_gap_l1_avg
predictor,0.67053,0.682312,0.592612,0.592612,0.384957
postprocessor,0.592304,0.598953,0.132564,0.156615,0.105621
