In [4]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from folktables import ACSDataSource, ACSPublicCoverage

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Feature columns examined as protected attributes in the cells below.
# Order matters: a later cell slices the first five entries (`[:5]`) and
# treats them as attributes coded 1 or 2; the remaining five are each
# binarised in their own cell.
protected_attributes = [
    'SEX', 'DIS', 'NATIVITY', 'DEAR', 'DEYE',
    'MIG', 'MIL', 'AGEP', 'DREM', 'MAR',
]

In [3]:
# Configure the ACS source: 2018 survey year, 5-year horizon, person-level survey.
data_source = ACSDataSource(
    survey_year='2018',
    horizon='5-Year',
    survey='person',
)

# Fetch the records using get_data()'s default arguments.
# NOTE(review): no download flag is passed here -- presumably the raw files
# must already be available locally; confirm against the folktables docs.
acs_data = data_source.get_data()

In [5]:
# Derive the public-coverage task's features, label and group columns from the
# raw ACS frame as pandas objects (they are written out with .to_csv below).
(features, label, group) = ACSPublicCoverage.df_to_pandas(acs_data)

In [6]:
# Save the loaded pandas data as CSV files (row index omitted) under ./my_data.
# The output directory is created first: DataFrame.to_csv does not create
# missing parent directories, so on a fresh checkout this cell would otherwise
# fail after the expensive data load above.
os.makedirs('./my_data', exist_ok=True)
features.to_csv('./my_data/features.csv', index=False)
label.to_csv('./my_data/label.csv', index=False)
group.to_csv('./my_data/group.csv', index=False)

In [7]:
# Same task extraction as the pandas variant, but through df_to_numpy; the
# three results are written out with np.save in the next cell.
(features_np, label_np, group_np) = ACSPublicCoverage.df_to_numpy(acs_data)

In [8]:
# Save the numpy versions as .npy files under ./my_data.
# np.save opens the target path directly and fails if the parent directory is
# missing, so make sure it exists before writing.
os.makedirs('./my_data', exist_ok=True)
np.save('./my_data/features.npy', features_np)
np.save('./my_data/label.npy', label_np)
np.save('./my_data/group.npy', group_np)

In [18]:
# Report, for each of the first five protected attributes (coded 1 or 2),
# the fraction of rows taking each of the two values.
for protect_attr in protected_attributes[:5]:
    print(f'Protected attribute: {protect_attr}')
    # Mean of a boolean mask == fraction of rows where the mask is True.
    # Despite its name, prob_attr_0 holds the share of rows coded 2.
    prob_attr_1 = features[protect_attr].eq(1).mean()
    prob_attr_0 = features[protect_attr].eq(2).mean()
    print(f'{protect_attr} = 1: {prob_attr_1:.2f} and {protect_attr} = 2: {prob_attr_0:.2f}')

Protected attribute: SEX
SEX = 1: 0.43 and SEX = 2: 0.57
Protected attribute: DIS
DIS = 1: 0.15 and DIS = 2: 0.85
Protected attribute: NATIVITY
NATIVITY = 1: 0.85 and NATIVITY = 2: 0.15
Protected attribute: DEAR
DEAR = 1: 0.02 and DEAR = 2: 0.98
Protected attribute: DEYE
DEYE = 1: 0.03 and DEYE = 2: 0.97


In [27]:
# MIL takes values 0-4; binarise it as {1, 4} versus {0, 2, 3} and report the
# fraction of rows falling in each side.
protect_attr = 'MIL'
prob_attr_1 = features[protect_attr].isin([1, 4]).mean()
prob_attr_0 = features[protect_attr].isin([0, 2, 3]).mean()
print(f'{protect_attr} = 1 or 4: {prob_attr_1:.2f} and {protect_attr} = 0, 2 or 3: {prob_attr_0:.2f}')

MIL = 1 or 4: 0.90 and MIL = 0, 2 or 3: 0.10


In [33]:
# MIG takes values 0-3; binarise it as {1} versus {0, 2, 3} and report the
# fraction of rows falling in each side.
protect_attr = 'MIG'
prob_attr_1 = features[protect_attr].eq(1).mean()
prob_attr_0 = features[protect_attr].isin([0, 2, 3]).mean()
print(f'{protect_attr} = 1: {prob_attr_1:.2f} and {protect_attr} = 0, 2 or 3: {prob_attr_0:.2f}')

MIG = 1: 0.82 and MIG = 0, 2 or 3: 0.18


In [29]:
# AGEP is binarised with a cut at age 25: strictly above 25 versus 25 or below.
# Both sides are computed with their own comparison (rather than negating one
# mask) so any non-comparable entries count toward neither side.
protect_attr = 'AGEP'
prob_attr_1 = features[protect_attr].gt(25).mean()
prob_attr_0 = features[protect_attr].le(25).mean()
print(f'{protect_attr} > 25: {prob_attr_1:.2f} and {protect_attr} <= 25: {prob_attr_0:.2f}')

AGEP > 25: 0.66 and AGEP <= 25: 0.34


In [32]:
# MAR takes values 1-5; binarise it as {1} versus {2, 3, 4, 5} and report the
# fraction of rows falling in each side.
protect_attr = 'MAR'
prob_attr_1 = np.mean(features[protect_attr] == 1)
prob_attr_0 = np.mean(features[protect_attr].isin([2, 3, 4, 5]))
# The printed label names the values actually counted above (2-5); MAR has no
# 0 category in this coding.
print(f'{protect_attr} = 1: {prob_attr_1:.2f} and {protect_attr} = 2, 3, 4 or 5: {prob_attr_0:.2f}')

MAR = 1: 0.37 and MAR = 0, 2, 3 or 4: 0.63


In [35]:
# DREM takes values 0-2; binarise it as {1} versus {0, 2} and report the
# fraction of rows falling in each side.
protect_attr = 'DREM'
prob_attr_1 = features[protect_attr].eq(1).mean()
prob_attr_0 = features[protect_attr].isin([0, 2]).mean()
print(f'{protect_attr} = 1: {prob_attr_1:.2f} and {protect_attr} = 0, 2: {prob_attr_0:.2f}')

DREM = 1: 0.08 and DREM = 0, 2: 0.92
