In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from folktables import ACSDataSource, ACSPublicCoverage

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
protected_attributes = [
    'SEX',
    'DIS',
    'NATIVITY',
    'DEAR',
    'DEYE',
    'MIG',
    'MIL',
    'AGEP',
    'DREM',
    'MAR',
]

In [8]:
data_source = ACSDataSource(survey_year='2018', horizon='5-Year', survey='person')

In [9]:
acs_data = data_source.get_data(join_household=True)

: 

In [7]:
len(acs_data)

KeyError: 0

In [3]:
features, label, group = ACSPublicCoverage.df_to_pandas(acs_data)

KeyError: 'AGEP'

In [6]:
# save the loaded data into a csv file
# features.to_csv('./my_data/features.csv', index=False)
# label.to_csv('./my_data/label.csv', index=False)
# group.to_csv('./my_data/group.csv', index=False)

In [3]:
# load the pandas data frames
features = pd.read_csv('./my_data/features.csv')
label = pd.read_csv('./my_data/label.csv')
group = pd.read_csv('./my_data/group.csv')

In [7]:
# load as numpy
features_np, label_np, group_np = ACSPublicCoverage.df_to_numpy(acs_data)

In [8]:
# save the numpy files
np.save('./my_data/features.npy', features_np)
np.save('./my_data/label.npy', label_np)
np.save('./my_data/group.npy', group_np)

In [18]:
for protect_attr in protected_attributes[:5]:
    print(f'Protected attribute: {protect_attr}')
    # Calculate prob_attr = P(protect_attr = 1) when attr can take 1 or 2
    prob_attr_1 = np.mean(features[protect_attr] == 1)
    prob_attr_0 = np.mean(features[protect_attr] == 2)
    print(f'{protect_attr} = 1: {prob_attr_1:.2f} and {protect_attr} = 2: {prob_attr_0:.2f}')

Protected attribute: SEX
SEX = 1: 0.43 and SEX = 2: 0.57
Protected attribute: DIS
DIS = 1: 0.15 and DIS = 2: 0.85
Protected attribute: NATIVITY
NATIVITY = 1: 0.85 and NATIVITY = 2: 0.15
Protected attribute: DEAR
DEAR = 1: 0.02 and DEAR = 2: 0.98
Protected attribute: DEYE
DEYE = 1: 0.03 and DEYE = 2: 0.97


In [27]:
# Calculate prob_attr = P(protect_attr = 1 or 4) when attr can take 0, 1, 2, 3, 4
protect_attr = 'MIL'
prob_attr_1 = np.mean((features[protect_attr] == 1) | (features[protect_attr] == 4))
prob_attr_0 = np.mean((features[protect_attr] == 0) | (features[protect_attr] == 2) | (features[protect_attr] == 3))
print(f'{protect_attr} = 1 or 4: {prob_attr_1:.2f} and {protect_attr} = 0, 2 or 3: {prob_attr_0:.2f}')

MIL = 1 or 4: 0.90 and MIL = 0, 2 or 3: 0.10


In [33]:
# Calculate prob_attr = P(protect_attr = 1) when attr can take 0, 1, 2, 3
protect_attr = 'MIG'
prob_attr_1 = np.mean(features[protect_attr] == 1)
prob_attr_0 = np.mean(features[protect_attr].isin([0, 2, 3]))
print(f'{protect_attr} = 1: {prob_attr_1:.2f} and {protect_attr} = 0, 2 or 3: {prob_attr_0:.2f}')

MIG = 1: 0.82 and MIG = 0, 2 or 3: 0.18


In [29]:
# Calculate prob_attr = P(protect_attr = 1) by thresholding at age 25
protect_attr = 'AGEP'
prob_attr_1 = np.mean(features[protect_attr] > 25)
prob_attr_0 = np.mean(features[protect_attr] <= 25)
print(f'{protect_attr} > 25: {prob_attr_1:.2f} and {protect_attr} <= 25: {prob_attr_0:.2f}')

AGEP > 25: 0.66 and AGEP <= 25: 0.34


In [32]:
# Calculate prob_attr = P(protect_attr = 1) when attr can take 1, 2, 3, 4, 5
protect_attr = 'MAR'
prob_attr_1 = np.mean(features[protect_attr] == 1)
prob_attr_0 = np.mean(features[protect_attr].isin([2, 3, 4, 5]))
print(f'{protect_attr} = 1: {prob_attr_1:.2f} and {protect_attr} = 0, 2, 3 or 4: {prob_attr_0:.2f}')

MAR = 1: 0.37 and MAR = 0, 2, 3 or 4: 0.63


In [35]:
# Calculate prob_attr = P(protect_attr = 1) when attr can take 0, 1, 2
protect_attr = 'DREM'
prob_attr_1 = np.mean(features[protect_attr] == 1)
prob_attr_0 = np.mean(features[protect_attr].isin([0,2]))
print(f'{protect_attr} = 1: {prob_attr_1:.2f} and {protect_attr} = 0, 2: {prob_attr_0:.2f}')

DREM = 1: 0.08 and DREM = 0, 2: 0.92


In [5]:
model = make_pipeline(StandardScaler(), GradientBoostingClassifier(loss='exponential', n_estimators=5, max_depth=5))

In [6]:
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features, label, group, test_size=0.2, random_state=0)

In [7]:
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [None]:
def load_dataset():
    # load the pandas data frames
    features = pd.read_csv('./my_data/features.csv')
    label = pd.read_csv('./my_data/label.csv')
    return features, label

In [8]:
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7127843605199977


In [60]:
def demographic_parity(samples, y, attribute):
    # Calculate demographic parity for 'attribute'

    binary_attributes = ['SEX','DIS','NATIVITY','DEAR','DEYE']

    n = len(samples)

    if attribute in binary_attributes:
        # if the auditor doesn't test all subpopulations, we set that the demographic parity is null
        if not (0 < y[samples[attribute] == 1].sum().item() < n) or not (0 < y[samples[attribute] == 2].sum().item() < n):
            return 0

        prob_y_given_attribute_1 = y[samples[attribute] == 1].mean().item()  # P(y=1|attribute=1)
        prob_y_given_attribute_0 = y[samples[attribute] == 2].mean().item()  # P(y=1|attribute=0)

    elif attribute == 'MIL':
        prob_y_given_attribute_1 = y[samples[attribute].isin([1,4])].mean().item()
        prob_y_given_attribute_0 = y[samples[attribute].isin([0,2,3])].mean().item()

    elif attribute == 'MIG':
        prob_y_given_attribute_1 = y[samples[attribute] == 1].mean().item()
        prob_y_given_attribute_0 = y[samples[attribute].isin([0,2,3])].mean().item()
    
    elif attribute == 'AGEP':
        prob_y_given_attribute_1 = y[samples[attribute] > 25].mean().item()
        prob_y_given_attribute_0 = y[samples[attribute] <= 25].mean().item()
    
    elif attribute == 'MAR':
        prob_y_given_attribute_1 = y[samples[attribute] == 1].mean().item()
        prob_y_given_attribute_0 = y[samples[attribute].isin([2,3,4,5])].mean().item()

    elif attribute == 'DREM':
        prob_y_given_attribute_1 = y[samples[attribute] == 1].mean().item()
        prob_y_given_attribute_0 = y[samples[attribute].isin([0,2])].mean().item()

    else:
        raise ValueError('Attribute not supported')

    demographic_parity_attribute = abs(prob_y_given_attribute_1 - prob_y_given_attribute_0)
    return demographic_parity_attribute

In [49]:
protected_attributes = [
    'SEX',
    'DIS',
    'NATIVITY',
    'DEAR',
    'DEYE'
]

In [42]:
for attr in protected_attributes:
    dp = demographic_parity(features, label, attr)
    print(f'Demographic parity for {attr}: {dp:.2f}')

Demographic parity for SEX: 0.03
Demographic parity for DIS: 0.41
Demographic parity for NATIVITY: 0.05
Demographic parity for DEAR: 0.27
Demographic parity for DEYE: 0.31


In [61]:
demographic_parity(features, label, 'DREM')

0.4449089426372358

In [62]:
y_pred = model.predict(X_train)

In [66]:
y_pred.sum()

0

In [70]:
1.0 - label.mean().item()

0.7127970367941534

## Code for unbiasing DP estimation

In [2]:
from folk_tables import load_dataset, protected_attributes, class_mappings

In [91]:
X, y = load_dataset()

In [92]:
# Transform X based on class_mappings
X_transformed = X.copy()
for attr in protected_attributes:
    if attr == 'AGEP':
        X_transformed[attr] = X_transformed[attr].apply(lambda x: 1 if x > 25 else 0)
    else:
        class_mapping = class_mappings(attr)
        C1 = class_mapping[1] # list of values that are mapped to 1
        C0 = class_mapping[0] # list of values that are mapped to 0
        X_transformed[attr] = X_transformed[attr].apply(lambda x: 1 if x in C1 else 0)

In [83]:
from itertools import combinations

In [84]:
all_probs = dict()
n = 10

for k in range(2,3):
    print(f'Working on k={k}')
    all_probs[k] = dict()

    agent_combinations_list = list(combinations(range(n), k))

    for agent_combination in agent_combinations_list:
        agent_comb_str = ''.join([str(elem) for elem in agent_combination])
        
        all_probs[k][agent_comb_str] = dict()

        total_strings = 2**(k)
        binary_strings = [format(i, f'0{k}b') for i in range(total_strings)]

        attrs = [protected_attributes[i] for i in agent_combination]
        print(f'Working on {attrs}')
        for binary_string in binary_strings:
            
            pairs = [(attrs[i], int(binary_string[i])) for i in range(k)]

            # Restore X_transformed that satisfies the binary string
            X_temp = X_transformed
            for attr, val in pairs:
                X_temp = X_temp[X_temp[attr] == val]

            all_probs[k][agent_comb_str][binary_string] = len(X_temp) / len(X_transformed)


Working on k=2
Working on ['SEX', 'DIS']
Working on ['SEX', 'NATIVITY']
Working on ['SEX', 'DEAR']
Working on ['SEX', 'DEYE']
Working on ['SEX', 'MIG']
Working on ['SEX', 'MIL']
Working on ['SEX', 'AGEP']
Working on ['SEX', 'DREM']
Working on ['SEX', 'MAR']
Working on ['DIS', 'NATIVITY']
Working on ['DIS', 'DEAR']
Working on ['DIS', 'DEYE']
Working on ['DIS', 'MIG']
Working on ['DIS', 'MIL']
Working on ['DIS', 'AGEP']
Working on ['DIS', 'DREM']
Working on ['DIS', 'MAR']
Working on ['NATIVITY', 'DEAR']
Working on ['NATIVITY', 'DEYE']
Working on ['NATIVITY', 'MIG']
Working on ['NATIVITY', 'MIL']
Working on ['NATIVITY', 'AGEP']
Working on ['NATIVITY', 'DREM']
Working on ['NATIVITY', 'MAR']
Working on ['DEAR', 'DEYE']
Working on ['DEAR', 'MIG']
Working on ['DEAR', 'MIL']
Working on ['DEAR', 'AGEP']
Working on ['DEAR', 'DREM']
Working on ['DEAR', 'MAR']
Working on ['DEYE', 'MIG']
Working on ['DEYE', 'MIL']
Working on ['DEYE', 'AGEP']
Working on ['DEYE', 'DREM']
Working on ['DEYE', 'MAR']
Wo

In [11]:
# save the matrix
import pickle
with open('./my_data/all_probs_2.pkl', 'wb') as f:
    pickle.dump(all_probs, f)

In [90]:
d = all_probs[2]['25']

for k, v in d.items():
    print(k, v)

00 0.02566184940079252
01 0.12448811092246938
10 0.1549682966383366
11 0.6948817430384016


In [12]:
all_probs[2].keys()

dict_keys(['01', '02', '03', '04', '05', '06', '07', '08', '09', '12', '13', '14', '15', '16', '17', '18', '19', '23', '24', '25', '26', '27', '28', '29', '34', '35', '36', '37', '38', '39', '45', '46', '47', '48', '49', '56', '57', '58', '59', '67', '68', '69', '78', '79', '89'])

In [38]:
# print lengths
for k in range(1, n+1):
    # String with all zeros of length k
    all_zeros = ''.join(['0' for _ in range(k)])
    a_key = list(all_probs[k].keys())[0]
    print(f'k={k}: {len(all_probs[k])} {len(all_probs[k][a_key])}')
    
print(sum([len(all_probs[k]) for k in range(1, n+1)]))

k=1: 10 2
k=2: 45 4
k=3: 120 8
k=4: 210 16
k=5: 252 32
k=6: 210 64
k=7: 120 128
k=8: 45 256
k=9: 10 512
k=10: 1 1024
1023


In [1]:
from  folk_tables import load_dataset, protected_attributes, class_mappings, CS

In [2]:
X, y = load_dataset()

In [13]:
attr = 'SEX'
collaborators = [protected_attributes[i] for i in [2,3,4,5]]

In [14]:
subset, subset_y = CS(X, y, 100, attr, collaborators=collaborators)

All attributes: ['NATIVITY', 'DEAR', 'DEYE', 'MIG', 'SEX']
Processing 00000 with len 5
Len of subspace 0: 81547
Processing 00001 with len 5
Len of subspace 1: 66570
Processing 00010 with len 5
Len of subspace 2: 434697
Processing 00011 with len 5
Len of subspace 3: 279704
Processing 00100 with len 5
Len of subspace 4: 1094
Processing 00101 with len 5
Len of subspace 5: 988
Processing 00110 with len 5
Len of subspace 6: 7164
Processing 00111 with len 5
Len of subspace 7: 5054
Processing 01000 with len 5
Len of subspace 8: 548
Processing 01001 with len 5
Len of subspace 9: 663
Processing 01010 with len 5
Len of subspace 10: 3882
Processing 01011 with len 5
Len of subspace 11: 3534
Processing 01100 with len 5
Len of subspace 12: 170
Processing 01101 with len 5
Len of subspace 13: 250
Processing 01110 with len 5
Len of subspace 14: 1228
Processing 01111 with len 5
Len of subspace 15: 1279
Processing 10000 with len 5
Len of subspace 16: 454518
Processing 10001 with len 5
Len of subspace 17:

In [16]:
subset['NATIVITY'].mean()

1.5

## Print the heatmap

In [13]:
import pickle

In [99]:
g_p = '/home/dhasade/audits/ml-audits/results/new_matrices/gains_CS_seed5_budget500_repeat1.pkl'

In [100]:
error_matrix = pickle.load(open('/home/dhasade/audits/ml-audits/results/matrices/errors_CS_seed5_budget100_repeat1.pkl', 'rb'))
gain_matrix = pickle.load(open(g_p, 'rb'))

In [95]:
# Nicely print the error matrix
n = 5
for i in range(n):
    for j in range(n):
        print(f'{error_matrix[i][j]:.3f}', end=' ')
    print()

0.039 0.025 0.013 0.012 0.001 
0.005 0.001 0.054 0.026 0.249 
0.063 0.067 0.054 0.120 0.052 
0.033 0.147 0.158 0.032 0.170 
0.275 0.295 0.141 0.065 0.077 


In [101]:
# Nicely print the gain matrix
n = 5
for i in range(n):
    for j in range(n):
        print(f'{gain_matrix[i][j]:.3f}', end=' ')
    print()

1.000 0.829 0.618 5.427 1.544 
0.902 1.000 1.976 1.431 1.000 
7.164 2.419 1.000 4.708 1.817 
0.535 0.558 0.728 1.000 0.669 
0.257 0.380 0.159 0.089 1.000 


## Double checking dataframe calculations

In [72]:
X_tmp = X; y_tmp = y

In [73]:
attr = 'AGEP'

In [74]:
X_tmp = X_tmp[X[attr] > 50]

In [75]:
y_tmp = y_tmp.loc[X_tmp.index]

In [60]:
y_tmp.head()

Unnamed: 0,PUBCOV
0,False
4,False
5,False
6,False
17,True


In [61]:
X_tmp.head()

Unnamed: 0,AGEP,SCHL,MAR,SEX,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,PINCP,ESR,ST,FER,RAC1P
0,59.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0,0.0,6.0,1.0,0.0,1.0
4,51.0,16.0,1.0,1.0,2.0,0.0,1.0,1.0,3.0,1.0,1.0,2.0,2.0,2.0,24000.0,1.0,1.0,0.0,1.0
5,53.0,16.0,1.0,2.0,2.0,0.0,1.0,1.0,4.0,2.0,1.0,2.0,2.0,2.0,21000.0,1.0,1.0,0.0,1.0
6,51.0,16.0,1.0,1.0,2.0,0.0,1.0,1.0,4.0,4.0,1.0,2.0,2.0,2.0,28050.0,1.0,1.0,0.0,1.0
17,56.0,12.0,1.0,1.0,1.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,1.0,24000.0,6.0,1.0,0.0,2.0


In [76]:
X_tmp = X_tmp[X_tmp[attr] > 55]
y_tmp = y_tmp.loc[X_tmp.index]

In [79]:
X_tmp.index

Index([      0,      17,      22,      35,      36,      37,      43,      52,
            61,      63,
       ...
       5916510, 5916527, 5916528, 5916530, 5916532, 5916533, 5916537, 5916542,
       5916548, 5916563],
      dtype='int64', length=1039465)

In [80]:
y_tmp.index

Index([      0,      17,      22,      35,      36,      37,      43,      52,
            61,      63,
       ...
       5916510, 5916527, 5916528, 5916530, 5916532, 5916533, 5916537, 5916542,
       5916548, 5916563],
      dtype='int64', length=1039465)

In [4]:
n_attrs=3
n_subspaces = 2**n_attrs
binary_strings = [format(i, f'0{n_attrs}b') for i in range(n_subspaces)]

In [5]:
binary_strings

['000', '001', '010', '011', '100', '101', '110', '111']