- adult(1): sex (q=1, 2 groups)
- adult(2): sex + race (q=2, 4 groups)
- adult(3): sex + race + age (q=3, 8 groups)
- adult(4): sex + race + age + marital (q=4, 16 groups)

In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

RAW_PATH = 'raw/adult/adult.data'
df = pd.read_csv(RAW_PATH, header=None, skipinitialspace=True)

# Columns: 0=age, 1=workclass, 2=fnlwgt, 3=education, 4=education-num,
#          5=marital-status, 6=occupation, 7=relationship, 8=race,
#          9=sex, 10=capital-gain, 11=capital-loss, 12=hours-per-week,
#          13=native-country, 14=income

# y: income label
y = (df[14].str.strip() == '<=50K').astype(int).values

# X: numerical features (age, fnlwgt, education-num, capital-gain, hours-per-week)
X_raw = df[[0, 2, 4, 10, 12]].values.astype(float)
X = StandardScaler().fit_transform(X_raw)

print(f'X: {X.shape}, y: {y.shape}')

X: (32561, 5), y: (32561,)


In [6]:
# Sensitive attributes (binary)
sex = (df[9].str.strip() == 'Male').astype(int).values
race = (df[8].str.strip() == 'White').astype(int).values
age = (df[0] >= 40).astype(int).values
marital = df[5].str.strip().isin(['Married-civ-spouse', 'Married-AF-spouse']).astype(int).values

print(f'sex: {np.bincount(sex)}')
print(f'race: {np.bincount(race)}')
print(f'age: {np.bincount(age)}')
print(f'marital: {np.bincount(marital)}')

sex: [10771 21790]
race: [ 4745 27816]
age: [18324 14237]
marital: [17562 14999]


In [7]:
# Save for each q
configs = {
    'adult(1)': [sex],
    'adult(2)': [sex, race],
    'adult(3)': [sex, race, age],
    'adult(4)': [sex, race, age, marital],
}

for name, attrs in configs.items():
    os.makedirs(name, exist_ok=True)
    S = np.column_stack(attrs)  # (n, q)
    
    np.save(f'{name}/X.npy', X.astype(np.float32))
    np.save(f'{name}/y.npy', y)
    np.save(f'{name}/S.npy', S)
    
    n_groups = len(np.unique([tuple(row) for row in S]))
    print(f'{name}: q={S.shape[1]}, {n_groups} groups, n={len(X)}')

adult(1): q=1, 2 groups, n=32561
adult(2): q=2, 2 groups, n=32561
adult(3): q=3, 2 groups, n=32561
adult(3): q=3, 2 groups, n=32561
adult(4): q=4, 2 groups, n=32561
adult(4): q=4, 2 groups, n=32561


In [8]:
# Verify
for name in configs.keys():
    X_l = np.load(f'{name}/X.npy')
    y_l = np.load(f'{name}/y.npy')
    S_l = np.load(f'{name}/S.npy')
    print(f'{name}: X={X_l.shape}, y={y_l.shape}, S={S_l.shape}')

adult(1): X=(32561, 5), y=(32561,), S=(32561, 1)
adult(2): X=(32561, 5), y=(32561,), S=(32561, 2)
adult(3): X=(32561, 5), y=(32561,), S=(32561, 3)
adult(4): X=(32561, 5), y=(32561,), S=(32561, 4)
