In [2]:
import numpy as np
import pandas as pd

from os import path
from urllib import request

In [3]:
# Column names
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship',
    'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'
]

# Income Label Mappings
train_labels_map = {'<=50K': 0, '>50K': 1}
test_labels_map = {'<=50K.': 0, '>50K.': 1}

In [4]:
def _check_and_download(filepath, url):
    if not path.exists(filepath):
        request.urlretrieve(url, filepath)

In [5]:
data_dir = "data"
train_data_file = path.join(data_dir, "adult.data")
test_data_file = path.join(data_dir, "adult.test")

In [8]:
# Read CSV Files

In [16]:
train_dataset = pd.read_csv(train_data_file, sep=',', header=None, names=column_names)
test_dataset = pd.read_csv(test_data_file, sep=',', header=0, names=column_names)         # The first line contains a random comment

In [17]:
train_dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [19]:
test_dataset.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0
mean,38.767459,189435.7,10.072907,1081.905104,87.899269,40.392236
std,13.849187,105714.9,2.567545,7583.935968,403.105286,12.479332
min,17.0,13492.0,1.0,0.0,0.0,1.0
25%,28.0,116736.0,9.0,0.0,0.0,40.0
50%,37.0,177831.0,10.0,0.0,0.0,40.0
75%,48.0,238384.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,3770.0,99.0


In [22]:
# Preprocess strings
train_dataset = train_dataset.applymap(lambda x: x.strip() if isinstance(x, str) else x)
test_dataset = test_dataset.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [31]:
train_dataset.replace(to_replace='?', value=np.nan, inplace=True)
test_dataset.replace(to_replace='?', value=np.nan, inplace=True)
train_dataset.dropna(axis=0, inplace=True)
test_dataset.dropna(axis=0, inplace=True)

In [33]:
# Encode Labels
train_dataset.replace(train_labels_map, inplace=True)
test_dataset.replace(test_labels_map, inplace=True)

In [35]:
test_dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,0


In [36]:
# Split Features and Labels
train_features, train_labels = train_dataset.drop('income', axis=1), train_dataset['income']
test_features, test_labels = test_dataset.drop('income', axis=1), test_dataset['income']

In [39]:
# Categorize whether a Column is a continuous variable or a categorical variable
continuous_vars = []
categorical_columns = []
for col in train_features.columns:
    if (train_features[col].isnull().sum() > 0):
        train_features.drop(col, axis=1, inplace=True)
    else:
        if (train_features[col].dtype == np.object):
            categorical_columns += [col]
        else:
            continuous_vars += [col]

In [44]:
# Emphasis on the sensitive attribute
sensitive_attribute = "sex"

sensitive_unique = train_features[sensitive_attribute].nunique()
print(sensitive_unique)

2


In [52]:
sensitive_train = np.logical_not(pd.Categorical(train_features[sensitive_attribute]).codes)

In [53]:
pd.Categorical(train_features[sensitive_attribute]).codes

array([1, 1, 1, ..., 0, 1, 0], dtype=int8)

In [56]:
sensitive_train

array([False, False, False, ...,  True, False,  True])

In [55]:
len(sensitive_train)

30162

In [60]:
pd.get_dummies(train_features, columns=categorical_columns, prefix_sep='=')

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass=Federal-gov,workclass=Local-gov,workclass=Private,workclass=Self-emp-inc,...,native_country=Portugal,native_country=Puerto-Rico,native_country=Scotland,native_country=South,native_country=Taiwan,native_country=Thailand,native_country=Trinadad&Tobago,native_country=United-States,native_country=Vietnam,native_country=Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
32557,40,154374,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
32558,58,151910,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
32559,22,201490,9,0,0,20,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [62]:
continuous_columns = [train_features.columns.get_loc(var) for var in continuous_vars]

In [63]:
continuous_columns

[0, 2, 4, 10, 11, 12]

In [66]:
pd.get_dummies(test_features, columns=categorical_columns, prefix_sep='=')

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass=Federal-gov,workclass=Local-gov,workclass=Private,workclass=Self-emp-inc,...,native_country=Portugal,native_country=Puerto-Rico,native_country=Scotland,native_country=South,native_country=Taiwan,native_country=Thailand,native_country=Trinadad&Tobago,native_country=United-States,native_country=Vietnam,native_country=Yugoslavia
0,25,226802,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,0,0,50,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,28,336951,12,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,44,160323,10,7688,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
5,34,198693,6,0,0,30,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16275,33,245211,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
16276,39,215419,13,0,0,36,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
16278,38,374983,13,0,0,50,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
16279,44,83891,13,5455,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [68]:
train_features = pd.get_dummies(train_features, columns=categorical_columns, prefix_sep='=')
test_features = pd.get_dummies(test_features, columns=categorical_columns, prefix_sep='=')

In [70]:
continuous_columns = [train_features.columns.get_loc(var) for var in continuous_vars]

In [71]:
test_features.insert(
            loc = train_features.columns.get_loc('native_country=Holand-Netherlands'),
            column='native_country=Holand-Netherlands', value=0
        )

In [73]:
train_features

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass=Federal-gov,workclass=Local-gov,workclass=Private,workclass=Self-emp-inc,...,native_country=Portugal,native_country=Puerto-Rico,native_country=Scotland,native_country=South,native_country=Taiwan,native_country=Thailand,native_country=Trinadad&Tobago,native_country=United-States,native_country=Vietnam,native_country=Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
32557,40,154374,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
32558,58,151910,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
32559,22,201490,9,0,0,20,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [74]:
test_features

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass=Federal-gov,workclass=Local-gov,workclass=Private,workclass=Self-emp-inc,...,native_country=Portugal,native_country=Puerto-Rico,native_country=Scotland,native_country=South,native_country=Taiwan,native_country=Thailand,native_country=Trinadad&Tobago,native_country=United-States,native_country=Vietnam,native_country=Yugoslavia
0,25,226802,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,0,0,50,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,28,336951,12,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,44,160323,10,7688,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
5,34,198693,6,0,0,30,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16275,33,245211,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
16276,39,215419,13,0,0,36,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
16278,38,374983,13,0,0,50,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
16279,44,83891,13,5455,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [75]:
one_hot_columns={}

In [81]:
for column_name in categorical_columns:
    ids = [i for i, col in enumerate(train_features.columns) if col.startswith("{}=".format(column_name))]
    if (len(ids) > 0):
        assert len(ids) == ids[-1] - ids[0] + 1
    one_hot_columns[column_name] = ids
    print(ids)

[6, 7, 8, 9, 10, 11, 12]
[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]
[29, 30, 31, 32, 33, 34, 35]
[36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
[50, 51, 52, 53, 54, 55]
[56, 57, 58, 59, 60]
[61, 62]
[63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103]


In [82]:
print("categorical features: ", one_hot_columns.keys())

categorical features:  dict_keys(['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])


In [83]:
column_ids = {col: idx for idx, col in enumerate(train_features.columns)}

In [84]:
column_ids

{'age': 0,
 'fnlwgt': 1,
 'education_num': 2,
 'capital_gain': 3,
 'capital_loss': 4,
 'hours_per_week': 5,
 'workclass=Federal-gov': 6,
 'workclass=Local-gov': 7,
 'workclass=Private': 8,
 'workclass=Self-emp-inc': 9,
 'workclass=Self-emp-not-inc': 10,
 'workclass=State-gov': 11,
 'workclass=Without-pay': 12,
 'education=10th': 13,
 'education=11th': 14,
 'education=12th': 15,
 'education=1st-4th': 16,
 'education=5th-6th': 17,
 'education=7th-8th': 18,
 'education=9th': 19,
 'education=Assoc-acdm': 20,
 'education=Assoc-voc': 21,
 'education=Bachelors': 22,
 'education=Doctorate': 23,
 'education=HS-grad': 24,
 'education=Masters': 25,
 'education=Preschool': 26,
 'education=Prof-school': 27,
 'education=Some-college': 28,
 'marital_status=Divorced': 29,
 'marital_status=Married-AF-spouse': 30,
 'marital_status=Married-civ-spouse': 31,
 'marital_status=Married-spouse-absent': 32,
 'marital_status=Never-married': 33,
 'marital_status=Separated': 34,
 'marital_status=Widowed': 35,
 'oc

In [85]:
test_column_ids = {col: idx for idx, col in enumerate(test_features.columns)}

In [86]:
test_column_ids

{'age': 0,
 'fnlwgt': 1,
 'education_num': 2,
 'capital_gain': 3,
 'capital_loss': 4,
 'hours_per_week': 5,
 'workclass=Federal-gov': 6,
 'workclass=Local-gov': 7,
 'workclass=Private': 8,
 'workclass=Self-emp-inc': 9,
 'workclass=Self-emp-not-inc': 10,
 'workclass=State-gov': 11,
 'workclass=Without-pay': 12,
 'education=10th': 13,
 'education=11th': 14,
 'education=12th': 15,
 'education=1st-4th': 16,
 'education=5th-6th': 17,
 'education=7th-8th': 18,
 'education=9th': 19,
 'education=Assoc-acdm': 20,
 'education=Assoc-voc': 21,
 'education=Bachelors': 22,
 'education=Doctorate': 23,
 'education=HS-grad': 24,
 'education=Masters': 25,
 'education=Preschool': 26,
 'education=Prof-school': 27,
 'education=Some-college': 28,
 'marital_status=Divorced': 29,
 'marital_status=Married-AF-spouse': 30,
 'marital_status=Married-civ-spouse': 31,
 'marital_status=Married-spouse-absent': 32,
 'marital_status=Never-married': 33,
 'marital_status=Separated': 34,
 'marital_status=Widowed': 35,
 'oc

In [94]:
import torch
device = torch.device("cuda" if torch.cuda.is_available else "cpu")
train_features = torch.tensor(train_features.values.astype(np.float32))
train_labels = torch.tensor(train_labels.values.astype(np.int64))
train_protected = torch.tensor(protected_train.as_type(np.bool))

AttributeError: 'builtin_function_or_method' object has no attribute 'astype'