In [2]:
import numpy as np
import pandas as pd

from os import path
from urllib import request

In [3]:
# Names assigned to the 15 CSV columns, in file order.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income',
]

# Income label -> integer code. The test-file labels are the training labels
# followed by a trailing period.
train_labels_map = {'<=50K': 0, '>50K': 1}
test_labels_map = {label + '.': code for label, code in train_labels_map.items()}

In [4]:
def _check_and_download(filepath, url):
    """Download ``url`` to ``filepath`` unless something already exists there.

    Args:
        filepath: Destination path on the local filesystem (str, bytes or
            path-like).
        url: Location to fetch when ``filepath`` is missing.

    Returns:
        None.

    Missing parent directories of ``filepath`` are created first. The payload
    is written to a sibling ``<filepath>.part`` file and renamed into place
    only after the transfer finished, so an interrupted download cannot leave
    a truncated file that would pass the existence check on the next run.
    Errors raised by the download itself propagate to the caller.
    """
    import os  # local import: the notebook only imports `os.path` (as `path`)

    target = os.fsdecode(filepath)
    if path.exists(target):
        return

    parent_dir = path.dirname(target)
    if parent_dir:  # empty when the target lives in the current directory
        os.makedirs(parent_dir, exist_ok=True)

    partial = target + ".part"
    try:
        request.urlretrieve(url, partial)
        os.replace(partial, target)
    finally:
        # Only present here if the transfer or the rename failed.
        if path.exists(partial):
            os.remove(partial)

In [5]:
# Directory holding the dataset files, and the train/test file paths inside it.
data_dir = "data"
train_data_file, test_data_file = (
    path.join(data_dir, file_name) for file_name in ("adult.data", "adult.test")
)

In [8]:
# Read CSV Files

In [16]:
# NOTE(review): no visible cell calls `_check_and_download`, so both files are
# expected to exist under `data_dir` already — confirm before a fresh run.

# Training file: read with header=None, so its first line is treated as data.
train_dataset = pd.read_csv(
    train_data_file,
    sep=',',
    header=None,
    names=column_names,
)

# The first line contains a random comment; header=0 together with `names`
# makes pandas replace that line with `column_names`.
test_dataset = pd.read_csv(
    test_data_file,
    sep=',',
    header=0,
    names=column_names,
)

In [17]:
# Preview the first rows of the training frame as loaded from the CSV.
train_dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [19]:
# Summary statistics of the test frame.
test_dataset.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0
mean,38.767459,189435.7,10.072907,1081.905104,87.899269,40.392236
std,13.849187,105714.9,2.567545,7583.935968,403.105286,12.479332
min,17.0,13492.0,1.0,0.0,0.0,1.0
25%,28.0,116736.0,9.0,0.0,0.0,40.0
50%,37.0,177831.0,10.0,0.0,0.0,40.0
75%,48.0,238384.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,3770.0,99.0


In [22]:
# Preprocess strings
def _strip_strings(frame):
    """Return a copy of ``frame`` with surrounding whitespace stripped from string cells.

    Args:
        frame: DataFrame to clean; it is not modified.

    Returns:
        A new DataFrame with the same shape and labels. Cells that are not
        ``str`` instances are passed through unchanged.

    Uses ``DataFrame.apply`` + ``Series.map`` instead of ``DataFrame.applymap``,
    which is deprecated since pandas 2.1.
    """
    def _strip_cell(cell):
        return cell.strip() if isinstance(cell, str) else cell

    return frame.apply(lambda column: column.map(_strip_cell))


train_dataset = _strip_strings(train_dataset)
test_dataset = _strip_strings(test_dataset)

In [31]:
# Treat the '?' placeholder as a missing value, then discard every row that
# contains at least one missing value.
train_dataset = train_dataset.replace(to_replace='?', value=np.nan).dropna(axis=0)
test_dataset = test_dataset.replace(to_replace='?', value=np.nan).dropna(axis=0)

In [33]:
# Encode Labels
def _encode_income(frame, labels_map):
    """Return a copy of ``frame`` whose ``income`` column holds int64 label codes.

    Args:
        frame: DataFrame with an ``income`` column; it is not modified.
        labels_map: Mapping from raw income label to integer code.

    Returns:
        A new DataFrame identical to ``frame`` except for the ``income`` column.

    Raises:
        ValueError: if a value in ``income`` is neither a key of ``labels_map``
            nor convertible to an integer.

    Only ``income`` is touched: a frame-wide ``replace`` would also rewrite
    matching values in any other column, and it relies on object-to-int
    downcasting that newer pandas versions deprecate. Values without an entry
    in ``labels_map`` are kept as they are before the cast, so re-running the
    cell on already-encoded data is a no-op.
    """
    encoded = frame['income'].map(lambda label: labels_map.get(label, label))
    return frame.assign(income=encoded.astype('int64'))


train_dataset = _encode_income(train_dataset, train_labels_map)
test_dataset = _encode_income(test_dataset, test_labels_map)

In [35]:
# Preview the test frame after cleaning and label encoding.
test_dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,0


In [36]:
# Split Features and Labels: `income` is the label, every other column is a feature.
train_labels = train_dataset['income']
train_features = train_dataset.drop(columns='income')

test_labels = test_dataset['income']
test_features = test_dataset.drop(columns='income')

In [39]:
# Categorize whether a Column is a continuous variable or a categorical variable

# Columns that still contain missing values are removed from the training
# features and end up in neither list. They are collected first so the frame
# is not modified while its columns are being iterated.
# NOTE(review): `test_features` is left untouched here, as before — confirm
# whether the same columns should be dropped there too.
columns_with_nulls = [col for col in train_features.columns if train_features[col].isnull().any()]
train_features = train_features.drop(columns=columns_with_nulls)

continuous_vars = []
categorical_columns = []
for col in train_features.columns:
    col_dtype = train_features[col].dtype
    # `np.object` was removed in NumPy 1.24; the pandas dtype helpers cover the
    # object dtype and pandas' dedicated string dtypes alike.
    if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
        categorical_columns.append(col)
    else:
        continuous_vars.append(col)

In [44]:
# Column treated as the sensitive attribute in the cells below.
sensitive_attribute = "sex"

# Number of distinct values that column takes in the training features.
sensitive_column = train_features[sensitive_attribute]
sensitive_unique = sensitive_column.nunique()
print(sensitive_unique)

2


In [45]:
# Boolean mask over the training rows: True exactly where the sensitive
# attribute's categorical code is 0, False for every other code.
sensitive_codes = pd.Categorical(train_features[sensitive_attribute]).codes
protected_train = sensitive_codes == 0

In [51]:
# Inspect the integer category codes that the mask in `protected_train` is derived from.
pd.Categorical(train_features[sensitive_attribute]).codes

array([1, 1, 1, ..., 0, 1, 0], dtype=int8)

In [49]:
# Inspect the boolean mask (True where the category code is 0).
protected_train

array([False, False, False, ...,  True, False,  True])