In [None]:
!pip install fairlearn
import pandas as pd
import numpy as np

Collecting fairlearn
  Downloading fairlearn-0.7.0-py3-none-any.whl (177 kB)
[?25l[K     |█▉                              | 10 kB 27.7 MB/s eta 0:00:01[K     |███▊                            | 20 kB 34.7 MB/s eta 0:00:01[K     |█████▌                          | 30 kB 27.8 MB/s eta 0:00:01[K     |███████▍                        | 40 kB 20.1 MB/s eta 0:00:01[K     |█████████▎                      | 51 kB 12.9 MB/s eta 0:00:01[K     |███████████                     | 61 kB 10.9 MB/s eta 0:00:01[K     |█████████████                   | 71 kB 11.9 MB/s eta 0:00:01[K     |██████████████▊                 | 81 kB 13.3 MB/s eta 0:00:01[K     |████████████████▋               | 92 kB 13.3 MB/s eta 0:00:01[K     |██████████████████▌             | 102 kB 12.9 MB/s eta 0:00:01[K     |████████████████████▎           | 112 kB 12.9 MB/s eta 0:00:01[K     |██████████████████████▏         | 122 kB 12.9 MB/s eta 0:00:01[K     |████████████████████████        | 133 kB 12.9 MB/s e

## Introduction

In this notebook, I go through a number of steps to pre-process the Adult dataset so that it is ready for modelling. The steps are: choosing a subset of the features, re-coding the missing values as 'Missing' (a bit unconventional; usually missing values aren't treated as a category but are instead removed or replaced by the mean or mode), one-hot encoding (aka dummy coding) the categorical variables, and standardizing the dataset.

### 1. Loading the data

In [None]:
from fairlearn.datasets import fetch_adult

# Fetch the Adult dataset through fairlearn. The returned object is used below
# via its `data`, `feature_names`, `categories` and `target` attributes.
adult = fetch_adult()

### 2. Choosing a subset of the columns
The features are chosen because they are either numerical or categorical with a small number of unique categories. The latter is good because we want there to be enough observations in each category.

In [None]:
# Features carried forward into the pre-processing steps
columns = [
    'age',
    'workclass',
    'education-num',
    'sex',
    'hours-per-week',
]

In [None]:
# Wrap the raw feature matrix in a labelled frame and keep only the chosen columns
data = pd.DataFrame(adult.data, columns=adult.feature_names).loc[:, columns]

In [None]:
# Preview the first rows of the selected columns
data.head()

Unnamed: 0,age,workclass,education-num,sex,hours-per-week
0,25.0,0.0,7.0,1.0,40.0
1,38.0,0.0,9.0,1.0,50.0
2,28.0,4.0,12.0,1.0,40.0
3,44.0,0.0,10.0,1.0,40.0
4,18.0,,10.0,0.0,30.0


### 3. Recoding the `workclass` variable
As of now, the column `workclass` contains numbers, but the numbers don't have any numerical meaning; they just represent categories. Later on, we will one-hot-encode these, and to make them easier to interpret afterwards, I want to put the category names instead of the numbers into the dataframe.

In [None]:
# Fetch the category labels of the variable 'workclass'.
# NOTE: the labels are assumed to be listed in the order of their numeric codes
# (label i belongs to code i) - the same assumption the dataframe encoding relies on.
workclass_categories = adult.categories['workclass']

# Map each numeric code to its label by position in the category list.
# The mapping is built from the category list itself rather than from the codes
# that happen to occur in the data, so the labels cannot shift if a category is
# absent from the data, and NaN never takes part in the mapping.
# Keys are floats to match the float-typed codes stored in the column.
workclass_mapping = {
    float(code): label for code, label in enumerate(workclass_categories)
}

# Replace the numerical code with the category label; codes without a label
# (i.e. the missing values) come out as NaN and are recoded as 'Missing' -
# maybe there's an interesting pattern there?
data['workclass'] = data['workclass'].map(workclass_mapping).fillna('Missing')

### 4. Check for missing values

In [None]:
# Count the null entries per column (all zeros means nothing is missing)
data.isna().sum()

age               0
workclass         0
education-num     0
sex               0
hours-per-week    0
dtype: int64

### 5. One-Hot-Encoding the categorical column `workclass`

In [None]:
# Turn the categorical column (workclass) into dummy features, aka One-Hot-Encoding.
# No column list is passed, so pandas chooses which columns to encode; in the
# preview further down only `workclass` is expanded, into one
# `workclass_<category>` indicator column per category.
data = pd.get_dummies(data)

### 6. Remove and save the protected/sensitive attribute

In [None]:
# Keep the protected attribute out of the pre-processing: set `sex` aside
# (it is needed later for the fairness evaluation) and drop it from the features.
protected = data['sex']
data = data.drop(columns='sex')

In [None]:
# Preview the one-hot-encoded features; `sex` should no longer be among the columns
data.head()

Unnamed: 0,age,education-num,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Missing,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay
0,25.0,7.0,40.0,0,0,0,0,1,0,0,0,0
1,38.0,9.0,50.0,0,0,0,0,1,0,0,0,0
2,28.0,12.0,40.0,0,1,0,0,0,0,0,0,0
3,44.0,10.0,40.0,0,0,0,0,1,0,0,0,0
4,18.0,10.0,30.0,0,0,1,0,0,0,0,0,0


### 7. Standardize the features

Normally, we'd want to fit the scaler to only the train set and transform both train and test set by the mean and std of the train set. In this case, we do not have a train and test set yet, so we will standardize it all together.

In [None]:
from sklearn.preprocessing import StandardScaler

# Create the standardizer (nothing is fitted yet at this point)
scaler = StandardScaler()

# Fit on and transform the whole dataset in one go - there is no train/test
# split yet, so the scaling statistics are computed over all rows
data_scaled = scaler.fit_transform(data)

# Put back into a pandas dataframe. Both the column names and the row index are
# carried over, so the rows stay aligned with the `protected` series that was
# set aside earlier when it is assigned back by index.
data = pd.DataFrame(data_scaled, columns = data.columns, index = data.index)

In [None]:
# Preview the standardized features
data.head()

Unnamed: 0,age,education-num,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Missing,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay
0,-0.995129,-1.197259,-0.034087,-0.173795,-0.26194,-0.246558,-0.01431,0.663711,-0.189609,-0.293019,-0.205606,-0.02074
1,-0.046942,-0.419335,0.77293,-0.173795,-0.26194,-0.246558,-0.01431,0.663711,-0.189609,-0.293019,-0.205606,-0.02074
2,-0.776316,0.74755,-0.034087,-0.173795,3.817672,-0.246558,-0.01431,-1.50668,-0.189609,-0.293019,-0.205606,-0.02074
3,0.390683,-0.030373,-0.034087,-0.173795,-0.26194,-0.246558,-0.01431,0.663711,-0.189609,-0.293019,-0.205606,-0.02074
4,-1.505691,-0.030373,-0.841104,-0.173795,-0.26194,4.055836,-0.01431,-1.50668,-0.189609,-0.293019,-0.205606,-0.02074


### 8. Add protected attribute and label to the dataset

In [None]:
# Re-attach the unscaled protected attribute and the target label as the last two columns
data = data.assign(sex=protected, salary=adult.target)

In [None]:
# Preview the final frame: scaled features followed by `sex` and `salary`
data.head()

Unnamed: 0,age,education-num,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Missing,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,sex,salary
0,-0.995129,-1.197259,-0.034087,-0.173795,-0.26194,-0.246558,-0.01431,0.663711,-0.189609,-0.293019,-0.205606,-0.02074,1.0,<=50K
1,-0.046942,-0.419335,0.77293,-0.173795,-0.26194,-0.246558,-0.01431,0.663711,-0.189609,-0.293019,-0.205606,-0.02074,1.0,<=50K
2,-0.776316,0.74755,-0.034087,-0.173795,3.817672,-0.246558,-0.01431,-1.50668,-0.189609,-0.293019,-0.205606,-0.02074,1.0,>50K
3,0.390683,-0.030373,-0.034087,-0.173795,-0.26194,-0.246558,-0.01431,0.663711,-0.189609,-0.293019,-0.205606,-0.02074,1.0,>50K
4,-1.505691,-0.030373,-0.841104,-0.173795,-0.26194,4.055836,-0.01431,-1.50668,-0.189609,-0.293019,-0.205606,-0.02074,0.0,<=50K


### 9. Save dataframe to csv
Because we are going to use the dataset in another notebook, I'm saving it to csv.

In [None]:
# Write the pre-processed frame to a CSV file (relative path, so it lands in the
# notebook's working directory). No `index` argument is passed, so pandas also
# writes the row index (its default); it will appear as an extra unnamed first
# column when the file is read back unless the reader handles it.
data.to_csv('adult_preprocessed.csv')