In [1]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

features = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target"] 

# Change these to local file if available
train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
original_train = pd.read_csv(train_url, names=features, sep=r'\s*,\s*', 
                             engine='python', na_values="?", skipinitialspace=True)
# This will download 1.9M
original_test = pd.read_csv(test_url, names=features, sep=r'\s*,\s*', 
                            engine='python', na_values="?", skiprows=1, skipinitialspace=True)

In [2]:
original_train.isnull().sum()

Age                  0
Workclass         1836
fnlwgt               0
Education            0
Education-Num        0
Martial Status       0
Occupation        1843
Relationship         0
Race                 0
Sex                  0
Capital Gain         0
Capital Loss         0
Hours per week       0
Country            583
Target               0
dtype: int64

In [3]:
original_test.isnull().sum()

Age                 0
Workclass         963
fnlwgt              0
Education           0
Education-Num       0
Martial Status      0
Occupation        966
Relationship        0
Race                0
Sex                 0
Capital Gain        0
Capital Loss        0
Hours per week      0
Country           274
Target              0
dtype: int64

In [4]:
# drop null?

original_train.dropna(inplace=True)
original_test.dropna(inplace=True)

In [5]:
print('Original Train:', original_train.shape)
print('Original Test:', original_test.shape)

count_train = len(original_train)
count_test = len(original_test)
total = len(original_train)+len(original_test)
print('TOTAL:', total)
print('percentage of train:', count_train/total)
print('percentage of test:', count_test/total)

Original Train: (30162, 15)
Original Test: (15060, 15)
TOTAL: 45222
percentage of train: 0.6669762504975455
percentage of test: 0.33302374950245456


In [6]:
# update target to binary
original_train['Target'] = original_train['Target'].replace('<=50K', 0).replace('>50K', 1)
original_test['Target'] = original_test['Target'].replace('<=50K.', 0).replace('>50K.', 1)
original_train.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Martial Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [7]:
# datasets
from aif360.datasets import StandardDataset
# from aif360.datasets import AdultDataset

# metrics
from aif360.metrics import BinaryLabelDatasetMetric

# bias mitigation technique
from aif360.algorithms.preprocessing import Reweighing

# display
from IPython.display import Markdown, display

pip install 'aif360[LawSchoolGPA]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'


In [8]:
# load dataset
dataset_train = StandardDataset(original_train, label_name='Target', 
                                favorable_classes=[1],
                                protected_attribute_names=['Sex'], 
                                privileged_classes=[['Male']],
                                categorical_features=['Workclass', 'Education', 'Martial Status', 'Occupation', 'Relationship', 'Race', 'Country'])

In [9]:
privileged_groups = [{'Sex': 1}]
unprivileged_groups = [{'Sex': 0}]

metric_orig_train = BinaryLabelDatasetMetric(dataset_train, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)
display(Markdown("#### Original training dataset"))
print("Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_orig_train.mean_difference())

#### Original training dataset

Difference in mean outcomes between unprivileged and privileged groups = -0.200159


In [10]:
# rebalancing

RW = Reweighing(unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups)
dataset_transf_train = RW.fit_transform(dataset_train)

In [11]:
metric_transf_train = BinaryLabelDatasetMetric(dataset_transf_train, 
                                               unprivileged_groups=unprivileged_groups,
                                               privileged_groups=privileged_groups)
display(Markdown("#### Transformed training dataset"))
print("Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_transf_train.mean_difference())

#### Transformed training dataset

Difference in mean outcomes between unprivileged and privileged groups = 0.000000


In [12]:
# convert back to dataframe

transformed_train = dataset_transf_train.convert_to_dataframe()[0]

In [13]:
transformed_train

Unnamed: 0,Age,fnlwgt,Education-Num,Sex,Capital Gain,Capital Loss,Hours per week,Workclass=Federal-gov,Workclass=Local-gov,Workclass=Private,...,Country=Puerto-Rico,Country=Scotland,Country=South,Country=Taiwan,Country=Thailand,Country=Trinadad&Tobago,Country=United-States,Country=Vietnam,Country=Yugoslavia,Target
0,39.0,77516.0,13.0,1.0,2174.0,0.0,40.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,50.0,83311.0,13.0,1.0,0.0,0.0,13.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,38.0,215646.0,9.0,1.0,0.0,0.0,40.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,53.0,234721.0,7.0,1.0,0.0,0.0,40.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,28.0,338409.0,13.0,0.0,0.0,0.0,40.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27.0,257302.0,12.0,0.0,0.0,0.0,38.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
32557,40.0,154374.0,9.0,1.0,0.0,0.0,40.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
32558,58.0,151910.0,9.0,0.0,0.0,0.0,40.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
32559,22.0,201490.0,9.0,1.0,0.0,0.0,20.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [14]:
# Reference: https://towardsdatascience.com/mitigating-bias-in-ai-with-aif360-b4305d1f88a9

# update test data
num_train = len(original_train)
all_data = pd.concat([original_train, original_test], axis=0)
categorical_features = ['Workclass', 'Education', 'Martial Status', 'Occupation', 'Relationship', 'Race', 'Country'] #except sex
all_data = pd.get_dummies(
    all_data,
    prefix=None,
    prefix_sep='_',
    dummy_na=False,
    columns=categorical_features,
    sparse=False,
    drop_first=False,
    dtype=None
    )
all_data['Sex'] = all_data['Sex'].replace('Female', float(0)).replace('Male', float(1))

test_data = all_data[num_train:]

In [15]:
test_data.head()

Unnamed: 0,Age,fnlwgt,Education-Num,Sex,Capital Gain,Capital Loss,Hours per week,Target,Workclass_Federal-gov,Workclass_Local-gov,...,Country_Portugal,Country_Puerto-Rico,Country_Scotland,Country_South,Country_Taiwan,Country_Thailand,Country_Trinadad&Tobago,Country_United-States,Country_Vietnam,Country_Yugoslavia
0,25,226802,7,1.0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,1.0,0,0,50,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,28,336951,12,1.0,0,0,40,1,0,1,...,0,0,0,0,0,0,0,1,0,0
3,44,160323,10,1.0,7688,0,40,1,0,0,...,0,0,0,0,0,0,0,1,0,0
5,34,198693,6,1.0,0,0,30,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [16]:
# training with transformed data
X_train = transformed_train.drop(['Target'], axis=1).values
y_train = transformed_train['Target'].values
y_test = test_data['Target'].values
X_test = test_data.drop(['Target'], axis=1).values

In [17]:
from sklearn import linear_model

lr = linear_model.LogisticRegression()

lr.fit(X_train, y_train)

In [18]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(lr, X_train, y_train, cv=10)
print('Cross-Validation Accuracy Scores', scores)

Cross-Validation Accuracy Scores [0.78952602 0.7918462  0.78713528 0.79011936 0.78547745 0.79973475
 0.7831565  0.79343501 0.79111406 0.79409814]


In [19]:
prediction = lr.predict(X_test)

In [21]:
# test in aquitas
aquitas_df_reweighing_sex = original_test.copy()

# drop target
aquitas_df_reweighing_sex.drop('Target', axis=1, inplace=True)

# rename income to label_value
aquitas_df_reweighing_sex.rename(columns = {'Income':'label_value'}, inplace = True)

# append the pred with column score
aquitas_df_reweighing_sex['score'] = prediction

In [22]:
# save to csv
aquitas_df_reweighing_sex.to_csv('aquitas_df_reweighing_sex.csv')