In [1]:
import sys
sys.path.append("../")

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.width', 500)
import aequitas.detection.descriptive_stats as dstats
import aequitas.detection.metrics as metrics
import aequitas.mitigation.data as technique
import aequitas.mitigation.models as model
import aequitas.tools.data_manip as dm
import aequitas.tools as tools

In [3]:
#Import dataset
dataset_name="Census_Income_Dataset.csv"
dataset_directory="../datasets/"+dataset_name
dataset = pd.read_csv(dataset_directory)

In [4]:
# Dataset Pre-Processing

# remove fnlwgt column (per instructions)
dataset = dataset.drop('fnlwgt', axis=1)

# remove education column since there is an educution_num
dataset = dataset.drop('education', axis=1)

# impute the missing values
num_data = dataset.shape[0]
col_names = dataset.columns
for c in col_names:
	dataset[c] = dataset[c].replace("?", np.NaN)
dataset = dataset.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [5]:
# display dataset structure
dataset_struct=dstats.analyse_dataset(dataset,verbose=True)

Dataset:
        Column Name Data Type Column Type (suggestion)  Number_Values                                             Values
0               age     int64               Continuous             74                                                  -
1         workclass      text      Categorical/Ordinal              8  [Private, Local-gov, Self-emp-not-inc, Federal...
2   educational-num     int64      Categorical/Ordinal             16  [7, 9, 12, 10, 6, 15, 4, 13, 14, 16, 3, 11, 5,...
3    marital-status      text      Categorical/Ordinal              7  [Never-married, Married-civ-spouse, Widowed, D...
4        occupation      text      Categorical/Ordinal             14  [Machine-op-inspct, Farming-fishing, Protectiv...
5      relationship      text      Categorical/Ordinal              6  [Own-child, Husband, Not-in-family, Unmarried,...
6              race      text      Categorical/Ordinal              5  [Black, White, Asian-Pac-Islander, Other, Amer...
7            gender    

In [6]:
# split dataset to training and test samples
training_sample,test_sample = dm.split_dataset(dataset,ratio=0.2,random_state=51)

# Define appropriate transformations for dataset (feature encoding and scaling if required)
transform_dict = {
    "income": {
        "encode": "labeling",
        "labels": {
            "<=50K": 0,
            ">50K": 1, 
        }
    },
    "gender": {
        "encode": "labeling",
        "labels": {
            "Female": 0,
            "Male": 1, 
        }
    },
    "workclass": {
        "encode": "labeling",
        "scaling": "min-max"
    },
    "race": {
        "encode": "labeling", 
        "scaling": "min-max"
    },
    "marital-status": {
        "encode": "labeling",
        "scaling": "min-max"
    },
    "occupation": {
        "encode": "labeling", 
        "scaling": "min-max"
    },
    "relationship": {
        "encode": "labeling", 
        "scaling": "min-max"
    },
    "native-country": {
        "encode": "labeling", 
        "scaling": "min-max"
    },
    "age":{
        "scaling": "standard"
    },
    "educational-num":{
        "scaling": "min-max"
    },
    "capital-gain":{
        "scaling": "standard"
    },
    "capital-loss":{
        "scaling": "standard"
    },
    "hours-per-week":{
        "scaling": "standard"
    }
}

# Transform the training sample
training_sample, transformers = dm.transform_training_data(training_sample, transform_dict)

# Transform the test sample
test_sample = dm.transform_test_data(dataset, transform_dict, transformers)

In [7]:
# define fairness parameters
class_attribute='income'
sensitive_attribute='gender'
outcome=1 # >50K
priv_group=1 # Male

In [8]:
# measure statistical parity before classification
print("Training sample:")
res=metrics.statistical_parity(training_sample,class_attribute,sensitive_attribute,positive_outcome=outcome,privileged_group=priv_group,verbose=True)

print("Test sample:")
res=metrics.statistical_parity(test_sample,class_attribute,sensitive_attribute,positive_outcome=outcome,privileged_group=priv_group,verbose=True)

Training sample:
Statistical/Demographic Parity:
Outcome:  1
     1         0
1  0.0  0.196285


Test sample:
Statistical/Demographic Parity:
Outcome:  1
     1         0
1  0.0  0.194516




In [9]:
# define classifier parameters
classifier_type="Decision_Tree"
classifier_params={
    "random_state":42, 
    "min_samples_leaf":10
}

In [10]:
# Train a classifier on training sample
clf=tools.train_classifier(training_sample,class_attribute,classifier_type,classifier_params)

# Test classifier on test sample
predicted_test_sample, _, _, _= tools.test_classifier(clf,test_sample,class_attribute,verbose=True)
prediction=np.array(predicted_test_sample[class_attribute])

Classifier Accuracy: 0.88


In [11]:
# measure statistical parity after classification on test sample
res=metrics.statistical_parity(predicted_test_sample,class_attribute,sensitive_attribute,positive_outcome=outcome,privileged_group=priv_group,verbose=True)

Statistical/Demographic Parity:
Outcome:  1
     1         0
1  0.0  0.169383




In [12]:
# Train a bias mitigated re-weighting classifier
print("Mitigation re-weighting!")
clf=model.reweighting(training_sample,class_attribute,sensitive_attribute,classifier_type,classifier_params)

# Test a classifier
predicted_test_sample, _, _, _= tools.test_classifier(clf,test_sample,class_attribute,verbose=True)
prediction=np.array(predicted_test_sample[class_attribute])

Mitigation re-weighting!
Classifier Accuracy: 0.87


In [13]:

# Measure Discrimination on predicted test sample
res=metrics.statistical_parity(predicted_test_sample,class_attribute,sensitive_attribute,positive_outcome=outcome,privileged_group=priv_group,verbose=True)

Statistical/Demographic Parity:
Outcome:  1
     1         0
1  0.0  0.078471


