In [1]:
import sys
sys.path.append("../")

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.width', 500)
from aequitas.engine import Aequitas
import aequitas.tools.data_manip as dm
import aequitas.tools as tools

In [3]:
# Load the Census Income dataset from the datasets folder one level up
dataset_name = "Census_Income_Dataset.csv"
dataset_directory = f"../datasets/{dataset_name}"
dataset = pd.read_csv(dataset_directory)

In [4]:
# Dataset Pre-Processing

# Drop columns that are not used in the analysis:
#   - fnlwgt: removed per the dataset instructions
#   - education: redundant, since educational-num is kept
dataset = dataset.drop(columns=['fnlwgt', 'education'])

# Kept for backward compatibility; not used elsewhere in the cells visible here.
num_data = dataset.shape[0]
col_names = dataset.columns

# Missing values are encoded as "?" in the raw file. Mark them as NaN in every
# column at once (np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0).
dataset = dataset.replace("?", np.nan)

# Impute: fill the NaNs of each column with that column's most frequent value
# (value_counts() lists values in descending order of frequency).
dataset = dataset.apply(lambda x: x.fillna(x.value_counts().index[0]))

In [5]:
# Collapse the race feature into two values: White and Minority
labels = ['White', 'Minority']
groups = [
    ['White'],
    ['Black', 'Asian-Pac-Islander', 'Other', 'Amer-Indian-Eskimo'],
]
dataset["race"] = dm.merge_values(dataset["race"], groups, labels)
print("Unique values: ", dataset["race"].unique())

Unique values:  ['Minority' 'White']


In [6]:
# The next cells demonstrate a few example parameter files for an Aequitas object.

# Example 1: an empty parameters file
parameters = {}
Aeq_dataset = Aequitas(dataset, parameters)
Aeq_dataset.structure(verbose=True)

Dataset:
        Column Name Data Type Column Type (suggestion)  Number_Values                                             Values
0               age     int64               Continuous             74                                                  -
1         workclass      text      Categorical/Ordinal              8  [Private, Local-gov, Self-emp-not-inc, Federal...
2   educational-num     int64      Categorical/Ordinal             16  [7, 9, 12, 10, 6, 15, 4, 13, 14, 16, 3, 11, 5,...
3    marital-status      text      Categorical/Ordinal              7  [Never-married, Married-civ-spouse, Widowed, D...
4        occupation      text      Categorical/Ordinal             14  [Machine-op-inspct, Farming-fishing, Protectiv...
5      relationship      text      Categorical/Ordinal              6  [Own-child, Husband, Not-in-family, Unmarried,...
6              race      text                   Binary              2                                  [Minority, White]
7            gender    

In [7]:
# Example 2: a basic parameters file that only names the class attribute
parameters = {"class_attribute": {"name": 'income'}}
Aeq_dataset = Aequitas(dataset, parameters)
Aeq_dataset.descriptive_stats(verbose=True)

Proportions: (income)
              0
<=50K  0.760718
>50K   0.239282



In [8]:
# Example 3: a parameters file with sensitive attributes but no expectation
# of which groups are privileged
parameters = {
    "class_attribute": {"name": 'income', "positive_value": '>50K'},
    "sensitive_attributes": [{"name": attribute} for attribute in ('gender', 'race')],
}
Aeq_dataset = Aequitas(dataset, parameters)
Aeq_dataset.descriptive_stats(verbose=True)

Proportions: (income)
              0
<=50K  0.760718
>50K   0.239282

Proportions: (gender)
               0
Male    0.668482
Female  0.331518

Proportions: (race)
                 0
White     0.855043
Minority  0.144957

Outcome distribution by group:
           <=50K      >50K
Female  0.890749  0.109251
Male    0.696233  0.303767

Outcome distribution by group:
             <=50K      >50K
Minority  0.847458  0.152542
White     0.746013  0.253987


Association between gender and race.
Contingency Table:
race    Minority  White
gender                 
Female      3165  13027
Male        3915  28735

Chi-squared statistic: 497.9678182429906
Cramer's V: 0.10087228311688282
Degrees of Freedom: 1
p-value: 2.6310785315092373e-110
There is a statistically significant association between gender and race.

Association between gender and income.
Contingency Table:
income  <=50K  >50K
gender             
Female  14423  1769
Male    22732  9918

Chi-squared statistic: 2248.847679013691
Cramer's

In [9]:
# Let's go forward with a more detailed fairness analysis.

# Split the dataset into training and test samples (fixed random_state for reproducibility)
training_sample, test_sample = dm.split_dataset(
    dataset,
    ratio=0.3,
    random_state=123,
)

In [10]:
# Parameters file that declares 'Male' as the privileged group for gender
parameters = {
    "class_attribute": {"name": 'income', "positive_value": '>50K'},
    "sensitive_attributes": [
        {"name": 'gender', "privileged_group": 'Male'},
    ],
}

# Build one Aequitas object per sample (training first, then test), both using the same parameters
Aeq_training, Aeq_test = (
    Aequitas(sample, parameters) for sample in (training_sample, test_sample)
)

In [11]:
# Collect structure and descriptive statistics on the Aeq_training object
Aeq_training.structure()
Aeq_training.descriptive_stats()

# The following techniques can also be used without defining privileged groups;
# in that case the results are displayed as if all values could be privileged.
for fairness_check in (Aeq_training.statistical_parity, Aeq_training.disparate_impact):
    fairness_check(verbose=True)

Probabilities:
          Male    Female
>50K  0.303156  0.106205

Statistical/Demographic Parity:
Outcome:  >50K
      Male    Female
Male   0.0  0.196951


Probabilities:
          Male    Female
>50K  0.303156  0.106205

Disparate Impact:
Outcome:  >50K
      Male   Female
Male   1.0  0.35033




In [12]:
# Define the per-column transformations for the dataset.
# Key order matches the original literal: binary label-encoded columns first,
# then label-encoded + min-max scaled columns, then scaling-only numeric columns.

# Columns with an explicit label -> integer mapping
transform_dictionary = {
    column: {"encode": "labeling", "labels": label_map}
    for column, label_map in (
        ("income", {"<=50K": 0, ">50K": 1}),
        ("gender", {"Female": 0, "Male": 1}),
        ("race", {"Minority": 0, "White": 1}),
    )
}

# Columns that are label-encoded and then min-max scaled
# (a fresh inner dict is created per column so entries never share state)
transform_dictionary.update(
    (column, {"encode": "labeling", "scaling": "min-max"})
    for column in (
        "workclass",
        "marital-status",
        "occupation",
        "relationship",
        "native-country",
    )
)

# Numeric columns that only need scaling
transform_dictionary.update({
    "age": {"scaling": "standard"},
    "educational-num": {"scaling": "min-max"},
    "capital-gain": {"scaling": "standard"},
    "capital-loss": {"scaling": "standard"},
    "hours-per-week": {"scaling": "standard"},
})

# Register the transform instructions on both objects (training first, then test)
for aequitas_object in (Aeq_training, Aeq_test):
    aequitas_object.transform_instructions(transform_dictionary)

In [13]:
# Classifier configuration
classifier_type = "Decision_Tree"
classifier_params = dict(random_state=42, min_samples_leaf=10)

# Train a classifier with the 're-weighting' mitigation method on the gender attribute
clf = Aeq_training.mitigation_model(
    method='re-weighting',
    sensitive_attribute='gender',
    classifier=classifier_type,
    classifier_params=classifier_params,
)

In [14]:
# Let's classify the test sample to see the results.

# Transform the test object's dataset to numeric values
Aeq_test.transform()

# Run the trained classifier on the test sample; only the first of the
# four returned values (the predicted sample) is kept.
class_attribute = Aeq_training.parameters["class_attribute"]["name"]
predicted_test_sample, _, _, _ = tools.test_classifier(
    clf,
    Aeq_test.dataset,
    class_attribute,
    verbose=True,
)

# Apply the inverse transform to the test object's dataset
Aeq_test.inverse_transform()

Classifier Accuracy: 0.83


In [15]:
# Build a new Aequitas object from a copy of the test object,
# swap in the predicted test sample, and inverse-transform it
Aeq_predicted_test = Aeq_test.copy()
Aeq_predicted_test.set_dataset(predicted_test_sample)
Aeq_predicted_test.inverse_transform()

# Check statistical parity on the predicted test sample
Aeq_predicted_test.statistical_parity(verbose=True)

Probabilities:
          Male    Female
>50K  0.208525  0.136814

Statistical/Demographic Parity:
Outcome:  >50K
      Male    Female
Male   0.0  0.071711




In [16]:
# Predicted class values taken from the predicted test sample
prediction = np.array(Aeq_predicted_test.dataset[class_attribute])

# Check equal opportunity, then equal odds, on the test sample against the predictions
for fairness_metric in (Aeq_test.equal_opportunity, Aeq_test.equal_odds):
    fairness_metric(prediction, verbose=True)

Confusion Metrics:  (Positive_outcome='>50K')
          Female         Male
TP    371.000000  1491.000000
TN   3990.000000  6260.000000
FP    292.000000   554.000000
FN    193.000000  1502.000000
TPR     0.657801     0.498162
TNR     0.931808     0.918697
FPR     0.068192     0.081303
FNR     0.342199     0.501838
FDR     0.440422     0.270905
FOR     0.046139     0.193507
PPV     0.559578     0.729095
NPV     0.953861     0.806493
RPP     0.136814     0.208525
RNP     0.863186     0.791475
ACC     0.899917     0.790354
Equality of Opportunity:  (Positive_outcome='>50K')
        Female  Male
Male -0.159639   0.0

Confusion Metrics:  (Positive_outcome='>50K')
          Female         Male
TP    371.000000  1491.000000
TN   3990.000000  6260.000000
FP    292.000000   554.000000
FN    193.000000  1502.000000
TPR     0.657801     0.498162
TNR     0.931808     0.918697
FPR     0.068192     0.081303
FNR     0.342199     0.501838
FDR     0.440422     0.270905
FOR     0.046139     0.193507
PPV

In [17]:
# Display the parameters file held by the training Aequitas object
Aeq_training.display()

Aequitas Dataset parameters:
{
    "class_attribute": {
        "name": "income",
        "positive_value": ">50K"
    },
    "sensitive_attributes": [
        {
            "name": "gender",
            "privileged_group": "Male"
        }
    ],
    "Mitigation": "False",
    "proportions": {
        "income": {
            "<=50K": 0.7622042177308491,
            ">50K": 0.2377957822691509
        },
        "gender": {
            "Male": 0.668138875076779,
            "Female": 0.3318611249232209
        }
    },
    "outcome_distribution_by_group": {
        "income/gender": {
            "Female": {
                "<=50K": 0.8937951701040014,
                ">50K": 0.1062048298959986
            },
            "Male": {
                "<=50K": 0.6968436720220637,
                ">50K": 0.30315632797793635
            }
        }
    },
    "contingency": [
        {
            "attribute1": "gender",
            "attribute2": "income",
            "contingency_table": {
   

In [18]:
# Display the parameters file held by the test Aequitas object
Aeq_test.display()

Aequitas Dataset parameters:
{
    "class_attribute": {
        "name": "income",
        "positive_value": ">50K"
    },
    "sensitive_attributes": [
        {
            "name": "gender",
            "privileged_group": "Male"
        }
    ],
    "Mitigation": "False",
    "transform_dictionary": {
        "income": {
            "encode": "labeling",
            "labels": {
                "<=50K": 0,
                ">50K": 1
            }
        },
        "gender": {
            "encode": "labeling",
            "labels": {
                "Female": 0,
                "Male": 1
            }
        },
        "race": {
            "encode": "labeling",
            "labels": {
                "Minority": 0,
                "White": 1
            }
        },
        "workclass": {
            "encode": "labeling",
            "scaling": "min-max"
        },
        "marital-status": {
            "encode": "labeling",
            "scaling": "min-max"
        },
        "occup