In [72]:
import sys
import numpy as np
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics, preprocessing
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display
import copy
np.random.seed(1)

**Load data**

In [73]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital', 'occupation', 'relationship', 'race', 'gender', 'capgain', 'caploss', 'hours', 'country', 'income']
df_train = pd.read_csv('adult.data', names=cols, sep=",")
df_test = pd.read_csv('adult.test', names=cols, sep=",")

**Pre-processing**

In [74]:
#{1: {'State-gov': 0, 'Self-emp-not-inc': 1, 'Private': 2, 'Federal-gov': 3, 'Local-gov': 4, '?': 5,
#'Self-emp-inc': 6, 'Without-pay': 7, 'Never-worked': 8}, 
#3: {'Bachelors': 0, 'HS-grad': 1, '11th': 2, 'Masters': 3, '9th': 4, 
#'Some-college': 5, 'Assoc-acdm': 6, 'Assoc-voc': 7, '7th-8th': 8, 'Doctorate': 9, 'Prof-school': 10, '5th-6th': 11, '10th': 12, '1st-4th': 13, 'Preschool': 14, '12th': 15}, 
#5: {'Never-married': 0, 'Married-civ-spouse': 1, 'Divorced': 2, 'Married-spouse-absent': 3, 'Separated': 4, 'Married-AF-spouse': 5, 'Widowed': 6}, 
#6: {'Adm-clerical': 0, 'Exec-managerial': 1, 'Handlers-cleaners': 2, 'Prof-specialty': 3, 'Other-service': 4, 'Sales': 5, 'Craft-repair': 6, 'Transport-moving': 7, 'Farming-fishing': 8, 'Machine-op-inspct': 9, 'Tech-support': 10, '?': 11, 'Protective-serv': 12, 'Armed-Forces': 13, 'Priv-house-serv': 14},
#7: {'Not-in-family': 0, 'Husband': 1, 'Wife': 2, 'Own-child': 3, 'Unmarried': 4, 'Other-relative': 5}, 
#8: {'White': 0, 'Black': 1, 'Asian-Pac-Islander': 2, 'Amer-Indian-Eskimo': 3, 'Other': 4}, 
#9: {'Male': 1, 'Female': 0},
#13: {'United-States': 0, 'Cuba': 1, 'Jamaica': 2, 'India': 3, '?': 4, 'Mexico': 5, 'South': 6, 'Puerto-Rico': 7, 'Honduras': 8, 'England': 9, 'Canada': 10, 'Germany': 11, 'Iran': 12, 'Philippines': 13, 'Italy': 14, 'Poland': 15, 'Columbia': 16, 'Cambodia': 17, 'Thailand': 18, 'Ecuador': 19, 'Laos': 20, 'Taiwan': 21, 'Haiti': 22, 'Portugal': 23, 'Dominican-Republic': 24, 'El-Salvador': 25, 'France': 26, 'Guatemala': 27, 'China': 28, 'Japan': 29, 'Yugoslavia': 30, 'Peru': 31, 'Outlying-US(Guam-USVI-etc)': 32, 'Scotland': 33, 'Trinadad&Tobago': 34, 'Greece': 35, 'Nicaragua': 36, 'Vietnam': 37, 'Hong': 38, 'Ireland': 39, 'Hungary': 40, 'Holand-Netherlands': 41}, 14: {'<=50K': 0, '>50K': 1}, 0: {'|1x3 Cross validator': 0, '': 1}}

In [75]:
 def preprocess(df):
    df.isin(['?']).sum(axis=0)

    # replace missing values (?) to nan and then drop the columns
    df['country'] = df['country'].replace('?',np.nan)
    df['workclass'] = df['workclass'].replace('?',np.nan)
    df['occupation'] = df['occupation'].replace('?',np.nan)

    # dropping the NaN rows now
    df.dropna(how='any',inplace=True)
            
    df['income'] = df['income'].map({'<=50K': 0, '>50K': 1}).astype(int)
    df['gender'] = df['gender'].map({'Male': 1, 'Female': 0}).astype(int)
    df['workclass'] = df['workclass'].map({'State-gov': 0, 'Self-emp-not-inc': 1, 'Private': 2, 'Federal-gov': 3, 'Local-gov': 4, '?': 5,
                                           'Self-emp-inc': 6, 'Without-pay': 7, 'Never-worked': 8}).astype(int)
    df['education'] = df['education'].map({'Bachelors': 0, 'HS-grad': 1, '11th': 2, 'Masters': 3, '9th': 4, 
                                           'Some-college': 5, 'Assoc-acdm': 6, 'Assoc-voc': 7, '7th-8th': 8, 'Doctorate': 9, 
                                           'Prof-school': 10, '5th-6th': 11, '10th': 12, '1st-4th': 13, 'Preschool': 14, '12th': 15}).astype(int)
    df['marital'] = df['marital'].map({'Never-married': 0, 'Married-civ-spouse': 1, 'Divorced': 2, 'Married-spouse-absent': 3, 
                                                     'Separated': 4, 'Married-AF-spouse': 5, 'Widowed': 6}).astype(int)
    df['occupation'] = df['occupation'].map({'Adm-clerical': 0, 'Exec-managerial': 1, 'Handlers-cleaners': 2, 
                                             'Prof-specialty': 3, 'Other-service': 4, 'Sales': 5, 'Craft-repair': 6, 'Transport-moving': 7, 'Farming-fishing': 8, 
                                             'Machine-op-inspct': 9, 'Tech-support': 10, '?': 11, 'Protective-serv': 12, 'Armed-Forces': 13, 'Priv-house-serv': 14}).astype(int)
    df['relationship'] = df['relationship'].map({'Not-in-family': 0, 'Husband': 1, 'Wife': 2, 
                                                 'Own-child': 3, 'Unmarried': 4, 'Other-relative': 5}).astype(int)
    df['race'] = df['race'].map({'White': 0, 'Black': 1, 'Asian-Pac-Islander': 2, 'Amer-Indian-Eskimo': 3, 'Other': 4}).astype(int)
    df['country'] = df['country'].map({'United-States': 0, 'Cuba': 1, 'Jamaica': 2, 'India': 3, '?': 4, 'Mexico': 5, 'South': 6, 'Puerto-Rico': 7, 
                                       'Honduras': 8, 'England': 9, 'Canada': 10, 'Germany': 11, 'Iran': 12, 'Philippines': 13, 'Italy': 14, 
                                       'Poland': 15, 'Columbia': 16, 'Cambodia': 17, 'Thailand': 18, 'Ecuador': 19, 'Laos': 20, 'Taiwan': 21, 
                                       'Haiti': 22, 'Portugal': 23, 'Dominican-Republic': 24, 'El-Salvador': 25, 'France': 26, 'Guatemala': 27, 
                                       'China': 28, 'Japan': 29, 'Yugoslavia': 30, 'Peru': 31, 'Outlying-US(Guam-USVI-etc)': 32, 'Scotland': 33,
                                       'Trinadad&Tobago': 34, 'Greece': 35, 'Nicaragua': 36, 'Vietnam': 37, 'Hong': 38, 'Ireland': 39, 'Hungary': 40, 
                                       'Holand-Netherlands': 41}).astype(int)
    
    
    labels = df['age']
    proc = []
    for v in labels:
            if v <= 30:
                proc.append(1)
            elif v <= 40:
                proc.append(2)
            elif v <= 50:
                proc.append(3)
            else:
                proc.append(4)
    df['age']=proc 
    
    labels = df['hours']
    proc=[]
    for v in labels:
        if v<=25:
            proc.append(1)
        elif v<=41:
            proc.append(2)
        elif v<=55:
            proc.append(3)
        else:
            proc.append(4)
    df['hours']=proc
    
    df = df.drop(['fnlwgt', 'education.num', 'capgain', 'caploss', 'country'], axis = 1, inplace = True) 

In [76]:
preprocess(df_train)
preprocess(df_test)

In [77]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [78]:
# df_train = df_train[1:100]
# df_test = df_test[1:100]

**Privileged, unprivileged**


In [79]:
# privileged, unprivileged groups
privileged_groups = [{'gender': 1}] # Male
unprivileged_groups = [{'gender': 0}] # Female

**Function to compute fairness metrics**

In [80]:
def get_metrics(test_df, y_pred, unprivileged_groups, privileged_groups):
    # BLD constructor is taking arguments of Structured dataset
    test_bld = BinaryLabelDataset(df=test_df, label_names=['income'], protected_attribute_names=['gender'])
    
    pred_data = test_bld.copy()
    pred_data.labels = y_pred

    metric_selection = ClassificationMetric(
                    test_bld, pred_data,
                    unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    
    tnr_diff = metric_selection.true_negative_rate(1) - metric_selection.true_negative_rate(0)
    
    return [metric_selection.true_positive_rate_difference(), \
        metric_selection.statistical_parity_difference(),\
        tnr_diff,\
        metric_selection.accuracy()]

**Train classifier**

In [81]:
X_train = df_train.drop(columns='income')
y_train = df_train['income']

X_test = df_test.drop(columns='income')
y_test = df_test['income']

clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metrics = get_metrics(df_test, y_pred, unprivileged_groups, privileged_groups)
print("Results: Original")
# 'TruePositiveRateDiff', 'StatisticalParityDiff', 'TrueNegativeRateDiff', 'Accuracy'
metrics

Results: Original


[-0.11157278063988774,
 -0.16672961606457132,
 -0.07813651283616496,
 0.8253652058432935]

**Fairness metric to retrieve, threshold on metric for iterations**

In [82]:
metricIndex = 1 #1=statistical parity
threshold = 0.0001

**Function to return the <attribute, val> pair removing which will result in the minimum parity difference**
Note that our search is for the pair that has the least absolute parity difference.

In [92]:
def getAttribute(X_train, y_train, X_test, y_test, f, attrList, attrVals):
    attrK = None
    attrKval = None
    f_curr = f
    indices = []
    
    cols = list(set(X_train.columns) - set(attrList))
    clf = RandomForestClassifier(max_depth=10, random_state=0)

    X_train_pred = copy.deepcopy(X_train)
    y_train_pred = copy.deepcopy(y_train)
    # tuples satisfying predicate
    for i in range(len(attrList)):
        print(i)
        X_train_pred = X_train_pred[X_train_pred[attrList[i]] == attrVals[i]]
        
    for col in cols:
        print(col)
        for val in X_train[col].unique():
            predIndices = X_train_pred[X_train_pred[col] == val].index        
            X_train_rest = X_train.drop(index=predIndices)
            y_train_rest = y_train.drop(index=predIndices)
        
#             print("#Rows left: ", len(X_train_rest))
            clf.fit(X_train_rest, y_train_rest)
            y_pred = clf.predict(X_test)

            f_i = get_metrics(pd.concat([X_test, y_test], axis=1), y_pred, privileged_groups, unprivileged_groups)[metricIndex]
            if ((abs(f_i) < abs(f_curr)) and (abs(f_i) > threshold)) : #closer to 0 implies fairer
                attrK = col
                attrKval = val
                f_curr = f_i
                print("Attribute passed: ", attrK)
                print("Attribute value passed: ", attrKval)
                print("Rows after removing predicate: ", len(X_train_rest))
                print("f_curr: ", f_curr)
    
    return [attrK, attrKval, f_curr]

**Function to get a set of predicates such that removing tuples that satisfy these predicates will decrease parity difference**

In [93]:
def getPredicates(X_train, y_train, X_test, y_test, f_0):    
    attrList = []
    attrVals = []
    found = True
    f_curr = f_0
    
    print("Size of X_train: ", len(X_train))
    
    depth=0
    k = len(list(set(X_train.columns) - set(attrList)))
    while (k > 0 and found):
        print("Depth: ", depth)
        depth += 1
        found = False
#         Testing on training data (could do on validation data)
        results = getAttribute(X_train, y_train, X_train, y_train, f_curr, attrList, attrVals)
        attrK = results[0]
        attrKval = results[1]
        f = results[2]
        
        if (attrK is not None):
            attrList.insert(len(attrList), attrK)
            attrVals.insert(len(attrVals), attrKval)
            f_curr = f
            
            print("Selected k: ", attrK)
            print("Selected k-val: ", attrKval)
            print("f: ", f)
            found = True
            
    return [attrList, attrVals]

**Get predicates on training data**

In [94]:
attrList, attrVals = getPredicates(X_train, y_train, X_test, y_test, metrics[metricIndex])
print(attrList)
print(attrVals)

Size of X_train:  30162
Depth:  0
marital
Attribute passed:  marital
Attribute value passed:  0
Rows after removing predicate:  20436
f_curr:  0.1347629751547424
Attribute passed:  marital
Attribute value passed:  1
Rows after removing predicate:  16097
f_curr:  0.03250703410903325
age
education
relationship
hours
gender
occupation
race
workclass
Selected k:  marital
Selected k-val:  1
f:  0.03250703410903325
Depth:  1
0
age
education
relationship
Attribute passed:  relationship
Attribute value passed:  1
Rows after removing predicate:  17708
f_curr:  0.03212775503021813
hours
gender
occupation
race
workclass
Selected k:  relationship
Selected k-val:  1
f:  0.03212775503021813
Depth:  2
0
1
age
education
hours
gender
Attribute passed:  gender
Attribute value passed:  1
Rows after removing predicate:  17709
f_curr:  0.023115618220082998
occupation
race
workclass
Selected k:  gender
Selected k-val:  1
f:  0.023115618220082998
Depth:  3
0
1
2
age
education
hours
occupation
race
workclass


In [16]:
print(attrList)

['marital', 'relationship', 'gender']


In [45]:
# attrList = ['marital', 'relationship', 'gender']
# attrVals = [1, 1, 1]

In [64]:
# print(len(X_train) - len(X_train[(X_train['marital']==1)]))
# print(len(X_train) - len(X_train[(X_train['marital']==1) & (X_train['relationship']==1)]))
# print(len(X_train) - len(X_train[(X_train['marital']==1) & (X_train['relationship']==1) & (X_train['gender']==1)]))

16097
17708
17709


**Testing on test data**

Compare initial fairness to final fairness

In [62]:
print("Initial #rows: ", len(X_train))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(get_metrics(pd.concat([X_test, y_test], axis=1), y_pred, privileged_groups, unprivileged_groups)[metricIndex])

X_train_temp = copy.deepcopy(X_train)
y_train_temp = copy.deepcopy(y_train)
removeTupleIndices = list(range(len(X_train)))
for i in range(len(attrList)):
    col = attrList[i]
    val = attrVals[i]
#     print(col)
#     print(val)
    predIndices = X_train_temp[X_train_temp[col] == val].index
    X_train_temp = X_train_temp[X_train_temp[col] == val]
    X_train_rest = X_train.drop(index=predIndices)
    y_train_rest = y_train.drop(index=predIndices)
        
    print("#Rows left: ", len(X_train_rest))
    clf.fit(X_train_rest, y_train_rest)
    y_pred = clf.predict(X_test)

    print(get_metrics(pd.concat([X_test, y_test], axis=1), y_pred, privileged_groups, unprivileged_groups)[metricIndex])

Initial #rows:  30162
0.16672961606457132
marital
1
#Rows left:  16097
0.03377930419174387
relationship
1
#Rows left:  17708
0.038582701978855063
gender
1
#Rows left:  17709
0.027478600698372227


**Top-k heuristic**

**<Attribute, metric> for each attribute**

Remove tuples satisfying attribute=1 and record new fairness metric

In [65]:
K = X_train.columns
topKattrList = []
for col in K:
    for val in X_train[col].unique():
        removeTupleIndices = X_train[X_train[col] == val].index

        X_train_temp = X_train.drop(removeTupleIndices, inplace = False)
        y_train_temp = y_train.drop(removeTupleIndices, inplace = False)

        clf.fit(X_train_temp, y_train_temp)
        y_pred = clf.predict(X_test)

        f_i = get_metrics(pd.concat([X_test, y_test], axis=1), y_pred, privileged_groups, unprivileged_groups)[metricIndex]
        topKattrList.insert(len(attrList), [col, val, abs(f_i)])

In [66]:
print(topKattrList)

[['age', 2, 0.1584974636330573], ['age', 3, 0.14935293441648956], ['age', 4, 0.16405531140835458], ['hours', 4, 0.16650031831085685], ['hours', 3, 0.14954305637517262], ['hours', 1, 0.18019894844784315], ['hours', 2, 0.08574871433485667], ['gender', 0, 0.17404620228378637], ['gender', 1, 0.03138871814531957], ['race', 4, 0.16833415873971969], ['race', 3, 0.16732736287263167], ['race', 2, 0.16478631208553618], ['race', 1, 0.16841983197094307], ['race', 0, 0.1874438227022669], ['relationship', 5, 0.16196248547531822], ['relationship', 4, 0.16548904922190916], ['relationship', 3, 0.1709113363096373], ['relationship', 2, 0.23439754758319545], ['relationship', 1, 0.04613277834357236], ['relationship', 0, 0.1306857382915273], ['occupation', 14, 0.1682162903466809], ['occupation', 13, 0.1706459117731007], ['occupation', 12, 0.16759726063102798], ['occupation', 6, 0.16987234528073392], ['occupation', 10, 0.1617356951329601], ['occupation', 9, 0.16419500430983894], ['occupation', 8, 0.171684902

**Sort <attribute, metric> pairs in increasing order of metric** 

In [67]:
df = pd.DataFrame(topKattrList, columns = ['col', 'val', 'fval'])
df_sorted = df.sort_values(by=['fval'])

In [68]:
df_sorted

Unnamed: 0,col,val,fval
8,gender,1,0.031389
39,marital,1,0.033779
18,relationship,1,0.046133
6,hours,2,0.085749
30,occupation,3,0.115731
...,...,...,...
31,occupation,2,0.187301
13,race,0,0.187444
29,occupation,4,0.194998
17,relationship,2,0.234398


**Top-k heuristic**

Remove tuples satisfying predicates in increasing order of parity difference

In [71]:
def topkAttributes(X_train, y_train, df_sorted, k_num):
    X_train_temp = copy.deepcopy(X_train)
    y_train_temp = copy.deepcopy(y_train)
    for k in range(k_num):
        col = df_sorted.iloc[k]['col']
        val = df_sorted.iloc[k]['val']
        predIndices = X_train_temp[X_train_temp[col] == val].index        
        X_train_temp = X_train_temp[X_train_temp[col] == val]
        X_train_rest = X_train.drop(index=predIndices)
        y_train_rest = y_train.drop(index=predIndices)
        
        print("#Rows left: ", len(X_train_rest))
        clf.fit(X_train_rest, y_train_rest)
        y_pred = clf.predict(X_test)

        print(col)
        print(val)
        print(get_metrics(pd.concat([X_test, y_test], axis=1), y_pred, privileged_groups, unprivileged_groups)[metricIndex])
        
topkAttributes(X_train, y_train, df_sorted, 5)

#Rows left:  9782
gender
1
-0.03138871814531957
#Rows left:  17577
marital
1
0.09872474863752784
#Rows left:  17709
relationship
1
0.027478600698372227
#Rows left:  23466
hours
2
0.07422607595077377
#Rows left:  29367
occupation
3
0.1429208024494641
