In [1]:
import sys
import numpy as np
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.algorithms.preprocessing.optim_preproc import OptimPreproc
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult
from aif360.algorithms.preprocessing.optim_preproc_helpers.distortion_functions import get_distortion_adult
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics, preprocessing
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display
import copy
np.random.seed(1)

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[LFR]'


In [2]:
# df = pd.read_csv('adult-small.csv')
df = pd.read_csv('adult.csv')

# check columns that have missing values\n
df.isin(['?']).sum(axis=0)

# replace missing values (?) to nan and then drop the columns
df['native.country'] = df['native.country'].replace('?',np.nan)
df['workclass'] = df['workclass'].replace('?',np.nan)
df['occupation'] = df['occupation'].replace('?',np.nan)

# dropping the NaN rows now
df.dropna(how='any',inplace=True)

df['income'] = df['income'].map({'<=50K': 0, '>50K': 1}).astype(int)
df['sex'] = df['sex'].map({'Male': 1, 'Female': 0}).astype(int)

In [3]:
# Data pre-processing (using one-hot encoding)

# For each category we made a separate column
df = pd.concat([df, pd.get_dummies(df['race'], prefix='race')],axis=1)
df = pd.concat([df, pd.get_dummies(df['marital.status'], prefix='marital')],axis=1)
df = pd.concat([df, pd.get_dummies(df['workclass'], prefix='workclass')],axis=1)
df = pd.concat([df, pd.get_dummies(df['relationship'], prefix='relationship')],axis=1)
df = pd.concat([df, pd.get_dummies(df['education'], prefix='education')],axis=1)
df = pd.concat([df, pd.get_dummies(df['occupation'], prefix='occupation')],axis=1)

# age binning
df['ageUnder18'] = np.where(df['age'] < 18, 1, 0)
df['age18to24'] = np.where((df['age'] >= 18) & (df['age'] <= 24), 1, 0)
df['age25to44'] = np.where((df['age'] >= 25) & (df['age'] <= 44), 1, 0)
df['age45to64'] = np.where((df['age'] >= 45) & (df['age'] <= 64), 1, 0)
df['ageAbove65'] = np.where(df['age'] >= 65, 1, 0)

# privileged, unprivileged groups
privileged_groups = [{'sex': 1}] # Male
unprivileged_groups = [{'sex': 0}] # Female

df = df.drop(columns=['workclass', 'fnlwgt', 'education', 'education.num', 'occupation', \
                      'relationship', 'marital.status', 'race', 'native.country', 'capital.gain', \
                      'capital.loss', 'hours.per.week', 'age'])

In [4]:
def get_metrics(test_df, y_pred, unprivileged_groups, privileged_groups):
    # BLD constructor is taking arguments of Structured dataset
    test_bld = BinaryLabelDataset(df=test_df, label_names=['income'], protected_attribute_names=['sex'])
    
    # Made a copy of the the bld dataset
    pred_data = test_bld.copy()
    pred_data.labels = y_pred

    metric_selection = ClassificationMetric(
                    test_bld, pred_data,
                    unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    
    tnr_diff = metric_selection.true_negative_rate(1) - metric_selection.true_negative_rate(0)
    
    return [metric_selection.true_positive_rate_difference(), \
        metric_selection.statistical_parity_difference(),\
        tnr_diff,\
        metric_selection.accuracy()]

In [5]:
# k-fold cross-validation to compute fairness over entire data

X = df.drop(columns='income')
y = df['income']

num_k = 5 # number of folds for cross-validation

# Metrics for each set of (train and test) sample
metrics = [None] * num_k

# Regression on original data
k_fold = KFold(n_splits=num_k, random_state=None, shuffle=False)
for k, (train, test) in enumerate(k_fold.split(X, y)):
    # Here train and test are the list of indices on which split is done
    # take out test set from X
    test_df = X.iloc[test].copy()
    test_df['income'] = y.iloc[test]
    
    reg = LogisticRegression(max_iter=300, solver = 'lbfgs')
    reg.fit(X.iloc[train], y.iloc[train])
    y_pred = reg.predict(X.iloc[test])
    metrics[k] = get_metrics(test_df, y_pred, unprivileged_groups, privileged_groups)

print("Results: Original")
mf_orig = pd.DataFrame(metrics, columns = ['TruePositiveRateDiff', 'StatisticalParityDiff', \
                                      'TrueNegativeRateDiff', 'Accuracy'])
mf_orig.mean()


Results: Original


TruePositiveRateDiff    -0.135077
StatisticalParityDiff   -0.181838
TrueNegativeRateDiff    -0.088531
Accuracy                 0.825942
dtype: float64

In [6]:
metricIndex = 1
threshold = 0.0001

In [7]:
def getPredicates(X_train, y_train, X_test, y_test, f_0):    
    attrList = []
    found = True
    f_curr = f_0
    
    K = X_train.columns
    while (len(K) > 0 and found):
        found = False
        results = getAttribute(X_train, y_train, K, X_test, y_test, f_curr)
        attrK = results[0]
        f = results[1]
        indices = results[2]
        print("Selected k: ", attrK)
        print("f: ", f)

        if (attrK is not None):
            K = K.drop(attrK)
            attrList.insert(len(attrList), attrK)
            f_curr = f
            X_train = X_train.drop(indices)
            y_train = y_train.drop(indices)
            found = True
            
    return attrList

In [8]:
# binary attributes
def getAttribute(X_train, y_train, K, X_test, y_test, f):
    attrK = None
    f_curr = f
    indices = []
#     removeTupleIndices = []
    
    reg = LogisticRegression(max_iter=300, solver = 'lbfgs')
    for col in K:
        removeTupleIndices = X_train[X_train[col] == 1].index

        X_train_temp = X_train.drop(removeTupleIndices, inplace = False)
        y_train_temp = y_train.drop(removeTupleIndices, inplace = False)

        reg.fit(X_train_temp, y_train_temp)
        y_pred = reg.predict(X_test)

        f_i = get_metrics(pd.concat([X_test, y_test], axis=1), y_pred, privileged_groups, unprivileged_groups)[metricIndex]
        if ((abs(f_i) < abs(f_curr)) and (abs(f_i) > threshold)) : #closer to 0 implies fairer
            attrK = col
            f_curr = f_i
            indices = copy.deepcopy(removeTupleIndices)
            print("attrK: ", attrK)
            print("f_curr: ", f_curr)
    
    return [attrK, f_curr, indices]

In [9]:
train, test = train_test_split(df, test_size = 0.2)

X_train = train.drop(columns='income')
y_train = train['income']

X_test = test.drop(columns='income')
y_test = test['income']

reg = LogisticRegression(max_iter=300, solver = 'lbfgs')
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
f_orig = get_metrics(pd.concat([X_test, y_test], axis=1), y_pred, unprivileged_groups, privileged_groups)

print(f_orig)

[-0.10686055672268907, -0.16753487179888252, -0.07622049167709677, 0.8312613956572187]


In [10]:
attrList = getPredicates(X_train, y_train, X_test, y_test, f_orig[metricIndex])

attrK:  sex
f_curr:  0.045699596819462854
Selected k:  sex
f:  0.045699596819462854
attrK:  race_Black
f_curr:  0.04347480014420749
attrK:  race_White
f_curr:  0.029787828796742875
attrK:  marital_Married-civ-spouse
f_curr:  -0.010756652882522613
attrK:  workclass_Private
f_curr:  -0.007870358573597164
Selected k:  workclass_Private
f:  -0.007870358573597164
attrK:  race_White
f_curr:  -0.005670664497331168
attrK:  marital_Married-civ-spouse
f_curr:  0.002508746174376529
attrK:  workclass_State-gov
f_curr:  -0.0021318287583951823
Selected k:  workclass_State-gov
f:  -0.0021318287583951823
attrK:  race_Other
f_curr:  -0.0016439048745210683
attrK:  marital_Married-civ-spouse
f_curr:  0.0005279114410582145
attrK:  education_10th
f_curr:  -0.00020927242072056595
Selected k:  education_10th
f:  -0.00020927242072056595
attrK:  marital_Married-civ-spouse
f_curr:  -0.0002039743847529591
Selected k:  marital_Married-civ-spouse
f:  -0.0002039743847529591
attrK:  workclass_Federal-gov
f_curr:  0.

In [11]:
attrList

['sex',
 'workclass_Private',
 'workclass_State-gov',
 'education_10th',
 'marital_Married-civ-spouse',
 'workclass_Federal-gov']