In [1]:
import sys
import numpy as np
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.algorithms.preprocessing.optim_preproc import OptimPreproc
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult
from aif360.algorithms.preprocessing.optim_preproc_helpers.distortion_functions import get_distortion_adult
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics, preprocessing
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display
np.random.seed(1)

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[LFR]'


In [2]:
# Import the clean the adult dataset

df = pd.read_csv('adult.csv')

# df = df.sample(n = 5000)
# print(df.shape)

# check columns that have missing values\n
df.isin(['?']).sum(axis=0)

# replace missing values (?) to nan and then drop the columns
df['native.country'] = df['native.country'].replace('?',np.nan)
df['workclass'] = df['workclass'].replace('?',np.nan)
df['occupation'] = df['occupation'].replace('?',np.nan)

# dropping the NaN rows now
df.dropna(how='any',inplace=True)

df['income'] = df['income'].map({'<=50K': 0, '>50K': 1}).astype(int)
df['sex'] = df['sex'].map({'Male': 1, 'Female': 0}).astype(int)

In [3]:
# Data pre-processing (using one-hot encoding)

# For each category we made a separate column
df = pd.concat([df, pd.get_dummies(df['race'], prefix='race')],axis=1)
df = pd.concat([df, pd.get_dummies(df['marital.status'], prefix='marital')],axis=1)
df = pd.concat([df, pd.get_dummies(df['workclass'], prefix='workclass')],axis=1)
df = pd.concat([df, pd.get_dummies(df['relationship'], prefix='relationship')],axis=1)
df = pd.concat([df, pd.get_dummies(df['education'], prefix='education')],axis=1)
df = pd.concat([df, pd.get_dummies(df['occupation'], prefix='occupation')],axis=1)

# age binning
df['ageUnder18'] = np.where(df['age'] < 18, 1, 0)
df['age18to24'] = np.where((df['age'] >= 18) & (df['age'] <= 24), 1, 0)
df['age25to44'] = np.where((df['age'] >= 25) & (df['age'] <= 44), 1, 0)
df['age45to64'] = np.where((df['age'] >= 45) & (df['age'] <= 64), 1, 0)
df['ageAbove65'] = np.where(df['age'] >= 65, 1, 0)

# privileged, unprivileged groups
privileged_groups = [{'sex': 1}] # Male
unprivileged_groups = [{'sex': 0}] # Female

df = df.drop(columns=['workclass', 'fnlwgt', 'education', 'education.num', 'occupation', \
                      'relationship', 'marital.status', 'race', 'native.country', 'capital.gain', \
                      'capital.loss', 'hours.per.week', 'age'])
#
#df.columns[1]

In [4]:
# Computes fairness of data and returns the fairness metrics in a form of array

def get_metrics(test_df, y_pred, unprivileged_groups, privileged_groups):
    # BLD constructor is taking arguments of Structured dataset
    test_bld = BinaryLabelDataset(df=test_df, label_names=['income'], protected_attribute_names=['sex'])
    
    # Made a copy of the the bld dataset
    pred_data = test_bld.copy()
    pred_data.labels = y_pred

    # Using classification metric because we have 2 binary label datasets
    metric_selection = ClassificationMetric(
                    test_bld, pred_data,
                    unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    
    tnr_diff = metric_selection.true_negative_rate(1) - metric_selection.true_negative_rate(0)
    
    return [metric_selection.true_positive_rate_difference(), \
        metric_selection.statistical_parity_difference(),\
        tnr_diff,\
        metric_selection.accuracy()]

In [5]:
# Compute fairness over all the data

X = df.drop(columns='income')
y = df['income']

num_k = 5 # number of folds for cross-validation

# Metrics for each set of (train and test) sample
metrics = [None] * num_k

# Regression on original data
k_fold = KFold(n_splits=num_k, random_state=None, shuffle=False)
for k, (train, test) in enumerate(k_fold.split(X, y)):
    # Here train and test are the list of indices on which split is done
   
    # take out test set from X
    test_df = X.iloc[test].copy()
    test_df['income'] = y.iloc[test]
    
    reg = LogisticRegression(max_iter=300, solver = 'lbfgs')
    reg.fit(X.iloc[train], y.iloc[train])
    y_pred = reg.predict(X.iloc[test])
    metrics[k] = get_metrics(test_df, y_pred, unprivileged_groups, privileged_groups)

print("Results: Original")
mf_orig = pd.DataFrame(metrics, columns = ['TruePositiveRateDiff', 'StatisticalParityDiff', \
                                      'TrueNegativeRateDiff', 'Accuracy'])
mf_orig.mean()


Results: Original


TruePositiveRateDiff    -0.135077
StatisticalParityDiff   -0.181838
TrueNegativeRateDiff    -0.088531
Accuracy                 0.825942
dtype: float64

In [13]:
train, test = train_test_split(df, test_size = 0.3)


X_train = train.drop(columns='income')
y_train = train['income']

X_test = test.drop(columns='income')
y_test = test['income']

In [18]:
X_train['workclass_Without-pay'].unique()

array([0, 1], dtype=uint8)

In [20]:
# Try to find predicates for which the fairness of the model increases

# Taking three separate lists for candidate attribute, value, and fairness
# considering only statistical parity

attributes = []
values = []
infs = []

reg = LogisticRegression(max_iter=200, solver = 'lbfgs')
for i in range(0, len(X_train.columns)):
    print(X_train.columns[i])
    col_rem = X_train.columns[i]
    
    # Remove the predicate in training set
    col_train_rem_indices = X_train[X_train[col_rem] == 1].index

    X_train_temp = X_train.drop(col_train_rem_indices, inplace = False)
    y_train_temp = y_train.drop(col_train_rem_indices, inplace = False)

    reg.fit(X_train_temp, y_train_temp)

    y_temp_predict = reg.predict(X_test)

    # Get the metrics
    metrics = get_metrics(test, y_temp_predict, privileged_groups, unprivileged_groups)

    # Take the Statistical parity diff
    inf = mf_orig.mean()[1] - metrics[1]

    if (inf < 0) :
        attributes.append(col_rem)
        values.append(j)
        infs.append(inf)



sex
race_Amer-Indian-Eskimo
race_Asian-Pac-Islander
race_Black
race_Other
race_White
marital_Divorced
marital_Married-AF-spouse
marital_Married-civ-spouse
marital_Married-spouse-absent
marital_Never-married
marital_Separated
marital_Widowed
workclass_Federal-gov
workclass_Local-gov
workclass_Private
workclass_Self-emp-inc
workclass_Self-emp-not-inc
workclass_State-gov
workclass_Without-pay
relationship_Husband
relationship_Not-in-family
relationship_Other-relative
relationship_Own-child
relationship_Unmarried
relationship_Wife
education_10th
education_11th
education_12th
education_1st-4th
education_5th-6th
education_7th-8th
education_9th
education_Assoc-acdm
education_Assoc-voc
education_Bachelors
education_Doctorate
education_HS-grad
education_Masters
education_Preschool
education_Prof-school
education_Some-college
occupation_Adm-clerical
occupation_Armed-Forces
occupation_Craft-repair
occupation_Exec-managerial
occupation_Farming-fishing
occupation_Handlers-cleaners
occupation_Machin

In [21]:
infs

[-0.26617622686062253,
 -0.37406117640269737,
 -0.37325901570285736,
 -0.3719346630324497,
 -0.3747205895250185,
 -0.31721171878270127,
 -0.3688899340212898,
 -0.3748964956571295,
 -0.20975501197318314,
 -0.3763691223308213,
 -0.38787569341684913,
 -0.3697805175333754,
 -0.3737314698415368,
 -0.36486255124479355,
 -0.37011575052030127,
 -0.32887479824640364,
 -0.3656978704992257,
 -0.37967171436819236,
 -0.3775120424433526,
 -0.36912110441105417,
 -0.29588203642728556,
 -0.371907030903623,
 -0.36813198472757247,
 -0.37637464875658666,
 -0.3743798301123272,
 -0.4324525068112251,
 -0.3714235239134128,
 -0.37488544280559877,
 -0.3739018495478824,
 -0.3706047839362767,
 -0.3752206757925247,
 -0.372082937035734,
 -0.37339623685461093,
 -0.36470875081574394,
 -0.36765953058889306,
 -0.3063682082599998,
 -0.36287878545206487,
 -0.37769347500122896,
 -0.34747652631552906,
 -0.36912110441105417,
 -0.3612357790720273,
 -0.3670830138530521,
 -0.37178638902916544,
 -0.3687913978498937,
 -0.3740667