In [2]:
import sys
import numpy as np
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.algorithms.preprocessing.optim_preproc import OptimPreproc
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult
from aif360.algorithms.preprocessing.optim_preproc_helpers.distortion_functions import get_distortion_adult
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics, preprocessing
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display
np.random.seed(1)

In [3]:
# Import the clean the adult dataset

df = pd.read_csv('adult.csv')

# df = df.sample(n = 5000)
# print(df.shape)

# check columns that have missing values\n
df.isin(['?']).sum(axis=0)

# replace missing values (?) to nan and then drop the columns
df['native.country'] = df['native.country'].replace('?',np.nan)
df['workclass'] = df['workclass'].replace('?',np.nan)
df['occupation'] = df['occupation'].replace('?',np.nan)

# dropping the NaN rows now
df.dropna(how='any',inplace=True)

df['income'] = df['income'].map({'<=50K': 0, '>50K': 1}).astype(int)
df['sex'] = df['sex'].map({'Male': 1, 'Female': 0}).astype(int)

In [4]:
# Data pre-processing (using one-hot encoding)

# For each category we made a separate column
df = pd.concat([df, pd.get_dummies(df['race'], prefix='race')],axis=1)
df = pd.concat([df, pd.get_dummies(df['marital.status'], prefix='marital')],axis=1)
df = pd.concat([df, pd.get_dummies(df['workclass'], prefix='workclass')],axis=1)
df = pd.concat([df, pd.get_dummies(df['relationship'], prefix='relationship')],axis=1)
df = pd.concat([df, pd.get_dummies(df['education'], prefix='education')],axis=1)
df = pd.concat([df, pd.get_dummies(df['occupation'], prefix='occupation')],axis=1)

# age binning
df['ageUnder18'] = np.where(df['age'] < 18, 1, 0)
df['age18to24'] = np.where((df['age'] >= 18) & (df['age'] <= 24), 1, 0)
df['age25to44'] = np.where((df['age'] >= 25) & (df['age'] <= 44), 1, 0)
df['age45to64'] = np.where((df['age'] >= 45) & (df['age'] <= 64), 1, 0)
df['ageAbove65'] = np.where(df['age'] >= 65, 1, 0)

# privileged, unprivileged groups
privileged_groups = [{'sex': 1}] # Male
unprivileged_groups = [{'sex': 0}] # Female

df = df.drop(columns=['workclass', 'fnlwgt', 'education', 'education.num', 'occupation', \
                      'relationship', 'marital.status', 'race', 'native.country', 'capital.gain', \
                      'capital.loss', 'hours.per.week', 'age'])
#
#df.columns[1]

In [5]:
# Computes fairness of data and returns the fairness metrics in a form of array

def get_metrics(test_df, y_pred, unprivileged_groups, privileged_groups):
    # BLD constructor is taking arguments of Structured dataset
    test_bld = BinaryLabelDataset(df=test_df, label_names=['income'], protected_attribute_names=['sex'])
    
    # Made a copy of the the bld dataset
    pred_data = test_bld.copy()
    pred_data.labels = y_pred

    # Using classification metric because we have 2 binary label datasets
    metric_selection = ClassificationMetric(
                    test_bld, pred_data,
                    unprivileged_groups=unprivileged_groups,
                    privileged_groups=privileged_groups)
    
    tnr_diff = metric_selection.true_negative_rate(1) - metric_selection.true_negative_rate(0)
    
    return [metric_selection.true_positive_rate_difference(), \
        metric_selection.statistical_parity_difference(),\
        tnr_diff,\
        metric_selection.accuracy()]

In [6]:
# Compute fairness over all the data

X = df.drop(columns='income')
y = df['income']

num_k = 5 # number of folds for cross-validation

# Metrics for each set of (train and test) sample
metrics = [None] * num_k

# Regression on original data
k_fold = KFold(n_splits=num_k, random_state=None, shuffle=False)
for k, (train, test) in enumerate(k_fold.split(X, y)):
    # Here train and test are the list of indices on which split is done
   
    # take out test set from X
    test_df = X.iloc[test].copy()
    test_df['income'] = y.iloc[test]
    
    reg = LogisticRegression(max_iter=300, solver = 'lbfgs')
    reg.fit(X.iloc[train], y.iloc[train])
    y_pred = reg.predict(X.iloc[test])
    metrics[k] = get_metrics(test_df, y_pred, unprivileged_groups, privileged_groups)

print("Results: Original")
mf_orig = pd.DataFrame(metrics, columns = ['TruePositiveRateDiff', 'StatisticalParityDiff', \
                                      'TrueNegativeRateDiff', 'Accuracy'])
mf_orig.mean()


Results: Original


TruePositiveRateDiff    -0.135077
StatisticalParityDiff   -0.181838
TrueNegativeRateDiff    -0.088531
Accuracy                 0.825942
dtype: float64

In [12]:
# Try to find predicates for which the fairness of the model increases
train, test = train_test_split(df, test_size = 0.3)


X_train = train.drop(columns='income')
y_train = train['income']

'''
col_rem = X_train.columns[1]
col_rem_indices = X_train[X_train[col_rem] == 1].index
X_train_temp = X_train.drop(col_rem_indices, inplace = False)
'''

X_test = test.drop(columns='income')
y_test = test['income']

# Taking three separate lists for candidate attribute, value, and fairness
# considering only statistical parity

attributes = []
values = []
infs = []

reg = LogisticRegression(max_iter=200, solver = 'lbfgs')

for i in range(0, len(X_train.columns)):
    col_rem = X_train.columns[i]
    for j in range(0,2):
        
        # Remove the preduicates in training and test set
        col_train_rem_indices = X_train[X_train[col_rem] == j].index
        
        X_train_temp = X_train.drop(col_train_rem_indices, inplace = False)
        y_train_temp = y_train.drop(col_train_rem_indices, inplace = False)
        
        
        col_test_rem_indices = X_test[X_test[col_rem] == j].index
        
        X_test_temp = X_test.drop(col_test_rem_indices, inplace = False)
        y_test_temp = y_test.drop(col_test_rem_indices, inplace = False)
        
        # Make a validation set
        validation_set = test.drop(col_test_rem_indices, inplace = False)
        
        reg.fit(X_train_temp, y_train_temp)
        
        y_temp_predict = reg.predict(X_test_temp)
        
        # Get the metrics
        metrics = get_metrics(validation_set, y_temp_predict, privileged_groups, unprivileged_groups)
        
        # Take the Statistical parity diff
        inf = mf_orig.mean()[1] - metrics[1]
        
        if (inf < 0) :
            attributes.append(col_rem)
            values.append(j)
            infs.append(inf)



  TPR=TP / P, TNR=TN / N, FPR=FP / N, FNR=FN / P,
  GTPR=GTP / P, GTNR=GTN / N, GFPR=GFP / N, GFNR=GFN / P,
  / self.num_instances(privileged=privileged))
  TPR=TP / P, TNR=TN / N, FPR=FP / N, FNR=FN / P,
  GTPR=GTP / P, GTNR=GTN / N, GFPR=GFP / N, GFNR=GFN / P,
  / self.num_instances(privileged=privileged))


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

Int64Index([ 5798, 32434, 17838, 13370, 20170, 12187,  3494, 14011, 20876,
             5272,
            ...
            19077, 31792,    53, 20012,  6882, 17903, 18896, 12456, 23257,
             3735],
           dtype='int64', length=20908)
