# COMPAS Data Analysis Notebook

In this notebook, we replicate the Fairkit evaluation on the ProPublica COMPAS dataset.

In [7]:
# Load all necessary packages
import numpy as np
import sklearn as skl
import six
import tensorflow as tf

# datasets
from aif360.datasets import CompasDataset, BinaryLabelDataset

# metrics
from fklearn.metric_library import UnifiedMetricLibrary, classifier_quality_score

# models
from fklearn.scikit_learn_wrapper import LogisticRegression, KNeighborsClassifier, RandomForestClassifier, SVC
from aif360.algorithms.inprocessing import AdversarialDebiasing

# pre/post-processing algorithms
from aif360.algorithms.preprocessing import DisparateImpactRemover, Reweighing
from aif360.algorithms.postprocessing import CalibratedEqOddsPostprocessing, RejectOptionClassification

# model search
from fklearn.fair_selection_aif import ModelSearch, DEFAULT_ADB_PARAMS

In [8]:
# Load the COMPAS dataset using the loader's default arguments
dataset = CompasDataset()

# Protected groups compared by the fairness metrics; each group is defined
# jointly on the 'race' and 'sex' attributes.
unprivileged = [
    {'race': 0, 'sex': 0},
]
privileged = [
    {'race': 1, 'sex': 1},
]



## Without Fairkit

First, we try to train a fair model without access to Fairkit. For each model type, we:
1. Naively train a model for accuracy
2. Train a regularized variant of the same model
3. Train a model using k-fold cross-validation

We do this for logistic regression and random forest models.

In [3]:
# Helper function for calculating Fairness metrics
def getFairnessMetrics(metric_library):
    """Collect accuracy and four fairness metrics from a metric library.

    Args:
        metric_library: Object exposing the zero-argument methods
            ``accuracy_score``, ``equal_opportunity_difference``,
            ``average_odds_difference``, ``statistical_parity_difference``
            and ``disparate_impact``.

    Returns:
        A ``(labels, scores)`` tuple of two parallel lists: the short metric
        labels and the corresponding values, accuracy first.
    """
    # (report label, metric_library method name), in reporting order.
    # The "disperate_impact" spelling is kept as-is: it is the emitted label.
    reported_metrics = (
        ("acc", "accuracy_score"),
        ("eq_opp_diff", "equal_opportunity_difference"),
        ("avg_odds_diff", "average_odds_difference"),
        ("stat_parity_diff", "statistical_parity_difference"),
        ("disperate_impact", "disparate_impact"),
    )

    labels = [label for label, _ in reported_metrics]
    # Methods are invoked one at a time, in the order listed above
    scores = [getattr(metric_library, method_name)() for _, method_name in reported_metrics]

    return (labels, scores)

In [4]:
# Plain scikit-learn estimators for the baseline experiments.
# NOTE: these imports rebind LogisticRegression / RandomForestClassifier, which
# the first cell imported from fklearn.scikit_learn_wrapper.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Split at the 70% mark without shuffling: one training and one testing part
test_frac = 0.3
dataset_train, dataset_test = dataset.split([1 - test_frac], shuffle=False)

# Training inputs: the feature matrix and the labels flattened to 1-D
X = dataset_train.features
y_true = dataset_train.labels.ravel()

### Logistic Regression

Training logistic regression models and collecting their accuracy and fairness metrics

In [5]:
# Baseline logistic regression: liblinear solver, no explicit C
logisticRegressionModel = LogisticRegression(
    random_state=0,
    solver='liblinear',
)
logisticRegressionModel.fit(X, y_true)

# Copy the test split and replace its labels with the model's predictions,
# reshaped into a single column.
naive_predictions = dataset_test.copy()
naive_predictions.labels = (
    logisticRegressionModel
    .predict(dataset_test.features)
    .reshape(-1, 1)
)

# Compare predictions against the original test split and report the metrics
metric_library_naive = UnifiedMetricLibrary(
    dataset_test,
    naive_predictions,
    unprivileged_groups=unprivileged,
    privileged_groups=privileged,
)
metrics_array = getFairnessMetrics(metric_library_naive)
print(metrics_array)

(['acc', 'eq_opp_diff', 'avg_odds_diff', 'stat_parity_diff', 'disperate_impact'], [0.6720691518098325, 0.21507483962936558, 0.3566513783752062, 0.37391838478795003, 0.5642525909557747])


In [6]:
# Logistic regression with an explicit regularization setting (C=0.1)
logisticRegressionModel = LogisticRegression(
    random_state=0,
    solver='liblinear',
    C=0.1,
)
logisticRegressionModel.fit(X, y_true)

# Copy the test split and replace its labels with the model's predictions,
# reshaped into a single column.
regularized_predictions = dataset_test.copy()
regularized_predictions.labels = (
    logisticRegressionModel
    .predict(dataset_test.features)
    .reshape(-1, 1)
)

# Compare predictions against the original test split and report the metrics
metric_library_regularized = UnifiedMetricLibrary(
    dataset_test,
    regularized_predictions,
    unprivileged_groups=unprivileged,
    privileged_groups=privileged,
)
metrics_array = getFairnessMetrics(metric_library_regularized)
print(metrics_array)

(['acc', 'eq_opp_diff', 'avg_odds_diff', 'stat_parity_diff', 'disperate_impact'], [0.6758508914100486, 0.24768353528153952, 0.3748641231478581, 0.39616493964320054, 0.5489814533292794])


In [7]:
# Logistic regression chosen via 10-fold cross-validation on the training data
from sklearn.model_selection import cross_validate

logisticRegressionModel = LogisticRegression(
    random_state=0,
    solver='liblinear',
    C=0.1,
)
cv_results = cross_validate(
    logisticRegressionModel,
    X,
    y_true,
    cv=10,
    return_estimator=True,
)

# Keep the estimator fitted on the fold with the highest test score
# (the first such fold when several tie).
cv_model = cv_results['estimator'][
    list(cv_results['test_score']).index(max(cv_results['test_score']))
]

# Copy the test split and replace its labels with the selected model's
# predictions, reshaped into a single column.
cv_predictions = dataset_test.copy()
cv_predictions.labels = (
    cv_model
    .predict(dataset_test.features)
    .reshape(-1, 1)
)

# Compare predictions against the original test split and report the metrics
metric_library_cv = UnifiedMetricLibrary(
    dataset_test,
    cv_predictions,
    unprivileged_groups=unprivileged,
    privileged_groups=privileged,
)
metrics_array = getFairnessMetrics(metric_library_cv)
print(metrics_array)

(['acc', 'eq_opp_diff', 'avg_odds_diff', 'stat_parity_diff', 'disperate_impact'], [0.6753106428957321, 0.26265146115466864, 0.38234808608442267, 0.40489798098493746, 0.5425580062154904])


### Random Forest

Training random forest models and collecting their accuracy and fairness metrics

In [8]:
# Baseline random forest: 100 trees, no depth limit specified
RandomForestModel = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)
RandomForestModel.fit(X, y_true)

# Copy the test split and replace its labels with the model's predictions,
# reshaped into a single column.
naive_predictions = dataset_test.copy()
naive_predictions.labels = (
    RandomForestModel
    .predict(dataset_test.features)
    .reshape(-1, 1)
)

# Compare predictions against the original test split and report the metrics
metric_library = UnifiedMetricLibrary(
    dataset_test,
    naive_predictions,
    unprivileged_groups=unprivileged,
    privileged_groups=privileged,
)
metrics_array = getFairnessMetrics(metric_library)
print(metrics_array)

(['acc', 'eq_opp_diff', 'avg_odds_diff', 'stat_parity_diff', 'disperate_impact'], [0.6564019448946515, 0.2119565217391305, 0.22402624342136457, 0.26645123384253827, 0.6314506298252742])


In [9]:
# Random forest constrained with max_depth=10 (100 trees)
RandomForestModel = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
    max_depth=10,
)
RandomForestModel.fit(X, y_true)

# Copy the test split and replace its labels with the model's predictions,
# reshaped into a single column.
predictions = dataset_test.copy()
predictions.labels = (
    RandomForestModel
    .predict(dataset_test.features)
    .reshape(-1, 1)
)

# Compare predictions against the original test split and report the metrics
metric_library = UnifiedMetricLibrary(
    dataset_test,
    predictions,
    unprivileged_groups=unprivileged,
    privileged_groups=privileged,
)
metrics_array = getFairnessMetrics(metric_library)
print(metrics_array)

(['acc', 'eq_opp_diff', 'avg_odds_diff', 'stat_parity_diff', 'disperate_impact'], [0.68827660723933, 0.2120456165359943, 0.324875043818706, 0.35367482106612547, 0.5973548190939495])


In [10]:
# Random forest chosen via 10-fold cross-validation on the training data.
# Relies on cross_validate having been imported by an earlier cell.
RandomForestModel = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)
cv_results = cross_validate(
    RandomForestModel,
    X,
    y_true,
    cv=10,
    return_estimator=True,
)

# Keep the estimator fitted on the fold with the highest test score
# (the first such fold when several tie).
cv_model = cv_results['estimator'][
    list(cv_results['test_score']).index(max(cv_results['test_score']))
]

# Copy the test split and replace its labels with the selected model's
# predictions, reshaped into a single column.
cv_predictions = dataset_test.copy()
cv_predictions.labels = (
    cv_model
    .predict(dataset_test.features)
    .reshape(-1, 1)
)

# Compare predictions against the original test split and report the metrics
metric_library = UnifiedMetricLibrary(
    dataset_test,
    cv_predictions,
    unprivileged_groups=unprivileged,
    privileged_groups=privileged,
)
metrics_array = getFairnessMetrics(metric_library)
print(metrics_array)

(['acc', 'eq_opp_diff', 'avg_odds_diff', 'stat_parity_diff', 'disperate_impact'], [0.6542409508373852, 0.22220242337847473, 0.22948997941006619, 0.26925542142933445, 0.6310203484116528])


## With Fairkit

Here we use Fairkit to search for the fairest model. The data-generation part of this section is also available in the "compasGridSearch.py" script.

In [13]:
# Re-import the fklearn wrappers: an earlier cell rebinds LogisticRegression and
# RandomForestClassifier to the plain scikit-learn classes.
from fklearn.scikit_learn_wrapper import LogisticRegression, KNeighborsClassifier, RandomForestClassifier, SVC
## Re setup data set (sanity check)
dataset = CompasDataset()
# Specific protected group
unprivileged = [{'race': 0, 'sex': 0}]
privileged = [{'race': 1, 'sex': 1}]

## Setup parameters

# Model families to search, keyed by the name used in `hyperparameters` below
models = {'LogisticRegression': LogisticRegression, 'RandomForestClassifier': RandomForestClassifier }

# Metric library class followed by the names of the metrics to record from it
metrics = {'UnifiedMetricLibrary': [UnifiedMetricLibrary,
                                    'accuracy_score',
                                    'average_odds_difference',
                                    'statistical_parity_difference',
                                    'equal_opportunity_difference',
                                    'disparate_impact'
                                   ]
          }

# Candidate hyperparameter values per model family.
# 'warn' was removed from the n_estimators grid: it was a private placeholder in
# old scikit-learn releases (it resolved to 10 trees, which the grid already
# covers) and is not a valid value in current releases.
# NOTE(review): assumes the fklearn wrapper forwards n_estimators unchanged to
# scikit-learn - confirm against the wrapper.
hyperparameters = {'LogisticRegression':{'penalty': ['l1', 'l2'], 'C': [0.1, 0.5, 1, 1.5],'solver':['liblinear']},
                   'RandomForestClassifier':{'n_estimators': [10, 20, 30, 40, 50, 100]
                  }}

# Five thresholds: 0.0, 0.1, 0.2, 0.3, 0.4
thresholds = [i * 10.0/100 for i in range(5)]

# Group definitions shared by the processors that require them
processor_args = {'unprivileged_groups': unprivileged, 'privileged_groups': privileged}

preprocessors=[DisparateImpactRemover()]
postprocessors=[CalibratedEqOddsPostprocessing(**processor_args)]




In [None]:
## Run the model search with the configuration defined above
Search = ModelSearch(models, metrics, hyperparameters, thresholds)
Search.grid_search(
    dataset,
    privileged=privileged,
    unprivileged=unprivileged,
    preprocessors=preprocessors,
    postprocessors=postprocessors,
)

# Write the search results to disk; the plotting code further down reads
# "search_output.csv".
Search.to_csv("search_output.csv")

In [18]:
# Import the Bokeh pieces needed to embed an application in the notebook
from bokeh.io import output_notebook
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application

# Configure Bokeh to render its output inline in notebook cells
output_notebook()

In [19]:
from fklearn.interface.plot import *
import os, fklearn

# Define function that takes in a document and attaches the bokeh server to it
def modify_doc(doc):
    """Build the search-results view and attach it to a Bokeh document.

    Args:
        doc: Bokeh document to populate; the assembled layout is added to it
            with ``doc.add_root``.

    The view consists of a stylesheet link to the notebook CSS shipped inside
    the installed ``fklearn`` package, followed by the plot created from
    "search_output.csv" and the packaged explanations file.
    """
    # Local import: only needed to build the stylesheet link below
    from html import escape

    static_dir = os.path.join(fklearn.__path__[0], 'interface', 'static')

    # Load custom styles (for notebook only). The path is escaped and quoted so
    # that install locations containing spaces or quote characters still yield
    # a well-formed href attribute.
    style = os.path.join(static_dir, 'css', 'styles-notebook.css')
    custom_css = Div(
        text="<link rel='stylesheet' type='text/css' href='" + escape(style, quote=True) + "'>"
    )

    explanations = os.path.join(static_dir, 'data', 'explanations.json')

    # Construct our viewport: stylesheet row first, then the plot
    root_layout = layout([
        [custom_css],
        create_plot("search_output.csv", explanations)
    ], sizing_mode="fixed", css_classes=["layout-container"])

    doc.add_root(root_layout)
    
# Set up the application: wrap modify_doc in a handler so it can populate the
# document created for this application.
handler = FunctionHandler(modify_doc)
app = Application(handler)

# Render visualization in the notebook
# NOTE(review): `show` is not imported explicitly in this notebook; presumably it
# arrives via `from fklearn.interface.plot import *` - confirm.
show(app)