# FaX AI with SHAP and AIF360 Data

This example presents how we can use the FaX AI, [AIF360](https://github.com/Trusted-AI/AIF360), and [SHAP](https://github.com/slundberg/shap) libraries together for marrying fairness with explainability.  

We use the COMPAS dataset (using the AIF360 loading methods) and how to train and measure our methods, the methods implemented in the AIF360 library, and sklearn supervised learning methods.

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import tensorflow.compat.v1 as tf
# tf.disable_eager_execution()
# tf.logging.set_verbosity(tf.logging.ERROR)

from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

# from aif360.sklearn.preprocessing import ReweighingMeta
from aif360.sklearn.inprocessing import AdversarialDebiasing, ExponentiatedGradientReduction, GridSearchReduction
from aif360.sklearn.postprocessing import CalibratedEqualizedOdds, PostProcessingMeta
from aif360.sklearn.datasets import fetch_adult, fetch_compas, fetch_german
from aif360.sklearn.metrics import disparate_impact_ratio, average_odds_error, generalized_fpr
from aif360.sklearn.metrics import generalized_fnr, difference, statistical_parity_difference,equal_opportunity_difference
import matplotlib.pyplot as plt

import shap
import FaX_methods
import aif360_utils

### Training MIM from FaX AI

In [2]:
data = pd.read_csv("X_vect.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,aa,ab,abandon,abandoned,abandoning,abandonment,abbott,abc,aberration,...,zimmerman,zionism,zionist,zionists,zip,zombie,zombies,zone,zones,zuma
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
len(data)

66708

In [4]:
train_df = pd.read_csv("train_df.csv")

In [5]:
len(train_df)

66708

In [6]:
data['prot_word_count'] = train_df['prot_word_count']

In [7]:
data['target'] = train_df['target']

In [8]:
# the number of non toxic comments was more than 10 times the number of toxic comments, we have subsampled it to the following distribution
print("Number of non toxic comments: "+str(len(data[data["target"] < 0.5])))
print("Number of toxic comments: "+str(len(data[data["target"] >= 0.5])))

Number of non toxic comments: 41566
Number of toxic comments: 25142


In [23]:
sample = data.sample(66000)

In [24]:
# the number of non toxic comments was more than 10 times the number of toxic comments, we have subsampled it to the following distribution
print("Number of non toxic comments: "+str(len(sample[sample["target"] < 0.5])))
print("Number of toxic comments: "+str(len(sample[sample["target"] >= 0.5])))

Number of non toxic comments: 41113
Number of toxic comments: 24887


In [25]:
y = sample['target'].apply(lambda x: x > .5).astype('int')

In [26]:
y

12758    1
30168    1
9135     0
17387    1
59336    0
        ..
23898    1
23884    1
5812     1
7947     0
41514    1
Name: target, Length: 66000, dtype: int64

In [27]:
sample.columns

Index(['Unnamed: 0', 'aa', 'ab', 'abandon', 'abandoned', 'abandoning',
       'abandonment', 'abbott', 'abc', 'aberration',
       ...
       'zionism', 'zionist', 'zionists', 'zip', 'zombie', 'zombies', 'zone',
       'zones', 'zuma', 'prot_word_count'],
      dtype='object', length=13757)

In [28]:
sample.drop(['Unnamed: 0', 'target'], axis = 1, inplace = True)

In [29]:
sample.columns

Index(['aa', 'ab', 'abandon', 'abandoned', 'abandoning', 'abandonment',
       'abbott', 'abc', 'aberration', 'abetted',
       ...
       'zionism', 'zionist', 'zionists', 'zip', 'zombie', 'zombies', 'zone',
       'zones', 'zuma', 'prot_word_count'],
      dtype='object', length=13755)

In [30]:
prot_attr = ['gay', 'black', 'islam', 'christian', 'white', 'muslim', 'homosexual', 'chinese', 'bisexual', 'jew', 'catholic', 'transgender', 'latino', 'hindu']

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(sample, y, test_size = 0.3, stratify = y)
print(len(X_train))
print(len(X_test))

46200
19800


In [32]:
X_train.columns

Index(['aa', 'ab', 'abandon', 'abandoned', 'abandoning', 'abandonment',
       'abbott', 'abc', 'aberration', 'abetted',
       ...
       'zionism', 'zionist', 'zionists', 'zip', 'zombie', 'zombies', 'zone',
       'zones', 'zuma', 'prot_word_count'],
      dtype='object', length=13755)

In [33]:
X_train_dropped = X_train.drop(["prot_word_count"], axis = 1)
X_test_dropped = X_test.drop(["prot_word_count"], axis = 1)
mim_aif = aif360_utils.FaXAIF(X_train_dropped, y_train, prot_attr, model_type = 'MIM')

#predict the test data using the model
# pred_prob = mim_aif.predict_proba(X_test_dropped)
# print(pred_prob)
pred = mim_aif.predict(X_test_dropped)
print(pred)

[0 0 0 ... 1 0 0]


In [34]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, pred))

0.7876262626262627


In [35]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, max_iter = 400).fit(X_train_dropped, y_train)
pred_baseline = clf.predict(X_test_dropped)
print(accuracy_score(y_test, pred_baseline))

0.7898484848484848


In [36]:
X_test["y_test"] = y_test
X_test["y_preds"] = pred
X_test["y_preds_baseline"] = pred_baseline

not_toxic = X_test[X_test["y_test"] == 0]
not_toxic_having_prot_attr = not_toxic[not_toxic["prot_word_count"] == 1]
print("Fair Model - % of non toxic comments in test having prot_attr and have been wrongly classified as toxic: "+str(len(not_toxic_having_prot_attr[not_toxic_having_prot_attr["y_preds"] == 1])/len(not_toxic_having_prot_attr)*100))
print("Baseline Model - % of non toxic comments in test having prot_attr and have been wrongly classified as toxic: "+str(len(not_toxic_having_prot_attr[not_toxic_having_prot_attr["y_preds_baseline"] == 1])/len(not_toxic_having_prot_attr)*100))

Fair Model - % of non toxic comments in test having prot_attr and have been wrongly classified as toxic: 8.307908941563014
Baseline Model - % of non toxic comments in test having prot_attr and have been wrongly classified as toxic: 9.72776343581319


In [123]:
# Finding MDE

In [None]:
XZ = np.hstack((mim_aif.X, mim_aif.Z))
mde_mim = FaX_methods.MDE_ind(XZ)
inf_mim = mde_mim.influence(mim_aif.model, mim_aif.X)
cols = list(X_train_compas.columns)[1:] +[list(X_train_compas.columns)[0]] 
res = {cols[i]: inf_mim[i] for i in range(len(inf_mim))}
print(res)

In [None]:
for i in prot_attr:
    print(i)
    print(res[i])