In [1]:
import pandas as pd
import numpy as np
from lime.lime_tabular import LimeTabularExplainer
from sklearn.ensemble import RandomForestClassifier

# Summary

Summary of the datasets we plan to test on. Available (view only) at https://www.dropbox.com/sh/mvzemcjw4ftoq99/AADIeOp6oPXNZyt9y12h1F7ia?dl=0

In [2]:
# One-hot encodes the requested feature columns and optionally appends Y (labels) to X.
def clean_data(df, categoricals=None, binary_cats=None, labels=None, drops=None,
               sample_frac=1., random_state=None):
    """Return a cleaned copy of ``df`` with categorical columns one-hot encoded.

    Steps, in order:
      1. Drop the columns listed in ``drops``.
      2. Replace every column in ``binary_cats`` with its dummy columns,
         omitting the first level (``drop_first=True``).
      3. Replace every column in ``categoricals`` with one dummy column per level.
      4. If ``labels`` is given, concatenate it column-wise after the features.
      5. If ``sample_frac < 1``, keep a random fraction of the rows.

    Dummy columns are named ``<column>_<level>`` and are appended at the end of
    the frame in the order the columns are processed. The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to clean.
    categoricals : list of column labels, optional
        Columns to expand into one dummy column per level.
    binary_cats : list of column labels, optional
        Columns to expand with the first level dropped.
    labels : pandas.Series or pandas.DataFrame, optional
        Target column(s) appended after the features; aligned on the index.
    drops : column label or list of column labels, optional
        Columns removed before encoding.
    sample_frac : float, default 1.0
        Fraction of rows to keep; values >= 1 disable sampling.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so a sampled subset can be
        reproduced. The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The cleaned (and possibly sampled) frame.
    """
    # None stands in for "no columns" so that no mutable default is shared between calls.
    drops = [] if drops is None else drops
    binary_cats = [] if binary_cats is None else binary_cats
    categoricals = [] if categoricals is None else categoricals

    df = df.drop(drops, axis=1)
    for col in binary_cats:
        one_hot = pd.get_dummies(df[col], drop_first=True, prefix=col)
        df = df.drop(col, axis=1).join(one_hot)
    for col in categoricals:
        one_hot = pd.get_dummies(df[col], prefix=col)
        df = df.drop(col, axis=1).join(one_hot)
    if labels is not None:
        df = pd.concat([df, labels], axis=1)
    if sample_frac < 1:
        df = df.sample(frac=sample_frac, random_state=random_state)
    return df

# COMPAS

Purpose of dataset: Predict recidivism for individuals

Target: 'two_year_recid'

Sensitive features: ['age','race','sex','age_cat=25 - 45','age_cat=Greater than 45','age_cat=Less than 25']

Source: https://github.com/propublica/compas-analysis

In [3]:
from aif360.datasets import CompasDataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Build the aif360 COMPAS dataset and keep the first element of what
# convert_to_dataframe() returns; that element is the frame displayed below.
compas_converted = CompasDataset().convert_to_dataframe()
compas_df = compas_converted[0]
compas_df



Unnamed: 0,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age_cat=25 - 45,age_cat=Greater than 45,age_cat=Less than 25,...,c_charge_desc=Viol Injunct Domestic Violence,c_charge_desc=Viol Injunction Protect Dom Vi,c_charge_desc=Viol Pretrial Release Dom Viol,c_charge_desc=Viol Prot Injunc Repeat Viol,c_charge_desc=Violation License Restrictions,c_charge_desc=Violation Of Boater Safety Id,c_charge_desc=Violation of Injunction Order/Stalking/Cyberstalking,c_charge_desc=Voyeurism,c_charge_desc=arrest case no charge,two_year_recid
1,0.0,69.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,34.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,24.0,0.0,0.0,0.0,1.0,4.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.0,44.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,41.0,1.0,0.0,0.0,0.0,14.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10996,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10997,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10999,0.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11000,1.0,33.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Bank Dataset

Purpose of dataset: The data relates to direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict whether the client will subscribe to a term deposit (variable y).

Target: 'y'

Sensitive features: ['age', 'marital=married', 'marital=single', 'marital=divorced']

Source: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

In [5]:
from aif360.datasets import BankDataset

In [6]:
# Build the aif360 Bank Marketing dataset and keep the first element of what
# convert_to_dataframe() returns; that element is the frame displayed below.
bank_converted = BankDataset().convert_to_dataframe()
bank_df = bank_converted[0]
bank_df



Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month=sep,day_of_week=fri,day_of_week=mon,day_of_week=thu,day_of_week=tue,day_of_week=wed,poutcome=failure,poutcome=nonexistent,poutcome=success,y
0,1.0,261.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,226.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,151.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,307.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,1.0,139.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,1.0,334.0,1.0,999.0,0.0,-1.1,94.767,-50.8,1.028,4963.6,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
41184,1.0,383.0,1.0,999.0,0.0,-1.1,94.767,-50.8,1.028,4963.6,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
41185,1.0,189.0,2.0,999.0,0.0,-1.1,94.767,-50.8,1.028,4963.6,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
41186,1.0,442.0,1.0,999.0,0.0,-1.1,94.767,-50.8,1.028,4963.6,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


# Folktables, ACS Income

Purpose of dataset: Predict whether an individual's income is above $50k. Note: I dropped the RELP (relationship) variable because its meaning is not documented, and the OCCP (occupation) variable because one-hot encoding it produces so many columns that it led to errors in the math.

Target: 'PINCP', personal income

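Sensitive features (after one hot): 'AGEP', 'SEX', 'MAR' (one-hot encoded below, so it appears as 'MAR_*' columns), 'RAC1P_1.0', 'RAC1P_2.0', 'RAC1P_3.0', 'RAC1P_4.0', 'RAC1P_5.0', 'RAC1P_6.0', 'RAC1P_7.0', 'RAC1P_8.0', 'RAC1P_9.0'. Note that 'RAC1P_4.0' does not appear among the columns of the Michigan output below.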

Other features: WKHP- hours worked per week, COW- class of worker, SCHL- years of schooling, POBP- place of birth

Source: https://github.com/zykls/folktables

In [7]:
from folktables import ACSDataSource, ACSIncome

In [8]:
# 2018 one-year ACS person-level survey.
data_source = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
)

# Only Michigan is requested. download=True presumably lets folktables fetch
# the files when they are not available locally (confirm against its docs).
data = data_source.get_data(
    states=["MI"],
    download=True,
)

# ACSIncome.df_to_pandas returns three values; the third is not used here.
features, labels, _ = ACSIncome.df_to_pandas(data)

In [9]:
# Drop RELP and OCCP, one-hot encode the remaining categorical ACS columns,
# and append the income labels as the last column.
df = clean_data(
    features,
    drops=['RELP', 'OCCP'],
    categoricals=['COW', 'MAR', 'POBP', 'RAC1P'],
    labels=labels,
)
df

Unnamed: 0,AGEP,SCHL,WKHP,SEX,COW_1.0,COW_2.0,COW_3.0,COW_4.0,COW_5.0,COW_6.0,...,POBP_554.0,RAC1P_1.0,RAC1P_2.0,RAC1P_3.0,RAC1P_5.0,RAC1P_6.0,RAC1P_7.0,RAC1P_8.0,RAC1P_9.0,PINCP
0,20.0,19.0,6.0,2.0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,False
1,20.0,18.0,40.0,1.0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,False
2,81.0,23.0,50.0,1.0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,False
3,39.0,22.0,45.0,2.0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,True
4,21.0,19.0,14.0,1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50003,36.0,22.0,24.0,2.0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,True
50004,35.0,22.0,40.0,1.0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,True
50005,40.0,21.0,50.0,1.0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,True
50006,38.0,21.0,40.0,2.0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,True


In [10]:
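# Optional export, left disabled: uncomment to write the cleaned ACS Income frame to CSV.
# NOTE(review): the file name says "sampled", but the clean_data call above uses the
# default sample_frac (no sampling). Confirm the intended file name.
#df.to_csv('ACSIncome_MI_2018_sampled.csv',index=False)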

# Student

Purpose of dataset: Predict student achievement in Portuguese schools. Only the math grades are used here.

Target: 'G3', final grade after the term

Sensitive features (after one hot): 'sex_M', 'Pstatus_T', 'address_U', 'Dalc', 'Walc', 'health'

Source: https://archive.ics.uci.edu/ml/datasets/student+performance

In [11]:
# The student math file is semicolon-delimited, hence the explicit separator.
student_math_csv = 'data/student/student-mat.csv'
df_student = pd.read_csv(student_math_csv, sep=';')
df_student

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


In [12]:
# Columns expanded into one indicator column per level.
student_categoricals = ['Mjob', 'Fjob', 'guardian']
# Columns expanded with the first level dropped (passed as binary_cats).
student_binary_cats = [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup',
    'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic',
]
# Columns removed before encoding.
student_drops = ['reason', 'G1', 'G2']

# NOTE: this rebinds df_student to the cleaned frame, so the cell cannot be
# re-run on its own (the dropped columns are gone); re-run the read_csv cell first.
df_student = clean_data(
    df_student,
    categoricals=student_categoricals,
    binary_cats=student_binary_cats,
    drops=student_drops,
)

In [13]:
# Display the student frame as it stands after the clean_data call
# (one-hot encoded columns appended, 'reason', 'G1' and 'G2' dropped).
df_student

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,Mjob_services,Mjob_teacher,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,guardian_father,guardian_mother,guardian_other
0,18,4,4,2,2,0,4,3,4,1,...,0,0,0,0,0,0,1,0,1,0
1,17,1,1,1,2,0,5,3,3,1,...,0,0,0,0,1,0,0,1,0,0
2,15,1,1,1,2,3,4,3,2,2,...,0,0,0,0,1,0,0,0,1,0
3,15,4,2,1,3,0,3,2,2,1,...,0,0,0,0,0,1,0,0,1,0
4,16,3,3,1,2,0,4,3,2,1,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,20,2,2,1,2,2,5,5,4,4,...,1,0,0,0,0,1,0,0,0,1
391,17,3,1,2,1,0,2,4,5,3,...,1,0,0,0,0,1,0,0,1,0
392,21,1,1,1,1,3,5,5,3,3,...,0,0,0,0,1,0,0,0,0,1
393,18,3,2,3,1,0,4,4,1,3,...,1,0,0,0,1,0,0,0,1,0


In [14]:
# Optional export, left disabled: uncomment to write the cleaned student frame to CSV.
#df_student.to_csv('student_cleaned.csv',index=False)