In [1]:
# required modules (skip if already installed)
if False:
    !pip install pyroaring
    !pip install pyfim 
    # if previous does not work, try: !conda install -c conda-forge pyfim
    !pip install lightgbm
    !pip install fairlearn

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# global imports
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# local imports
sys.path.append('../src/') # local path
import dd

In [3]:
# read folktables data
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source
census = pd.read_pickle('../data/ACSIncome.pkl')
states = list(census['STATE'].unique())
# change to True to binarize RAC1P
if False:
    # single .loc assignment instead of chained indexing: the chained form
    # raises SettingWithCopyWarning and has no effect under pandas copy-on-write
    census.loc[census["RAC1P"] != 'White alone', "RAC1P"] = 'Not White alone'

In [4]:
# attributes fed to the predictive models
pred_atts = [
    'AGEP', 'COW', 'SCHL', 'MAR', 'OCCP',
    'POBP', 'WKHP', 'SEX', 'RAC1P', 'STATE',
]
pred_all = [*pred_atts, 'class']
# discretized versions of the attributes (for DD)
disc_atts = [
    'AGEPgroup', 'COW', 'SCHL', 'MAR', 'OCCPgroup',
    'POBPgroup', 'WKHPgroup', 'SEX', 'RAC1P', 'STATE',
]
disc_all = [*disc_atts, 'class']
# encode categorical values; `encoders` is looked up by column name in later cells
df, encoders = dd.encode(census)
df.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,WKHP,SEX,RAC1P,STATE,class,WKHPgroup,AGEPgroup,POBPgroup,OCCPgroup
0,1.0,0.0,23.0,2.0,296.0,10.0,20.0,0.0,4.0,1.0,0.0,1.0,0.0,5.0,22.0
1,36.0,2.0,5.0,2.0,225.0,14.0,39.0,1.0,8.0,1.0,0.0,3.0,3.0,5.0,12.0
2,24.0,0.0,22.0,2.0,421.0,0.0,39.0,1.0,8.0,1.0,0.0,3.0,1.0,5.0,21.0
3,1.0,5.0,23.0,2.0,161.0,0.0,1.0,0.0,8.0,1.0,0.0,0.0,0.0,5.0,8.0
4,4.0,2.0,0.0,2.0,246.0,9.0,49.0,1.0,8.0,1.0,0.0,4.0,0.0,5.0,20.0


In [5]:
# class labels in encoder order: encoded 0 = False (negative), encoded 1 = True (positive)
encoders['class'].classes_

array([False,  True])

In [6]:
# short display names for the long RAC1P labels;
# the i-th short name is paired with the i-th class of the RAC1P encoder
pretty_rac1p = ['Alaska', 'Indian', 'Alaska-Indian', 'Asian', 'Black', 'Hawaiian', 'Other', 'Two+', 'White']
rac1p_2_pretty = {
    encoders['RAC1P'].classes_[pos]: short_name
    for pos, short_name in enumerate(pretty_rac1p)
}
rac1p_2_pretty

{'Alaska Native alone': 'Alaska',
 'American Indian alone': 'Indian',
 'American Indian and Alaska Native tribes specified;or American Indian or Alaska Native,not specified and no other': 'Alaska-Indian',
 'Asian alone': 'Asian',
 'Black or African American alone': 'Black',
 'Native Hawaiian and Other Pacific Islander alone': 'Hawaiian',
 'Some Other Race alone': 'Other',
 'Two or More Races': 'Two+',
 'White alone': 'White'}

In [7]:
# split train/test: 33% of the rows are held out.
# The original (non-encoded) census frame is split in the same call,
# so census_test has the same rows as X_test / y_test.
y = df['class'].astype(int)
X = df[pred_atts]
(X_train, X_test,
 y_train, y_test,
 census_train, census_test) = train_test_split(
    X, y, census,
    test_size=0.33,
    random_state=42,
)

In [8]:
# training model and make predictions - replace with your favorite classifier
import lightgbm as lgb

clf = lgb.LGBMClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Attach the predictions to census_test. `assign` builds a new frame instead of
# writing into the frame returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning and keeps the cell idempotent when re-run.
census_test = census_test.assign(
    # predicted class, decoded back to the original labels
    pred=encoders['class'].inverse_transform(y_pred),
    # predicted score: second column of predict_proba (encoded class 1)
    score=clf.predict_proba(X_test)[:, 1],
)
census_test.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,WKHP,SEX,RAC1P,STATE,class,WKHPgroup,AGEPgroup,POBPgroup,OCCPgroup,pred,score
517717,28,Employee of a private for-profit company orbus...,"Some college, but less than 1 year",Never married or under 15 years old,4700.0,37,50.0,Female,White alone,IL,False,>40,16-30,US,SAL,False,0.246374
319156,56,Federal government employee,"Some college, but less than 1 year",Divorced,4230.0,36,40.0,Male,Black or African American alone,CT,False,31-40,51-60,US,CLN,False,0.270322
507501,23,Employee of a private for-profit company orbus...,Bachelor's degree,Never married or under 15 years old,1430.0,36,40.0,Male,White alone,IL,True,31-40,16-30,US,ENG,False,0.422198
537835,33,Employee of a private for-profit company orbus...,Bachelor's degree,Married,4710.0,17,50.0,Female,White alone,IL,True,>40,31-40,US,SAL,True,0.833683
305930,38,Employee of a private for-profit company orbus...,Doctorate degree,Married,3250.0,34,40.0,Female,White alone,CT,True,31-40,31-40,US,MED,True,0.917323


### Performance metrics: misclassification error

In [9]:
# DD(filename or dataframe, unprotectedItem, predBadItem)
# input: discretized attributes, true class and predicted class of the test rows
disc = dd.DD(
    census_test[disc_all + ['pred']],
    unprotectedItem='class=True',
    predBadItem='pred=False',
)
# confusion matrix of the context returned by ctg_any()
ctg = disc.ctg_any()
disc.print(ctg)
print(f"ERR = {ctg.err():f}")

-----
Context ALL
Size = 549285  Perc = 100.00%
           |pred=False|pred=True|      
class!=True|    302169|    44777|346946
class=True |     53463|   148876|202339
           |    355632|   193653|549285
ERR = 0.178851


### Performance metrics: discovering errors in predictions

In [10]:
# filtering condition: return None to filter out, or measure value
# contingency table ctg such that ctg.n() >= minSupp
def check_err(ctg):
    '''
    Measure of a contingency table: the value of ctg.err().

    Returns None when ctg.n() is not positive (the None is required in cover_n).

         confusion matrix (unprotected=true.good)
         =========== pred.bad === pred.good ===
         true.bad        a            b       n1()
         true.good       c            d       n2()
         ===========    m1()  ===    m2()  ==  n()
    '''
    if ctg.n() > 0:
        return ctg.err()
    return None

In [11]:
# extract (value, contingency table) pairs using check_err as the measure
# NOTE(review): the meaning of a negative minSupp is defined by dd.DD.extract -- confirm there
ctgs_err = disc.extract(
    testCond=check_err,
    minSupp=-100,
    topk=1000,
)

# Top 3
for v, ctg in ctgs_err[:3]:
    disc.print(ctg)
    print(f"ERR = {check_err(ctg):f}")

-----
Context COW=Self-employed in own not incorporated business,professional practice, or farm AND SCHL=Associate's degree AND OCCPgroup=MGR AND WKHPgroup=>40 AND SEX=Male AND MAR=Married AND RAC1P=White alone AND POBPgroup=US
Size = 119  Perc = 0.02%
           |pred=False|pred=True|   
class=False|        34|       25| 59
class=True |        38|       22| 60
           |        72|       47|119
ERR = 0.529412
-----
Context COW=Self-employed in own not incorporated business,professional practice, or farm AND SCHL=Associate's degree AND OCCPgroup=MGR AND WKHPgroup=>40 AND MAR=Married AND RAC1P=White alone AND POBPgroup=US
Size = 137  Perc = 0.02%
           |pred=False|pred=True|   
class=False|        42|       29| 71
class=True |        43|       23| 66
           |        85|       52|137
ERR = 0.525547
-----
Context COW=Self-employed in own not incorporated business,professional practice, or farm AND SCHL=Associate's degree AND OCCPgroup=MGR AND WKHPgroup=>40 AND SEX=Male AND MAR=

In [12]:
# sequential covering algorithm: 200 contingency tables
# (only the tables of the extracted pairs are passed, the measure values are dropped)
(covers, residuals, times,
 uncovered, ctg_cov, ctg_uncov) = disc.cover_n(
    [table for _measure, table in ctgs_err],
    check_err,
    200,
    only_protected=False,
)

In [13]:
# ctg_cov: table returned by cover_n for the covered part
print('== Rejected ==')
disc.print(ctg_cov)
print(f"ERR = {check_err(ctg_cov):f}")

# ctg_uncov: table returned by cover_n for the uncovered part
print('== Selected ==')
disc.print(ctg_uncov)
print(f"ERR = {check_err(ctg_uncov):f}")

# every table of the cover, preceded by its residual
print('\n== Cover CTs ==')
for ctg, res in zip(covers, residuals):
    print(res)
    disc.print(ctg)
    print(f"ERR = {check_err(ctg):f}")

== Rejected ==
-----
Context <extensional>
Size = 20245  Perc = 3.69%
           |pred=False|pred=True|     
class!=True|      5775|     4512|10287
class=True |      4502|     5456| 9958
           |     10277|     9968|20245
ERR = 0.445246
== Selected ==
-----
Context <extensional>
Size = 529040  Perc = 96.31%
           |pred=False|pred=True|      
class!=True|    296394|    40265|336659
class=True |     48961|   143420|192381
           |    345355|   183685|529040
ERR = 0.168656

== Cover CTs ==
119
-----
Context COW=Self-employed in own not incorporated business,professional practice, or farm AND SCHL=Associate's degree AND OCCPgroup=MGR AND WKHPgroup=>40 AND SEX=Male AND MAR=Married AND RAC1P=White alone AND POBPgroup=US
Size = 119  Perc = 0.02%
           |pred=False|pred=True|   
class=False|        34|       25| 59
class=True |        38|       22| 60
           |        72|       47|119
ERR = 0.529412
114
-----
Context OCCPgroup=EDU AND STATE=TX AND AGEPgroup=51-60 AND SCHL=B

### Covering ground truth (experimental)

In [15]:
# filtering condition: return None to filter out, or measure value
# contingency table ctg such that ctg.n() >= minSupp
def check_acc(ctg):
    '''
    Measure of a contingency table: max(ctg.a, ctg.c) divided by ctg.m1().

    Returns None when ctg.m1() is not positive.

         contingency table for inference (unprotected=true.good, n1=a,n2=c)
         ===========
         true.bad      a
         true.good     c
         ===========   n()
    '''
    total = ctg.m1()
    if total > 0:
        return max(ctg.a, ctg.c) / total
    return None

In [16]:
# DD(filename or dataframe, unprotectedItem, predBadItem)
# here only the true class is passed: no prediction column, no predBadItem
disc = dd.DD(
    census_test[disc_all],
    unprotectedItem='class=True',
)
# class distribution of the context returned by ctg_any()
ctg = disc.ctg_any()
disc.print(ctg)
print(f"ACC = {check_acc(ctg):f}")

-----
Context ALL
Size = 549285  Perc = 100.00%
           |      
class!=True|346946
class=True |202339
           |549285
ACC = 0.631632


In [17]:
# extract (value, contingency table) pairs using check_acc as the measure
ctgs_acc = disc.extract(testCond=check_acc, minSupp=-100, topk=1000)

# Top 3
for v, ctg in ctgs_acc[:3]:
    disc.print(ctg)
    # the printed value is check_acc (share of the majority class), not an error
    # rate, so it is labelled ACC like in the cell that prints the overall value
    print("ACC = {:f}".format(check_acc(ctg)))

-----
Context STATE=WV AND WKHPgroup=21-30 AND MAR=Never married or under 15 years old AND RAC1P=White alone AND POBPgroup=US
Size = 114  Perc = 0.02%
           |   
class=False|114
class=True |  0
           |114
ERR = 1.000000
-----
Context STATE=WV AND WKHPgroup=21-30 AND MAR=Never married or under 15 years old AND RAC1P=White alone
Size = 115  Perc = 0.02%
           |   
class=False|115
class=True |  0
           |115
ERR = 1.000000
-----
Context STATE=WV AND WKHPgroup=21-30 AND MAR=Never married or under 15 years old AND POBPgroup=US
Size = 125  Perc = 0.02%
           |   
class=False|125
class=True |  0
           |125
ERR = 1.000000


In [23]:
def check_acc_abs(ctg):
    '''
    Measure used for covering: a dd.Pair built from the majority-class share
    p = max(ctg.a, ctg.c) / ctg.m1() of the table.

    The pair holds (p*20)//20 as first component and p*ctg.m1() as second.
    Returns None when ctg.m1() is not positive.
    '''
    n = ctg.m1()
    if not n > 0:
        return None
    p = max(ctg.a, ctg.c) / n
    # NOTE(review): (p*20)//20 equals floor(p), i.e. 0 for p < 1 and 1 for p == 1.
    # If 0.05-wide bins were intended this would be (p*20)//1/20 -- confirm the intent.
    return dd.Pair((p * 20) // 20, p * n)

# sequential covering algorithm: 200 contingency tables
# (only the tables of the extracted pairs are passed; check_acc_abs is the measure)
(covers, residuals, times,
 uncovered, ctg_cov, ctg_uncov) = disc.cover_n(
    [table for _measure, table in ctgs_acc],
    check_acc_abs,
    200,
    only_protected=False,
)

In [25]:
print('== Rejected ==')
disc.print(ctg_cov)
# Average of the per-table check_acc values, weighted by table size ctg.n().
# `tot_err` keeps its original name, but it accumulates accuracy, not error.
tot_err = tot = 0
# the second loop variable is named `res` rather than `_`, so that IPython's
# last-output variable `_` is not shadowed in the notebook namespace
for ctg, res in zip(covers, residuals):
    tot_err += check_acc(ctg)*ctg.n()
    tot += ctg.n()
if tot > 0:
    # the value is an accuracy (check_acc), so it is labelled ACC
    print("ACC = {:f}".format(tot_err/tot))
else:
    # empty cover: nothing to average, avoid dividing by zero
    print("ACC = n/a (empty cover)")
print('== Selected ==')
disc.print(ctg_uncov)
print('\n== Cover CTs ==')
for ctg, res in zip(covers, residuals):
    # residual first, as in the error-based cover listing and the recorded output
    print(res)
    disc.print(ctg)
    print("ACC = {:f}".format(check_acc(ctg)))

== Rejected ==
-----
Context <extensional>
Size = 14349  Perc = 2.61%
           |     
class!=True|14109
class=True |  240
           |14349
ERR = 1.000000
== Selected ==
-----
Context <extensional>
Size = 534936  Perc = 97.39%
           |      
class!=True|332837
class=True |202099
           |534936

== Cover CTs ==
355
-----
Context STATE=VA AND WKHPgroup=11-20 AND AGEPgroup=16-30 AND RAC1P=White alone AND POBPgroup=US
Size = 355  Perc = 0.06%
           |   
class=False|355
class=True |  0
           |355
ERR = 1.000000
299
-----
Context STATE=KY AND WKHPgroup=11-20 AND AGEPgroup=16-30
Size = 299  Perc = 0.05%
           |   
class=False|299
class=True |  0
           |299
ERR = 1.000000
274
-----
Context STATE=UT AND WKHPgroup=11-20 AND MAR=Never married or under 15 years old AND RAC1P=White alone
Size = 274  Perc = 0.05%
           |   
class=False|274
class=True |  0
           |274
ERR = 1.000000
249
-----
Context STATE=IA AND WKHPgroup=11-20 AND AGEPgroup=16-30
Size = 249  P