In [1]:
import time
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import datasets, metrics, model_selection, svm


# from notebooks/ directory
import sys, os

# ADDITIONS:
from src.RAI import DataBiasChecker, ModelBiasRanker, ModelBiasHandler


In [55]:
%%time
model_input_path = './data/raw/usa_IPUMS_2022.csv.gz'
ipums_select = pd.read_csv(model_input_path, compression='gzip', header=0, sep=',', quotechar='"', error_bad_lines=False,
                       usecols = ['INCTOT','MET2013','SEX','AGE',
                                  'RACE','HISPAN','EDUC', 'MARST','YEAR'])


CPU times: user 38.6 s, sys: 13.9 s, total: 52.6 s
Wall time: 52.6 s


In [56]:
# Number of rows per RACE code in the full extract.
ipums_select['RACE'].value_counts()

1    12235684
2     1582218
6      614521
7      593293
8      431413
4      216873
3      174676
9       58519
5       40427
Name: RACE, dtype: int64

In [57]:
# Collapse MARST into two string labels: codes 1 and 2 become 'married', every other
# code becomes 'not_married'.
# The labels are produced in a single step instead of first casting the column to int
# and then writing strings into it; that upcasting assignment is deprecated in recent
# pandas releases.
# NOTE: not idempotent - re-running this cell on the already-labelled column matches
# no code and labels every row 'not_married'. Reload the data before re-running.
filt = ipums_select['MARST'].isin([1, 2])
ipums_select['MARST'] = np.where(filt, 'married', 'not_married')

In [58]:
# Collapse EDUC into two string labels: codes 10 and 11 become 'college', every other
# code becomes 'no_college'.
# The labels are produced in a single step instead of first casting the column to int
# and then writing strings into it; that upcasting assignment is deprecated in recent
# pandas releases.
# NOTE: not idempotent - re-running this cell on the already-labelled column matches
# no code and labels every row 'no_college'. Reload the data before re-running.
filt = ipums_select['EDUC'].isin([10, 11])
ipums_select['EDUC'] = np.where(filt, 'college', 'no_college')

In [59]:
# Collapse SEX into two string labels: code 1 becomes 'Male', every other code becomes
# 'Female'.
# The labels are produced in a single step instead of first casting the column to int
# and then writing strings into it; that upcasting assignment is deprecated in recent
# pandas releases.
# NOTE: not idempotent - re-running this cell on the already-labelled column matches
# no code and labels every row 'Female'. Reload the data before re-running.
filt = ipums_select['SEX'].isin([1])
ipums_select['SEX'] = np.where(filt, 'Male', 'Female')

In [60]:
# ipums_select.drop(['YEAR','SERIAL','CBSERIAL','NUMPREC','SUBSAMP','HHWT','HHTYPE','MET2013','MORTGAGE','RENT','HISPAN','INCTOT'],axis=1, inplace=True)

In [61]:
# Work on a copy whose demographic columns carry lower-case, descriptive names;
# the remaining columns (YEAR, MET2013, HISPAN, INCTOT) keep their IPUMS names.
ip = ipums_select.copy().rename(
    columns={
        "SEX": 'sex',
        "AGE": 'age',
        "RACE": 'race',
        "MARST": 'marital-status',
        "EDUC": 'education',
    }
)
ip

Unnamed: 0,YEAR,MET2013,sex,age,marital-status,race,HISPAN,education,INCTOT
0,2015,33660,Male,56,married,2,0,no_college,3450
1,2015,33660,Female,61,married,2,0,no_college,14000
2,2015,33660,Female,8,not_married,2,0,no_college,9999999
3,2015,33660,Male,61,not_married,2,0,no_college,35000
4,2015,0,Female,52,not_married,8,0,no_college,8800
...,...,...,...,...,...,...,...,...,...
15947619,2019,0,Female,63,not_married,1,0,no_college,117000
15947620,2019,0,Male,45,not_married,1,0,no_college,14000
15947621,2019,0,Female,85,not_married,1,0,no_college,18500
15947622,2019,0,Female,67,married,1,0,college,49900


In [62]:
# Take the leading rows of each race group, in file order.
# The caps are written as explicit row counts: 50001 for race code 2, 16797 for code 1,
# and 16601 each for codes {4, 5, 6} combined and for code 3. These are the same rows
# an inclusive label slice `.loc[:k]` on a freshly reset index would return (k + 1 rows).
black = ip[ip['race'] == 2].reset_index(drop=True).head(50001)
white = ip[ip['race'] == 1].reset_index(drop=True).head(16797)
asian = ip[ip['race'].isin([4, 5, 6])].reset_index(drop=True).head(16601)
american_indian = ip[ip['race'] == 3].reset_index(drop=True).head(16601)

In [63]:
# Stack the four per-group samples row-wise. Each part keeps its own 0-based index,
# so index labels repeat across groups in the combined frame.
# (An earlier draft also derived a 0/1 `black` column here from `race == 2`; it is
# not created.)
merge = pd.concat((black, white, asian, american_indian), axis='index')

In [64]:
# Shuffle all rows so the groups are interleaved. The fixed seed makes the row order
# (and therefore the CSV written from this frame) reproducible across kernel restarts.
merge = merge.sample(frac=1, random_state=42)

In [65]:
# Rows per race code in the combined sample.
merge['race'].value_counts()

2    50001
1    16797
3    16601
6    11438
4     4240
5      923
Name: race, dtype: int64

In [66]:
# Display the shuffled, combined sample (the index labels are the per-group row numbers).
merge

Unnamed: 0,YEAR,MET2013,sex,age,marital-status,race,HISPAN,education,INCTOT
11463,2015,35300,Female,32,not_married,3,2,no_college,11100
37144,2015,19740,Male,35,married,2,0,no_college,40000
10579,2015,0,Female,38,married,3,0,no_college,28000
2342,2015,0,Male,19,not_married,1,0,no_college,0
9585,2015,0,Female,77,not_married,1,0,no_college,12200
...,...,...,...,...,...,...,...,...,...
14984,2015,19820,Male,63,married,3,4,college,8000
12362,2015,38060,Male,19,not_married,2,0,no_college,1000
24353,2015,37100,Female,18,not_married,2,0,no_college,0
5509,2015,0,Female,73,not_married,1,0,no_college,12300


In [68]:
# Persist the combined sample; the (non-unique) index is not written.
merge.to_csv(path_or_buf='./data/raw/ipums_standard_black.csv', index=False)

In [117]:
# joblib is first needed here; pandas and sklearn.preprocessing were already imported
# in the first cell, so those two lines are repeats.
import joblib
import pandas as pd
from sklearn import preprocessing
# Load the stored proxy-model artefact.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code -
# only load artefacts that come from a trusted source.
model = joblib.load('./data/artefacts/proxy_model_black_income_ipums_standard/model')

In [118]:
# Show the feature names stored on the loaded model; the input frame passed to
# predict_proba has to provide these columns.
model.feature_name_

['education_college',
 'education_no_college',
 'marital-status_married',
 'marital-status_not_married',
 'sex_Female',
 'sex_Male',
 'age']

In [219]:
# Load the already-encoded Adult dataset (target columns plus model features) and show it.
adult = pd.read_csv(
    filepath_or_buffer='./data/intermediate/proxy_model_race_income_adult_standard/dataset.csv'
)
adult

Unnamed: 0,black,rich,education_college,education_no_college,marital-status_married,marital-status_not_married,sex_Female,sex_Male,age
0,0,1,1,0,0,1,0,1,0.301370
1,0,1,1,0,1,0,0,1,0.452055
2,0,1,0,1,0,1,0,1,0.287671
3,1,1,0,1,1,0,0,1,0.493151
4,1,1,1,0,1,0,1,0,0.150685
...,...,...,...,...,...,...,...,...,...
32556,0,1,0,1,1,0,1,0,0.136986
32557,0,0,0,1,1,0,0,1,0.315068
32558,0,1,0,1,0,1,1,0,0.561644
32559,0,1,0,1,0,1,0,1,0.068493


In [212]:
# Load the raw Adult table under its own name. Binding it to `adult` would overwrite
# the encoded dataset loaded earlier, and the following cells need that encoded frame:
# they drop ['black', 'rich'] from `adult`, read `adult['black']`, and feed the
# remaining one-hot columns to the model. With the old rebinding the notebook only
# worked when cells were executed out of order.
adult_raw = pd.read_csv('./data/raw/adult_standard.csv')
X = adult_raw[['education','marital-status','sex','age']]

In [210]:
# X.to_json('./data/input/adult.json')

In [221]:
# Model input: every column of the encoded dataset except the two target columns.
X = adult.drop(columns=['black', 'rich'])

In [222]:
# Keep only the second probability column (index 1) of the model output for each row.
pred_proba = model.predict_proba(X)[:, 1]

In [223]:
# Inspect the per-row probabilities.
pred_proba 

array([0.52038146, 0.35541449, 0.60388763, ..., 0.62741719, 0.57579467,
       0.39116335])

In [224]:
# Range of the predicted probabilities (smallest, largest).
min(pred_proba), max(pred_proba)

(0.18129779796403822, 0.6550181634413215)

In [225]:
# Share of rows whose `black` value is 1.
pg_rate = adult['black'].value_counts(normalize=True).loc[1]
print(pg_rate)
# Probability cut-off at the (1 - pg_rate) quantile, so that roughly a pg_rate share
# of rows lies at or above it (ties in the scores can shift the exact count).
pg_rate_thresh = np.percentile(pred_proba, (1 - pg_rate) * 100)
print(pg_rate_thresh)

0.0959429992936335
0.6279886306673219


In [230]:
# Number of scored rows.
len(pred_proba)

32561

In [236]:
# 90th percentile of the scores, for comparison with pg_rate_thresh.
np.percentile(pred_proba, 90)

0.6279886306673219

In [248]:
# How many scores are at or above the 90th percentile.
(pred_proba >= np.percentile(pred_proba, 90)).sum()

3514

In [246]:
# How many scores are at or above the pg_rate-derived cut-off.
(pred_proba >= pg_rate_thresh).sum()

2623

In [228]:
# Binary proxy label: 1 where the score is strictly above the cut-off, else 0.
# NOTE(review): this uses a strict `>` while the count cell uses `>=`; confirm which
# comparison is intended if scores can equal the threshold exactly.
adult['black_proxy'] = np.greater(pred_proba, pg_rate_thresh).astype(int)

In [229]:
# Confusion table: actual `black` (rows) against the thresholded proxy label (columns).
crosstab_accuracy = pd.crosstab(index=adult['black'], columns=adult['black_proxy'])
crosstab_accuracy

black_proxy,0,1
black,Unnamed: 1_level_1,Unnamed: 2_level_1
0,27274,2163
1,2664,460


In [159]:
# Load the predictions stored alongside the model artefact, to compare with the
# labels recomputed in this notebook.
pred = pd.read_json('./data/artefacts/proxy_model_black_income_ipums_standard/prediction.json')

In [160]:
# Both tables should have the same number of rows before they are cross-tabulated.
len(pred), len(adult)

(32561, 32561)

In [198]:
# Confusion table: actual `black` (rows) against the stored `prediction` column (columns).
crosstab_accuracy = pd.crosstab(index=adult['black'], columns=pred['prediction'])
crosstab_accuracy

prediction,0,1
black,Unnamed: 1_level_1,Unnamed: 2_level_1
0,27274,2163
1,2664,460


In [146]:
from sklearn.metrics import accuracy_score, jaccard_score, roc_auc_score, roc_curve

In [147]:
# Fraction of rows where the thresholded proxy label equals the actual `black` value.
accuracy_score(y_true=adult['black'], y_pred=adult['black_proxy'])

0.8517551672245939

In [168]:
# Threshold-independent ranking quality of the raw scores against the actual label.
roc_auc_score(y_true=adult['black'], y_score=pred_proba)

0.63820283074203

In [149]:
# Re-inspect the per-row scores.
pred_proba

array([0.52038146, 0.35541449, 0.60388763, ..., 0.62741719, 0.57579467,
       0.39116335])

In [150]:
# Full two-column probability output; `pred_proba` is the second column of this array.
model.predict_proba(X)

array([[0.47961854, 0.52038146],
       [0.64458551, 0.35541449],
       [0.39611237, 0.60388763],
       ...,
       [0.37258281, 0.62741719],
       [0.42420533, 0.57579467],
       [0.60883665, 0.39116335]])