In [3]:
import time
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import datasets, metrics, model_selection, svm


# from notebooks/ directory
import sys, os

# ADDITIONS:
from src.RAI import DataBiasChecker, ModelBiasRanker, ModelBiasHandler


In [4]:
%%time
model_input_path = './data/raw/usa_00004.csv.gz'
ipums_select = pd.read_csv(model_input_path, compression='gzip', header=0, sep=',', quotechar='"', error_bad_lines=False,
                       usecols = ['SEX','EDUC','MARST','AGE',
                                  'YEAR','MET2013',
                                 'RACE','INCTOT','HISPAN'])


CPU times: user 3min 30s, sys: 45.7 s, total: 4min 16s
Wall time: 4min 26s


In [5]:
ipums_select.YEAR.max(), ipums_select.YEAR.min()

(2020, 1850)

In [6]:
ipums_select = ipums_select[ipums_select.YEAR == 2019]
ipums_select = ipums_select.drop(['YEAR'], axis = 1)

In [7]:
ipums_select['MARST'] = (ipums_select['MARST'].isin([1,2])).astype('int')
filt = ipums_select['MARST'] == 1
ipums_select.loc[filt, 'MARST'] = 'married'
ipums_select.loc[~filt, 'MARST'] = 'not_married'

In [8]:
ipums_select['EDUC'] = (ipums_select['EDUC'].isin([10,11])).astype('int')
filt = ipums_select['EDUC'] == 1
ipums_select.loc[filt, 'EDUC'] = 'college'
ipums_select.loc[~filt, 'EDUC'] = 'no_college'

In [9]:
ipums_select['SEX'] = (ipums_select['SEX'].isin([1])).astype('int')
filt = ipums_select['SEX'] == 1
ipums_select.loc[filt, 'SEX'] = 'Male'
ipums_select.loc[~filt, 'SEX'] = 'Female'

In [23]:
ipums_select

Unnamed: 0,MET2013,SEX,AGE,MARST,RACE,HISPAN,EDUC,INCTOT
103777570,0.0,Male,39,not_married,2,0,no_college,9000.0
103777571,0.0,Female,21,not_married,1,0,no_college,150.0
103777572,11500.0,Male,19,not_married,2,0,no_college,1400.0
103777573,11500.0,Male,77,not_married,1,0,no_college,22700.0
103777574,33660.0,Male,41,not_married,2,0,no_college,0.0
...,...,...,...,...,...,...,...,...
107017118,0.0,Female,63,not_married,1,0,no_college,117000.0
107017119,0.0,Male,45,not_married,1,0,no_college,14000.0
107017120,0.0,Female,85,not_married,1,0,no_college,18500.0
107017121,0.0,Female,67,married,1,0,college,49900.0


In [24]:
ipums_to_predict = ipums_select.sample(random_state=4, n=100000)
# ipums_to_predict.drop(['RACE','HISPAN'])
ipums_to_predict.drop(['HISPAN'], axis=1)
ipums_to_predict.to_json('./data/input/ipums_standard_to_predict.json')

In [310]:
ipums_select.to_csv('./data/raw/ipums_processed_2019.csv', index=False)

In [25]:
ipums_to_predict

Unnamed: 0,MET2013,SEX,AGE,MARST,RACE,HISPAN,EDUC,INCTOT
105571849,0.0,Female,82,married,1,0,no_college,8400.0
105267896,0.0,Female,39,married,1,0,no_college,40000.0
106631470,19100.0,Male,55,not_married,1,0,no_college,30000.0
104829240,14010.0,Female,39,not_married,1,0,no_college,62000.0
105695179,35620.0,Male,61,married,1,0,college,490000.0
...,...,...,...,...,...,...,...,...
104562872,45300.0,Male,3,not_married,1,1,no_college,9999999.0
106397270,0.0,Female,74,not_married,1,0,college,38000.0
106073854,0.0,Male,72,married,1,0,no_college,56600.0
104024187,31080.0,Male,86,married,1,0,college,38000.0
