In [274]:
import pandas as pd
import numpy as np
from datetime import datetime

In [275]:
# Raise the notebook display limits so the wide survey frames render in full.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

# pandas defaults this option to "warn"; None silences the chained-assignment
# warning that would otherwise appear when columns are dropped further down.
pd.options.mode.chained_assignment = None

In [276]:
# Read the four yearly ASR public-use files (Stata .dta), oldest first.
df_16, df_17, df_18, df_19 = (
    pd.read_stata(dta_path)
    for dta_path in (
        "../Data/raw/2016/2016-ASR_Public_Use_File.dta",
        "../Data/raw/2017/2017 ASR_Public_Use_File.dta",
        "../Data/raw/2018/2018 ASR_Public_Use_File.dta",
        "../Data/raw/2019/2019 ASR_Public_Use_File.dta",
    )
)

# Give every frame a `name` label (read back later when the column coverage
# table is built) and record the survey year it came from as a column.
for frame, label, year in (
    (df_16, "df_16", 2016),
    (df_17, "df_17", 2017),
    (df_18, "df_18", 2018),
    (df_19, "df_19", 2019),
):
    frame.name = label
    frame["survey_year"] = year

  df_16["survey_year"] = 2016
  df_18["survey_year"] = 2018
  df_19["survey_year"] = 2019


In [277]:
# Column labels of each yearly frame, captured while the weight columns are
# still present (hence the `_ww` suffix).
col_16_ww, col_17_ww, col_18_ww, col_19_ww = (
    frame.columns.values for frame in (df_16, df_17, df_18, df_19)
)

# Union of the labels across all four years, built pairwise.
cols_16_17_ww = np.union1d(col_16_ww, col_17_ww)
cols_18_19_ww = np.union1d(col_18_ww, col_19_ww)
all_cols_ww = np.union1d(cols_16_17_ww, cols_18_19_ww)

To begin, we reduce the sample to one respondent per household. The following cells perform this filtering and remove the person-level weights along with the redundant household-level weights. This follows the example on pages 33-37 of the 2019 ASR User Guide (the `2019 ASR User Guide_no_appendices` PDF), which states the following:

"For household-level analysis, you need to filter the data file so that you have one observation per household. The easiest way to do this is to select only observations where the value of the “respondent” variable is equal to 1.

After selecting the 1,506 observations where the respondent variable equals 1, you would use the weight variable “Weight_household” or the weight variable “Weight_household_pop” to get household-level estimates. These two household-level weight variables will produce the same estimates. However, when using the “Weight_household” variable the frequency counts will sum to the ASR sample size of 1,506 and when using the “Weight_household_pop” variable the frequency counts will sum to the population of 118,403.

The data file also includes 23 replicate weights for each of the four survey weights on the data file (“Weight_person,” “Weight_person_pop,” “Weight_household,” “Weight_household_pop”). Replicate weights were created for each replicate sample to make it easier to estimate standard errors and confidence intervals which is covered in the section 5 of this user’s guide."

In [278]:
# Every "Weight_*" column is a removal candidate, except the two household
# weights we keep. list.remove raises ValueError if either one is absent.
weight_col = [name for name in all_cols_ww if name.startswith("Weight_")]
for kept_weight in ("Weight_household", "Weight_household_pop"):
    weight_col.remove(kept_weight)

# Not every year carries every weight column, so drop only the ones present.
# The drop is in place so df_16 ... df_19 themselves are modified.
for df_yr in (df_16, df_17, df_18, df_19):
    present_weights = [name for name in weight_col if name in df_yr.columns.values]
    df_yr.drop(present_weights, axis=1, inplace=True)

In [279]:
# Column labels of each yearly frame as they stand at this point in the notebook.
col_16, col_17, col_18, col_19 = (
    frame.columns.values for frame in (df_16, df_17, df_18, df_19)
)

# Union of the labels across all four years, built pairwise.
cols_16_17 = np.union1d(col_16, col_17)
cols_18_19 = np.union1d(col_18, col_19)
all_cols = np.union1d(cols_16_17, cols_18_19)

In [280]:
# Coverage table: one row per column label, one column per survey year.
# A cell is set to 1 when that year's frame has the column and stays NaN
# otherwise, so any NaN marks a column missing from that year.
col_df = pd.DataFrame(index=all_cols, columns=["df_16", "df_17", "df_18", "df_19"])

for df_yr in (df_16, df_17, df_18, df_19):
    year_columns = df_yr.columns.values
    # df_yr.name is the label attached when the file was loaded.
    for col in (name for name in all_cols if name in year_columns):
        col_df.loc[col, df_yr.name] = 1

col_df

Unnamed: 0,df_16,df_17,df_18,df_19
Weight_household,1,1,1,1
Weight_household_pop,1,1,1,1
cohort,1,1,1,1
hhid,1,1,1,1
numppl,1,1,1,1
personid,1,1,1,1
qn10a,1,1,1,1
qn10b,1,1,1,1
qn11a,1,1,1,1
qn11aa,1,1,1,1


In [281]:
# Stack the four survey years into one frame. ignore_index is left at its
# default, so each row keeps the label it had in its own yearly file.
df_all_yr = pd.concat([df_16, df_17, df_18, df_19])

# Convert all string values to lowercase; non-string cells pass through unchanged.
df_all_yr = df_all_yr.map(lambda s: s.lower() if isinstance(s, str) else s)

# Filter data to only be 1 respondent per house. Take an explicit copy:
# later cells drop and overwrite columns of rsp_df, and doing that on a
# filtered slice of df_all_yr is what triggers pandas' chained-assignment warnings.
rsp_df = df_all_yr[df_all_yr["respondent"] == "respondent"].copy()

print(len(rsp_df))

rsp_df.sample(n=25, random_state=42)

6035


Unnamed: 0,hhid,qn1a,numppl,qn1b,qn1c,qn1d,qn1f,qn1g,qn1h,qn1i,qn1jyear,qn1k,qn1l,qn2a,qn2b,qn3a,qn3b,qn4a,qn4b,qn4c,qn4e,qn4j,qn5a,qn5b,qn5c,qn6a,qn6b,qn7,qn8a,qn8b,qn9,qn10a,qn10b,qn11a,qn11aa,qn12,qn13,qn18a,qn18b,qn18c,qn18d01,qn18dmnth,qn18dyear,qn18e,qn19b,qn20,qn24a,qn24b,qn25a,qn25b,qn25c,qn25d,qn26b,qn26d,qn26e,qn26estate,qn26f,qn26h,qn27a,qn27b01,qn27bmnth,qn27byear,qn27c,qn28a,qn28b,qn29b,qn29c,qn29c_months,cohort,ui_agect_arrival,qn30a,qn30d,qn31a,qn31d,qn31e,qn31f,qn31f_months,qn32a,qn32d,qn32e,qn33a,qn33d,qn33e,qn33f,qn33f_months,qn34a,qn34d,qn34e,qn34f,qn34f_months,qn35a,qn38a,qn38b,qn38c,ui_soi_pubassist,ui_soi,Weight_household,Weight_household_pop,personid,respondent,qn17_01,qn17_02,qn17_03,qn17_04,qn17_05,qn17_06,qn17_07,qn17_08,qn17_97,qn26ha_01,qn26ha_02,qn26ha_03,qn26ha_04,qn26ha_05,qn26ha_06,qn26ha_07,qn26ha_08,qn26ha_97,qn29a_01,qn29a_02,qn29a_03,qn29a_04,qn29a_05,qn29a_06,qn29a_07,qn29a_08,qn29a_09,qn29a_10,qn29a_11,qn29a_12,qn29a_97,qn29d_01,qn29d_02,qn29d_03,qn29d_04,qn29d_97,qn30b_01,qn30b_02,qn30b_03,qn30b_04,qn30b_05,qn31b_01,qn31b_02,qn31b_03,qn31b_04,qn31b_05,qn32b_01,qn32b_02,qn32b_03,qn32b_04,qn32b_05,qn33b_01,qn33b_02,qn33b_03,qn33b_04,qn33b_05,qn34b_01,qn34b_02,qn34b_03,qn34b_04,qn34b_05,ui_qn8a_annual,ui_qn10a_annual,ui_cashassist,ui_lfp,ui_emprate,ui_medicaidrma,ui_lpr,ui_school,ui_work,survey_year
2520,4533.0,(record respondent name),2.0,self,widowed,53.0,female,iraq,iraq,arab,2014.0,south,,14.0,technical school certification,civil servant (civilian in local or national g...,(record type of work),not well,well,yes,no,,no,,,,,,,,,,,never worked in the u.s.,,,no,,,,,,,,,,no,,no,,,,66.0,yes,,,better living situation/opportunity (cost of l...,yes,yes,(record month),june,2015.0,,yes,no,private physician,yes - covered in all months,,2014 to 2015,40 to 54 years,yes,12.0,yes,6.0,no,number of months,6.0,no,,,yes,12.0,yes,number of months,48.0,no,,,no months,,no,rented for cash rent,700.0,yes,receives public assistance,"receives public assistance, but earnings missing",1.2652,99.4704,45331.0,respondent,option not selected,option not selected,option not selected,child care or family responsibilities,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,volunteer your time,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,other government source,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,medicaid or refugee medical assistance,option not selected,option not selected,respondent,household member #2,option not selected,option not selected,option not selected,respondent,household member #2,option not selected,option not selected,option not selected,,,,,,option not selected,household member #2,option not selected,option not selected,option not selected,,,,,,,,receives cash assistance,not in labor force,not in labor force,individual receives rma/medicaid,already adjusted lpr status,none,not working now and never worked in us,2019
707,10000325.0,(record respondent name),1.0,self,never married,25.0,male,iraq,iraq,arab,2014.0,south,,14.0,university degree (other than medical),student,other,well,very well,no,no,,yes,no,,38.0,,13.0,,,,,,,,,,52.0,40.0,don't know,(record month),january,2014.0,no,hospitality/entertainment,"employee of a private company, business, or in...",yes,15.0,no,,,,36.0,yes,,,reunification with relatives,not applicable,yes,refused,,2014.0,,no,no,health clinic,not covered in any month,,2013 to 2014,18 to 24 years,no,,no,,,no months,,no,,,no,,,no months,,no,,,no months,,no,owned by you or someone in this household with...,1000.0,no,doesn't receive public assistance,"doesn't receive public assistance, but earning...",0.8917,82.8388,100003251.0,respondent,,,,,,,,,,,,,,,,,,,option not selected,self or household members,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,does not receive cash assistance,in labor force,employed,individual does not receive rma/medicaid,already adjusted lpr status,none,working now,2018
1880,3295.0,(record respondent name),5.0,self,divorced,36.0,female,bhutan,none,other,2017.0,north east,,10.0,secondary (or high school diploma),not employed,,not well,well,yes,no,,yes,no,,40.0,,12.0,,,,,,,,,,52.0,40.0,don't know,(record month),january,2018.0,yes,"personal services (laundry, barber, home care,...",don't know,no,,no,,,,28.0,yes,,,did not move to another state/it's the first s...,yes,yes,(record month),march,2019.0,,no,no,private physician,yes - covered in all months,,2018,25 to 39 years,yes,12.0,no,,,number of months,4.0,no,,,yes,12.0,yes,every month,,no,,,don't know,,no,rented for cash rent,1400.0,no,receives public assistance,"receives public assistance, but earnings missing",0.1763,13.861,32951.0,respondent,,,,,,,,,,option not selected,volunteer your time,help with homework,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,insurance through own employment,option not selected,option not selected,option not selected,insurance through own or family member's emplo...,option not selected,option not selected,option not selected,option not selected,option not selected,household member #2,household member #3,option not selected,option not selected,,,,,,,,,,,option not selected,household member #2,household member #3,option not selected,option not selected,,,,,,,,receives cash assistance,in labor force,employed,individual does not receive rma/medicaid,already adjusted lpr status,none,working now,2019
614,10000296.0,(record respondent name),4.0,self,never married,30.0,male,somalia,somalia,other,2015.0,northeast,,0.0,none,self-employed,laborer,not well,not well,no,no,,yes,no,,40.0,,16.0,,,,,,,,,,48.0,40.0,2200.0,(record month),april,2016.0,yes,manufacturing/production/factory,"employee of a private company, business, or in...",no,,no,,,,5.0,yes,,,did not move to another state/it's the first s...,not applicable,yes,(record month),february,2017.0,,no,no,emergency room at a hospital,not covered in any month,,2014 to 2015,25 to 39 years,no,,no,,,no months,,no,,,no,,,no months,,no,,,number of months,6.0,no,rented for cash rent,900.0,don't know,doesn't receive public assistance,receives earnings,1.5699,151.9108,100002961.0,respondent,,,,,,,,,,,,,,,,,,,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,insurance through family member's employment,option not selected,option not selected,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,does not receive cash assistance,in labor force,employed,individual does not receive rma/medicaid,already adjusted lpr status,none,working now,2017
2343,10001072.0,(record respondent name),2.0,self,widowed,56.0,female,iraq,iraq,arab,2016.0,south,,17.0,secondary (or high school diploma),not employed,,not well,well,yes,yes,no,no,,,,,,,,,,,never worked in the u.s.,,,no,,,,,,,,,,no,,no,,,,3.0,yes,,,reunification with relatives,not applicable,yes,(record month),january,2017.0,,yes,yes,health clinic,yes - covered in all months,,2017,40 to 54 years,yes,12.0,no,,,no months,,don't know,,,no,,,no months,,no,,,no months,,yes,rented for cash rent,1125.0,don't know,receives public assistance,"receives public assistance, but earnings missing",0.2303,21.3942,100010721.0,respondent,option not selected,option not selected,poor health or handicap,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,,,,,,,,,,option not selected,self or household members,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,insurance through own or family member's emplo...,option not selected,option not selected,option not selected,option not selected,respondent,option not selected,option not selected,option not selected,option not selected,,,,,,,,,,,,,,,,,,,,,,,does not receive cash assistance,not in labor force,not in labor force,individual does not receive rma/medicaid,already adjusted lpr status,none,not working now and never worked in past,2018
3516,6499.0,(record respondent name),5.0,self,never married,30.0,female,democratic republic of the congo,democratic republic of the congo,other,2018.0,north east,,don't know,none,employee in private sector,(record type of work),not at all,not well,no,yes,no,no,,,,,,,,,,,never worked in the u.s.,,,yes,,,,,,,,,,no,,no,,,,17.0,yes,,,refugee/asylum seeker (not further specified),yes,no,,,,yes,no,no,health clinic,yes - covered in all months,,2018,25 to 39 years,yes,12.0,yes,12.0,yes,don't know,,no,,,no,,,don't know,,no,,,don't know,,no,occupied without payment of cash rent,,no,receives public assistance,"receives public assistance, but earnings missing",0.1426,11.2084,64991.0,respondent,,,,,,,,,,option not selected,volunteer your time,help with homework,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,medicaid,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,don't know,don't know,don't know,don't know,don't know,respondent,household member #2,household member #3,household member #4,household member #5,respondent,option not selected,option not selected,option not selected,option not selected,,,,,,,,,,,,,,,,,,receives cash assistance,in labor force,unemployed,don't know and/or refused,plans to adjust lpr status in future,none,not working now and never worked in us,2019
1954,3396.0,(record respondent name),5.0,self,now married (note: spouse need not live in hou...,41.0,male,democratic republic of the congo,democratic republic of the congo,other,2018.0,midwest,,12.0,primary,self-employed,(record type of work),not at all,not well,yes,no,,yes,no,,40.0,,17.0,,,,,,,,,,52.0,40.0,don't know,(record month),march,2018.0,yes,other,"employee of a private company, business, or in...",yes,4.0,no,,,,11.0,yes,,,better living situation/opportunity (cost of l...,yes,yes,(record month),february,2019.0,,no,no,health clinic,yes - covered in all months,,2018,25 to 39 years,no,,no,,,number of months,0.0,no,,,no,,,number of months,0.0,no,,,number of months,0.0,no,rented for cash rent,729.0,yes,receives public assistance,"receives public assistance, but earnings missing",0.1775,13.9539,33961.0,respondent,,,,,,,,,,attend parent-teacher meetings,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,medicaid,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,medicaid or refugee medical assistance,option not selected,option not selected,,,,,,,,,,,,,,,,,,,,,,,,,,,,does not receive cash assistance,in labor force,employed,individual receives rma/medicaid,already adjusted lpr status,none,working now,2019
3188,99900340.0,(record respondent name),5.0,self,now married (note: spouse need not live in hou...,48.0,male,burma,other,chin,2016 or later,west,,5.0,none,employed (unspecified if private or government),laborer,not at all,not at all,no,no,,yes,no,,40.0,,13.05,,,,,,,,,,52.0,40.0,21000.0,(record month),may,2016.0,was not receiving cash assistance at that time,other (record industry),"employee of a private company, business, or in...",no,,no,,,,11.0,yes,,,was sent by immigration/refugee office/government,yes,yes,don't know,,,yes,no,no,health clinic,not covered in any month,,2016,40 to 54 years,yes,12.0,no,,,don't know,,no,,,yes,12.0,yes,every month,,don't know,,,no months,,no,rented for cash rent,780.0,yes,receives public assistance,receives both,0.7207,69.7413,999003401.0,respondent,,,,,,,,,,option not selected,volunteer your time,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,no medical expenses,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,,,,,,respondent,option not selected,option not selected,option not selected,option not selected,,,,,,,,,,,refused,refused,refused,refused,refused,,,,,,,,receives cash assistance,in labor force,employed,individual does not receive rma/medicaid,already adjusted lpr status,none,working now,2017
2330,4047.0,(record respondent name),1.0,self,never married,23.0,male,democratic republic of the congo,democratic republic of the congo,other,2018.0,south,,12.0,primary,student,(record type of work),not at all,not well,yes,no,,yes,no,,refused,,refused,refused,,,,,,,,,don't know,refused,don't know,(record month),july,2018.0,yes,refused,"employee of a private company, business, or in...",no,,no,,,,18.0,yes,,,other,not applicable,no,,,,yes,no,no,health clinic,not covered in any month,,2018,18 to 24 years,no,,no,,,number of months,0.0,don't know,,,no,,,number of months,0.0,no,,,number of months,0.0,no,rented for cash rent,refused,no,doesn't receive public assistance,"doesn't receive public assistance, but earning...",0.295,23.1917,40471.0,respondent,,,,,,,,,,,,,,,,,,,no medical expenses,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,refused,,does not receive cash assistance,in labor force,employed,individual does not receive rma/medicaid,plans to adjust lpr status in future,none,working now,2019
4383,99902298.0,(record head of household name),1.0,self,never married,32.0,female,bhutan,none,bhutanese,2015 or later,south,,10.0,primary,employee in private sector,"service worker (social worker, hairdresser, ho...",not well,well,yes,no,,yes,no,,40.0,,9.5,,,,,,,,,,50.0,40.0,don't know,(record month),june,2015.0,no,retail/wholesale trade/warehousing,"employee of a private company, business, or in...",no,,no,,,,24.0,yes,,,did not move to another state/it's the first s...,not applicable,yes,(record month),january,2017.0,,no,no,emergency room at a hospital,not covered in any month,,2015,25 to 39 years,no,,no,,,no months,,no,,,no,,,no months,,no,,,no months,,no,owned by you or someone in this household with...,1479.0,don't know,doesn't receive public assistance,"doesn't receive public assistance, but earning...",0.611,57.106,999022981.0,respondent,,,,,,,,,,,,,,,,,,,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,option not selected,other insurance,option not selected,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,does not receive cash assistance,in labor force,employed,individual does not receive rma/medicaid,already adjusted lpr status,none,working now,2016


In [282]:
# # Some `personid` values occur more than once, but on closer inspection the repeats don't appear to be the same people
# c = rsp_df['personid'].value_counts()
# repeats = set()
# for i, v in c.items():
#     if v > 1:
#         repeats.add(i)

# rsp_df[rsp_df['personid'] == 100006711.0]

In [283]:
# Columns that shouldn't be any help given our granularity
misc_drop = [
    "hhid",
    "personid",
    "respondent",
    "qn1a",
    "qn3b",
    "qn26e",
    "qn1l",
    "qn5c",
    "qn8b",
    "qn10b",
]

# Per-member recipient flags: in qnNNb_XX the _XX suffix refers to whether
# household member XX was a recipient (e.g. qn30b_03 is whether household
# member 3 received the qn30 benefit). Generates qn30b_01 ... qn34b_05 in
# the same order as listing them by hand.
non_rsp_drop = [
    f"qn{question}b_{member:02d}"
    for question in range(30, 35)
    for member in range(1, 6)
]

# Rebind instead of dropping with inplace=True so the cell does not depend on
# mutating a filtered slice in place. A KeyError is still raised if any of
# the listed columns is missing.
rsp_df = rsp_df.drop(columns=misc_drop + non_rsp_drop)

# Show only the columns with more than 1000 missing values; these are the
# ones handled explicitly in the imputation cell.
null_counts = rsp_df.isnull().sum()
print(null_counts[null_counts > 1000])

qn4j               4238
qn5b               1880
qn6a               1880
qn6b               5723
qn7                1880
qn8a               5547
qn9                5723
qn10a              5968
qn11a              4160
qn11aa             5209
qn12               5200
qn13               4160
qn18a              1054
qn18b              1054
qn18c              1054
qn18d01            1054
qn18dmnth          1953
qn18dyear          1366
qn18e              1054
qn19b              1054
qn20               1054
qn24b              5131
qn25b              5228
qn25c              5406
qn25d              5406
qn26estate         5794
qn27b01            1136
qn27bmnth          3407
qn27byear          2001
qn27c              4049
qn29c_months       5550
qn30d              2580
qn31d              5676
qn31e              5676
qn31f_months       4406
qn32d              5835
qn32e              5835
qn33d              4933
qn33e              4933
qn33f_months       5258
qn34d              5919
qn34e           

In [284]:
# Tabulate how often each qn20 answer occurs among the household respondents.
rsp_df["qn20"].value_counts()

qn20
employee of a private company, business, or individual    3591
don't know                                                 599
self-employed                                              265
state government employee                                  126
federal government employee                                123
other                                                       96
local government employee                                   71
none/not working                                            51
refused                                                     47
working without pay in family business                      12
Name: count, dtype: int64

In [285]:
# Collects the names of columns flagged for dropping (appended to further down).
to_drop = []

# Answer entry changed by year: recode the raw "98.0" entry to "don't know".
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form is deprecated and does not update
# rsp_df when pandas Copy-on-Write is enabled.
rsp_df["qn1k"] = rsp_df["qn1k"].replace("98.0", "don't know")
# NOTE(review): the original followed this with a second replace of "98.0" by
# "refused", which could never match once "98.0" had already been recoded.
# A different raw code was presumably intended for "refused" - TODO confirm
# against the codebook and add it here.

# Where qn4j is blank, fall back to the answer given for qn4e.
rsp_df["qn4j"] = rsp_df.apply(
    lambda row: row["qn4e"] if pd.isna(row["qn4j"]) else row["qn4j"], axis=1
)

# Question is if they worked more than 1 job in last week, blanks indicate
# they haven't worked any jobs in last week.
# (Results are assigned back rather than using inplace=True on a selected
# column, which is deprecated and a no-op under pandas Copy-on-Write.)
rsp_df["qn5b"] = rsp_df["qn5b"].fillna("no")

# Text answers that cannot be turned into a number; they are blanked so that
# pd.to_numeric can parse the column.
non_answers = ["don't know", "refused"]

# These columns need to be numeric for the calculations below, and a blank is
# recorded as 0:
#   qn6a - multiplied with qn7 below to fill in qn8a
#   qn6b - hours worked at second job; blank means no second job was reported
#   qn9  - second-job figure multiplied with qn6b below; blank means no second job
# NOTE(review): the original comment described qn9 as hours worked at the
# second job, the same wording used for qn6b; since it is multiplied with
# qn6b the way qn7 is with qn6a, confirm its meaning against the questionnaire.
for zero_filled_col in ("qn6a", "qn6b", "qn9"):
    rsp_df[zero_filled_col] = pd.to_numeric(
        rsp_df[zero_filled_col].fillna(0.0).replace(non_answers, "")
    )

# qn7 also needs to be numeric, but its blanks are left missing: filling 0
# would indicate they worked for free and could throw off the calculations.
rsp_df["qn7"] = pd.to_numeric(rsp_df["qn7"].replace(non_answers, ""))

# Where qn8a is blank, derive it from the primary-job columns (qn6a * qn7).
rsp_df["qn8a"] = rsp_df.apply(
    lambda row: row["qn6a"] * row["qn7"] if pd.isna(row["qn8a"]) else row["qn8a"],
    axis=1,
)

# Where qn10a is blank, derive it from the second-job columns (qn6b * qn9).
rsp_df["qn10a"] = rsp_df.apply(
    lambda row: row["qn6b"] * row["qn9"] if pd.isna(row["qn10a"]) else row["qn10a"],
    axis=1,
)

# Fills are assigned back to the column; calling fillna(..., inplace=True) on
# a selected column is deprecated and does nothing under pandas Copy-on-Write.

# Question is skipped if they indicated they worked in the past week. This
# fill imputes the answer as if the question had been asked.
rsp_df["qn11a"] = rsp_df["qn11a"].fillna("yes")

# Only empty if they had worked in the past week. Imputing 0 as if the
# question had been asked.
rsp_df["qn11aa"] = rsp_df["qn11aa"].fillna(0)

# Changing the definition from "were you not working due to temp absence or
# layoff from job" to "were you on temp absence or layoff".
rsp_df["qn12"] = rsp_df["qn12"].fillna("no, was not temporarily absent or on layoff")

# Question is if they've looked for work in past 4 weeks. Blanks are filled
# with "employed" since those respondents worked in the past week. The column
# is converted to a categorical and the new label is registered as a category
# before it is used as the fill value.
rsp_df["qn13"] = (
    rsp_df["qn13"].astype("category").cat.add_categories("employed").fillna("employed")
)

# Work-history questions that must be numeric; a blank indicates they haven't
# worked and is recorded as 0:
#   qn18a - how many weeks worked
#   qn18b - how many hours usually worked
#   qn18c - income before taxes
# "don't know" / "refused" are blanked so pd.to_numeric can parse the column.
# Results are assigned back instead of using inplace=True on a selected
# column (deprecated; a no-op under pandas Copy-on-Write).
for work_history_col in ("qn18a", "qn18b", "qn18c"):
    rsp_df[work_history_col] = pd.to_numeric(
        rsp_df[work_history_col].fillna(0.0).replace(["don't know", "refused"], "")
    )

# Adding to drop list as it doesn't contain any information
to_drop.append("qn18d01")

# # Date of first job, commented out for now
# rsp_df['qn18dyear'].replace("2013 or earlier", 2013.0, inplace=True)
# rsp_df['qn18dyear'].replace("2018 or later", 2018.0, inplace=True)
# rsp_df['qn18d'] = rsp_df["qn18dmnth"].astype(str) + " " + rsp_df['qn18dyear'].astype(str)

# Label to record where each of these questions was left blank:
#   qn20  - blank if they are not working
#   qn24b - blank if they received no job training in the past 12 months,
#           i.e. they have 0 training hours
#   qn25b - blank indicates they are not in school
#   qn25c - blank indicates they are not pursuing a degree
#   qn25d - whether they received a degree; blank indicates they are not in
#           school (slight concern: it might instead mean they have no degree)
blank_labels = {
    "qn20": "none/not working",
    "qn24b": 0,
    "qn25b": "not in school",
    "qn25c": "not pursuing degree",
    "qn25d": "not in school",
}
for blank_col, blank_label in blank_labels.items():
    rsp_df[blank_col] = rsp_df[blank_col].fillna(blank_label)

# These multi-select questions aren't asked of every respondent (per the
# original note, they are skipped when the subject is looking for a job), so
# blanks are filled with an explicit "NA" label.
# In general, we might be able to decode each group into 1 column before
# editing the blanks if we want.
option_suffixes = ["01", "02", "03", "04", "05", "06", "07", "08", "97"]
na_filled_cols = (
    [f"qn17_{suffix}" for suffix in option_suffixes]
    + [f"qn26ha_{suffix}" for suffix in option_suffixes]
    # qn29d only has the options 01-04 and 97.
    + [f"qn29d_{suffix}" for suffix in ("01", "02", "03", "04", "97")]
)

for na_col in na_filled_cols:
    # Convert to a categorical and register "NA" as a category before filling.
    # The result is assigned back to the column; fillna(..., inplace=True) on
    # a selected column is deprecated and a no-op under pandas Copy-on-Write.
    rsp_df[na_col] = rsp_df[na_col].astype("category").cat.add_categories("NA").fillna("NA")

# qn26estate is blank for some respondents (per the original note, this is
# tied to their answer about living in their current state a year ago).
# Not perfect, but fall back to the initial state/region recorded in qn1k.
rsp_df["qn26estate"] = rsp_df.apply(
    lambda row: row["qn1k"] if pd.isna(row["qn26estate"]) else row["qn26estate"], axis=1
)

# Fills and replacements below are assigned back to the column; the chained
# inplace=True form is deprecated and a no-op under pandas Copy-on-Write.

# They haven't applied for citizenship, filling with NA
for citizenship_date_col in ("qn27bmnth", "qn27byear"):
    rsp_df[citizenship_date_col] = rsp_df[citizenship_date_col].fillna("NA")

# Question is on if subject plans to apply for citizenship; blanks indicate
# they already have, so record "yes".
rsp_df["qn27c"] = rsp_df["qn27c"].fillna("yes")

# Replacing for consistency: one wording of this answer carries a range hint.
rsp_df["qn29c"] = rsp_df["qn29c"].replace(
    "no - number of months not covered (range: 02-11)",
    "no - number of months not covered",
)

# Imputing qn29c_months from the answer to the previous question (qn29c).
# Only blank values are touched, and only when qn29c is one of these answers.
months_implied_by_qn29c = {
    "yes - covered in all months": 12.0,
    "not covered in any month": 0.0,
    "not covered 1 month or less": 1.0,
}
rsp_df["qn29c_months"] = rsp_df.apply(
    lambda row: months_implied_by_qn29c.get(row["qn29c"], row["qn29c_months"])
    if pd.isna(row["qn29c_months"])
    else row["qn29c_months"],
    axis=1,
)

# Question is about total months on food stamps, empties indicate they never were.
rsp_df["qn30d"].fillna(value=0.0, inplace=True)
rsp_df["qn30d"].replace("don't know", "", inplace=True)
rsp_df["qn30d"].replace("refused", "", inplace=True)
rsp_df["qn30d"].replace("less than one month", 0.0, inplace=True)
rsp_df["qn30d"] = pd.to_numeric(rsp_df["qn30d"])


# Question is about total months on TANF, empties indicate they never were.
rsp_df["qn31d"].fillna(value=0.0, inplace=True)
rsp_df["qn31d"].replace("don't know", "", inplace=True)
rsp_df["qn31d"].replace("refused", "", inplace=True)
rsp_df["qn31d"] = pd.to_numeric(rsp_df["qn31d"])

# Question is if they used TANF in last month, empties indicate they never were on TANF
rsp_df["qn31e"].fillna(value="no", inplace=True)

# Answers are empty if subject answered they had never received TANF, or have always received TANF
## Not sure what to fill in the case the subject answered 'every month' to if they receive TANF
rsp_df["qn31f_months"] = rsp_df.apply(
    lambda row: 0.0
    if (pd.isna(row["qn31f_months"]) and row["qn31f"] == "no months")
    else row["qn31f_months"],
    axis=1,
)

# Question is about total months on RCA, empties indicate they never were.
rsp_df["qn32d"].fillna(value=0.0, inplace=True)
rsp_df["qn32d"].replace("don't know", "", inplace=True)
rsp_df["qn32d"].replace("refused", "", inplace=True)
rsp_df["qn32d"] = pd.to_numeric(rsp_df["qn32d"])

# Question is if they used RCA in last month, empties indicate they never were on TANF
rsp_df["qn32e"].fillna(value="no", inplace=True)

# Question is about total months on SSI, empties indicate they never were.
# "don't know" / "refused" are mapped to empty strings, which pd.to_numeric parses
# as NaN, so those answers stay missing instead of being counted as 0 months.
# Results are assigned back to the frame: chained `inplace=True` calls on
# `rsp_df[col]` mutate an intermediate Series and silently do nothing under
# pandas Copy-on-Write.
rsp_df["qn33d"] = pd.to_numeric(
    rsp_df["qn33d"].fillna(0.0).replace("don't know", "").replace("refused", "")
)

# Question is if they used SSI in last month, empties indicate they never were on SSI
rsp_df["qn33e"] = rsp_df["qn33e"].fillna("no")

# Answers are empty if the subject answered they had never received SSI, or have
# always received SSI.
## Not sure what to fill in when the subject answered 'every month' for SSI;
## those rows are left untouched here.
def _qn33f_months_or_zero(row):
    """Return 0.0 when qn33f_months is missing and qn33f is "no months"; else keep the value."""
    months = row["qn33f_months"]
    if pd.isna(months) and row["qn33f"] == "no months":
        return 0.0
    return months


rsp_df["qn33f_months"] = rsp_df.apply(_qn33f_months_or_zero, axis=1)

# Question is about total months on GA, empties indicate they never were.
# "less than one month" is coded as half a month; "don't know" / "refused" are
# mapped to empty strings, which pd.to_numeric parses as NaN, so those answers
# stay missing instead of being counted as 0 months.
# NOTE(review): qn30d codes "less than one month" as 0.0 rather than 0.5;
# confirm which value is intended.
# Results are assigned back to the frame: chained `inplace=True` calls on
# `rsp_df[col]` mutate an intermediate Series and silently do nothing under
# pandas Copy-on-Write.
rsp_df["qn34d"] = pd.to_numeric(
    rsp_df["qn34d"]
    .fillna(0.0)
    .replace("less than one month", 0.5)
    .replace("don't know", "")
    .replace("refused", "")
)

# Question is if they used GA in last month, empties indicate they never were on GA
rsp_df["qn34e"] = rsp_df["qn34e"].fillna("no")

# Answers are empty if the subject answered they had never received GA, or have
# always received GA.
## Not sure what to fill in when the subject answered 'every month' for GA;
## those rows are left untouched here.
def _qn34f_months_or_zero(row):
    """Return 0.0 when qn34f_months is missing and qn34f is "no months"; else keep the value."""
    months = row["qn34f_months"]
    if pd.isna(months) and row["qn34f"] == "no months":
        return 0.0
    return months


rsp_df["qn34f_months"] = rsp_df.apply(_qn34f_months_or_zero, axis=1)

# Align ui_qn8a_annual with the fills applied to qn8a: where the annual figure is
# missing, derive it as qn8a * 52 (presumably weekly -> annual; confirm the unit).
# If qn8a is missing too the product is NaN, and the trailing fillna turns it into 0.0.
# The zero-fill is chained onto the computed Series and assigned once, instead of
# calling `rsp_df[col].fillna(..., inplace=True)`, which mutates an intermediate
# Series and silently does nothing under pandas Copy-on-Write.
rsp_df["ui_qn8a_annual"] = rsp_df.apply(
    lambda row: row["qn8a"] * 52
    if pd.isna(row["ui_qn8a_annual"])
    else row["ui_qn8a_annual"],
    axis=1,
).fillna(0.0)

# Align ui_qn10a_annual with the fills applied to qn10a: a missing annual figure
# is replaced by qn10a * 52. Unlike ui_qn8a_annual, no zero-fill follows, so rows
# where qn10a is also missing stay NaN.
def _qn10a_annual_or_estimate(row):
    """Return the recorded ui_qn10a_annual, or qn10a * 52 when it is missing."""
    annual = row["ui_qn10a_annual"]
    return row["qn10a"] * 52 if pd.isna(annual) else annual


rsp_df["ui_qn10a_annual"] = rsp_df.apply(_qn10a_annual_or_estimate, axis=1)

# Remove the columns collected in `to_drop` earlier in the notebook.
rsp_df.drop(columns=to_drop, inplace=True)

In [286]:
# Open questions on the columns that still have many missing values:
# qn7          - unclear how to fill empties; a 0 would imply they worked for free, which could throw off calcs
# qn8a         - relies on qn7; once qn7 is decided, qn8a can be filled better
# qn18c        - the empties do not come from people who answered "Don't know" or 'refused'
# qn18dmnth    - blank if they never worked, or don't know when they started
# qn18dyear    - blank if they never worked, or don't know when they started
# qn18e        - blank if they weren't working; filling with "no" would imply they were working
#                and it didn't disqualify them from CA
# qn31f_months - blank if they answered they received this assistance every month, don't know, or refused
# qn33f_months - blank if they answered they received this assistance every month, don't know, or refused

# Report only the columns with more than 100 missing values.
null_counts = rsp_df.isna().sum()
print(null_counts.loc[null_counts > 100])

qn6a             150
qn7             2368
qn8a            1963
qn18a            830
qn18b            307
qn18c           2112
qn18dmnth       1953
qn18dyear       1366
qn18e           1054
qn19b           1054
qn27b01         1136
qn29c_months     291
qn30d            178
qn31f_months    1107
qn33f_months    1202
qn34f_months     990
qn38b            116
ui_school        185
dtype: int64


In [287]:
# Attempting to fill in missing data for remaining columns

# If the person hasn't worked a job recently (qn5a == "no"), both the wage (qn7)
# and the earnings (qn8a) are set to 0; every other row keeps its value.
for _col in ("qn7", "qn8a"):
    rsp_df[_col] = rsp_df.apply(
        # `_col=_col` binds the current column name at lambda-definition time.
        lambda row, _col=_col: 0 if row["qn5a"] == "no" else row[_col],
        axis=1,
    )

def _zero_if_never_worked(row, column):
    """Return 0 for respondents who never worked in the U.S., else the row's `column` value.

    "Never worked" means qn5a == "no" and qn11a == "never worked in the u.s.".
    """
    never_worked = (
        row["qn5a"] == "no" and row["qn11a"] == "never worked in the u.s."
    )
    return 0 if never_worked else row[column]


# Weeks worked (qn18a), hours worked (qn18b) and earnings (qn18c) are all set to 0
# if the person hasn't worked a job in the US.
for _col in ("qn18a", "qn18b", "qn18c"):
    rsp_df[_col] = rsp_df.apply(_zero_if_never_worked, axis=1, args=(_col,))

def _label_if_never_worked(row, column):
    """Return "never worked in the u.s." for such respondents, else the row's `column` value.

    "Never worked" means qn5a == "no" and qn11a == "never worked in the u.s.".
    """
    never_worked = (
        row["qn5a"] == "no" and row["qn11a"] == "never worked in the u.s."
    )
    return "never worked in the u.s." if never_worked else row[column]


# Year (qn18dyear), income / cash assistance (qn18e) and industry (qn19b) are set
# to "never worked in the u.s." if the person hasn't worked a job in the US.
for _col in ("qn18dyear", "qn18e", "qn19b"):
    rsp_df[_col] = rsp_df.apply(_label_if_never_worked, axis=1, args=(_col,))

def _year_to_label(value):
    """Render a year cell as text without a trailing ".0", keeping missing values missing.

    Floats such as 2015.0 become "2015"; strings (the special categories) pass
    through unchanged unless they end in ".0". NaN/None is returned as-is rather
    than being turned into the literal string "nan", so the null counts and the
    final dropna further down the notebook still see those rows as missing.
    """
    if pd.isna(value):
        return value
    text = str(value)
    # Strip only a trailing ".0"; a blanket replace would also remove ".0" found
    # in the middle of a value.
    return text[:-2] if text.endswith(".0") else text


# Remove decimals from qn1jyear and qn27byear (mixed type with float for year and
# str for special categories).
for _year_col in ("qn1jyear", "qn27byear"):
    rsp_df[_year_col] = rsp_df[_year_col].map(_year_to_label)

In [290]:
# Removing columns in a single pass; errors="ignore" keeps this safe to re-run
# after the columns are already gone.
rsp_df.drop(
    columns=[
        # Large amount of missing data that can't be logically filled
        "qn18c",
        # 18a and 18d_month: this level of granularity is not necessary
        "qn18a",
        "qn18dmnth",
        # Administrative column which is not needed
        "qn27b01",
        # qn31f / qn33f / qn34f month counts: unnecessary granularity
        "qn31f_months",
        "qn33f_months",
        "qn34f_months",
    ],
    inplace=True,
    errors="ignore",
)

In [291]:
# Re-check missing values after the fills and drops; list columns with more than 10.
null_counts = rsp_df.isna().sum()
print(null_counts.loc[null_counts > 10])

qn6a            150
qn6b             16
qn7             494
qn8a             89
qn9              67
qn18b           307
qn18dyear       326
qn18e            14
qn19b            14
qn29c_months    291
qn30d           178
qn31d            28
qn32d            30
qn33d            38
qn34d            17
qn38b           116
ui_school       185
dtype: int64


In [292]:
# Drop data from non-respondents / errors: any row that still has at least one
# missing value is removed.
rsp_df.dropna(axis=0, how="any", inplace=True)

# Number of rows remaining.
rsp_df.shape[0]

4425

In [294]:
# Write the cleaned respondent-level data as CSV without the index column.
# Note: the target path has no ".csv" extension.
rsp_df.to_csv(path_or_buf="../Data/processed/dataset_2016-19", index=False)