## Merge the Clean Filtered Household and Clean Individual CSV's

In [1]:
import numpy as np
import pandas as pd

### Importing / Manipulating Household Data

In [2]:
df_household = pd.read_csv('faps_household_clean.csv', index_col = 0)
df_household.head()

Unnamed: 0,hhnum,initintrvmon,startmon,initfinaldays,initialdate_flag,startdate_edit,startlag,matchconsenthh,nonmetro,region,...,feedback4_1,feedback4_2,feedback4_3,feedback4_4,feedback4_5,feedback4_6,feedback4_7,feedback4_8,binadltfscat,rrindicator
0,100012,1,1,8,0,0,1,1,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,31
1,100015,8,8,8,0,0,1,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,30
2,100024,6,6,9,0,0,0,1,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,20
3,100026,7,7,10,0,0,1,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,31
4,100028,5,5,8,0,0,1,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,10


#### Calculating the Percent Poverty column for each household

In [3]:
df_household['percent_poverty'] = (df_household['inchhavg_r'] / df_household['povthresh_hh']) * 100

In [4]:
# Keeping the columns of interest for modeling
df_filtered = df_household.filter(['hhnum','adltfscat','binadltfscat','fincondition','region','rural','rrindicator','targetgroup',
    'inchhavg_r', 'snapnowhh','povthresh_hh','elig_units1','percent_poverty'])
print(df_household.shape)
df_filtered.head()

(4817, 282)


Unnamed: 0,hhnum,adltfscat,binadltfscat,fincondition,region,rural,rrindicator,targetgroup,inchhavg_r,snapnowhh,povthresh_hh,elig_units1,percent_poverty
0,100012,2,0,4,3,1,31,4,4667.33,1,2340.583333,0,199.408837
1,100015,2,0,3,3,0,30,2,1200.0,0,995.416667,1,120.552532
2,100024,2,0,3,2,0,20,3,5024.5,0,1318.75,0,381.004739
3,100026,3,1,3,3,1,31,2,1800.0,0,1281.166667,0,140.496943
4,100028,4,1,4,1,0,10,4,3998.0,1,3101.416667,1,128.908832


### Importing / Manipulating Individual Data

In [6]:
df_individual = pd.read_csv('faps_individual_clean.csv')
# Making columns lowercase to match household column names
df_individual.columns = [x.lower() for x in df_individual.columns]
print(df_individual.shape)
df_individual.head()

(4825, 19)


Unnamed: 0,hhnum,percent_female,pr_sex,avg_age_r,pr_age_r,hispanic,mode_racecat_r,alt_pnum,prop_white,prop_black,prop_other,prop_narace,educcat,pr_educcat,ncimmigrant_any,ncimmigrant_every,nchildren,nadults,disabled
0,100012,0.4,0,29.1,47.5,0.0,1,5,1.0,0.0,0.0,0.0,1,2,0,0,2,3,0
1,100015,0.0,0,62.5,62.5,0.0,2,1,0.0,1.0,0.0,0.0,2,2,0,0,0,1,0
2,100024,0.5,1,32.0,47.5,0.0,1,2,1.0,0.0,0.0,0.0,1,1,0,0,1,1,0
3,100026,0.5,1,27.5,27.5,0.0,1,2,1.0,0.0,0.0,0.0,3,3,0,0,0,2,0
4,100028,0.333333,1,26.583333,47.5,0.0,1,6,1.0,0.0,0.0,0.0,3,3,0,0,2,4,1


### Concatenate Dataframes by HHNUM

In [7]:
# Merging household and individual data
df_final = pd.merge(df_filtered, df_individual, on = 'hhnum', how = 'outer')

# For some reason, pd.merge converts the integers in df_filtered to floats...
# This will be fixed after getting rid of NaN's

print(df_final.shape)
df_final

(4826, 31)


Unnamed: 0,hhnum,adltfscat,binadltfscat,fincondition,region,rural,rrindicator,targetgroup,inchhavg_r,snapnowhh,...,prop_black,prop_other,prop_narace,educcat,pr_educcat,ncimmigrant_any,ncimmigrant_every,nchildren,nadults,disabled
0,100012,2.0,0.0,4.0,3.0,1.0,31.0,4.0,4667.33,1.0,...,0.0,0.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,0.0
1,100015,2.0,0.0,3.0,3.0,0.0,30.0,2.0,1200.00,0.0,...,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0
2,100024,2.0,0.0,3.0,2.0,0.0,20.0,3.0,5024.50,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
3,100026,3.0,1.0,3.0,3.0,1.0,31.0,2.0,1800.00,0.0,...,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,2.0,0.0
4,100028,4.0,1.0,4.0,1.0,0.0,10.0,4.0,3998.00,1.0,...,0.0,0.0,0.0,3.0,3.0,0.0,0.0,2.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4821,107940,,,,,,,,,,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0
4822,109304,,,,,,,,,,...,0.0,1.0,0.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0
4823,111139,,,,,,,,,,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0
4824,113856,,,,,,,,,,...,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0


### Check for NaN values in any household

There are 10 households with NaN values in the rows (from data_cleaning.ipynb)

In [8]:
df_final[df_final.isna().any(axis = 1)]

Unnamed: 0,hhnum,adltfscat,binadltfscat,fincondition,region,rural,rrindicator,targetgroup,inchhavg_r,snapnowhh,...,prop_black,prop_other,prop_narace,educcat,pr_educcat,ncimmigrant_any,ncimmigrant_every,nchildren,nadults,disabled
4257,117741,3.0,1.0,4.0,4.0,0.0,40.0,2.0,2160.0,0.0,...,,,,,,,,,,
4817,101575,,,,,,,,,,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4818,102398,,,,,,,,,,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
4819,103790,,,,,,,,,,...,0.0,1.0,0.0,2.0,2.0,0.0,0.0,2.0,2.0,0.0
4820,104671,,,,,,,,,,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,2.0,0.0
4821,107940,,,,,,,,,,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0
4822,109304,,,,,,,,,,...,0.0,1.0,0.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0
4823,111139,,,,,,,,,,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0
4824,113856,,,,,,,,,,...,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0
4825,118585,,,,,,,,,,...,1.0,0.0,0.0,1.0,1.0,2.0,2.0,0.0,2.0,0.0


Removing these rows

In [9]:
df_final = df_final[df_final.notna().all(axis = 1)]
print(df_final.shape)
df_final

(4816, 31)


Unnamed: 0,hhnum,adltfscat,binadltfscat,fincondition,region,rural,rrindicator,targetgroup,inchhavg_r,snapnowhh,...,prop_black,prop_other,prop_narace,educcat,pr_educcat,ncimmigrant_any,ncimmigrant_every,nchildren,nadults,disabled
0,100012,2.0,0.0,4.0,3.0,1.0,31.0,4.0,4667.33,1.0,...,0.0,0.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,0.0
1,100015,2.0,0.0,3.0,3.0,0.0,30.0,2.0,1200.00,0.0,...,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0
2,100024,2.0,0.0,3.0,2.0,0.0,20.0,3.0,5024.50,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
3,100026,3.0,1.0,3.0,3.0,1.0,31.0,2.0,1800.00,0.0,...,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,2.0,0.0
4,100028,4.0,1.0,4.0,1.0,0.0,10.0,4.0,3998.00,1.0,...,0.0,0.0,0.0,3.0,3.0,0.0,0.0,2.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4812,120049,3.0,1.0,3.0,3.0,0.0,30.0,2.0,2200.00,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
4813,120067,3.0,1.0,5.0,3.0,1.0,31.0,4.0,2500.00,1.0,...,0.0,0.0,0.0,2.0,2.0,0.0,0.0,2.0,4.0,0.0
4814,120077,3.0,1.0,2.0,3.0,1.0,31.0,4.0,1009.00,1.0,...,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0
4815,120078,3.0,1.0,4.0,3.0,0.0,30.0,4.0,523.60,1.0,...,1.0,0.0,0.0,2.0,2.0,0.0,0.0,1.0,1.0,0.0


### Removing households with weird values as noted in data_cleaning_indiv.ipynb

These include:
* RACECAT_R == 0
* EDUCCAT == 0
* NCIMMIGRANT == 2 -> In the USCITIZEN portion of notebook

There are 8 households with above properties

In [10]:
df_final[(df_final['mode_racecat_r'] == 0) | (df_final['educcat'] == 0) | (df_final['ncimmigrant_every'] == 2)]

Unnamed: 0,hhnum,adltfscat,binadltfscat,fincondition,region,rural,rrindicator,targetgroup,inchhavg_r,snapnowhh,...,prop_black,prop_other,prop_narace,educcat,pr_educcat,ncimmigrant_any,ncimmigrant_every,nchildren,nadults,disabled
762,103068,1.0,0.0,2.0,4.0,0.0,40.0,3.0,4246.676,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,2.0,3.0,0.0
1775,107438,3.0,1.0,4.0,1.0,0.0,10.0,2.0,2341.738,0.0,...,0.0,0.0,1.0,3.0,3.0,0.0,0.0,1.0,2.0,0.0
2295,109624,3.0,1.0,3.0,1.0,0.0,10.0,1.0,204.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2915,112223,3.0,1.0,4.0,3.0,0.0,30.0,4.0,1058.4,1.0,...,0.0,1.0,0.0,3.0,3.0,2.0,2.0,0.0,1.0,0.0
3172,113295,1.0,0.0,2.0,1.0,0.0,10.0,1.0,971.0,0.0,...,0.0,0.0,1.0,3.0,0.0,0.0,0.0,1.0,1.0,1.0
3242,113595,1.0,0.0,2.0,4.0,0.0,40.0,4.0,1188.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3426,114319,2.0,0.0,3.0,4.0,0.0,40.0,4.0,1610.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0,0.0
4425,118411,3.0,1.0,4.0,2.0,1.0,21.0,3.0,3518.662,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0


Removing them:

In [11]:
df_final = df_final[(df_final['mode_racecat_r'] != 0) & (df_final['educcat'] != 0) & (df_final['ncimmigrant_every'] != 2)]
print(df_final.shape)
df_final

(4808, 31)


Unnamed: 0,hhnum,adltfscat,binadltfscat,fincondition,region,rural,rrindicator,targetgroup,inchhavg_r,snapnowhh,...,prop_black,prop_other,prop_narace,educcat,pr_educcat,ncimmigrant_any,ncimmigrant_every,nchildren,nadults,disabled
0,100012,2.0,0.0,4.0,3.0,1.0,31.0,4.0,4667.33,1.0,...,0.0,0.0,0.0,1.0,2.0,0.0,0.0,2.0,3.0,0.0
1,100015,2.0,0.0,3.0,3.0,0.0,30.0,2.0,1200.00,0.0,...,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0
2,100024,2.0,0.0,3.0,2.0,0.0,20.0,3.0,5024.50,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
3,100026,3.0,1.0,3.0,3.0,1.0,31.0,2.0,1800.00,0.0,...,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,2.0,0.0
4,100028,4.0,1.0,4.0,1.0,0.0,10.0,4.0,3998.00,1.0,...,0.0,0.0,0.0,3.0,3.0,0.0,0.0,2.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4812,120049,3.0,1.0,3.0,3.0,0.0,30.0,2.0,2200.00,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
4813,120067,3.0,1.0,5.0,3.0,1.0,31.0,4.0,2500.00,1.0,...,0.0,0.0,0.0,2.0,2.0,0.0,0.0,2.0,4.0,0.0
4814,120077,3.0,1.0,2.0,3.0,1.0,31.0,4.0,1009.00,1.0,...,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0
4815,120078,3.0,1.0,4.0,3.0,0.0,30.0,4.0,523.60,1.0,...,1.0,0.0,0.0,2.0,2.0,0.0,0.0,1.0,1.0,0.0


### Convert Unnecessary Floats back to Integers

In [12]:
df_final.columns

Index(['hhnum', 'adltfscat', 'binadltfscat', 'fincondition', 'region', 'rural',
       'rrindicator', 'targetgroup', 'inchhavg_r', 'snapnowhh', 'povthresh_hh',
       'elig_units1', 'percent_poverty', 'percent_female', 'pr_sex',
       'avg_age_r', 'pr_age_r', 'hispanic', 'mode_racecat_r', 'alt_pnum',
       'prop_white', 'prop_black', 'prop_other', 'prop_narace', 'educcat',
       'pr_educcat', 'ncimmigrant_any', 'ncimmigrant_every', 'nchildren',
       'nadults', 'disabled'],
      dtype='object')

In [13]:
df_final['adltfscat'] = pd.to_numeric(df_final['adltfscat'], downcast='integer')
df_final['binadltfscat'] = pd.to_numeric(df_final['binadltfscat'], downcast='integer')
df_final['fincondition'] = pd.to_numeric(df_final['fincondition'], downcast='integer')
df_final['region'] = pd.to_numeric(df_final['region'], downcast='integer')
df_final['rural'] = pd.to_numeric(df_final['rural'], downcast='integer')
df_final['rrindicator'] = pd.to_numeric(df_final['rrindicator'], downcast='integer')
df_final['targetgroup'] = pd.to_numeric(df_final['targetgroup'], downcast='integer')
df_final['snapnowhh'] = pd.to_numeric(df_final['snapnowhh'], downcast='integer')
df_final['elig_units1'] = pd.to_numeric(df_final['elig_units1'], downcast='integer')
df_final['pr_sex'] = pd.to_numeric(df_final['pr_sex'], downcast='integer')
df_final['mode_racecat_r'] = pd.to_numeric(df_final['mode_racecat_r'], downcast='integer')
df_final['alt_pnum'] = pd.to_numeric(df_final['alt_pnum'], downcast='integer')
df_final['educcat'] = pd.to_numeric(df_final['educcat'], downcast='integer')
df_final['pr_educcat'] = pd.to_numeric(df_final['pr_educcat'], downcast='integer')
df_final['ncimmigrant_any'] = pd.to_numeric(df_final['ncimmigrant_any'], downcast='integer')
df_final['ncimmigrant_every'] = pd.to_numeric(df_final['ncimmigrant_every'], downcast='integer')
df_final['nchildren'] = pd.to_numeric(df_final['nchildren'], downcast='integer')
df_final['nadults'] = pd.to_numeric(df_final['nadults'], downcast='integer')
df_final['disabled'] = pd.to_numeric(df_final['disabled'], downcast='integer')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['adltfscat'] = pd.to_numeric(df_final['adltfscat'], downcast='integer')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['binadltfscat'] = pd.to_numeric(df_final['binadltfscat'], downcast='integer')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['fincondition'] = pd.to_nume

In [14]:
df_final

Unnamed: 0,hhnum,adltfscat,binadltfscat,fincondition,region,rural,rrindicator,targetgroup,inchhavg_r,snapnowhh,...,prop_black,prop_other,prop_narace,educcat,pr_educcat,ncimmigrant_any,ncimmigrant_every,nchildren,nadults,disabled
0,100012,2,0,4,3,1,31,4,4667.33,1,...,0.0,0.0,0.0,1,2,0,0,2,3,0
1,100015,2,0,3,3,0,30,2,1200.00,0,...,1.0,0.0,0.0,2,2,0,0,0,1,0
2,100024,2,0,3,2,0,20,3,5024.50,0,...,0.0,0.0,0.0,1,1,0,0,1,1,0
3,100026,3,1,3,3,1,31,2,1800.00,0,...,0.0,0.0,0.0,3,3,0,0,0,2,0
4,100028,4,1,4,1,0,10,4,3998.00,1,...,0.0,0.0,0.0,3,3,0,0,2,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4812,120049,3,1,3,3,0,30,2,2200.00,0,...,1.0,0.0,0.0,1,1,0,0,1,1,0
4813,120067,3,1,5,3,1,31,4,2500.00,1,...,0.0,0.0,0.0,2,2,0,0,2,4,0
4814,120077,3,1,2,3,1,31,4,1009.00,1,...,0.0,0.0,0.0,2,2,0,0,0,1,0
4815,120078,3,1,4,3,0,30,4,523.60,1,...,1.0,0.0,0.0,2,2,0,0,1,1,0


## Final dataframe contains 4808 households with all the economic, geographic, and demographic publicly available data

### To CSV:

In [15]:
df_final.to_csv('faps_clean.csv')