## Data Cleaning for FAPS Household PUF 

### Imports

In [1]:
import numpy as np
import pandas as pd

### Data

In [2]:
# Read the raw FAPS household public-use file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='faps_household_puf.csv')

In [3]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,hhnum,initintrvmon,startmon,initfinaldays,initialdate_flag,startdate_edit,startlag,matchconsenthh,nonmetro,region,...,feedback2,feedback3,feedback4_1,feedback4_2,feedback4_3,feedback4_4,feedback4_5,feedback4_6,feedback4_7,feedback4_8
0,100012,1,1,8,0,0,1,1,1,3,...,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,100015,8,8,8,0,0,1,1,0,3,...,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,100024,6,6,9,0,0,0,1,0,2,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,100026,7,7,10,0,0,1,1,0,3,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,100028,5,5,8,0,0,1,1,0,1,...,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [4]:
# Summarize row count, column count, dtypes, and memory use of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4826 entries, 0 to 4825
Columns: 279 entries, hhnum to feedback4_8
dtypes: float64(75), int64(202), object(2)
memory usage: 10.3+ MB


### Filter out those with snapnowhh == -997

SNAPNOWHH - Anyone in the household is currently receiving SNAP benefits
* 0 - No
* 1 - Yes
* -997 - Don't Know

In [5]:
# Keep only households with a definite SNAP answer; -997 codes "Don't Know".
df = df.loc[df['snapnowhh'].ne(-997)]

In [6]:
# Preview the frame again after dropping the snapnowhh == -997 rows.
df.head()

Unnamed: 0,hhnum,initintrvmon,startmon,initfinaldays,initialdate_flag,startdate_edit,startlag,matchconsenthh,nonmetro,region,...,feedback2,feedback3,feedback4_1,feedback4_2,feedback4_3,feedback4_4,feedback4_5,feedback4_6,feedback4_7,feedback4_8
0,100012,1,1,8,0,0,1,1,1,3,...,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,100015,8,8,8,0,0,1,1,0,3,...,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,100024,6,6,9,0,0,0,1,0,2,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,100026,7,7,10,0,0,1,1,0,3,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,100028,5,5,8,0,0,1,1,0,1,...,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# Re-check the row count after the snapnowhh filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4824 entries, 0 to 4825
Columns: 279 entries, hhnum to feedback4_8
dtypes: float64(75), int64(202), object(2)
memory usage: 10.3+ MB


### Filter out fincondition == -997 and fincondition == -998

FINCONDITION - Household's reported financial condition
* 1 - Very comfortable and secure
* 2 - Able to make ends meet without much difficulty
* 3 - Occasionally have some difficulty making ends meet
* 4 - Tough to make ends meet but keeping your head above the water
* 5 - In over your head
* -997 - Don't know
* -998 - Refused

In [8]:
# Drop households whose financial-condition answer is -997 ("Don't know").
df = df.loc[df['fincondition'].ne(-997)]

In [9]:
# Drop households whose financial-condition answer is -998 ("Refused").
# .copy() makes the filtered result an independent frame: later cells add new
# columns to df, and assigning into a boolean-mask selection otherwise triggers
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[df['fincondition'] != -998].copy()

### Split adltfscat into a binary variable

ADLTFSCAT - Adult food security status - 30 Day
* 1 - High Food Security
* 2 - Marginal Food Security
* 3 - Low Food Security
* 4 - Very Low Food Security

In [10]:
# Tally households in each adult food-security category before binarizing.
df['adltfscat'].value_counts()

1    2515
2     959
3     784
4     559
Name: adltfscat, dtype: int64

BINADLTFSCAT

* 0 - Food Secure (ADLTFSCAT == 1 or ADLTFSCAT == 2)
* 1 - Food Insecure (ADLTFSCAT == 3 or ADLTFSCAT == 4)

In [11]:
# Boolean flag: True when the household is food insecure (category 3 or 4).
df['binadltfscat'] = df['adltfscat'].isin([3, 4])

In [12]:
# Encode the boolean flag as 0 (food secure) / 1 (food insecure).
# A direct integer cast is used instead of pd.get_dummies(..., drop_first=True):
# get_dummies returns bool columns on pandas >= 2.0 (so the result would not be
# 0/1), and with drop_first=True it produces no column at all when only one
# class is present. astype(int) is also safe to re-run on an already 0/1 column.
df['binadltfscat'] = df['binadltfscat'].astype(int)

In [13]:
# Verify the binary split: counts of 0 (food secure) vs 1 (food insecure).
df['binadltfscat'].value_counts()

0    3474
1    1343
Name: binadltfscat, dtype: int64

### Region/Rural Indicator

Create an indicator variable that combines the region and the rural status of each household into a single code.
This is useful for the exploratory analysis graphs.

In [14]:
# Combine region and rural into one code: region scaled by 10, plus rural.
# NOTE(review): this reads as "region in the tens place, rural in the ones
# place" only if rural is a single-digit value (e.g. a 0/1 flag) — confirm.
df['rrindicator'] = df['region'].mul(10).add(df['rural'])

### Write to a new CSV

faps_household_clean.csv

In [15]:
# Persist the cleaned frame. index=True is the pandas default, spelled out here:
# the row index is written as an unnamed first column.
# NOTE(review): rows were filtered without resetting the index, so that column
# has gaps; consider index=False if no downstream reader relies on it.
df.to_csv('faps_household_clean.csv', index=True)