# 20151230-predict-household-income-from-census

Related post:  
https://stharrold.github.io/20151230-predict-household-income-from-census.html

Purpose: Predict total annual household income.

## Initialization

### Imports

In [1]:
cd ~

/home/samuel_harrold


In [3]:
# Import standard packages.
import collections
import os
import pdb # TEST: Comment out pdb after testing.
import subprocess
import sys
# Import installed packages.
import matplotlib.pyplot as plt
import pandas as pd
# Import local packages.
# Make the local `dsdemos` package importable. The path is anchored at the
# user's home directory (as `path_static` is) rather than at the current
# directory, so the import does not depend on the kernel's working directory.
sys.path.insert(
    0,
    os.path.join(
        os.path.expanduser('~'),
        'stharrold.github.io/content/static/dsdemos'))
# TEST: Comment out autoreload after testing.
%reload_ext autoreload
%autoreload 2
import dsdemos as dsd
# IPython magic.
%matplotlib inline

### Globals

In [4]:
# Static content directory under the user's home directory.
path_static = os.path.join(
    os.path.expanduser('~'),
    'stharrold.github.io/content/static')
# Name shared by this notebook's directory and its file (without extension).
basename = '20151230-predict-household-income-from-census'
# Root directory of the data disk.
path_disk = os.path.abspath('/mnt/disk-20151227t211000z/')
# Directory of the ACS files on the data disk.
path_acs = os.path.join(
    path_disk,
    'www2-census-gov/programs-surveys/acs/')
# CSV file of housing records; 'hdc' = 'housing DC'.
path_csv = os.path.join(
    path_acs,
    'data/pums/2013/5-Year/ss13hdc.csv')
# Text file of the PUMS data dictionary.
path_ddict = os.path.join(
    path_acs,
    'tech_docs/pums/data_dict/PUMSDataDict13.txt')

## Extract-transform-load

**TODO:**
* Just use pandas. Acknowledge dask.

In [5]:
%%time
with open(path_csv) as fobj:
    nlines = sum(1 for _ in fobj)
print("{path}:".format(path=path_csv))
print("size (MB) = {size:.1f}".format(size=os.path.getsize(path_csv)/1e6))
print("num lines = {nlines}".format(nlines=nlines))
df = pd.read_csv(path_csv)
print("df RAM usage (MB) = {mem:.1f}".format(mem=df.memory_usage().sum()/1e6))

/mnt/disk-20151227t211000z/www2-census-gov/programs-surveys/acs/data/pums/2013/5-Year/ss13hdc.csv:
size (MB) = 13.5
num lines = 17501
df RAM usage (MB) = 28.7
CPU times: user 412 ms, sys: 56 ms, total: 468 ms
Wall time: 466 ms


In [6]:
# Percentiles that correspond to -1 std. dev., the median, and +1 std. dev.
# of a normal distribution.
percentiles = [
    0.1587,  # -1 std. dev.
    0.5,     # median (equal to the mean for a normal distribution)
    0.8413,  # +1 std. dev.
]
# Summarize every column, including the non-numeric ones.
df.describe(include='all', percentiles=percentiles)

Unnamed: 0,insp,RT,SERIALNO,DIVISION,PUMA00,PUMA10,REGION,ST,ADJHSG,ADJINC,...,WGTP71,WGTP72,WGTP73,WGTP74,WGTP75,WGTP76,WGTP77,WGTP78,WGTP79,WGTP80
count,6561.0,17500,17500.0,17500.0,17500.0,17500.0,17500.0,17500.0,17500.0,17500.0,...,17500.0,17500.0,17500.0,17500.0,17500.0,17500.0,17500.0,17500.0,17500.0,17500.0
unique,,1,,,,,,,,,...,,,,,,,,,,
top,,H,,,,,,,,,...,,,,,,,,,,
freq,,17500,,,,,,,,,...,,,,,,,,,,
mean,999.282731,,2011068000000.0,5.0,56.427371,37.764171,3.0,11.0,1039364.231657,1048478.770229,...,17.050857,17.043486,17.05,17.049029,17.048,17.051543,17.0532,17.047029,17.046971,17.051486
std,1085.174484,,1401911000.0,0.0,55.291036,55.358495,0.0,0.0,31877.254257,29598.26989,...,17.593886,17.740566,17.534604,17.555515,17.558942,17.574232,17.623017,17.802284,17.267472,17.710924
min,0.0,,2009000000000.0,5.0,-9.0,-9.0,3.0,11.0,1000000.0,1007549.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15.9%,200.0,,2009001000000.0,5.0,-9.0,-9.0,3.0,11.0,1000000.0,1007549.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
50%,790.0,,2011001000000.0,5.0,101.0,-9.0,3.0,11.0,1035725.0,1054614.0,...,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0
84.1%,1500.0,,2013000000000.0,5.0,104.0,104.0,3.0,11.0,1086032.0,1085467.0,...,30.0,30.0,30.0,31.0,30.0,30.0,30.0,30.0,31.0,30.0


In [52]:
# TEST: Turn off automatic pdb calling.
%pdb off

Automatic pdb calling has been turned OFF


In [77]:
# Parse the PUMS data dictionary text file at `path_ddict` with the local
# `dsdemos` helper. The cells below inspect the nested mapping it returns.
test = dsd.census.parse_pumsdatadict13(path=path_ddict)

In [78]:
# Top-level keys of the parsed data dictionary.
test.keys()

odict_keys(['name', 'date', 'record_types', 'notes'])

In [79]:
# Name (title) of the data dictionary.
test['name']

'2013 ACS PUMS DATA DICTIONARY'

In [80]:
# Date recorded in the data dictionary.
test['date']

'August 7, 2015'

In [81]:
# Record types that the data dictionary describes.
test['record_types'].keys()

odict_keys(['HOUSING RECORD', 'PERSON RECORD'])

In [100]:
# Housing record: record type (RT), a variable with a single code.
test['record_types']['HOUSING RECORD']['RT']

OrderedDict([('length', '1'),
             ('description', 'Record Type'),
             ('var_codes',
              OrderedDict([('H', 'Housing Record or Group Quarters Unit')]))])

In [83]:
# Housing record: PUMA, a variable whose codes are given as a range and that
# carries a note.
test['record_types']['HOUSING RECORD']['PUMA']

OrderedDict([('length', '5'),
             ('description',
              'Public use microdata area code (PUMA) based on 2010 Census definition'),
             ('var_codes',
              OrderedDict([('00100..70301',
                            'Public use microdata area codes')])),
             ('notes',
              ['Note: Public use microdata areas (PUMAs) designate areas of 100,000 or more population.  Use with ST for unique code.'])])

In [84]:
# Housing record: ADJHSG, the adjustment factor for housing dollar amounts,
# which carries a long note.
test['record_types']['HOUSING RECORD']['ADJHSG']

OrderedDict([('length', '7'),
             ('description',
              'Adjustment factor for housing dollar amounts (6 implied decimal places)'),
             ('var_codes',
              OrderedDict([('1000000', '2013 factor (1.000000)')])),
             ('notes',
              ['Note: The value of ADJHSG inflation-adjusts reported housing costs to 2013 dollars and applies to variables CONP, ELEP, FULP, GASP, GRNTP, INSP, MHP, MRGP, SMOCP, RNTP, SMP, and WATP in the housing record. ADJHSG does not apply to AGS and TAXP because they are categorical variables that should not be inflation-adjusted. ADJHSG does not apply to VALP.'])])

In [91]:
# Housing record: NP, a variable that mixes single codes with a code range.
test['record_types']['HOUSING RECORD']['NP']

OrderedDict([('length', '2'),
             ('description',
              'Number of person records following this housing record'),
             ('var_codes',
              OrderedDict([('00', 'Vacant unit'),
                           ('01',
                            'One person record (one person in household or any person in group quarters)'),
                           ('02..20',
                            'Number of person records (number of persons in household)')]))])

In [96]:
# Housing record: RWAT, a variable with a 'b' code for N/A.
test['record_types']['HOUSING RECORD']['RWAT']

OrderedDict([('length', '1'),
             ('description', 'Hot and cold running water'),
             ('var_codes',
              OrderedDict([('b', 'N/A (GQ)'),
                           ('1', 'Yes'),
                           ('2', 'No'),
                           ('9',
                            'Case is from Puerto Rico, RWAT not applicable')]))])

In [97]:
# Housing record: SMP, a dollar amount with a 'bbbbb' N/A code, a code range,
# and a note.
test['record_types']['HOUSING RECORD']['SMP']

OrderedDict([('length', '5'),
             ('description',
              'Total payment on all second and junior mortgages and home equity loans (monthly amount)'),
             ('var_codes',
              OrderedDict([('bbbbb',
                            'N/A (GQ/vacant/not owned or being bought/ /no second or junior mortgages or home equity loans)'),
                           ('00001..99999',
                            '$1 to $99999 (Rounded and top-coded)')])),
             ('notes',
              ['Note: Use ADJHSG to adjust SMP to constant dollars.'])])

In [85]:
# Housing record: WGTP53, a replicate weight whose code range starts at a
# negative value.
test['record_types']['HOUSING RECORD']['WGTP53']

OrderedDict([('length', '5'),
             ('description', 'Housing Weight replicate 53'),
             ('var_codes',
              OrderedDict([('-9999..09999',
                            'Integer weight of housing unit')]))])

In [86]:
# Housing record: WGTP54, the following replicate weight.
test['record_types']['HOUSING RECORD']['WGTP54']

OrderedDict([('length', '5'),
             ('description', 'Housing Weight replicate 54'),
             ('var_codes',
              OrderedDict([('-9999..09999',
                            'Integer weight of housing unit')]))])

In [101]:
# Person record: record type (RT).
test['record_types']['PERSON RECORD']['RT']

OrderedDict([('length', '1'),
             ('description', 'Record Type'),
             ('var_codes', OrderedDict([('P', 'Person Record')]))])

In [102]:
# Person record: PUMA, which also appears in the housing record.
test['record_types']['PERSON RECORD']['PUMA']

OrderedDict([('length', '5'),
             ('description',
              'Public use microdata area code (PUMA) based on 2010 Census definition'),
             ('var_codes',
              OrderedDict([('00100..70301',
                            'Public use microdata area codes')])),
             ('notes',
              ['Note: Public use microdata areas (PUMAs) designate areas of 100,000 or more population.  Use with ST for unique code.'])])

In [93]:
# Person record: DEAR, a variable with yes/no codes only.
test['record_types']['PERSON RECORD']['DEAR']

OrderedDict([('length', '1'),
             ('description', 'Hearing difficulty'),
             ('var_codes', OrderedDict([('1', 'Yes'), ('2', 'No')]))])

In [94]:
# Person record: MARHYP, a variable with many single-year codes.
test['record_types']['PERSON RECORD']['MARHYP']

OrderedDict([('length', '4'),
             ('description', 'Year last married'),
             ('var_codes',
              OrderedDict([('bbbb',
                            'N/A (age less than 15 years; never married)'),
                           ('1932', '1932 or earlier (Bottom-coded)'),
                           ('1933', '1933'),
                           ('1934', '1934'),
                           ('1935', '1935'),
                           ('1936', '1936'),
                           ('1937', '1937'),
                           ('1938', '1938'),
                           ('1939', '1939'),
                           ('1940', '1940'),
                           ('1941', '1941'),
                           ('1942', '1942'),
                           ('1943', '1943'),
                           ('1944', '1944'),
                           ('1945', '1945'),
                           ('1946', '1946'),
                           ('1947', '1947'),
                           ('1948'

In [88]:
# Person record: NWAB, a variable whose note is listed before its description.
test['record_types']['PERSON RECORD']['NWAB']

OrderedDict([('length', '1'),
             ('notes', ['(UNEDITED - See "Employment Status Recode" (ESR))']),
             ('description', 'Temporary absence from work'),
             ('var_codes',
              OrderedDict([('b',
                            'N/A (less than 16 years old/at work/on layoff)'),
                           ('1', 'Yes'),
                           ('2', 'No'),
                           ('3', 'Did not report')]))])

In [87]:
# Person record: INDP, a variable with a long list of industry codes.
test['record_types']['PERSON RECORD']['INDP']

OrderedDict([('length', '4'),
             ('description',
              'Industry recode for 2013 and later based on 2012 IND codes'),
             ('var_codes',
              OrderedDict([('bbbb',
                            'N/A (less than 16 years old/NILF who last worked more than 5 years ago or never worked)'),
                           ('0170', 'AGR-CROP PRODUCTION'),
                           ('0180', 'AGR-ANIMAL PRODUCTION AND AQUACULTURE'),
                           ('0190', 'AGR-FORESTRY EXCEPT LOGGING'),
                           ('0270', 'AGR-LOGGING'),
                           ('0280', 'AGR-FISHING, HUNTING AND TRAPPING'),
                           ('0290',
                            'AGR-SUPPORT ACTIVITIES FOR AGRICULTURE AND FORESTRY'),
                           ('0370', 'EXT-OIL AND GAS EXTRACTION'),
                           ('0380', 'EXT-COAL MINING'),
                           ('0390', 'EXT-METAL ORE MINING'),
                           ('0470',
         

In [98]:
# First five entries of the data dictionary's top-level notes.
test['notes'][:5]

['Note for both Industry and Occupation lists in Data Dictionary:',
 '*  In cases where the SOC occupation code ends in X(s) or Y(s), two or more',
 '   SOC occupation codes were aggregated to correspond to a specific Census',
 '   occupation code. In these cases, the Census occupation description is used',
 '   for the SOC occupation title.']

## Export ipynb to html

In [None]:
# Convert this notebook to HTML with `jupyter nbconvert`, once per template.
path_ipynb = os.path.join(path_static, basename, basename+'.ipynb')
# Each output file sits next to the notebook and is named after it, with the
# template name appended. The stem is the same for every template.
path_stem = os.path.splitext(path_ipynb)[0]
for template in ['basic', 'full']:
    path_html = path_stem+'-'+template+'.html'
    cmd = [
        'jupyter', 'nbconvert',
        '--to', 'html',
        '--template', template,
        path_ipynb,
        '--output', path_html]
    # Show the command before running it.
    print(' '.join(cmd))
    # check=True raises subprocess.CalledProcessError if the command exits
    # with a non-zero status.
    subprocess.run(args=cmd, check=True)