# 20151230-predict-household-income-from-census

Related post:  
https://stharrold.github.io/20151230-predict-household-income-from-census.html

Purpose: Predict total annual household income.

## Initialization

### Imports

In [1]:
cd ~

/home/samuel_harrold


In [2]:
# Import standard packages.
import collections
import json
import os
import pdb # TEST: Comment out pdb after testing.
import pprint
import subprocess
import sys
# Import installed packages.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Import local packages.
# The path is built from os.path.curdir, so it resolves against the current working directory
# (set by the `cd ~` cell at the top of the notebook).
sys.path.insert(0, os.path.join(os.path.curdir, 'stharrold.github.io/content/static/dsdemos'))
# TEST: Comment out autoreload after testing.
%reload_ext autoreload
%autoreload 2
import dsdemos as dsd
# IPython magic.
%matplotlib inline

## Globals

In [22]:
# File paths: blog static content, then the data disk and the ACS directory tree on it.
path_static = os.path.join(os.path.expanduser(r'~'), r'stharrold.github.io/content/static')
basename = r'20151230-predict-household-income-from-census'
path_disk = os.path.abspath(r'/mnt/disk-20151227t211000z/')
path_acs = os.path.join(path_disk, r'www2-census-gov/programs-surveys/acs/')
# PUMS 2013 5-Year CSV files: 'pdc' = 'person DC', 'hdc' = 'housing DC'.
path_pcsv = os.path.join(path_acs, r'data/pums/2013/5-Year/ss13pdc.csv')
path_hcsv = os.path.join(path_acs, r'data/pums/2013/5-Year/ss13hdc.csv')
# PUMS technical documentation: estimates CSV and data dictionary text file.
path_ecsv = os.path.join(path_acs, r'tech_docs/pums/estimates/pums_estimates_9_13.csv')
path_dtxt = os.path.join(path_acs, r'tech_docs/pums/data_dict/PUMS_Data_Dictionary_2009-2013.txt')
# Statistics: percentiles at -1 std. dev., median, +1 std. dev. for a normal distribution.
percentiles = [0.1587, 0.5000, 0.8413]
# Weights: base column name, then the 80 numbered columns <name>1 .. <name>80.
hwt = 'WGTP' # housing
hwts = ['{wt}{num}'.format(wt=hwt, num=num) for num in range(1, 81)]
pwt = 'PWGTP' # person
pwts = ['{wt}{num}'.format(wt=pwt, num=num) for num in range(1, 81)]

## Extract-transform-load

**TODO:**
* Just use pandas. Acknowledge dask.

### PUMS data

https://www.census.gov/programs-surveys/acs/data/data-via-ftp.html

http://www2.census.gov/programs-surveys/acs/data/pums/2013/5-Year/ss13pdc.csv

http://www2.census.gov/programs-surveys/acs/data/pums/2013/5-Year/ss13hdc.csv

In [4]:
%%time
for path in [path_pcsv, path_hcsv]:
    with open(path) as fobj:
        nlines = sum(1 for _ in fobj)
    print("{path}:".format(path=path))
    print("    size (MB) = {size:.1f}".format(size=os.path.getsize(path)/1e6))
    print("    num lines = {nlines}".format(nlines=nlines))
print()
dfp = pd.read_csv(path_pcsv)
dfh = pd.read_csv(path_hcsv)
for (name, df) in [('dfp', dfp), ('dfh', dfh)]:
    print("{name} RAM usage (MB) = {mem:.1f}".format(
            name=name, mem=df.memory_usage().sum()/1e6))
print()

/mnt/disk-20151227t211000z/www2-census-gov/programs-surveys/acs/data/pums/2013/5-Year/ss13pdc.csv:
    size (MB) = 30.5
    num lines = 30560
/mnt/disk-20151227t211000z/www2-census-gov/programs-surveys/acs/data/pums/2013/5-Year/ss13hdc.csv:
    size (MB) = 13.5
    num lines = 17501

dfp RAM usage (MB) = 72.1
dfh RAM usage (MB) = 28.7

CPU times: user 1.81 s, sys: 148 ms, total: 1.96 s
Wall time: 2.5 s


  """Implementation of execution-related magic functions."""


In [5]:
# Print every column name of the person records, then display the first rows.
print(dfp.columns.values)
dfp.head()

['RT' 'SERIALNO' 'SPORDER' 'PUMA00' 'PUMA10' 'ST' 'ADJINC' 'PWGTP' 'AGEP'
 'CIT' 'CITWP05' 'CITWP12' 'COW' 'DDRS' 'DEAR' 'DEYE' 'DOUT' 'DPHY' 'DRAT'
 'DRATX' 'DREM' 'ENG' 'FER' 'GCL' 'GCM' 'GCR' 'HINS1' 'HINS2' 'HINS3'
 'HINS4' 'HINS5' 'HINS6' 'HINS7' 'INTP' 'JWMNP' 'JWRIP' 'JWTR' 'LANX' 'MAR'
 'MARHD' 'MARHM' 'MARHT' 'MARHW' 'MARHYP05' 'MARHYP12' 'MIG' 'MIL' 'MLPA'
 'MLPB' 'MLPCD' 'MLPE' 'MLPFG' 'MLPH' 'MLPI' 'MLPJ' 'MLPK' 'NWAB' 'NWAV'
 'NWLA' 'NWLK' 'NWRE' 'OIP' 'PAP' 'RELP' 'RETP' 'SCH' 'SCHG' 'SCHL' 'SEMP'
 'SEX' 'SSIP' 'SSP' 'WAGP' 'WKHP' 'WKL' 'WKW' 'WRK' 'YOEP05' 'YOEP12' 'ANC'
 'ANC1P05' 'ANC1P12' 'ANC2P05' 'ANC2P12' 'DECADE' 'DIS' 'DRIVESP' 'ESP'
 'ESR' 'FOD1P' 'FOD2P' 'HICOV' 'HISP' 'INDP' 'JWAP' 'JWDP' 'LANP05'
 'LANP12' 'MIGPUMA00' 'MIGPUMA10' 'MIGSP05' 'MIGSP12' 'MSP' 'NAICSP'
 'NATIVITY' 'NOP' 'OC' 'OCCP02' 'OCCP10' 'OCCP12' 'PAOC' 'PERNP' 'PINCP'
 'POBP05' 'POBP12' 'POVPIP' 'POWPUMA00' 'POWPUMA10' 'POWSP05' 'POWSP12'
 'PRIVCOV' 'PUBCOV' 'QTRBIR' 'RAC1P' 'RAC2P05' 'RAC2P

Unnamed: 0,RT,SERIALNO,SPORDER,PUMA00,PUMA10,ST,ADJINC,PWGTP,AGEP,CIT,...,PWGTP71,PWGTP72,PWGTP73,PWGTP74,PWGTP75,PWGTP76,PWGTP77,PWGTP78,PWGTP79,PWGTP80
0,P,2009000000403,1,102,-9,11,1085467,20,38,1,...,6,26,31,32,26,6,36,6,19,20
1,P,2009000001113,1,103,-9,11,1085467,13,78,1,...,13,30,12,13,4,4,18,24,4,21
2,P,2009000001113,2,103,-9,11,1085467,25,39,1,...,26,50,23,20,8,7,38,41,7,37
3,P,2009000001113,3,103,-9,11,1085467,17,8,1,...,15,32,17,15,6,4,26,32,5,30
4,P,2009000001978,1,103,-9,11,1085467,37,53,1,...,65,12,13,37,36,41,57,36,11,33


In [6]:
# Print every column name of the housing records, then display the first rows.
print(dfh.columns.values)
dfh.head()

['insp' 'RT' 'SERIALNO' 'DIVISION' 'PUMA00' 'PUMA10' 'REGION' 'ST' 'ADJHSG'
 'ADJINC' 'WGTP' 'NP' 'TYPE' 'ACR' 'AGS' 'BATH' 'BDSP' 'BLD' 'BUS' 'CONP'
 'ELEP' 'FS' 'FULP' 'GASP' 'HFL' 'MHP' 'MRGI' 'MRGP' 'MRGT' 'MRGX' 'REFR'
 'RMSP' 'RNTM' 'RNTP' 'RWAT' 'RWATPR' 'SINK' 'SMP' 'STOV' 'TEL' 'TEN'
 'TOIL' 'VACS' 'VALP' 'VEH' 'WATP' 'YBL' 'FES' 'FINCP' 'FPARC' 'GRNTP'
 'GRPIP' 'HHL' 'HHT' 'HINCP' 'HUGCL' 'HUPAC' 'HUPAOC' 'HUPARC' 'KIT' 'LNGI'
 'MULTG' 'MV' 'NOC' 'NPF' 'NPP' 'NR' 'NRC' 'OCPIP' 'PARTNER' 'PLM' 'PSF'
 'R18' 'R60' 'R65' 'RESMODE' 'SMOCP' 'SMX' 'SRNT' 'SVAL' 'TAXP' 'WIF'
 'WKEXREL' 'WORKSTAT' 'FACRP' 'FAGSP' 'FBATHP' 'FBDSP' 'FBLDP' 'FBUSP'
 'FCONP' 'FELEP' 'FFSP' 'FFULP' 'FGASP' 'FHFLP' 'FINSP' 'FKITP' 'FMHP'
 'FMRGIP' 'FMRGP' 'FMRGTP' 'FMRGXP' 'FMVP' 'FPLMP' 'FREFRP' 'FRMSP'
 'FRNTMP' 'FRNTP' 'FRWATP' 'FRWATPRP' 'FSINKP' 'FSMP' 'FSMXHP' 'FSMXSP'
 'FSTOVP' 'FTAXP' 'FTELP' 'FTENP' 'FTOILP' 'FVACSP' 'FVALP' 'FVEHP' 'FWATP'
 'FYBLP' 'WGTP1' 'WGTP2' 'WGTP3' 'WGTP4' 'WGTP5' 'WGTP6' '

Unnamed: 0,insp,RT,SERIALNO,DIVISION,PUMA00,PUMA10,REGION,ST,ADJHSG,ADJINC,...,WGTP71,WGTP72,WGTP73,WGTP74,WGTP75,WGTP76,WGTP77,WGTP78,WGTP79,WGTP80
0,600.0,H,2009000000403,5,102,-9,3,11,1086032,1085467,...,6,25,30,32,26,6,36,6,18,19
1,,H,2009000001113,5,103,-9,3,11,1086032,1085467,...,14,29,12,12,4,4,18,23,4,22
2,480.0,H,2009000001978,5,103,-9,3,11,1086032,1085467,...,65,12,14,37,36,41,57,36,11,34
3,,H,2009000002250,5,105,-9,3,11,1086032,1085467,...,4,4,4,4,23,14,11,4,20,21
4,2500.0,H,2009000002985,5,101,-9,3,11,1086032,1085467,...,66,45,10,35,34,10,34,55,50,10


### PUMS estimates for user verification

https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.2013.html

http://www2.census.gov/programs-surveys/acs/tech_docs/pums/estimates/pums_estimates_9_13.csv

http://www2.census.gov/programs-surveys/acs/tech_docs/pums/ACS2009_2013_PUMS_README.pdf

In [26]:
# Read the PUMS estimates file and keep only the rows for the District of Columbia.
dfe = pd.read_csv(path_ecsv)
tfmask = dfe['state'].eq('District of Columbia')
dfe_dc = dfe[tfmask]
dfe_dc

Unnamed: 0,st,state,characteristic,pums_est_09_to_13,pums_se_09_to_13,pums_moe_09_to_13
288,11,District of Columbia,Total population,619371,0,0
289,11,District of Columbia,Housing unit population (RELP=0-15),579281,0,0
290,11,District of Columbia,GQ population (RELP=16-17),40090,0,0
291,11,District of Columbia,GQ institutional population (RELP=16),7443,80,132
292,11,District of Columbia,GQ noninstitutional population (RELP=17),32647,80,132
293,11,District of Columbia,Total males (SEX=1),292566,361,595
294,11,District of Columbia,Total females (SEX=2),326805,361,595
295,11,District of Columbia,Age 0-4,36530,253,417
296,11,District of Columbia,Age 5-9,27658,636,1046
297,11,District of Columbia,Age 10-14,24621,598,984


In [99]:
print("Verify 'PERSON RECORD':")
# Each characteristic from the estimates file maps to a function that builds the
# boolean row mask selecting the matching person records. Callables are used
# instead of eval()-ed source strings so the masks are ordinary, inspectable code.
# Series.between(lo, hi) is inclusive on both ends, i.e. lo <= value <= hi.
tfmask_tests = collections.OrderedDict([
    ('Total population', lambda df: np.ones(len(df), dtype=bool)),
    ('Housing unit population (RELP=0-15)', lambda df: df['RELP'].between(0, 15)),
    ('GQ population (RELP=16-17)', lambda df: df['RELP'].between(16, 17)),
    ('GQ institutional population (RELP=16)', lambda df: df['RELP'] == 16),
    ('GQ noninstitutional population (RELP=17)', lambda df: df['RELP'] == 17),
    ('Total males (SEX=1)', lambda df: df['SEX'] == 1),
    ('Total females (SEX=2)', lambda df: df['SEX'] == 2),
    ('Age 0-4', lambda df: df['AGEP'].between(0, 4)),
    ('Age 5-9', lambda df: df['AGEP'].between(5, 9)),
    ('Age 10-14', lambda df: df['AGEP'].between(10, 14)),
    ('Age 15-19', lambda df: df['AGEP'].between(15, 19)),
    ('Age 20-24', lambda df: df['AGEP'].between(20, 24)),
    ('Age 25-34', lambda df: df['AGEP'].between(25, 34)),
    ('Age 35-44', lambda df: df['AGEP'].between(35, 44)),
    ('Age 45-54', lambda df: df['AGEP'].between(45, 54)),
    ('Age 55-59', lambda df: df['AGEP'].between(55, 59)),
    ('Age 60-64', lambda df: df['AGEP'].between(60, 64)),
    ('Age 65-74', lambda df: df['AGEP'].between(65, 74)),
    ('Age 75-84', lambda df: df['AGEP'].between(75, 84)),
    ('Age 85 and over', lambda df: df['AGEP'] >= 85)])
for (char, make_tfmask) in tfmask_tests.items():
    print("'{char}':".format(char=char), end=' ')
    # Reference value: the estimate listed for this characteristic in the DC rows.
    tfmask_ref = dfe_dc['characteristic'] == char
    ref_raw = dfe_dc.loc[tfmask_ref, 'pums_est_09_to_13'].values[0]
    # str() lets the parse work whether the column was read as text
    # (possibly with thousands separators) or as integers.
    ref = int(str(ref_raw).replace(',', ''))
    # Test value: sum of the person weight over the selected records.
    tfmask_test = make_tfmask(dfp)
    test = dfp.loc[tfmask_test, pwt].sum()
    # The message carries the values because the assert fires before they are printed.
    assert ref == test, (
        "'{char}' does not match the reference estimate: (ref, test) = {tup}".format(
            char=char, tup=(ref, test)))
    print("(ref, test) = {tup}".format(tup=(ref, test)))

Verify 'PERSON RECORD':
'Total population': (ref, test) = (619371, 619371)
'Housing unit population (RELP=0-15)': (ref, test) = (579281, 579281)
'GQ population (RELP=16-17)': (ref, test) = (40090, 40090)
'GQ institutional population (RELP=16)': (ref, test) = (7443, 7443)
'GQ noninstitutional population (RELP=17)': (ref, test) = (32647, 32647)
'Total males (SEX=1)': (ref, test) = (292566, 292566)
'Total females (SEX=2)': (ref, test) = (326805, 326805)
'Age 0-4': (ref, test) = (36530, 36530)
'Age 5-9': (ref, test) = (27658, 27658)
'Age 10-14': (ref, test) = (24621, 24621)
'Age 15-19': (ref, test) = (40950, 40950)
'Age 20-24': (ref, test) = (58828, 58828)
'Age 25-34': (ref, test) = (134025, 134025)
'Age 35-44': (ref, test) = (84310, 84310)
'Age 45-54': (ref, test) = (75981, 75981)
'Age 55-59': (ref, test) = (35191, 35191)
'Age 60-64': (ref, test) = (31070, 31070)
'Age 65-74': (ref, test) = (38245, 38245)
'Age 75-84': (ref, test) = (22283, 22283)
'Age 85 and over': (ref, test) = (9679, 967

In [16]:
# Row-wise (axis=1) expression of the form sqrt((4/80) * sum of squared differences).
# NOTE(review): `cols_weights` is not defined in any cell shown in this notebook; the Globals
# cell defines `hwts` for the numbered housing weight columns — confirm which name was intended.
# NOTE(review): subtracting the one-column frame dfh[['HINCP']] looks like it would align on
# column labels rather than subtract HINCP from every weight column — verify the intended result.
((4/80)*((dfh[cols_weights]-dfh[['HINCP']])**2).sum(axis=1))**0.5

30559

### Data dictionary

https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.2013.html

http://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_2009-2013.txt

In [23]:
# Parse the PUMS data dictionary text file with the local `dsdemos` package.
# Later cells index the result as ddict['record_types'][<record type>][<variable>]['description'].
ddict = dsd.census.parse_pumsdatadict(path=path_dtxt)

## Select features

**Notes:**
* Example consumer databases: http://www.consumerreports.org/cro/money/consumer-protection/big-brother-is-watching/overview/index.htm?rurl=http%3A%2F%2Fwww.consumerreports.org%2Fcro%2Fmoney%2Fconsumer-protection%2Fbig-brother-is-watching%2Foverview%2Findex.htm
* Random forests split on the ordering of feature values, so they are scale invariant and can accommodate monotonic non-linear transformations of the features.
* Cast all values to floats so that they are compatible with most algorithms and can be compared with `<`/`>` logic. Otherwise, the data are less informationally dense and the trees may require a deeper structure to find features.
* To "map to float" ('b' is N/A, mapped to 0; 1 is Yes; 2 is No; other values are special):  
```python
test = pd.DataFrame(data=[['b', 1.0], ['1', 1.0], ['2', 1.0], ['3', 1.1], ['4', 1.1]], columns=['COL', 'ADJ'])
tfmask = test['COL'].isin(['b'])
test.loc[tfmask, 'COL'] = 0.0
test['COL'] = test['COL'].astype(float)
print(test.dtypes)
test
```
* To "adjust for inflation":  
```python
test['ADJ'] *= 1e-6
tfmask = test['COL'] >= 3.0
test.loc[tfmask, 'COL'] *= test.loc[tfmask, 'ADJ']
test
```
* TODO: Remove vacant units (housing records with 'NP' == 0) from data frame.
* TODO: Filter categorical variables from metadata (those without '..').

In [66]:
record_type = 'PERSON RECORD'
print_detail = False
# List the variables of the record type, skipping 'F*' variables whose description ends in
# ' flag' or ' edit' and the weight-replicate variables.
for (key, var_meta) in ddict['record_types'][record_type].items():
    desc = var_meta['description']
    is_flag_or_edit = key.startswith('F') and desc.endswith((' flag', ' edit'))
    is_weight_replicate = 'WGTP' in key and "Weight replicate" in desc
    if is_flag_or_edit or is_weight_replicate:
        continue
    if print_detail:
        # Full metadata for the variable.
        print(key)
        pprint.pprint(var_meta)
    else:
        # One-line summary.
        print("{key}: {desc}".format(key=key, desc=desc))

RT: Record Type
SERIALNO: Housing unit/GQ person serial number
SPORDER: Person number
PUMA00: Public use microdata area code (PUMA) based on Census 2000 definition for data collected prior to 2012. Use in combination with PUMA10.
PUMA10: Public use microdata area code (PUMA) based on 2010 Census definition for data Collected in 2012 or later. Use in combination with PUMA00.
ST: State Code
ADJINC: Adjustment factor for income and earnings dollar amounts (6 implied decimal places)
PWGTP: Person's weight
AGEP: Age
CIT: Citizenship status
CITWP05: Year of naturalization write-in for data collected prior to 2012
CITWP12: Year of naturalization write-in for data collected in 2012 or later
COW: Class of worker
DDRS: Self-care difficulty
DEAR: Hearing difficulty
DEYE: Vision difficulty
DOUT: Independent living difficulty
DPHY: Ambulatory difficulty
DRAT: Veteran service connected disability rating (percentage)
DRATX: Veteran service connected disability rating (checkbox)
DREM: Cognitive diffic

In [85]:
# Columns that companies can plausibly obtain with little effort, per record type.
# Column details: https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.2013.html
# Prediction target: HINCP

cols_include = {
    'HOUSING RECORD': [
        'SERIALNO',                 # serial number
        'PUMA00', 'PUMA10', 'ST',   # area and state codes
        'ADJHSG', 'ADJINC',         # adjustment factors
        'WGTP',                     # housing weight
        'NP', 'BDSP', 'BLD',
        'HINCP',                    # target
        'R18', 'R65'],
    'PERSON RECORD': [
        'SERIALNO', 'SPORDER',      # serial number, person number
        'PWGTP',                    # person weight
        'AGEP', 'MAR', 'SCHL', 'INDP'],
    }

In [86]:
# For each record type, print the data-dictionary description of every selected column.
for (record_type, var_names) in cols_include.items():
    print(record_type)
    for var_name in var_names:
        var_meta = ddict['record_types'][record_type][var_name]
        print("{var}: {desc}".format(var=var_name, desc=var_meta['description']))
    print()

HOUSING RECORD
SERIALNO: Housing unit/GQ person serial number
PUMA00: Public use microdata area code (PUMA) based on Census 2000 definition for data collected prior to 2012. Use in combination with PUMA10.
PUMA10: Public use microdata area code (PUMA) based on 2010 Census definition for data collected in 2012 or later. Use in combination with PUMA00.
ST: State Code
ADJHSG: Adjustment factor for housing dollar amounts (6 implied decimal places)
ADJINC: Adjustment factor for income and earnings dollar amounts (6 implied decimal places)
WGTP: Housing Weight
NP: Number of person records following this housing record
BDSP: Number of bedrooms
BLD: Units in structure
HINCP: Household income (past 12 months)
R18: Presence of persons under 18 years in household (unweighted)
R65: Presence of persons 65 years and over in household (unweighted)

PERSON RECORD
SERIALNO: Housing unit/GQ person serial number
SPORDER: Person number
PWGTP: Person's weight
AGEP: Age
MAR: Marital status
SCHL: Educational

In [80]:
record_type = 'PERSON RECORD'
var_codes = ddict['record_types'][record_type]['INDP']['var_codes']
# Abbreviate each INDP code by the first three characters of the text before the first '-'.
indp_abbr = {
    code: label.split('-', 1)[0][:3]
    for (code, label) in var_codes.items()}
# Show the distinct abbreviations.
print(sorted(set(indp_abbr.values())))

['ADM', 'AGR', 'CON', 'EDU', 'ENT', 'EXT', 'FIN', 'INF', 'MED', 'MFG', 'MIL', 'Not', 'PRF', 'RET', 'SCA', 'SRV', 'TRN', 'UNE', 'UTL', 'WHL']


Actions for included columns:  
* HOUSING RECORD
    * SERIALNO: Use to join to PERSON RECORD.
    * PUMA00, PUMA10, ST: Combine and lookup lat-lon coordinates from census.gov.
    * ADJHSG, ADJINC: Multiply against other columns to adjust for inflation. See https://www.census.gov/library/publications/2009/acs/pums.html App 5.
        * ADJHSG: CONP, ELEP, FULP, GASP, GRNTP, INSP, MHP, MRGP, SMOCP, RNTP, SMP, WATP
        * ADJINC: INTP, OIP, PAP, PERNP, PINCP, RETP, SEMP, SSIP, SSP, WAGP
    * WGTP: Confirm with user verification file.
    * NP: Numerical. Map to float.
    * BDSP: Numerical. Map to float.
    * BLD: Map to median income.
    * HINCP: Include. Map household income to float. Adjust all with ADJINC.
    * R18: Include. Presence of persons under 18 years in household. Map to float.
    * R65: Include. Presence of persons 65+ years in household.
* PERSON RECORD
    * SERIALNO, SPORDER: Use as index.
    * PWGTP: Confirm with user verification file.
    * AGEP: Map to float.
    * MAR: Map to float.
    * SCHL: Map to float.
    * INDP: (1) Map to float. (2) Map categories to median income.

## Export ipynb to html

In [158]:
# Print the current date and time via the shell `date` command.
!date --rfc-3339='seconds'

2016-01-01 05:40:17+00:00


In [None]:
# Convert this notebook to HTML with `jupyter nbconvert`, once per template.
# Requires `subprocess`, which is imported with the other standard packages in the import cell.
path_ipynb = os.path.join(path_static, basename, basename+'.ipynb')
path_root = os.path.splitext(path_ipynb)[0]
for template in ['basic', 'full']:
    # Output name: '<notebook path without extension>-<template>.html'.
    path_html = '{root}-{template}.html'.format(root=path_root, template=template)
    cmd = ['jupyter', 'nbconvert', '--to', 'html', '--template', template, path_ipynb, '--output', path_html]
    print(' '.join(cmd))
    # check=True raises subprocess.CalledProcessError if the command exits with a non-zero status.
    subprocess.run(args=cmd, check=True)