# 20151230-predict-income-from-census

Related post:  
https://stharrold.github.io/20151230-predict-income-from-census.html

Purpose: Predict total annual household income.

## Initialization

### Imports

In [1]:
cd ~

/home/samuel_harrold


In [2]:
# Import standard packages.
import os
import subprocess
# Import installed packages.
import matplotlib.pyplot as plt
import pandas as pd
# Import local packages.
# TODO: remove autoreload after testing.
# %load_ext autoreload
# %autoreload 2
# IPython magic.
%matplotlib inline

### Globals

In [3]:
# Name used for this post's static-content folder and for the notebook file itself.
basename = r'20151230-predict-income-from-census'
# Static content of the blog repository, located under the user's home directory.
path_static = os.path.join(
    os.path.expanduser(r'~'),
    r'stharrold.github.io/content/static')
# Data disk (mounted under /mnt) and the ACS directory tree stored on it.
path_disk = os.path.abspath(r'/mnt/disk-20151227t211000z/')
path_acs = os.path.join(path_disk, r'www2-census-gov/programs-surveys/acs/')
# Input files: the PUMS data dictionary (text) and the housing records (CSV).
path_ddict = os.path.join(path_acs, r'tech_docs/pums/data_dict/PUMSDataDict13.txt')
path_hus = os.path.join(path_acs, r'data/pums/2013/5-Year/ss13husa.csv') # 'hus' = 'housing US'

## Extract-transform-load

**TODO:**
* Just use pandas. Acknowledge dask.

In [30]:
%%time
size_mem = 100e6 # limit ingested data to 100MB for this project
with open(path_hus) as fobj:
    nlines = sum(1 for _ in fobj)
nrows = int(nlines * 100e6 / os.path.getsize(path_hus))
print("nrows = {nrows}".format(nrows=nrows))
df_acs = pd.read_csv(path_hus, nrows=nrows)
print("df_acs RAM usage (MB) = {mem:.1f}".format(mem=df_acs.memory_usage().sum()/1e6))

nrows = 127136
df_acs RAM usage (MB) = 208.5
CPU times: user 5.93 s, sys: 1.41 s, total: 7.34 s
Wall time: 7.34 s


In [31]:
# Percentiles matching -1 std. dev., the median (= mean), and +1 std. dev. of a normal distribution.
percentiles = [0.1587, 0.5000, 0.8413]
df_acs.describe(include='all', percentiles=percentiles)

Unnamed: 0,serialno,insp,RT,DIVISION,PUMA00,PUMA10,REGION,ST,ADJHSG,ADJINC,...,WGTP71,WGTP72,WGTP73,WGTP74,WGTP75,WGTP76,WGTP77,WGTP78,WGTP79,WGTP80
count,127136.0,77104.0,127136,127136.0,127136.0,127136.0,127136.0,127136.0,127136.0,127136.0,...,127136.0,127136.0,127136.0,127136.0,127136.0,127136.0,127136.0,127136.0,127136.0,127136.0
unique,,,1,,,,,,,,...,,,,,,,,,,
top,,,H,,,,,,,,...,,,,,,,,,,
freq,,,127136,,,,,,,,...,,,,,,,,,,
mean,2010954000000.0,822.495072,,6.203971,771.943879,576.476041,3.06799,1.06799,1041979.809291,1050881.213236,...,18.386035,18.379287,18.36398,18.377934,18.377116,18.382079,18.374363,18.372184,18.377533,18.375818
std,1406006000.0,763.566754,,0.75519,876.894897,898.711496,0.25173,0.25173,32077.633446,29495.626547,...,20.92366,20.959673,20.751945,20.953753,20.758336,20.786674,20.857573,20.826727,20.669387,21.13635
min,2009000000000.0,0.0,,6.0,-9.0,-9.0,3.0,1.0,1000000.0,1007549.0,...,-21.0,0.0,0.0,-5.0,-2.0,0.0,0.0,0.0,0.0,0.0
15.9%,2009001000000.0,0.0,,6.0,-9.0,-9.0,3.0,1.0,1000000.0,1007549.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
50%,2011001000000.0,750.0,,6.0,400.0,-9.0,3.0,1.0,1035725.0,1054614.0,...,13.0,12.0,13.0,12.0,13.0,13.0,13.0,13.0,13.0,12.0
84.1%,2013000000000.0,1300.0,,6.0,1900.0,1800.0,3.0,1.0,1086032.0,1085467.0,...,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0


In [None]:
import pdb

In [None]:
# Turn on IPython's automatic post-mortem debugger for uncaught exceptions.
# TODO: leftover debugging aid; remove this cell before publishing.
%pdb on

In [None]:
# TODO: Move to package.
# TODO: use example data dict as test.
# TODO: def ddict_to_json
# Parse the plain-text ACS PUMS data dictionary at `path_ddict` into the nested dict `ddict`:
#     ddict['name'], ddict['date']: the first two lines of the file.
#     ddict[record_type][var_name]: dict with 'length', 'description', an optional 'note',
#         and one entry per variable code mapping code -> code description.
# Entries in the data dictionary are "codes for variables", using the ACS terminology.
# https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.html
# Example data dictionary: http://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMSDataDict13.txt
# The data dictionary is not all encoded in UTF-8. Replace encoding errors when found.
# NOTE: This cell is intentionally a flat script; the scratch cell below reads the leaked `line`.
ddict = dict()
with open(path_ddict, encoding='utf-8', errors='replace') as fobj:
    # TEST
    # NOTE: `nrows` here counts lines read from the data dictionary and overwrites
    # the `nrows` computed when loading the CSV.
    nrows = 0
    # Data dictionary name is line 1.
    ddict['name'] = fobj.readline().strip()
    # Data dictionary date is line 2.
    ddict['date'] = fobj.readline().strip()
    # Initialize flags to catch lines.
    catch_var_name = None
    catch_var_desc = None
    catch_var_code = None
    for line in fobj:
        line = line.strip()
        # TEST: Only read first few records.
        print(line)
        nrows += 1
        if nrows >= 400:
            break
        # Record type is section header 'HOUSING RECORD' or 'PERSON RECORD'.
        if line.endswith('RECORD'):
            record_type = line
            ddict[record_type] = dict()
        # Variable name is preceded by newline.
        # Variable code is followed by newline.
        elif line == '':
            catch_var_name = True
            catch_var_code = False
        # Variable name is 1 line.
        # Variable name is followed by variable description.
        elif catch_var_name and len(line.split()) == 2:
            (var_name, var_len) = line.split()
            ddict[record_type][var_name] = dict()
            ddict[record_type][var_name]['length'] = var_len
            catch_var_name = False
            catch_var_desc = True
        # Variable description is 1 line.
        # Variable description is followed by variable code(s).
        elif catch_var_desc:
            var_desc = line
            ddict[record_type][var_name]['description'] = var_desc
            catch_var_desc = False
            catch_var_code = True
        # Variable code(s) is 1+ line:
        #     00 .Vacant unit
        #     01 .One person record (one person in household or
        #     .any person in group quarters)
        #     02..20 .Number of person records (number of persons in
        #     .household)
        # Variable code(s) is followed by newline.
        elif catch_var_code:
            # Example case: "01 .One person record (one person in household or"
            if not line.startswith('.'):
                # Correct explicit instances of misformatted data.
                try:
                    (var_code, var_code_desc) = line.split(sep=' .', maxsplit=1)
                    ddict[record_type][var_name][var_code] = var_code_desc
                except ValueError:
                    # Initialize flags to handle error cases.
                    raise_err = True
                    parsed_var_code = False
                    # The record type is the section header line itself, e.g. 'HOUSING RECORD'
                    # (with a space), so compare against that exact text.
                    if record_type == 'HOUSING RECORD':
                        # Case: For var_name = 'RWAT', 'RWATPR' the code and code description
                        # are split with '. ' instead of ' .'
                        #     9. Case is from ...
                        if (var_name == 'RWAT' or var_name == 'RWATPR') and line.startswith('9. Case is from'):
                            (var_code, var_code_desc) = line.split(sep='. ', maxsplit=1)
                            raise_err = False
                            parsed_var_code = True
                        # Case: For var_name = 'SMP' the variable description is continued on the next line.
                        #     Total payment on all second and junior mortgages and home equity loans
                        #     (monthly amount)
                        elif var_name == 'SMP' and line == '(monthly amount)':
                            var_desc = line
                            ddict[record_type][var_name]['description'] += ' '+var_desc
                            raise_err = False
                            parsed_var_code = False
                    if raise_err:
                        # Unrecognized format: re-raise the original ValueError unchanged.
                        raise
                    if parsed_var_code:
                        ddict[record_type][var_name][var_code] = var_code_desc
            # Example case: ".any person in group quarters)"
            else:
                var_code_desc = line.lstrip('.')
                ddict[record_type][var_name][var_code] += ' '+var_code_desc
        # Variable note is preceded by newline.
        # Variable note is 1 line.
        # Variable note is followed by newline.
        elif line.startswith('Note:'):
            # Remove the 'Note:' prefix by slicing. `str.lstrip('Note:')` strips any leading
            # run of the characters 'N', 'o', 't', 'e', ':' and can eat into the note text.
            var_note = line[len('Note:'):].strip()
            ddict[record_type][var_name]['note'] = var_note
print('#'*80)
ddict

In [91]:
# Scratch check: report when unpacking the ' .'-separated code/description pair fails.
# Relies on `line` left over from the data-dictionary parsing cell.
try:
    var_code, var_code_desc = line.split(' .', 1)
except ValueError as err:
    print('err raised')

err raised


In [None]:
## Export ipynb to html

In [None]:
# Convert the saved notebook to HTML, once per nbconvert template.
# Uses `subprocess` from the standard library, which must be imported in the imports cell.
path_ipynb = os.path.join(path_static, basename, basename+'.ipynb')
# Notebook path without the '.ipynb' extension; the same for every template.
path_stem = os.path.splitext(path_ipynb)[0]
for template in ['basic', 'full']:
    path_html = path_stem+'-'+template+'.html'
    cmd = ['jupyter', 'nbconvert', '--to', 'html', '--template', template, path_ipynb, '--output', path_html]
    print(' '.join(cmd))
    # check=True raises CalledProcessError if nbconvert exits with a non-zero status.
    subprocess.run(args=cmd, check=True)