# 1. Setup

In [1]:
!pwd

/home/umni2/a/umnilab/users/verma99/mk/spr_4711/code


In [2]:
from mobilkit.umni import *
from project import *

In [3]:
# import yaml

# 2. Prepare data

## 2.1. Download

In [4]:
def download_lodes(table='OD', year=2017, version=7,
                   job_type='JT00', state='in', part='main',
                   segment='S000', save=True, overwrite=False):
    """
    Download one LEHD LODES table from the Census LEHD server.
    https://lehd.ces.census.gov/data

    For file structure and parameter details, see this document:
    https://lehd.ces.census.gov/data/lodes/LODES7/LODESTechDoc7.2.pdf

    Parameters
    ----------
    table : str
        One of 'od', 'rac' or 'wac' (case-insensitive).
    year : int
        Year used in the remote file name.
    version : int
        LODES version; one of 5, 7 or 8.
    job_type : str
        One of 'JT00' ... 'JT05'.
    state : str
        State abbreviation used in the URL (case-insensitive).
    part : str
        'main' or 'aux'; only used in the file name of the 'od' table.
    segment : str
        Workforce segment code; only used in the file name of the
        'rac' and 'wac' tables.
    save : bool
        Whether to write the downloaded table to the local parquet cache.
    overwrite : bool
        If False and the cache file exists, it is returned as is and
        nothing is downloaded.

    Returns
    -------
    pandas.DataFrame
        The requested table; after a fresh download the geocode columns
        are strings and the 'createdate' column is removed.

    Notes
    -----
    The cache file name depends only on `table`, so a cached file is
    returned regardless of `state`, `year`, etc. unless `overwrite` is set.
    """
    # Normalize case so that the default 'OD' (and e.g. 'IN') is accepted
    table, state = table.lower(), state.lower()
    assert table in ['od', 'rac', 'wac'], f'Invalid table: {table}'
    assert version in [5, 7, 8], f'Invalid version: {version}'
    assert part in ['main', 'aux'], f'Invalid part: {part}'
    assert job_type in [f'JT{i:02}' for i in range(6)], \
        f'Invalid job type: {job_type}'
    assert segment in 'S000 SA01 SA02 SA03 SE01 SE02 SE03 SI01 SI02 SI03'.split(), \
        f'Invalid segment: {segment}'
    outfile = Path(f'../data/lodes/{table}.parquet')
    if outfile.exists() and not overwrite:
        return pd.read_parquet(outfile)
    root = 'https://lehd.ces.census.gov/data/lodes'
    fname = (f'{state}_{table}_' +
             (part if table == 'od' else segment) +
             f'_{job_type}_{year}.csv.gz')
    url = f'{root}/LODES{version}/{state}/{table}/{fname}'
    idx_cols = {'od': ['h_geocode', 'w_geocode'],
                'rac': ['h_geocode'], 'wac': ['w_geocode']}[table]
    # Parse the geocodes as text right away: letting them be read as
    # integers first would drop the leading zero of codes starting with '0'
    df = pd.read_csv(url, dtype={col: str for col in idx_cols})
    df = df.drop(columns='createdate', errors='ignore')
    if save:
        df.to_parquet(U.mkfile(outfile))
    return df

# Load the three raw LODES tables (from the parquet cache when present);
# NOTE(review): `.disp` is not defined in this notebook -- presumably a
# display helper from the star imports that returns the frame; confirm.
rac = download_lodes(table='rac', overwrite=False).disp(0) # t=0:05
wac = download_lodes(table='wac', overwrite=False).disp(0) # t=0:03
od = download_lodes(table='od', overwrite=False).disp() # t=0:17

169,646 rows x 42 cols; Memory: 64.7 MiB


Unnamed: 0,h_geocode,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,CNS03,CNS04,CNS05,CNS06,CNS07,CNS08,CNS09,CNS10,CNS11,CNS12,CNS13,CNS14,CNS15,CNS16,CNS17,CNS18,CNS19,CNS20,CR01,CR02,CR03,CR04,CR05,CR07,CT01,CT02,CD01,CD02,CD03,CD04,CS01,CS02
,<object>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>


49,051 rows x 52 cols; Memory: 22.5 MiB


Unnamed: 0,w_geocode,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,CNS03,CNS04,CNS05,CNS06,CNS07,CNS08,CNS09,CNS10,CNS11,CNS12,CNS13,CNS14,CNS15,CNS16,CNS17,CNS18,CNS19,CNS20,CR01,CR02,CR03,CR04,CR05,CR07,CT01,CT02,CD01,CD02,CD03,CD04,CS01,CS02,CFA01,CFA02,CFA03,CFA04,CFA05,CFS01,CFS02,CFS03,CFS04,CFS05
,<object>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>


2,558,389 rows x 12 cols; Memory: 546.5 MiB


Unnamed: 0,w_geocode,h_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
,<object>,<object>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>
0.0,180010301001017,180010301001050,2,0,0,2,1,1,0,0,0,2


## 2.2. Clean OD table

In [5]:
def get_lodes_od(scales=SCALES, overwrite=False):
    """
    Aggregate the block-level LODES OD table to each requested zone scale.
    For details of the table columns, see `../data/lodes/columns.yaml`.

    The raw table is read from `../data/lodes/od.parquet`. For every scale,
    the home/work geocodes are truncated to that scale's code length and
    the job counts are summed per (source, target) pair. The stacked
    result is cached at `../data/lodes/lodes_od.parquet`, which is
    returned directly when it exists and `overwrite` is false.
    """
    outfile = Path('../data/lodes/lodes_od.parquet')
    if not overwrite and outfile.exists():
        return pd.read_parquet(outfile)
    # No. of leading geocode characters that identify a zone at each scale
    code_len = {'COUNTY': 5, 'TRACT': 11, 'BG': 12}
    new_names = {
        'h_geocode': 'source',
        'w_geocode': 'target',
        'S000': 'total', # total no. of jobs
        'SA01': 'age_young', # age: ≤29
        'SA02': 'age_mid', # age: 30-54
        'SA03': 'age_elder', # age: ≥55
        'SE01': 'salary_low', # monthly earnings: ≤$1250
        'SE02': 'salary_mid', # "": $1251-3333
        'SE03': 'salary_high', # "": >$3333
    }
    # Industry columns are not used: SI01 (goods producing),
    # SI02 (trade, transport, utilities), SI03 (all other services)
    unused = ['SI01', 'SI02', 'SI03']
    od = pd.read_parquet('../data/lodes/od.parquet')
    od = od.rename(columns=new_names).drop(columns=unused)
    tables = []
    for scale in scales:
        n = code_len[scale]
        zones = od.assign(source=od.source.str[:n],
                          target=od.target.str[:n])
        flows = zones.groupby(['source', 'target']).sum().astype(np.int32)
        tables.append(flows.reset_index().assign(scale=scale))
    df = pd.concat(tables).reset_index(drop=True)
    df = df.astype({x: CAT for x in ['scale', 'source', 'target']})
    df.to_parquet(U.mkfile(outfile))
    return df

# Rebinds `od`: from here on it holds the zone-aggregated OD table
od = get_lodes_od(overwrite=False).disp() # t=0:30

1,513,761 rows x 10 cols; Memory: 49.0 MiB


Unnamed: 0,source,target,total,age_young,age_mid,age_elder,salary_low,salary_mid,salary_high,scale
,<category>,<category>,<int32>,<int32>,<int32>,<int32>,<int32>,<int32>,<int32>,<category>
0.0,18001,18001,7488,1785,3676,2027,1920,2977,2591,COUNTY
