In [1]:
# import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import seaborn as sns

## LEHD/LODES

### Information

In [None]:
# Parse the YAML file at this path into `lehd_info` (presumably LEHD column
# descriptions, going by the file name — TODO confirm).
# NOTE(review): `yaml` is not among the imports in the notebook's import cell;
# confirm it is imported before this cell runs on a fresh kernel.
with open('../data/_etc/lehd/columns.yaml', 'rb') as f:
    lehd_info = yaml.safe_load(f)

### Download data

In [None]:
def download_lehd_lodes(dataset='od', job_type='JT00', state='in', part='main',
                        segment='S000', year=2019, lodes_version=7,
                        aggregate_bg=False, save=True, overwrite=False):
    """Download one LEHD LODES table and optionally cache it as Parquet.

    Parameters
    ----------
    dataset : {'od', 'rac', 'wac'}
        Which LODES table to fetch.
    job_type : str
        Job-type code, one of 'JT00' ... 'JT05'.
    state : str
        Lower-case state abbreviation used in the download URL.
    part : {'main', 'aux'}
        File part; only used in the file name of the 'od' table.
    segment : str
        Workforce segment code; only used in the file name of the
        'rac' and 'wac' tables.
    year : int
        Data year used in the file name.
    lodes_version : {6, 7}
        LODES release used in the URL.
    aggregate_bg : bool
        If True, truncate each geocode to its first 12 characters (the
        block-group prefix) and sum all remaining columns per geocode group.
    save : bool
        If True, write the result to the Parquet cache file.
    overwrite : bool
        If False and the cache file already exists, skip the download.

    Returns
    -------
    pandas.DataFrame or None
        The downloaded (and possibly aggregated) table, or None when the
        cache file already exists and ``overwrite`` is False.

    Raises
    ------
    AssertionError
        If any of the categorical arguments has an unsupported value.

    Notes
    -----
    NOTE(review): the cache file name depends only on ``dataset``, so requests
    that differ in state, year, job type, etc. share one cache file and a
    cached file silently short-circuits them. Left unchanged because other
    code may rely on this path.
    """
    outfile = Path(f'../data/_etc/lehd/lodes/{dataset}.parquet')
    assert lodes_version in [6, 7]
    assert dataset in ['od', 'rac', 'wac']
    assert part in ['main', 'aux']
    assert job_type in [f'JT{i:02}' for i in range(6)]
    assert segment in 'S000 SA01 SA02 SA03 SE01 SE02 SE03 SI01 SI02 SI03'.split()
    if outfile.exists() and not overwrite:
        return
    root = 'https://lehd.ces.census.gov/data/lodes'
    fname = {'od': f'{state}_od_{part}_{job_type}_{year}',
             'rac': f'{state}_rac_{segment}_{job_type}_{year}',
             'wac': f'{state}_wac_{segment}_{job_type}_{year}'}[dataset]
    url = f'{root}/LODES{lodes_version}/{state}/{dataset}/{fname}.csv.gz'
    idx_cols = {'od': ['h_geocode', 'w_geocode'],
                'rac': ['h_geocode'], 'wac': ['w_geocode']}[dataset]
    # Parse geocodes as text at read time. Letting pandas infer int64 and
    # casting to str afterwards drops the leading zero of codes that start
    # with '0' (state FIPS 01-09), which also shifts the 12-character
    # block-group prefix taken below.
    df = pd.read_csv(url, dtype={col: str for col in idx_cols})
    df = df.drop(columns='createdate', errors='ignore')
    if aggregate_bg:
        for col in idx_cols:
            df[col] = df[col].str.slice(0, 12)
        df = df.groupby(idx_cols).sum().reset_index()
    if save:
        # presumably U.mkfile prepares the parent folder and returns the
        # path - TODO confirm against the project's utility module
        df.to_parquet(U.mkfile(outfile))
    return df

# Fetch the three LODES tables with default settings; the trailing comments
# record the observed run time (min:sec) of each call.
download_lehd_lodes(dataset='od')   # t=0:17
download_lehd_lodes(dataset='rac')  # t=0:05
download_lehd_lodes(dataset='wac')  # t=0:03

In [3]:
def _disp(df, top=3):
    """Print the frame's shape and deep memory footprint, then return a preview.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    top : int
        Number of leading rows to return.

    Returns
    -------
    pandas.DataFrame
        The first ``top`` rows of ``df`` (not the full frame).
    """
    n_bytes = df.memory_usage(deep=True).sum()
    size = n_bytes / 1024**2  # bytes divided by 1024**2
    summary = 'Shape: {}, Memory: {:.1f} MB'.format(df.shape, size)
    print(summary)
    preview = df.head(top)
    return preview
# Expose the helper as a method on every DataFrame: `frame.disp()`.
pd.DataFrame.disp = _disp

In [None]:
def download_lehd_lodes(dataset='od', job_type='JT00', state='in', part='main',
                        segment='S000', year=2019, lodes_version=7,
                        aggregate_bg=False, save=True):
    """Download one LEHD LODES table and optionally save it as CSV.

    Parameters
    ----------
    dataset : {'od', 'rac', 'wac'}
        Which LODES table to fetch.
    job_type : str
        Job-type code, one of 'JT00' ... 'JT05'.
    state : str
        Lower-case state abbreviation used in the download URL.
    part : {'main', 'aux'}
        File part; only used in the file name of the 'od' table.
    segment : str
        Workforce segment code; only used in the file name of the
        'rac' and 'wac' tables.
    year : int
        Data year used in the file name.
    lodes_version : {6, 7}
        LODES release used in the URL.
    aggregate_bg : bool
        If True, truncate each geocode to its first 12 characters (the
        block-group prefix) and sum all remaining columns per geocode group.
    save : bool
        If True, write the table to ``lehd_lodes/<dataset>/<fname>.csv``
        under ``P.data``.

    Returns
    -------
    pandas.DataFrame
        The downloaded (and possibly aggregated) table, with geocode columns
        held as strings.

    Raises
    ------
    AssertionError
        If any of the categorical arguments has an unsupported value.

    Notes
    -----
    NOTE(review): this notebook defines ``download_lehd_lodes`` more than
    once; whichever definition ran last is the one in effect.
    """
    assert lodes_version in [6, 7]
    assert dataset in ['od', 'rac', 'wac']
    assert part in ['main', 'aux']
    assert job_type in [f'JT{i:02}' for i in range(6)]
    assert segment in 'S000 SA01 SA02 SA03 SE01 SE02 SE03 SI01 SI02 SI03'.split()
    root = 'https://lehd.ces.census.gov/data/lodes'
    fname = {'od': f'{state}_od_{part}_{job_type}_{year}',
             'rac': f'{state}_rac_{segment}_{job_type}_{year}',
             'wac': f'{state}_wac_{segment}_{job_type}_{year}'}[dataset]
    url = f'{root}/LODES{lodes_version}/{state}/{dataset}/{fname}.csv.gz'
    idx_cols = {'od': ['h_geocode', 'w_geocode'],
                'rac': ['h_geocode'], 'wac': ['w_geocode']}[dataset]
    # Parse geocodes as text at read time. Letting pandas infer int64 and
    # casting to str afterwards drops the leading zero of codes that start
    # with '0' (state FIPS 01-09), which also shifts the 12-character
    # block-group prefix taken below.
    df = pd.read_csv(url, dtype={col: str for col in idx_cols})
    df = df.drop(columns='createdate', errors='ignore')
    if aggregate_bg:
        for col in idx_cols:
            df[col] = df[col].str.slice(0, 12)
        df = df.groupby(idx_cols).sum().reset_index()
    if save:
        # presumably U.mkdir creates the folder and returns it as a path -
        # TODO confirm against the project's utility module.
        # CSV stores no dtypes: pass dtype=str for the geocode columns when
        # reading this file back, or they are inferred as integers again.
        fpath = U.mkdir(P.data / f'lehd_lodes/{dataset}') / f'{fname}.csv'
        df.to_csv(fpath, index=False)
    return df

## Download data

In [None]:
def download_lehd_lodes(dataset='od', job_type='JT00', state='in', part='main',
                        segment='S000', year=2019, lodes_version=7,
                        aggregate_bg=False, save=True):
    """Fetch a LEHD LODES table from the Census server; optionally write a CSV.

    Parameters
    ----------
    dataset : {'od', 'rac', 'wac'}
        Which LODES table to fetch.
    job_type : str
        Job-type code, one of 'JT00' ... 'JT05'.
    state : str
        Lower-case state abbreviation used in the download URL.
    part : {'main', 'aux'}
        File part; only used in the file name of the 'od' table.
    segment : str
        Workforce segment code; only used in the file name of the
        'rac' and 'wac' tables.
    year : int
        Data year used in the file name.
    lodes_version : {6, 7}
        LODES release used in the URL.
    aggregate_bg : bool
        If True, truncate each geocode to its first 12 characters (the
        block-group prefix) and sum all remaining columns per geocode group.
    save : bool
        If True, write the table to ``lehd_lodes/<dataset>/<fname>.csv``
        under ``P.data``.

    Returns
    -------
    pandas.DataFrame
        The downloaded (and possibly aggregated) table, with geocode columns
        held as strings.

    Raises
    ------
    AssertionError
        If any of the categorical arguments has an unsupported value.

    Notes
    -----
    NOTE(review): this notebook defines ``download_lehd_lodes`` more than
    once; whichever definition ran last is the one in effect.
    """
    assert lodes_version in [6, 7]
    assert dataset in ['od', 'rac', 'wac']
    assert part in ['main', 'aux']
    assert job_type in [f'JT{i:02}' for i in range(6)]
    assert segment in 'S000 SA01 SA02 SA03 SE01 SE02 SE03 SI01 SI02 SI03'.split()
    root = 'https://lehd.ces.census.gov/data/lodes'
    fname = {'od': f'{state}_od_{part}_{job_type}_{year}',
             'rac': f'{state}_rac_{segment}_{job_type}_{year}',
             'wac': f'{state}_wac_{segment}_{job_type}_{year}'}[dataset]
    url = f'{root}/LODES{lodes_version}/{state}/{dataset}/{fname}.csv.gz'
    idx_cols = {'od': ['h_geocode', 'w_geocode'],
                'rac': ['h_geocode'], 'wac': ['w_geocode']}[dataset]
    # Read geocode columns as strings up front: an int64 round trip strips
    # the leading zero from codes of states with FIPS 01-09, corrupting both
    # the block code and the 12-character block-group prefix used below.
    df = pd.read_csv(url, dtype={col: str for col in idx_cols})
    df = df.drop(columns='createdate', errors='ignore')
    if aggregate_bg:
        for col in idx_cols:
            df[col] = df[col].str.slice(0, 12)
        df = df.groupby(idx_cols).sum().reset_index()
    if save:
        # presumably U.mkdir creates the folder and returns it as a path -
        # TODO confirm against the project's utility module.
        # CSV stores no dtypes: pass dtype=str for the geocode columns when
        # reading this file back, or they are inferred as integers again.
        fpath = U.mkdir(P.data / f'lehd_lodes/{dataset}') / f'{fname}.csv'
        df.to_csv(fpath, index=False)
    return df

In [None]:
# Load one saved RAC extract. glob returns matches in arbitrary order and the
# folder can hold one CSV per state/segment/job type/year, so sort the matches
# to make the file that gets picked deterministic.
# NOTE(review): `rac` is bound to whatever `.disp()` returns — confirm that is
# the full frame and not just a preview.
rac = pd.read_csv(sorted(glob(str(P.data) + '/lehd_lodes/rac/*'))[0]).disp()

4,811 rows x 42 cols; Memory: 1.5 MiB


Unnamed: 0,h_geocode,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,CNS03,CNS04,CNS05,CNS06,CNS07,CNS08,CNS09,CNS10,CNS11,CNS12,CNS13,CNS14,CNS15,CNS16,CNS17,CNS18,CNS19,CNS20,CR01,CR02,CR03,CR04,CR05,CR07,CT01,CT02,CD01,CD02,CD03,CD04,CS01,CS02
,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>
0.0,180010301001,493,114,233,146,101,164,228,5,0,4,17,130,24,56,15,5,16,3,14,3,17,34,91,4,26,13,16,489,1,0,1,0,2,475,18,44,133,117,85,264,229


In [None]:
# Load one saved WAC extract. glob returns matches in arbitrary order and the
# folder can hold one CSV per state/segment/job type/year, so sort the matches
# to make the file that gets picked deterministic.
# NOTE(review): `wac` is bound to whatever `.disp()` returns — confirm that is
# the full frame and not just a preview.
wac = pd.read_csv(sorted(glob(str(P.data) + '/lehd_lodes/wac/*'))[0]).disp()

4,783 rows x 52 cols; Memory: 1.9 MiB


Unnamed: 0,w_geocode,C000,CA01,CA02,CA03,CE01,CE02,CE03,CNS01,CNS02,CNS03,CNS04,CNS05,CNS06,CNS07,CNS08,CNS09,CNS10,CNS11,CNS12,CNS13,CNS14,CNS15,CNS16,CNS17,CNS18,CNS19,CNS20,CR01,CR02,CR03,CR04,CR05,CR07,CT01,CT02,CD01,CD02,CD03,CD04,CS01,CS02,CFA01,CFA02,CFA03,CFA04,CFA05,CFS01,CFS02,CFS03,CFS04,CFS05
,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>
0.0,180010301001,102,17,40,45,15,48,39,1,0,0,26,62,0,0,6,0,0,0,4,0,0,0,0,0,0,3,0,99,2,1,0,0,0,96,6,11,36,28,10,76,26,0,0,0,0,0,0,0,0,0,0


In [None]:
# Load one saved OD extract. glob returns matches in arbitrary order and the
# folder can hold one CSV per state/part/job type/year, so sort the matches
# to make the file that gets picked deterministic.
# NOTE(review): `od` is bound to whatever `.disp()` returns — confirm that is
# the full frame and not just a preview.
od = pd.read_csv(sorted(glob(str(P.data) + '/lehd_lodes/od/*'))[0]).disp()

1,036,602 rows x 12 cols; Memory: 94.9 MiB


Unnamed: 0,h_geocode,w_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>,<int64>
0.0,180010301001,180010301001,5,0,1,4,1,4,0,3,1,1


In [None]:
# Read the gzipped crosswalk CSV (`in_xwalk`) and summarise it via `.disp()`.
# NOTE(review): `geom` is bound to the return value of `.disp()`, which may be
# a preview rather than the full table — confirm. The recorded run also
# emitted a warning for this line; its text is not preserved here.
geom = pd.read_csv(P.data / 'lehd_lodes/in_xwalk.csv.gz').disp()

  geom = pd.read_csv(P.data / 'lehd_lodes/in_xwalk.csv.gz').disp()


267,071 rows x 43 cols; Memory: 330.3 MiB


Unnamed: 0,tabblk2010,st,stusps,stname,cty,ctyname,trct,trctname,bgrp,bgrpname,cbsa,cbsaname,zcta,zctaname,stplc,stplcname,ctycsub,ctycsubname,stcd116,stcd116name,stsldl,stsldlname,stsldu,stslduname,stschool,stschoolname,stsecon,stseconname,trib,tribname,tsub,tsubname,stanrc,stanrcname,necta,nectaname,mil,milname,stwib,stwibname,blklatdd,blklondd,createdate
,<int64>,<int64>,<object>,<object>,<int64>,<object>,<int64>,<object>,<int64>,<object>,<int64>,<object>,<int64>,<float64>,<int64>,<object>,<int64>,<object>,<int64>,<object>,<int64>,<object>,<int64>,<object>,<int64>,<object>,<int64>,<float64>,<object>,<object>,<int64>,<float64>,<int64>,<float64>,<int64>,<float64>,<float64>,<object>,<int64>,<object>,<float64>,<float64>,<int64>
0.0,180010301001049,18,IN,Indiana,18001,"Adams County, IN",18001030100,"301 (Adams, IN)",180010301001,"1 (Tract 301, Adams, IN)",19540,"Decatur, IN",46733,46733.0,9999999,,1800161884,"Preble township (Adams, IN)",1803,IN-03,18079,"State House District 79, IN",18019,"State Senate District 19, IN",1807680,"North Adams Community Schools, IN",9999999,,99999,,9999999,,9999999,,99999,,,,18180003,Economic Growth Region 3,40.867454,-85.0133,20211018


# Crosswalk

In [23]:
# Load the crosswalk table from the gzipped CSV.
xwalk = pd.read_csv('../LEHD/in_xwalk.csv.gz')
# `.view()` appears to be a project-added preview helper (the recorded output
# prints shape and memory, then a few rows) — TODO confirm where it is defined.
xwalk.view()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Shape: (267071, 43), Memory: 3e+02 MB


Unnamed: 0,tabblk2010,st,stusps,stname,cty,ctyname,trct,trctname,bgrp,bgrpname,...,stanrcname,necta,nectaname,mil,milname,stwib,stwibname,blklatdd,blklondd,createdate
0,180010301001049,18,IN,Indiana,18001,"Adams County, IN",18001030100,"301 (Adams, IN)",180010301001,"1 (Tract 301, Adams, IN)",...,,99999,,,,18180003,Economic Growth Region 3,40.867454,-85.0133,20211018
1,180010301001022,18,IN,Indiana,18001,"Adams County, IN",18001030100,"301 (Adams, IN)",180010301001,"1 (Tract 301, Adams, IN)",...,,99999,,,,18180003,Economic Growth Region 3,40.912124,-85.064384,20211018
2,180010301001044,18,IN,Indiana,18001,"Adams County, IN",18001030100,"301 (Adams, IN)",180010301001,"1 (Tract 301, Adams, IN)",...,,99999,,,,18180003,Economic Growth Region 3,40.868008,-85.026306,20211018


In [24]:
# List the crosswalk's column labels.
xwalk.columns

Index(['tabblk2010', 'st', 'stusps', 'stname', 'cty', 'ctyname', 'trct',
       'trctname', 'bgrp', 'bgrpname', 'cbsa', 'cbsaname', 'zcta', 'zctaname',
       'stplc', 'stplcname', 'ctycsub', 'ctycsubname', 'stcd116',
       'stcd116name', 'stsldl', 'stsldlname', 'stsldu', 'stslduname',
       'stschool', 'stschoolname', 'stsecon', 'stseconname', 'trib',
       'tribname', 'tsub', 'tsubname', 'stanrc', 'stanrcname', 'necta',
       'nectaname', 'mil', 'milname', 'stwib', 'stwibname', 'blklatdd',
       'blklondd', 'createdate'],
      dtype='object')

In [25]:
# Show the first crosswalk row as a Series, one column per line.
xwalk.iloc[0]

tabblk2010                        180010301001049
st                                             18
stusps                                         IN
stname                                    Indiana
cty                                         18001
ctyname                          Adams County, IN
trct                                  18001030100
trctname                          301 (Adams, IN)
bgrp                                 180010301001
bgrpname                 1 (Tract 301, Adams, IN)
cbsa                                        19540
cbsaname                              Decatur, IN
zcta                                        46733
zctaname                                  46733.0
stplc                                     9999999
stplcname                                     NaN
ctycsub                                1800161884
ctycsubname           Preble township (Adams, IN)
stcd116                                      1803
stcd116name                                 IN-03


In [13]:
# Load the job-to-job (J2J) CSV and report its (rows, columns) shape.
j2j = pd.read_csv('../LEHD/data/LEHD_j2j_in_all.csv')
j2j.shape

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(121118, 88)

In [14]:
# Preview the first rows of the J2J table.
j2j.head()

Unnamed: 0.1,Unnamed: 0,periodicity,seasonadj,geo_level,geography,ind_level,industry,ownercode,sex,agegrp,...,sMainBS,sMainES,sNEHireSEarn_Dest,sENSepSEarn_Orig,sJobStaySEarn_Orig,sJobStaySEarn_Dest,sEESepSEarn_Orig,sEEHireSEarn_Dest,sAQSepSEarn_Orig,sAQHireSEarn_Dest
0,0,Q,U,S,18,A,0,A00,0,A00,...,-1,1,-1,-1,-1,-1,-1,5,-1,-1
1,1,Q,U,S,18,A,0,A00,0,A00,...,1,1,1,1,1,1,1,1,1,-1
2,2,Q,U,S,18,A,0,A00,0,A00,...,1,1,1,1,1,1,1,1,1,1
3,3,Q,U,S,18,A,0,A00,0,A00,...,1,1,1,1,1,1,1,1,1,1
4,4,Q,U,S,18,A,0,A00,0,A00,...,1,1,1,1,1,1,1,1,1,1


In [19]:
# Print every field of the first J2J row as "<column><tab><tab><value>".
for k, v in j2j.iloc[0].to_dict().items():
    print(k, v, sep='\t\t')

Unnamed: 0		0
periodicity		Q
seasonadj		U
geo_level		S
geography		18
ind_level		A
industry		00
ownercode		A00
sex		0
agegrp		A00
race		A0
ethnicity		A0
education		E0
firmage		0
firmsize		0
year		2001
quarter		2
agg_level		1025
MHire		353021
MSep		310294
MJobStart		369346
MJobEnd		336233
EEHire		110273
EESep		111730
AQHire		nan
AQSep		54313.0
J2JHire		nan
J2JSep		166043.0
NEHire		223155
ENSep		188205
NEPersist		nan
ENPersist		133892.0
NEFullQ		nan
ENFullQ		115173.0
MainB		2613435
MainE		2645340
EESepS		nan
EEHireS		nan
AQSepS		nan
AQHireS		nan
NEPersistS		nan
ENPersistS		nan
JobStayS		nan
MainBS		nan
MainES		2297041.0
NEHireSEarn_Dest		nan
ENSepSEarn_Orig		nan
JobStaySEarn_Orig		nan
JobStaySEarn_Dest		nan
EESepSEarn_Orig		nan
EEHireSEarn_Dest		nan
AQSepSEarn_Orig		nan
AQHireSEarn_Dest		nan
sMHire		1
sMSep		1
sMJobStart		1
sMJobEnd		1
sEEHire		1
sEESep		1
sAQHire		-1
sAQSep		1
sJ2JHire		-1
sJ2JSep		1
sNEHire		1
sENSep		1
sNEPersist		-1
sENPersist		1
sNEFullQ		-1
sENFullQ		1
sMainB		1
sMa

# Job-to-Job (J2J)

In [26]:
# Load the job-to-job (J2J) table, then preview it.
j2j = pd.read_csv('../LEHD/j2j/LEHD_j2j_in_all.csv')
j2j.view()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Shape: (121118, 88), Memory: 1e+02 MB


Unnamed: 0.1,Unnamed: 0,periodicity,seasonadj,geo_level,geography,ind_level,industry,ownercode,sex,agegrp,...,sMainBS,sMainES,sNEHireSEarn_Dest,sENSepSEarn_Orig,sJobStaySEarn_Orig,sJobStaySEarn_Dest,sEESepSEarn_Orig,sEEHireSEarn_Dest,sAQSepSEarn_Orig,sAQHireSEarn_Dest
0,0,Q,U,S,18,A,0,A00,0,A00,...,-1,1,-1,-1,-1,-1,-1,5,-1,-1
1,1,Q,U,S,18,A,0,A00,0,A00,...,1,1,1,1,1,1,1,1,1,-1
2,2,Q,U,S,18,A,0,A00,0,A00,...,1,1,1,1,1,1,1,1,1,1


# LODES
Origin-destination data

[Technical documentation](https://lehd.ces.census.gov/data/lodes/LODES7/LODESTechDoc7.5.pdf)

In [31]:
# Load a LODES origin-destination extract (per its file name: state `in`,
# `main` part, job type JT05, year 2019).
lodes = pd.read_csv('../LEHD/lodes/in_od_main_JT05_2019.csv.gz')
# `.view()` appears to be a project-added preview helper — TODO confirm.
lodes.view()

Shape: (9790, 13), Memory: 1e+00 MB


Unnamed: 0,w_geocode,h_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,createdate
0,180010302003000,180010301002016,1,0,1,0,0,0,1,0,0,1,20211018
1,180010307002032,181790404002022,1,1,0,0,0,0,1,0,0,1,20211018
2,180030008002000,180030108132020,1,0,0,1,0,0,1,0,0,1,20211018


In [36]:
# Replace the terse LODES OD column codes with descriptive names.
lodes = lodes.rename(columns={
    'w_geocode': 'work_fips',
    'h_geocode': 'home_fips',
    'S000': 'tot_jobs',  # total no. of jobs
    'SA01': 'njobs_age_low',  # age: ≤29
    'SA02': 'njobs_age_med',  # age: 30-54
    'SA03': 'njobs_age_high',  # age: ≥55
    'SE01': 'njobs_income_low',  # monthly earnings: ≤$1250
    'SE02': 'njobs_income_med',  # monthly earnings: $1251-3333
    'SE03': 'njobs_income_high',  # monthly earnings: >$3333
    'SI01': 'njobs_indus_goods',  # industry: goods producing
    'SI02': 'njobs_indus_util',  # industry: trade, transport, utilities
    'SI03': 'njobs_indus_serv',  # industry: all other services
})

In [37]:
# Display the frame with its renamed columns.
lodes

Unnamed: 0,work_fips,home_fips,tot_jobs,njobs_age_low,njobs_age_med,njobs_age_high,njobs_income_low,njobs_income_med,njobs_income_high,njobs_indus_goods,njobs_indus_util,njobs_indus_serv,createdate
0,180010302003000,180010301002016,1,0,1,0,0,0,1,0,0,1,20211018
1,180010307002032,181790404002022,1,1,0,0,0,0,1,0,0,1,20211018
2,180030008002000,180030108132020,1,0,0,1,0,0,1,0,0,1,20211018
3,180030013001019,180030005001026,1,0,1,0,0,1,0,0,0,1,20211018
4,180030013001019,180030102023006,1,0,0,1,0,1,0,0,0,1,20211018
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9785,181830503004039,181830505003038,1,1,0,0,0,0,1,0,0,1,20211018
9786,181830504004008,181830501002019,1,0,1,0,0,0,1,0,0,1,20211018
9787,181830505002001,180030103071008,1,0,1,0,0,1,0,0,0,1,20211018
9788,181830505002001,181830504001046,1,0,1,0,0,0,1,0,0,1,20211018


In [40]:
# Count the distinct workplace geocodes (`work_fips`, renamed from `w_geocode`).
lodes.work_fips.nunique()

484