In [10]:
import csv
from decimal import *
from glob import glob
import gzip
import os
import shutil

from pyquery import PyQuery as pq
import requests
import simplejson as json
from tqdm import tqdm

In [11]:
# Working directories used by the rest of the notebook; each is created up front.
downloaddir, rawdir, parseddir = "downloads/", "raw/", "parsed/"
for workdir in (downloaddir, rawdir, parseddir):
    os.makedirs(workdir, exist_ok=True)    # No error if the directory already exists

In [12]:
# Root directory listing of the LODES data; the crawl for version and state folders starts here.
starturl = "https://lehd.ces.census.gov/data/lodes/"

In [13]:
# Fetch the top-level LODES directory listing.
# requests has no default timeout, so bound the wait; and fail loudly on an HTTP
# error instead of letting a later cell parse an error page for links.
r = requests.get(starturl, timeout=60)
r.raise_for_status()

In [14]:
# The href of the final link on the index page is taken as the LODES version folder.
# NOTE(review): assumes the listing's last <a> is the newest version directory - confirm.
indexlinks = pq(r.content)("a")
lodesversion = pq(indexlinks[-1]).attr("href")

In [15]:
# URL of the chosen LODES version folder; the per-state URLs are built from it.
baseurl = starturl + lodesversion

In [16]:
# Fetch the listing of state folders for the chosen LODES version.
# requests has no default timeout, so bound the wait; and fail loudly on an HTTP
# error instead of scraping state names out of an error page.
r = requests.get(baseurl, timeout=60)
r.raise_for_status()

In [17]:
# Collect the state folder names from the version listing.
# `states` acts as an ordered set: only the keys matter, every value is None.
states = {}
for listingrow in pq(r.content)("tr"):
    iconsrc = pq(pq(listingrow)("img")).attr("src")
    if iconsrc != "/icons/folder.gif":
        continue    # Only rows showing the folder icon are directories
    foldername = pq(pq(listingrow)("a")[0]).attr("href").replace("/", "")
    if foldername == "us":
        continue    # Skipped on purpose; presumably a nationwide folder - confirm
    states[foldername] = None

In [18]:
badvalue = "None found"    # Sentinel used when a state's listing has no main_JT00 file
fileyears = {}             # Maps file year (or badvalue) -> list of state codes


def _fetch_to_file(url, targetfilename, timeout=120):
    """Download `url` to `targetfilename` unless that file already exists.

    Raises requests.HTTPError on an error response, so an HTML error page is
    never cached under a .csv.gz name. The body is written to a ".part" file
    and renamed afterwards, so an interrupted run cannot leave a truncated
    file that later runs would mistake for a finished download.
    """
    if os.path.exists(targetfilename):
        return
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    partialname = targetfilename + ".part"
    with open(partialname, "wb") as outfile:
        outfile.write(response.content)
    os.replace(partialname, targetfilename)


for state in list(states.keys()):
    print(f"Searching {state}")
    stateurl = baseurl + state + "/od/"
    # No raise_for_status() here on purpose: a state whose listing cannot be
    # read simply yields no matching links and is recorded under badvalue.
    r = requests.get(stateurl, timeout=60)
    thingywanted = badvalue
    for row in pq(r.content)("tr")[2:]:
        links = pq(row)("a")
        if links:
            href = pq(links[0]).attr("href")
            # The last matching link wins; href is None for an <a> without one.
            if href and "main_JT00" in href:
                thingywanted = href
    if thingywanted == badvalue:
        fileyear = badvalue
    else:
        # The year is the first four characters of the last "_"-separated piece.
        fileyear = thingywanted.split("_")[-1][:4]
    if fileyear == "2019" and state == "ar":    # Patch for bad Arkansas 2019 data
        fileyear = "2018"
        thingywanted = thingywanted.replace("2019", "2018")
    fileyears.setdefault(fileyear, []).append(state)
    if thingywanted != badvalue:
        _fetch_to_file(stateurl + thingywanted,
                       downloaddir + state + fileyear + ".csv.gz")

        # Now get the aux files
        _fetch_to_file(stateurl + thingywanted.replace("_main_", "_aux_"),
                       downloaddir + state + fileyear + "_aux.csv.gz")

Searching ak
Searching al
Searching ar
Searching az
Searching ca
Searching co
Searching ct
Searching dc
Searching de
Searching fl
Searching ga
Searching hi
Searching ia
Searching id
Searching il
Searching in
Searching ks
Searching ky
Searching la
Searching ma
Searching md
Searching me
Searching mi
Searching mn
Searching mo
Searching ms
Searching mt
Searching nc
Searching nd
Searching ne
Searching nh
Searching nj
Searching nm
Searching nv
Searching ny
Searching oh
Searching ok
Searching or
Searching pa
Searching pr
Searching ri
Searching sc
Searching sd
Searching tn
Searching tx
Searching ut
Searching va
Searching vt
Searching wa
Searching wi
Searching wv
Searching wy


In [19]:
# Show which file year was found for each state.
fileyearsreport = json.dumps(fileyears, indent=4*' ')
print(fileyearsreport)

{
    "2016": [
        "ak"
    ],
    "2020": [
        "al",
        "az",
        "ca",
        "co",
        "ct",
        "dc",
        "de",
        "fl",
        "ga",
        "hi",
        "ia",
        "id",
        "il",
        "in",
        "ks",
        "ky",
        "la",
        "ma",
        "md",
        "me",
        "mi",
        "mn",
        "mo",
        "mt",
        "nc",
        "nd",
        "ne",
        "nh",
        "nj",
        "nm",
        "nv",
        "ny",
        "oh",
        "ok",
        "or",
        "pa",
        "ri",
        "sc",
        "sd",
        "tn",
        "tx",
        "ut",
        "va",
        "vt",
        "wa",
        "wi",
        "wv",
        "wy"
    ],
    "2018": [
        "ar",
        "ms"
    ],
    "None found": [
        "pr"
    ]
}


In [20]:
# Map each state code to one downloaded main file.
downloadedfilesraw = sorted(glob(downloaddir + "*.gz"))

# Filter out aux files, which should be paired with the mains
downloadedfiles = [path for path in downloadedfilesraw if "_aux" not in path]

dataindex = {}
for downloadedfile in downloadedfiles:
    # Base filename is whatever follows the final path separator (either slash style)
    filename = downloadedfile.replace("\\", "/").rsplit("/", 1)[-1]
    # Paths are visited in sorted order and later ones overwrite earlier ones,
    # so a state with several years on disk ends up with the latest year
    dataindex[filename[:2]] = downloadedfile

In [21]:
# Use main and aux GZIPs to build a single uncompressed CSV per state.
# The data is streamed with shutil.copyfileobj rather than read in one piece,
# so a whole decompressed state file is never held in memory.
for state in tqdm(dataindex):
    sourcefilename = dataindex[state]
    auxfilename = sourcefilename.replace(".csv", "_aux.csv")
    targetfilename = rawdir + state + ".csv"
    with open(targetfilename, "wb") as outfile:
        with gzip.open(sourcefilename, 'rb') as infile:
            header = infile.readline()
            outfile.write(header)
            shutil.copyfileobj(infile, outfile)
        with gzip.open(auxfilename, 'rb') as infile:
            # Drop the aux header only when it repeats the main header exactly;
            # anything else is kept so that no line is silently lost.
            auxheader = infile.readline()
            if auxheader != header:
                outfile.write(auxheader)
            shutil.copyfileobj(infile, outfile)

100%|██████████████████████████████████████████████████████████████████████████████████| 51/51 [00:57<00:00,  1.13s/it]


In [22]:
# Aggregate every merged state CSV into work-county -> home-county commuter
# counts and shares, write one JSON file per state, and collect all work
# counties in `countyholder`.
getcontext().prec = 6    # Decimal degrees of precision (set once; it does not vary per file)

countyholder = {}
csvfiles = sorted(glob(rawdir + "*.csv"))
for csvfile in tqdm(csvfiles):
    base = csvfile.replace("\\", "/").split("/")[-1][:2]
    targetfilename = parseddir + base + ".json"

    # Sum S000 per (work, home) pair, keyed by the first five characters of
    # each geocode. NOTE(review): S000 is presumably the total job count - confirm.
    stateholder = {}
    with open(csvfile, "r", newline="") as infile:    # newline="" as the csv module recommends
        for row in csv.DictReader(infile):
            if row['w_geocode'] == 'w_geocode':    # Skip extra header row from aux files
                continue
            work = row['w_geocode'][:5]
            home = row['h_geocode'][:5]
            homecounts = stateholder.setdefault(work, {})
            homecounts[home] = homecounts.get(home, 0) + int(row['S000'])

    # Look up which file year this state was recorded under; "error" if none.
    fileyear = "error"
    for myyear in fileyears:
        if base in fileyears[myyear]:
            fileyear = myyear

    betterstate = {'fileyear': fileyear, 'commute': {}, 'metadata': {}}
    for work in sorted(stateholder):
        # Home counties ordered by commuter count, largest first.
        ranked = sorted(stateholder[work].items(), key=lambda x: x[1], reverse=True)
        workers = sum(count for _, count in ranked)
        workvalue = Decimal(workers)
        betterstate['metadata'][work] = {'workers': workers, 'counties': len(ranked)}
        betterstate['commute'][work] = {
            home: {
                'count': count,
                # A zero total would make the division raise; report a zero share instead.
                'share': Decimal(count) / workvalue if workers else Decimal(0),
            }
            for home, count in ranked
        }
        countyholder[work] = betterstate['commute'][work]

    with open(targetfilename, "w") as outfile:
        outfile.write(json.dumps(betterstate, indent=4*' '))

100%|██████████████████████████████████████████████████████████████████████████████████| 51/51 [12:54<00:00, 15.19s/it]


In [23]:
# Write every work county, ordered by key, to a single nationwide JSON file.
sortedcounties = {county: countyholder[county] for county in sorted(countyholder)}
with open("us-county-commuters.json", "w") as outfile:
    serialized = json.dumps(sortedcounties, indent=4*' ')
    outfile.write(serialized)