In [None]:
import csv
from decimal import *
from glob import glob
import gzip
import os
import shutil

from pyquery import PyQuery as pq
import requests
import simplejson as json
from tqdm import tqdm

In [None]:
# Working directories: fetched archives, concatenated CSVs, and per-state JSON output.
downloaddir, rawdir, parseddir = "downloads/", "raw/", "parsed/"
for mydir in (downloaddir, rawdir, parseddir):
    # exist_ok lets the cell be re-run without failing on existing directories
    os.makedirs(mydir, exist_ok=True)

In [None]:
# Root of the LODES download tree; its index page is fetched below to pick a version directory.
starturl = "https://lehd.ces.census.gov/data/lodes/"

In [None]:
# Fetch the index page that lists the available LODES versions.
r = requests.get(starturl)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the index.
r.raise_for_status()

In [None]:
# Take the href of the last link on the index page as the version directory
# (presumably the newest LODES release -- confirm against the page layout).
indexlinks = pq(r.content)("a")
lodesversion = pq(indexlinks[-1]).attr("href")

In [None]:
# URL of the chosen version's directory. State names are appended directly to it later,
# so lodesversion is expected to end with "/".
baseurl = starturl + lodesversion

In [None]:
# Fetch the version directory listing, which contains one folder per state.
r = requests.get(baseurl)
# Fail loudly on an HTTP error; otherwise an error page would silently yield no states.
r.raise_for_status()

In [None]:
# Collect the state folder names from the directory listing.
# Only rows whose icon is the folder image are considered, and the "us" folder is left out.
states = {}
for row in pq(r.content)("tr"):
    rowquery = pq(row)
    if pq(rowquery("img")).attr("src") != "/icons/folder.gif":
        continue
    state = pq(rowquery("a")[0]).attr("href").replace("/", "")
    if state != "us":
        states[state] = None

In [None]:
# For every state, find the main JT00 origin-destination file in its /od/ listing,
# record which data year it belongs to, and download it together with its aux companion.
badvalue = "None found"


def _download_if_missing(url, targetfilename):
    """Download ``url`` to ``targetfilename`` unless that file already exists.

    Raises ``requests.HTTPError`` on an error response, so that an HTML error page
    is never written (and then permanently cached) under a .csv.gz name.
    """
    if os.path.exists(targetfilename):
        return
    response = requests.get(url)
    response.raise_for_status()
    with open(targetfilename, "wb") as outfile:
        outfile.write(response.content)


fileyears = {}    # data year (or badvalue) -> list of states
for state in list(states.keys()):
    print(f"Searching {state}")
    stateurl = baseurl + state + "/od/"
    # Deliberately not status-checked: a failed listing simply yields no match below
    # and the state is recorded under badvalue.
    r = requests.get(stateurl)
    thingywanted = badvalue
    for row in pq(r.content)("tr")[2:]:
        links = pq(row)("a")
        if links:
            href = pq(links[0]).attr("href")
            # Keep overwriting so the last matching row in the listing wins.
            if href and "main_JT00" in href:
                thingywanted = href

    if thingywanted == badvalue:
        fileyear = badvalue
    else:
        # The year is the first four characters of the last "_"-separated piece of the name.
        fileyear = thingywanted.split("_")[-1][:4]
    if fileyear == "2019" and state == "ar":    # Patch for bad Arkansas 2019 data
        fileyear = "2018"
        thingywanted = thingywanted.replace("2019", "2018")
    fileyears.setdefault(fileyear, []).append(state)

    if thingywanted != badvalue:
        _download_if_missing(stateurl + thingywanted,
                             downloaddir + state + fileyear + ".csv.gz")
        # Now get the aux files
        _download_if_missing(stateurl + thingywanted.replace("_main_", "_aux_"),
                             downloaddir + state + fileyear + "_aux.csv.gz")

In [None]:
# Show which states were filed under each data year; states with no matching file
# appear under the "None found" key.
print(json.dumps(fileyears, indent=4*' '))

In [None]:
# Index the downloaded main archives by state. Aux archives are left out here because
# they are located from the main archive's name when the CSVs are built.
downloadedfilesraw = sorted(glob(downloaddir + "*.gz"))
downloadedfiles = [path for path in downloadedfilesraw if "_aux" not in path]

# The key is the first two characters of the file name. Paths are sorted, so when a state
# has several archives the last one listed (the latest year) is the one kept.
dataindex = {
    path.replace("\\", "/").split("/")[-1][:2]: path
    for path in downloadedfiles
}

In [None]:
# Use main and aux GZIPs to build a single CSV, badly.
# The two archives are decompressed back to back, so the aux file's header row lands in
# the middle of the output; the parsing step skips it.
for state in tqdm(dataindex):
    sourcefilename = dataindex[state]
    auxfilename = sourcefilename.replace(".csv", "_aux.csv")
    targetfilename = rawdir + state + ".csv"
    with open(targetfilename, "wb") as outfile:
        for partfilename in (sourcefilename, auxfilename):
            with gzip.open(partfilename, 'rb') as infile:
                # Stream in chunks rather than holding a whole decompressed file in memory.
                shutil.copyfileobj(infile, outfile)

In [None]:
# Collapse each state's rows into county-to-county commuter counts, write one JSON file
# per state into parseddir, and collect every work county's breakdown into countyholder.
countyholder = {}
csvfiles = sorted(glob(rawdir + "*.csv"))
getcontext().prec = 6    # Decimal degrees of precision (loop-invariant, so set once)
for csvfile in tqdm(csvfiles):
    base = csvfile.replace("\\", "/").split("/")[-1][:2]    # first two characters of the file name
    targetfilename = parseddir + base + ".json"

    # stateholder[work][home] -> summed S000 count, keyed by the first five geocode characters
    stateholder = {}
    with open(csvfile, "r", newline="") as infile:    # newline="" as the csv module recommends
        for row in csv.DictReader(infile):
            if row['w_geocode'] == 'w_geocode':    # Skip extra header row from aux files
                continue
            work = row['w_geocode'][:5]
            home = row['h_geocode'][:5]
            pop = int(row['S000'])
            homes = stateholder.setdefault(work, {})
            homes[home] = homes.get(home, 0) + pop

    # Look up the data year recorded for this state during the download step;
    # "error" is kept when the state is not listed under any year.
    fileyear = "error"
    for myyear in fileyears:
        if base in fileyears[myyear]:
            fileyear = myyear

    # Key order matters: it is the order written to the JSON file.
    betterstate = {'fileyear': fileyear, 'commute': {}, 'metadata': {}}
    for work in sorted(stateholder):
        # Home counties ordered from largest to smallest count.
        localwork = dict(sorted(stateholder[work].items(), key=lambda x: x[1], reverse=True))
        betterstate['commute'][work] = {
            home: {'count': count} for home, count in localwork.items()
        }
        betterstate['metadata'][work] = {
            'workers': sum(localwork.values()),    # total count across all home counties
            'counties': len(localwork),            # number of distinct home counties
        }

    countyholder.update(betterstate['commute'])
    with open(targetfilename, "w") as outfile:
        outfile.write(json.dumps(betterstate, indent=4*' '))

In [None]:
# Quick look at the per-county metadata of the last state processed by the parsing loop.
betterstate['metadata']

In [None]:
# Rebuild the combined commute data with its keys in sorted order, then save it as a
# single national file.
sortedcounties = {work: countyholder[work] for work in sorted(countyholder)}
with open("us-county-commuters.json", "w") as outfile:
    outfile.write(json.dumps(sortedcounties, indent=4*' '))

In [None]:
# Spot-check the combined data for a single work-county key.
sortedcounties['12099']

In [None]:
# Someone already relies on the files written above, so their format has to stay as it is.
# Instead, read the per-state JSON back in and process it in a more useful way.
# Goal: how many people commute into, and from, each county. This cell handles "into".

workerstats = {}
for filename in tqdm(glob(parseddir + "*.json")):
    with open(filename) as infile:
        raw = json.load(infile)
        for fips, meta in raw['metadata'].items():
            if fips in workerstats:
                # Keep the first entry seen and only report the repeat.
                print(f"Duplicate {fips}")
                continue
            # "fromcounty" starts at zero and is filled in by a later cell.
            workerstats[fips] = {"incounty": meta["workers"], "fromcounty": 0}

In [None]:
# Now build data on how many people from a county are workers, working anywhere from this county.

# Start from zero so that re-running this cell does not double the totals.
for countystats in workerstats.values():
    countystats["fromcounty"] = 0

for workfips, homes in sortedcounties.items():
    if workfips not in workerstats:
        print(f"Somehow missing {workfips}")
        continue
    for homefips, commute in homes.items():
        # A home county may have no entry of its own (it never appears as a work county
        # in the parsed files); give it one with no in-county workers instead of failing.
        homestats = workerstats.setdefault(homefips, {"incounty": 0, "fromcounty": 0})
        homestats["fromcounty"] += commute["count"]

In [None]:
# Now, let's find out which counties are really important to a particular county

In [None]:
# Thresholds for highlighting a home county under a work county. Each fraction is multiplied
# by a worker total and truncated to an int; a pair qualifies when its count is >= either result.
commutecut = 0.10   # Fraction of the work county's "incounty" total supplied by the home county
homecut = 0.20      # Fraction of the home county's "fromcounty" total that works in the work county

In [None]:
# For every work county, list the home counties that pass either threshold.
# Both thresholds are truncated to whole numbers before the comparison.
cutdict = {}
for workfips, homes in sortedcounties.items():
    commutebreak = int(float(commutecut) * float(workerstats[workfips]['incounty']))
    selected = cutdict[workfips] = []
    for homefips, commute in homes.items():
        homebreak = int(float(homecut) * float(workerstats[homefips]['fromcounty']))
        localcount = commute["count"]
        passes_commute = localcount >= commutebreak
        passes_home = localcount >= homebreak
        if passes_commute or passes_home:
            selected.append(homefips)

In [None]:
# Save the highlighted home counties for each work county.
with open("county-highlights.json", "w") as outfile:
    outfile.write(json.dumps(cutdict, indent=4*' '))