In [1]:
from pyquery import PyQuery as pq
import requests
from tqdm import tqdm

import csv
from glob import glob
from io import TextIOWrapper, BytesIO
import os
import zipfile

In [2]:
# Remote directory that is scraped below, plus the two local working folders.
hosturl = "https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/"
rawdatadir = "rawdata/"
geofilesdir = "geofiles/"

# Create both folders up front; exist_ok=True keeps a re-run from failing.
for workdir in (rawdatadir, geofilesdir):
    os.makedirs(workdir, exist_ok=True)

In [3]:
# Headers are from https://www.census.gov/programs-surveys/decennial-census/about/rdo/summary-files/2020.html
# The geo files are read with these names passed as csv.DictReader fieldnames, so the
# names are matched to columns by position: keep this string in the source's column order.
headersraw = """FILEID STUSAB SUMLEV GEOVAR GEOCOMP CHARITER CIFSN LOGRECNO GEOID GEOCODE REGION DIVISION STATE STATENS COUNTY COUNTYCC COUNTYNS COUSUB COUSUBCC COUSUBNS SUBMCD SUBMCDCC SUBMCDNS ESTATE ESTATECC ESTATENS CONCIT CONCITCC CONCITNS PLACE PLACECC PLACENS TRACT BLKGRP BLOCK AIANHH AIHHTLI AIANHHFP AIANHHCC AIANHHNS AITS AITSFP AITSCC AITSNS TTRACT TBLKGRP ANRC ANRCCC ANRCNS CBSA MEMI CSA METDIV NECTA NMEMI CNECTA NECTADIV CBSAPCI NECTAPCI UA UATYPE UR CD116 CD118 CD119 CD120 CD121 SLDU18 SLDU22 SLDU24 SLDU26 SLDU28 SLDL18 SLDL22 SLDL24 SLDL26 SLDL28 VTD VTDI ZCTA SDELM SDSEC SDUNI PUMA AREALAND AREAWATR BASENAME NAME FUNCSTAT GCUNI POP100 HU100 INTPTLAT INTPTLON LSADC PARTFLAG UGA"""
# Split on whitespace into the ordered list of column names.
headers = headersraw.split()

In [4]:
# Columns copied from each geo file into the per-state CSV, in output order.
fieldswanted = (
    "FILEID STUSAB SUMLEV GEOVAR GEOCOMP CHARITER CIFSN LOGRECNO "
    "NAME GEOID GEOCODE STATE COUNTYCC POP100 INTPTLAT INTPTLON"
).split()

In [5]:
# Fetch the top-level directory listing that the following cells parse.
# A timeout is set because requests otherwise waits indefinitely on a stalled connection,
# and raise_for_status() stops here instead of letting an error page be parsed as a listing.
r = requests.get(hosturl, timeout=60)
r.raise_for_status()

In [6]:
# Parse the downloaded listing page and narrow it to its <table> element(s).
html = r.content
document = pq(html)
table = document("table")

In [7]:
# Walk every link in the listing table. Once a link whose text mentions "National" has been
# seen, each link whose text contains "/" (a directory) is kept as a state directory.
seennational = False
statedirs = []
for anchor in pq(table)("a"):
    entry = pq(anchor)
    label = entry.text()
    # The flag is sticky: it flips on at the first "National" link and never resets.
    seennational = seennational or "National" in label
    if seennational and "/" in label:
        statedirs.append(entry.attr("href"))

In [8]:
# Download each state's ".pl.zip" archive into rawdatadir, skipping files already on disk.
for statedir in tqdm(statedirs):
    # Use dedicated names for the responses so the top-level listing held in `r` is not
    # overwritten; a timeout keeps a stalled connection from hanging the loop forever.
    listing = requests.get(hosturl + statedir, timeout=60)
    if listing.status_code != 200:
        print(f"Error downloading {hosturl + statedir}")
        continue
    localhtml = listing.content
    for link in pq(localhtml)("a"):
        localurl = pq(link).attr("href")
        if not localurl or ".pl.zip" not in localurl:
            continue
        targetfilename = rawdatadir + localurl
        if os.path.exists(targetfilename):
            continue
        remoteurl = hosturl + statedir + localurl
        response = requests.get(remoteurl, timeout=60)
        if response.status_code != 200:
            print(f"Error downloading {remoteurl}")
            continue
        # Write under a temporary name and rename only once the write has finished, so an
        # interrupted run never leaves a truncated ZIP that the exists-check above would
        # later mistake for a finished download.
        partialfilename = targetfilename + ".part"
        try:
            with open(partialfilename, "wb") as outfile:
                outfile.write(response.content)
            os.replace(partialfilename, targetfilename)
        finally:
            # Only present here if the write or rename failed part-way.
            if os.path.exists(partialfilename):
                os.remove(partialfilename)

100%|██████████████████████████████████████████████████████████████████████████████████| 53/53 [00:21<00:00,  2.49it/s]


In [12]:
# For each downloaded ZIP, find the member(s) with "geo" in the name and write the wanted
# columns to geofilesdir/<first two characters of the ZIP name>.csv.
# Only "*.zip" is globbed so a stray non-archive file in rawdatadir cannot crash the loop.
for zipfilename in tqdm(glob(rawdatadir + "*.zip")):
    basename = os.path.basename(zipfilename)
    shortname = basename[:2]
    with zipfile.ZipFile(zipfilename, mode="r") as archive:
        for zipgut in archive.namelist():
            if "geo" not in zipgut:
                continue
            with archive.open(zipgut) as file_contents:
                # newline="" hands line endings to the csv module untouched, as the csv
                # documentation asks for any file object given to a reader.
                textstream = TextIOWrapper(file_contents, encoding="latin-1", newline="")
                # The member has no header row; column names come from `headers` by position.
                reader = csv.DictReader(textstream, delimiter="|", fieldnames=headers)
                with open(geofilesdir + shortname + ".csv", "w", newline="", encoding="utf-8") as outfile:
                    writer = csv.writer(outfile)
                    writer.writerow(fieldswanted)
                    for row in reader:
                        # `line` is deliberately left as a dict: the next cell prints the
                        # last one as a sanity check.
                        line = {fieldwanted: row[fieldwanted] for fieldwanted in fieldswanted}
                        writer.writerow(line.values())

100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [06:16<00:00,  7.24s/it]


In [13]:
# Sanity check: `line` is whatever row dict the extraction loop above assigned last,
# so this cell only works after that loop has run in the same kernel.
print(line)

{'FILEID': 'PLST', 'STUSAB': 'WY', 'SUMLEV': '970', 'GEOVAR': '00', 'GEOCOMP': '00', 'CHARITER': '000', 'CIFSN': '00', 'LOGRECNO': '0069393', 'NAME': 'Remainder of Wyoming', 'GEOID': '9700000US5699999', 'GEOCODE': '5699999', 'STATE': '56', 'COUNTYCC': '', 'POP100': '1558', 'INTPTLAT': '+42.9996722', 'INTPTLON': '-108.5029026'}


In [14]:
# Combine the per-state CSVs into one "geo-highlights.csv" holding only the short column set
# and only the rows whose GEOCODE has one of the wanted lengths.
shortfields = """STUSAB SUMLEV GEOCODE NAME POP100 INTPTLAT INTPTLON""".split()
# NOTE(review): presumably 2 = state code and 5 = state+county code; other summary levels
# may share these lengths, so confirm against SUMLEV before relying on this filter.
lengthswanted = [2, 5]
with open("geo-highlights.csv", "w", newline="", encoding="utf-8") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(shortfields)
    # glob returns files in arbitrary order; sorting makes the output row order reproducible.
    for geofile in tqdm(sorted(glob(geofilesdir + "*.csv"))):
        with open(geofile, "r", newline="", encoding="utf-8") as infile:
            # Stream rows straight from the reader instead of loading the whole file into a
            # list of dicts first; each row is inspected once and then discarded.
            for row in csv.DictReader(infile):
                if len(row['GEOCODE']) in lengthswanted:
                    line = {shortfield: row[shortfield] for shortfield in shortfields}
                    writer.writerow(line.values())

100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [01:24<00:00,  1.62s/it]
