In [12]:
"""Download census tract level population variables from 2010 using the census API."""

import pandas as pd
from census import Census
from us import states

import requests

# Run-wide switches used by the cells below.
# TEST: when True only California is downloaded instead of the full state list.
TEST = False  # use smaller set of data for testing
# YEAR: passed to the Census API client as its default year.
YEAR = 2010  # census year

In [13]:
# use the census package to get census data from the census api
# get 2010 demographic data at the census tract for population variables of interest
# 2010 census summary file 1 (sf1) tech docs at https://www2.census.gov/programs-surveys/decennial/2010/technical-documentation/complete-tech-docs/summary-file/sf1.pdf
# SF1 API variables here https://api.census.gov/data/2010/dec/sf1/variables.html
# group P9 with race by Hispanic ethnicity variables: https://api.census.gov/data/2010/dec/sf1/groups/P9.html
# 2010 ACS5 detailed-table variables (B-prefixed, as used below) at https://api.census.gov/data/2010/acs/acs5/variables.html

# read api key: the key is expected on the first line of the key file
with open("temp/census_api_key.txt", "r") as f:
    # strip all surrounding whitespace, not just "\n", so a trailing "\r"
    # (CRLF file) or stray spaces never end up inside the key
    api_key = f.readline().strip()

# fail early with a clear message instead of sending an empty key to the API
if not api_key:
    raise ValueError(
        "temp/census_api_key.txt has no API key on its first line"
    )

# connect to census api
c = Census(api_key, year=YEAR)

# Build the table of states to download: every contiguous state plus DC.
# All frames in this block are indexed by state FIPS code.

# get contiguous states (FIPS code -> is_contiguous flag)
contiguous = pd.DataFrame(
    {"is_contiguous": states.mapping("fips", "is_contiguous", states=states.STATES)}
)

# get all state fip codes
# NOTE: the column is named "fips" but it holds the state *name*; the FIPS code
# itself is the index. The column name is kept so existing users are unaffected.
state_names = pd.DataFrame(
    {"fips": states.mapping("fips", "name", states=states.STATES)}
)

# make dataframe of contiguous states and their fips codes
contiguous_states = (
    state_names.join(contiguous)
    .query("is_contiguous == True")
    .drop(columns="is_contiguous")
)

# add DC, but only when it is not already present: depending on how the `us`
# package is configured, states.STATES may already contain DC, and a repeated
# FIPS code here would make the download loop fetch DC twice
if states.DC.fips not in contiguous_states.index:
    contiguous_states = pd.concat(
        [
            contiguous_states,
            pd.DataFrame(
                ["District of Columbia"], index=[states.DC.fips], columns=["fips"]
            ),
        ]
    )

# pick the states to download: California only for a quick test run,
# otherwise every FIPS code in the contiguous-states table
state_fips = [states.CA.fips] if TEST else contiguous_states.index.tolist()

In [14]:
# create dictionary of census variables to download

# total population
# race/ethnicity
# income
# urban/rural

# NH = non-hispanic

# 2010 decennial census (SF1) variables to request: readable label -> API code
census_codes = {
    "Name": "NAME",
    "GEO_ID": "GEO_ID",
    "Total Population": "P001001",
    "Urban Population": "P002002",
    "Rural Population": "P002005",
    "Hispanic": "P009002",
    "NH White": "P009005",
    "NH Black": "P009006",
    "NH American Indian and Alaska Native": "P009007",
    "NH Asian": "P009008",
    "NH Native Hawaiian and Other Pacific Islander": "P009009",
    "NH Other": "P009010",
}

# two-column lookup table: one row per (label, code) pair, in dictionary order
census_variables = pd.DataFrame(
    list(census_codes.items()), columns=["variable name", "variable"]
)

# ACS 5-year variables to request: readable label -> API code

# language variables from https://api.census.gov/data/2010/acs/acs5/groups/B16001.html
# Use Liu et al categories https://ehp.niehs.nih.gov/doi/10.1289/EHP8584
acs5_codes = {
    "Per Capita Income": "B19301_001E",
    "Language spoken at home: only English": "B16001_002E",
}

# two-column lookup table: one row per (label, code) pair, in dictionary order
acs5_variables = pd.DataFrame(
    list(acs5_codes.items()), columns=["variable name", "variable"]
)

In [15]:
# create dataframe of language variables using the B16001 group docs

# bound the wait and surface HTTP errors here, rather than failing later with a
# confusing JSON/KeyError on an error page
r = requests.get(
    "https://api.census.gov/data/2010/acs/acs5/groups/B16001.json", timeout=60
)
r.raise_for_status()

# make dataframe of language variables: one row per variable code, keeping only
# rows whose label starts with "Estimate!!" (na=False treats missing labels as
# non-matching so the boolean mask never contains NaN)
df_lang = pd.DataFrame(r.json()["variables"]).T
df_lang = df_lang.loc[df_lang.label.str.startswith("Estimate!!", na=False)]

# split the "!!"-delimited label into its parts and drop the leading piece
df_lang_vars = df_lang.label.str.split("!!", expand=True).drop(0, axis="columns")
df_lang_vars.columns = ["Estimate", "Language", "Category"]

# create list of language variables, speaks english well and not well
speaks_english_not_well = df_lang_vars.loc[
    df_lang_vars.Category == 'Speak English less than "very well"'
].index.tolist()
speaks_english_well = df_lang_vars.loc[
    df_lang_vars.Category == 'Speak English "very well"'
].index.tolist()

# an empty list means the label wording did not match; stop here instead of
# silently downloading and aggregating no language columns
if not speaks_english_well or not speaks_english_not_well:
    raise ValueError(
        "No B16001 variables matched the expected English-proficiency categories"
    )

In [16]:
# get census variables of interest for every census tract in list of states

# geography columns used to match decennial rows to ACS rows
geo_keys = ["state", "county", "tract"]

# full ACS request list; it does not depend on the state, so build it once
acs5_total_variable_list = (
    speaks_english_well + speaks_english_not_well + acs5_variables.variable.tolist()
)

dfl = []

for state_fip in state_fips:

    # get decennial census variables
    sf1_tracts = pd.DataFrame(
        c.sf1.state_county_tract(
            census_variables.variable.tolist(), state_fip, Census.ALL, Census.ALL
        )
    )

    # get 5-year acs variables (for 2010, this is 2006-2010)
    acs5_tracts = pd.DataFrame(
        c.acs5.state_county_tract(
            acs5_total_variable_list,
            state_fip,
            Census.ALL,
            Census.ALL,
        )
    )

    # Join the two responses on the tract identifiers rather than by row
    # position: the two API calls are independent, so identical ordering and
    # length are not guaranteed. validate="one_to_one" raises if either side
    # repeats a tract; tracts missing from the ACS response get NaN.
    dft = sf1_tracts.merge(
        acs5_tracts[geo_keys + acs5_total_variable_list],
        on=geo_keys,
        how="left",
        validate="one_to_one",
    )

    dfl.append(dft)

# stack all states; ignore_index avoids repeated row labels across states
df = pd.concat(dfl, ignore_index=True)

In [17]:
# collapse the per-language columns into two totals per tract, one for each
# English-proficiency group, then discard the per-language columns
for label, language_columns in (
    (
        "Language other than English spoken at home, speaks English well",
        speaks_english_well,
    ),
    (
        "Language other than English spoken at home, does not speak English well",
        speaks_english_not_well,
    ),
):
    # row-wise sum across the columns belonging to this group
    df[label] = df[language_columns].sum(axis=1)

df = df.drop(columns=[*speaks_english_well, *speaks_english_not_well])

In [18]:
# swap API variable codes for their readable labels

# fold the ACS labels into the decennial dictionary (updates census_codes in place)
census_codes.update(acs5_codes)

# invert label -> code into code -> label for the rename
code_to_label = dict(zip(census_codes.values(), census_codes.keys()))
df = df.rename(columns=code_to_label)

In [19]:
# write the tract table to CSV, indexed by its geography identifiers
output_csv = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\2010 Census downloaded from API\\2010 census and 2006-2010 acs5 data downloaded 11-1-2023.csv"
tract_table = df.set_index(["GEO_ID", "state", "county", "tract"])
tract_table.to_csv(output_csv)

In [None]:
# display the final table (after the column rename above) as the cell output
df