In [51]:
"""Assemble project dataset, including PM2.5 data, census data, regions, and urban/rural classification."""

import pandas as pd
import numpy as np
import geopandas

import us

# 1. Read PM2.5 data from 0_pm25_data.ipynb

In [52]:
# Census-tract PM2.5 table (presumably the output of 0_pm25_data.ipynb).
pm25_csv_path = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\thesis intermediate dataset\\census tract pm25 datasets 11-15-2023.csv"
dfpm = pd.read_csv(pm25_csv_path)

In [54]:
# read in pm25 data
# switch to wide format data: one row per tract, one column per (variable, year)

tract_id_cols = ["GEOID", "GISJOIN", "longitude", "latitude"]

dfpm_j = (
    dfpm.reset_index()
    .set_index(tract_id_cols + ["year"])
    .drop(columns=["index"])  # discard the positional index added by reset_index
    .unstack()  # moves the innermost index level ("year") into the columns
    .reset_index()
    .set_index(tract_id_cols)
    .dropna(axis="columns", how="all")  # drop variable/year pairs with no data at all
)

# Flatten the (variable, year) column MultiIndex into "variable_year" names.
# Joining the tuple levels directly avoids parsing the tuple's repr, which
# left stray quotes when a level was a string and mangled names that start
# with quote/parenthesis characters.
dfpm_j.columns = [
    "_".join(str(level) for level in col).strip("_")
    for col in dfpm_j.columns.to_flat_index()
]
dfpm_j = dfpm_j.reset_index()

In [None]:
# Raise the column display limit so the wide summary below is not truncated.
pd.options.display.max_columns = 100

dfpm_j.describe(include="all")

# 2. Add tract level demographic data

In [56]:
# updated to get census data straight from the census API, in 0_census_data.ipynb
census_csv_path = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\2010 Census downloaded from API\\2010 census and 2006-2010 acs5 data downloaded 11-1-2023.csv"
dem_df = pd.read_csv(census_csv_path)

In [57]:
# Split GEO_ID on "US" and keep the second piece as an integer GEOID,
# so it can be matched against the GEOID column of the PM2.5 table.
geo_id_pieces = dem_df["GEO_ID"].str.split("US")
dem_df["GEOID"] = geo_id_pieces.str.get(1).astype("int64")

In [58]:
# get state abbreviations from fips codes


def _fips_to_abbr(fips):
    """Return the postal abbreviation for a state FIPS code, or NaN if unknown.

    The code is zero-padded to two characters before the lookup. The District
    of Columbia (FIPS 11) is handled explicitly rather than by patching
    whichever entry happens to be last in the list, because the `us` lookup
    may not return it (presumably depends on the package's DC configuration
    -- TODO confirm).
    """
    code = str(fips).zfill(2)
    if code == "11":
        return "DC"
    match = us.states.lookup(code)
    # lookup() returns None when nothing matches; report that as missing
    # instead of hiding it (and every other error) behind a bare except.
    return np.nan if match is None else match.abbr


state_fips = dem_df.state.unique()
abbr = [_fips_to_abbr(fips) for fips in state_fips]

state_map = dict(zip(state_fips, abbr))

# every value of dem_df["state"] is a key of state_map, so map() covers all rows
dem_df["state_abbr"] = dem_df["state"].map(state_map)

In [59]:
# join pollutant dataset with census data
# (left join on GEOID: every PM2.5 tract is kept, census columns are added where a match exists)

pm_by_geoid = dfpm_j.set_index("GEOID")
census_by_geoid = dem_df.set_index("GEOID")

df = pm_by_geoid.join(census_by_geoid, how="left")

# 3. Add EPA and NCA Regions

In [60]:
# States and territories grouped by EPA region.
epa_region_states = {
    "Region 1": ["CT", "ME", "MA", "NH", "RI", "VT"],
    "Region 2": ["NJ", "NY", "PR", "VI"],
    "Region 3": ["DE", "DC", "MD", "PA", "VA", "WV"],
    "Region 4": ["AL", "FL", "GA", "KY", "MS", "NC", "SC", "TN"],
    "Region 5": ["IL", "IN", "MI", "MN", "OH", "WI"],
    "Region 6": ["AR", "LA", "NM", "OK", "TX"],
    "Region 7": ["IA", "KS", "MO", "NE"],
    "Region 8": ["CO", "MT", "ND", "SD", "UT", "WY"],
    "Region 9": ["AZ", "CA", "HI", "NV", "AS", "GU", "MP"],
    "Region 10": ["AK", "ID", "OR", "WA"],
}

# Invert to a state abbreviation -> region lookup.
epa_regions = {
    state: region
    for region, states in epa_region_states.items()
    for state in states
}

# Attach the region to each tract via its state abbreviation.
# NOTE: merge defaults to an inner join, so rows whose state_abbr is missing
# or absent from epa_regions are dropped here.
epa_region_lookup = pd.Series(epa_regions, name="EPA Region")
df = df.merge(epa_region_lookup, left_on="state_abbr", right_index=True)

In [61]:
# map states to NCA regions
# source: https://scenarios.globalchange.gov/regions_nca4

nca_regions = {
    "Northwest": ["WA", "OR", "ID"],
    "Northern Great Plains": ["MT", "ND", "SD", "WY", "NE"],
    "Midwest": ["MN", "WI", "MI", "IA", "IL", "IN", "OH", "MO"],
    "Northeast": [
        "ME", "NH", "VT", "MA", "RI", "CT", "NY",
        "NJ", "PA", "WV", "MD", "DE", "DC",
    ],
    "Southwest": ["CA", "NV", "UT", "CO", "AZ", "NM"],
    "Southern Great Plains": ["TX", "OK", "KS"],
    "Southeast": ["AR", "LA", "MS", "AL", "TN", "KY", "GA", "FL", "SC", "NC", "VA"],
}

# Long-format lookup table: one row per (NCA Region, state_abbr) pair.
nca_s = (
    pd.Series(nca_regions, name="state_abbr")
    .rename_axis("NCA Region")
    .explode()
    .reset_index()
)

# States missing from the lookup (e.g. AK, HI, territories) map to NaN.
state_to_nca = nca_s.set_index("state_abbr")["NCA Region"].to_dict()
df["NCA Region"] = df["state_abbr"].map(state_to_nca)

# 4. Add Rural-Urban Commuting Area Codes
classified into categories according to Messer et al. 2009 (https://doi.org/10.1016/j.annepidem.2009.09.006)

In [62]:
# create ruca categories and join to dataset
# source: https://www.ers.usda.gov/data-products/rural-urban-commuting-area-codes/documentation/
# and categories from RUCA (group 1) in Messer et al 2009 https://doi.org/10.1016/j.annepidem.2009.09.006

"""
Messer et al text sent by Tom:

For these analyses, we categorized RUCA based on both primary and secondary traffic flows. The RUCA categories based on primary flows are as follows:
1) urban core area (RUCA code 1);
2) suburban area (RUCA code 2);
3) micropolitan area (RUCA codes 3, 4, 5, 6);
4) small town area (RUCA codes 7, 8, 9); and
5) rural area (RUCA code 10).

RUCA categories based on secondary flow patterns include the following:
1) urban focused (RUCA codes 1.0, 1.1, 2.0, 2.1, 3.0, 4.1, 5.1, 7.1, 8.1, 10.1);
2) large rural/town (micropolitan) focused (RUCA codes 4.0, 4.2, 5.0, 5.2, 6.0, 6.1);
3) small rural town focused (7.0, 7.2, 7.3, 7.4, 8.0, 8.2, 8.3, 8.4, 9.0, 9.1, 9.2);
4) isolated small rural town focused (10.0, 10.2, 10.3, 10.4, 10.5, 10.6).

"""

ruca_xlsx_path = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\Rural-Urban Commuting Area Codes\\ruca2010revised.xlsx"
ruca_fips_col = "State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)"
ruca_secondary_col = "Secondary RUCA Code, 2010 (see errata)"

# The first spreadsheet row is skipped; the tract FIPS code is read as an
# integer and used as the index so it lines up with df's GEOID index, while
# secondary codes stay as strings.
ruca_raw = pd.read_excel(
    ruca_xlsx_path,
    skiprows=1,
    dtype={ruca_fips_col: "int64", ruca_secondary_col: str},
)
ruca_df = ruca_raw.set_index(ruca_fips_col)

# add RUCA primary categories (primary-flow grouping quoted above)

ruca_categories = {
    1: "Urban core",
    2: "Suburban",
    **dict.fromkeys((3, 4, 5, 6), "Micropolitan"),
    **dict.fromkeys((7, 8, 9), "Small town"),
    10: "Rural",
}

# Primary codes outside 1-10 have no category and become NaN.
ruca_primary_category = (
    ruca_df["Primary RUCA Code 2010"].map(ruca_categories).rename("RUCA 1")
)
df = df.join(ruca_primary_category)

In [63]:
# add in updated income data from Tom Luben
"""
From 6/14/2024 email from Tom:

•	For all tracts within a CBSA: Identify if census tract is in 1st, 2nd, 3rd, or 4th quartile of Median Household Income compared with all other census tracts in that CBSA.
•	For all tracts outside of a CBSA: Identify if census tract is in 1st, 2nd, 3rd, or 4th quartile of Median Household Income compared with all other census tracts in the same state that are not in a CBSA.

"""


income_csv_path = "C:/Users/RRice/OneDrive - Environmental Protection Agency (EPA)/exposure disparities/thesis intermediate dataset/updated Census tract income quartiles 6-17-2024.csv"

# Tract-level income quartiles, indexed by integer GEOID10 to match df's index.
income_raw = pd.read_csv(
    income_csv_path,
    dtype={"GEOID10": "int64", "Income quartile": "int64"},
)
income_df = income_raw.set_index("GEOID10")

# Add the income columns, then remove the "Per Capita Income" column.
df_with_income = df.join(income_df)
df = df_with_income.drop(columns="Per Capita Income")

In [64]:
# export project dataset

onedrive_csv_path = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\thesis intermediate dataset\\total + wildfire pm and demographic data 6-17-2024.csv"
repo_parquet_path = "data\\total + wildfire pm and demographic data 6-17-2024.parquet"

# backup version to onedrive
df.to_csv(onedrive_csv_path)

# parquet file for data file in github
df.to_parquet(repo_parquet_path)

In [65]:
# export tract geometries joined to the project dataset, for use in GIS
# (`dtype` is not a geopandas.read_file parameter, so GEOID10 is cast explicitly below)
tract_polygons = geopandas.read_file(
    "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\nhgis0004_shape\\nhgis0004_shapefile_tl2010_us_tract_2010\\US_tract_2010.shp"
)

# Cast the tract id to int64 so it matches the integer GEOID index of df.
tract_polygons["GEOID10"] = tract_polygons["GEOID10"].astype("int64")

# Join geometry by GEOID10. The earlier join on the shapefile's default
# positional index was removed: it did not line up with GEOID and its
# result was overwritten immediately.
out_shape = geopandas.GeoDataFrame(
    df.join(tract_polygons.set_index("GEOID10")["geometry"])
)

out_shape.to_pickle("temp/out_shape.pkl")