In [17]:
"""Assign PM2.5 concentrations to Census tracts."""

from monetio.models import cmaq
import pandas as pd
import numpy as np
import xarray as xr
import geopandas
from scipy import interpolate

import calendar

# 1. interpolate Kirk Baker CMAQ dataset to census tracts

In [None]:
census_file = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\nhgis0003_shape\\US_tract_cenpop_2010.shp"

# One point per census tract; the LONGITUDE/LATITUDE columns are the
# interpolation targets below.
census_points = geopandas.read_file(census_file, dtype={"GEOID": str})

# Folder holding the paired CMAQ runs (total PM and the zero-fire scenario).
cmaq_dir = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\CMAQ kirk baker total and no-fire"

all_years = range(2006, 2021)  # years carried through to the final table
cmaq_years = range(2007, 2019)  # years for which CMAQ files are read


def _interp_mean_pm_to_tracts(cmaq_path, tract_lonlat):
    """Interpolate a CMAQ file's mean PMIJ field onto the tract points.

    PMIJ is averaged over all records that share a grid-cell
    longitude/latitude (presumably the daily time steps of the file --
    confirm), then cubic-interpolated to ``tract_lonlat``.

    Parameters
    ----------
    cmaq_path : str
        Path to a CMAQ netCDF file readable by ``cmaq.open_dataset``.
    tract_lonlat : DataFrame
        Two columns (longitude, latitude) of target points.

    Returns
    -------
    numpy.ndarray
        One interpolated value per row of ``tract_lonlat``.
    """
    # The dataset is only needed until PMIJ has been copied into a DataFrame;
    # the context manager closes it afterwards instead of leaving it open
    # for the rest of the session.
    with cmaq.open_dataset(fname=cmaq_path) as ds:
        grid_mean = (
            ds["PMIJ"]
            .to_dataframe()
            .groupby(["longitude", "latitude"])
            .mean()
            .reset_index()
        )

    return interpolate.griddata(
        points=grid_mean[["longitude", "latitude"]],
        values=grid_mean["PMIJ"],
        xi=tract_lonlat,
        method="cubic",
    )


# interpolate cmaq outputs to census tract level for every year in the dataset
tract_lonlat = census_points[["LONGITUDE", "LATITUDE"]]
dflist = []

for year in all_years:
    if year in cmaq_years:
        cmaq_file = f"{cmaq_dir}\\dailyavgs.{year}.12US2.baseline.ncf"
        cmaq_file_nofire = f"{cmaq_dir}\\dailyavgs.{year}.12US2.baseline_0fire.ncf"

        pm_total = _interp_mean_pm_to_tracts(cmaq_file, tract_lonlat)
        pm_nofire = _interp_mean_pm_to_tracts(cmaq_file_nofire, tract_lonlat)
    else:
        # NaN placeholders for years in stanford dataset that aren't in cmaq dataset
        pm_total = np.nan
        pm_nofire = np.nan

    # Single construction path for both cases keeps the column set and
    # column order identical across years.
    dfi = pd.DataFrame(
        {
            "GEOID": census_points["GEOID"],
            "GISJOIN": census_points["GISJOIN"],
            "PM_total": pm_total,
            "PM_nofire": pm_nofire,
            "longitude": census_points["LONGITUDE"],
            "latitude": census_points["LATITUDE"],
        }
    )
    dfi["year"] = year

    dflist.append(dfi)

dfe = pd.concat(dflist)
dfe["PM_wf"] = dfe["PM_total"] - dfe["PM_nofire"]  # wildfire-specific PM

# set negative wildfire PM values to 0
dfe.loc[dfe["PM_wf"] < 0, "PM_wf"] = 0

# 1.1 get FAQSD dataset in same format

In [19]:
# read in FAQSD data

dflist = []
for year in range(2006, 2020):
    dfy = pd.read_csv(
        f"C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\CMAQ FAQSD data\\{year}_pm25_daily_average.txt.gz",
        compression="gzip",
        parse_dates=["Date"],
        # The tract code is called "Loc_Label1" in the 2015/2016 files (see the
        # rename below). Requesting both names as str keeps the code textual
        # in every year, so leading zeros are not dropped by numeric parsing.
        dtype={"FIPS": str, "Loc_Label1": str},
    )

    # rename pm25 concentration columns in years that they are inconsistent with the rest
    if year in [2015, 2016]:
        dfy = dfy.rename(
            {
                "Prediction": "pm25_daily_average(ug/m3)",
                "SEpred": "pm25_daily_average_stderr(ug/m3)",
                "Loc_Label1": "FIPS",
            },
            axis="columns",
        )
        # Kept as a safeguard; a no-op when the column was already read as str.
        dfy["FIPS"] = dfy["FIPS"].astype(str)

    dflist.append(dfy)

dff_all = pd.concat(dflist)

# Calendar year of each daily record, used as the annual grouping key.
dff_all["year"] = dff_all.Date.dt.year

In [None]:
# Display the concatenated FAQSD table (all years, plus the derived "year" column).
dff_all

In [21]:
# Annual-average PM2.5 per tract: mean of the daily values within each
# (FIPS, year) pair, stored under the column name "faqsd pm25".
faqsd_annual = (
    dff_all.groupby(["FIPS", "year"])["pm25_daily_average(ug/m3)"]
    .mean()
    .rename("faqsd pm25")
    .reset_index()
)

# Normalise the tract code (trim whitespace, left-pad with zeros to 11
# characters), then index by (FIPS, year) to line up with the wildfire
# PM2.5 table.
faqsd_annual["FIPS"] = faqsd_annual["FIPS"].str.strip().str.zfill(11)
dff_am_all = faqsd_annual.set_index(["FIPS", "year"])

In [22]:
# Attach the FAQSD annual mean to every CMAQ tract/year row. A left merge
# keeps all rows of dfe, including those with no FAQSD match.
dfef = pd.merge(
    dfe,
    dff_am_all,
    how="left",
    left_on=["GEOID", "year"],
    right_on=["FIPS", "year"],
)

In [23]:
# Wildfire share of total PM2.5, taken from the Kirk Baker CMAQ outputs.
wf_fraction = dfef["PM_wf"] / dfef["PM_total"]
dfef["prop_wfpm25"] = wf_fraction

# Re-express all three concentrations on the FAQSD scale: FAQSD supplies the
# total, and the CMAQ-derived fraction splits it into wildfire / non-fire.
faqsd_total = dfef["faqsd pm25"]

dfef["PM_wf"] = wf_fraction * faqsd_total
dfef["PM_nofire"] = faqsd_total - dfef["PM_wf"]
dfef["PM_total"] = faqsd_total

# 2. join wildfire pm predictions from childs et al

In [None]:
dfchilds = pd.read_csv(
    "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\childs et al data\\smokePM2pt5_predictions_daily_tract_20060101-20201231.csv",
    parse_dates=["date"],
    dtype={"GEOID": str},
)

# calculate annual average accounting for leap years

# "days_in_year" is the denominator of the annual mean: the number of
# calendar days in the record's year, not the number of rows present for
# the tract. A common year has 365 days (this was previously 355, which
# inflated every non-leap-year mean by about 2.8%).
dfchilds["year"] = dfchilds["date"].dt.year
dfchilds["days_in_year"] = 365
dfchilds.loc[dfchilds["date"].dt.is_leap_year, "days_in_year"] = 366


def mean_smokePM(df):
    """Return the annual-mean smoke PM for one tract-year group.

    The group's ``smokePM_pred`` values are summed and divided by the
    ``days_in_year`` value of the group's first row, so the denominator is
    the length of the calendar year rather than the number of rows present.
    """
    annual_total = df["smokePM_pred"].sum()
    n_days = df["days_in_year"].iloc[0]

    return annual_total / n_days


# One annual-mean smoke PM2.5 value per (GEOID, year) group.
tract_year_groups = dfchilds.groupby(["GEOID", "year"])
dfchilds_annmean = tract_year_groups.apply(mean_smokePM)
dfchilds_annmean.name = "wfpm25_childs"

# join with cmaq data

# Columns kept in the final tract-level table, in output order.
output_columns = [
    "longitude",
    "latitude",
    "GISJOIN",
    "PM_total",
    "PM_nofire",
    "PM_wf",
    "wfpm25_childs",
]

# Outer join keeps tract-years that appear in only one of the two sources.
dfpm = dfef.set_index(["GEOID", "year"]).join(dfchilds_annmean, how="outer")
dfpm = dfpm[output_columns]
dfpm

# dfi.groupby("year").plot.scatter(x="longitude", y="latitude", c="PM_total", cmap="viridis")

In [None]:
# r^2 between the CMAQ/FAQSD-based wildfire PM and the Childs et al. smoke PM:
# the square of the correlation coefficient returned by Series.corr.
wf_corr = dfpm["PM_wf"].corr(dfpm["wfpm25_childs"])
wf_corr**2

In [None]:
# Summary statistics of the joined tract-level table, as a quick sanity check.
dfpm.describe()

In [27]:
# Write the joined tract-level table to the intermediate data folder.
# The (GEOID, year) index is written as the leading columns of the CSV.
export_path = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\thesis intermediate dataset\\census tract pm25 datasets 11-15-2023.csv"

dfpm.to_csv(export_path)