In [9]:
"""98th percentile analysis of FAQSD daily data.
Run on UNC Longleaf with 128 GB of RAM.

To read parquet files on the cluster, the following had to be run first:
!pip install fastparquet

"""

import pandas as pd
from statsmodels.stats.weightstats import DescrStatsW
import seaborn as sns

import matplotlib.pyplot as plt

sns.set_context("paper", font_scale=1.5, rc={"figure.dpi": 300, "lines.linewidth": 2})

import os

In [2]:
# Load the project dataset, keeping only identifiers, geography and
# demographics. The concentration columns stored in the same file
# (PM_total_*, PM_nofire_*, PM_wf_*, wfpm25_childs_*) are deliberately not
# requested, and neither are the "NH Native Hawaiian and Other Pacific
# Islander" and "NH Other" population counts.

tract_id_columns = [
    "GISJOIN",
    "longitude",
    "latitude",
    "GEO_ID",
    "state",
    "county",
    "tract",
    "Name",
]

population_columns = ["Total Population", "Urban Population", "Rural Population"]

race_ethnicity_columns = [
    "Hispanic",
    "NH White",
    "NH Black",
    "NH American Indian and Alaska Native",
    "NH Asian",
]

socioeconomic_columns = [
    "Income quartile",
    "Language spoken at home: only English",
    "Language other than English spoken at home, speaks English well",
    "Language other than English spoken at home, does not speak English well",
]

region_columns = ["state_abbr", "EPA Region", "NCA Region", "RUCA 1"]

# the concatenation order is the column order requested from the file
columns = [
    *tract_id_columns,
    *population_columns,
    *race_ethnicity_columns,
    *socioeconomic_columns,
    *region_columns,
]

project_data_path = "data/total + wildfire pm and demographic data 6-17-2024.parquet"
df = pd.read_parquet(project_data_path, columns=columns)

# Join key used to match tracts against the daily FAQSD data: GEO_ID with its
# first nine characters removed.
# NOTE(review): presumably this strips a "1400000US"-style prefix — confirm.
df["FIPS"] = df.GEO_ID.str[9:]
df["PM dataset"] = "FAQSD"

# use categorical data type to sort the PM datasets
# "FAQSD" must be listed here: pd.Categorical converts every value that is not
# one of the declared categories to NaN, which would blank the whole column.
# It is appended last so the relative order of the other datasets is unchanged.
df["PM dataset"] = pd.Categorical(
    df["PM dataset"],
    ["CMAQ Total", "CMAQ No Fire", "CMAQ Fire", "Childs Fire", "FAQSD"],
    ordered=True,
)

# same for RUCA 1 categories
df["RUCA 1"] = pd.Categorical(
    df["RUCA 1"],
    ["Urban core", "Suburban", "Micropolitan", "Small town", "Rural"],
    ordered=True,
)

# same for income quartile
df["Income quartile"] = pd.Categorical(
    df["Income quartile"],
    [1, 2, 3, 4],
    ordered=True,
)

In [4]:
def get_summary_statistics(
    df, weights_col="Total Population", value_col="Concentration"
):
    """Return the weighted 98th percentile of a concentration column.

    Rows in which either the value or the weight is missing are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `value_col` and `weights_col`.
    weights_col : str
        Column whose values weight each row (default "Total Population").
    value_col : str
        Column holding the values to summarise (default "Concentration").

    Returns
    -------
    pandas.Series
        A single entry labelled "98th percentile". The entry is NaN when no
        usable rows remain or when the weights do not sum to a positive
        number, because a weighted quantile is undefined in both cases.
    """
    label = "98th percentile"

    # drop tracts with missing values
    valid = df.dropna(subset=[value_col, weights_col])

    # Degenerate groups (for example a region in which every concentration is
    # missing) are reported as NaN instead of being handed to DescrStatsW.
    if valid.empty or valid[weights_col].sum() <= 0:
        return pd.Series({label: float("nan")})

    desc_stats = DescrStatsW(valid[value_col], weights=valid[weights_col])

    # quantile() returns a Series indexed by probability; take its only entry
    return pd.Series({label: desc_stats.quantile(0.98).values[0]})


def read_faqsd_year(path, year):
    """Load the FAQSD parquet file at `path` and keep a single year of rows.

    The whole file is read, a constant "PM Dataset" column (capital "D") holding
    "FAQSD" is added, and only the rows whose `year` column equals `year` are
    returned.
    """
    labelled = pd.read_parquet(path).assign(**{"PM Dataset": "FAQSD"})
    in_requested_year = labelled["year"] == year
    return labelled.loc[in_requested_year]

In [None]:
# get descriptive statistics by year


def _weighted_p98_by(frame, group_col, weights_col):
    """Summarise `frame` per level of `group_col` with get_summary_statistics.

    Returns one row per group with the columns "value" (the group label),
    "98th percentile" and "variable" (set to `group_col`), so that the tables
    for the different groupings can be stacked into one long table.
    """
    table = (
        frame.groupby([group_col])
        .apply(get_summary_statistics, weights_col=weights_col)
        .reset_index()
    )
    table["variable"] = group_col
    return table.rename(columns={group_col: "value"})


# population counts used as weights for the race/ethnicity table
race_weight_columns = [
    "Hispanic",
    "NH White",
    "NH Black",
    "NH American Indian and Alaska Native",
    "NH Asian",
    # "NH Native Hawaiian and Other Pacific Islander",
    # "NH Other",
]

# population counts used as weights for the language table -> shorter labels
language_labels = {
    "Language spoken at home: only English": "Only English",
    "Language other than English spoken at home, speaks English well": "Other than English, speaks English well",
    "Language other than English spoken at home, does not speak English well": "Other than English, does not speak English well",
}

# to_csv does not create missing directories
os.makedirs("temp", exist_ok=True)

for year in range(2007, 2019):
    destpath = (
        f"temp/FAQSD population-weighted 98th percentile exposure by year {year}.csv"
    )

    # each year is cached on disk; only compute the ones still missing
    if os.path.exists(destpath):
        continue

    dff = read_faqsd_year("data/FAQSD_pm25_daily_average.parquet", year)

    # Attach the daily concentrations to every tract. The left join keeps
    # tracts without any FAQSD row (their concentration is NaN and they are
    # dropped inside get_summary_statistics); columns present in both frames
    # get the "_drop" suffix on the FAQSD side.
    dfp = (
        df.set_index("FIPS")
        .join(dff.set_index("FIPS"), how="left", rsuffix="_drop")
        .reset_index()
        .rename(columns={"pm25_daily_average(ug/m3)": "Concentration"})
    )

    # Long tables with one row per (tract-day, population group). Only
    # "Concentration" is carried along because it is the only other column the
    # summary reads; repeating every identifier column once per group would
    # multiply memory use without changing the result.

    # create table for each race/ethnicity group
    dfre = dfp.melt(
        id_vars=["Concentration"],
        value_vars=race_weight_columns,
        var_name="Race/ethnicity",
        value_name="Population",
    )

    # create table for each language group
    dfl = dfp.melt(
        id_vars=["Concentration"],
        value_vars=list(language_labels),
        var_name="Language",
        value_name="Population",
    )

    # rename language categories with shorter names (assigned back rather than
    # replaced in place on the column selection, which is not guaranteed to
    # modify `dfl` when pandas Copy-on-Write is active)
    dfl["Language"] = dfl["Language"].replace(language_labels)

    ## PM2.5 concentrations by NCA Region (µg/m³)
    dfregy = _weighted_p98_by(dfp, "NCA Region", "Total Population")

    ## PM2.5 concentrations by urban-rural status (µg/m³)
    dfruca1 = _weighted_p98_by(dfp, "RUCA 1", "Total Population")

    ## PM2.5 concentration by race and ethnicity (µg/m³)
    dfresy = _weighted_p98_by(dfre, "Race/ethnicity", "Population")

    ## PM2.5 concentration by language spoken at home (µg/m³)
    dflsy = _weighted_p98_by(dfl, "Language", "Population")

    ## PM2.5 concentration by Census Tract-level income quartile (µg/m³)
    dfppci = _weighted_p98_by(dfp, "Income quartile", "Total Population")

    ## 98th percentile of overall exposure
    # A constant grouping key puts every row in one group. The resulting
    # "Overall" row keeps the helper "dummy" column and has no "value", which
    # is the layout the per-year CSV files have always had.
    dfp["dummy"] = 1

    dfo = (
        dfp.groupby("dummy")
        .apply(get_summary_statistics, weights_col="Total Population")
        .reset_index()
    )
    dfo["variable"] = "Overall"

    # create dataframe of per year 98th percentile results
    export = pd.concat([dfregy, dfruca1, dfresy, dflsy, dfppci, dfo])

    export.to_csv(destpath)

In [6]:
# read and concat per-year results

yearly_tables = []

for year in range(2007, 2019):
    yearly_csv = (
        f"temp/FAQSD population-weighted 98th percentile exposure by year {year}.csv"
    )
    # tag each table with the year it belongs to before stacking
    yearly_tables.append(pd.read_csv(yearly_csv).assign(year=year))

# ignore_index gives a fresh 0..n-1 index; "Unnamed: 0" is the per-year index
# that to_csv wrote into each file and is not needed any more
dfc = pd.concat(yearly_tables, ignore_index=True).drop(columns=["Unnamed: 0"])

In [7]:
# persist the combined table; the index is written as the first, unnamed column
dfc.to_csv(
    path_or_buf="data/FAQSD 98th percentile annually by demographic group.csv"
)

In [10]:
# Reload the combined table from disk when it is not already in memory, so the
# plotting cells can run without repeating the aggregation.
# index_col=0 consumes the unnamed index column that DataFrame.to_csv wrote, so
# the reloaded frame has the same columns as the one built in memory instead of
# gaining an extra "Unnamed: 0" column.
if "dfc" not in locals():
    dfc = pd.read_csv(
        "data/FAQSD 98th percentile annually by demographic group.csv",
        index_col=0,
    )

In [11]:
# Display labels for entries of the "value" column. The two groups of keys are
# disjoint and no replacement text is itself a key, so one pass is enough.
display_labels = {
    # income quartiles: mark the two extremes
    "1": "1 (lowest)",
    "4": "4 (highest)",
    # racial and ethnic groups: shorter names
    "NH White": "White",
    "NH Black": "Black",
    "NH American Indian and Alaska Native": "AIAN",
    "NH Asian": "Asian",
}

dfc["value"] = dfc["value"].replace(display_labels)

In [None]:
# Figure S12
# plot results

categories = dfc["variable"].unique()

# One panel per demographic grouping. "Overall" has no subgroups to compare, so
# it is excluded explicitly rather than relying on it being the last entry of
# `categories` and on a hard-coded number of panels.
panel_categories = [name for name in categories if name != "Overall"]
n_panels = len(panel_categories)

# squeeze=False + ravel() yields a flat array of axes even for a single panel;
# 3 inches of height per panel reproduces the 12 x 15 figure for five panels
fig, axs = plt.subplots(n_panels, figsize=(12, 3 * n_panels), squeeze=False)
axs = axs.ravel()

# add new line after comma in dfc["value"] column (literal match, not a regex)
dfc["value"] = dfc["value"].str.replace(", ", ",\n", regex=False)

for ax, category in zip(axs, panel_categories):
    sns.lineplot(
        ax=ax,
        data=dfc.loc[dfc.variable == category],
        x="year",
        y="98th percentile",
        hue="value",
        style="value",
    )

    ax.set_title(category)
    ax.set_ylabel("98th %ile PM$_{2.5}$ (µg/m³)")
    ax.set_xlabel("Year")

    # Move the legend seaborn created to the right of the panel. Only the
    # income panel gets a legend title; the others get none. move_legend reuses
    # the existing legend entries, so this does not depend on
    # ax.get_legend_handles_labels(), which only reports artists attached to
    # the axes.
    # NOTE(review): newer seaborn releases may not attach their legend proxies
    # to the axes — confirm against the installed version.
    legend_title = "Income quartile" if category == "Income quartile" else ""
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1), title=legend_title)
    sns.despine()

fig.tight_layout()

In [None]:
# display the current contents of `categories` as this cell's output
categories

In [None]:
# plot racial and ethnic group results only

categories = ["Race/ethnicity"]

# rows for this grouping, ordered by group label
race_rows = dfc.loc[dfc.variable == categories[0]].sort_values("value")

# one line per group, distinguished by both colour and dash style
g = sns.lineplot(
    data=race_rows,
    x="year",
    y="98th percentile",
    hue="value",
    style="value",
    legend=True,
)

g.set(ylabel="98th %ile PM$_{2.5}$ (µg/m³)", xlabel="Year")

# move legend to the right of the plot using sns.move_legend
sns.move_legend(g, "upper left", bbox_to_anchor=(1, 0.6))

# remove legend title
g.get_legend().set_title(None)

In [None]:
# plot overall results only
# Figure S11

categories = ["Overall"]

fig, axs = plt.subplots(1, figsize=(8, 4))

# rows of the "Overall" series (a single line, so no hue/style is needed)
overall_rows = dfc.loc[dfc.variable == categories[0]].sort_values("value")

# draw on the axes created above (the current axes at this point)
g = sns.lineplot(ax=axs, data=overall_rows, x="year", y="98th percentile")

g.set_ylabel("98th %ile PM$_{2.5}$ (µg/m³)")
g.set_xlabel("Year")