In [28]:
"""Analyses of areas with both high non-fire and fire PM2.5 concentrations."""

import pandas as pd
from statsmodels.stats.weightstats import DescrStatsW
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas


# set matplotlib/seaborn to 300dpi
sns.set_context("paper", font_scale=1.5, rc={"figure.dpi": 300})

# Cut-offs compared (strictly greater-than) against each tract's multi-year mean
# concentration when the "high" total / non-fire / fire subsets are selected.
# NOTE(review): presumably in ug/m3 and derived from population-weighted means
# (the map output is named "..._pop_weighted_mean_thresh.png") - confirm the
# provenance of these three values.
total_threshold = 9.6
nofire_threshold = 8.7
fire_threshold = 0.88

In [29]:
# Tract-level input table. The code below expects a GISJOIN identifier column,
# the demographic columns, and yearly PM2.5 columns named <prefix><year>.
df = pd.read_parquet("data/total + wildfire pm and demographic data 6-17-2024.parquet")

# Prefixes of the four yearly PM2.5 series.
# NOTE(review): presumably CMAQ total / non-fire / fire and the Childs fire
# estimate, going by the summary column names assigned further down - confirm.
pm_stubnames = ["PM_total_", "PM_nofire_", "PM_wf_", "wfpm25_childs_"]

# Wide -> long: one row per (GISJOIN, Year) so statistics can span all years.
dfl = pd.wide_to_long(df, stubnames=pm_stubnames, i=["GISJOIN"], j="Year")

# Pairwise correlation between the four PM2.5 series.
corr = dfl[pm_stubnames].corr()
# NOTE: under IPython's default settings only the last expression of a cell is
# rendered, so this bare expression produces no visible output here.
corr

# Multi-year mean of every series for each tract, with readable column names.
overall_means = (
    dfl.groupby("GISJOIN")[pm_stubnames]
    .mean()
    .rename(
        columns={
            "PM_total_": "CMAQ Total overall mean",
            "PM_nofire_": "CMAQ Non-fire overall mean",
            "PM_wf_": "CMAQ Fire overall mean",
            "wfpm25_childs_": "Childs Fire overall mean",
        }
    )
)

# Attach the tract means to the wide table (matched on the GISJOIN column).
df = df.join(overall_means, on="GISJOIN")

# Income quartile as an ordered categorical (1 < 2 < 3 < 4).
df["Income quartile"] = pd.Categorical(
    df["Income quartile"], categories=[1, 2, 3, 4], ordered=True
)

# Boolean masks: does the tract's multi-year mean exceed the matching cut-off?
exceeds_total = df["CMAQ Total overall mean"] > total_threshold
exceeds_nofire = df["CMAQ Non-fire overall mean"] > nofire_threshold
exceeds_fire = df["CMAQ Fire overall mean"] > fire_threshold

# Tract subsets used by every summary table below.
high_total_tracts = df[exceeds_total]
high_nofire_tracts = df[exceeds_nofire]
high_fire_tracts = df[exceeds_fire]
# "High Both": above the non-fire AND the fire cut-off at the same time.
high_both_tracts = df[exceeds_nofire & exceeds_fire]

In [None]:
def calc_language_distribution(df, name):
    """Return the share of the population in each home-language category.

    Each language column is summed over the rows of ``df`` and divided by the
    summed "Total Population", expressed as a percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Total Population" and the three language count columns.
    name : hashable
        Label used as the single row index of the result.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``name``; one "% <language column>" column per category.
    """
    language_columns = (
        "Language spoken at home: only English",
        "Language other than English spoken at home, speaks English well",
        "Language other than English spoken at home, does not speak English well",
    )
    total = df["Total Population"].sum()
    shares = {
        f"% {column}": df[column].sum() / total * 100 for column in language_columns
    }
    return pd.DataFrame(shares, index=[name])


# Language profile of the full population and of each high-exposure subset;
# after the transpose there is one column per tract group.
language_table = pd.concat(
    [
        calc_language_distribution(tracts, name=group).round(1)
        for group, tracts in (
            ("Overall", df),
            ("High Non-fire", high_nofire_tracts),
            ("High Fire", high_fire_tracts),
            ("High Both", high_both_tracts),
        )
    ]
).T

# Index rows by (section title, category) so this table can be stacked with the
# other demographic tables.
language_table = (
    language_table.assign(name="Language spoken at home (%)")
    .reset_index()
    .set_index(["name", "index"])
)
language_table

In [None]:
def calc_region_distribution(df, name):
    """Calculate percent of population living in each NCA region.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the "NCA Region" and "Total Population" columns.
    name : hashable
        Name given to the returned Series (used as the column label when the
        per-group results are combined into a table).

    Returns
    -------
    pandas.Series
        Indexed by NCA region; each value is that region's share of the summed
        "Total Population" of ``df``, in percent.
    """
    population_by_region = df.groupby("NCA Region")["Total Population"].sum()
    overall_population = df["Total Population"].sum()
    region_distribution = population_by_region / overall_population * 100
    region_distribution.name = name
    return region_distribution


# One Series per tract group (rows), transposed so regions become the rows and
# tract groups the columns; the section title is added as a "name" column.
region_table = pd.DataFrame(
    [
        calc_region_distribution(tracts, name=group).round(1)
        for group, tracts in (
            ("Overall", df),
            ("High Non-fire", high_nofire_tracts),
            ("High Fire", high_fire_tracts),
            ("High Both", high_both_tracts),
        )
    ]
).T.assign(name="NCA Region (%)")

# Name the index "index" so reset_index() yields the same column name as the
# other section tables, then key rows by (section title, category).
region_table.index.name = "index"
region_table = region_table.reset_index().set_index(["name", "index"])
region_table

In [None]:
def calc_ruca_distribution(df, name):
    """Calculate percent of population in each "RUCA 1" category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the "RUCA 1" and "Total Population" columns.
    name : hashable
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        Indexed by "RUCA 1" value; each entry is that category's share of the
        summed "Total Population" of ``df``, in percent.
    """
    population_by_ruca = df.groupby("RUCA 1")["Total Population"].sum()
    overall_population = df["Total Population"].sum()
    ruca_distribution = population_by_ruca / overall_population * 100
    ruca_distribution.name = name
    return ruca_distribution


# One Series per tract group, transposed so RUCA categories become rows. The
# section title is added as a "name" column and the category labels are moved
# into a column called "index", matching the other section tables.
ruca_table = (
    pd.DataFrame(
        [
            calc_ruca_distribution(tracts, name=group).round(1)
            for group, tracts in (
                ("Overall", df),
                ("High Non-fire", high_nofire_tracts),
                ("High Fire", high_fire_tracts),
                ("High Both", high_both_tracts),
            )
        ]
    )
    .T.assign(name="RUCA (%)")
    .reset_index()
    .rename(columns={"RUCA 1": "index"})
)

# Order rows from most to least urban instead of alphabetically. Labels that
# are not in this list become NaN and sort last.
ruca_table["index"] = pd.Categorical(
    ruca_table["index"],
    categories=["Urban core", "Suburban", "Micropolitan", "Small town", "Rural"],
    ordered=True,
)

# set_index() replaces the positional index outright, so no intermediate
# reset_index()/drop("level_0") round-trip is needed after sorting.
ruca_table = ruca_table.sort_values("index").set_index(["name", "index"])

ruca_table

In [None]:
def calc_income_distribution(df, name):
    """Calculate percent of population in each income quartile.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the "Income quartile" and "Total Population" columns.
    name : hashable
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        Indexed by income quartile; each entry is that quartile's share of the
        summed "Total Population" of ``df``, in percent. With ``observed=True``
        only quartiles that actually occur in ``df`` are included.
    """
    population_by_quartile = df.groupby("Income quartile", observed=True)[
        "Total Population"
    ].sum()
    overall_population = df["Total Population"].sum()
    income_distribution = population_by_quartile / overall_population * 100
    income_distribution.name = name

    return income_distribution


# One Series per tract group, transposed so income quartiles become rows; the
# section title is added as a "name" column.
income_table = pd.DataFrame(
    [
        calc_income_distribution(tracts, name=group).round(1)
        for group, tracts in (
            ("Overall", df),
            ("High Non-fire", high_nofire_tracts),
            ("High Fire", high_fire_tracts),
            ("High Both", high_both_tracts),
        )
    ]
).T.assign(name="Income quartile (%)")

# Name the index "index" so reset_index() yields the same column name as the
# other section tables, then key rows by (section title, category).
income_table.index.name = "index"
income_table = income_table.reset_index().set_index(["name", "index"])
income_table

In [None]:
def calc_racial_ethnic_makeup(df, name):
    """Return the population total and the percentage in each racial/ethnic group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Total Population" and the five group count columns
        listed in ``group_columns`` below.
    name : hashable
        Label used as the single row index of the result.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``name`` with "Total count" (summed population)
        followed by one percentage column per group.
    """
    # Output label -> source count column.
    group_columns = {
        "White": "NH White",
        "Black": "NH Black",
        "Hispanic": "Hispanic",
        "Asian": "NH Asian",
        "AIAN": "NH American Indian and Alaska Native",
    }
    total = df["Total Population"].sum()

    summary = {"Total count": total}
    for label, column in group_columns.items():
        summary[label] = df[column].sum() / total * 100

    return pd.DataFrame(summary, index=[name])


# Racial/ethnic make-up of the full population and of the tract subsets whose
# multi-year mean PM2.5 exceeds the fixed concentration thresholds (these are
# concentration cut-offs, not percentile cut-offs).
racial_ethnic_table = pd.concat(
    [
        calc_racial_ethnic_makeup(tracts, name=group).round(1)
        for group, tracts in (
            ("Overall", df),
            ("High Non-fire", high_nofire_tracts),
            ("High Fire", high_fire_tracts),
            ("High Both", high_both_tracts),
        )
    ]
).T

# Add the section title, sort the rows alphabetically by group label, and key
# rows by (section title, category). set_index() replaces the positional index
# outright, so no reset_index()/drop("level_0") round-trip is needed.
racial_ethnic_table = (
    racial_ethnic_table.assign(name="Race and ethnicity (%)")
    .reset_index()
    .sort_values("index")
    .set_index(["name", "index"])
)
racial_ethnic_table

In [None]:
# Stack the five section tables (each indexed by section title and category,
# with one column per tract group) into the combined table and write it to disk.
# NOTE: to_csv does not create missing directories, so "tables/" must exist.
out_table = pd.concat(
    [region_table, ruca_table, racial_ethnic_table, language_table, income_table]
)
out_table.to_csv("tables/tableS2.csv")
out_table

In [39]:
# Geometry table used for the maps below, restored from a pickle that is
# written outside the cells shown here.
# NOTE(review): unpickling executes arbitrary code - only load files produced
# by this project.
# NOTE(review): later cells assume this frame carries PM_total_<year> columns
# and shares its row index with `df` - confirm against the code that writes it.
out_shape = geopandas.GeoDataFrame(pd.read_pickle("temp/out_shape.pkl"))

In [41]:
# Flag, for every row of the geometry table, whether it belongs to each
# high-PM2.5 tract subset.
# NOTE(review): membership is decided by row index, so this assumes out_shape
# and df share the same index - matching on GISJOIN would be more explicit.
for flag_column, flagged_tracts in {
    "high_nofire_tracts": high_nofire_tracts,
    "high_fire_tracts": high_fire_tracts,
    "high_both_tracts": high_both_tracts,
}.items():
    out_shape[flag_column] = out_shape.index.isin(flagged_tracts.index)

In [None]:
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as mpatches


# Legend text and highlight colour for each panel, in panel order.
labels = ["High non-fire PM₂.₅", "High fire PM₂.₅", "High fire & non-fire PM₂.₅"]
color_brewer_colors = ["#1f78b4", "#a6cee3", "#33a02c"]
flag_columns = ["high_nofire_tracts", "high_fire_tracts", "high_both_tracts"]

# One map per flag column, laid out in a single row of three panels.
fig, axs = plt.subplots(1, 3, figsize=(6, 6))
axs = axs.flatten()

for ax, flag_column, highlight in zip(axs, flag_columns, color_brewer_colors):
    # Two-stop colour map running from light gray to the panel's highlight colour.
    cmap = LinearSegmentedColormap.from_list(
        name="custom", colors=["lightgray", highlight]
    )
    out_shape.plot(flag_column, ax=ax, cmap=cmap, antialiased=False)
    ax.set_axis_off()

plt.tight_layout()

# Shared legend: one coloured patch per panel.
legend_elements = [
    mpatches.Patch(facecolor=color, label=label)
    for color, label in zip(color_brewer_colors, labels)
]

_ = fig.legend(
    handles=legend_elements,
    loc="lower center",
    ncol=3,
    bbox_to_anchor=(0.5, 0.35),
    fontsize=8,
    frameon=True,
)

fig.savefig("figures/high_pm25_tracts_pop_weighted_mean_thresh.png", dpi=300)

In [None]:
# Where does a total PM2.5 concentration of 9 ug/m3 fall within each year's
# distribution of values? Prints the percentile rank for 2007 through 2018.

from scipy import stats

for year in range(2007, 2019):
    print(f"Year: {year}")
    # Rows without a value for this year are left out of the distribution.
    yearly_total = out_shape[f"PM_total_{year}"].dropna()
    print(stats.percentileofscore(yearly_total, 9))

In [None]:
# Long-format slice of the combined table for the race/ethnicity bar chart:
# one row per (group, tract subset) holding that subset's population share.
is_race_row = out_table.index.get_level_values(0) == "Race and ethnicity (%)"
grouped_bar_data = out_table.loc[is_race_row].stack().reset_index()

# "Total count" is a head count rather than a percentage, so it is left out.
is_total_count = grouped_bar_data["index"].str.contains("Total count")
grouped_bar_data = grouped_bar_data[~is_total_count]

grouped_bar_data.columns = ["name", "index", "PM_threshold", "Percent"]

# The "Overall" rows are kept on purpose: the bar chart draws them as the gray
# baseline next to the three high-exposure subsets.

# Shorten group labels (strip a "% " or "NH " prefix, abbreviate AIAN).
short_labels = grouped_bar_data["index"]
for long_form, short_form in (
    ("% ", ""),
    ("NH ", ""),
    ("American Indian and Alaska Native", "AIAN"),
):
    short_labels = short_labels.str.replace(long_form, short_form)
grouped_bar_data["index"] = short_labels
grouped_bar_data

In [None]:
sns.set_context("talk", font_scale=1.3, rc={"figure.dpi": 300})


# One panel per racial/ethnic group, one bar per tract subset. Sorting
# PM_threshold in descending order yields Overall, High Non-fire, High Fire,
# High Both, which is the order the palette (gray + map colours) relies on.
fig = sns.catplot(
    data=grouped_bar_data.sort_values("PM_threshold", ascending=False),
    x="PM_threshold",
    y="Percent",
    col="index",
    kind="bar",
    palette=["gray", *color_brewer_colors],
    sharey=False,
    height=6,
    aspect=0.55,
)

# Tilt the x tick labels so the subset names do not overlap.
for tick_label in (
    text for panel in fig.axes.flat for text in panel.get_xticklabels()
):
    tick_label.set_rotation(60)

# Panel titles show the group name only; the x axis label is removed; the
# y axis is labelled on a linear scale.
# NOTE(review): an earlier comment here said "log scale" while the code sets
# "linear" - confirm which one is intended.
(
    fig.set_titles("{col_name}")
    .set_xlabels("")
    .set(yscale="linear")
    .set_ylabels("Percent of population (%)")
)