In [1]:
import xarray as xr
import pandas as pd
from monetio.models import cmaq
from scipy import interpolate
import geopandas

# using geocube_env python environment (python 3.11)


Please install h5py to open files from the Amazon S3 servers.
Please install h5netcdf to open files from the Amazon S3 servers.


In [2]:
def interpolate_cmaq_census(cmaq_file: str, census_file: str) -> pd.DataFrame:
    """Interpolate annual-mean CMAQ PM2.5 onto census centers of population.

    The ``PM25_AVG`` field is averaged over ``time`` and over the single ``z``
    level, then interpolated from the model grid's longitude/latitude
    coordinates to every census point with
    ``scipy.interpolate.griddata(method="cubic")``.

    Args:
        cmaq_file (str): Path to the CMAQ output file; opened with
            ``monetio.models.cmaq.open_dataset``.
        census_file (str): Path to the census point locations; read with
            ``geopandas.read_file``. The file must provide ``LONGITUDE`` and
            ``LATITUDE`` columns (any census geography level works, e.g.
            tracts or block groups).

    Returns:
        pd.DataFrame: The table read from ``census_file`` with a ``pm25``
        column added. With SciPy's documented default ``fill_value``, points
        outside the convex hull of the model grid receive NaN.
    """
    ds = cmaq.open_dataset(fname=cmaq_file)
    try:
        # get annual average PM2.5
        dapm = (
            ds["PM25_AVG"].mean(dim="time").mean(dim="z")
        )  # taking mean of 1 level z to drop it

        # Flatten the 2-D grid into aligned 1-D arrays. All three arrays share
        # the grid's shape, so ravel() keeps value/longitude/latitude in step.
        # Pulling .values here loads the data before the file is closed.
        grid_pm25 = dapm.values.ravel()
        grid_lon = dapm.coords["longitude"].values.ravel()
        grid_lat = dapm.coords["latitude"].values.ravel()
    finally:
        # release the file handle even if a variable/coordinate is missing
        ds.close()

    census_points = geopandas.read_file(census_file)

    # perform interpolation of annual average pm2.5 data to census points
    census_points["pm25"] = interpolate.griddata(
        points=(grid_lon, grid_lat),
        values=grid_pm25,
        xi=census_points[["LONGITUDE", "LATITUDE"]],
        method="cubic",
    )

    return census_points


In [3]:
# census block group level interpolation and join

cmaq_file = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\EQUATES data\\HR2DAY_LST_ACONC_EQUATES_v532_12US1_2010.nc"
census_file = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\nhgis0002_shape\\nhgis0002_shapefile_cenpop2010_us_blck_grp_cenpop_2010\\US_blck_grp_cenpop_2010.shp"
adi_path = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\adi-download\\US_2020_ADI_Census Block Group_v3.2.csv"
dem_path = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\nhgis0002_csv\\nhgis0002_ds172_2010_blck_grp.csv"

# interpolate annual-average PM2.5 to the block group centers of population
census_points = interpolate_cmaq_census(cmaq_file, census_file)

# area deprivation index table, keyed on GISJOIN
adi_df = pd.read_csv(adi_path)

# join census demographic information
# The first data row holds text descriptions of each column, so chunked type
# inference can yield mixed-type columns; low_memory=False makes pandas infer
# each column's dtype from the whole file at once.
dem_df = pd.read_csv(
    dem_path,
    encoding="cp1252",
    low_memory=False,
).drop(
    0
)  # read census data, drop first line of data descriptions

joined = (
    census_points.set_index(["GISJOIN"])
    .join(dem_df.set_index(["GISJOIN"]), how="outer") # this join is good - same number of rows in each census_points and dem_df
    # TODO: figure out why there are more adi census block groups than from the census files?
    # NOTE(review): adi_path is the 2020 ADI release while the census files are
    # 2010 vintage; differing block group definitions may explain it - confirm.
    .join(adi_df.set_index(["GISJOIN"]), how="left")
    .drop("geometry", axis="columns")
)

joined.to_csv("data/biol562 project dataset v1.csv")

  proj = self._crs.to_proj4(version=version)


<xarray.Dataset>
Dimensions:       (TSTEP: 365, VAR: 14, DATE-TIME: 2, LAY: 1, ROW: 299, COL: 459)
Dimensions without coordinates: TSTEP, VAR, DATE-TIME, LAY, ROW, COL
Data variables: (12/15)
    TFLAG         (TSTEP, VAR, DATE-TIME) int32 ...
    O3_MDA8       (TSTEP, LAY, ROW, COL) float32 ...
    O3_AVG        (TSTEP, LAY, ROW, COL) float32 ...
    CO_AVG        (TSTEP, LAY, ROW, COL) float32 ...
    NO_AVG        (TSTEP, LAY, ROW, COL) float32 ...
    NO2_AVG       (TSTEP, LAY, ROW, COL) float32 ...
    ...            ...
    PM25_AVG      (TSTEP, LAY, ROW, COL) float32 ...
    PM25_SO4_AVG  (TSTEP, LAY, ROW, COL) float32 ...
    PM25_NO3_AVG  (TSTEP, LAY, ROW, COL) float32 ...
    PM25_NH4_AVG  (TSTEP, LAY, ROW, COL) float32 ...
    PM25_OC_AVG   (TSTEP, LAY, ROW, COL) float32 ...
    PM25_EC_AVG   (TSTEP, LAY, ROW, COL) float32 ...
Attributes: (12/34)
    IOAPI_VERSION:  $Id: @(#) ioapi library version 3.1 $                    ...
    EXEC_ID:        ????????????????             

In [10]:
# census tract level interpolation and join
cmaq_file = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\EQUATES data\\HR2DAY_LST_ACONC_EQUATES_v532_12US1_2010.nc"
census_file = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\nhgis0003_shape\\US_tract_cenpop_2010.shp"
dem_path1 = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\nhgis0003_csv\\nhgis0003_ds172_2010_tract.csv"
dem_path2 = "C:\\Users\\rrice\\OneDrive - Environmental Protection Agency (EPA)\\exposure disparities\\nhgis0003_csv\\nhgis0003_ds176_20105_tract.csv"

# interpolate annual-average PM2.5 to the tract centers of population
census_points = interpolate_cmaq_census(cmaq_file, census_file)

# join census demographic information
# The first data row of each NHGIS csv holds text descriptions of the columns,
# so chunked type inference can yield mixed-type columns; low_memory=False
# makes pandas infer each column's dtype from the whole file at once.

dem_df1 = pd.read_csv(dem_path1, encoding="cp1252", low_memory=False).drop(
    0
)  # read census data, drop first line of data descriptions

dem_df2 = pd.read_csv(dem_path2, encoding="cp1252", low_memory=False).drop(
    0
)  # read census data, drop first line of data descriptions

joined_tract = (
    census_points.set_index(["GISJOIN"])
    .join(dem_df1.set_index(["GISJOIN"]), how="outer")
    # columns already present on the left side are duplicated from dem_df2
    # with a "_drop" suffix so they can be discarded before export
    .join(dem_df2.set_index(["GISJOIN"]), how="outer", rsuffix="_drop")
    .drop("geometry", axis="columns")
)

# keep everything except the suffixed duplicates; match on the suffix only so a
# genuine column that merely contains "_drop" mid-name is not discarded
export_columns = [
    col for col in joined_tract.columns.tolist() if not col.endswith("_drop")
]

joined_tract[export_columns].to_csv(
    "data/biol562 project dataset census tract level v1.csv"
)


  proj = self._crs.to_proj4(version=version)


<xarray.Dataset>
Dimensions:       (TSTEP: 365, VAR: 14, DATE-TIME: 2, LAY: 1, ROW: 299, COL: 459)
Dimensions without coordinates: TSTEP, VAR, DATE-TIME, LAY, ROW, COL
Data variables: (12/15)
    TFLAG         (TSTEP, VAR, DATE-TIME) int32 ...
    O3_MDA8       (TSTEP, LAY, ROW, COL) float32 ...
    O3_AVG        (TSTEP, LAY, ROW, COL) float32 ...
    CO_AVG        (TSTEP, LAY, ROW, COL) float32 ...
    NO_AVG        (TSTEP, LAY, ROW, COL) float32 ...
    NO2_AVG       (TSTEP, LAY, ROW, COL) float32 ...
    ...            ...
    PM25_AVG      (TSTEP, LAY, ROW, COL) float32 ...
    PM25_SO4_AVG  (TSTEP, LAY, ROW, COL) float32 ...
    PM25_NO3_AVG  (TSTEP, LAY, ROW, COL) float32 ...
    PM25_NH4_AVG  (TSTEP, LAY, ROW, COL) float32 ...
    PM25_OC_AVG   (TSTEP, LAY, ROW, COL) float32 ...
    PM25_EC_AVG   (TSTEP, LAY, ROW, COL) float32 ...
Attributes: (12/34)
    IOAPI_VERSION:  $Id: @(#) ioapi library version 3.1 $                    ...
    EXEC_ID:        ????????????????             

  dem_df1 = pd.read_csv(dem_path1, encoding="cp1252",).drop(
  dem_df2 = pd.read_csv(dem_path2, encoding="cp1252",).drop(
