In [None]:
import pandas as pd
import numpy as np

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys

sys.path.append("../../src")
from column_checks import get_dtypes
import load_data

# imported by name because later cells call `emissions.<function>(...)`;
# the star import below does not bind the module name itself
import emissions
from data_cleaning import *
from emissions import *

from pudl.analysis.allocate_net_gen import distribute_annually_reported_data_to_months

# Filling missing NOx and SO2 in CEMS

In [None]:
# reporting year passed to the data loaders and emission calculations below
year: int = 2020

In [None]:
# load the CEMS data and pass it through each cleaning step in order
cems = (
    load_data.load_cems_data(year)
    # remove non-grid connected plants (and all plants in the listed states)
    .pipe(
        remove_plants,
        non_grid_connected=True,
        remove_states=["PR"],
        steam_only_plants=False,
        distribution_connected_plants=False,
    )
    # manually remove steam-only units
    .pipe(manually_remove_steam_units)
    # add a report date
    .pipe(load_data.add_report_date)
    # remove data for any unit-months where there are incomplete data reported
    # this is generally when there is a single observation reported for an entire month
    .pipe(remove_incomplete_unit_months)
    # TODO: identify and remove any hourly values that appear to be outliers
    # add a fuel type to each observation
    .pipe(assign_fuel_type_to_cems, year)
)

In [None]:
# replace all "missing" NOx values with zero so that reported zeros and blanks
# are treated the same way in the next step
cems["nox_mass_lb"] = cems["nox_mass_lb"].fillna(0)

# replace 0 reported NOx values with missing values, if there was reported heat input
# (fuel was burned, so a zero NOx mass is treated as unreported rather than a true zero)
cems.loc[
    (cems["nox_mass_lb"] == 0) & (cems["fuel_consumed_mmbtu"] > 0),
    "nox_mass_lb",
] = np.nan  # `np.NaN` was removed in NumPy 2.0; `np.nan` works on all versions

# create a new df with all observations with missing NOx data
missing_nox = cems[cems["nox_mass_lb"].isnull()]
missing_nox

In [None]:
# replace all "missing" SO2 values with zero so that reported zeros and blanks
# are treated the same way in the next step
cems["so2_mass_lb"] = cems["so2_mass_lb"].fillna(0)

# replace 0 reported SO2 values with missing values, if there was reported heat input
# (fuel was burned, so a zero SO2 mass is treated as unreported rather than a true zero)
cems.loc[
    (cems["so2_mass_lb"] == 0) & (cems["fuel_consumed_mmbtu"] > 0),
    "so2_mass_lb",
] = np.nan  # `np.NaN` was removed in NumPy 2.0; `np.nan` works on all versions

# create a new df with all observations with missing SO2 data
missing_so2 = cems[cems["so2_mass_lb"].isnull()]
missing_so2

In [None]:
# load the EPA-EIA crosswalk for the reporting year
epa_eia_crosswalk = load_data.load_epa_eia_crosswalk(year)

# keep only the plant / EPA unit / boiler identifiers, one row per unique combination
boiler_to_unit_crosswalk = epa_eia_crosswalk.loc[
    :, ["plant_id_eia", "emissions_unit_id_epa", "boiler_id"]
]
boiler_to_unit_crosswalk = boiler_to_unit_crosswalk.drop_duplicates()
boiler_to_unit_crosswalk

In [None]:
# calculate the unit-specific controlled NOx rates for the reporting year
# NOTE: this call needs the `emissions` module bound by name; `from emissions import *`
# alone does not do that, so the import cell must contain `import emissions`
unit_controlled_nox_rates = emissions.calculate_unit_specific_controlled_nox_rates(
    year
)
unit_controlled_nox_rates

In [None]:
# merge the controlled nox rates into the observations with missing NOx.
# The subset is rebuilt from `cems` instead of merging into `missing_nox` itself so
# that this cell can be re-run safely: merging twice would suffix the rate columns
# with _x/_y and make the calculation below fail.
missing_nox = cems[cems["nox_mass_lb"].isnull()].merge(
    unit_controlled_nox_rates,
    how="left",
    on=["plant_id_eia", "emissions_unit_id_epa"],
    validate="m:1",
)
# calculate the controlled nox emissions based on the month:
# report months 5-9 (May-September) use the ozone-season factor,
# all other months use the non-ozone-season factor
missing_nox = missing_nox.assign(
    controlled_nox_mass_lb=lambda x: np.where(
        ((x.report_date.dt.month >= 5) & (x.report_date.dt.month <= 9)),
        x.fuel_consumed_mmbtu * x.controlled_ozone_season_nox_ef_lb_per_mmbtu,
        x.fuel_consumed_mmbtu * x.controlled_non_ozone_season_nox_ef_lb_per_mmbtu,
    )
)

In [None]:
# load the NOx emission factors, keeping only the first factor listed for each
# prime mover / fuel / boiler bottom type / boiler firing type combination
nox_emission_factors = load_data.load_nox_emission_factors().drop_duplicates(
    subset=[
        "prime_mover_code",
        "energy_source_code",
        "boiler_bottom_type",
        "boiler_firing_type",
    ]
)

In [None]:
# load the boiler firing type and left-join the `boilers_entity_eia` table onto it
# (original intent: bring in the prime mover for each boiler)
# NOTE(review): `pudl_out` is not defined in any cell visible in this notebook;
# confirm it is created before this cell runs on a fresh kernel
boiler_firing_type = pd.merge(
    emissions.load_boiler_firing_type(year),
    pd.read_sql("boilers_entity_eia", pudl_out.pudl_engine),
    how="left",
    on=["plant_id_eia", "boiler_id"],
    validate="m:1",
)

In [None]:
# NOTE(review): this repeats the crosswalk load from an earlier cell; consider keeping only one
epa_eia_crosswalk = load_data.load_epa_eia_crosswalk(year)

# unique plant / EPA unit / boiler combinations
boiler_to_unit_crosswalk = epa_eia_crosswalk.loc[
    :, ["plant_id_eia", "emissions_unit_id_epa", "boiler_id"]
]
boiler_to_unit_crosswalk = boiler_to_unit_crosswalk.drop_duplicates()

In [None]:
# Build the average NOx emission factor for each unit and fuel in one pipeline.
boiler_factors = (
    # merge in all possible fuel-specific emission factors for the pm boiler design parameters
    boiler_firing_type.merge(
        nox_emission_factors,
        how="left",
        on=["prime_mover_code", "boiler_bottom_type", "boiler_firing_type"],
    )
    # associate a unit
    .merge(boiler_to_unit_crosswalk, how="left", on=["plant_id_eia", "boiler_id"])
    # aggregate by unit fuel; dropna=False keeps groups whose keys are missing
    .groupby(
        [
            "plant_id_eia",
            "emissions_unit_id_epa",
            "energy_source_code",
            "emission_factor_denominator",
        ],
        dropna=False,
    )["emission_factor"]
    .mean()
    .reset_index()
)

boiler_factors