# Clean cems

Temporary notebook for cleaning CEMS data using EIA-860 net capacity.

Notes: 
* CEMS data is per smokestack/EPA unit (https://catalystcoop-pudl.readthedocs.io/en/latest/data_sources/epacems.html), while 860 capacity is per generator, so we need to use the crosswalk to sum generators to EPA units before checking against CEMS.
* Some CEMS spikes will be short enough to interpolate, while some won't. What's the cutoff? And when we don't interpolate, do we want to trash the whole month and use the residual, or just fill in the residual during missing times? 

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

In [None]:
%reload_ext autoreload
%autoreload 2

# Tell Python where to look for local modules by appending "../../src/" to the import path.
# NOTE(review): the local imports below use the `src.` package prefix, which resolves from the
# directory that contains `src`, not from `src` itself — confirm this path entry is the one needed.
import sys

sys.path.append("../../src/")

# import local modules
import src.load_data as load_data
from src.filepaths import outputs_folder
from src.column_checks import get_dtypes

In [None]:
# Data year; used below to build the output file paths and to initialize the PUDL output object.
year = 2020

In [None]:
# Load the cleaned CEMS output for the selected year, using the project's column dtypes.
cems_path = f"{outputs_folder()}/{year}/cems_cleaned_{year}.csv"
cems = pd.read_csv(cems_path, dtype=get_dtypes())

In [None]:
# Example CEMS data: net generation over time for plant 3, one line per subplant.
example_plant = cems.loc[cems["plant_id_eia"] == 3]
px.line(
    example_plant,
    x="datetime_utc",
    y="net_generation_mwh",
    color="subplant_id",
)

# Simple cleaning 
At the subplant level, filter data where hourly CEMS generation exceeds 860 capacity.

Active questions: 
* Do we need to build in a buffer for generation that slightly exceeds capacity but is still possible? In some cases, capacity is estimated; and we know that it's variable (eg, summer capacity < winter capacity; both summer and winter capacity often exceed nameplate capacity)
* Do we need to aggregate to the plant level to do this check? This would allow data to pass through the filter even if it's potentially labeled with the incorrect subplant. 

In [None]:
# Build the PUDL output object for the selected year via the project's load_data helper.
pudl_dat = load_data.initialize_pudl_out(year=year)

In [None]:
# Fetch the generators table (`gens_eia860`) from the PUDL output object.
gens_pudl = pudl_dat.gens_eia860()

In [None]:
# Table is unique by plant ID, generator ID.
# For each plant and generator, take the largest of the three reported capacity
# values (winter, summer, nameplate).
# A row-wise DataFrame.max skips NaN, so a generator missing one value still gets
# the largest of the others. The builtin `max` passed to Series.combine returns
# its first argument whenever that argument is NaN, which made the result depend
# on argument order and left net capacity NaN whenever winter capacity was missing.
capacity_cols = ["winter_capacity_mw", "summer_capacity_mw", "capacity_mw"]
gens = gens_pudl.copy(deep=True)
gens["net_capacity_mw"] = gens[capacity_cols].max(axis=1)
gens = gens.loc[:, ["plant_id_eia", "generator_id", "net_capacity_mw"]]

In [None]:
# Load the subplant crosswalk for the selected year.
# Subplants are defined on "plant_id_eia", largest of ("emissions_unit_id_epa", "generator_id")
crosswalk_path = f"{outputs_folder()}/{year}/subplant_crosswalk.csv"
subplant_crosswalk = pd.read_csv(crosswalk_path)

In [None]:
# Attach each generator's subplant from the crosswalk; generators with no match keep NaN.
gens_unit = gens.merge(
    subplant_crosswalk, how="left", on=["plant_id_eia", "generator_id"]
)
missing_subplant = gens_unit["subplant_id"].isna()
print(f"Setting {missing_subplant.sum()} NaN subplants to 1 in 860 data")
# Assign only the subplant_id column. Indexing .loc with the row mask alone
# overwrote every column of those rows (plant ID, generator ID, capacity) with 1.
gens_unit.loc[missing_subplant, "subplant_id"] = 1

In [None]:
# Group gens by subplant: total net capacity per (plant, subplant).
# Only the capacity column is summed; the other columns (e.g. generator_id) are
# not quantities to add up.
# min_count=1 keeps a subplant's capacity NaN when none of its generators report
# one; a plain sum would return 0, and every positive generation value would then
# look like it exceeds capacity.
gens_per_sub = (
    gens_unit.groupby(["plant_id_eia", "subplant_id"])[["net_capacity_mw"]]
    .sum(min_count=1)
    .reset_index()
)

In [None]:
# Inspect the dtype of the CEMS subplant_id column before filling NaNs and casting it.
cems.subplant_id.dtype

In [None]:
# CEMS rows with no subplant assigned are treated as subplant 1.
cems_missing_subplant = cems["subplant_id"].isna()
print(f"Setting {sum(cems_missing_subplant)} NaN subplants to 1 in CEMS data")
cems.loc[cems_missing_subplant, "subplant_id"] = 1

In [None]:
# With the NaNs filled, cast subplant_id from the pandas integer dtype to a plain
# numpy int32, which the merge below needs.
subplant_dtype = {"subplant_id": np.int32}
cems = cems.astype(subplant_dtype)

In [None]:
# Attach the summed subplant capacity to every CEMS row; rows with no matching
# (plant, subplant) in the capacity table are kept by the left join.
subplant_keys = ["plant_id_eia", "subplant_id"]
subplant_capacity = gens_per_sub[["plant_id_eia", "subplant_id", "net_capacity_mw"]]
cems_cap = cems.merge(subplant_capacity, how="left", on=subplant_keys)

In [None]:
# TODO: here we're assuming that all columns are bad if net gen is bad, and that all bad rows have bad net gen.
# Columns that are blanked out (set to NaN) in every row that fails the capacity check.
# NOTE(review): this list omits the *_for_electricity_adjusted and co2e_* columns that the
# median/IQR section treats as numeric data — confirm whether they should be blanked too.
dat_cols = [
    "gross_generation_mwh",
    "steam_load_1000_lb",
    "fuel_consumed_mmbtu",
    "co2_mass_lb",
    "nox_mass_lb",
    "so2_mass_lb",
    "plant_id_epa",
    "co2_mass_measurement_code",
    "nox_mass_measurement_code",
    "so2_mass_measurement_code",
    "report_date",
    "energy_source_code",
    "ch4_mass_lb",
    "n2o_mass_lb",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb_for_electricity",
    "ch4_mass_lb_for_electricity",
    "n2o_mass_lb_for_electricity",
    "nox_mass_lb_for_electricity",
    "so2_mass_lb_for_electricity",
    "co2_mass_lb_adjusted",
    "ch4_mass_lb_adjusted",
    "n2o_mass_lb_adjusted",
    "nox_mass_lb_adjusted",
    "so2_mass_lb_adjusted",
    "net_generation_mwh",
]

# A row is "bad" when its net generation is above the subplant's summed net capacity.
bad = cems_cap["net_generation_mwh"] > cems_cap["net_capacity_mw"]
cems_cap.loc[bad, dat_cols] = np.nan

In [None]:
# Display the rows flagged by the capacity filter (their data columns are now NaN).
cems_cap[bad]

In [None]:
# Share of CEMS rows that the capacity filter flagged as bad.
bad.sum() / len(cems)

In [None]:
# Inspect one plant: print its generators with their subplant and capacity, then
# plot the capacity-filtered net generation, one line per subplant.
plant = 2410

capacity_view_cols = ["plant_id_eia", "subplant_id", "generator_id", "net_capacity_mw"]
print(gens_unit.loc[gens_unit["plant_id_eia"] == plant, capacity_view_cols])

plant_filtered = cems_cap.loc[cems_cap["plant_id_eia"] == plant]
px.line(
    plant_filtered,
    x="datetime_utc",
    y="net_generation_mwh",
    color="subplant_id",
)

In [None]:
# Same plant as above, but plotting the unfiltered CEMS data for comparison.
generator_view_cols = ["plant_id_eia", "subplant_id", "generator_id", "net_capacity_mw"]
print(gens_unit.loc[gens_unit["plant_id_eia"] == plant, generator_view_cols])

plant_unfiltered = cems.loc[cems["plant_id_eia"] == plant]
px.line(
    plant_unfiltered,
    x="datetime_utc",
    y="net_generation_mwh",
    color="subplant_id",
    title=f"plant id = {plant}",
)

# Median/IQR cleaning 

### Advantages over capacity cleaning: 
Works consistently for all data types, and isn't sensitive to plants occasionally producing over capacity.

In [None]:
# Numeric CEMS variables that the median/IQR outlier check is applied to.
numeric_cols = [
    "gross_generation_mwh",
    "steam_load_1000_lb",
    "fuel_consumed_mmbtu",
    "co2_mass_lb",
    "ch4_mass_lb",
    "n2o_mass_lb",
    "nox_mass_lb",
    "so2_mass_lb",
    "co2_mass_lb_adjusted",
    "ch4_mass_lb_adjusted",
    "n2o_mass_lb_adjusted",
    "nox_mass_lb_adjusted",
    "so2_mass_lb_adjusted",
    "net_generation_mwh",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb_for_electricity",
    "co2_mass_lb_for_electricity_adjusted",
    "ch4_mass_lb_for_electricity",
    "ch4_mass_lb_for_electricity_adjusted",
    "n2o_mass_lb_for_electricity",
    "n2o_mass_lb_for_electricity_adjusted",
    "nox_mass_lb_for_electricity",
    "nox_mass_lb_for_electricity_adjusted",
    "so2_mass_lb_for_electricity",
    "so2_mass_lb_for_electricity_adjusted",
    "co2e_mass_lb",
    "co2e_mass_lb_adjusted",
    "co2e_mass_lb_for_electricity",
    "co2e_mass_lb_for_electricity_adjusted",
]

# Interquartile range (75th minus 25th percentile) of each variable, per (plant, subplant).
numeric_by_subplant = cems.groupby(["plant_id_eia", "subplant_id"])[numeric_cols]
upper_quartile = numeric_by_subplant.quantile(0.75)
lower_quartile = numeric_by_subplant.quantile(0.25)
iqr = upper_quartile - lower_quartile

In [None]:
# Per-(plant, subplant) median of each numeric variable.
# Restricted to numeric_cols so the result lines up column-for-column with `iqr`,
# and so any non-numeric columns in cems (presumably datetime_utc and the
# *_measurement_code columns) are not passed to median, which raises TypeError
# on them in recent pandas versions.
median = cems.groupby(["plant_id_eia", "subplant_id"])[numeric_cols].median()

In [None]:
# Outlier bounds: three IQRs either side of the median, per subplant and variable.
iqr_multiplier = 3
lower_bound = median - iqr * iqr_multiplier
upper_bound = median + iqr * iqr_multiplier

In [None]:
# Work on a copy of the CEMS data.
cems_filtered = cems.copy()
# TODO: loop over every plant (cems_filtered.plant_id_eia.unique()) instead of one example subplant.

In [None]:
# Index by (plant, subplant) so rows can be selected with the same keys as the median/IQR tables.
cems_filtered = cems_filtered.set_index(["plant_id_eia", "subplant_id"])

In [None]:
# Display the rows for the example subplant: plant 3, subplant 1.
cems_filtered.loc[(3, 1)]

In [None]:
# For the example subplant (plant 3, subplant 1), mark every value that falls
# below the lower bound or above the upper bound for its variable.
example_key = (3, 1)
example_values = cems_filtered.loc[example_key, lower_bound.columns]
below_lower = example_values < lower_bound.loc[example_key]
above_upper = example_values > upper_bound.loc[example_key]
checked = below_lower | above_upper

In [None]:
# Number of values outside the bounds, per variable, for the example subplant.
checked.sum()

In [None]:
# Scatter fuel consumption over time for the example subplant, colored by whether
# each point fell outside the median/IQR bounds.
toplot = cems_filtered.loc[(3, 1)]
fuel_out_of_bounds = checked["fuel_consumed_mmbtu"]
px.scatter(
    toplot,
    x="datetime_utc",
    y="fuel_consumed_mmbtu",
    color=fuel_out_of_bounds,
)

In [None]:
# Display the per-variable medians for the example subplant (plant 3, subplant 1).
median.loc[(3, 1)]