# Clean cems

Temporary notebook for cleaning CEMS data using EIA-860 net capacity.

Notes: 
* CEMS data is reported per smokestack/EPA unit (https://catalystcoop-pudl.readthedocs.io/en/latest/data_sources/epacems.html), while EIA-860 capacity is reported per generator. We therefore need to use the crosswalk to sum generator capacities up to EPA units before checking them against CEMS.
* Some CEMS spikes will be short enough to interpolate, while some won't. What's the cutoff? And when we don't interpolate, do we want to trash the whole month and use the residual, or just fill in the residual during missing times? 

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

In [None]:
# Enable IPython's autoreload extension so edited modules are re-imported
# automatically.
%reload_ext autoreload
%autoreload 2

# Tell Python where to look for modules.
import sys

sys.path.append("../../hourly-egrid/")

# Import local modules.
import src.load_data as load_data

In [None]:
# Data year: used to build the CEMS and crosswalk input paths and passed to the
# PUDL loader in the cells below.
year = 2020

In [None]:
# Load the CEMS table for the selected year, parsing the UTC timestamp column.
cems_path = f"../data/outputs/{year}/cems_{year}.csv"
cems = pd.read_csv(cems_path, parse_dates=["datetime_utc"])

In [None]:
# Spot-check a single timestamp of CEMS data at plant 3.
is_target_hour = cems["datetime_utc"] == "2020-09-30T17:00:00+00:00"
is_plant_3 = cems["plant_id_eia"] == 3
cems[is_target_hour & is_plant_3]

In [None]:
# Net generation over time at plant 3, one colour per CEMS unit.
px.scatter(
    cems.loc[cems["plant_id_eia"] == 3],
    x="datetime_utc",
    y="net_generation_mwh",
    color="unitid",
)

In [None]:
# Create the PUDL output object for the selected year via the project's
# load_data helper.
pudl_dat = load_data.initialize_pudl_out(year=year)

In [None]:
# Generator table from the PUDL output object; the cells below read its
# plant/generator IDs and its winter, summer and nameplate capacity columns.
gens_pudl = pudl_dat.gens_eia860()

In [None]:
# Table is unique by plant ID, generator ID
gens = gens_pudl.copy(deep=True)
# Net capacity is the larger of the winter and summer capacities.
# A row-wise DataFrame.max skips NaN, so a generator that reports only one of
# the two values still gets a capacity. The builtin max() previously applied
# through Series.combine returned NaN or the valid value depending on which
# side the NaN was on.
gens["net_capacity_mw"] = gens[["winter_capacity_mw", "summer_capacity_mw"]].max(axis=1)
gens = gens.loc[:, ["plant_id_eia", "generator_id", "net_capacity_mw"]]

In [None]:
# Subplants are defined on "plant_id_eia", largest of ("unitid", "generator_id")
crosswalk_path = f"../data/outputs/{year}/subplant_crosswalk.csv"
subplant_crosswalk = pd.read_csv(crosswalk_path)

In [None]:
# Attach the crosswalk columns (including subplant_id) to each generator.
gens_unit = gens.merge(subplant_crosswalk, how='left', on=['plant_id_eia','generator_id'])
print(f"Setting {sum(gens_unit.subplant_id.isna())} NaN subplants to 1 in 860 data")
# Target only the subplant_id column. A row-only .loc indexer would overwrite
# every column of the matching rows (plant ID, generator ID, capacity, ...)
# with 1.
gens_unit.loc[gens_unit.subplant_id.isna(), "subplant_id"] = 1

In [None]:
# Map each CEMS unit to its subplant.
# The crosswalk also carries generator_id, so a unit may appear on several
# rows; deduplicate on the merge keys so the left join cannot multiply CEMS
# rows. If one unit were listed under several subplants, the first is kept.
merge_keys = ["plant_id_eia", "unitid"]
unit_subplants = subplant_crosswalk[merge_keys + ["subplant_id"]].drop_duplicates(subset=merge_keys)
if "subplant_id" in cems.columns:
    # CEMS already carries subplant_id: keep it as-is and avoid creating
    # suffixed duplicate columns.
    unit_subplants = unit_subplants[merge_keys]
cems_subplant = cems.merge(unit_subplants, how="left", on=merge_keys)

In [None]:
print(f"Setting {sum(cems_subplant.subplant_id.isna())} NaN subplants to 1 in CEMS data")
# Target only the subplant_id column. A row-only .loc indexer would replace
# every value in the matching rows (timestamps, generation, emissions, ...)
# with 1.
cems_subplant.loc[cems_subplant.subplant_id.isna(), "subplant_id"] = 1

In [None]:
# Group gens by subplant
# Only the capacity column is summed: summing every column would also add up
# or concatenate identifier columns. min_count=1 keeps the total NaN when no
# generator in the subplant has a capacity, instead of reporting 0 MW, which
# would make every positive generation value look like an exceedance.
gens_per_sub = (
    gens_unit
    .groupby(["plant_id_eia", "subplant_id"], as_index=False)["net_capacity_mw"]
    .sum(min_count=1)
)

In [None]:
# Attach each subplant's summed capacity to the CEMS rows.
subplant_capacity = gens_per_sub[["plant_id_eia", "subplant_id", "net_capacity_mw"]]
cems_cap = pd.merge(
    cems_subplant,
    subplant_capacity,
    how="left",
    on=["plant_id_eia", "subplant_id"],
)

In [None]:
# TODO: here we're assuming that all columns are bad if net gen is bad, and that all bad rows have bad net gen.
# Columns blanked out on every row whose net generation exceeds capacity.
dat_cols = [
    'gross_generation_mwh',
    'steam_load_1000_lb',
    'fuel_consumed_mmbtu',
    'co2_mass_lb',
    'nox_mass_lb',
    'so2_mass_lb',
    'plant_id_epa',
    'co2_mass_measurement_code',
    'nox_mass_measurement_code',
    'so2_mass_measurement_code',
    'report_date',
    'energy_source_code',
    'ch4_mass_lb',
    'n2o_mass_lb',
    'fuel_consumed_for_electricity_mmbtu',
    'co2_mass_lb_for_electricity',
    'ch4_mass_lb_for_electricity',
    'n2o_mass_lb_for_electricity',
    'nox_mass_lb_for_electricity',
    'so2_mass_lb_for_electricity',
    'co2_mass_lb_adjusted',
    'ch4_mass_lb_adjusted',
    'n2o_mass_lb_adjusted',
    'nox_mass_lb_adjusted',
    'so2_mass_lb_adjusted',
    'net_generation_mwh',
]
# Flag rows where reported net generation is above the subplant's capacity.
bad = cems_cap["net_generation_mwh"].gt(cems_cap["net_capacity_mw"])
cems_cap.loc[bad, dat_cols] = np.nan

In [None]:
# Rows flagged as exceeding capacity (their data columns are NaN by now).
cems_cap.loc[bad]

In [None]:
plant = 3

# Print the generators and capacities at the chosen plant for reference.
capacity_cols = ["plant_id_eia", "subplant_id", "generator_id", "net_capacity_mw"]
is_plant_gen = gens_unit["plant_id_eia"] == plant
print(gens_unit.loc[is_plant_gen, capacity_cols])

# Net generation per subplant after the over-capacity rows were blanked.
px.line(
    cems_cap.loc[cems_cap["plant_id_eia"] == plant],
    x="datetime_utc",
    y="net_generation_mwh",
    color="subplant_id",
)

In [None]:
# CEMS rows for generator "1" at plant 533.
# cems_subplant is built from CEMS plus crosswalk key columns, so this notebook
# never gives it a generator_id column; filtering on it fails on a fresh kernel
# unless the CEMS CSV itself carries one. Map the generator to its unit(s)
# through the crosswalk and filter on unitid instead.
gen_units = subplant_crosswalk.loc[
    (subplant_crosswalk.plant_id_eia == 533) & (subplant_crosswalk.generator_id == "1"),
    "unitid",
].dropna().unique()
cems_subplant[(cems_subplant.plant_id_eia == 533) & cems_subplant.unitid.isin(gen_units)]

In [None]:
# Example plant/unit with generation > capacity

eg_plant, eg_gen = 3, "A2C2"

# cems_subplant is not given a generator_id column in this notebook, so find
# the CEMS unit(s) serving the example generator through the crosswalk, then
# tag the selected rows with the generator ID for the plot legend.
eg_units = subplant_crosswalk.loc[
    (subplant_crosswalk.plant_id_eia == eg_plant) & (subplant_crosswalk.generator_id == eg_gen),
    "unitid",
].dropna().unique()
eg_unit = cems_subplant[
    (cems_subplant.plant_id_eia == eg_plant) & cems_subplant.unitid.isin(eg_units)
].assign(generator_id=eg_gen)
lims = gens_pudl[(gens_pudl.plant_id_eia == eg_plant) & (gens_pudl.generator_id == eg_gen)]

plt = px.line(eg_unit, x="datetime_utc", y="net_generation_mwh", color="generator_id")
# One horizontal line per capacity figure, labelled so the three limits can be
# told apart on the chart.
for cap_col in ["capacity_mw", "winter_capacity_mw", "summer_capacity_mw"]:
    plt.add_hline(y=lims[cap_col].values[0], annotation_text=cap_col)
plt


In [None]:
# Summer capacity value(s) recorded for the example generator.
lims["summer_capacity_mw"].values

In [None]:
# Generators and their net capacity at plant 3.
gens.loc[gens["plant_id_eia"] == 3]

In [None]:
# Rows whose net generation is missing.
cems_cap.loc[cems_cap["net_generation_mwh"].isna()]

In [None]:
# Static plant attributes; the path follows `year` instead of a hard-coded
# 2020 so it stays in step with the other inputs when the year is changed.
plant_attributes = pd.read_csv(f"../data/results/{year}/plant_data/plant_static_attributes.csv")

In [None]:
# Plants in the SWPP balancing authority flagged "cems_and_eia".
in_swpp = plant_attributes["ba_code"] == "SWPP"
has_cems_and_eia = plant_attributes["data_availability"] == "cems_and_eia"
plants = plant_attributes.loc[in_swpp & has_cems_and_eia, "plant_id_eia"]

swpp_cems = cems[cems["plant_id_eia"].isin(plants)]

px.line(
    swpp_cems,
    x="datetime_utc",
    y="net_generation_mwh",
    color="plant_id_eia",
)

In [None]:
# Distinct data-availability labels present in the attributes table.
plant_attributes["data_availability"].unique()

# What's really the problem here?

* Looking at plant-level hourly data in SWPP, where do the spikes actually show up? 

In [None]:
# Hourly synthetic plant data joined with the static plant attributes.
# The path follows `year` instead of a hard-coded 2020 so it stays in step
# with the other inputs when the year is changed.
synthetic = pd.read_csv(
    f"../data/results/{year}/plant_data/hourly/us_units/synthetic_plant_data.csv",
    parse_dates=["datetime_utc"],
)
synthetic = synthetic.merge(plant_attributes, how='left', on='plant_id_eia')

In [None]:
# Display the synthetic plant data joined with the plant attributes.
synthetic

In [None]:
# Hourly synthetic net generation for SWPP, one line per plant.
synthetic_swpp = synthetic.loc[synthetic["ba_code"] == "SWPP"]
px.line(
    synthetic_swpp,
    x="datetime_utc",
    y="net_generation_mwh",
    color="plant_id_eia",
)

In [None]:
# Attributes recorded for plant ID 918601.
plant_attributes.loc[plant_attributes["plant_id_eia"] == 918601]

In [None]:
# Hourly profiles for the selected year. The path is built from `year`, as the
# CEMS and crosswalk paths are, rather than hard-coding 2020.
profiles = pd.read_csv(
    f"../data/outputs/{year}/hourly_profiles_{year}.csv",
    parse_dates=["datetime_utc"],
)

In [None]:
# Preview the first five rows of the profiles table.
profiles.head(n=5)

In [None]:
# Profile rows for biomass in the SWPP balancing authority.
is_swpp_profile = profiles["ba_code"] == "SWPP"
is_biomass_profile = profiles["fuel_category"] == "biomass"
profiles[is_swpp_profile & is_biomass_profile]

In [None]:
# SWPP biomass profile over time, one line per profile method.
swpp_biomass_rows = profiles.loc[
    (profiles["ba_code"] == "SWPP") & (profiles["fuel_category"] == "biomass")
]
px.line(
    swpp_biomass_rows,
    x="datetime_utc",
    y="profile",
    color="profile_method",
)

In [None]:
# Column totals for the SWPP biomass profile rows.
# numeric_only=True restricts the sum to numeric columns: summing the parsed
# datetime column raises in current pandas, and summing string columns merely
# concatenates them.
swpp_biomass_profiles = profiles[(profiles.ba_code=="SWPP") & (profiles.fuel_category=="biomass")]
swpp_biomass_profiles.sum(numeric_only=True)

In [None]:
# Add the static plant attributes to every CEMS row.
# NOTE(review): this reassigns `cems`, so re-running the cell merges the
# attributes a second time - confirm whether that is acceptable.
cems = pd.merge(cems, plant_attributes, how="left", on="plant_id_eia")

In [None]:
# Gross vs. net generation for SWPP biomass plants in CEMS.
is_biomass_cems = cems["fuel_category"] == "biomass"
is_swpp_cems = cems["ba_code"] == "SWPP"
px.line(
    cems[is_biomass_cems & is_swpp_cems],
    x="datetime_utc",
    y=["gross_generation_mwh", "net_generation_mwh"],
)

In [None]:
# Plant IDs of the SWPP biomass plants present in CEMS.
swpp_biomass_mask = (cems["fuel_category"] == "biomass") & (cems["ba_code"] == "SWPP")
cems.loc[swpp_biomass_mask, "plant_id_eia"].unique()

# Median/IQR cleaning 

In [None]:
# Preview the first five rows of the CEMS table.
cems.head(n=5)