In [None]:
# import packages
import pandas as pd
import numpy as np
import os
import plotly.express as px

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys

sys.path.append("../../../open-grid-emissions/src/")

import download_data
import load_data
from column_checks import get_dtypes
from filepaths import *
import impute_hourly_profiles
import data_cleaning
import output_data
import emissions
import validation
import gross_to_net_generation
import eia930

# Data year to analyze; the prefix is the per-year subfolder (with a trailing
# slash) that the input file paths below are built from.
year = 2020
path_prefix = str(year) + "/"

In [None]:
# load inputs to function
def _read_output_csv(table_name, date_columns=None):
    """Read one `{table_name}_{year}.csv` table from the outputs folder.

    Args:
        table_name: file stem without the trailing ``_{year}.csv``.
        date_columns: names of the columns to parse as dates; when omitted,
            no date parsing is requested.

    Returns:
        The table as a DataFrame, read with the dtypes from ``get_dtypes()``.
    """
    # `path_prefix` already ends in "/", so strip it before joining; the
    # previous f"{path_prefix}/..." form produced a doubled separator
    # ("2020//cems_subplant_2020.csv").
    relative_path = f"{path_prefix.rstrip('/')}/{table_name}_{year}.csv"
    return pd.read_csv(
        outputs_folder(relative_path),
        dtype=get_dtypes(),
        parse_dates=list(date_columns) if date_columns else False,
    )


# the CEMS-derived tables carry both an hourly timestamp and a report date
cems_date_cols = ["datetime_utc", "report_date"]

cems = _read_output_csv("cems_subplant", cems_date_cols)
partial_cems_plant = _read_output_csv("partial_cems_plant", cems_date_cols)
partial_cems_subplant = _read_output_csv("partial_cems_subplant", cems_date_cols)
eia923_allocated = _read_output_csv("eia923_allocated", ["report_date"])
plant_attributes = _read_output_csv("plant_static_attributes")

# select eia only data
# keep rows whose hourly data source is "eia" and that report fuel consumption
eia_only_data = eia923_allocated[
    (eia923_allocated["hourly_data_source"] == "eia")
    & eia923_allocated["fuel_consumed_mmbtu"].notna()
].copy()

## Why are NOx emissions from non-CEMS plants so high in CAISO?
According to the data source, about 70% of emitting generation is represented in CEMS, but only 4% of NOx emissions are. Does the remaining 30% of emitting generation (the non-CEMS plants) really account for 96% of NOx emissions?

- Are we over-counting NOx from non-CEMS plants?
- Is a lot of NOx data missing for CEMS plants?

In [None]:
# add ba codes and plant primary fuel to all of the data
attribute_cols = ["ba_code", "plant_primary_fuel"]
plant_ba_fuel = plant_attributes[["plant_id_eia", *attribute_cols]]

# This cell overwrites `eia_only_data` and `cems`, so any attribute columns
# left over from a previous run are dropped before merging. Without this, a
# re-run yields `ba_code_x` / `ba_code_y` and later `["ba_code"]` lookups fail.
# validate="m:1" raises if a plant has more than one attribute row.
eia_only_data = eia_only_data.drop(columns=attribute_cols, errors="ignore").merge(
    plant_ba_fuel,
    how="left",
    on="plant_id_eia",
    validate="m:1",
)
cems = cems.drop(columns=attribute_cols, errors="ignore").merge(
    plant_ba_fuel,
    how="left",
    on="plant_id_eia",
    validate="m:1",
)

In [None]:
# restrict both datasets to rows whose balancing authority code is "CISO"
cems_in_ciso = cems["ba_code"] == "CISO"
eia_in_ciso = eia_only_data["ba_code"] == "CISO"

cems_caiso = cems[cems_in_ciso].copy()
eia_caiso = eia_only_data[eia_in_ciso].copy()

In [None]:
# total of the nox_mass_lb_for_electricity column over the CEMS rows in CAISO
cems_caiso_nox_lb = cems_caiso["nox_mass_lb_for_electricity"].sum()
cems_caiso_nox_lb

In [None]:
# total of the nox_mass_lb_for_electricity column over the EIA-only rows in CAISO
eia_caiso_nox_lb = eia_caiso["nox_mass_lb_for_electricity"].sum()
eia_caiso_nox_lb

In [None]:
# NOx emission rate: nox_mass_lb_for_electricity per net_generation_mwh.
# A row with zero net generation divides to +inf, which is blanked to NaN.
nox_per_mwh = eia_caiso["nox_mass_lb_for_electricity"].div(
    eia_caiso["net_generation_mwh"]
)
eia_caiso["nox_rate"] = nox_per_mwh.replace(np.inf, np.nan)

In [None]:
# NOx total for each (prime mover, energy source) combination in the EIA-only
# CAISO data
pm_fuel_keys = ["prime_mover_code", "energy_source_code"]
nox_by_pm_fuel = eia_caiso.groupby(pm_fuel_keys)["nox_mass_lb_for_electricity"].sum()
nox_by_pm_fuel

In [None]:
# NOx total for each prime mover code in the EIA-only CAISO data
nox_by_prime_mover = eia_caiso.groupby(["prime_mover_code"])[
    "nox_mass_lb_for_electricity"
].sum()
nox_by_prime_mover

In [None]:
# NOx total for each energy source code in the EIA-only CAISO data
nox_by_energy_source = eia_caiso.groupby(["energy_source_code"])[
    "nox_mass_lb_for_electricity"
].sum()
nox_by_energy_source

In [None]:
# overall EIA-only CAISO NOx total, shown again as the reference figure for
# the grouped breakdowns
eia_caiso_nox_lb = eia_caiso["nox_mass_lb_for_electricity"].sum()
eia_caiso_nox_lb

## Investigate the capacity factor of plants in each dataset

In [None]:
# nameplate capacity table used for the CEMS capacity factors
# (presumably one row per plant/subplant -- confirm against the helper)
subplant_nameplate = gross_to_net_generation.calculate_subplant_nameplate_capacity(year)

# generator-level capacity pulled from the PUDL output object, used for the
# EIA capacity factors
pudl_out = load_data.initialize_pudl_out(year)
generator_capacity_cols = ["plant_id_eia", "generator_id", "capacity_mw"]
gen_cap = pudl_out.gens_eia860()[generator_capacity_cols]

In [None]:
# attach each generator's capacity; validate="m:1" raises if gen_cap holds
# more than one row for a plant/generator pair
eia_cf = eia_only_data.merge(
    gen_cap, how="left", on=["plant_id_eia", "generator_id"], validate="m:1"
)

# capacity factor = net generation / (hours in the report month * capacity)
hours_in_month = eia_cf["report_date"].dt.days_in_month * 24
eia_cf["capfac"] = eia_cf["net_generation_mwh"] / (
    hours_in_month * eia_cf["capacity_mw"]
)

# Blank out values outside 0..1.2 (this also catches +/-inf from a zero
# capacity). `np.nan` is used because the `np.NaN` alias was removed in
# NumPy 2.0.
capfac_out_of_range = (eia_cf["capfac"] > 1.2) | (eia_cf["capfac"] < 0)
eia_cf.loc[capfac_out_of_range, "capfac"] = np.nan
eia_cf

In [None]:
# distribution of capacity factors in the EIA-only data; titled and labeled so
# it can be told apart from the CEMS histogram
px.histogram(
    eia_cf,
    x="capfac",
    nbins=15,
    histnorm="percent",
    width=500,
    title="Monthly capacity factor: EIA-only data",
    labels={"capfac": "capacity factor"},
).update_xaxes(dtick=0.05)

In [None]:
# Attach the nameplate capacity to every CEMS row. validate="m:1" matches the
# other merges in this notebook: it raises if the nameplate table has more than
# one row per plant/subplant, which would otherwise duplicate CEMS rows and
# silently inflate the sums below.
cems_cf = cems.merge(
    subplant_nameplate,
    how="left",
    on=["plant_id_eia", "subplant_id"],
    validate="m:1",
)

# Sum generation and capacity over all rows of each subplant. Assuming one row
# per subplant-hour (TODO confirm), the summed capacity is the maximum possible
# MWh over the reported hours.
cems_cf = cems_cf.groupby(["plant_id_eia", "subplant_id"])[
    ["net_generation_mwh", "capacity_mw"]
].sum()

# keep only subplants with a positive summed capacity; copy so that adding a
# column below does not write into a slice of the grouped frame
cems_cf = cems_cf[cems_cf["capacity_mw"] > 0].copy()
cems_cf["capfac"] = cems_cf["net_generation_mwh"] / cems_cf["capacity_mw"]

# Blank out values outside 0..1.2. `np.nan` is used because the `np.NaN` alias
# was removed in NumPy 2.0.
capfac_out_of_range = (cems_cf["capfac"] > 1.2) | (cems_cf["capfac"] < 0)
cems_cf.loc[capfac_out_of_range, "capfac"] = np.nan
cems_cf

In [None]:
# distribution of per-subplant capacity factors in the CEMS data; titled and
# labeled so it can be told apart from the EIA-only histogram
px.histogram(
    cems_cf,
    x="capfac",
    nbins=15,
    histnorm="percent",
    width=500,
    title="Capacity factor by subplant: CEMS data",
    labels={"capfac": "capacity factor"},
).update_xaxes(dtick=0.05)