In [None]:
# import packages
import pandas as pd
import numpy as np
import os
import plotly.express as px

%reload_ext autoreload
%autoreload 2

# Tell Python where to look for the open-grid-emissions source modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import download_data
import load_data
from column_checks import get_dtypes
from filepaths import *
import impute_hourly_profiles
import data_cleaning
import output_data
import emissions
import validation
from gross_to_net_generation import *
import eia930

# Data year to analyze; it is interpolated into the output file names read below.
year = 2021
# Year-specific folder prefix used when building output file paths.
# NOTE(review): this prefix already ends in "/", and the read_csv paths in the
# later cells add another "/" after it, so the resulting paths contain a doubled
# separator — presumably harmless, but confirm against `outputs_folder`.
path_prefix = f"{year}/"

from logging_util import get_logger, configure_root_logger

# Configure logging through the project's logging_util helpers and create the
# logger (named "test") that this notebook uses for its status messages.
configure_root_logger()
logger = get_logger("test")

In [None]:
# Load the two pipeline outputs that the gross-to-net checks compare:
# the CEMS subplant table and the allocated EIA-923 table.
cems = pd.read_csv(
    outputs_folder(f"{path_prefix}/cems_subplant_{year}.csv"),
    dtype=get_dtypes(),
)
eia923_allocated = pd.read_csv(
    outputs_folder(f"{path_prefix}/eia923_allocated_{year}.csv"),
    dtype=get_dtypes(),
)
# The plant_static_attributes table is not read here; a later cell loads it.

In [None]:
# Run the packaged gross-to-net validation from the `validation` module on the
# loaded CEMS and EIA-923 tables.
# NOTE(review): the next cell appears to walk through a similar comparison step
# by step — confirm against validation.py.
validation.validate_gross_to_net_conversion(cems, eia923_allocated)

In [None]:
logger.info(
    "Checking that calculated net generation matches reported net generation in EIA-923...  "
)

# Keys identifying one subplant-month, and the two net generation columns that
# the merge below creates through its suffixes.
_subplant_month_keys = ["plant_id_eia", "subplant_id", "report_date"]
_netgen_columns = ["net_generation_mwh_eia", "net_generation_mwh_calc"]

# EIA-923 side: monthly subplant totals. min_count=1 leaves a group with no
# reported values as NaN (instead of 0) so that it can be dropped afterwards.
eia_netgen = (
    eia923_allocated.groupby(_subplant_month_keys, dropna=False)[
        ["net_generation_mwh"]
    ]
    .sum(min_count=1)
    .reset_index()
    .dropna(subset="net_generation_mwh")
)
# CEMS side: monthly subplant totals of the calculated net generation.
calculated_netgen = (
    cems.groupby(_subplant_month_keys, dropna=False)["net_generation_mwh"]
    .sum()
    .reset_index()
)

# Keep only subplant-months present on both sides, then total both columns per
# plant and round to 3 decimals before comparing.
validated_ng = (
    eia_netgen.merge(
        calculated_netgen,
        how="inner",
        on=_subplant_month_keys,
        suffixes=("_eia", "_calc"),
        validate="1:1",
    )
    .groupby("plant_id_eia")[_netgen_columns]
    .sum()
    .round(3)
)

# Drop plants whose two totals add up to exactly zero.
# NOTE(review): besides plants with zero generation on both sides, this also
# removes plants whose totals are equal and opposite — confirm that is intended.
_totals_nonzero = validated_ng[_netgen_columns].sum(axis=1).ne(0)

# pct_error is the fractional difference relative to the EIA total (not scaled
# by 100); a zero EIA total therefore yields an infinite value.
validated_ng = validated_ng.loc[_totals_nonzero].assign(
    pct_error=lambda totals: totals["net_generation_mwh_calc"]
    .sub(totals["net_generation_mwh_eia"])
    .div(totals["net_generation_mwh_eia"])
)

# Plants where the calculated total differs from the EIA total in any way.
cems_net_not_equal_to_eia = validated_ng.loc[validated_ng["pct_error"].ne(0)]

In [None]:
# For every plant flagged in cems_net_not_equal_to_eia, list the distinct
# gtn_method labels that appear in the CEMS data.
_in_mismatched_plant = cems["plant_id_eia"].isin(list(cems_net_not_equal_to_eia.index))
gtn_method = (
    cems.loc[_in_mismatched_plant, ["plant_id_eia", "gtn_method"]]
    .drop_duplicates()
    # cast to plain strings after de-duplicating, as the last preparation step
    .assign(gtn_method=lambda methods: methods["gtn_method"].astype(str))
)
# One row per plant holding the array of unique labels; the extra "unique"
# column level created by .agg([...]) is dropped for display.
gtn_method.groupby("plant_id_eia").agg(["unique"]).droplevel(level=1, axis=1)

In [None]:
# Collect the unique gtn_method values per flagged plant directly from cems,
# without de-duplicating or casting to str first. This rebinds `gtn_method`,
# and the result keeps the two-level (column, "unique") column index that
# .agg(["unique"]) produces.
_flagged_rows = cems["plant_id_eia"].isin(list(cems_net_not_equal_to_eia.index))
gtn_method = (
    cems.loc[_flagged_rows].groupby("plant_id_eia")[["gtn_method"]].agg(["unique"])
)

In [None]:
# Read the CEMS subplant and allocated EIA-923 outputs again (the same files as
# loaded earlier in the notebook) and additionally load the static plant
# attributes table.
cems = pd.read_csv(
    outputs_folder(f"{path_prefix}/cems_subplant_{year}.csv"),
    dtype=get_dtypes(),
)
eia923_allocated = pd.read_csv(
    outputs_folder(f"{path_prefix}/eia923_allocated_{year}.csv"),
    dtype=get_dtypes(),
)
plant_attributes = pd.read_csv(
    outputs_folder(f"{path_prefix}/plant_static_attributes_{year}.csv"),
    dtype=get_dtypes(),
)

In [None]:
def _monthly_gross_quantile(hourly_gross, q, label):
    """Return the q-quantile of hourly gross generation for each subplant-month.

    The result has one row per (plant_id_eia, subplant_id, report_date) group,
    including groups whose keys are missing, with the quantile stored in a
    column named `label`.
    """
    return (
        hourly_gross.groupby(
            ["plant_id_eia", "subplant_id", "report_date"], dropna=False
        )
        .agg({"gross_generation_mwh": lambda hourly: hourly.quantile(q)})
        .reset_index()
        .rename(columns={"gross_generation_mwh": label})
    )


# Work on an independent copy holding only the columns needed from the hourly
# CEMS data.
gross_gen_data = cems.loc[
    :,
    [
        "plant_id_eia",
        "subplant_id",
        "report_date",
        "datetime_utc",
        "gross_generation_mwh",
    ],
].copy()

# 2nd-percentile hourly gross generation in each month (a robust "minimum")
min_gross = _monthly_gross_quantile(
    gross_gen_data, 0.02, "minimum_gross_generation_mwh"
)
# 98th-percentile hourly gross generation in each month (a robust "maximum")
max_gross = _monthly_gross_quantile(
    gross_gen_data, 0.98, "maximum_gross_generation_mwh"
)
# Nameplate capacity lookup for the analysis year, from the project helper.
subplant_capacity = calculate_subplant_nameplate_capacity(year)

_subplant_month_index = ["plant_id_eia", "subplant_id", "report_date"]

# Collapse the hourly CEMS rows to one row per subplant-month: the number of
# hourly timestamps present and the total gross generation.
gross_gen_data = (
    gross_gen_data.groupby(_subplant_month_index, dropna=False)
    .agg(
        hours_in_month=("datetime_utc", "count"),
        gross_generation_mwh=("gross_generation_mwh", "sum"),
    )
    .reset_index()
)

# Monthly EIA-923 net generation per subplant, using only rows that actually
# report a net generation value.
net_gen_data = (
    eia923_allocated.dropna(subset=["net_generation_mwh"])
    .groupby(_subplant_month_index, dropna=False)[["net_generation_mwh"]]
    .sum()
    .reset_index()
)

# Combine the monthly gross generation (plus its 2nd/98th percentile hourly
# values and the subplant capacity) with the monthly net generation.
# The final merge is an outer join, so subplant-months that appear in only one
# of the two sources are kept; the `data_source` indicator records the origin.
_monthly_merge_keys = ["plant_id_eia", "subplant_id", "report_date"]
combined_gen_data = (
    gross_gen_data.merge(
        min_gross,
        how="left",
        on=_monthly_merge_keys,
        validate="1:1",
    )
    .merge(
        max_gross,
        how="left",
        on=_monthly_merge_keys,
        validate="1:1",
    )
    .merge(
        # capacity has no report_date, so it is matched per subplant
        subplant_capacity,
        how="left",
        on=["plant_id_eia", "subplant_id"],
        validate="m:1",
    )
    .merge(
        net_gen_data,
        how="outer",
        on=_monthly_merge_keys,
        indicator="data_source",
        validate="1:1",
    )
)
# The merge indicator column is categorical. Relabel its categories with
# rename_categories instead of Series.replace: replacing values in a way that
# changes the categories is deprecated in recent pandas releases. Categories not
# listed here ("both") are left unchanged.
combined_gen_data["data_source"] = combined_gen_data[
    "data_source"
].cat.rename_categories({"left_only": "cems_only", "right_only": "eia_only"})

In [None]:
# Display the combined monthly gross/net generation table for inspection.
combined_gen_data

In [None]:
# Run the `validation` module's check on the combined monthly table.
# NOTE(review): judging by its name, this compares where gross and net
# generation are missing or zero — confirm the exact rule in validation.py.
validation.check_missing_or_zero_generation_matches(combined_gen_data)

In [None]:
def _paired_generation_totals(gen_data, group_keys, value_columns):
    """Sum `value_columns` per `group_keys` over rows having both gross and net generation.

    Rows missing either gross_generation_mwh or net_generation_mwh are excluded
    before grouping; groups with missing keys are kept.
    """
    return (
        gen_data.dropna(subset=["gross_generation_mwh", "net_generation_mwh"])
        .groupby(group_keys, dropna=False)[value_columns]
        .sum()
        .reset_index()
    )


def _net_to_gross_ratio(gen_data):
    """Return net / gross generation for each row, with +/-inf replaced by NaN."""
    ratio = gen_data["net_generation_mwh"] / gen_data["gross_generation_mwh"]
    return ratio.replace([np.inf, -np.inf], np.nan)


# Totals at the coarser aggregation levels: subplant-year, plant-month and
# plant-year. All three are computed before any ratio column is added.
annual_subplant_ratio = _paired_generation_totals(
    combined_gen_data,
    ["plant_id_eia", "subplant_id"],
    ["gross_generation_mwh", "net_generation_mwh", "hours_in_month"],
)
monthly_plant_ratio = _paired_generation_totals(
    combined_gen_data,
    ["plant_id_eia", "report_date"],
    ["gross_generation_mwh", "net_generation_mwh"],
)
annual_plant_ratio = _paired_generation_totals(
    combined_gen_data,
    ["plant_id_eia"],
    ["gross_generation_mwh", "net_generation_mwh", "hours_in_month"],
)

# Net-to-gross ratio at each aggregation level. Infinite results (division by a
# zero gross total) are replaced with NaN; missing ratios are left as NaN and
# are not filled with zero.
combined_gen_data["monthly_subplant_ratio"] = _net_to_gross_ratio(combined_gen_data)
annual_subplant_ratio["annual_subplant_ratio"] = _net_to_gross_ratio(
    annual_subplant_ratio
)
monthly_plant_ratio["monthly_plant_ratio"] = _net_to_gross_ratio(monthly_plant_ratio)
annual_plant_ratio["annual_plant_ratio"] = _net_to_gross_ratio(annual_plant_ratio)

In [None]:
# Pass the plant-level annual net-to-gross ratios to the `validation` module.
# NOTE(review): the name suggests it flags plants with anomalous ratios —
# confirm the thresholds and output in validation.py.
validation.identify_anomalous_annual_plant_gtn_ratios(annual_plant_ratio)