In [1]:
# import packages
import numpy as np
import pandas as pd
import plotly.express as px

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys

sys.path.append("../../src")

# import local modules
import oge.load_data as load_data
import oge.data_cleaning as data_cleaning
import oge.gross_to_net_generation as gross_to_net_generation
import oge.impute_hourly_profiles as impute_hourly_profiles


from oge.column_checks import get_dtypes, apply_dtypes

# Data year for this notebook. NOTE(review): the data-loading cell below assigns
# `year` again, so keep the two values in sync when changing it.
year = 2020

In [None]:
# Read the pipeline output tables for the selected year from csv.
year = 2020
path_prefix = ""

# date columns to parse in the hourly tables
hourly_date_columns = ["datetime_utc", "report_date"]

# hourly CEMS data
cems = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/cems_{year}.csv",
    parse_dates=hourly_date_columns,
    dtype=get_dtypes(),
)

# hourly partial-CEMS data that has been scaled
partial_cems_scaled = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/partial_cems_scaled_{year}.csv",
    parse_dates=hourly_date_columns,
    dtype=get_dtypes(),
)

# allocated EIA-923 data (only `report_date` is parsed as a date here)
eia923_allocated = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/eia923_allocated_{year}.csv",
    parse_dates=["report_date"],
    dtype=get_dtypes(),
)

# static plant attributes, read with pandas' default dtype inference
plant_attributes = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/plant_static_attributes_{year}.csv"
)

# Validate that CEMS-calculated net generation matches the net generation reported in EIA-923
The two values are only expected to match for subplants where the two data sources overlap.

In [None]:
# validate method
# Compare the net generation reported in EIA-923 with the net generation calculated
# from CEMS, and summarize the difference for each plant.

subplant_month_keys = ["plant_id_eia", "subplant_id", "report_date"]
netgen_columns = ["net_generation_mwh_eia", "net_generation_mwh_calc"]

# Select the value column BEFORE aggregating. Summing every column and indexing
# afterwards also tries to sum columns such as `datetime_utc`, which raises a
# TypeError on pandas >= 2.0 (nuisance columns are no longer dropped silently).
# merge together monthly subplant totals from EIA and calculated from CEMS
eia_netgen = (
    eia923_allocated.groupby(subplant_month_keys, dropna=False)["net_generation_mwh"]
    .sum(min_count=1)  # all-missing groups stay NaN instead of becoming 0
    .reset_index()
    .dropna(subset=["net_generation_mwh"])
)
calculated_netgen = (
    cems.groupby(subplant_month_keys, dropna=False)["net_generation_mwh"]
    .sum()
    .reset_index()
)

# the inner merge keeps only the subplant-months that exist in both sources
validated_ng = eia_netgen.merge(
    calculated_netgen,
    how="inner",
    on=subplant_month_keys,
    suffixes=("_eia", "_calc"),
)

# total the overlapping subplant-months for each plant; only the two net
# generation columns are aggregated, so `subplant_id` and `report_date` drop out
validated_ng = (
    validated_ng.groupby("plant_id_eia")[netgen_columns].sum().reset_index().round(3)
)

# discard plants whose two totals add up to exactly zero
validated_ng = validated_ng[validated_ng[netgen_columns].sum(axis=1) != 0]

# relative error of the calculated value against the EIA-reported value;
# `.assign` returns a new frame, so nothing is written into a filtered slice.
# A plant with an EIA total of exactly zero yields +/-inf here.
validated_ng = validated_ng.assign(
    pct_error=lambda df: (
        df["net_generation_mwh_calc"] - df["net_generation_mwh_eia"]
    )
    / df["net_generation_mwh_eia"]
)
validated_ng.sort_values(by="pct_error")

# What is the most negative generation that a generator reports on average in a month?

In [None]:
# Convert each generator's monthly net generation into an average hourly rate so
# that the most negative values can be inspected.
negative_hourly = (
    eia923_allocated.copy()[
        ["plant_id_eia", "generator_id", "report_date", "net_generation_mwh"]
    ]
    .dropna()
    # number of hours in each report month
    .assign(hours=lambda df: df.report_date.dt.daysinmonth * 24)
    # monthly total divided by the hours in that month
    .assign(avg_generation_mw=lambda df: df["net_generation_mwh"] / df["hours"])
)

# ascending sort puts the most negative averages first
negative_hourly.sort_values(by="avg_generation_mw")

In [None]:
# Pick one plant and keep only its rows from the CEMS and EIA-923 tables.
plant_to_test = 1391

cems_test = cems.loc[cems["plant_id_eia"].eq(plant_to_test)]
eia_test = eia923_allocated.loc[eia923_allocated["plant_id_eia"].eq(plant_to_test)]

In [None]:
# identify data source
# Label every EIA-923 row of the test plant with the hourly data source that is
# available for its subplant-month: "cems", "eia" or "both".

subplant_month_keys = ["plant_id_eia", "subplant_id", "report_date"]

# Start from a fresh frame without any `hourly_data_source` column. If the input
# already carries one, the merge below would create `_x`/`_y` suffixed columns and
# the later `all_data["hourly_data_source"]` lookups would raise a KeyError.
# `drop` returns a new frame, so `eia_test` itself is left untouched.
all_data = eia_test.drop(columns=["hourly_data_source"], errors="ignore")

# create a binary column indicating whether data was reported in 923
# (1 only when every one of these columns is non-null)
columns_to_test = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
    "co2_mass_lb_adjusted",
]
all_data = all_data.assign(
    reported_eia923=lambda x: np.where(x[columns_to_test].notnull().all(axis=1), 1, 0)
)

# one row per subplant-month that has CEMS data; only the timestamp column is
# counted instead of counting every column and discarding the rest
cems_monthly = (
    cems_test.groupby(subplant_month_keys, dropna=False)["datetime_utc"]
    .count()
    .reset_index()
)
cems_monthly["hourly_data_source"] = "cems"

# merge in the data source column from CEMS
all_data = all_data.merge(
    cems_monthly[[*subplant_month_keys, "hourly_data_source"]],
    how="left",
    on=subplant_month_keys,
    validate="m:1",  # each subplant-month may appear only once on the CEMS side
)

# for the remaining plants, identify the hourly data source as EIA
all_data["hourly_data_source"] = all_data["hourly_data_source"].fillna("eia")

# subplant-months with CEMS data AND a reported EIA net generation are in both
all_data.loc[
    (all_data["hourly_data_source"] == "cems")
    & (all_data["net_generation_mwh"].notna()),
    "hourly_data_source",
] = "both"

all_data

In [None]:
# Convert gross to net generation for the test plant by applying the conversion
# factors in a fixed order of preference and recording which one was used.
gtn_conversions = gross_to_net_generation.calculate_gross_to_net_conversion_factors(
    cems_test, eia_test, plant_attributes, year
)

factors_to_use = gross_to_net_generation.filter_gtn_conversion_factors(gtn_conversions)

merge_keys = ["plant_id_eia", "subplant_id", "report_date"]
factor_columns = [
    "annual_subplant_shift_mw",
    "annual_plant_shift_mw",
    "annual_subplant_ratio",
    "annual_plant_ratio",
    "annual_fleet_ratio",
]

# merge the conversion factors we want to use into the cems_test data.
# `data_source` is kept on `cems_test` after this cell, so re-running the cell
# would otherwise merge it a second time and produce `data_source_x` /
# `data_source_y`. Dropping stale copies first makes the cell safe to re-run.
cems_test = cems_test.drop(
    columns=["data_source", *factor_columns], errors="ignore"
).merge(
    factors_to_use[[*merge_keys, "data_source", *factor_columns]],
    how="left",
    on=merge_keys,
)

# Candidate net generation estimates, most preferred first. Shift factors are
# added to gross generation; ratios are multiplied by it.
gross_generation = cems_test["gross_generation_mwh"]
gtn_hierarchy = [
    (
        "1_annual_subplant_shift_factor",
        gross_generation + cems_test["annual_subplant_shift_mw"],
    ),
    (
        "2_annual_subplant_ratio",
        gross_generation * cems_test["annual_subplant_ratio"],
    ),
    (
        "3_annual_plant_shift_factor",
        gross_generation + cems_test["annual_plant_shift_mw"],
    ),
    ("4_annual_plant_ratio", gross_generation * cems_test["annual_plant_ratio"]),
    ("5_annual_fleet_ratio", gross_generation * cems_test["annual_fleet_ratio"]),
    ("6_gross_equals_net", gross_generation),
]

# start with the preferred method everywhere ...
preferred_method, preferred_estimate = gtn_hierarchy[0]
cems_test["gtn_method"] = preferred_method
cems_test["net_generation_mwh"] = preferred_estimate

# ... then, for rows that are still missing a value, relabel them with the next
# method and fill from its estimate. Rows the last method cannot fill stay NaN
# but are labelled with that last method.
for fallback_method, fallback_estimate in gtn_hierarchy[1:]:
    still_missing = cems_test["net_generation_mwh"].isna()
    cems_test.loc[still_missing, "gtn_method"] = fallback_method
    cems_test["net_generation_mwh"] = cems_test["net_generation_mwh"].fillna(
        fallback_estimate
    )

# drop intermediate columns (`data_source` is kept for the summaries below)
cems_test = cems_test.drop(columns=factor_columns)

In [None]:
# display the filtered conversion factors that were merged into `cems_test`
factors_to_use

In [None]:
# Total gross and net generation in the CEMS test data by data source and subplant.
# The value columns are selected before summing: `cems_test` also holds a datetime
# column (`datetime_utc`), and summing every column raises a TypeError on
# pandas >= 2.0.
cems_test.groupby(["data_source", "subplant_id"], dropna=False)[
    ["gross_generation_mwh", "net_generation_mwh"]
].sum().reset_index()

In [None]:
# Total EIA-923 net generation for the test plant by hourly data source and subplant.
# Only `net_generation_mwh` is summed: `eia_test` also holds the datetime column
# `report_date`, and summing every column raises a TypeError on pandas >= 2.0.
# NOTE(review): this assumes `eia_test` already contains an `hourly_data_source`
# column; if it does not, group `all_data` instead - confirm which is intended.
eia_test.groupby(["hourly_data_source", "subplant_id"], dropna=False)[
    "net_generation_mwh"
].sum().reset_index()

In [None]:
# Total net generation in the conversion factor table by data source and subplant.
# The value column is selected before summing so that unrelated columns (for
# example dates or text) are never aggregated, which can fail on pandas >= 2.0.
factors_to_use.groupby(["data_source", "subplant_id"], dropna=False)[
    "net_generation_mwh"
].sum().reset_index()

# Examine hourly profiles

In [None]:
# Load the hourly profiles from the same output folder as the other tables.
# `path_prefix` is included so that this read follows the other loads when a
# prefixed output directory is selected (it is "" by default, so the path is
# unchanged in that case).
hourly_profiles = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/hourly_profiles_{year}.csv"
)

In [None]:
# Plot every profile column over time for coal in the MISO balancing area.
is_coal = hourly_profiles["fuel_category"] == "coal"
is_miso = hourly_profiles["ba_code"] == "MISO"
data_to_graph = hourly_profiles[is_coal & is_miso]

# profile columns to draw, one line each
profile_columns = [
    "eia930_profile",
    "cems_profile",
    "residual_profile",
    "scaled_residual_profile",
    "shifted_residual_profile",
    "imputed_profile",
    "profile",
]

px.line(data_to_graph, x="datetime_utc", y=profile_columns)

In [None]:
# Plot hourly gross and net generation for a single subplant of one plant.
plant_to_test = 3399
subplant = 1

matches_plant = cems["plant_id_eia"] == plant_to_test
matches_subplant = cems["subplant_id"] == subplant
cems_to_graph = cems[matches_plant & matches_subplant]

px.line(
    cems_to_graph,
    x="datetime_utc",
    y=["gross_generation_mwh", "net_generation_mwh"],
)