In [None]:
# import packages
import pandas as pd
import numpy as np


%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys

sys.path.append("../../src/")

import oge.load_data as load_data
from oge.filepaths import *
from oge.logging_util import get_logger, configure_root_logger

import pudl.analysis.allocate_gen_fuel as allocate_gen_fuel


# Configure the project's root logger, then grab a named logger for this notebook.
configure_root_logger()
logger = get_logger("test")


# Data year to analyze; passed to every `load_pudl_table` call in the load cell.
year = 2024
# NOTE(review): `path_prefix` is not referenced in the visible cells — confirm it
# is still needed.
path_prefix = f"{year}/"

In [None]:
# Load the PUDL tables used to allocate fuel and generation across each
# generator / prime mover / energy source combination.
gf, bf, gen, gens, bga = (
    load_data.load_pudl_table(table_name, year)
    for table_name in (
        "out_eia923__monthly_generation_fuel_combined",
        "out_eia923__monthly_boiler_fuel",
        "out_eia923__monthly_generation",
        "out_eia__yearly_generators",
        "core_eia860__assn_boiler_generator",
    )
)

# NOTE: As of 12/7/2024, a bug in the PUDL data introduces incorrect generators.
# See: https://github.com/catalyst-cooperative/pudl/issues/3987
# The workaround is to filter `gens` to remove rows with a missing
# "data_maturity" value.
# As of 11/27/25 this problem still exists in the table.
# NOTE(review): the workaround below is commented out even though the note above
# says it is still needed — confirm that running without it is intentional.
# gens = gens[~gens["data_maturity"].isna()]

gf, bf, gen, bga, gens = allocate_gen_fuel.select_input_data(
    gf=gf, bf=bf, gen=gen, bga=bga, gens=gens
)

# Frequency string handed to `standardize_input_frequency`
# (presumably the pandas month-start alias — TODO confirm).
freq = "MS"
bf, gens_at_freq, gen = allocate_gen_fuel.standardize_input_frequency(
    bf, gens, gen, freq
)
# Add any startup energy source codes to the list of energy source codes:
# first adjust the MSW codes, then add the codes missing from the generators.
gens_at_freq = allocate_gen_fuel.add_missing_energy_source_codes_to_gens(
    allocate_gen_fuel.adjust_msw_energy_source_codes(gens_at_freq, gf, bf),
    gf,
    bf,
)

# Later cells refer to the frequency-standardized generators simply as `gens`.
gens = gens_at_freq

In [None]:
# Stack the generators so each energy source code gets its own row.
stack_gens = allocate_gen_fuel.apply_pudl_dtypes(
    allocate_gen_fuel.stack_generators(
        gens, cat_col="energy_source_code_num", stacked_col="energy_source_code"
    ),
    group="eia",
)
# allocate the boiler fuel data to generators; data columns get a "_bf_tbl" suffix
bf_by_gens = allocate_gen_fuel.apply_pudl_dtypes(
    allocate_gen_fuel.allocate_bf_data_to_gens(bf, gens, bga)
    .set_index(allocate_gen_fuel.IDX_GENS_PM_ESC)
    .add_suffix("_bf_tbl")
    .reset_index(),
    group="eia",
)
# Keep only the data columns of the generation fuel table and tag them "_gf_tbl".
# NOTE(review): `gf` is overwritten with its suffixed form here, so re-running
# this cell without re-running the load cell will presumably fail to find the
# unsuffixed data columns.
gf = allocate_gen_fuel.apply_pudl_dtypes(
    gf.set_index(allocate_gen_fuel.IDX_PM_ESC)[allocate_gen_fuel.DATA_COLUMNS]
    .add_suffix("_gf_tbl")
    .reset_index(),
    group="eia",
)

In [None]:
# Build the generator association table:
#   1. outer-join the stacked generators with the generation table (its net
#      generation column renamed with a "_g_tbl" suffix) on IDX_GENS;
#   2. outer-join the result with the suffixed generation fuel table on
#      IDX_PM_ESC. `validate="m:1"` requires the gf keys to be unique, and
#      `indicator=True` adds a `_merge` column recording each row's origin.
gen_assoc = stack_gens.merge(
    gen.rename(columns={"net_generation_mwh": "net_generation_mwh_g_tbl"}),
    on=allocate_gen_fuel.IDX_GENS,
    how="outer",
).merge(
    gf,
    on=allocate_gen_fuel.IDX_PM_ESC,
    how="outer",
    validate="m:1",
    indicator=True,
)

In [None]:
# Flag rows whose plant / report date / prime mover / energy source group
# contains exactly one distinct generator_id.
gen_assoc["gf_unique_to_gen"] = (
    gen_assoc.groupby(
        ["plant_id_eia", "report_date", "prime_mover_code", "energy_source_code"]
    )["generator_id"]
    .transform("nunique")
    .eq(1)
)


In [None]:
# Inspect every association row for plant 55088, ordered by generator and month.
gen_assoc[gen_assoc["plant_id_eia"].eq(55088)].sort_values(
    ["generator_id", "report_date"]
)

In [None]:
# this is the query we want to use for retiring gens:
# retired generators that are either reported on or before their retirement date,
# have a value in the generation table, or are the only generator in their
# plant / date / prime mover / energy source group and that group has net
# generation or fuel consumption data.
gen_assoc[
    gen_assoc["operational_status"].eq("retired")
    & (
        gen_assoc["report_date"].le(gen_assoc["generator_retirement_date"])
        | gen_assoc["net_generation_mwh_g_tbl"].notna()
        | (
            gen_assoc["gf_unique_to_gen"]
            & (
                gen_assoc["net_generation_mwh_gf_tbl"].notna()
                | gen_assoc.filter(like="fuel_consumed_").notna().any(axis=1)
            )
        )
    )
].sort_values(["plant_id_eia", "report_date"])

In [None]:
# Proposed generators that already show activity: either a value in the
# generation table, or they are the only generator in their plant / date /
# prime mover / energy source group and that group has net generation or fuel
# consumption data.
proposed_generators = gen_assoc.loc[
    gen_assoc["operational_status"].eq("proposed")
    & (
        gen_assoc["net_generation_mwh_g_tbl"].notna()
        | (
            gen_assoc["gf_unique_to_gen"]
            & (
                gen_assoc["net_generation_mwh_gf_tbl"].notna()
                | gen_assoc.filter(like="fuel_consumed_").notna().any(axis=1)
            )
        )
    )
]
proposed_generators

In [None]:
# Non-existing generators that are alone in their plant / date / prime mover /
# energy source group, have positive net generation in at least one
# "net_generation_mwh*" column, and are reported after their retirement date.
gen_assoc[
    gen_assoc["gf_unique_to_gen"]
    & gen_assoc["operational_status"].ne("existing")
    & gen_assoc.filter(like="net_generation_mwh").gt(0).any(axis=1)
    & gen_assoc["report_date"].gt(gen_assoc["generator_retirement_date"])
].sort_values(["plant_id_eia", "report_date"])

In [None]:
# Show only the columns whose names contain "net_generation_mwh".
# NOTE(review): this renders the full row set — consider `.head()` before sharing.
gen_assoc.filter(like="net_generation_mwh")

In [None]:
# Split the association table into status-based subsets using the PUDL helpers.
existing = gen_assoc.loc[gen_assoc["operational_status"].eq("existing")]

retiring_generators = gen_assoc.pipe(allocate_gen_fuel.identify_retiring_generators)

retired_plants = gen_assoc.pipe(allocate_gen_fuel.identify_retired_plants)

proposed_generators = gen_assoc.pipe(
    allocate_gen_fuel.identify_generators_coming_online
)

proposed_plants = gen_assoc.pipe(allocate_gen_fuel.identify_proposed_plants)

# Rows with no generator_id attached.
unassociated_plants = gen_assoc[gen_assoc["generator_id"].isna()]

In [None]:
# Inspect the proposed-plant rows for plant 57991.
proposed_plants[proposed_plants["plant_id_eia"].eq(57991)]

In [None]:
def identify_retiring_generators(gen_assoc: pd.DataFrame) -> pd.DataFrame:
    """Return all records of generators that count as retiring in a report year.

    A (plant, generator, report year) combination is selected when at least one
    of its rows is labeled ``"retired"`` and additionally either:

    * is reported on or before the generator's retirement date, or
    * has a non-null value in any column whose name contains
      ``net_generation_mwh``, or
    * has a non-null value in any column whose name contains ``fuel_consumed``.

    Every row of ``gen_assoc`` belonging to a selected combination is returned,
    not just the rows that triggered the selection.

    Args:
        gen_assoc: Generator association table. Must provide ``report_date``
            (datetime), ``operational_status``, ``generator_retirement_date``,
            ``plant_id_eia`` and ``generator_id`` columns.

    Returns:
        The matching rows of ``gen_assoc`` with the original columns and a fresh
        index produced by the merge. The input frame is not modified.
    """
    identity_cols = ["plant_id_eia", "generator_id", "report_year"]
    # The report year is only a temporary join key; it is dropped before returning.
    with_year = gen_assoc.assign(report_year=gen_assoc.report_date.dt.year)

    labeled_retired = with_year.operational_status == "retired"
    on_or_before_retirement = (
        with_year.report_date <= with_year.generator_retirement_date
    )
    # Substring matches: any column containing these names counts, whichever
    # table suffix it carries.
    reports_generation = (
        with_year.filter(like="net_generation_mwh").notnull().any(axis=1)
    )
    reports_fuel = with_year.filter(like="fuel_consumed").notnull().any(axis=1)

    # The complete set of generator ids that retire mid-year or that report
    # fuel or generation while being labeled as retired.
    retiring_ids = with_year.loc[
        labeled_retired
        & (on_or_before_retirement | reports_generation | reports_fuel),
        identity_cols,
    ].drop_duplicates()

    # Inner-join back so all months of data for those generators are kept.
    return with_year.merge(retiring_ids, how="inner", on=identity_cols).drop(
        columns=["report_year"]
    )

# Recompute with the notebook-local function defined in this cell; this
# overwrites the `retiring_generators` produced earlier by the PUDL helper.
retiring_generators = gen_assoc.pipe(identify_retiring_generators)

In [None]:
# Inspect the retiring-generator rows for plant 1.
retiring_generators[retiring_generators["plant_id_eia"].eq(1)]