In [None]:
# import packages
import pandas as pd
import numpy as np
import os
import plotly.express as px

# # Tell python where to look for modules.
import sys

sys.path.append("../../src/")

%reload_ext autoreload
%autoreload 2
import oge.load_data as load_data
from oge.filepaths import *
import oge.data_cleaning as data_cleaning
from oge.logging_util import get_logger, configure_root_logger

# Configure logging through the oge helpers imported above, then create the logger
# named "test" for this notebook.
configure_root_logger()
logger = get_logger("test")


In [None]:
# NOTE: Need to introduce the concept of a static earliest year

# The earliest_year represents the earliest year that the OGE dataset will ever cover.
# Data back to 2001 is available in PUDL, but EIA-860 files prior to 2004 were
# distributed in a different format that may not be consistent with later files, and
# may use different primary keys for generators.
# https://catalystcoop-pudl.readthedocs.io/en/v2023.12.01/data_sources/eia860.html#how-much-of-the-data-is-accessible-through-pudl
earliest_year = 2005
# The data year being processed; it is the upper bound of the year filters below.
year = 2022

# Year-specific path prefix.
path_prefix = f"{year}/"

In [None]:

# Although we could directly load all years at once from the CEMS parquet file, this
# would lead to a MemoryError, so we load one year at a time and drop duplicates
# before concatenating the next year to the running set of IDs.
cems_ids = []
# Start at `earliest_year`, the first year the dataset is meant to cover.
# NOTE: the loop variable is deliberately NOT called `year`. Reusing that name would
# overwrite the notebook-wide `year`, and if the loop were interrupted part-way
# (e.g. by a MemoryError) every later cell would silently use the wrong year.
for cems_year in range(earliest_year, year + 1):
    cems_id_year = pd.read_parquet(
        downloads_folder("pudl/hourly_emissions_epacems.parquet"),
        filters=[["year", "==", cems_year]],
        columns=["plant_id_epa", "plant_id_eia", "emissions_unit_id_epa"],
    ).drop_duplicates()
    cems_ids.append(cems_id_year)
    # collapse the list back to a single de-duplicated frame to keep memory low
    cems_ids = [pd.concat(cems_ids, axis=0).drop_duplicates()]
cems_ids = (
    pd.concat(cems_ids, axis=0)
    .drop_duplicates()
    .sort_values(by=["plant_id_eia", "emissions_unit_id_epa"])
)

In [None]:
#######################################################
# generate_subplant_ids()

import pudl.analysis.epacamd_eia as epacamd_eia

# Load the EPA-EIA crosswalk for the data year. It is filtered in the next step
# against the unit IDs found in CEMS.
crosswalk = load_data.load_epa_eia_crosswalk(year)

# Filter the crosswalk to drop any units that don't exist in CEMS, using the
# multi-year `cems_ids` table built above.
filtered_crosswalk = epacamd_eia.filter_crosswalk(crosswalk, cems_ids)

In [None]:
######################################################
from pudl.etl.glue_assets import make_subplant_ids

# Run PUDL's graph analysis on the filtered crosswalk to assign subplant ids.
crosswalk_with_subplant_ids = make_subplant_ids(filtered_crosswalk)

# Store the EIA plant id as an integer.
eia_plant_ids = crosswalk_with_subplant_ids["plant_id_eia"]
crosswalk_with_subplant_ids["plant_id_eia"] = eia_plant_ids.astype(int)

# Keep only the id columns, in a fixed order.
crosswalk_column_order = [
    "plant_id_epa",
    "emissions_unit_id_epa",
    "plant_id_eia",
    "generator_id",
    "subplant_id",
]
crosswalk_with_subplant_ids = crosswalk_with_subplant_ids[crosswalk_column_order]

In [None]:
# Columns to pull from the denormalized EIA generators table: the generator keys,
# its unit id, its status, and the dates used later for filtering and sorting.
generator_columns = [
    "report_date",
    "plant_id_eia",
    "generator_id",
    "unit_id_pudl",
    "operational_status_code",
    "generator_operating_date",
    "generator_retirement_date",
    "original_planned_generator_operating_date",
    "current_planned_generator_operating_date",
]
complete_gens = load_data.load_pudl_table(
    "denorm_generators_eia", columns=generator_columns
)

In [None]:
# Spot-check every reported record for generator "2" at plant 136.
complete_gens.loc[
    lambda gens: (gens["plant_id_eia"] == 136) & (gens["generator_id"] == "2")
]

In [None]:
# Reload the generator records so this cell always starts from the unmodified table
# and can be re-run on its own.
complete_gens = load_data.load_pudl_table(
    "denorm_generators_eia",
    columns=[
        "report_date",
        "plant_id_eia",
        "generator_id",
        "unit_id_pudl",
        "operational_status_code",
        "generator_operating_date",
        "generator_retirement_date",
        "original_planned_generator_operating_date",
        "current_planned_generator_operating_date",
    ],
)


# drop any data that was reported prior to the earliest year
# only keep data for years <= the year
# this avoids using potentially preliminary early-release data
report_year = complete_gens["report_date"].dt.year
complete_gens = complete_gens[(report_year >= earliest_year) & (report_year <= year)]

# Put each generator's records in chronological order BEFORE forward filling.
# ffill() propagates values in row order, so without this sort the fill direction
# would depend on however the table happens to be ordered when loaded.
complete_gens = complete_gens.sort_values(
    by=["plant_id_eia", "generator_id", "report_date"], ascending=True
)

# for any retired gens, forward fill the most recently available unit_id_pudl to the
# most recent available year
complete_gens["unit_id_pudl"] = complete_gens.groupby(["plant_id_eia", "generator_id"])[
    "unit_id_pudl"
].ffill()

# only keep the most recent entry for each generator (rows are already sorted by
# report_date within each generator, so the last row is the latest report)
complete_gens = complete_gens.drop_duplicates(
    subset=["plant_id_eia", "generator_id"], keep="last"
)

# remove generators that are proposed but not yet under construction, or cancelled
status_codes_to_remove = ["CN", "IP", "P", "L", "T"]
complete_gens = complete_gens[
    ~complete_gens["operational_status_code"].isin(status_codes_to_remove)
]

# remove generators that retired prior to the earliest year
complete_gens = complete_gens[
    ~(
        (complete_gens["operational_status_code"] == "RE")
        & (complete_gens["generator_retirement_date"].dt.year < earliest_year)
    )
]

# remove generators that have no operating or retirement date, and the last time they
# reported data was prior to the current year. This is often proposed plants that are
# assigned a new plant_id_eia once operational
complete_gens = complete_gens[
    ~(
        (complete_gens["generator_operating_date"].isna())
        & (complete_gens["generator_retirement_date"].isna())
        & (complete_gens["report_date"].dt.year < year)
    )
]

In [None]:
# load unit codes for proposed generators
# "Generator ID" is read explicitly as a string: without a dtype, numeric-looking ids
# (e.g. 1) may be parsed as integers, which would not match the string generator_id
# values in complete_gens when merging below.
proposed_unit_ids = (
    pd.read_excel(
        downloads_folder(f"eia860/eia860{year}/3_1_Generator_Y{year}.xlsx"),
        sheet_name="Proposed",
        header=1,
        usecols=["Plant Code", "Generator ID", "Unit Code"],
        dtype={"Generator ID": str},
    )
    # only generators that have a unit code are of interest
    .dropna(subset="Unit Code")
    .rename(
        columns={
            "Plant Code": "plant_id_eia",
            "Generator ID": "generator_id",
            "Unit Code": "unit_id_eia",
        }
    )
)
# create a numeric version of the ID, starting at 1 within each plant
proposed_unit_ids["unit_id_eia_numeric"] = proposed_unit_ids.groupby(["plant_id_eia"])[
    "unit_id_eia"
].transform(lambda x: pd.factorize(x)[0] + 1)

# merge the proposed unit ids into the complete_gens
complete_gens = complete_gens.merge(
    proposed_unit_ids, how="left", on=["plant_id_eia", "generator_id"], validate="1:1"
)
# update the unit_id_eia_numeric to be one higher than the highest existing unit_id_pudl
# (plants with no unit_id_pudl at all start from 0)
# if unit_id_eia_numeric is NA, the updated value should also still be NA
highest_existing_unit_id = (
    complete_gens.groupby("plant_id_eia")["unit_id_pudl"].transform("max").fillna(0)
)
complete_gens["unit_id_eia_numeric"] = (
    complete_gens["unit_id_eia_numeric"] + highest_existing_unit_id
)

# fill in missing unit_id_pudl with the updated values
complete_gens["unit_id_pudl"] = complete_gens["unit_id_pudl"].fillna(
    complete_gens["unit_id_eia_numeric"]
)

# drop the helper columns now that unit_id_pudl has been filled
complete_gens = complete_gens.drop(columns=["unit_id_eia", "unit_id_eia_numeric"])

In [None]:
# Unique generator-to-unit id combinations from the cleaned generator table.
generator_id_columns = ["plant_id_eia", "generator_id", "unit_id_pudl"]
complete_generator_ids = complete_gens.loc[:, generator_id_columns].drop_duplicates()

In [None]:
##################################################
# Update the subplant crosswalk to ensure completeness: add every generator in
# complete_gens and every CEMS unit, whether or not it appears in the crosswalk.
# The full complete_gens table is merged (not just its id columns), so the unit id
# and the date columns come along with it.
generator_keys = ["plant_id_eia", "generator_id"]
cems_unit_keys = ["plant_id_eia", "emissions_unit_id_epa"]

# complete list of CEMS emissions_unit_id_epa for each plant
cems_unit_ids = cems_ids[cems_unit_keys].drop_duplicates()

subplant_crosswalk_complete = (
    crosswalk_with_subplant_ids
    # add the complete list of generators (with unit_id_pudl)
    .merge(complete_gens, how="outer", on=generator_keys, validate="m:1")
    # also add the complete list of CEMS units
    .merge(cems_unit_ids, how="outer", on=cems_unit_keys, validate="m:1")
)

In [None]:
#NOTE: plant_parts_eia does not contain all of the unit_id_pudl
# we need to investigate another source (plant 613)

# instead we will use denorm_generators_eia

In [None]:
#NOTE: For any generators that are proposed but not existing, we should always sort these last in case they are 
# never completed so that they do not mess up the order
# eg plant 64811 UPT7

In [None]:
# Build a single date to order generators by: the operating date when available,
# otherwise the retirement date, otherwise the originally planned operating date.
sort_date = subplant_crosswalk_complete["generator_operating_date"]
for fallback_date_column in [
    "generator_retirement_date",
    "original_planned_generator_operating_date",
]:
    sort_date = sort_date.fillna(subplant_crosswalk_complete[fallback_date_column])
subplant_crosswalk_complete["sort_date"] = sort_date

In [None]:
# Rows for which no sort_date could be determined.
subplant_crosswalk_complete.loc[lambda xwalk: xwalk["sort_date"].isna()]

In [None]:
# Rows with neither an operating nor a retirement date whose most recent report is
# older than the data year.
# (An additional filter on current_planned_generator_operating_date >= year was tried
# here and is currently disabled.)
no_operating_date = subplant_crosswalk_complete["generator_operating_date"].isna()
no_retirement_date = subplant_crosswalk_complete["generator_retirement_date"].isna()
reported_before_year = subplant_crosswalk_complete["report_date"].dt.year < year
subplant_crosswalk_complete[
    no_operating_date & no_retirement_date & reported_before_year
]

In [None]:
# First 20 rows for which no sort_date could be determined.
subplant_crosswalk_complete.loc[lambda xwalk: xwalk["sort_date"].isna()].head(20)

In [None]:
# Sort into a fixed, repeatable order: by plant, then operating date, then
# retirement date, then generator id.
static_sort_keys = [
    "plant_id_eia",
    "generator_operating_date",
    "generator_retirement_date",
    "generator_id",
]
subplant_crosswalk = subplant_crosswalk_complete.sort_values(
    by=static_sort_keys, ascending=True
).copy()

In [None]:
# Load generator commercial operating dates from the generator entity table.
generator_entity_dates = load_data.load_pudl_table(
    "generators_entity_eia",
    columns=[
        "plant_id_eia",
        "generator_id",
        "generator_operating_date",
        "original_planned_generator_operating_date",
    ],
)

gen_cods = (
    generator_entity_dates.sort_values(by=["plant_id_eia", "generator_id"])
    # fill missing dates with the planned operating date (for proposed generators)
    .assign(
        generator_operating_date=lambda gens: gens["generator_operating_date"].fillna(
            gens["original_planned_generator_operating_date"]
        )
    )
    # the planned date is only needed for the fill above
    .drop(columns=["original_planned_generator_operating_date"])
)

In [None]:
# Merge the dates into the dataframe.
# subplant_crosswalk_complete may already carry a generator_operating_date column
# (it does when complete_gens was merged in above). Merging gen_cods as-is would then
# produce generator_operating_date_x / _y and leave no plain generator_operating_date
# column for later sorting. The incoming column is therefore renamed before the merge
# and only used to fill gaps, which also makes this cell safe to re-run.
merged_crosswalk = subplant_crosswalk_complete.merge(
    gen_cods.rename(columns={"generator_operating_date": "entity_operating_date"}),
    how="left",
    on=["plant_id_eia", "generator_id"],
    validate="m:1",
)
if "generator_operating_date" in merged_crosswalk.columns:
    # keep the existing date, falling back to the entity-table date where missing
    merged_crosswalk["generator_operating_date"] = merged_crosswalk[
        "generator_operating_date"
    ].fillna(merged_crosswalk["entity_operating_date"])
    merged_crosswalk = merged_crosswalk.drop(columns=["entity_operating_date"])
else:
    # no existing date column: the entity-table date becomes the operating date
    merged_crosswalk = merged_crosswalk.rename(
        columns={"entity_operating_date": "generator_operating_date"}
    )
subplant_crosswalk_complete = merged_crosswalk

In [None]:
# Order the crosswalk by plant, then operating date, then generator id.
operating_date_sort_keys = ["plant_id_eia", "generator_operating_date", "generator_id"]
subplant_crosswalk = subplant_crosswalk_complete.sort_values(
    by=operating_date_sort_keys, ascending=True
).copy()

In [None]:
# Inspect the sorted crosswalk for plant 422.
subplant_crosswalk.loc[lambda xwalk: xwalk["plant_id_eia"] == 422]

In [None]:
# NOTE: connect_ids() seems to be incorrectly grouping ids
# NOTE: subset should probably be [plant_id_eia, id_to_update]

In [None]:
def connect_ids(df, id_to_update, connecting_id):
    """Group together ids that are linked by a shared id in another column.

    Within each plant, if a single ``connecting_id`` value is paired with more than
    one ``id_to_update`` value, all of those ``id_to_update`` values are replaced by
    the lowest one in that group. For example:

    if multiple subplant_id are connected by a single unit_id_pudl, this groups these
    subplant_id together
    if multiple unit_id_pudl are connected by a single subplant_id, this groups these
    unit_id_pudl together

    The grouped ids are written to a new ``{id_to_update}_connected`` column; the
    ``id_to_update`` column itself is not modified.

    Args:
        df: dataframe containing ``plant_id_eia``, ``id_to_update`` and
            ``connecting_id`` columns. Note that the ``{id_to_update}_connected``
            column is also added to this input dataframe in place.
        id_to_update: name of the column whose ids should be grouped together.
        connecting_id: name of the column whose ids link values of ``id_to_update``.

    Returns:
        The dataframe with the additional ``{id_to_update}_connected`` column. When
        no ids need to be connected this is the input dataframe itself; otherwise it
        is a new dataframe produced by a merge.
    """
    connected_col = f"{id_to_update}_connected"
    replacement_col = f"{id_to_update}_to_replace"

    # get a table with all unique pairs of the two ids within each plant
    id_pairs = df[["plant_id_eia", id_to_update, connecting_id]].drop_duplicates()

    # identify any non-NA connecting_id that appears more than once within a plant,
    # which indicates that it is associated with multiple id_to_update
    duplicates = id_pairs[
        id_pairs.duplicated(subset=["plant_id_eia", connecting_id], keep=False)
        & id_pairs[connecting_id].notna()
    ].copy()

    # by default the connected id is just the existing id
    df[connected_col] = df[id_to_update]
    if len(duplicates) > 0:
        # For each (plant, connecting_id) group, find the lowest id_to_update.
        # transform("min") returns one value per row, aligned to `duplicates`, so
        # every group gets its own minimum. (Taking `.min().iloc[0]` instead would
        # assign the first group's minimum to every row of every plant.)
        duplicates[replacement_col] = duplicates.groupby(
            ["plant_id_eia", connecting_id]
        )[id_to_update].transform("min")
        # merge the replacement id into the dataframe and use it to overwrite the
        # connected id wherever a replacement exists
        df = df.merge(
            duplicates,
            how="left",
            on=["plant_id_eia", id_to_update, connecting_id],
            validate="m:1",
        )
        # update() only overwrites with non-NA values, so rows without a replacement
        # keep their original id
        df.update({connected_col: df[replacement_col]})
        df = df.drop(columns=replacement_col)
    return df

In [None]:
##################################
# Step-by-step version of connect_ids() for inspection. It works on a copy, so
# subplant_crosswalk itself is not modified, and the helper "_to_replace" column is
# kept in `df` so the replacement values can be examined.
df = subplant_crosswalk.copy()
id_to_update = "subplant_id"
connecting_id = "unit_id_pudl"

# get a table with all unique subplant to unit pairs
subplant_unit_pairs = df[
    ["plant_id_eia", "subplant_id", "unit_id_pudl"]
].drop_duplicates()

# identify any non-NA connecting_id that appears more than once within a plant,
# which indicates that it is associated with multiple id_to_update
duplicates = subplant_unit_pairs[
    (subplant_unit_pairs.duplicated(subset=["plant_id_eia", connecting_id], keep=False))
    & (~subplant_unit_pairs[connecting_id].isna())
].copy()

# if there are any duplicate units, indicating an incorrect id_to_update,
# fix the id_to_update
df[f"{id_to_update}_connected"] = df[id_to_update]
if len(duplicates) > 0:
    # find the lowest number subplant id associated with each duplicated
    # unit_id_pudl. transform("min") gives each (plant, connecting_id) group its own
    # minimum; `.min().iloc[0]` would give every row the first group's minimum.
    duplicates.loc[:, f"{id_to_update}_to_replace"] = duplicates.groupby(
        ["plant_id_eia", connecting_id]
    )[id_to_update].transform("min")
    # merge this replacement subplant_id into the dataframe and use it to update
    # the existing subplant id
    df = df.merge(
        duplicates,
        how="left",
        on=["plant_id_eia", id_to_update, connecting_id],
        validate="m:1",
    )
    df.update({f"{id_to_update}_connected": df[f"{id_to_update}_to_replace"]})

In [None]:
#####################################
# update_subplant_ids()

# Step 1: Create corrected versions of subplant_id and unit_id_pudl.
# First pass: where one subplant_id links several unit_id_pudl, the new
# unit_id_pudl_connected column groups those unit_id_pudl together.
subplant_crosswalk = connect_ids(
    df=subplant_crosswalk,
    id_to_update="unit_id_pudl",
    connecting_id="subplant_id",
)

In [None]:
# Inspect plant 422 after connecting the unit ids.
subplant_crosswalk.loc[lambda xwalk: xwalk["plant_id_eia"] == 422]

In [None]:

# Second pass: where one unit_id_pudl links several subplant_id, the new
# subplant_id_connected column groups those subplant_id together.
subplant_crosswalk = connect_ids(
    df=subplant_crosswalk,
    id_to_update="subplant_id",
    connecting_id="unit_id_pudl",
)

In [None]:
# Inspect plant 880110 after connecting the subplant ids.
subplant_crosswalk.loc[lambda xwalk: xwalk["plant_id_eia"] == 880110]

In [None]:
# NOTE: we have to add sort=False to groupby so that it keeps the values in index order
# NOTE: ngroup() creates a unique number across the entire dataframe, not per group
# pd.factorize() instead creates unique IDs within each group

In [None]:
# NOTE: use_na_sentinel=False creates the same generator ID for all Na values
# so this will lead to incorrect subplant groupings. Eg plant 880110 has no generator_id
# so all 4 units are assigned a generator_id of 0.
# This needs to be fixed

In [None]:
# Step 2: Fill missing subplant_id
# Rows without a subplant_id or unit_id_pudl will later fall back to a
# generator-based id, so each generator first needs a numeric id within its plant.
# pd.factorize() numbers the distinct generator_id values of one plant in order of
# appearance (sort=False keeps the groups in index order).
# TODO: with use_na_sentinel=False, missing generator_id values are encoded like any
# other value, so all rows of a plant without a generator_id share the same code.
def _generator_codes(generator_ids):
    codes, _uniques = pd.factorize(generator_ids, use_na_sentinel=False)
    return codes


generator_ids_by_plant = subplant_crosswalk.groupby(
    ["plant_id_eia"], dropna=False, sort=False
)["generator_id"]
subplant_crosswalk["numeric_generator_id"] = generator_ids_by_plant.transform(
    _generator_codes
)

In [None]:
# Display the crosswalk, which now includes the numeric_generator_id column.
subplant_crosswalk

In [None]:
# Inspect the numeric generator ids assigned at plant 613.
subplant_crosswalk.loc[lambda xwalk: xwalk["plant_id_eia"] == 613]

In [None]:
# The filled subplant id is taken from three sources, in order of preference: the
# connected subplant id, the connected unit id, and the numeric generator id.
# To keep ids from different sources from overlapping, the fallback ids are shifted
# by arbitrarily high offsets (the largest unit_id_pudl is ~10).
# NOTE: re-running this cell adds the generator offset to numeric_generator_id again.
generator_id_offset = 1000
unit_id_offset = 100

subplant_crosswalk["numeric_generator_id"] = (
    subplant_crosswalk["numeric_generator_id"] + generator_id_offset
)

# fill any missing subplant id from the unit id, then from the generator id
unit_based_ids = subplant_crosswalk["unit_id_pudl_connected"] + unit_id_offset
generator_based_ids = subplant_crosswalk["numeric_generator_id"]
subplant_crosswalk["subplant_id_filled"] = (
    subplant_crosswalk["subplant_id_connected"]
    .fillna(unit_based_ids)
    .fillna(generator_based_ids)
)


In [None]:
# NOTE: The pudl bga table does not contain data for proposed units like Barry A3ST
# Even though this has a unit code that links it to other generators
# this means that some proposed gens won't be properly linked

In [None]:
# Load the boiler-generator association table and look up generator A3ST at plant 3.
bga = load_data.load_pudl_table("boiler_generator_assn_eia860")
bga.loc[lambda assn: (assn["plant_id_eia"] == 3) & (assn["generator_id"] == "A3ST")]

In [None]:
# Look up generator 4GT1 at plant 613 in the boiler-generator association table.
bga.loc[lambda assn: (assn["plant_id_eia"] == 613) & (assn["generator_id"] == "4GT1")]

In [None]:
# Inspect the crosswalk rows for plant 3.
subplant_crosswalk.loc[lambda xwalk: xwalk["plant_id_eia"] == 3]

In [None]:
# Create the new subplant id by numbering the distinct subplant_id_filled values of
# each plant in order of appearance (these combine the connected subplant ids with
# the unit- and generator-based fallbacks).
def _filled_id_codes(filled_ids):
    codes, _uniques = pd.factorize(filled_ids, use_na_sentinel=False)
    return codes


filled_ids_by_plant = subplant_crosswalk.groupby(
    ["plant_id_eia"], dropna=False, sort=False
)["subplant_id_filled"]
subplant_crosswalk["new_subplant"] = filled_ids_by_plant.transform(_filled_id_codes)

In [None]:
# Display the crosswalk, which now includes the new_subplant column.
subplant_crosswalk

In [None]:
# Inspect the new subplant ids assigned at plant 613.
subplant_crosswalk.loc[lambda xwalk: xwalk["plant_id_eia"] == 613]

## Plant 54

In [None]:
# Load the full generator entity table and look at the generators of plant 54.
test = load_data.load_pudl_table("generators_entity_eia")

test.loc[lambda gens: gens["plant_id_eia"] == 54]

In [None]:
# Replace the scratch `test` table with the annual EIA-860 generators table.
test = load_data.load_pudl_table("generators_eia860")

In [None]:
# Look at the records for generator "3" at plant 1 in the scratch `test` table.
test.loc[lambda gens: (gens["plant_id_eia"] == 1) & (gens["generator_id"] == "3")]