This notebook is set up to test code as needed.

In [None]:
# import packages
import pandas as pd
import numpy as np
import os

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import download_data
import load_data
from column_checks import get_dtypes
from filepaths import *
import impute_hourly_profiles
import data_cleaning
import output_data
import emissions
import validation
import gross_to_net_generation
import eia930

# year of data to work with; used below to initialise the PUDL outputs and to build file paths
year = 2020
# per-year sub-folder for output files (note: the value already ends with "/")
path_prefix = f"{year}/"

# NOTES
- plant 1391 is an example of a single `unit_id_pudl` that is assigned to multiple subplants
- plant 10745 is an example of a `unit_id_pudl` that overlaps multiple `subplant_id` values

## The goal
- Make sure that there are no NA subplant IDs.
- Ensure some harmonization between `subplant_id` and `unit_id_pudl`.

TODO:
 - The code that updates the subplant ID of units in the EIA-923 table now exists, but it still needs to be incorporated into the pipeline. The allocated EIA-923 table is not strictly required as the input; what matters is that the input includes all generators. One option is to start with an EIA-860 table and do an outer join with the existing subplant table.

In [None]:
import sqlalchemy as sa
import pudl.output.pudltabl

In [None]:
# Year window for the CEMS data used when creating subplant IDs.
# With number_of_years = 1 the window collapses to `year` itself;
# raise number_of_years to pull in earlier years as well.
year = 2020
number_of_years = 1
end_year = year
start_year = end_year - number_of_years + 1

print("    Creating subplant IDs")
# load monthly data from CEMS for the start_year/end_year window
cems_monthly = load_data.load_cems_gross_generation(start_year, end_year)

In [None]:
# Load the PUDL output object and the two EIA-860 tables used in this notebook.
pudl_out = load_data.initialize_pudl_out(year)
bga = pudl_out.bga_eia860()
gens = pudl_out.gens_eia860()

# Read the allocated EIA-923 data for this year from the outputs folder.
eia923_allocated_path = f'{outputs_folder()}{path_prefix}/eia923_allocated_{year}.csv'
eia923_allocated = pd.read_csv(eia923_allocated_path, dtype=get_dtypes())

# add unit ids to the allocated EIA-923 data
# (one row per plant/generator in the lookup, enforced by validate="m:1")
generator_unit_map = bga[["plant_id_eia", "generator_id", "unit_id_pudl"]].drop_duplicates()
eia923_allocated = eia923_allocated.merge(
    generator_unit_map,
    how="left",
    on=["plant_id_eia", "generator_id"],
    validate="m:1",
)

In [None]:
# list the columns available in the `gens` table loaded from pudl_out.gens_eia860()
gens.columns

In [None]:
# unique plant/generator -> unit_id_pudl rows in `bga`
# (the same lookup that is merged into eia923_allocated when it is loaded)
bga[["plant_id_eia","generator_id","unit_id_pudl"]].drop_duplicates()

In [None]:
# Spot-check rows that have a unit_id_pudl but no subplant_id.
missing_subplant_with_unit = eia923_allocated[
    eia923_allocated["subplant_id"].isna() & eia923_allocated["unit_id_pudl"].notna()
]
# Cap n so the cell does not raise when fewer than 20 rows match, and fix the seed
# so that re-running the notebook shows the same rows.
missing_subplant_with_unit.sample(
    n=min(20, len(missing_subplant_with_unit)), random_state=0
)

In [None]:
# rows for plant 10745 (listed in the NOTES as a plant where a unit_id_pudl
# overlaps multiple subplant_id); copied so later edits do not touch eia923_allocated
test = eia923_allocated[eia923_allocated["plant_id_eia"] == 10745].copy()
test

In [None]:
# Look for unit_id_pudl values at plant 10 that are paired with more than one subplant_id.
test = eia923_allocated.loc[eia923_allocated["plant_id_eia"] == 10]
# every distinct (plant, subplant, unit) combination at the plant
subplant_unit_overlap = test[["plant_id_eia","subplant_id","unit_id_pudl"]].drop_duplicates()
# a unit that shows up in more than one of those combinations spans several subplants;
# rows without a unit_id_pudl are ignored
unit_is_repeated = subplant_unit_overlap.duplicated(subset="unit_id_pudl", keep=False)
unit_is_known = subplant_unit_overlap["unit_id_pudl"].notna()
duplicate_unit = subplant_unit_overlap[unit_is_repeated & unit_is_known]
duplicate_unit

In [None]:
def create_updated_subplant_id(test, numeric_generator_id_offset=1000):
    """
    Updates the subplant id.

        1. Because the current subplant_id code does not take boiler-generator associations into account,
        there may be instances where the code assigns generators to different subplants when in fact, according
        to the boiler-generator association table, these generators are grouped into a single unit based on their
        boiler associations. The first step of this function is thus to identify if multiple subplant_id have
        been assigned to a single unit_id_pudl. If so, we replace the existing subplant_ids with a single subplant_id.
        For example, if generator A was assigned subplant_id 0 and generator B was assigned subplant_id 1, but
        both generators A and B are part of unit_id_pudl 1, we would re-assign the subplant_id of both generators to
        0 (we always use the lowest number subplant_id in each unit_id_pudl group). This may result in some subplant_id
        being skipped, but this is okay because we later renumber all subplant ids (i.e. if there were also a
        generator C with subplant_id 2, there would now be no subplant_id 1 at the plant).
        2. The second issue is that there are many NA subplant_id that we should fill. To do this, we first look at
        unit_id_pudl. If a group of generators is assigned a unit_id_pudl but has NA subplant_ids, we assign a single
        new subplant_id to this group of generators. If there are still generators at a plant that have both NA subplant_id
        and NA unit_id_pudl, we for now assume that each of these generators constitutes its own subplant. We thus assign
        a subplant_id to each such generator that is distinct from any existing subplant_id already at the plant.

        Args:
            test: a dataframe with columns for ["plant_id_eia", "subplant_id", "unit_id_pudl", "generator_id"].
                It is not modified; the function works on a copy.
            numeric_generator_id_offset: number added to the per-generator group number used to fill
                missing unit_id_pudl. It must be larger than any existing unit_id_pudl so that the filled
                values cannot collide with real unit ids.

        Returns:
            A copy of `test` with the added columns "subplant_id_unit_corrected", "numeric_generator_id",
            "unit_id_pudl_filled" and "new_subplant". When at least one unit had to be corrected in step 1 the
            result also contains "subplant_id_to_replace" and, because of the merge, has a fresh RangeIndex.

        Side effects:
            Prints a warning and the offending rows if any generator ends up with more than one new_subplant.
    """
    # work on a copy so that neither the caller's dataframe nor the group handed over by
    # groupby(...).apply(...) is mutated
    test = test.copy()

    # Step 1: check whether we need to use unit_id_pudl to update any grouped generators that had
    # been mistakenly separated into separate subplants

    # get a table with all unique subplant to unit pairs
    subplant_unit_pairs = test[["plant_id_eia", "subplant_id", "unit_id_pudl"]].drop_duplicates()
    # identify if any non-NA unit_id_pudl is duplicated within a plant, indicating that it is
    # associated with multiple subplant_id
    duplicate_unit = subplant_unit_pairs[
        subplant_unit_pairs.duplicated(subset=["plant_id_eia", "unit_id_pudl"], keep=False)
        & subplant_unit_pairs["unit_id_pudl"].notna()
    ]
    # if there are any duplicate units, indicating an incorrect subplant id, fix the subplant id
    test["subplant_id_unit_corrected"] = test["subplant_id"]
    if len(duplicate_unit) > 0:
        # find the lowest number subplant id associated with each duplicated unit_id_pudl
        replacement_subplant_id = (
            duplicate_unit.groupby(["plant_id_eia", "unit_id_pudl"])["subplant_id"]
            .min()
            .reset_index()
        )
        # merge this replacement subplant_id into the dataframe and use it to update the existing subplant id
        test = test.merge(
            replacement_subplant_id,
            how="left",
            on=["plant_id_eia", "unit_id_pudl"],
            suffixes=(None, "_to_replace"),
        )
        # take the replacement where there is one and keep the existing value otherwise; assigning the
        # column explicitly (instead of Series.update on a column selection) also works under Copy-on-Write
        test["subplant_id_unit_corrected"] = test["subplant_id_to_replace"].fillna(
            test["subplant_id_unit_corrected"]
        )

    # Step 2: Fill missing subplant_id
    # We will use unit_id_pudl to fill missing subplant ids, so first we need to fill any missing unit_id_pudl
    # We do this by assigning a new unit_id_pudl to each generator that isn't already grouped into a unit

    # create a numeric version of each generator_id
    # ngroup() numbers each (plant, generator) group; the offset keeps these numbers clear of
    # existing unit_id_pudl values
    test["numeric_generator_id"] = (
        test.groupby(["plant_id_eia", "generator_id"], dropna=False).ngroup()
        + numeric_generator_id_offset
    )
    # fill any missing unit_id_pudl with a number for each unique generator
    test["unit_id_pudl_filled"] = test["unit_id_pudl"].fillna(test["numeric_generator_id"])
    # create a new unique subplant_id based on the corrected subplant ids and the filled unit_id
    test["new_subplant"] = test.groupby(
        ["plant_id_eia", "subplant_id_unit_corrected", "unit_id_pudl_filled"], dropna=False
    ).ngroup()

    # test that the subplant_id assignment is stably assigned to the same generator_id for the entire year:
    # after de-duplicating, a generator that still appears more than once within its plant has been given
    # more than one new_subplant
    validate_stable_id = test[["plant_id_eia", "new_subplant", "generator_id"]].drop_duplicates()
    validate_stable_id = validate_stable_id[
        validate_stable_id.duplicated(subset=["plant_id_eia", "generator_id"], keep=False)
    ]
    if len(validate_stable_id) > 0:
        print("WARNING: ID NOT STABLE")
        print(validate_stable_id)

    return test

In [None]:
# Run the subplant update on the full allocated EIA-923 table, one plant at a time.
# (To debug a single plant instead, filter eia923_allocated on plant_id_eia first.)
test = eia923_allocated.groupby("plant_id_eia").apply(create_updated_subplant_id)
test

In [None]:
# inspect the June 2020 rows for plant 57185
is_plant_57185 = test["plant_id_eia"] == 57185
is_june_2020 = test["report_date"] == "2020-06-01"
test[is_plant_57185 & is_june_2020]

In [None]:
# Check that every generator maps to exactly one new_subplant within its plant.
# plant_id_eia has to be part of the key because both generator_id and new_subplant
# are only meaningful within a plant.
validate_stable_id = test[["plant_id_eia", "new_subplant", "generator_id"]].drop_duplicates()
# After de-duplicating the full rows, duplicated() over all columns is always empty, so
# look for generators that still appear more than once, i.e. with different new_subplant.
validate_stable_id[
    validate_stable_id.duplicated(subset=["plant_id_eia", "generator_id"], keep=False)
]

In [None]:
# preview the first 20 rows of the current `test` frame
test.head(20)

In [None]:
# run the subplant update on a small subset: plant 7242 only
plant_subset = eia923_allocated.loc[eia923_allocated["plant_id_eia"].isin([7242])].copy()
test = plant_subset.groupby("plant_id_eia").apply(create_updated_subplant_id)
test

In [None]:
# test for a single plant: call the function directly on a copy of plant 10745's rows
plant_10745 = eia923_allocated.loc[eia923_allocated["plant_id_eia"] == 10745].copy()
create_updated_subplant_id(plant_10745)

In [None]:
# test for a single plant
# NOTE(review): this cell is an exact duplicate of the previous cell (same plant, same call);
# remove it or point it at a different plant.
create_updated_subplant_id(eia923_allocated[eia923_allocated["plant_id_eia"] == 10745].copy())

In [None]:
# work on an independent copy of plant 10745's rows
test = eia923_allocated.loc[eia923_allocated["plant_id_eia"] == 10745].copy()

# check whether there are multiple subplants associated with a single unit id
subplant_unit_overlap = test[["plant_id_eia","subplant_id","unit_id_pudl"]].drop_duplicates()
# keep the combinations whose unit_id_pudl occurs more than once, ignoring missing unit ids
has_unit_id = subplant_unit_overlap["unit_id_pudl"].notna()
unit_seen_repeatedly = subplant_unit_overlap.duplicated(subset="unit_id_pudl", keep=False)
duplicate_unit = subplant_unit_overlap[unit_seen_repeatedly & has_unit_id]
duplicate_unit

In [None]:
# display the current `test` frame (the plant 10745 copy if the cells are run in order)
test

In [None]:
# NOTE(review): `smallest_filled_unit_id` is not assigned anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel — define it above or remove this cell.
smallest_filled_unit_id.fillna(0)

In [None]:
# NOTE(review): `smallest_filled_unit_id` is not assigned anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel — define it above or remove this cell.
pd.isna(smallest_filled_unit_id)