This notebook is set up to test code as needed.

In [None]:
# import packages
import pandas as pd
import numpy as np
import os

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import download_data
import load_data
from column_checks import get_dtypes
from filepaths import *
import impute_hourly_profiles
import data_cleaning
import output_data
import emissions
import validation
import gross_to_net_generation
import eia930

year = 2020
path_prefix = f"{year}/"

# NOTES
- plant 1391 is an example of where a single `unit_id_pudl` is assigned multiple subplants
- plant 10745 is an example of where a `unit_id_pudl` overlaps multiple `subplant_id`       

## The goal
- Make sure that there are no NA `subplant_id` values.
- Ensure that `subplant_id` and `unit_id_pudl` are harmonized with each other.

TODO:
 - I've now created code that updates the subplant id of units in the EIA-923 table, but now I need to incorporate this into the pipeline. I don't necessarily need to use EIA-923 allocated except that it should include all generators. Maybe I want to start with an EIA-860 table and do an outer join with the existing subplant table

In [None]:
pudl_out = load_data.initialize_pudl_out(year)

In [None]:
# load data
pudl_out = load_data.initialize_pudl_out(year)
#bga = pudl_out.bga_eia860()
#complete_generator_ids = pudl_out.gens_eia860()[["plant_id_eia","generator_id","unit_id_pudl"]]

subplant_crosswalk = pd.read_csv(f'{outputs_folder()}{path_prefix}/subplant_crosswalk.csv', dtype=get_dtypes())



In [None]:
def update_subplant_ids(subplant_crosswalk):
    """
    Ensures a complete and accurate subplant_id mapping for all generators.

    NOTE:
        1. This function is a temporary placeholder until the `pudl.analysis.epa_crosswalk` code is updated.
        2. This function is meant to be applied using a .groupby("plant_id_eia").apply() function. This function
        will only properly work when applied to a single plant_id_eia at a time.

    Data Preparation
        Because the existing subplant_id crosswalk was only meant to map CAMD units to EIA generators, it
        is missing a large number of subplant_ids for generators that do not report to CEMS. Before applying this
        function to the subplant crosswalk, the crosswalk must be completed with all generators by outer
        merging in the complete list of generators from EIA-860 (specifically the gens_eia860 table from pudl).
        This dataframe also contains the complete list of `unit_id_pudl` mappings that will be necessary.

    This function follows two steps:
        1. Because the current subplant_id code does not take boiler-generator associations into account,
        there may be instances where the code assigns generators to different subplants when in fact, according
        to the boiler-generator association table, these generators are grouped into a single unit based on their
        boiler associations. The first step of this function is thus to identify if multiple subplant_id have
        been assigned to a single unit_id_pudl. If so, the intent is to replace the existing subplant_ids with a
        single subplant_id. For example, if a generator A was assigned subplant_id 0 and generator B was assigned
        subplant_id 1, but both generators A and B are part of unit_id_pudl 1, we would re-assign the subplant_id
        of both generators to 0 (the intent is to use the lowest-numbered subplant_id in each unit_id_pudl group).
        This may result in some subplant_id being skipped, but this is okay because we later renumber all
        subplant ids (i.e. if there were also a generator C with subplant_id 2, there would now be no
        subplant_id 1 at the plant). The same connection is then made in the other direction (multiple
        unit_id_pudl that share a single subplant_id).
        2. The second issue is that there are many NA subplant_id that we should fill. To do this, we first look at
        unit_id_pudl. If a group of generators are assigned a unit_id_pudl but have NA subplant_ids, we assign a single
        new subplant_id to this group of generators. If there are still generators at a plant that have both NA subplant_id
        and NA unit_id_pudl, we for now assume that each of these generators constitutes its own subplant. We thus assign
        a subplant_id to each such generator that is distinct from any existing subplant_id already at the plant.

    Args:
        subplant_crosswalk: a dataframe containing the output of `epa_crosswalk.make_subplant_ids`, completed
            with all generators and their `unit_id_pudl` as described under "Data Preparation". The code below
            reads the columns `plant_id_eia`, `generator_id`, `subplant_id` and `unit_id_pudl`.

    Returns:
        The dataframe with the columns added by `connect_ids` (`subplant_id_connected`,
        `unit_id_pudl_connected`) plus `numeric_generator_id`, `unit_id_pudl_filled` and `new_subplant`.
        `new_subplant` is the renumbered id; the original `subplant_id` column is not overwritten here.
    """
    # Step 1: Create corrected versions of subplant_id and unit_id_pudl
    # if multiple subplant_id are connected by a single unit_id_pudl, this call adds `subplant_id_connected`,
    # which is meant to group these subplant_id together
    subplant_crosswalk = connect_ids(
        subplant_crosswalk, id_to_update="unit_id_pudl", connecting_id="subplant_id"
    )
    # if multiple unit_id_pudl are connected by a single subplant_id, this call adds `unit_id_pudl_connected`,
    # which is meant to group these unit_id_pudl together
    # NOTE(review): this second call reads the original `subplant_id`, not `subplant_id_connected`
    subplant_crosswalk = connect_ids(
        subplant_crosswalk, id_to_update="subplant_id", connecting_id="unit_id_pudl"
    )

    # Step 2: Fill missing subplant_id
    # We will use unit_id_pudl to fill missing subplant ids, so first we need to fill any missing unit_id_pudl
    # We do this by assigning a new unit_id_pudl to each generator that isn't already grouped into a unit

    # create a numeric version of each generator_id
    # ngroup() numbers each distinct (plant_id_eia, generator_id) group found in the frame passed in
    subplant_crosswalk["numeric_generator_id"] = subplant_crosswalk.groupby(
        ["plant_id_eia", "generator_id"], dropna=False
    ).ngroup()
    # when filling in missing unit_id_pudl, we don't want these numeric_generator_id to overlap existing unit_id
    # to avoid this, we add 1000 to each of these numeric generator ids
    # NOTE(review): this assumes existing unit_id_pudl values stay below 1000 -- confirm
    subplant_crosswalk["numeric_generator_id"] = (
        subplant_crosswalk["numeric_generator_id"] + 1000
    )
    # fill any missing unit_id_pudl in two passes:
    #   1. with subplant_id_connected + 100, so generators that share a subplant but have no unit stay together
    #      (NOTE(review): assumes existing unit_id_pudl values stay below 100 -- confirm)
    #   2. with the per-generator number created above, for generators that have neither a unit nor a subplant
    subplant_crosswalk["unit_id_pudl_filled"] = subplant_crosswalk[
        "unit_id_pudl_connected"
    ].fillna(subplant_crosswalk["subplant_id_connected"] + 100).fillna(
        subplant_crosswalk["numeric_generator_id"]
    )
    # create a new unique subplant_id based on the connected subplant ids and the filled unit_id
    # dropna=False keeps rows whose subplant_id_connected is still NA in the numbering
    subplant_crosswalk["new_subplant"] = subplant_crosswalk.groupby(
        ["plant_id_eia", "subplant_id_connected", "unit_id_pudl_filled"],
        dropna=False,
    ).ngroup()

    return subplant_crosswalk


def connect_ids(df, id_to_update, connecting_id):
    """Adds a corrected copy of `connecting_id` that is shared by all rows with the same `id_to_update`.

    Within each plant, if a single `id_to_update` value is paired with more than one
    `connecting_id` value, every row in that group receives the lowest `connecting_id`
    of the group in a new `{connecting_id}_connected` column:

    - id_to_update="unit_id_pudl", connecting_id="subplant_id": if multiple subplant_id are
      connected by a single unit_id_pudl, `subplant_id_connected` groups these subplant_id together
    - id_to_update="subplant_id", connecting_id="unit_id_pudl": if multiple unit_id_pudl are
      connected by a single subplant_id, `unit_id_pudl_connected` groups these unit_id_pudl together

    Rows whose `id_to_update` is NA, or whose `id_to_update` maps to only one `connecting_id`,
    keep their original `connecting_id` value in the new column.

    Args:
        df: dataframe containing the columns `plant_id_eia`, `subplant_id` and `unit_id_pudl`
        id_to_update: name of the column whose repeated values define the groups
            (either "subplant_id" or "unit_id_pudl")
        connecting_id: name of the column whose values are harmonized within each group
            (the other one of "subplant_id" / "unit_id_pudl")

    Returns:
        A new dataframe (the input is not modified) with a `{connecting_id}_connected` column.
        When at least one group needed correcting, the helper column
        `{connecting_id}_to_replace` is also present and the index is reset by the merge.
    """
    connected_col = f"{connecting_id}_connected"
    replacement_col = f"{connecting_id}_to_replace"

    # work on a copy so that the caller's dataframe (often a groupby slice) is left untouched
    df = df.copy()
    df[connected_col] = df[connecting_id]

    # get a table with all unique subplant to unit pairs
    subplant_unit_pairs = df[
        ["plant_id_eia", "subplant_id", "unit_id_pudl"]
    ].drop_duplicates()

    # a non-NA id_to_update that appears more than once within a plant is associated with
    # multiple connecting_id values
    duplicates = subplant_unit_pairs[
        subplant_unit_pairs.duplicated(
            subset=["plant_id_eia", id_to_update], keep=False
        )
        & subplant_unit_pairs[id_to_update].notna()
    ].copy()

    if duplicates.empty:
        return df

    # find the lowest connecting_id of EACH duplicated group; transform() keeps one value per
    # row, so every group gets its own minimum rather than the minimum of the first group
    duplicates[replacement_col] = duplicates.groupby(["plant_id_eia", id_to_update])[
        connecting_id
    ].transform("min")

    # merge the replacement id into the dataframe and use it wherever one was found
    df = df.merge(
        duplicates,
        how="left",
        on=["plant_id_eia", id_to_update, connecting_id],
    )
    # explicit assignment instead of a chained Series.update(), which does not write back to
    # the dataframe under pandas Copy-on-Write
    df[connected_col] = df[connected_col].where(
        df[replacement_col].isna(), df[replacement_col]
    )
    return df

In [None]:
# prepare the subplant crosswalk by adding a complete list of generators and adding the unit_id_pudl column
complete_generator_ids = pudl_out.gens_eia860()[
    ["plant_id_eia", "generator_id", "unit_id_pudl"]
]
test = subplant_crosswalk.merge(
    complete_generator_ids, how="outer", on=["plant_id_eia", "generator_id"]
)

# test with a small subset
#test = test[test["plant_id_eia"].isin([613])].copy()
test = test.groupby("plant_id_eia").apply(update_subplant_ids)

test

In [None]:
v = test[["plant_id_eia","unit_id_pudl","new_subplant"]].drop_duplicates()
v[v.duplicated(subset=["plant_id_eia","unit_id_pudl"], keep=False) & (~v.unit_id_pudl.isna())]

In [None]:
test

In [None]:
# Overwrite subplant_id with new_subplant wherever a new id is available.
# The column is assigned explicitly: a chained `test["subplant_id"].update(...)`
# modifies a temporary Series and leaves `test` unchanged under pandas Copy-on-Write.
test["subplant_id"] = test["subplant_id"].where(
    test["new_subplant"].isna(), test["new_subplant"]
)
test.reset_index(drop=True)[
        [
            "plant_id_epa",
            "unitid",
            "plant_id_eia",
            "generator_id",
            "subplant_id",
            "unit_id_pudl",
        ]
    ]

In [None]:
test[test.plant_id_eia == 2379]

In [None]:
# prepare the subplant crosswalk by adding a complete list of generators and adding the unit_id_pudl column
complete_generator_ids = pudl_out.gens_eia860()[
    ["plant_id_eia", "generator_id", "unit_id_pudl"]
]
# NOTE: this raw merge gets its own name. Re-assigning `test` here would discard the
# `new_subplant` column that the cells below still read from `test`.
crosswalk_all_generators = subplant_crosswalk.merge(
    complete_generator_ids, how="outer", on=["plant_id_eia", "generator_id"]
)

# get a table with all unique subplant to unit pairs
subplant_unit_pairs = crosswalk_all_generators[
    ["plant_id_eia", "subplant_id", "unit_id_pudl"]
].drop_duplicates()
# identify any non-NA subplant_id that appears in more than one pair at a plant,
# indicating that it is associated with multiple unit_id_pudl
duplicate_unit = subplant_unit_pairs[
    (subplant_unit_pairs.duplicated(subset=["plant_id_eia","subplant_id"], keep=False))
    & (~subplant_unit_pairs["subplant_id"].isna())
]
duplicate_unit

In [None]:
# TODO: Figure out how to ensure new subplant matches subplant ID when split across unit with NA
# Plant 613 issue

In [None]:
# update the subplant_id with the new_subplant
# (explicit assignment: a chained `test["subplant_id"].update(...)` modifies a temporary
# Series and leaves `test` unchanged under pandas Copy-on-Write)
test["subplant_id"] = test["subplant_id"].where(
    test["new_subplant"].isna(), test["new_subplant"]
)

# remove intermediate columns, keeping the unit_id_pudl column
test = test.reset_index(drop=True)[["plant_id_epa","unitid","plant_id_eia","generator_id","subplant_id","unit_id_pudl"]]

In [None]:
# how many subplant_id have changed?
test[(~test["subplant_id"].isna()) & (test["subplant_id"] != test["new_subplant"])]

In [None]:
test[test.plant_id_eia == 1]

# Test in pipeline

In [None]:
number_of_years = 2
data_cleaning.identify_subplants(year, number_of_years)

In [None]:
subplant_crosswalk = pd.read_csv(f'{outputs_folder()}{path_prefix}/subplant_crosswalk_{year}.csv', dtype=get_dtypes())

In [None]:
print("3. Cleaning EIA-923 data")
eia923_allocated, primary_fuel_table = data_cleaning.clean_eia923(year, False)

In [None]:
eia923_allocated[eia923_allocated.subplant_id.isna()]

In [None]:
validation.test_for_missing_subplant_id(eia923_allocated)

In [None]:
# the window ends at `year` and spans `number_of_years` years in total
start_year = year - (number_of_years - 1)
end_year = year

print("    Creating subplant IDs")
# request the CEMS ids for start_year through end_year
# (the window length is `number_of_years`, not a fixed 5 years)
cems_ids = load_data.load_cems_ids(start_year, end_year)

In [None]:
cems_ids[cems_ids["plant_id_eia"] == 8]

In [None]:
import pudl.analysis.allocate_net_gen as allocate_gen_fuel
import pudl.analysis.epa_crosswalk as epa_crosswalk
import pudl.output.pudltabl

In [None]:
# load the crosswalk and filter it by the data that actually exists in cems
crosswalk = pudl.output.epacems.epa_crosswalk()

# update the crosswalk with manual matches
crosswalk = data_cleaning.manual_crosswalk_updates(crosswalk)

# strip leading zeros
crosswalk["CAMD_UNIT_ID"] = crosswalk["CAMD_UNIT_ID"].str.lstrip("0")
crosswalk["EIA_GENERATOR_ID"] = crosswalk["EIA_GENERATOR_ID"].str.lstrip("0")

# filter the crosswalk to drop any units that don't exist in CEMS
filtered_crosswalk = epa_crosswalk.filter_crosswalk(crosswalk, cems_ids)[
    [
        "plant_id_eia",
        "unitid",
        "CAMD_PLANT_ID",
        "CAMD_UNIT_ID",
        "CAMD_GENERATOR_ID",
        "EIA_PLANT_ID",
        "EIA_GENERATOR_ID",
    ]
]

# change the plant id to an int
filtered_crosswalk["EIA_PLANT_ID"] = filtered_crosswalk["EIA_PLANT_ID"].astype(int)

# filter to generators that exist in the EIA data
# load a list of unique generator ids that exist in EIA
pudl_out = load_data.initialize_pudl_out(year=None)
gens_eia860 = pudl_out.gens_eia860()
unique_eia_ids = gens_eia860[["plant_id_eia", "generator_id"]].drop_duplicates()
filtered_crosswalk = unique_eia_ids.merge(
    filtered_crosswalk,
    left_on=["plant_id_eia", "generator_id"],
    right_on=["EIA_PLANT_ID", "EIA_GENERATOR_ID"],
    how="inner",
    suffixes=("_actual", None),
    validate="1:m",
).drop(columns=["plant_id_eia_actual", "generator_id"])

In [None]:
filtered_crosswalk[filtered_crosswalk["plant_id_eia"] == 8]

In [None]:
crosswalk_with_subplant_ids = epa_crosswalk.make_subplant_ids(filtered_crosswalk)

# fix the column names
crosswalk_with_subplant_ids = crosswalk_with_subplant_ids.drop(
    columns=["plant_id_eia", "unitid", "CAMD_GENERATOR_ID"]
)
crosswalk_with_subplant_ids = crosswalk_with_subplant_ids.rename(
    columns={
        "CAMD_PLANT_ID": "plant_id_epa",
        "EIA_PLANT_ID": "plant_id_eia",
        "CAMD_UNIT_ID": "unitid",
        "EIA_GENERATOR_ID": "generator_id",
    }
)
# change the eia plant id to int
crosswalk_with_subplant_ids["plant_id_eia"] = crosswalk_with_subplant_ids[
    "plant_id_eia"
].astype(int)

# change the order of the columns
crosswalk_with_subplant_ids = crosswalk_with_subplant_ids[
    ["plant_id_epa", "unitid", "plant_id_eia", "generator_id", "subplant_id"]
]

In [None]:
crosswalk_with_subplant_ids[crosswalk_with_subplant_ids["plant_id_eia"] == 8]

In [None]:
# update the subplant_crosswalk to ensure completeness
# prepare the subplant crosswalk by adding a complete list of generators and adding the unit_id_pudl column
complete_generator_ids = pudl_out.gens_eia860()[
    ["plant_id_eia", "generator_id", "unit_id_pudl"]
].drop_duplicates()
subplant_crosswalk_complete = crosswalk_with_subplant_ids.merge(
    complete_generator_ids, how="outer", on=["plant_id_eia", "generator_id"]
)

In [None]:
subplant_crosswalk_complete[subplant_crosswalk_complete["plant_id_eia"] == 8]

In [None]:
complete_generator_ids[complete_generator_ids["plant_id_eia"] == 8]

In [None]:
gens = pudl_out.gens_eia860()

In [None]:
gens[gens["plant_id_eia"] == 8]

In [None]:
subplant_crosswalk = pd.read_csv(
        outputs_folder(f"{year}/subplant_crosswalk_{year}.csv"),
        dtype=get_dtypes(),
    )[["plant_id_eia", "unitid", "subplant_id"]].drop_duplicates().dropna()

In [None]:
subplant_crosswalk

In [None]:
# `test1` is never defined in this notebook; the (plant_id_eia, unitid) duplicate check is
# presumably meant for the crosswalk loaded above -- TODO confirm
subplant_crosswalk[subplant_crosswalk.duplicated(subset=["plant_id_eia","unitid"])]