# About this notebook
This notebook is used for testing code related to allocating the EIA-923 data before integrating it into PUDL and the data pipeline. 

I've also been using this to simply load EIA tables from the PUDL database to explore the EIA data as it exists in PUDL.

In [None]:
%reload_ext autoreload
%autoreload 2

import sys
sys.path.append('../../hourly-egrid/')

# Useful high-level external modules.
import numpy as np
import pandas as pd
import sqlalchemy as sa
import warnings
import plotly.express as px

import pudl

import src.data_cleaning as data_cleaning
import src.load_data as load_data
import pudl.analysis.allocate_net_gen as allocate_gen_fuel


In [None]:
# Reporting year; it bounds both ends of the PudlTabl date window below.
year = 2020

# SQLAlchemy engine pointing at the local PUDL SQLite database.
pudl_db = 'sqlite:///../data/downloads/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

# PUDL output-table accessor restricted to Jan 1 - Dec 31 of `year`.
# NOTE(review): freq='MS' is presumably month-start (monthly) aggregation,
# as in pandas offset aliases - confirm against the PudlTabl docs.
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine, freq='MS', start_date=f'{year}-01-01', end_date=f'{year}-12-31'
)

## Load EIA data

In [None]:
# Short alias for the PUDL allocation module whose helpers are called
# throughout the cells below. The name suggests it is a temporary shim.

remove_this = pudl.analysis.allocate_net_gen
# Not referenced in the visible cells.
# NOTE(review): presumably mirrors an option of the PUDL allocation code - confirm.
drop_interim_cols = False

# Id columns for generators.
IDX_GENS = ["report_date", "plant_id_eia", "generator_id"]

# Id columns for plant & fuel type records.
IDX_ESC = ["report_date", "plant_id_eia", "energy_source_code"]

# Id columns for plant, prime mover & fuel type records.
IDX_PM_ESC = [*IDX_ESC, "prime_mover_code"]

# Id columns for plant, fuel type & unit (unit_id_pudl) records.
IDX_U_ESC = [*IDX_ESC, "unit_id_pudl"]

# Pull every input table out of pudl_out up front and keep only the columns
# needed downstream. This is for speed and clarity.

# Generation-fuel records keyed by plant / prime mover / energy source, passed
# through PUDL's manual energy-source-code fix.
gf = (
    pudl_out.gf_eia923()
    .loc[
        :,
        [
            *IDX_PM_ESC,
            "net_generation_mwh",
            "fuel_consumed_mmbtu",
            "fuel_consumed_for_electricity_mmbtu",
        ],
    ]
    .pipe(remove_this.manually_fix_energy_source_codes)
)

# Boiler-fuel records, left-joined to the boiler entity table on plant and
# boiler id before the column selection.
bf = pd.merge(
    pudl_out.bf_eia923(),
    pd.read_sql("boilers_entity_eia", pudl_out.pudl_engine),
    how="left",
    on=["plant_id_eia", "boiler_id"],
).loc[:, [*IDX_PM_ESC, "fuel_consumed_mmbtu"]]

# Generator-level net generation.
gen = (
    pudl_out.gen_original_eia923()
    .loc[:, [*IDX_GENS, "net_generation_mwh"]]
    # removes 4 records with NaN generator_id as of pudl v0.5
    .dropna(subset=IDX_GENS)
)

# Generator attributes, plus every column whose name contains
# "energy_source_code" or "startup_source_code".
gens = pudl_out.gens_eia860().loc[
    :,
    [
        *IDX_GENS,
        "prime_mover_code",
        "unit_id_pudl",
        "capacity_mw",
        "fuel_type_count",
        "operational_status",
        "retirement_date",
        *pudl_out.gens_eia860().filter(like="energy_source_code").columns,
        *pudl_out.gens_eia860().filter(like="startup_source_code").columns,
    ],
]
# Reconcile the energy source codes in gens against gf and bf using the PUDL
# helper. Per the original notes, this adds any startup energy source codes to
# the list of energy source codes and fixes MSW codes.
gens = remove_this.adjust_energy_source_codes(gens, gf, bf)

# Nothing is repaired here: generators without a prime mover code are only
# reported, because they cannot be matched to gf records on prime_mover_code.
missing_pm = gens.loc[gens["prime_mover_code"].isnull()]
if len(missing_pm) > 0:
    warnings.warn(
        f"{len(missing_pm)} generators are missing prime mover codes in gens_eia860. "
        "This will result in incorrect allocation."
    )

In [None]:
# Expand gens to monthly records with the PUDL helper (per the original note,
# each entry is duplicated 12 times, once for each month of the year).
gens = remove_this.create_monthly_gens_records(gens)

# The gen table is missing some generator ids. Outer-merge the key columns of
# gens so those generators get a row with a missing value for net generation.
gen = pd.merge(
    gen,
    gens.loc[:, ["plant_id_eia", "generator_id", "report_date"]],
    on=["plant_id_eia", "generator_id", "report_date"],
    how="outer",
)

In [None]:
# Stack the energy_source_code_* columns of gens into a single
# "energy_source_code" column, with the source column recorded in
# "energy_source_code_num" (PUDL helper).
stack_gens = remove_this.stack_generators(
    gens, cat_col="energy_source_code_num", stacked_col="energy_source_code"
)

# Boiler-fuel totals per plant / prime mover / energy source. Every non-key
# column of bf is numeric, so summing the whole frame is safe here.
bf_summed = (
    bf.groupby(by=IDX_PM_ESC, dropna=False)
    .sum(min_count=1)
    .add_suffix("_bf_tbl")
    .reset_index()
    .pipe(pudl.helpers.convert_cols_dtypes, "eia")
)

# Generation-fuel totals per plant / prime mover / energy source.
# The value columns are selected BEFORE aggregating so that only numeric
# columns are ever summed.
gf_pm_fuel_summed = (
    gf.groupby(by=IDX_PM_ESC)[
        [
            "fuel_consumed_mmbtu",
            "fuel_consumed_for_electricity_mmbtu",
            "net_generation_mwh",
        ]
    ]
    .sum(min_count=1)
    .add_suffix("_gf_tbl")
    .reset_index()
)

# Generation-fuel totals per plant / energy source (all prime movers).
# prime_mover_code is not a grouping key here, so summing the whole frame
# would try to aggregate that string column (silently dropped by old pandas,
# concatenated or rejected by pandas >= 2.0). Selecting first avoids that.
gf_fuel_summed = (
    gf.groupby(by=IDX_ESC)[
        ["fuel_consumed_mmbtu", "fuel_consumed_for_electricity_mmbtu"]
    ]
    .sum(min_count=1)
    .add_suffix("_gf_tbl_fuel")
    .reset_index()
)

In [None]:
# Attach generator-level net generation (gen) to the stacked generator / fuel
# records, then bring in the gf totals for each prime mover & energy source.
gen_assoc = (
    stack_gens.merge(gen, how="outer", on=IDX_GENS)
    .rename(columns={"net_generation_mwh": "net_generation_mwh_g_tbl"})
    .merge(gf_pm_fuel_summed, how="left", on=IDX_PM_ESC, validate="m:1")
)

In [None]:
gen_assoc

In [None]:
# Plants that have a retired generator with no generator-level generation but
# non-null, non-zero gf generation reported after its retirement date.
retired_with_gf = list(
    gen_assoc.loc[
        (gen_assoc["operational_status"] == "retired")
        & (gen_assoc["report_date"] > gen_assoc["retirement_date"])
        & (gen_assoc["net_generation_mwh_g_tbl"].isna())
        & (gen_assoc["net_generation_mwh_gf_tbl"].notna())
        & (gen_assoc["net_generation_mwh_gf_tbl"] != 0),
        "plant_id_eia",
    ].unique()
)

# For those plants, list each distinct (status, retirement date) combination.
plants_with_retired_generators = gen_assoc.loc[
    gen_assoc["plant_id_eia"].isin(retired_with_gf),
    ["plant_id_eia", "operational_status", "retirement_date"],
].drop_duplicates()

# Drop any plant that has at least one status other than "retired".
plants_with_nonretired_generators = list(
    plants_with_retired_generators.loc[
        plants_with_retired_generators["operational_status"] != "retired",
        "plant_id_eia",
    ].unique()
)
plants_with_retired_generators = plants_with_retired_generators[
    ~plants_with_retired_generators["plant_id_eia"].isin(
        plants_with_nonretired_generators
    )
]

# Keep only plants whose retirement dates all fall before pudl_out.start_date.
plants_retiring_after_start_date = list(
    plants_with_retired_generators.loc[
        plants_with_retired_generators["retirement_date"] >= pudl_out.start_date,
        "plant_id_eia",
    ].unique()
)
entirely_retired_plants = plants_with_retired_generators[
    ~plants_with_retired_generators["plant_id_eia"].isin(
        plants_retiring_after_start_date
    )
]

# Collapse the table to a plain list of plant ids.
entirely_retired_plants = list(entirely_retired_plants["plant_id_eia"].unique())

# All gen_assoc records that belong to the entirely retired plants.
retired_plants = gen_assoc[gen_assoc["plant_id_eia"].isin(entirely_retired_plants)]
retired_plants

In [None]:
# Retired generators worth keeping: records dated on or before the retirement
# date, or records that carry generator-level net generation.
retiring = gen_assoc.loc[
    (gen_assoc["operational_status"] == "retired")
    & (
        (gen_assoc["report_date"] <= gen_assoc["retirement_date"])
        | (gen_assoc["net_generation_mwh_g_tbl"].notna())
    )
]

In [None]:
retired_plants[retired_plants['plant_id_eia'] == 1555]

In [None]:
gen_assoc[gen_assoc['plant_id_eia'] == 6082]

In [None]:
def remove_inactive_generators(gen_assoc):
    """
    Remove the retired generators.

    We don't want to associate net generation to generators that are retired
    (or proposed! or any other `operational_status` besides `existing`).

    We do want to keep the generators that retire mid-year and have generator
    specific data from the generation_eia923 table. Removing the generators
    that retire mid-report year and don't report to the generation_eia923 table
    is not exactly a great assumption. For now, we are removing them. We should
    employ a strategy that allocates only a portion of the generation to them
    based on their operational months (or by doing the allocation on a monthly
    basis).

    Proposed generators are kept when they report generator-specific
    generation, or when their whole plant is proposed and the
    generation_fuel_eia923 table reports generation for the record.

    Args:
        gen_assoc (pandas.DataFrame): table of generators with stacked fuel
            types and broadcasted net generation data from the
            generation_eia923 and generation_fuel_eia923 tables. Output of
            `associate_generator_tables()`. Must contain the columns
            `plant_id_eia`, `operational_status`, `report_date`,
            `retirement_date`, `net_generation_mwh_g_tbl` and
            `net_generation_mwh_gf_tbl`.

    Returns:
        pandas.DataFrame: the rows of `gen_assoc` that are kept, with the same
        columns and original index labels. Rows are ordered existing, then
        retiring, then proposed; each input row appears at most once.
    """
    status = gen_assoc["operational_status"]
    has_g_tbl_gen = gen_assoc["net_generation_mwh_g_tbl"].notnull()
    has_gf_tbl_gen = gen_assoc["net_generation_mwh_gf_tbl"].notnull()

    existing = gen_assoc.loc[status == "existing"]
    # keep the gens that retired mid-report-year that have generator
    # specific data
    retiring = gen_assoc.loc[
        (status == "retired")
        & ((gen_assoc["report_date"] <= gen_assoc["retirement_date"]) | has_g_tbl_gen)
    ]
    # sometimes a plant will report generation data before its proposed operating date
    # we want to keep any data that is reported for proposed generators
    is_proposed_with_g_tbl = (status == "proposed") & has_g_tbl_gen
    proposed_generators = gen_assoc.loc[is_proposed_with_g_tbl]

    # when we do not have generator-specific generation for a proposed generator, we can also
    # look at whether there is generation reported from the gf table. However, if a proposed
    # generator is part of an existing plant, it is possible that this gf generation belongs
    # to one of the other existing generators. Thus, we want to identify those proposed generators
    # where the entire plant is proposed (since the gf-reported generation could only come from
    # one of the new generators).

    # Get a list of all of the plants that have a proposed generator with non-null and non-zero gf generation
    proposed_with_gf = list(
        gen_assoc.loc[
            (status == "proposed")
            & has_gf_tbl_gen
            & (gen_assoc["net_generation_mwh_gf_tbl"] != 0),
            "plant_id_eia",
        ].unique()
    )

    # create a table for all of these plants that identifies all of the unique operational statuses
    plants_with_proposed_generators = gen_assoc.loc[
        gen_assoc["plant_id_eia"].isin(proposed_with_gf),
        ["plant_id_eia", "operational_status"],
    ].drop_duplicates()

    # filter this list to those plant ids where the only operational status is "proposed"
    # i.e. where the entire plant is new
    entirely_new_plants = plants_with_proposed_generators[
        (~plants_with_proposed_generators.duplicated(subset="plant_id_eia", keep=False))
        & (plants_with_proposed_generators["operational_status"] == "proposed")
    ]
    # convert this table into a list of these plant ids
    entirely_new_plants = list(entirely_new_plants["plant_id_eia"].unique())

    # keep data for these proposed plants in months where there is reported data.
    # Rows already captured in `proposed_generators` are excluded: a proposed
    # generator with both generator-level and gf-level generation would
    # otherwise be concatenated twice and double counted downstream.
    proposed_plants = gen_assoc.loc[
        gen_assoc["plant_id_eia"].isin(entirely_new_plants)
        & has_gf_tbl_gen
        & ~is_proposed_with_g_tbl
    ]

    # The four pieces are now mutually exclusive (distinct statuses, and the
    # two proposed subsets are disjoint), so a plain concat has no duplicates.
    gen_assoc_removed = pd.concat(
        [existing, retiring, proposed_generators, proposed_plants]
    )

    return gen_assoc_removed

In [None]:
# Full association: stacked generators + generator-level generation + gf
# totals, filtered by remove_inactive_generators, then joined to the
# boiler-fuel totals and the fuel-level gf totals.
gen_assoc = (
    stack_gens.merge(gen, how="outer", on=IDX_GENS)
    .rename(columns={"net_generation_mwh": "net_generation_mwh_g_tbl"})
    .merge(gf_pm_fuel_summed, how="left", on=IDX_PM_ESC, validate="m:1")
    .pipe(remove_inactive_generators)
    .merge(bf_summed, how="left", on=IDX_PM_ESC, validate="m:1")
    .merge(gf_fuel_summed, how="left", on=IDX_ESC, validate="m:1")
)

# replace zeros with small number to avoid div by zero errors when calculating allocation fraction
DATA_COLUMNS = [
    "net_generation_mwh_g_tbl",
    "fuel_consumed_mmbtu_gf_tbl",
    "fuel_consumed_for_electricity_mmbtu_gf_tbl",
    "net_generation_mwh_gf_tbl",
    "fuel_consumed_mmbtu_bf_tbl",
    "fuel_consumed_mmbtu_gf_tbl_fuel",
    "fuel_consumed_for_electricity_mmbtu_gf_tbl_fuel",
]
gen_assoc[DATA_COLUMNS] = gen_assoc[DATA_COLUMNS].replace(to_replace=0, value=0.001)

In [None]:
gen_assoc

In [None]:
# Rebuild the unfiltered association table (stacked generators, generator-level
# generation and gf totals) for the exploration cells that follow.
gen_assoc = (
    stack_gens.merge(gen, how="outer", on=IDX_GENS)
    .rename(columns={"net_generation_mwh": "net_generation_mwh_g_tbl"})
    .merge(gf_pm_fuel_summed, how="left", on=IDX_PM_ESC, validate="m:1")
)

In [None]:
# A proposed generator without generator-specific generation may still have
# generation reported in the gf table. If it sits in a plant that also has
# other (e.g. existing) generators, that gf generation could belong to one of
# them, so only plants where every generator is proposed are considered: there
# the gf-reported generation can only come from one of the new generators.

# Plants with a proposed generator that has non-null, non-zero gf generation.
proposed_with_gf = list(
    gen_assoc.loc[
        (gen_assoc["operational_status"] == "proposed")
        & (gen_assoc["net_generation_mwh_gf_tbl"].notna())
        & (gen_assoc["net_generation_mwh_gf_tbl"] != 0),
        "plant_id_eia",
    ].unique()
)

# One row per distinct (plant, operational status) pair for those plants.
plants_with_proposed_generators = gen_assoc.loc[
    gen_assoc["plant_id_eia"].isin(proposed_with_gf),
    ["plant_id_eia", "operational_status"],
].drop_duplicates()

# Plants whose only operational status is "proposed", i.e. the entire plant is
# new: the plant id appears exactly once and that single row is "proposed".
entirely_new_plants = plants_with_proposed_generators[
    (~plants_with_proposed_generators.duplicated(subset="plant_id_eia", keep=False))
    & (plants_with_proposed_generators["operational_status"] == "proposed")
]
# Collapse the table to a plain list of plant ids.
entirely_new_plants = list(entirely_new_plants["plant_id_eia"].unique())

# Keep these plants' records only in months where gf data is reported.
proposed_plants = gen_assoc[
    gen_assoc["plant_id_eia"].isin(entirely_new_plants)
    & gen_assoc["net_generation_mwh_gf_tbl"].notna()
]

In [None]:
proposed_plants

In [None]:
# Preview the rows that define "entirely new" plants: plants whose only
# operational status is "proposed". `new_plants` was a stale name that is not
# defined anywhere in this notebook; the table it referred to is
# `plants_with_proposed_generators`.
plants_with_proposed_generators[
    (~plants_with_proposed_generators.duplicated(subset="plant_id_eia", keep=False))
    & (plants_with_proposed_generators["operational_status"] == "proposed")
]

In [None]:
plant_id_to_investigate = 62562

In [None]:
gens[gens['plant_id_eia'] == plant_id_to_investigate]

In [None]:
gens.loc[gens['prime_mover_code'].isna(), ['plant_id_eia','generator_id']].drop_duplicates()

In [None]:
gens[gens['generator_id'].str.contains('ST')]

In [None]:
gen[gen['plant_id_eia'] == plant_id_to_investigate]

In [None]:
gf[gf['plant_id_eia'] == plant_id_to_investigate]