In [None]:
# import packages
import pandas as pd
import numpy as np
import os
import plotly.express as px

%reload_ext autoreload
%autoreload 2

import pudl.analysis.allocate_net_gen as allocate_gen_fuel
import pudl.analysis.epacamd_eia as epacamd_eia_crosswalk
import pudl.output.pudltabl

# Tell Python where to find the open-grid-emissions source modules imported below.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import download_data
import load_data
from column_checks import get_dtypes
from filepaths import *
import impute_hourly_profiles
import data_cleaning
import output_data
import emissions
import validation
import gross_to_net_generation
import eia930

# Data year; passed to the loading and cleaning functions called in later cells.
year = 2021
# Year-based path prefix.
# NOTE(review): not referenced by any other cell visible in this notebook.
path_prefix = f"{year}/"

In [None]:
# Initialize `pudl_out` for `year` and extract the five input tables
# (gf, bf, bga, gens, gen) that the allocation cells below work from.
pudl_out = load_data.initialize_pudl_out(year=year)
gf, bf, bga, gens, gen = allocate_gen_fuel.extract_input_tables(pudl_out)

In [None]:
# Run the project's EIA-923 cleaning step for `year` and unpack its three outputs.
# NOTE(review): the meaning of the positional `False` is not visible here —
# confirm against the signature of data_cleaning.clean_eia923.
# `primary_fuel_table` is rebound by a later cell that calls
# data_cleaning.create_primary_fuel_table.
(
    eia923_allocated,
    primary_fuel_table,
    subplant_emission_factors,
) = data_cleaning.clean_eia923(year, False)

In [None]:
# Inspect plant 59825 in the generators table.
gens[gens["plant_id_eia"] == 59825]

In [None]:
# Same plant in the gf table.
gf[gf["plant_id_eia"] == 59825]

In [None]:
# Same plant in the allocated output returned by clean_eia923.
eia923_allocated[eia923_allocated["plant_id_eia"] == 59825]

In [None]:
# Preview the rows that survive the retirement filter used by
# identify_missing_gf_escs_in_gens: rows whose retirement_date is earlier than
# report_date are excluded (rows with a missing retirement_date presumably
# compare False and are kept — assumes datetime columns).
# NOTE(review): `gens_at_freq` is first assigned in a later cell, so this cell
# raises NameError on a fresh kernel (Restart & Run All).
gens_at_freq.loc[
                    ~(gens_at_freq["retirement_date"] < gens_at_freq["report_date"]),
                    :,
                ]

In [None]:
def add_missing_energy_source_codes_to_gens(gens_at_freq, gf):
    """Adds energy_source_codes that appear in the `gf` table but not `gens` to `gens`.

    In some cases, non-zero fuel consumption and net generation is reported
    in the EIA-923 generation and fuel table that is associated with an
    energy_source_code that is not associated with that plant-prime mover
    in the gens table, which would cause these data to get dropped when
    these two tables are merged. To fix this, for each plant-pm, this function
    identifies such esc, and adds them to the `gens_at_freq` table as new
    energy_source_code columns.

    Args:
        gens_at_freq: generators table. Must contain `plant_id_eia` and
            `prime_mover_code`, plus the columns read by
            `identify_missing_gf_escs_in_gens`.
        gf: generation and fuel table, passed through to
            `identify_missing_gf_escs_in_gens`.

    Returns:
        A new DataFrame equal to `gens_at_freq` left-merged with one
        `energy_source_code_<n>` column per missing code. Numbering starts at 7
        (as before), or at the next free number when `gens_at_freq` already has
        an `energy_source_code_<n>` column numbered 7 or higher. When nothing is
        missing, an unmodified copy of `gens_at_freq` is returned.
    """
    key_cols = ["plant_id_eia", "prime_mover_code"]
    esc_prefix = "energy_source_code_"

    gf_missing_escs = identify_missing_gf_escs_in_gens(gens_at_freq, gf)

    # Nothing to add: skip the pivot, which cannot build columns from no rows.
    if gf_missing_escs.empty:
        return gens_at_freq.copy()

    # Find the numbered ESC columns that already exist so that the new columns
    # never reuse a number (a clash would make merge emit _x/_y suffixes).
    used_nums = [
        int(str(col)[len(esc_prefix):])
        for col in gens_at_freq.columns
        if str(col).startswith(esc_prefix) and str(col)[len(esc_prefix):].isdecimal()
    ]
    first_new_num = max([6] + used_nums) + 1

    # Number the missing codes within each plant-pm and pivot them into
    # energy_source_code_<n> columns, one row per plant-pm.
    new_esc_columns = (
        gf_missing_escs.assign(
            num=(
                gf_missing_escs.groupby(key_cols).cumcount() + first_new_num
            ).astype(str)
        )
        .pivot(index=key_cols, columns="num", values="energy_source_code")
        .add_prefix(esc_prefix)
        .rename_axis(columns=None)
        .reset_index()
    )

    # merge these missing fuel columns into the gens data
    return gens_at_freq.merge(new_esc_columns, how="left", on=key_cols)


def identify_missing_gf_escs_in_gens(gens_at_freq, gf):
    """For each plant-prime mover, identify energy_source_codes in gf but not gens.

    Only `gf` rows with `fuel_consumed_mmbtu > 0` or `net_generation_mwh != 0`
    are considered, and only `gens_at_freq` rows whose `retirement_date` is not
    earlier than their `report_date`.

    Args:
        gens_at_freq: generators table with `plant_id_eia`, `prime_mover_code`,
            `retirement_date`, `report_date` and one or more columns whose name
            contains `_source_code`.
        gf: generation and fuel table with `plant_id_eia`, `prime_mover_code`,
            `energy_source_code`, `fuel_consumed_mmbtu` and `net_generation_mwh`.

    Returns:
        DataFrame with columns `plant_id_eia`, `prime_mover_code` and
        `energy_source_code`, holding one row per plant-pm-esc combination that
        is present in `gf` but absent from every source code column of the
        matching generators. The frame is empty (same columns) when nothing is
        missing.
    """
    key_cols = ["plant_id_eia", "prime_mover_code"]
    out_cols = key_cols + ["energy_source_code"]
    esc_columns = list(gens_at_freq.filter(like="_source_code").columns)

    # only gf rows that actually carry fuel or generation data
    gf_filtered = gf.loc[
        (gf["fuel_consumed_mmbtu"] > 0) | (gf["net_generation_mwh"] != 0),
        out_cols,
    ]
    # only generators that had not retired before the report date
    gens_filtered = gens_at_freq.loc[
        ~(gens_at_freq["retirement_date"] < gens_at_freq["report_date"]),
        key_cols + esc_columns,
    ].drop_duplicates()

    gf_missing_escs = []
    for plant, gf_plant in gf_filtered.groupby("plant_id_eia", sort=False):
        # Select from gens_filtered with a mask built from gens_filtered itself.
        gens_plant = gens_filtered.loc[gens_filtered["plant_id_eia"] == plant]
        for pm, gf_plant_pm in gf_plant.groupby("prime_mover_code", sort=False):
            # Collect the non-null codes from all source code columns for this plant-pm.
            esc_in_gens = set(
                pd.Series(
                    gens_plant.loc[gens_plant["prime_mover_code"] == pm, esc_columns]
                    .to_numpy()
                    .ravel(),
                    dtype=object,
                ).dropna()
            )
            # gf rows whose code does not appear in gens for this plant-pm
            is_missing = ~gf_plant_pm["energy_source_code"].isin(esc_in_gens)
            if is_missing.any():
                gf_missing_escs.append(gf_plant_pm.loc[is_missing])

    # pd.concat raises on an empty list, so return an empty frame instead
    if not gf_missing_escs:
        return gf_filtered.iloc[:0].copy()

    # remove duplicate esc entries for each plant-pm
    return pd.concat(gf_missing_escs, axis=0).drop_duplicates()


In [None]:
# Re-initialize `pudl_out` and re-extract the input tables (repeats the load
# done in an earlier cell, so gf/bf/bga/gens/gen are rebound to fresh tables).
pudl_out = load_data.initialize_pudl_out(year=year)

gf, bf, bga, gens, gen = allocate_gen_fuel.extract_input_tables(pudl_out)
# Bring bf, gens and gen to the reporting frequency of `pudl_out`; the
# generators table at that frequency is kept as `gens_at_freq`.
bf, gens_at_freq, gen = allocate_gen_fuel.standardize_input_frequency(
        bf, gens, gen, freq=pudl_out.freq
    )
# NOTE(review): presumably reconciles municipal solid waste codes between the
# tables — confirm against allocate_net_gen.adjust_msw_energy_source_codes.
gens_at_freq = allocate_gen_fuel.adjust_msw_energy_source_codes(gens_at_freq, gf, bf)

In [None]:
# Add the energy source codes that gf reports but gens lacks as extra columns.
# This rebinds `gens_at_freq`, so re-running the cell applies the merge again
# to the already-augmented table.
gens_at_freq = add_missing_energy_source_codes_to_gens(gens_at_freq, gf)

In [None]:
# Spot-check one plant after the new columns were added.
gens_at_freq[gens_at_freq["plant_id_eia"] == 10633] # check 1004, 56798, 10633

In [None]:
# Step-by-step run of the allocation using the augmented `gens_at_freq`.
# Associate the generator, generation, gf, boiler fuel and bga tables.
gen_assoc = allocate_gen_fuel.associate_generator_tables(
    gens=gens_at_freq, gf=gf, gen=gen, bf=bf, bga=bga
)

# Generate a fraction to use to allocate net generation and fuel consumption by.
# These two methods create a column called `frac`, which will be a fraction
# to allocate net generation from the gf table for each `IDX_PM_ESC` group
gen_pm_fuel = allocate_gen_fuel.prep_alloction_fraction(gen_assoc)

# Net generation allocation, piped through the module's own output check
net_gen_alloc = allocate_gen_fuel.allocate_net_gen_by_gen_esc(gen_pm_fuel).pipe(
    allocate_gen_fuel._test_gen_pm_fuel_output, gf=gf, gen=gen
)
allocate_gen_fuel.test_gen_fuel_allocation(gen, net_gen_alloc)

# Fuel allocation
fuel_alloc = allocate_gen_fuel.allocate_fuel_by_gen_esc(gen_pm_fuel)

# ensure that the allocated data has unique merge keys
net_gen_alloc_agg = allocate_gen_fuel.group_duplicate_keys(net_gen_alloc)
fuel_alloc_agg = allocate_gen_fuel.group_duplicate_keys(fuel_alloc)

# Combine the net generation and fuel allocations into one table.
# validate="1:1" makes the merge raise if either side has duplicate keys.
net_gen_fuel_alloc = pd.merge(
    net_gen_alloc_agg,
    fuel_alloc_agg,
    on=allocate_gen_fuel.IDX_GENS_PM_ESC + ["energy_source_code_num"],
    how="outer",
    validate="1:1",
    suffixes=("_net_gen_alloc", "_fuel_alloc"),
).sort_values(allocate_gen_fuel.IDX_GENS_PM_ESC)
# Run the module's comparison of the original gf against the allocated data;
# the return value is discarded.
_ = allocate_gen_fuel.test_original_gf_vs_the_allocated_by_gens_gf(
    gf=gf, gf_allocated=net_gen_fuel_alloc
)

# Keep only the key columns, the ESC number and the data columns.
net_gen_fuel_alloc = net_gen_fuel_alloc.loc[
    :,
    allocate_gen_fuel.IDX_GENS_PM_ESC + ["energy_source_code_num"] + allocate_gen_fuel.DATA_COLUMNS,
]

In [None]:
# Plant-level totals reported in gf.
plant_total_gf = (
    gf[
        [
            "plant_id_eia",
            "net_generation_mwh",
            "fuel_consumed_mmbtu",
            "fuel_consumed_for_electricity_mmbtu",
        ]
    ]
    .groupby("plant_id_eia")
    .sum()
)
# Plant-level totals after allocation to generators.
plant_total_alloc = (
    net_gen_fuel_alloc[
        [
            "plant_id_eia",
            "net_generation_mwh",
            "fuel_consumed_mmbtu",
            "fuel_consumed_for_electricity_mmbtu",
        ]
    ]
    .groupby("plant_id_eia")
    .sum()
)

# Absolute difference between the reported and allocated totals.
plant_total_diff = plant_total_gf.sub(plant_total_alloc).abs()
# Show plants where fuel or net generation differs by more than 10 units.
plant_total_diff.loc[
    plant_total_diff["fuel_consumed_mmbtu"].gt(10)
    | plant_total_diff["net_generation_mwh"].gt(10)
]


In [None]:
# Inspect plant 3406 in the generators table.
gens[gens["plant_id_eia"] == 3406]

In [None]:
# Same plant in the gf table.
gf[gf["plant_id_eia"] == 3406]

In [None]:
# Allocated net generation for plant 2935.
net_gen_alloc[net_gen_alloc["plant_id_eia"] == 2935]

In [None]:
# Plants still to check: 1004, 56798, 10633

In [None]:
# Total fuel consumed as reported in gf ...
gf[["fuel_consumed_mmbtu"]].sum()

In [None]:
# ... and the total after allocation, for comparison with the cell above.
net_gen_fuel_alloc[["fuel_consumed_mmbtu"]].sum()

In [None]:
# Plain assignment, not a copy: both names refer to the same DataFrame object.
gen_fuel_allocated = net_gen_fuel_alloc

In [None]:
# manually update energy source code when OTH
gen_fuel_allocated = data_cleaning.update_energy_source_codes(gen_fuel_allocated)

# Round the three data columns to one decimal place, assigning back in place.
# NOTE(review): if update_energy_source_codes returns the same object it was
# given, this also modifies `net_gen_fuel_alloc` (the two names were aliased in
# the previous cell) — confirm.
gen_fuel_allocated.loc[
    :,
    [
        "net_generation_mwh",
        "fuel_consumed_mmbtu",
        "fuel_consumed_for_electricity_mmbtu",
    ],
] = gen_fuel_allocated.loc[
    :,
    [
        "net_generation_mwh",
        "fuel_consumed_mmbtu",
        "fuel_consumed_for_electricity_mmbtu",
    ],
].round(
    1
)

# Run the project's validation checks on the allocated table.
validation.test_for_missing_energy_source_code(gen_fuel_allocated)
validation.test_for_negative_values(gen_fuel_allocated)



In [None]:
# create a table that identifies the primary fuel of each generator and plant
# This rebinds `primary_fuel_table`, replacing the one returned by clean_eia923.
# NOTE(review): the meaning of the positional `True` is not visible here —
# confirm against data_cleaning.create_primary_fuel_table.
primary_fuel_table = data_cleaning.create_primary_fuel_table(
    gen_fuel_allocated, pudl_out, True, year
)

In [None]:
# Spot-check plant 8023.
primary_fuel_table[primary_fuel_table["plant_id_eia"] == 8023]

In [None]:
def add_missing_energy_source_codes_to_gens(gens_at_freq, gf):
    """Add energy source codes found in `gf` but not in `gens_at_freq` as new columns.

    NOTE(review): this redefines the function of the same name from an earlier
    cell; whichever definition ran last is the one in effect.

    Args:
        gens_at_freq: generators table. Must contain `plant_id_eia` and
            `prime_mover_code`, plus the columns read by
            `identify_missing_gf_escs_in_gens`.
        gf: generation and fuel table, passed through to
            `identify_missing_gf_escs_in_gens`.

    Returns:
        A new DataFrame equal to `gens_at_freq` left-merged with one
        `energy_source_code_<n>` column per missing code. Numbering starts at 7
        (as before), or at the next free number when `gens_at_freq` already has
        an `energy_source_code_<n>` column numbered 7 or higher. When nothing is
        missing, an unmodified copy of `gens_at_freq` is returned.
    """
    key_cols = ["plant_id_eia", "prime_mover_code"]
    esc_prefix = "energy_source_code_"

    gf_missing_escs = identify_missing_gf_escs_in_gens(gens_at_freq, gf)

    # Nothing to add: skip the pivot, which cannot build columns from no rows.
    if gf_missing_escs.empty:
        return gens_at_freq.copy()

    # Find the numbered ESC columns that already exist so that the new columns
    # never reuse a number (a clash would make merge emit _x/_y suffixes).
    used_nums = [
        int(str(col)[len(esc_prefix):])
        for col in gens_at_freq.columns
        if str(col).startswith(esc_prefix) and str(col)[len(esc_prefix):].isdecimal()
    ]
    first_new_num = max([6] + used_nums) + 1

    # Number the missing codes within each plant-pm and pivot them into
    # energy_source_code_<n> columns, one row per plant-pm.
    new_esc_columns = (
        gf_missing_escs.assign(
            num=(
                gf_missing_escs.groupby(key_cols).cumcount() + first_new_num
            ).astype(str)
        )
        .pivot(index=key_cols, columns="num", values="energy_source_code")
        .add_prefix(esc_prefix)
        .rename_axis(columns=None)
        .reset_index()
    )

    # merge these missing fuel columns into the gens data
    return gens_at_freq.merge(new_esc_columns, how="left", on=key_cols)

def identify_missing_gf_escs_in_gens(gens_at_freq, gf):
    """For each plant, identify energy_source_codes that exist in gf but not gens.

    Plant-level variant: a code counts as present if any row of `gens_at_freq`
    for the plant lists it in any column whose name contains `_source_code`,
    regardless of prime mover. A code counts as reported in `gf` only if some
    row for the plant has `fuel_consumed_mmbtu > 0` or `net_generation_mwh != 0`.

    NOTE(review): this redefines the function of the same name from an earlier
    cell; whichever definition ran last is the one in effect.

    Args:
        gens_at_freq: generators table with `plant_id_eia` and the source code
            columns. The plants to check are taken from this table.
        gf: generation and fuel table with `plant_id_eia`, `prime_mover_code`,
            `energy_source_code`, `fuel_consumed_mmbtu` and `net_generation_mwh`.

    Returns:
        DataFrame with columns `plant_id_eia`, `prime_mover_code` and
        `energy_source_code`, de-duplicated, holding every `gf` row at a plant
        whose code is missing from the plant's generators. The frame is empty
        (same columns) when nothing is missing.

    Side effects:
        Prints one line per plant that has missing codes.
    """
    out_cols = ["plant_id_eia", "prime_mover_code", "energy_source_code"]
    esc_columns = list(gens_at_freq.filter(like="_source_code").columns)
    # gf rows that carry non-zero fuel or generation
    has_data = (gf["fuel_consumed_mmbtu"] > 0) | (gf["net_generation_mwh"] != 0)

    gf_missing_escs = []
    # Iterate over the plants of the table passed in, not a notebook global.
    for plant in gens_at_freq["plant_id_eia"].unique():
        at_plant = gf["plant_id_eia"] == plant
        # codes reported for the plant in gf with non-zero fuel or generation
        gf_esc = list(gf.loc[at_plant & has_data, "energy_source_code"].unique())
        # non-null codes listed in any source code column for the plant
        esc_in_gens = set(
            pd.Series(
                gens_at_freq.loc[gens_at_freq["plant_id_eia"] == plant, esc_columns]
                .to_numpy()
                .ravel(),
                dtype=object,
            ).dropna()
        )

        # get a list of fuels that exist in gf but not gens
        esc_missing_from_gens = [esc for esc in gf_esc if esc not in esc_in_gens]
        if esc_missing_from_gens:
            print(f"plant {plant} missing {esc_missing_from_gens} from gens")
            gf_missing_escs.append(
                gf.loc[
                    at_plant & gf["energy_source_code"].isin(esc_missing_from_gens),
                    out_cols,
                ]
            )

    # pd.concat raises on an empty list, so return an empty frame instead
    if not gf_missing_escs:
        return gf.iloc[:0][out_cols].copy()

    # remove duplicate esc entries for each plant-pm
    return pd.concat(gf_missing_escs, axis=0).drop_duplicates()



### Rest of pipeline

In [None]:
# Inspect plant 8023 in the generators table before stacking.
gens_at_freq[gens_at_freq["plant_id_eia"] == 8023]

In [None]:
# Stack the numbered source code columns: the column label goes to
# `energy_source_code_num` and the value to `energy_source_code`
# (per the cat_col / stacked_col arguments passed here).
stack_gens = allocate_gen_fuel.stack_generators(
        gens_at_freq, cat_col="energy_source_code_num", stacked_col="energy_source_code"
    )

In [None]:
# Same plant after stacking.
stack_gens[stack_gens["plant_id_eia"] == 8023]

In [None]:
# Identify, per plant in stack_gens, the energy source codes that have fuel
# consumption in gf but are not listed for any generator at that plant.
gf_with_fuel = gf.loc[gf["fuel_consumed_mmbtu"] > 0]
missing_fuel_frames = []
for plant in stack_gens["plant_id_eia"].unique():
    # codes with fuel consumption reported for the plant in gf
    gf_esc = gf_with_fuel.loc[
        gf_with_fuel["plant_id_eia"] == plant, "energy_source_code"
    ].unique()
    # codes listed for the plant's generators
    gens_esc = set(
        stack_gens.loc[stack_gens["plant_id_eia"] == plant, "energy_source_code"].unique()
    )
    missing_fuels = [esc for esc in gf_esc if esc not in gens_esc]
    if missing_fuels:
        print(f"plant {plant} missing {missing_fuels} from gens")
        # keep every gf row for the plant that uses one of the missing codes
        missing_fuel_frames.append(
            gf[(gf["plant_id_eia"] == plant) & (gf["energy_source_code"].isin(missing_fuels))]
        )

# pd.concat raises on an empty list; fall back to an empty frame with gf's columns
gf_missing_data = (
    pd.concat(missing_fuel_frames, axis=0) if missing_fuel_frames else gf.iloc[:0].copy()
)

In [None]:
# Write the gf rows with unmatched fuel codes to "<year>_missing_fuel.csv" at
# the path returned by outputs_folder (presumably provided by the
# `from filepaths import *` import — confirm).
gf_missing_data.to_csv(outputs_folder(f"{year}_missing_fuel.csv"))

In [None]:
# allocate the boiler fuel data to generators
bf_by_gens = allocate_gen_fuel.allocate_bf_data_to_gens(bf, gens_at_freq, bga)
# Tag the non-key columns with a "_bf_tbl" suffix so their origin stays visible
# after merging.
bf_by_gens = (
    bf_by_gens.set_index(allocate_gen_fuel.IDX_GENS_PM_ESC).add_suffix("_bf_tbl").reset_index()
)
# Keep only the gf data columns and tag them with "_gf_tbl".
# NOTE(review): this overwrites `gf` in place, so the cell is not idempotent —
# a second run would presumably fail because the unsuffixed DATA_COLUMNS no
# longer exist; earlier cells that read gf's original columns also need a
# reload before they can be re-run.
gf = gf.set_index(allocate_gen_fuel.IDX_PM_ESC)[allocate_gen_fuel.DATA_COLUMNS].add_suffix("_gf_tbl").reset_index()

In [None]:
# Associate the stacked generators with the generation table and then with the
# suffixed gf table. Both merges are outer joins, so unmatched rows from either
# side are kept.
gen_assoc = (
    pd.merge(
        stack_gens,
        gen.rename(columns={"net_generation_mwh": "net_generation_mwh_g_tbl"}),
        on=allocate_gen_fuel.IDX_GENS,
        how="outer",
    )
    .merge(
        gf,
        on=allocate_gen_fuel.IDX_PM_ESC,
        how="outer",
        validate="m:1",
        indicator=True,  # used in _allocate_unassociated_records to find unassociated records
    ))

In [None]:
# Apply the module's inactive-generator filter to the associated table.
gen_assoc = gen_assoc.pipe(allocate_gen_fuel.remove_inactive_generators)

In [None]:
# Handle gf records that did not associate with a generator, using the
# `_merge` indicator column created by the previous merge.
gen_assoc = gen_assoc.pipe(
            allocate_gen_fuel._allocate_unassociated_records,
            idx_cols=allocate_gen_fuel.IDX_PM_ESC,
            col_w_unexpected_codes="prime_mover_code",
            data_columns=[f"{col}_gf_tbl" for col in allocate_gen_fuel.DATA_COLUMNS],
        ).drop(columns=["_merge"])  # drop so we can do this again in the bf_summed merge

In [None]:
# Keep only the last row for each plant / energy source code pair.
# NOTE(review): this discards other generators, prime movers and report dates
# sharing the same pair — looks exploratory; confirm it is intended.
gen_assoc = gen_assoc.drop_duplicates(subset=["plant_id_eia", "energy_source_code"], keep="last")

In [None]:
# Inspect the association result for plant 63365.
gen_assoc[gen_assoc["plant_id_eia"] == 63365]

In [None]:
# The remaining cells look at plant 8023 in each input table.
# First rows of the (suffixed) gf table.
gf[gf["plant_id_eia"] == 8023].head(5)

In [None]:
# Boiler fuel table.
bf[bf["plant_id_eia"] == 8023]

In [None]:
# First rows of the bga table.
bga[bga["plant_id_eia"] == 8023].head(5)

In [None]:
# First rows of the generators table.
gens[gens["plant_id_eia"] == 8023].head(5)

In [None]:
# Generation table.
gen[gen["plant_id_eia"] == 8023]