# About this notebook
This notebook is used for testing code related to allocating the EIA-923 data before integrating it into PUDL and the data pipeline. 

I've also been using this to simply load EIA tables from the PUDL database to explore the EIA data as it exists in PUDL.

In [None]:
%reload_ext autoreload
%autoreload 2

# Useful high-level external modules.
import numpy as np
import pandas as pd
import sqlalchemy as sa
import warnings
import plotly.express as px

import pudl

# Tell Python where to look for modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import data_cleaning
import load_data
import pudl.analysis.allocate_net_gen as allocate_gen_fuel


In [None]:
# Reporting year handed to load_data.initialize_pudl_out.
year = 2020
# PUDL output object; the cells below query it for the EIA-860 / EIA-923 tables.
pudl_out = load_data.initialize_pudl_out(year)

In [None]:
# Every id-column set below starts with the same two columns.
_IDX_DATE_PLANT = ["report_date", "plant_id_eia"]

IDX_GENS = _IDX_DATE_PLANT + ["generator_id"]
"""Id columns for generators."""

# Generator id columns plus the prime mover and energy source code columns.
IDX_GENS_PM_ESC = IDX_GENS + ["prime_mover_code", "energy_source_code"]

IDX_PM_ESC = _IDX_DATE_PLANT + ["energy_source_code", "prime_mover_code"]
"""Id columns for plant, prime mover & fuel type records."""

IDX_B_PM_ESC = _IDX_DATE_PLANT + [
    "boiler_id",
    "energy_source_code",
    "prime_mover_code",
]
"""Id columns for plant, boiler, prime mover & fuel type records."""

# Date, plant and energy source code columns.
IDX_ESC = _IDX_DATE_PLANT + ["energy_source_code"]

# IDX_ESC plus the PUDL unit id column.
IDX_U_ESC = IDX_ESC + ["unit_id_pudl"]

In [None]:
# Pull each input table out of pudl_out up front and keep only the columns
# used further down. This is for speed and clarity.
#
# gens: the generator id columns, a fixed set of attribute columns, and every
# column whose name contains "energy_source_code" or "startup_source_code".
gens = pudl_out.gens_eia860().pipe(
    lambda all_gens: all_gens.loc[
        :,
        [
            *IDX_GENS,
            "prime_mover_code",
            "unit_id_pudl",
            "capacity_mw",
            "fuel_type_count",
            "operational_status",
            "retirement_date",
            *all_gens.filter(like="energy_source_code"),
            *all_gens.filter(like="startup_source_code"),
        ],
    ]
)
# gf: the IDX_PM_ESC id columns plus the net generation and fuel consumption
# columns from gf_eia923, passed through
# allocate_gen_fuel.manually_fix_energy_source_codes.
gf = allocate_gen_fuel.manually_fix_energy_source_codes(
    pudl_out.gf_eia923().loc[
        :,
        [
            *IDX_PM_ESC,
            "net_generation_mwh",
            "fuel_consumed_mmbtu",
            "fuel_consumed_for_electricity_mmbtu",
        ],
    ]
)
# bf: bf_eia923 left-joined to the boilers_entity_eia table on plant and
# boiler id, trimmed to the IDX_B_PM_ESC id columns plus fuel_consumed_mmbtu,
# then handed to distribute_annually_reported_data_to_months.
bf = allocate_gen_fuel.distribute_annually_reported_data_to_months(
    pudl_out.bf_eia923()
    .merge(
        pd.read_sql("boilers_entity_eia", pudl_out.pudl_engine),
        on=["plant_id_eia", "boiler_id"],
        how="left",
    )
    .loc[:, [*IDX_B_PM_ESC, "fuel_consumed_mmbtu"]],
    key_columns=["plant_id_eia", "boiler_id", "energy_source_code"],
    data_column_name="fuel_consumed_mmbtu",
)
# bga: boiler-generator associations, reduced to the plant / boiler /
# generator / date key columns.
bga = pudl_out.bga_eia860().loc[
    :, ["plant_id_eia", "boiler_id", "generator_id", "report_date"]
]

# Allocate the boiler fuel data to generators. From here on bf holds the
# output of allocate_bf_data_to_gens rather than the raw boiler table.
bf = allocate_gen_fuel.allocate_bf_data_to_gens(bf, gens, bga)

# Add any startup energy source codes to the list of energy source codes and
# fix MSW codes. This runs after the allocation above because it takes the
# allocated bf as an input.
gens = allocate_gen_fuel.adjust_energy_source_codes(gens, gf, bf)


# Check gens for generators that have no prime mover code. Nothing is
# repaired here: a warning is emitted so the gap is visible, because (per the
# message) it results in incorrect allocation.
missing_pm = gens[gens["prime_mover_code"].isna()]
if len(missing_pm) > 0:
    warnings.warn(
        f"{len(missing_pm)} generators are missing prime mover codes in gens_eia860. "
        "This will result in incorrect allocation."
    )

# When pudl_out is at monthly frequency, duplicate each entry in the gens
# table 12 times so that there is an entry for each month of the year.
if pudl_out.freq == "MS":
    gens = pudl.helpers.expand_timeseries(
        df=gens,
        key_cols=["plant_id_eia", "generator_id"],
        freq="MS",
    )

# gen: generator-level net generation from gen_original_eia923, restricted to
# the generator id columns plus net_generation_mwh, then handed to
# distribute_annually_reported_data_to_months.
gen = allocate_gen_fuel.distribute_annually_reported_data_to_months(
    pudl_out.gen_original_eia923()
    .loc[:, [*IDX_GENS, "net_generation_mwh"]]
    # removes 4 records with NaN generator_id as of pudl v0.5
    .dropna(subset=IDX_GENS),
    key_columns=["plant_id_eia", "generator_id"],
    data_column_name="net_generation_mwh",
)

# The gen table is missing some generator ids. Outer-merge the key columns
# from gens so those generators still get a row; only keys are brought in, so
# their net generation is left as a missing value.
gen = pd.merge(
    gen,
    gens[["plant_id_eia", "generator_id", "report_date"]],
    on=["plant_id_eia", "generator_id", "report_date"],
    how="outer",
)