## About this notebook
This notebook tests the EIA-923 data cleaning process for historical years.

In [None]:
# Tell Python where to look for modules.
import sys
import os

%reload_ext autoreload
%autoreload 2

sys.path.append("../../src")
import download_data
import validation
from filepaths import *
import data_cleaning


In [None]:
# Report year processed by the rest of this notebook.
year = 2016

# Check the requested year with the project's validator before doing any work.
validation.validate_year(year)

# 0. Set up directory structure: the shared downloads folder plus a
# year-specific subfolder of the outputs folder.
path_prefix = f"{year}/"
os.makedirs(downloads_folder(), exist_ok=True)
year_outputs_dir = f"{outputs_folder()}{path_prefix}"
os.makedirs(year_outputs_dir, exist_ok=True)

# Download the raw EIA-923 and EIA-860 files for use in NOx/SO2 calculations
# until they are integrated into pudl.
download_data.download_raw_eia860(year)
download_data.download_raw_eia923(year)

print("Cleaning EIA-923 data")
# clean_eia923 returns two objects, unpacked here as the allocated EIA-923
# data and the primary fuel table.
eia923_allocated, primary_fuel_table = data_cleaning.clean_eia923(year, False)

In [None]:
import pudl
import sqlalchemy as sa
import logging
import warnings
import pandas as pd
logger = logging.getLogger(__name__)

In [None]:
# SQLAlchemy connection to the locally downloaded PUDL SQLite database.
pudl_db = "sqlite:///../../data/downloads/pudl/pudl_data/sqlite/pudl.sqlite"
pudl_engine = sa.create_engine(pudl_db)

# PUDL output object at month-start ("MS") frequency, restricted to the
# calendar year selected above.
report_window = {
    "start_date": f"{year}-01-01",
    "end_date": f"{year}-12-31",
}
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine, freq="MS", **report_window)

In [None]:
# Short alias for pudl's net generation allocation module; every allocation
# helper in this cell is called through it.
# NOTE(review): the name suggests the alias is temporary scaffolding - confirm.
remove_this = pudl.analysis.allocate_net_gen
# When True, the allocation outputs further down are trimmed to their id
# columns plus the allocated values; when False all interim columns are kept.
drop_interim_cols = False

# Id columns for generators.
IDX_GENS = ["report_date", "plant_id_eia", "generator_id"]

# Id columns for generator, prime mover & fuel type records.
IDX_GENS_PM_ESC = [*IDX_GENS, "prime_mover_code", "energy_source_code"]

# Id columns for plant & fuel type records.
IDX_ESC = ["report_date", "plant_id_eia", "energy_source_code"]

# Id columns for plant, prime mover & fuel type records.
IDX_PM_ESC = [*IDX_ESC, "prime_mover_code"]

# Id columns for plant, boiler, prime mover & fuel type records.
IDX_B_PM_ESC = [
    "report_date",
    "plant_id_eia",
    "boiler_id",
    "energy_source_code",
    "prime_mover_code",
]

# Id columns for plant, fuel type & PUDL unit records.
IDX_U_ESC = [*IDX_ESC, "unit_id_pudl"]

# Extract all of the tables from pudl_out early in the process and select
# only the columns we need. This is for speed and clarity.
#
# The generators table is requested once and reused, both for the column
# selection and for discovering the numbered energy/startup source code
# columns, instead of invoking the accessor three times in one expression.
gens_full = pudl_out.gens_eia860()
gens = gens_full.loc[
    :,
    IDX_GENS
    + [
        "prime_mover_code",
        "unit_id_pudl",
        "capacity_mw",
        "fuel_type_count",
        "operational_status",
        "retirement_date",
    ]
    # every column whose name contains "energy_source_code"
    + list(gens_full.filter(like="energy_source_code"))
    # every column whose name contains "startup_source_code"
    + list(gens_full.filter(like="startup_source_code")),
]
# Generation & fuel table: keep the plant / prime mover / fuel id columns plus
# the reported generation and fuel values, then pass the result through
# pudl's manual energy source code fixes.
gf_columns = IDX_PM_ESC + [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
]
gf = remove_this.manually_fix_energy_source_codes(
    pudl_out.gf_eia923().loc[:, gf_columns]
)
# Boiler fuel table, left-joined to the boiler entity table on plant and boiler id.
# NOTE(review): the join presumably supplies a column required by
# IDX_B_PM_ESC (e.g. prime_mover_code) - confirm against the PUDL schema.
bf_reported = pudl_out.bf_eia923()
boilers_entity = pd.read_sql("boilers_entity_eia", pudl_out.pudl_engine)
bf_with_entity = bf_reported.merge(
    boilers_entity,
    how="left",
    on=["plant_id_eia", "boiler_id"],
)

# Keep only the id columns and the fuel value, then hand the table to pudl's
# helper for distributing annually reported data to months.
bf = remove_this.distribute_annually_reported_data_to_months(
    bf_with_entity.loc[:, IDX_B_PM_ESC + ["fuel_consumed_mmbtu"]],
    key_columns=["plant_id_eia", "boiler_id", "energy_source_code"],
    data_column_name="fuel_consumed_mmbtu",
)

# Allocate the boiler fuel data to generators.
bf = remove_this.allocate_bf_data_to_gens(bf, gens, pudl_out)

# Add any startup energy source codes to the list of energy source codes
# and fix MSW codes.
gens = remove_this.adjust_energy_source_codes(gens, gf, bf)

# Generators with no prime mover code are not repaired here; they are only
# reported, because they will lead to incorrect allocation.
missing_pm = gens.loc[gens["prime_mover_code"].isna()]
if not missing_pm.empty:
    warnings.warn(
        f"{len(missing_pm)} generators are missing prime mover codes in gens_eia860. "
        "This will result in incorrect allocation."
    )
    # Log the identifying columns of the affected generators.
    missing_pm_report_cols = [
        "report_date",
        "plant_id_eia",
        "generator_id",
        "prime_mover_code",
        "unit_id_pudl",
        "operational_status",
        "energy_source_code_1",
    ]
    logger.info(missing_pm[missing_pm_report_cols])
# For monthly output, duplicate each entry in the gens table 12 times so that
# there is one entry for each month of the year.
if pudl_out.freq == "MS":
    generator_keys = ["plant_id_eia", "generator_id"]
    gens = pudl.helpers.expand_timeseries(df=gens, key_cols=generator_keys, freq="MS")

# Generator-level net generation as originally reported, restricted to the
# generator id columns plus the generation value.
gen_reported = pudl_out.gen_original_eia923().loc[:, IDX_GENS + ["net_generation_mwh"]]
# Removes 4 records with NaN generator_id as of pudl v0.5.
gen_reported = gen_reported.dropna(subset=IDX_GENS)
gen = remove_this.distribute_annually_reported_data_to_months(
    gen_reported,
    key_columns=["plant_id_eia", "generator_id"],
    data_column_name="net_generation_mwh",
)

# The gen table is missing some generator ids. Fill them in from the gens
# table with an outer merge, which leaves net generation missing for the
# generators that were added.
gen_merge_keys = ["plant_id_eia", "generator_id", "report_date"]
gen = gen.merge(gens[gen_merge_keys], how="outer", on=gen_merge_keys)

# Associate the generation & fuel, generation, generators and boiler fuel
# tables with one another.
association_inputs = {
    "gf": gf,
    "gen": gen,
    "gens": gens,
    "bf": bf,
    "pudl_out": pudl_out,
}
gen_assoc = remove_this.associate_generator_tables(**association_inputs)

# Generate a fraction to use to allocate net generation and fuel consumption by.
# The result carries a column called `frac`, which is the fraction used to
# allocate net generation from the gf table within each `IDX_PM_ESC` group.
gen_pm_fuel = remove_this.prep_alloction_fraction(gen_assoc)

# Net generation allocation: allocate by generator and energy source code,
# then pass the result through pudl's output check, whose return value is
# kept as the allocated table.
net_gen_unchecked = remove_this.allocate_net_gen_by_gen_esc(gen_pm_fuel)
net_gen_alloc = remove_this._test_gen_pm_fuel_output(net_gen_unchecked, gf=gf, gen=gen)

# Second check, comparing the allocated table against the gen table.
remove_this._test_gen_fuel_allocation(gen, net_gen_alloc)

# Optionally drop all of the columns that were only needed to get to the
# `frac` column.
if drop_interim_cols:
    net_gen_keep_cols = IDX_PM_ESC + [
        "generator_id",
        "energy_source_code_num",
        "net_generation_mwh",
    ]
    net_gen_alloc = net_gen_alloc.loc[:, net_gen_keep_cols]

# Fuel allocation, computed from the same fraction table as net generation.
fuel_alloc = remove_this.allocate_fuel_by_gen_esc(gen_pm_fuel)

# Optionally trim the result to the id columns plus the allocated fuel values.
if drop_interim_cols:
    fuel_keep_cols = IDX_PM_ESC + [
        "generator_id",
        "energy_source_code_num",
        "fuel_consumed_mmbtu",
        "fuel_consumed_for_electricity_mmbtu",
    ]
    fuel_alloc = fuel_alloc.loc[:, fuel_keep_cols]