# Testing PUDL implementation

In [71]:
%reload_ext autoreload
%autoreload 2

import logging
import sys
import warnings

sys.path.append('../../hourly-egrid/')

# Useful high-level external modules.
import numpy as np
import pandas as pd
import sqlalchemy as sa

import pudl

import src.data_cleaning as data_cleaning


In [3]:
# Reporting year; it drives the start/end dates handed to PUDL below.
year = 2020

# SQLAlchemy URL of the local PUDL SQLite database. The path is relative, so it
# depends on the directory the notebook is run from.
pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

# PUDL output object restricted to the reporting year.
# NOTE(review): freq='MS' presumably requests month-start (monthly) records,
# which matches the monthly report_date values shown further down - confirm
# against the PudlTabl documentation.
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine,
    freq='MS',
    start_date=f'{year}-01-01',
    end_date=f'{year}-12-31'
)

In [53]:
# Run the entire allocation in a single call to the PUDL module.
# NOTE(review): drop_interim_cols=True presumably trims the result to the final
# id and data columns (the rendered table below shows only those) - confirm.
gen_allocated = pudl.analysis.allocate_net_gen.allocate_gen_fuel_by_generator_energy_source(pudl_out, drop_interim_cols=True)

  "'frac' column isn't adding up to 1 for each 'IDX_PM_ESC' "
  "'frac' column isn't adding up to 1 for each 'IDX_PM_ESC' "


In [59]:
# Display the allocated generator / energy-source table produced above.
gen_allocated

Unnamed: 0,report_date,plant_id_eia,energy_source_code,generator_id,energy_source_code_num,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu
80938,2020-01-01,1,DFO,1,energy_source_code_1,,,
80939,2020-01-01,1,DFO,2,energy_source_code_1,,,
80940,2020-01-01,1,DFO,3,energy_source_code_1,,,
80941,2020-01-01,1,DFO,5,energy_source_code_1,,,
80942,2020-01-01,1,WND,WT1,energy_source_code_1,15.1585,133.0,133.0
...,...,...,...,...,...,...,...,...
350451,2020-12-01,64816,NG,GEN1,energy_source_code_1,,,
350452,2020-12-01,64816,NG,GEN2,energy_source_code_1,,,
350453,2020-12-01,64816,NG,GEN3,energy_source_code_1,,,
350454,2020-12-01,64836,SUN,CATAL,energy_source_code_1,,,


In [105]:
# Generator primary fuel: the energy source code that sits in each generator's
# `energy_source_code_1` slot, one row per plant/generator.
gen_primary_fuel = (
    gen_allocated.loc[
        gen_allocated['energy_source_code_num'] == 'energy_source_code_1',
        ['plant_id_eia', 'generator_id', 'energy_source_code'],
    ]
    .drop_duplicates(subset=['plant_id_eia', 'generator_id'])
    .rename(columns={'energy_source_code': 'generator_primary_fuel'})
)

# Total annual heat input by fuel type for each plant.
# The fuel column is selected BEFORE summing: summing the whole frame also tries
# to add up non-numeric columns (report_date, generator_id, ...), which is
# wasted work and raises on datetime columns in recent pandas versions.
plant_primary_fuel = (
    gen_allocated.groupby(['plant_id_eia', 'energy_source_code'])[['fuel_consumed_mmbtu']]
    .sum()
    .reset_index()
)

# Keep exactly one fuel per plant: the one with the greatest fuel consumption.
# idxmax returns the first row holding the maximum, so a tie (e.g. a plant whose
# fuels all sum to 0 because no consumption was reported) resolves to the first
# code in the sorted groupby order instead of producing several rows that would
# duplicate every generator of that plant in the merge below.
plant_primary_fuel = plant_primary_fuel.loc[
    plant_primary_fuel.groupby('plant_id_eia')['fuel_consumed_mmbtu'].idxmax(),
    ['plant_id_eia', 'energy_source_code'],
]

# rename the column to plant primary fuel
plant_primary_fuel = plant_primary_fuel.rename(columns={'energy_source_code': 'plant_primary_fuel'})

# Merge the plant primary fuel into the gen primary fuel. validate='m:1' makes
# pandas raise if a plant ever appears more than once on the right-hand side.
primary_fuel_table = gen_primary_fuel.merge(
    plant_primary_fuel, how='left', on='plant_id_eia', validate='m:1'
)

primary_fuel_table

Unnamed: 0,plant_id_eia,generator_id,generator_primary_fuel,plant_primary_fuel
0,1,1,DFO,WND
1,1,2,DFO,WND
2,1,3,DFO,WND
3,1,5,DFO,WND
4,1,WT1,WND,WND
...,...,...,...,...
23735,64816,GEN1,NG,NG
23736,64816,GEN2,NG,NG
23737,64816,GEN3,NG,NG
23738,64836,CATAL,SUN,SUN


In [67]:
# calculate emissions
# TODO: update fuel codes
# NOTE: this rebinds `gen_allocated` to the function's result, so the cell is
# not idempotent - running it again feeds the already-augmented frame back in.
# The rendered result below shows two added columns: co2_mass_tons and
# co2_mass_tons_adjusted.
gen_allocated = data_cleaning.calculate_co2_from_fuel_consumption(gen_allocated, year)
gen_allocated

Unnamed: 0,report_date,plant_id_eia,energy_source_code,generator_id,energy_source_code_num,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_tons,co2_mass_tons_adjusted
0,2020-01-01,1,DFO,1,energy_source_code_1,,,,,
1,2020-01-01,1,DFO,2,energy_source_code_1,,,,,
2,2020-01-01,1,DFO,3,energy_source_code_1,,,,,
3,2020-01-01,1,DFO,5,energy_source_code_1,,,,,
4,2020-01-01,1,WND,WT1,energy_source_code_1,15.1585,133.0,133.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
350527,2020-12-01,64816,NG,GEN1,energy_source_code_1,,,,,
350528,2020-12-01,64816,NG,GEN2,energy_source_code_1,,,,,
350529,2020-12-01,64816,NG,GEN3,energy_source_code_1,,,,,
350530,2020-12-01,64836,SUN,CATAL,energy_source_code_1,,,,,


In [68]:
# Spot-check the allocated records of plant 3 for February 2020.
gen_allocated.loc[
    gen_allocated['plant_id_eia'].eq(3)
    & gen_allocated['report_date'].eq("2020-02-01")
]

Unnamed: 0,report_date,plant_id_eia,energy_source_code,generator_id,energy_source_code_num,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_tons,co2_mass_tons_adjusted
29275,2020-02-01,3,BIT,4,energy_source_code_1,-17.617499,0.0,0.0,0.0,0.0
29276,2020-02-01,3,BIT,5,energy_source_code_1,-134.725501,1344.0,1344.0,138.37824,138.37824
29277,2020-02-01,3,NG,1,energy_source_code_1,1758.034399,28612.72,28612.72,1672.127265,1672.127265
29278,2020-02-01,3,NG,2,energy_source_code_1,-176.090544,0.0,0.0,0.0,0.0
29279,2020-02-01,3,NG,4,energy_source_code_2,-575.16531,0.0,0.0,0.0,0.0
29280,2020-02-01,3,NG,5,energy_source_code_2,-4398.435545,1309.282,1309.282,76.514415,76.514415
29281,2020-02-01,3,NG,A1CT,energy_source_code_1,118310.0,1187371.0,1187371.0,69389.966501,69389.966501
29282,2020-02-01,3,NG,A1CT2,energy_source_code_1,108879.0,1187371.0,1187371.0,69389.966501,69389.966501
29283,2020-02-01,3,NG,A1ST,energy_source_code_1,119322.0,41325.15,41325.15,2415.041568,2415.041568
29284,2020-02-01,3,NG,A2C1,energy_source_code_1,124713.0,1282524.0,1282524.0,74950.726519,74950.726519


In [106]:
# Collapse the per-energy-source records to one row per generator and month by
# summing the listed data columns.
# Disabled alternative kept for reference:
#   pudl.analysis.allocate_net_gen.aggregate_gen_fuel_by_generator(pudl_out, gen_allocated, sum_cols=[...])
gen_aggregated = pudl.analysis.allocate_net_gen.agg_by_generator(
    gen_allocated,
    sum_cols=[
        "net_generation_mwh",
        "fuel_consumed_mmbtu",
        "fuel_consumed_for_electricity_mmbtu",
        "co2_mass_tons",
        "co2_mass_tons_adjusted",
    ],
)

In [107]:
# Same spot-check as for the allocated table: plant 3, February 2020, now
# aggregated to one row per generator.
gen_aggregated.loc[
    gen_aggregated['plant_id_eia'].eq(3)
    & gen_aggregated['report_date'].eq("2020-02-01")
]

Unnamed: 0,report_date,plant_id_eia,generator_id,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_tons,co2_mass_tons_adjusted
23485,2020-02-01,3,1,1758.034399,28612.72,28612.72,1672.127265,1672.127265
23486,2020-02-01,3,2,-176.090544,0.0,0.0,0.0,0.0
23487,2020-02-01,3,4,-592.782809,0.0,0.0,0.0,0.0
23488,2020-02-01,3,5,-4533.161046,2653.282,2653.282,214.892655,214.892655
23489,2020-02-01,3,A1CT,118310.0,1187371.0,1187371.0,69389.966501,69389.966501
23490,2020-02-01,3,A1CT2,108879.0,1187371.0,1187371.0,69389.966501,69389.966501
23491,2020-02-01,3,A1ST,119322.0,41325.15,41325.15,2415.041568,2415.041568
23492,2020-02-01,3,A2C1,124713.0,1282524.0,1282524.0,74950.726519,74950.726519
23493,2020-02-01,3,A2C2,125380.0,1282524.0,1282524.0,74950.726519,74950.726519
23494,2020-02-01,3,A2ST,127482.0,44636.85,44636.85,2608.577712,2608.577712


In [41]:
# Short alias for the PUDL allocation module; the cells below call its helpers
# one step at a time through this name.
remove_this = pudl.analysis.allocate_net_gen
# With this flag off, the `if drop_interim_cols:` branches in the step-by-step
# allocation cell are skipped, so the intermediate columns stay available.
drop_interim_cols = False

In [40]:
# Id columns for generators.
IDX_GENS = ["report_date", "plant_id_eia", "generator_id"]

# Id columns for plant, prime mover & fuel type records.
IDX_PM_ESC = ["report_date", "plant_id_eia", "energy_source_code", "prime_mover_code"]

# Id columns for plant & fuel type records.
IDX_ESC = ["report_date", "plant_id_eia", "energy_source_code"]

# Id columns for plant, fuel type & PUDL unit records.
IDX_U_ESC = ["report_date", "plant_id_eia", "energy_source_code", "unit_id_pudl"]

# extract all of the tables from pudl_out early in the process and select
# only the columns we need. this is for speed and clarity.
# gf keeps the id columns plus the data columns from generation_fuel_eia923
# that are being allocated.
gf = (
    pudl_out.gf_eia923()
    .loc[:, IDX_PM_ESC + ["net_generation_mwh", "fuel_consumed_mmbtu","fuel_consumed_for_electricity_mmbtu"]]
    .pipe(remove_this.group_msw_codes, IDX_PM_ESC) #TODO: we may not want to group MSW codes since these have different emissions
)
gen = (
    pudl_out.gen_original_eia923().loc[:, IDX_GENS + ["net_generation_mwh"]]
    # removes 4 records with NaN generator_id as of pudl v0.5
    .dropna(subset=IDX_GENS)
)
# fetch the generators table once and reuse it for the two column-name filters
# (previously gens_eia860() was called three times in this statement)
gens = pudl_out.gens_eia860()
gens = gens.loc[
    :,
    IDX_GENS
    + [
        "prime_mover_code",
        "unit_id_pudl",
        "capacity_mw",
        "fuel_type_count",
        "operational_status",
        "retirement_date",
    ]
    + list(gens.filter(like="energy_source_code"))
    + list(gens.filter(like="startup_source_code")),
]
# add any startup energy source codes to the list of energy source codes
gens = remove_this.add_startup_energy_sources_to_esc_columns(gens)
# get a table of generator primary fuel that will be used later
#gen_primary_fuel = gens.copy()[['plant_id_eia','generator_id','energy_source_code_1']]
# repeat each entry so there is one record for each month of the year
gens = remove_this.create_monthly_gens_records(gens)
# fix prime mover codes in gens so that they match the codes in the gf table
gens = remove_this.manually_fix_prime_movers(gens)
# the gen table is missing some generator ids. Let's fill this using the gens table, leaving a missing value for net generation
gen = gen.merge(gens[['plant_id_eia','generator_id','report_date']], how='outer', on=['plant_id_eia','generator_id','report_date'])
# boiler fuel, joined to the boiler entity table on plant/boiler id before the
# unit-level id columns are selected
bf = (
    pudl_out.bf_eia923().merge(pd.read_sql("boilers_entity_eia", pudl_out.pudl_engine), how='left', on=['plant_id_eia','boiler_id'])
    .loc[:, IDX_U_ESC + ["fuel_consumed_mmbtu"]]
    #.pipe(remove_bf_nulls) # NOTE: this was causing an issue with allocation
    .pipe(remove_this.group_msw_codes, IDX_U_ESC)
)

In [81]:
# Inspect the monthly generator records of plant 50626.
gens.loc[gens['plant_id_eia'].eq(50626)]

Unnamed: 0,report_date,plant_id_eia,generator_id,prime_mover_code,unit_id_pudl,capacity_mw,fuel_type_count,operational_status,retirement_date,energy_source_code_1,energy_source_code_2,energy_source_code_3,energy_source_code_4,energy_source_code_5,energy_source_code_6,energy_source_code_7,energy_source_code_8
16525,2020-01-01,50626,GEN1,OT,,5.7,1,existing,NaT,OTH,,,,,,,
16525,2020-02-01,50626,GEN1,OT,,5.7,1,existing,NaT,OTH,,,,,,,
16525,2020-03-01,50626,GEN1,OT,,5.7,1,existing,NaT,OTH,,,,,,,
16525,2020-04-01,50626,GEN1,OT,,5.7,1,existing,NaT,OTH,,,,,,,
16525,2020-05-01,50626,GEN1,OT,,5.7,1,existing,NaT,OTH,,,,,,,
16525,2020-06-01,50626,GEN1,OT,,5.7,1,existing,NaT,OTH,,,,,,,
16525,2020-07-01,50626,GEN1,OT,,5.7,1,existing,NaT,OTH,,,,,,,
16525,2020-08-01,50626,GEN1,OT,,5.7,1,existing,NaT,OTH,,,,,,,
16525,2020-09-01,50626,GEN1,OT,,5.7,1,existing,NaT,OTH,,,,,,,
16525,2020-10-01,50626,GEN1,OT,,5.7,1,existing,NaT,OTH,,,,,,,


In [46]:
# Step-by-step version of the allocation, calling the PUDL helpers one at a time
# on the gf / gen / gens / bf tables prepared in the extraction cell.
# NOTE: the recorded output of this cell shows the warning
# "'frac' column isn't adding up to 1 for each 'IDX_PM_ESC'" twice.
gen_assoc = remove_this.associate_generator_tables(gf=gf, gen=gen, gens=gens, bf=bf)

# Generate a fraction to use to allocate net generation by.
# These two methods create a column called `frac`, which will be a fraction
# to allocate net generation from the gf table for each `IDX_PM_ESC` group
gen_pm_fuel = remove_this.prep_alloction_fraction(gen_assoc)

# Net gen allocation
# The allocated frame is piped through PUDL's `_test_gen_pm_fuel_output` check
# (which also receives gf and gen) before being stored.
net_gen_alloc = remove_this.allocate_net_gen_by_gen_esc(gen_pm_fuel).pipe(
    remove_this._test_gen_pm_fuel_output, gf=gf, gen=gen
)

# Second PUDL check; its return value is not used here.
remove_this._test_gen_fuel_allocation(gen, net_gen_alloc)

# drop all of the columns we needed to get to the `frac` column
# (only runs when `drop_interim_cols` is truthy)
if drop_interim_cols:
    net_gen_alloc = net_gen_alloc.loc[
        :,
        IDX_ESC
        + [
            "generator_id",
            "energy_source_code_num",
            "net_generation_mwh",
        ],
    ]

# fuel allocation
# Uses the same `gen_pm_fuel` frame as the net generation allocation above.
fuel_alloc = remove_this.allocate_fuel_by_gen_esc(gen_pm_fuel)
if drop_interim_cols:
    # keep only the id columns and the two allocated fuel columns
    fuel_alloc = fuel_alloc.loc[
        :,
        IDX_ESC
        + [
            "generator_id",
            "energy_source_code_num",
            "fuel_consumed_mmbtu",
            "fuel_consumed_for_electricity_mmbtu"
        ],
    ]

  "'frac' column isn't adding up to 1 for each 'IDX_PM_ESC' "
  "'frac' column isn't adding up to 1 for each 'IDX_PM_ESC' "


### Step 1

In [25]:
from typing import List

def remove_bf_nulls(bf: pd.DataFrame):
    """
    Drop boiler fuel records that cannot be used for the allocation.

    Two filters are applied:

    * records whose ``fuel_consumed_mmbtu`` is exactly 0 are dropped, because
      there is nothing to allocate from them;
    * records with a null ``unit_id_pudl`` are dropped, because the
      association with generators happens on the unit.

    NOTE(review): the previous summary said null fuel values are removed too,
    but the filter has always kept them (``NaN != 0`` is True). That behaviour
    is preserved here - confirm which one is intended.

    Args:
        bf: boiler fuel table with ``unit_id_pudl`` and
            ``fuel_consumed_mmbtu`` columns.

    Returns:
        a filtered copy of ``bf``. An empty ``bf`` is returned as an empty
        copy instead of failing on a division by zero.

    Raises:
        AssertionError: if more than 6% of the records have no
            ``unit_id_pudl``.
    """
    if len(bf) == 0:
        return bf.copy()
    null_unit_share = bf.unit_id_pudl.isnull().mean()
    if null_unit_share > 0.06:
        raise AssertionError(
            "There are more than 6% of boiler fuel records without a "
            f"unit_id_pudl ({null_unit_share:.1%} of {len(bf)} records)."
        )
    bf = (
        # NaN != 0 evaluates to True, so null fuel values survive this filter
        bf[(bf.fuel_consumed_mmbtu != 0) | bf.fuel_consumed_mmbtu.isnull()]
        .dropna(subset=["unit_id_pudl"])
        .copy()
    )
    return bf


def group_msw_codes(df: pd.DataFrame, idx: List[str]):
    """
    Group the municipal solid waste codes as ``MSW``.

    The generators table does not contain the energy_source_code's ``MSB``
    (municipal_solid_waste_biogenic) or ``MSN`` (municipal_solid_nonbiogenic).
    Group these codes with the more generic ``MSW`` so we can associate the
    records with the generators.

    Args:
        df: table with an ``energy_source_code`` column and the ``idx``
            columns; the remaining columns are summed within each group.
        idx: id columns that define one record after the codes are merged.

    Returns:
        a new frame holding the untouched non-MSW rows followed by the grouped
        MSW rows (one row per ``idx`` combination, null sums kept as null via
        ``min_count=1``). ``df`` itself is not modified.
    """
    # obtained locally so the function does not rely on a module-level `logger`
    logger = logging.getLogger(__name__)
    if len(df) == 0:
        # nothing to regroup; also avoids dividing by zero in the log line below
        return df.copy()
    mask_msw = df.energy_source_code.isin(["MSN", "MSB", "MSW"])
    msw_grouped = (
        df.loc[mask_msw]
        .replace({"energy_source_code": {"MSN": "MSW", "MSB": "MSW"}})
        .groupby(idx, as_index=False, dropna=False)
        .sum(min_count=1)
    )
    df_out = pd.concat(
        [df.loc[~mask_msw], msw_grouped],
        axis=0,
    )
    # the percentage is the share of rows that disappeared through the grouping
    logger.info(
        f"Replaced {1-len(df_out)/len(df):.1%} MSN and MSB energy_source_codes w/ MSW"
    )
    return df_out


def allocate_fuel_by_gen_esc(gen_pm_fuel, gf, bf):
    """
    Allocate fuel consumption by generator/energy_source_code.

    The records are split on the boolean ``in_bf_tbl`` column, each half is
    allocated with its own helper, and the two results are concatenated and
    sorted by ``IDX_U_ESC``.

    Args:
        gen_pm_fuel (pandas.DataFrame): generator records carrying the
            ``in_bf_tbl`` flag and the columns the two helpers read.
        gf (pandas.DataFrame): generation fuel table; only the sum of its
            ``fuel_consumed_mmbtu`` is used, for the coverage log lines.
        bf (pandas.DataFrame): boiler fuel table; only the sum of its
            ``fuel_consumed_mmbtu`` is used, for the coverage log lines.

    Returns:
        pandas.DataFrame: the combined allocation.
    """
    # obtained locally so the function does not rely on a module-level `logger`
    logger = logging.getLogger(__name__)
    not_in_bf_tbl = gen_pm_fuel.loc[~gen_pm_fuel.in_bf_tbl]
    # we still need to develop a process for the gens that do show up in the BF table
    in_bf_tbl = gen_pm_fuel.loc[gen_pm_fuel.in_bf_tbl]

    not_in_bf_tbl = allocate_fuel_for_non_bf_gens(not_in_bf_tbl, debug=False)
    in_bf_tbl = allocate_fuel_for_in_bf_gens(in_bf_tbl, debug=False)

    # log coverage of each half against the fuel it was supposed to absorb:
    # the gf fuel that is not covered by the bf table goes to the not-in-bf records
    not_in_bf_fuel_sum = gf.fuel_consumed_mmbtu.sum() - bf.fuel_consumed_mmbtu.sum()
    logger.info(
        # trailing space added: the message used to read "...%of the fuel"
        f"{not_in_bf_tbl.fuel_consumed_mmbtu.sum()/ not_in_bf_fuel_sum:.02%} "
        "of the fuel from the gf table has been allocated to records not in the bf table"
    )
    logger.info(
        f"{in_bf_tbl.fuel_consumed_mmbtu.sum() / bf.fuel_consumed_mmbtu.sum():.02%} "
        "of the fuel from the bf table has been allocated"
    )
    # squish the two methods together
    fuel_alloc = pd.concat([in_bf_tbl, not_in_bf_tbl]).sort_values(IDX_U_ESC)
    logger.info(
        f"{fuel_alloc.fuel_consumed_mmbtu.sum() / gf.fuel_consumed_mmbtu.sum():.02%} "
        "of the fuel from the gf table has been allocated"
    )
    return fuel_alloc


def test_frac_cap_in_bf(in_bf_tbl, debug=False):
    """
    Test the frac_cap column for records w/ BF data.

    ``frac_cap`` is summed within each ``IDX_ESC`` + ``unit_id_pudl`` group
    (null keys form their own groups) and every record whose group total is
    not close to 1 counts as a failure.

    Args:
        in_bf_tbl (pandas.DataFrame): records with a ``frac_cap`` column and
            the grouping columns.
        debug (bool): if True, failures only emit a warning and the group
            totals are attached to ``in_bf_tbl`` in place as a
            ``frac_cap_test`` column for inspection. If False, ``in_bf_tbl``
            is left untouched. (Previously the helper column was always added
            and the attempt to drop it only rebound a local name, so it leaked
            into the caller's frame.)

    Raise:
        AssertionError: if `frac_cap` does not sum to 1 within
            each plant/fuel group (via `IDX_ESC`) and ``debug`` is False.
    """
    # obtained locally so the function does not rely on a module-level `logger`
    logger = logging.getLogger(__name__)
    # frac_cap for each fuel group should sum to 1.
    # "sum" is passed by name: handing the builtin `sum` to transform together
    # with `min_count` is deprecated in recent pandas versions.
    group_total = in_bf_tbl.groupby(
        IDX_ESC + ["unit_id_pudl"], dropna=False
    )["frac_cap"].transform("sum", min_count=1)
    if debug:
        in_bf_tbl["frac_cap_test"] = group_total

    frac_cap_test = in_bf_tbl[~np.isclose(group_total, 1)]
    if not frac_cap_test.empty:
        message = (
            "Mayday! Mayday! The `frac_cap` test has failed. We have "
            f"{len(frac_cap_test)} records who's `frac_cap` isn't summing to 1"
            " in each plant/fuel group. Check creation of "
            "`capacity_mw_fuel_in_bf_tbl_group` column in "
            "`prep_alloction_fraction()` or assignment of `frac_cap` in "
            "`allocate_fuel_for_in_bf_gens()`"
        )
        if debug:
            warnings.warn(message)
        else:
            raise AssertionError(message)
    else:
        logger.info("You've passed the frac_cap test for the `in_bf_tbl` records")

def allocate_fuel_for_in_bf_gens(in_bf_tbl, debug=False):
    """
    Allocate fuel consumption for records that are in the BF table.

    Each record receives a capacity share (``frac_cap``): its ``capacity_mw``
    divided by ``capacity_mw_fuel_in_bf_tbl_group``. That share is applied to
    the boiler-fuel total (``fuel_consumed_mmbtu_bf_tbl``) and to the gf
    fuel-for-electricity total
    (``fuel_consumed_for_electricity_mmbtu_gf_tbl``), regardless of the
    generator's prime mover.

    Args:
        in_bf_tbl (pandas.DataFrame): records flagged as present in the BF table.
        debug (bool): forwarded to ``test_frac_cap_in_bf``.

    Returns:
        pandas.DataFrame: a new frame with ``frac_cap``,
        ``fuel_consumed_mmbtu`` and ``fuel_consumed_for_electricity_mmbtu`` set.
    """
    # share of the group's capacity that belongs to each record
    capacity_share = in_bf_tbl.capacity_mw / in_bf_tbl.capacity_mw_fuel_in_bf_tbl_group
    allocated = in_bf_tbl.assign(
        frac_cap=capacity_share,
        # fuel_consumed_mmbtu_bf * frac_cap
        fuel_consumed_mmbtu=in_bf_tbl.fuel_consumed_mmbtu_bf_tbl * capacity_share,
        fuel_consumed_for_electricity_mmbtu=(
            in_bf_tbl.fuel_consumed_for_electricity_mmbtu_gf_tbl * capacity_share
        ),
    )
    test_frac_cap_in_bf(allocated, debug=debug)
    return allocated

def allocate_fuel_for_non_bf_gens(not_in_bf_tbl, debug=False):
    """
    Allocate fuel consumption for records that are not in the BF table.

    The fuel left for these records is the gf fuel-group total minus whatever
    the bf table reports for the group (a missing bf value counts as zero). It
    is split by each record's capacity share, and the same share is applied to
    the gf fuel-for-electricity total.

    Args:
        not_in_bf_tbl (pandas.DataFrame): the records of the
            ``prep_alloction_fraction()`` result that are not flagged as
            present in the BF table.
        debug (bool): forwarded to ``test_not_bf_fuel_totals``.

    Returns:
        pandas.DataFrame: a new frame with ``fuel_consumed_mmbtu_not_in_bf``,
        ``frac_cap``, ``fuel_consumed_mmbtu`` and
        ``fuel_consumed_for_electricity_mmbtu`` set.
    """
    # fuel that should go to the not-reporting-to-bf records: gf group total
    # minus bf group total, with bf nulls filled so that plants without any bf
    # data keep their full gf amount
    remaining_fuel = (
        not_in_bf_tbl.fuel_consumed_mmbtu_gf_tbl_fuel
        - not_in_bf_tbl.fuel_consumed_mmbtu_bf_tbl.fillna(0)
    )
    # portion of the group's capacity held by each record
    capacity_share = (
        not_in_bf_tbl.capacity_mw / not_in_bf_tbl.capacity_mw_fuel_in_bf_tbl_group
    )
    allocated = not_in_bf_tbl.assign(
        fuel_consumed_mmbtu_not_in_bf=remaining_fuel,
        frac_cap=capacity_share,
        # frac_cap * fuel_consumed_mmbtu_not_in_bf
        fuel_consumed_mmbtu=capacity_share * remaining_fuel,
        fuel_consumed_for_electricity_mmbtu=(
            capacity_share
            * not_in_bf_tbl.fuel_consumed_for_electricity_mmbtu_gf_tbl_fuel
        ),
    )

    test_frac_cap_not_in_bf(allocated)
    test_not_bf_fuel_totals(allocated, debug=debug)
    return allocated


def test_frac_cap_not_in_bf(not_in_bf_tbl):
    """
    Test the fraction of the capacity within each unit/fuel group.

    Args:
        not_in_bf_tbl (pandas.DataFrame): records with a ``frac_cap`` column
            and the ``IDX_U_ESC`` columns.

    Raises:
        AssertionError: if the `frac_cap` column sums to any number other
            than 1 for each `IDX_U_ESC` group. Groups whose sum is null are
            not counted as failures.
    """
    # obtained locally so the function does not rely on a module-level `logger`
    logger = logging.getLogger(__name__)
    # NOTE(review): groupby drops null keys by default here, so records with a
    # null unit_id_pudl are not checked at all, whereas the sibling checks pass
    # dropna=False - confirm this is intended.
    frac_cap_test = not_in_bf_tbl.groupby(IDX_U_ESC)[["frac_cap"]].sum(min_count=1)
    non_one_frac_cap = frac_cap_test[
        frac_cap_test.frac_cap.notnull() & ~np.isclose(frac_cap_test.frac_cap, 1)
    ]
    if not non_one_frac_cap.empty:
        raise AssertionError(
            "The `frac_cap` column should always add up to one for each "
            f"group of: {IDX_U_ESC}. We got {len(non_one_frac_cap)}."
        )
    logger.info(
        "Yay you passed the test for the `frac_cap` column for the not_in_bf "
        "records.."
    )


def test_not_bf_fuel_totals(not_in_bf_tbl, debug=False):
    """
    Test the allocated fuel consumption for the records not in the BF table.

    The allocated ``fuel_consumed_mmbtu`` is summed within each ``IDX_U_ESC``
    group (null keys form their own groups) and compared with the record's
    ``fuel_consumed_mmbtu_not_in_bf``. Records whose
    ``fuel_consumed_mmbtu_not_in_bf`` is null are ignored.

    Side effect: the group totals are stored on ``not_in_bf_tbl`` in place as
    the ``fuel_consumed_mmbtu_gf_tbl_test`` column and are left there.

    Args:
        not_in_bf_tbl (pandas.DataFrame): allocated records not in the BF table.
        debug (bool): if True, a mismatch only emits a warning.

    Raises:
        AssertionError: If any checked record's group total is not close to
            its ``fuel_consumed_mmbtu_not_in_bf`` and ``debug`` is False.
    """
    # obtained locally so the function does not rely on a module-level `logger`
    logger = logging.getLogger(__name__)
    # "sum" is passed by name: handing the builtin `sum` to transform together
    # with `min_count` is deprecated in recent pandas versions.
    not_in_bf_tbl["fuel_consumed_mmbtu_gf_tbl_test"] = not_in_bf_tbl.groupby(
        IDX_U_ESC, dropna=False
    )[["fuel_consumed_mmbtu"]].transform("sum", min_count=1)
    fuel_test = not_in_bf_tbl[
        (
            ~np.isclose(
                not_in_bf_tbl.fuel_consumed_mmbtu_not_in_bf,
                not_in_bf_tbl.fuel_consumed_mmbtu_gf_tbl_test,
            )
        )
        & (not_in_bf_tbl.fuel_consumed_mmbtu_not_in_bf.notnull())
    ]
    if not fuel_test.empty:
        message = (
            "Oh dear, oh dear... the allocation of fuel consumption is "
            # trailing space added: the message used to read "fueldidn't"
            f"bro0oken. We got {len(fuel_test)} generator records who's fuel "
            "didn't add up to the total fuel in the gf table after subtracting "
            "the fuel reported in the bf table."
        )
        if debug:
            warnings.warn(message)
        else:
            raise AssertionError(message)
    else:
        logger.info(
            "Wahoo! You passed the test for fuel allocation for the not_in_bf "
            "records."
        )

def create_monthly_gens_records(gens):
    """
    Expand the gens table so every record appears once per month of the year.

    Net generation is allocated at the monthly level, so the gens table needs
    monthly records. The input rows are kept as the first month and eleven
    further copies follow; each copy's ``report_date`` is one calendar month
    later than the previous copy's. Index labels are carried over unchanged,
    so they repeat in the result. The input frame is not modified.
    """
    monthly_frames = [gens]
    shifted = gens.copy()
    for _ in range(11):
        # step the previous copy forward by one more month
        shifted = shifted.assign(
            report_date=shifted['report_date'] + pd.DateOffset(months=1)
        )
        monthly_frames.append(shifted)

    return pd.concat(monthly_frames, axis=0)

def manually_fix_prime_movers(df):
    """
    Overwrite ``prime_mover_code`` for a hard-coded list of plants/generators.

    See https://github.com/catalyst-cooperative/pudl/issues/1585

    ``df`` is modified in place and returned. It needs the columns
    ``plant_id_eia``, ``generator_id`` and ``prime_mover_code``.
    """
    plant = df['plant_id_eia']
    generator = df['generator_id']

    # fix cogeneration issues: CT -> GT and CA -> ST at these plants
    for cogen_plant in (2465, 50150, 54268, 54410, 54262):
        at_plant = plant == cogen_plant
        df.loc[at_plant & (df['prime_mover_code'] == 'CT'), 'prime_mover_code'] = 'GT'
        df.loc[at_plant & (df['prime_mover_code'] == 'CA'), 'prime_mover_code'] = 'ST'

    # fix missing code
    # (plant_id_eia, generator_id or None for the whole plant, corrected code);
    # applied top to bottom, so a generator-level row refines an earlier
    # plant-wide row for the same plant
    overrides = [
        (50489, None, 'GT'),
        (50489, 'C3', 'ST'),
        (10884, None, 'GT'),
        (58946, None, 'IC'),
        (60610, None, 'OT'),
        (7854, None, 'IC'),
        (50628, 'GEN1', 'GT'),
        (50628, 'GEN2', 'ST'),
        (50628, 'GEN3', 'ST'),
        (52168, 'GEN2', 'ST'),
        (52168, 'GEN3', 'GT'),
        (52168, 'GEN4', 'GT'),
        (55096, 'GT', 'GT'),
        (55096, 'ST', 'ST'),
        (7887, '4', 'GT'),
        (6474, 'GT1', 'GT'),
        (6474, 'GT2', 'GT'),
    ]
    for plant_id, generator_id, corrected_code in overrides:
        selected = plant == plant_id
        if generator_id is not None:
            selected = selected & (generator == generator_id)
        df.loc[selected, 'prime_mover_code'] = corrected_code

    # plant 55088: every generator except ST1 becomes GT
    df.loc[(plant == 55088) & ~(generator == 'ST1'), 'prime_mover_code'] = 'GT'

    return df

def manually_update_energy_source_codes(df):
    """
    Manually update fuel source codes

    Sets ``energy_source_code`` to ``OG`` for a hard-coded list of
    plant/generator pairs. ``df`` is modified in place and returned.
    """
    # refinery with energy source = OTH
    targets = ((50626, 'GEN1'), (56139, 'NPCG'))
    for plant_id, generator_id in targets:
        is_target = (df['plant_id_eia'] == plant_id) & (df['generator_id'] == generator_id)
        df.loc[is_target, 'energy_source_code'] = 'OG'

    return df


In [26]:
# Alias for the PUDL allocation module, used by the extraction cell that follows.
# (Same assignment as in the earlier step-by-step section of this notebook.)
remove_this = pudl.analysis.allocate_net_gen

In [27]:
# Id columns for generators.
IDX_GENS = ["report_date", "plant_id_eia", "generator_id"]

# Id columns for plant, prime mover & fuel type records.
IDX_PM_ESC = ["report_date", "plant_id_eia", "energy_source_code", "prime_mover_code"]

# Id columns for plant & fuel type records.
IDX_ESC = ["report_date", "plant_id_eia", "energy_source_code"]

# Id columns for plant, fuel type & PUDL unit records.
IDX_U_ESC = ["report_date", "plant_id_eia", "energy_source_code", "unit_id_pudl"]

# gf keeps the id columns plus the generation_fuel_eia923 data columns that
# are being allocated
gf = (
    pudl_out.gf_eia923()
    .loc[:, IDX_PM_ESC + ["net_generation_mwh", "fuel_consumed_mmbtu","fuel_consumed_for_electricity_mmbtu"]]
    .pipe(remove_this.group_msw_codes, IDX_PM_ESC) #TODO: we may not want to group MSW codes since these have different emissions
)
gen = (
    pudl_out.gen_original_eia923().loc[:, IDX_GENS + ["net_generation_mwh"]]
    # removes 4 records with NaN generator_id as of pudl v0.5
    .dropna(subset=IDX_GENS)
)
# fetch the generators table once and reuse it for the two column-name filters
# (previously gens_eia860() was called three times in this statement)
gens = pudl_out.gens_eia860()
gens = gens.loc[
    :,
    IDX_GENS
    + [
        "prime_mover_code",
        "unit_id_pudl",
        "capacity_mw",
        "fuel_type_count",
        "operational_status",
        "retirement_date",
    ]
    + list(gens.filter(like="energy_source_code"))
    + list(gens.filter(like="startup_source_code")),
]

# add any startup energy source codes to the list of energy source codes
gens = remove_this.add_startup_energy_sources_to_esc_columns(gens)
# get a table of generator primary fuel that will be used later
gen_primary_fuel = gens.copy()[['plant_id_eia','generator_id','energy_source_code_1']]
# repeat each entry so there is one record for each month of the year
gens = remove_this.create_monthly_gens_records(gens)
# fix prime mover codes in gens so that they match the codes in the gf table
# (this uses the notebook-local manually_fix_prime_movers, not the module's)
gens = manually_fix_prime_movers(gens)
# the gen table is missing some generator ids. Let's fill this using the gens table, leaving a missing value for net generation
gen = gen.merge(gens[['plant_id_eia','generator_id','report_date']], how='outer', on=['plant_id_eia','generator_id','report_date'])
# boiler fuel; the join to boilers_entity_eia is disabled in this variant
bf = (
    pudl_out.bf_eia923()#.merge(pd.read_sql("boilers_entity_eia", pudl_out.pudl_engine), how='left', on=['plant_id_eia','boiler_id'])
    .loc[:, IDX_U_ESC + ["fuel_consumed_mmbtu"]]
    #.pipe(remove_bf_nulls) # NOTE: this was causing an issue with allocation
    .pipe(remove_this.group_msw_codes, IDX_U_ESC)
    )


### Associate Generator Tables

In [28]:
def stack_generators(
    gens, cat_col="energy_source_code_num", stacked_col="energy_source_code"
):
    """
    Reshape the generator table from wide to long on its energy source columns.

    Every column whose name contains ``energy_source_code`` is stacked, so each
    generator record is repeated once per stacked value.

    Args:
        gens (pandas.DataFrame): generators_eia860 table with cols: ``IDX_GENS``
            and all of the `energy_source_code` columns
        cat_col (string): name of the output column that holds the name of the
            source column each value came from
        stacked_col (string): name of the output column that holds the stacked
            values themselves

    Returns:
        pandas.DataFrame: ``gens`` without its energy source columns,
        outer-merged on ``IDX_GENS`` with the stacked ``cat_col`` /
        ``stacked_col`` pairs.

    """
    esc_cols = list(gens.filter(like="energy_source_code"))

    # long form of the energy source columns, keyed by the generator id columns
    stacked = gens.set_index(IDX_GENS)[esc_cols].stack(level=0)
    long_codes = pd.DataFrame(stacked).reset_index()
    # after reset_index the stacked column names land in "level_3" (the three
    # IDX_GENS levels come first) and the values in the column labelled 0
    long_codes = long_codes.rename(columns={"level_3": cat_col, 0: stacked_col})
    long_codes = pudl.helpers.apply_pudl_dtypes(long_codes, "eia")

    # the wide columns are dropped before merging so the data isn't duplicated
    wide_without_codes = gens.drop(columns=esc_cols)
    return pd.merge(wide_without_codes, long_codes, on=IDX_GENS, how="outer")

def _associate_unconnected_records(eia_generators_merged):
    """
    Associate unassociated gen_fuel table records on idx_pm.

    There are a subset of generation_fuel_eia923 records which do not
    merge onto the stacked generator table on ``IDX_PM_ESC``. These records
    generally don't match with the set of prime movers and fuel types in the
    stacked generator table. In this method, we associate those straggler,
    unconnected records by merging these records with the stacked generators on
    the prime mover only.

    Args:
        eia_generators_merged (pandas.DataFrame)

    """
    # we're associating on the plant/pm level... but we only want to associated
    # these unassocaited records w/ the primary fuel type from stack_generators
    # so we're going to merge on energy_source_code_num and
    idx_pm = [
        "plant_id_eia",
        "prime_mover_code",
        "energy_source_code_num",
        "report_date",
    ]
    # we're going to only associate these unconnected fuel records w/
    # the primary fuel so we don't have to deal w/ double counting
    connected_mask = eia_generators_merged.generator_id.notnull()
    eia_generators_connected = eia_generators_merged[connected_mask]
    eia_generators_unconnected = (
        eia_generators_merged[~connected_mask]
        .rename(columns={"energy_source_code": "energy_source_unconnected"})
        .assign(energy_source_code_num="energy_source_code_1")
        .groupby(by=idx_pm)
        .sum(min_count=1)
        .reset_index()
    )
    eia_generators = (
        pd.merge(
            eia_generators_connected,
            eia_generators_unconnected[
                idx_pm + ["net_generation_mwh_gf_tbl", "fuel_consumed_mmbtu_gf_tbl","fuel_consumed_for_electricity_mmbtu_gf_tbl"]
            ],
            on=idx_pm,
            suffixes=("", "_unconnected"),
            how="left",
        )
        .assign(
            # we want the main and the unconnected net gen to be added together
            # but sometimes there is no main net gen and sometimes there is no
            # unconnected net gen
            net_generation_mwh_gf_tbl=lambda x: np.where(
                x.net_generation_mwh_gf_tbl.notnull()
                | x.net_generation_mwh_gf_tbl_unconnected.notnull(),
                x.net_generation_mwh_gf_tbl.fillna(0)
                + x.net_generation_mwh_gf_tbl_unconnected.fillna(0),
                np.nan,
            ),
            fuel_consumed_mmbtu_gf_tbl=lambda x: np.where(
                x.fuel_consumed_mmbtu_gf_tbl.notnull()
                | x.fuel_consumed_mmbtu_gf_tbl_unconnected.notnull(),
                x.fuel_consumed_mmbtu_gf_tbl.fillna(0)
                + x.fuel_consumed_mmbtu_gf_tbl_unconnected.fillna(0),
                np.nan,
            ),
            fuel_consumed_for_electricity_mmbtu_gf_tbl=lambda x: np.where(
                x.fuel_consumed_for_electricity_mmbtu_gf_tbl.notnull()
                | x.fuel_consumed_for_electricity_mmbtu_gf_tbl_unconnected.notnull(),
                x.fuel_consumed_for_electricity_mmbtu_gf_tbl.fillna(0)
                + x.fuel_consumed_for_electricity_mmbtu_gf_tbl_unconnected.fillna(0),
                np.nan,
            ),
        )  # we no longer need these _unconnected columns
        .drop(
            columns=[
                "net_generation_mwh_gf_tbl_unconnected",
                "fuel_consumed_mmbtu_gf_tbl_unconnected",
                "fuel_consumed_for_electricity_mmbtu_gf_tbl_unconnected",
            ]
        )
    )
    return eia_generators

In [29]:
# long-form generator table: one row per generator record and energy source code
stack_gens = stack_generators(
    gens, cat_col="energy_source_code_num", stacked_col="energy_source_code"
)

# boiler fuel totals per IDX_U_ESC group
bf_summed = (
    bf.groupby(by=IDX_U_ESC, dropna=False)
    .sum(min_count=1)
    .add_suffix("_bf_tbl")
    .reset_index()
    .pipe(pudl.helpers.convert_cols_dtypes, "eia")
)
# generation_fuel totals per IDX_PM_ESC group.
# The value columns are selected *before* aggregating so that only these
# columns are summed; any other (possibly non-numeric) column of gf is never
# touched by the aggregation.
gf_pm_fuel_summed = (
    gf.groupby(by=IDX_PM_ESC)[
        [
            "fuel_consumed_mmbtu",
            "fuel_consumed_for_electricity_mmbtu",
            "net_generation_mwh",
        ]
    ]
    .sum(min_count=1)
    .add_suffix("_gf_tbl")
    .reset_index()
)
# generation_fuel fuel totals per IDX_ESC group
gf_fuel_summed = (
    gf.groupby(by=IDX_ESC)[
        ["fuel_consumed_mmbtu", "fuel_consumed_for_electricity_mmbtu"]
    ]
    .sum(min_count=1)
    .add_suffix("_gf_tbl_fuel")
    .reset_index()
)

# attach the generation table and the three sets of totals to the stacked
# generators; validate="m:1" makes pandas raise if a totals table has
# duplicate keys
gen_assoc = (
    pd.merge(stack_gens, gen, on=IDX_GENS, how="outer")
    .pipe(remove_this.remove_inactive_generators)
    .rename(columns={"net_generation_mwh": "net_generation_mwh_g_tbl"})
    .merge(gf_pm_fuel_summed, on=IDX_PM_ESC, how="left", validate="m:1")
    .merge(bf_summed, on=IDX_U_ESC, how="left", validate="m:1")
    .merge(
        gf_fuel_summed,
        on=IDX_ESC,
        how="left",
        validate="m:1",
    )
)


# calculate the total capacity (and generation-table net generation) in every
# fuel group, then fold in the generation_fuel records that matched no generator
gen_assoc = (
    pd.merge(
        gen_assoc,
        gen_assoc.groupby(by=IDX_ESC)[["capacity_mw", "net_generation_mwh_g_tbl"]]
        .sum(min_count=1)
        .add_suffix("_fuel")
        .reset_index(),
        on=IDX_ESC,
    )
    .pipe(pudl.helpers.apply_pudl_dtypes, "eia")
    .pipe(_associate_unconnected_records)
    .pipe(manually_update_energy_source_codes)
)

### Prep allocation fraction

In [30]:
# flag whether each generator record has a value from the generation table
# (in_g_tbl) and from the boiler fuel table (in_bf_tbl). These flags decide
# later on which ratio is used to allocate net generation and fuel.
gen_assoc = gen_assoc.assign(
    in_g_tbl=lambda x: x.net_generation_mwh_g_tbl.notnull(),
    in_bf_tbl=lambda x: x.fuel_consumed_mmbtu_bf_tbl.notnull(),
)

gens_gb = gen_assoc.groupby(by=IDX_PM_ESC, dropna=False)
# get the total values for the merge group.
# A single gens_gb.agg({...}) call could produce the same result, but it is
# much slower, so we group several times and merge the pieces together.
gen_pm_fuel = (
    gen_assoc.merge(  # flag if all generators in the group are in the generation tbl
        gens_gb[["in_g_tbl"]].all().reset_index(),
        on=IDX_PM_ESC,
        suffixes=("", "_all"),
    )
    .merge(  # flag if some generators in the group are in the generation tbl
        gens_gb[["in_g_tbl"]].any().reset_index(),
        on=IDX_PM_ESC,
        suffixes=("", "_any"),
    )
    .merge(  # flag if all generators in the group are in the boiler fuel tbl
        gens_gb[["in_bf_tbl"]].all().reset_index(),
        on=IDX_PM_ESC,
        suffixes=("", "_all"),
    )
    .merge(  # flag if some generators in the group are in the boiler fuel tbl
        gens_gb[["in_bf_tbl"]].any().reset_index(),
        on=IDX_PM_ESC,
        suffixes=("", "_any"),
    )
    # Net generation and capacity are both proxies that can be used
    # to allocate the generation which only shows up in generation_fuel.
    # Fuel consumption from the bf table can be used as a proxy to allocate
    # fuel consumption that only shows up in generation_fuel.
    # Sum them up across the whole plant-prime-fuel group so we can tell
    # what fraction of the group total each generator is.
    .merge(
        (
            gens_gb[
                [
                    "net_generation_mwh_g_tbl",
                    "fuel_consumed_mmbtu_bf_tbl",
                    "capacity_mw",
                ]
            ]
            .sum(min_count=1)
            .add_suffix("_pm_fuel")
            .reset_index()
        ),
        on=IDX_PM_ESC,
    )
    .assign(
        # fill in the missing generation with zeros (this keeps the fraction
        # calculations in the allocation cells consistent);
        # do the same with missing fuel consumption
        net_generation_mwh_g_tbl=lambda x: x.net_generation_mwh_g_tbl.fillna(0),
        fuel_consumed_mmbtu_bf_tbl=lambda x: x.fuel_consumed_mmbtu_bf_tbl.fillna(0),
    )
)
# Add a column that indicates how much capacity comes from generators that
# report in the generation table, and how much comes only from generators
# that show up in the generation_fuel table.
gen_pm_fuel = pd.merge(
    gen_pm_fuel,
    gen_pm_fuel.groupby(by=IDX_PM_ESC + ["in_g_tbl"], dropna=False)[["capacity_mw"]]
    .sum(min_count=1)
    .add_suffix("_in_g_tbl_group")
    .reset_index(),
    on=IDX_PM_ESC + ["in_g_tbl"],
)
# Same idea for the boiler fuel table, but grouped by fuel, in_bf_tbl flag and
# unit. The aggregation is named with the string "sum" rather than the builtin
# `sum`: pandas deprecated translating builtin callables into its own
# aggregations, and the builtin does not accept `min_count`.
gen_pm_fuel["capacity_mw_fuel_in_bf_tbl_group"] = gen_pm_fuel.groupby(
    IDX_ESC + ["in_bf_tbl", "unit_id_pudl"], dropna=False
)[["capacity_mw"]].transform("sum", min_count=1)

In [99]:
# Sum the three proxy columns over the gens_gb groups so the group totals can
# be inspected on their own.
test = gens_gb[["net_generation_mwh_g_tbl","fuel_consumed_mmbtu_bf_tbl", "capacity_mw"]].sum(min_count=1).reset_index()
# spot-check plant 3 for February 2020
test[(test['plant_id_eia'] == 3) & (test['report_date'] == "2020-02-01")]

Unnamed: 0,report_date,plant_id_eia,energy_source_code,prime_mover_code,net_generation_mwh_g_tbl,fuel_consumed_mmbtu_bf_tbl,capacity_mw
13786,2020-02-01,3,BIT,ST,-5197.0,1344.0,1192.5
13787,2020-02-01,3,NG,CA,246804.0,5025753.0,390.4
13788,2020-02-01,3,NG,CT,477282.0,10051510.0,680.4
13789,2020-02-01,3,NG,ST,1653.0,28612.57,306.2


In [100]:
# spot-check the prepared allocation inputs for plant 3, February 2020
gen_pm_fuel[(gen_pm_fuel['plant_id_eia'] == 3) & (gen_pm_fuel['report_date'] == "2020-02-01")]

Unnamed: 0,report_date,plant_id_eia,generator_id,prime_mover_code,unit_id_pudl,capacity_mw,fuel_type_count,operational_status,retirement_date,energy_source_code_num,energy_source_code,net_generation_mwh_g_tbl,fuel_consumed_mmbtu_gf_tbl,fuel_consumed_for_electricity_mmbtu_gf_tbl,net_generation_mwh_gf_tbl,fuel_consumed_mmbtu_bf_tbl,fuel_consumed_mmbtu_gf_tbl_fuel,fuel_consumed_for_electricity_mmbtu_gf_tbl_fuel,capacity_mw_fuel,net_generation_mwh_g_tbl_fuel,in_g_tbl,in_bf_tbl,in_g_tbl_all,in_g_tbl_any,in_bf_tbl_all,in_bf_tbl_any,net_generation_mwh_g_tbl_pm_fuel,fuel_consumed_mmbtu_bf_tbl_pm_fuel,capacity_mw_pm_fuel,capacity_mw_in_g_tbl_group,capacity_mw_fuel_in_bf_tbl_group
28602,2020-02-01,3,1,ST,1,153.1,2,existing,NaT,energy_source_code_1,NG,1837.0,29922.0,29922.0,-3391.657,28612.575,5055675.0,5055675.0,1377.0,725739.0,True,True,True,True,True,True,1653.0,28612.57,306.2,306.2,153.1
28603,2020-02-01,3,2,ST,2,153.1,2,existing,NaT,energy_source_code_1,NG,-184.0,29922.0,29922.0,-3391.657,0.0,5055675.0,5055675.0,1377.0,725739.0,True,True,True,True,True,True,1653.0,28612.57,306.2,306.2,153.1
28604,2020-02-01,3,A1CT,CT,6,170.1,2,existing,NaT,energy_source_code_1,NG,118310.0,4939791.0,4939791.0,477282.0,2416067.43,5055675.0,5055675.0,1377.0,725739.0,True,True,True,True,True,True,477282.0,10051510.0,680.4,680.4,535.4
28605,2020-02-01,3,A1CT2,CT,6,170.1,2,existing,NaT,energy_source_code_1,NG,108879.0,4939791.0,4939791.0,477282.0,2416067.43,5055675.0,5055675.0,1377.0,725739.0,True,True,True,True,True,True,477282.0,10051510.0,680.4,680.4,535.4
28606,2020-02-01,3,A2C1,CT,7,170.1,2,existing,NaT,energy_source_code_1,NG,124713.0,4939791.0,4939791.0,477282.0,2609685.785,5055675.0,5055675.0,1377.0,725739.0,True,True,True,True,True,True,477282.0,10051510.0,680.4,680.4,535.4
28607,2020-02-01,3,A2C2,CT,7,170.1,2,existing,NaT,energy_source_code_1,NG,125380.0,4939791.0,4939791.0,477282.0,2609685.785,5055675.0,5055675.0,1377.0,725739.0,True,True,True,True,True,True,477282.0,10051510.0,680.4,680.4,535.4
28608,2020-02-01,3,A1ST,CA,6,195.2,2,existing,NaT,energy_source_code_1,NG,119322.0,85962.0,85962.0,246804.0,2416067.43,5055675.0,5055675.0,1377.0,725739.0,True,True,True,True,True,True,246804.0,5025753.0,390.4,390.4,535.4
28609,2020-02-01,3,A2ST,CA,7,195.2,2,existing,NaT,energy_source_code_1,NG,127482.0,85962.0,85962.0,246804.0,2609685.785,5055675.0,5055675.0,1377.0,725739.0,True,True,True,True,True,True,246804.0,5025753.0,390.4,390.4,535.4
28610,2020-02-01,3,4,ST,4,403.7,2,existing,NaT,energy_source_code_1,BIT,-601.0,1344.0,1344.0,-152.343,0.0,1344.0,1344.0,1192.5,-5197.0,True,True,True,True,True,True,-5197.0,1344.0,1192.5,1192.5,403.7
28611,2020-02-01,3,5,ST,5,788.8,2,existing,NaT,energy_source_code_1,BIT,-4596.0,1344.0,1344.0,-152.343,1344.0,1344.0,1344.0,1192.5,-5197.0,True,True,True,True,True,True,-5197.0,1344.0,1192.5,1192.5,788.8


## Net Gen Allocation

In [32]:
# break out the table into three groups of records, depending on whether the
# generators of a plant / prime mover / fuel group report in the generation
# table: all of them, only some of them, or none of them.
all_gen = gen_pm_fuel.loc[gen_pm_fuel.in_g_tbl_all]
some_gen = gen_pm_fuel.loc[gen_pm_fuel.in_g_tbl_any & ~gen_pm_fuel.in_g_tbl_all]
gf_only = gen_pm_fuel.loc[~gen_pm_fuel.in_g_tbl_any]


# In the case where we have all of the generation from the generation
# table, we still allocate, because the generation reported in these two
# tables don't always match perfectly
all_gen = all_gen.assign(
    frac_net_gen=lambda x: x.net_generation_mwh_g_tbl
    / x.net_generation_mwh_g_tbl_pm_fuel
)
# if numerator and denominator are both zero, replace the resulting nan with 1
all_gen.loc[(all_gen.net_generation_mwh_g_tbl == 0) & (all_gen.net_generation_mwh_g_tbl_pm_fuel == 0), 'frac_net_gen'] = 1
all_gen = all_gen.assign(frac=lambda x: x.frac_net_gen)
# _ = _test_frac(all_gen)

# a brief explanation of the equations below
# input definitions:
#   ng == net generation from the generation table (by generator)
#   ngf == net generation from the generation fuel table (summed by PM/Fuel)
#   ngt == total net generation from the generation table (summed by PM/Fuel)
#
# y = ngt / ngf (fraction of generation reporting in the generation table)
# z = ng / ngt (fraction of generation from generation table by generator)
# g = y * z  (fraction of generation reporting in generation table by generator - frac_gen)

some_gen = some_gen.assign(
    # fraction of the generation that should go to the generators that
    # report in the generation table
    frac_from_g_tbl=lambda x: x.net_generation_mwh_g_tbl_pm_fuel
    / x.net_generation_mwh_gf_tbl,
    # for records within these mixed groups that do have net gen in the
    # generation table: the generator's share of the generation-table total
    frac_net_gen=lambda x: x.net_generation_mwh_g_tbl
    / x.net_generation_mwh_g_tbl_pm_fuel)

# if numerator and denominator are both zero, replace the resulting nan with 1
some_gen.loc[(some_gen.net_generation_mwh_g_tbl_pm_fuel == 0) & (some_gen.net_generation_mwh_gf_tbl == 0), 'frac_from_g_tbl'] = 1
some_gen.loc[(some_gen.net_generation_mwh_g_tbl == 0) & (some_gen.net_generation_mwh_g_tbl_pm_fuel == 0), 'frac_net_gen'] = 1

some_gen = some_gen.assign(
    frac_gen=lambda x: x.frac_net_gen * x.frac_from_g_tbl,
    # fraction of generation that does not show up in the generation table
    frac_missing_from_g_tbl=lambda x: 1 - x.frac_from_g_tbl,
    capacity_mw_missing_from_g_tbl=lambda x: np.where(x.in_g_tbl, 0, x.capacity_mw),
    frac_cap=lambda x: x.frac_missing_from_g_tbl
    * (x.capacity_mw_missing_from_g_tbl / x.capacity_mw_in_g_tbl_group),
    # the real deal
    # this could also be `x.frac_gen + x.frac_cap` because the frac_gen
    # should be 0 for any generator that does not have net gen in the g_tbl
    # and frac_cap should be 0 for any generator that has net gen in the
    # g_tbl.
    frac=lambda x: np.where(x.in_g_tbl, x.frac_gen, x.frac_cap),
)
# _ = _test_frac(some_gen)

# Calculate what fraction of the total capacity is associated with each of
# the generators in the grouping.
gf_only = gf_only.assign(
    frac_cap=lambda x: x.capacity_mw / x.capacity_mw_pm_fuel,
    frac=lambda x: x.frac_cap,
)
# _ = _test_frac(gf_only)

# squish all of these methods back together.
net_gen_alloc = pd.concat([all_gen, some_gen, gf_only])
# null out the inf's.
# NOTE: there is no column selector here, so every column of a row with an
# infinite `frac` is set to NaN (not only `frac`); the dropna(how="all") at
# the end of this cell is what can then discard those rows.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
net_gen_alloc.loc[abs(net_gen_alloc.frac) == np.inf] = np.nan
_ = remove_this._test_frac(net_gen_alloc)

# do the allocating-ing!
net_gen_alloc = (
    net_gen_alloc.assign(
        # we could x.net_generation_mwh_g_tbl.fillna here if we wanted to
        # take the net gen
        net_generation_mwh=lambda x: x.net_generation_mwh_gf_tbl
        * x.frac,
    )
    .pipe(remove_this.apply_pudl_dtypes, group="eia")
    .dropna(how="all")
)



In [36]:
# spot-check the allocated net generation for plant 3, February 2020
net_gen_alloc[(net_gen_alloc['plant_id_eia'] == 3) & (net_gen_alloc['report_date'] == "2020-02-01")]

Unnamed: 0,report_date,plant_id_eia,generator_id,prime_mover_code,unit_id_pudl,capacity_mw,fuel_type_count,operational_status,retirement_date,energy_source_code_num,energy_source_code,net_generation_mwh_g_tbl,fuel_consumed_mmbtu_gf_tbl,fuel_consumed_for_electricity_mmbtu_gf_tbl,net_generation_mwh_gf_tbl,fuel_consumed_mmbtu_bf_tbl,fuel_consumed_mmbtu_gf_tbl_fuel,fuel_consumed_for_electricity_mmbtu_gf_tbl_fuel,capacity_mw_fuel,net_generation_mwh_g_tbl_fuel,in_g_tbl,in_bf_tbl,in_g_tbl_all,in_g_tbl_any,in_bf_tbl_all,in_bf_tbl_any,net_generation_mwh_g_tbl_pm_fuel,fuel_consumed_mmbtu_bf_tbl_pm_fuel,capacity_mw_pm_fuel,capacity_mw_in_g_tbl_group,capacity_mw_fuel_in_bf_tbl_group,frac_net_gen,frac,frac_from_g_tbl,frac_gen,frac_missing_from_g_tbl,capacity_mw_missing_from_g_tbl,frac_cap,net_generation_mwh
29211,2020-02-01,3,1,ST,1,153.1,2.0,existing,NaT,energy_source_code_1,NG,1837.0,29922.0,29922.0,-3391.657,28612.575,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,-3544.0,29921.85,1498.7,1498.7,153.1,-0.518341,-0.518341,,,,,,1758.034399
29212,2020-02-01,3,2,ST,2,153.1,2.0,existing,NaT,energy_source_code_1,NG,-184.0,29922.0,29922.0,-3391.657,0.0,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,-3544.0,29921.85,1498.7,1498.7,153.1,0.051919,0.051919,,,,,,-176.090544
29213,2020-02-01,3,4,ST,4,403.7,2.0,existing,NaT,energy_source_code_2,NG,-601.0,29922.0,29922.0,-3391.657,0.0,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,-3544.0,29921.85,1498.7,1498.7,403.7,0.169582,0.169582,,,,,,-575.16531
29214,2020-02-01,3,5,ST,5,788.8,2.0,existing,NaT,energy_source_code_2,NG,-4596.0,29922.0,29922.0,-3391.657,1309.275,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,-3544.0,29921.85,1498.7,1498.7,788.8,1.29684,1.29684,,,,,,-4398.435545
29215,2020-02-01,3,A1CT,CT,6,170.1,2.0,existing,NaT,energy_source_code_1,NG,118310.0,4939791.0,4939791.0,477282.0,2416067.43,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,477282.0,10051510.0,680.4,680.4,535.4,0.247883,0.247883,,,,,,118310.0
29216,2020-02-01,3,A1CT2,CT,6,170.1,2.0,existing,NaT,energy_source_code_1,NG,108879.0,4939791.0,4939791.0,477282.0,2416067.43,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,477282.0,10051510.0,680.4,680.4,535.4,0.228123,0.228123,,,,,,108879.0
29217,2020-02-01,3,A2C1,CT,7,170.1,2.0,existing,NaT,energy_source_code_1,NG,124713.0,4939791.0,4939791.0,477282.0,2609685.785,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,477282.0,10051510.0,680.4,680.4,535.4,0.261298,0.261298,,,,,,124713.0
29218,2020-02-01,3,A2C2,CT,7,170.1,2.0,existing,NaT,energy_source_code_1,NG,125380.0,4939791.0,4939791.0,477282.0,2609685.785,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,477282.0,10051510.0,680.4,680.4,535.4,0.262696,0.262696,,,,,,125380.0
29219,2020-02-01,3,A1ST,CA,6,195.2,2.0,existing,NaT,energy_source_code_1,NG,119322.0,85962.0,85962.0,246804.0,2416067.43,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,246804.0,5025753.0,390.4,390.4,535.4,0.483469,0.483469,,,,,,119322.0
29220,2020-02-01,3,A2ST,CA,7,195.2,2.0,existing,NaT,energy_source_code_1,NG,127482.0,85962.0,85962.0,246804.0,2609685.785,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,246804.0,5025753.0,390.4,390.4,535.4,0.516531,0.516531,,,,,,127482.0


# Create a fuel allocation function that parallels what we do for generation

In [21]:
# break out the table into three groups of records, depending on whether the
# records of a plant / prime mover / fuel group have fuel in the boiler fuel
# (bf) table: all of them, only some of them, or none of them.
# NOTE: despite its name, `bf_only` holds the groups with NO boiler fuel data.
all_gen = gen_pm_fuel.loc[gen_pm_fuel.in_bf_tbl_all]
some_gen = gen_pm_fuel.loc[gen_pm_fuel.in_bf_tbl_any & ~gen_pm_fuel.in_bf_tbl_all]
bf_only = gen_pm_fuel.loc[~gen_pm_fuel.in_bf_tbl_any]


# In the case where we have all of the fuel from the bf
# table, we still allocate, because the fuel reported in these two
# tables don't always match perfectly
all_gen = all_gen.assign(
    frac_fuel=lambda x: x.fuel_consumed_mmbtu_bf_tbl
    / x.fuel_consumed_mmbtu_bf_tbl_pm_fuel
)
# if numerator and denominator are both zero, replace the resulting nan with 1
all_gen.loc[(all_gen.fuel_consumed_mmbtu_bf_tbl == 0) & (all_gen.fuel_consumed_mmbtu_bf_tbl_pm_fuel == 0), 'frac_fuel'] = 1
all_gen = all_gen.assign(frac=lambda x: x.frac_fuel)
# _ = _test_frac(all_gen)

# a brief explanation of the equations below
# input definitions:
#   fc == fuel consumed from the boiler fuel table (by generator record)
#   fcf == fuel consumed from the generation fuel table (summed by PM/Fuel)
#   fct == total fuel consumed from the boiler fuel table (summed by PM/Fuel)
#
# y = fct / fcf (fraction of fuel reporting in the boiler fuel table)
# z = fc / fct (fraction of boiler fuel table fuel by generator record)
# g = y * z  (fraction of fuel reporting in boiler fuel table by generator record - frac_bf)

some_gen = some_gen.assign(
    # fraction of the fuel that should go to the records that
    # report in the bf table
    frac_from_bf_tbl=lambda x: x.fuel_consumed_mmbtu_bf_tbl_pm_fuel
    / x.fuel_consumed_mmbtu_gf_tbl,
    # for records within these mixed groups that do have fuel in the
    # bf table: the record's share of the bf-table total
    frac_fuel=lambda x: x.fuel_consumed_mmbtu_bf_tbl
    / x.fuel_consumed_mmbtu_bf_tbl_pm_fuel)

# if numerator and denominator are both zero, replace the resulting nan with 1
some_gen.loc[(some_gen.fuel_consumed_mmbtu_bf_tbl_pm_fuel == 0) & (some_gen.fuel_consumed_mmbtu_gf_tbl == 0), 'frac_from_bf_tbl'] = 1
some_gen.loc[(some_gen.fuel_consumed_mmbtu_bf_tbl == 0) & (some_gen.fuel_consumed_mmbtu_bf_tbl_pm_fuel == 0), 'frac_fuel'] = 1

some_gen = some_gen.assign(
    frac_bf=lambda x: x.frac_fuel * x.frac_from_bf_tbl,
    # fraction of fuel that does not show up in the boiler fuel table
    frac_missing_from_bf_tbl=lambda x: 1 - x.frac_from_bf_tbl,
    capacity_mw_missing_from_bf_tbl=lambda x: np.where(x.in_bf_tbl, 0, x.capacity_mw),
    frac_cap=lambda x: x.frac_missing_from_bf_tbl
    * (x.capacity_mw_missing_from_bf_tbl / x.capacity_mw_fuel_in_bf_tbl_group),
    # the real deal
    # this could also be `x.frac_bf + x.frac_cap` because the frac_bf
    # should be 0 for any record that does not have fuel in the bf_tbl
    # and frac_cap should be 0 for any record that has fuel in the
    # bf_tbl.
    frac=lambda x: np.where(x.in_bf_tbl, x.frac_bf, x.frac_cap),
)
# _ = _test_frac(some_gen)

# Calculate what fraction of the total capacity is associated with each of
# the generators in the grouping.
bf_only = bf_only.assign(
    frac_cap=lambda x: x.capacity_mw / x.capacity_mw_pm_fuel,
    frac=lambda x: x.frac_cap,
)
# _ = _test_frac(bf_only)

# squish all of these methods back together.
fuel_alloc = pd.concat([all_gen, some_gen, bf_only])
# null out the inf's.
# NOTE: there is no column selector here, so every column of a row with an
# infinite `frac` is set to NaN (not only `frac`); the dropna(how="all") at
# the end of this cell is what can then discard those rows.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
fuel_alloc.loc[abs(fuel_alloc.frac) == np.inf] = np.nan
_ = remove_this._test_frac(fuel_alloc)

# do the allocating-ing!
fuel_alloc = (
    fuel_alloc.assign(
        # we could x.fuel_consumed_mmbtu_bf_tbl.fillna here if we wanted to
        # take the fuel reported in the bf table
        fuel_consumed_mmbtu=lambda x: x.fuel_consumed_mmbtu_gf_tbl
        * x.frac,
    )
    .pipe(pudl.helpers.apply_pudl_dtypes, group="eia")
    .dropna(how="all")
)





In [50]:
# spot-check the allocated fuel consumption for plant 3, February 2020
fuel_alloc[(fuel_alloc['plant_id_eia'] == 3) & (fuel_alloc['report_date'] == "2020-02-01")]

Unnamed: 0,report_date,plant_id_eia,generator_id,prime_mover_code,unit_id_pudl,capacity_mw,fuel_type_count,operational_status,retirement_date,energy_source_code_num,energy_source_code,net_generation_mwh_g_tbl,fuel_consumed_mmbtu_gf_tbl,fuel_consumed_for_electricity_mmbtu_gf_tbl,net_generation_mwh_gf_tbl,fuel_consumed_mmbtu_bf_tbl,fuel_consumed_mmbtu_gf_tbl_fuel,fuel_consumed_for_electricity_mmbtu_gf_tbl_fuel,capacity_mw_fuel,net_generation_mwh_g_tbl_fuel,in_g_tbl,in_bf_tbl,in_g_tbl_all,in_g_tbl_any,in_bf_tbl_all,in_bf_tbl_any,net_generation_mwh_g_tbl_pm_fuel,fuel_consumed_mmbtu_bf_tbl_pm_fuel,capacity_mw_pm_fuel,capacity_mw_in_g_tbl_group,capacity_mw_fuel_in_bf_tbl_group,frac_fuel,frac,frac_from_bf_tbl,frac_bf,frac_missing_from_bf_tbl,capacity_mw_missing_from_bf_tbl,frac_cap,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu
29211,2020-02-01,3,1,ST,1,153.1,2.0,existing,NaT,energy_source_code_1,NG,1837.0,29922.0,29922.0,-3391.657,28612.575,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,-3544.0,29921.85,1498.7,1498.7,153.1,0.956244,0.956244,,,,,,28612.72,28612.72
29212,2020-02-01,3,2,ST,2,153.1,2.0,existing,NaT,energy_source_code_1,NG,-184.0,29922.0,29922.0,-3391.657,0.0,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,-3544.0,29921.85,1498.7,1498.7,153.1,0.0,0.0,,,,,,0.0,0.0
29213,2020-02-01,3,4,ST,4,403.7,2.0,existing,NaT,energy_source_code_2,NG,-601.0,29922.0,29922.0,-3391.657,0.0,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,-3544.0,29921.85,1498.7,1498.7,403.7,0.0,0.0,,,,,,0.0,0.0
29214,2020-02-01,3,5,ST,5,788.8,2.0,existing,NaT,energy_source_code_2,NG,-4596.0,29922.0,29922.0,-3391.657,1309.275,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,-3544.0,29921.85,1498.7,1498.7,788.8,0.043756,0.043756,,,,,,1309.282,1309.282
29215,2020-02-01,3,A1CT,CT,6,170.1,2.0,existing,NaT,energy_source_code_1,NG,118310.0,4939791.0,4939791.0,477282.0,2416067.43,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,477282.0,10051510.0,680.4,680.4,535.4,0.240369,0.240369,,,,,,1187371.0,1187371.0
29216,2020-02-01,3,A1CT2,CT,6,170.1,2.0,existing,NaT,energy_source_code_1,NG,108879.0,4939791.0,4939791.0,477282.0,2416067.43,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,477282.0,10051510.0,680.4,680.4,535.4,0.240369,0.240369,,,,,,1187371.0,1187371.0
29217,2020-02-01,3,A2C1,CT,7,170.1,2.0,existing,NaT,energy_source_code_1,NG,124713.0,4939791.0,4939791.0,477282.0,2609685.785,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,477282.0,10051510.0,680.4,680.4,535.4,0.259631,0.259631,,,,,,1282524.0,1282524.0
29218,2020-02-01,3,A2C2,CT,7,170.1,2.0,existing,NaT,energy_source_code_1,NG,125380.0,4939791.0,4939791.0,477282.0,2609685.785,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,477282.0,10051510.0,680.4,680.4,535.4,0.259631,0.259631,,,,,,1282524.0,1282524.0
29219,2020-02-01,3,A1ST,CA,6,195.2,2.0,existing,NaT,energy_source_code_1,NG,119322.0,85962.0,85962.0,246804.0,2416067.43,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,246804.0,5025753.0,390.4,390.4,535.4,0.480737,0.480737,,,,,,41325.15,41325.15
29220,2020-02-01,3,A2ST,CA,7,195.2,2.0,existing,NaT,energy_source_code_1,NG,127482.0,85962.0,85962.0,246804.0,2609685.785,5055675.0,5055675.0,2569.5,720542.0,True,True,True,True,True,True,246804.0,5025753.0,390.4,390.4,535.4,0.519263,0.519263,,,,,,44636.85,44636.85


In [104]:
# look at the boiler fuel records for plant 3 in December 2020
bf[(bf['plant_id_eia'] == 3) & (bf['report_date'] == "2020-12-01")]

Unnamed: 0,report_date,plant_id_eia,energy_source_code,unit_id_pudl,fuel_consumed_mmbtu
98230,2020-12-01,3,NG,1,0.0
98231,2020-12-01,3,NG,2,0.0
98232,2020-12-01,3,BIT,4,0.0
98233,2020-12-01,3,NG,4,0.0
98234,2020-12-01,3,SUB,4,0.0
98235,2020-12-01,3,BIT,5,1444479.536
98236,2020-12-01,3,NG,5,50487.118
98237,2020-12-01,3,SUB,5,0.0
98238,2020-12-01,3,NG,6,8930.277
98239,2020-12-01,3,NG,6,7775.982


## calculate emissions (after fuel alloc)

In [22]:
# display the allocated fuel table
fuel_alloc

Unnamed: 0,report_date,plant_id_eia,generator_id,prime_mover_code,unit_id_pudl,capacity_mw,fuel_type_count,operational_status,retirement_date,energy_source_code_num,energy_source_code,net_generation_mwh_g_tbl,fuel_consumed_mmbtu_gf_tbl,fuel_consumed_for_electricity_mmbtu_gf_tbl,net_generation_mwh_gf_tbl,fuel_consumed_mmbtu_bf_tbl,fuel_consumed_mmbtu_gf_tbl_fuel,fuel_consumed_for_electricity_mmbtu_gf_tbl_fuel,capacity_mw_fuel,net_generation_mwh_g_tbl_fuel,in_g_tbl,in_bf_tbl,in_g_tbl_all,in_g_tbl_any,in_bf_tbl_all,in_bf_tbl_any,net_generation_mwh_g_tbl_pm_fuel,fuel_consumed_mmbtu_bf_tbl_pm_fuel,capacity_mw_pm_fuel,capacity_mw_in_g_tbl_group,capacity_mw_fuel_in_bf_tbl_group,frac_fuel,frac,frac_from_bf_tbl,frac_bf,frac_missing_from_bf_tbl,capacity_mw_missing_from_bf_tbl,frac_cap,fuel_consumed_mmbtu
22,2020-01-01,7,1,ST,1,69.0,1.0,existing,NaT,energy_source_code_1,NG,14810.0,191210.0,168842.0,14475.000,191209.650,191210.0,168842.0,138.0,14475.0,True,True,True,True,True,True,14475.0,191209.650,138.0,138.0,69.0,1.000000,1.000000,,,,,,191210.000000
23,2020-01-01,7,2,ST,2,69.0,1.0,existing,NaT,energy_source_code_1,NG,-335.0,191210.0,168842.0,14475.000,0.000,191210.0,168842.0,138.0,14475.0,True,True,True,True,True,True,14475.0,191209.650,138.0,138.0,69.0,0.000000,0.000000,,,,,,0.000000
26,2020-01-01,10,1,ST,1,299.2,1.0,existing,NaT,energy_source_code_1,NG,37157.0,916776.0,916776.0,82949.000,413099.799,929690.0,929690.0,1288.4,82949.0,True,True,True,True,True,True,82949.0,916776.137,568.4,568.4,299.2,0.450601,0.450601,,,,,,413099.737268
27,2020-01-01,10,2,ST,2,269.2,1.0,existing,NaT,energy_source_code_1,NG,45792.0,916776.0,916776.0,82949.000,503676.338,929690.0,929690.0,1288.4,82949.0,True,True,True,True,True,True,82949.0,916776.137,568.4,568.4,269.2,0.549399,0.549399,,,,,,503676.262732
81,2020-01-01,26,1,ST,1,272.0,3.0,existing,NaT,energy_source_code_1,NG,-779.0,76584.0,76584.0,7459.711,0.000,76584.0,76584.0,2012.8,403475.0,True,True,True,True,True,True,403475.0,76583.590,2012.8,2012.8,272.0,0.000000,0.000000,,,,,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350143,2020-12-01,64816,GEN1,IC,,0.4,1.0,existing,NaT,energy_source_code_1,NG,0.0,,,,0.000,,,1.2,,False,False,False,False,False,False,,,1.2,1.2,1.2,,0.333333,,,,,0.333333,
350144,2020-12-01,64816,GEN2,IC,,0.4,1.0,existing,NaT,energy_source_code_1,NG,0.0,,,,0.000,,,1.2,,False,False,False,False,False,False,,,1.2,1.2,1.2,,0.333333,,,,,0.333333,
350145,2020-12-01,64816,GEN3,IC,,0.4,1.0,existing,NaT,energy_source_code_1,NG,0.0,,,,0.000,,,1.2,,False,False,False,False,False,False,,,1.2,1.2,1.2,,0.333333,,,,,0.333333,
350146,2020-12-01,64836,CATAL,PV,,1.0,1.0,existing,NaT,energy_source_code_1,SUN,0.0,,,,0.000,,,1.0,,False,False,False,False,False,False,,,1.0,1.0,1.0,,1.000000,,,,,1.000000,


# Testing local functions

In [None]:
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

# Useful high-level external modules.
import numpy as np
import pandas as pd


from src.distribute_eia923 import *
from src.distribute_eia923 import _associate_unconnected_records, _associate_energy_source_only


year = 2020

In [None]:
# preview the generation_fuel_eia923 table for the selected year
load_data.load_pudl_table("generation_fuel_eia923", year=year)

In [None]:
pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)


# extract all of the tables early in the process and select
# only the columns we need. this is for speed and clarity.

# gf contains the more complete generation and fuel data at the plant prime mover level
gf = load_data.load_pudl_table("generation_fuel_eia923", year=year).loc[
    :, IDX_PM_FUEL + ["net_generation_mwh", "fuel_consumed_mmbtu"]
].pipe(apply_dtype)
# load the nuclear gf data
gf_nuc = load_data.load_pudl_table("generation_fuel_nuclear_eia923", year=year).loc[
    :, IDX_PM_FUEL + ["net_generation_mwh", "fuel_consumed_mmbtu"]
].pipe(apply_dtype)
# concat the nuclear data with the main gf dataframe
gf = pd.concat([gf,gf_nuc], axis=0)
# remove non-grid connected plants
gf = data_cleaning.remove_non_grid_connected_plants(gf)

# gen contains more granular generation data at the generator level for a subset of generators
gen = (
    load_data.load_pudl_table("generation_eia923", year=year).loc[:, IDX_GENS + ["net_generation_mwh"]]
    # removes 4 records with NaN generator_id as of pudl v0.5
    .dropna(subset=IDX_GENS)
).pipe(apply_dtype)
# remove non-grid connected plants
gen = data_cleaning.remove_non_grid_connected_plants(gen)

# gens contains a complete list of all generators.
# Load the table once and reuse it both for the data and for discovering the
# energy_source_code columns, instead of reading it from disk twice.
gens_raw = load_data.load_pudl_table("generators_eia860", year=year)
gens = gens_raw.loc[
    :,
    IDX_GENS
    + [
        "capacity_mw",
        "operational_status",
        "retirement_date",
    ]
    + list(gens_raw.filter(like="energy_source_code")),
]
# remove non-grid connected plants
gens = data_cleaning.remove_non_grid_connected_plants(gens)
# keep each generator's primary energy source code for later
# (select the three columns first so only they are copied)
gen_primary_fuel = gens[['plant_id_eia','generator_id','energy_source_code_1']].copy()
# add the prime mover code to the gens df from generators entity
gens = gens.merge(load_data.load_pudl_table("generators_entity_eia").loc[:,["plant_id_eia", "generator_id","prime_mover_code"]],
                    how='left', 
                    on=["plant_id_eia", "generator_id"]).pipe(apply_dtype)
# add records for each month of the year
gens = data_cleaning.create_monthly_gens_records(gens, year)
# remove retired generator months
#gens = remove_retired_generators(gens)

# the gen table is missing some generator ids. Let's fill this using the gens table, leaving a missing value for net generation
gen = gen.merge(gens[['plant_id_eia','generator_id','report_date']], how='outer', on=['plant_id_eia','generator_id','report_date'])

In [None]:
# boiler-generator associations for the analysis year; uses the notebook-level
# `year` (like the other loaders) instead of a hard-coded literal
bga = load_data.load_pudl_table("boiler_generator_assn_eia860", year=year)

In [None]:
# prime mover code of each generator, taken from the generators entity table
gen_entity = load_data.load_pudl_table("generators_entity_eia")[
    ["plant_id_eia", "generator_id", "prime_mover_code"]
]

In [None]:
# inspect the generator entity records of plant 50410
gen_entity.loc[gen_entity['plant_id_eia'].eq(50410)]

In [None]:
# inspect the generation & fuel records of plant 50410
gf.loc[gf['plant_id_eia'].eq(50410)]

In [None]:
# inspect the generator-level generation records of plant 50410
gen.loc[gen['plant_id_eia'].eq(50410)]

In [None]:
# stack the energy_source_code_* columns of gens into a single column
# (presumably one row per generator and energy source - confirm in stack_generators)
stack_gens = stack_generators(
    gens,
    stacked_col="energy_source_code",
    cat_col="energy_source_code_num",
)

# gf totals for each plant / prime mover / energy source / month
gf_totals = gf.groupby(by=IDX_PM_FUEL, as_index=False).sum(min_count=1)

# associate generators with the generator-level and the plant-level data;
# the removal of retired generators stays disabled at this step
gen_assoc = stack_gens.merge(gen, on=IDX_GENS, how="outer").merge(
    gf_totals,
    on=IDX_PM_FUEL,
    how="outer",
    suffixes=("_g_tbl", "_gf_tbl"),
)

# capacity and generator-table generation summed per plant / energy source / month
fuel_totals = (
    gen_assoc.groupby(by=IDX_FUEL)[["capacity_mw", "net_generation_mwh_g_tbl"]]
    .sum(min_count=1)
    .add_suffix("_fuel")
    .reset_index()
)

gen_assoc = (
    gen_assoc.merge(fuel_totals, on=IDX_FUEL)
    .pipe(apply_dtype)
    .pipe(_associate_unconnected_records)
    .pipe(_associate_energy_source_only, gf=gf)
)

In [None]:
# inspect the stacked generator records of plant 61242
stack_gens.loc[stack_gens['plant_id_eia'].eq(61242)]

In [None]:
# inspect the associated records of plant 62562
gen_assoc.loc[gen_assoc['plant_id_eia'].eq(62562)]

In [None]:
# inspect the stacked generator records of plant 57846
stack_gens.loc[stack_gens['plant_id_eia'].eq(57846)]

In [None]:
year = 2020
pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

# derive the reporting window from `year` so the dates cannot drift from it
start_date = f'{year}-01-01'
end_date = f'{year}-12-31'

# balancing authority and state of each plant
plants_ba = plants_eia860(pudl_engine, start_date=start_date, end_date=end_date)[
    ['plant_id_eia', 'balancing_authority_code_eia', 'state']
]

plants_ba

In [None]:
# run the allocation for the analysis year; uses the notebook-level `year`
# instead of a hard-coded literal
gen_allocated = allocate_gen_fuel_by_gen(year=year)

In [None]:
# column totals of the allocated data for plant 3
gen_allocated.loc[gen_allocated['plant_id_eia'].eq(3)].sum()

In [None]:
year = 2020

# Id columns for generators.
IDX_GENS = ["plant_id_eia", "generator_id", "report_date"]

# Id columns for plant, prime mover & fuel type records.
IDX_PM_FUEL = ["plant_id_eia", "prime_mover_code", "energy_source_code", "report_date"]

# Id columns for plant & fuel type records.
IDX_FUEL = ["report_date", "plant_id_eia", "energy_source_code"]

# Data columns from generation_fuel_eia923 that are being allocated.
DATA_COLS = ["net_generation_mwh", "fuel_consumed_mmbtu"]

pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

# specify the date filter for retrieving data
year_filter = f"report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'"

# load the generators table once and reuse it both to discover the
# energy_source_code_* columns and to select the data (it was loaded twice before)
# NOTE(review): other cells call load_pudl_table(table_name, year=year) - confirm
# that the helper still accepts a raw SQL query
gens_all = load_data.load_pudl_table(f"SELECT * FROM generators_eia860 WHERE {year_filter}")
gens = gens_all.loc[
    :,
    IDX_GENS
    + [
        "capacity_mw",
        "operational_status",
        "retirement_date",
    ]
    + list(gens_all.filter(like="energy_source_code")),
]
# add the prime mover code to the gens df from generators entity
gens = gens.merge(
    load_data.load_pudl_table("generators_entity_eia").loc[:, ["plant_id_eia", "generator_id", "prime_mover_code"]],
    how='left',
    on=["plant_id_eia", "generator_id"],
).pipe(apply_dtype)

existing = gens.loc[(gens.operational_status == "existing")]
# keep the gens for each month until they retire, if they have any data to report in that month
retiring = gens.loc[
    (gens.operational_status == "retired")
    & (gens.report_date <= gens.retirement_date)
]

# generators with any other operational status are dropped here
gens = pd.concat([existing, retiring])

In [None]:
# inspect the generator records of plant 57846
gens.loc[gens['plant_id_eia'].eq(57846)]

In [None]:
# for each plant with a generator whose energy_source_code_1 is geothermal,
# list the distinct prime mover codes of those generators (as a string)
geo_in_eia = (
    gens.loc[gens['energy_source_code_1'] == 'GEO']
    .groupby('plant_id_eia')['prime_mover_code']
    .unique()
    .astype(str)
    .reset_index()
)
eia_geo_plants = list(geo_in_eia['plant_id_eia'].unique())
geo_in_eia

In [None]:
# update the geothermal geotypes with new generators:
# start from the static geotype table and list the plants it covers
geothermal_geotype = pd.read_csv(
    '../data/egrid/egrid_static_tables/table_geothermal_geotype.csv'
)
epa_geo_plants = list(geothermal_geotype['plant_id_eia'].unique())
geothermal_geotype

In [None]:
# plant ids whose geotype code in the static table is 'S'
steam_geo = list(
    geothermal_geotype.loc[geothermal_geotype['geotype_code'] == 'S', 'plant_id_eia']
)

In [None]:
# load the plants entity table (no year argument is passed for this table)
plants = load_data.load_pudl_table("plants_entity_eia")

In [None]:
# plant entity records of the plants with geotype code 'S'
plants.loc[plants['plant_id_eia'].isin(steam_geo)]

In [None]:
# geothermal plants found in the EIA data that the static geotype table does not cover
geo_not_in_epa = list(set(eia_geo_plants).difference(set(epa_geo_plants)))

geo_in_eia.loc[geo_in_eia['plant_id_eia'].isin(geo_not_in_epa)]

In [None]:
def calculate_geothermal_emission_factors():
    """
    Calculate a monthly CO2 emission factor for each geothermal plant.

    Updates the list of geothermal plants provided by EPA using EIA data, then
    calculates a weighted average EF for each plant-month based on the fraction
    of fuel consumed from each type of prime mover (steam, binary, flash).

    Reads the notebook-level globals ``year`` and ``pudl_engine`` and calls the
    notebook-level ``create_monthly_gens_records``; all three must be defined
    before this function is called.

    Returns:
        pandas.DataFrame: the plant-month emission factors derived from EIA data
        (columns ``plant_id_eia``, ``report_date``, ``co2_tons_per_mmbtu``),
        followed by the records built by ``create_monthly_gens_records`` for
        plants that appear only in the EPA geotype table.
    """
    plant_month = ['plant_id_eia', 'report_date']

    # load the eia generation fuel data
    generation_fuel_eia923 = pd.read_sql(f"SELECT * FROM generation_fuel_eia923 WHERE report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'", pudl_engine)

    # create a dataframe of total heat input by prime mover for each geothermal plant
    # (select the column before summing so that no other column is aggregated)
    geo_in_eia = (
        generation_fuel_eia923[generation_fuel_eia923['energy_source_code'] == 'GEO']
        .groupby(['plant_id_eia', 'prime_mover_code', 'report_date'])['fuel_consumed_mmbtu']
        .sum()
        .reset_index()
    )
    # remove prime movers for which there was no heat input
    geo_in_eia = geo_in_eia[geo_in_eia['fuel_consumed_mmbtu'] > 0]

    # merge in the EPA's assigned Geotype
    geothermal_geotype = pd.read_csv('../data/egrid/egrid_static_tables/table_geothermal_geotype.csv')
    geo_in_eia = geo_in_eia.merge(geothermal_geotype[['plant_id_eia', 'geotype_code']], how='left', on='plant_id_eia')

    is_binary = geo_in_eia['prime_mover_code'] == 'BT'
    is_steam_turbine = geo_in_eia['prime_mover_code'] == 'ST'

    # if EPA assigned a plant as flash or steam, but EIA identified it as binary, re-assign as binary
    # (this also covers plants with multiple prime mover types, so they need no separate rule)
    geo_in_eia.loc[is_binary & geo_in_eia['geotype_code'].isin(['F', 'S']), 'geotype_code'] = 'B'

    # if EPA assigned a plant as binary, but EIA identified it as a steam turbine, re-assign as flash
    # we use flash instead of steam, b/c flash is more common than steam according to EIA
    # Source: https://www.eia.gov/energyexplained/geothermal/geothermal-power-plants.php
    geo_in_eia.loc[is_steam_turbine & geo_in_eia['geotype_code'].isin(['B']), 'geotype_code'] = 'F'

    # where plants are missing a geotype code, assign based on the EIA-identified prime mover
    geo_in_eia.loc[geo_in_eia['geotype_code'].isna() & is_binary, 'geotype_code'] = 'B'
    geo_in_eia.loc[geo_in_eia['geotype_code'].isna() & is_steam_turbine, 'geotype_code'] = 'F'

    # calculate the fraction of heat input from each prime mover in each month
    # transform() keeps one total per row, so rows that share a geotype within a
    # plant-month are not duplicated (as a merge on geotype_code would do)
    plant_month_total = geo_in_eia.groupby(plant_month)['fuel_consumed_mmbtu'].transform('sum')
    geo_in_eia['fuel_frac'] = geo_in_eia['fuel_consumed_mmbtu'] / plant_month_total

    # calculate a weighted average emission factor for each plant

    # load geothermal efs
    geothermal_efs = pd.read_csv('../data/egrid/egrid_static_tables/table_C6_geothermal_emission_factors.csv')[['geotype_code', 'co2_lb_per_mmbtu']]
    # convert lb to ton
    geothermal_efs['co2_tons_per_mmbtu'] = geothermal_efs['co2_lb_per_mmbtu'] / 2000
    geothermal_efs = geothermal_efs[['geotype_code', 'co2_tons_per_mmbtu']]
    # merge in the emission factor
    geo_in_eia = geo_in_eia.merge(geothermal_efs, how='left', on='geotype_code')
    # multiply the emission factor by the fraction
    geo_in_eia['co2_tons_per_mmbtu'] = geo_in_eia['fuel_frac'] * geo_in_eia['co2_tons_per_mmbtu']

    # groupby plant and month to get the weighted emission factor
    # NOTE: rows whose geotype_code has no emission factor are NaN and are skipped by the sum
    geo_in_eia = geo_in_eia.groupby(plant_month)['co2_tons_per_mmbtu'].sum().reset_index()

    # if there are any plants missing from our list, add them back in

    # identify the plants that are in the epa geotype table but not the EIA-derived one
    epa_geo_plants = list(geothermal_geotype.plant_id_eia.unique())
    plants_from_eia = list(geo_in_eia.plant_id_eia.unique())
    missing_plants = list(set(epa_geo_plants) - set(plants_from_eia))

    # create a dataframe with the geotype of all missing plants
    missing_plants = geothermal_geotype.loc[geothermal_geotype['plant_id_eia'].isin(missing_plants), ['plant_id_eia', 'geotype_code']]

    # merge in the efs
    missing_plants = missing_plants.merge(geothermal_efs, how='left', on='geotype_code')

    # drop the geotype code
    missing_plants = missing_plants.drop(columns=['geotype_code'])

    # create a record for each month of the year
    missing_plants = create_monthly_gens_records(missing_plants, year)

    # concat the missing plants to the other dataframe
    geo_efs = pd.concat([geo_in_eia, missing_plants], axis=0)

    return geo_efs


In [None]:
# compute the geothermal emission factors and display the result
geo_efs = calculate_geothermal_emission_factors()
geo_efs

# Identify mismatches between 923 tables

In [None]:
import sqlalchemy as sa

# year analysed in this section
year = 2020

# relative path to the SQLite database and the sqlalchemy engine used to query it
pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

# restrict every query to the report dates of the selected year
year_filter = f"report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'"

# load the generation, generation & fuel, and boiler fuel data for the year
generation_eia923 = pd.read_sql(
    f"SELECT * FROM generation_eia923 WHERE {year_filter}", pudl_engine
)
generation_fuel_eia923 = pd.read_sql(
    f"SELECT * FROM generation_fuel_eia923 WHERE {year_filter}", pudl_engine
)
boiler_fuel_eia923 = pd.read_sql(
    f"SELECT * FROM boiler_fuel_eia923 WHERE {year_filter}", pudl_engine
)

In [None]:
# annual net generation per plant according to each 923 table
# (select the column before summing so that id / text columns are not aggregated)
gen_agg = generation_eia923.groupby(['plant_id_eia'])['net_generation_mwh'].sum().reset_index()
gf_agg = generation_fuel_eia923.groupby(['plant_id_eia'])['net_generation_mwh'].sum().reset_index()

In [None]:
# generator-level generation records of plant 64408
generation_eia923.loc[generation_eia923['plant_id_eia'].eq(64408)]

In [None]:
# generation & fuel records of plant 64408
generation_fuel_eia923.loc[generation_fuel_eia923['plant_id_eia'].eq(64408)]

In [None]:
# plants present in both tables, with the two generation totals side by side
compare = gen_agg.merge(
    gf_agg,
    on=['plant_id_eia'],
    how='inner',
    suffixes=('_g', '_gf'),
)
# keep plants whose rounded totals differ and whose generator-level total is positive
compare.loc[
    compare['net_generation_mwh_g'].round(0).ne(compare['net_generation_mwh_gf'].round(0))
    & compare['net_generation_mwh_g'].gt(0)
]

In [None]:
# heat input of each boiler fuel record: units consumed times heat content per unit
boiler_fuel_eia923['fuel_consumed_mmbtu'] = boiler_fuel_eia923['fuel_consumed_units'].mul(
    boiler_fuel_eia923['fuel_mmbtu_per_unit']
)

In [None]:
# monthly heat input per plant according to the generation & fuel table and the boiler fuel table
# (select the column before summing so that id / text columns are not aggregated)
gf_agg_f = generation_fuel_eia923.groupby(['plant_id_eia', 'report_date'])['fuel_consumed_mmbtu'].sum().reset_index()
bf_agg = boiler_fuel_eia923.groupby(['plant_id_eia', 'report_date'])['fuel_consumed_mmbtu'].sum().reset_index()
compare = gf_agg_f.merge(bf_agg, how='inner', on=['plant_id_eia', 'report_date'], suffixes=('_gf', '_bf'))
# plant-months whose rounded totals differ and whose boiler-level total is positive
compare[(compare['fuel_consumed_mmbtu_gf'].round(0) != compare['fuel_consumed_mmbtu_bf'].round(0)) & (compare['fuel_consumed_mmbtu_bf'] > 0)]

In [None]:
# column totals of the generation & fuel records of plant 3
generation_fuel_eia923.loc[generation_fuel_eia923['plant_id_eia'].eq(3)].sum()

In [None]:
# generation & fuel records of plant 404 for the 2020-01-01 report date
generation_fuel_eia923.loc[
    generation_fuel_eia923['plant_id_eia'].eq(404)
    & generation_fuel_eia923['report_date'].eq('2020-01-01')
]

In [None]:
import src.load_data as load_data
import sqlalchemy as sa

year = 2020

# Id columns for generators.
IDX_GENS = ["plant_id_eia", "generator_id", "report_date"]

# Id columns for plant, prime mover & fuel type records.
IDX_PM_FUEL = ["plant_id_eia", "prime_mover_code", "energy_source_code", "report_date"]

# Id columns for plant & fuel type records.
IDX_FUEL = ["report_date", "plant_id_eia", "energy_source_code"]

# Data columns from generation_fuel_eia923 that are being allocated.
DATA_COLS = ["net_generation_mwh", "fuel_consumed_mmbtu"]

pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

# specify the date filter for retrieving data
year_filter = f"report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'"

# load all of the tables early in the process and select
# only the columns we need. this is for speed and clarity.
# NOTE(review): other cells call load_pudl_table(table_name, year=year) - confirm
# that the helper still accepts a raw SQL query
gf = load_data.load_pudl_table(f"SELECT * FROM generation_fuel_eia923 WHERE {year_filter}").loc[
    :, IDX_PM_FUEL + ["net_generation_mwh", "fuel_consumed_mmbtu"]
].pipe(apply_dtype)
gen = (
    load_data.load_pudl_table(f"SELECT * FROM generation_eia923 WHERE {year_filter}").loc[:, IDX_GENS + ["net_generation_mwh"]]
    # removes 4 records with NaN generator_id as of pudl v0.5
    .dropna(subset=IDX_GENS)
).pipe(apply_dtype)
# load the generators table once and reuse it both to discover the
# energy_source_code_* columns and to select the data (it was loaded twice before)
gens_all = load_data.load_pudl_table(f"SELECT * FROM generators_eia860 WHERE {year_filter}")
gens = gens_all.loc[
    :,
    IDX_GENS
    + [
        "capacity_mw",
        "operational_status",
        "retirement_date",
    ]
    + list(gens_all.filter(like="energy_source_code")),
]
# add the prime mover code to the gens df from generators entity
gens = gens.merge(
    load_data.load_pudl_table("generators_entity_eia").loc[:, ["plant_id_eia", "generator_id", "prime_mover_code"]],
    how='left',
    on=["plant_id_eia", "generator_id"],
).pipe(apply_dtype)

In [None]:
# generation & fuel records that report negative net generation
gf.loc[gf['net_generation_mwh'].lt(0)]

In [None]:
# duplicate the entries for each month
# NOTE: `gens` is rebuilt from itself, so running this cell a second time
# duplicates the records again
monthly_frames = [gens]
gens_month = gens.copy()

for _ in range(2, 13):
    # shift the previous copy forward by one month (months 2 through 12)
    gens_month = gens_month.assign(
        report_date=gens_month['report_date'] + pd.DateOffset(months=1)
    )
    monthly_frames.append(gens_month)

# concatenate once instead of growing `gens` inside the loop
gens = pd.concat(monthly_frames, axis=0)

