In [1]:
# import packages
import pandas as pd
import numpy as np
import os
import plotly.express as px

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys

sys.path.append("../../src/")

import oge.download_data as download_data
import oge.load_data as load_data
from oge.column_checks import get_dtypes, apply_dtypes, DATA_COLUMNS
from oge.filepaths import *
import oge.impute_hourly_profiles as impute_hourly_profiles
import oge.data_cleaning as data_cleaning
import oge.output_data as output_data
import oge.emissions as emissions
import oge.helpers as helpers
import oge.validation as validation
import oge.gross_to_net_generation as gross_to_net_generation
import oge.eia930 as eia930
from oge.logging_util import get_logger, configure_root_logger
import oge.constants as constants

configure_root_logger()
logger = get_logger("test")


year = 2024
path_prefix = f"{year}/"

# Test that our EIA-923 check is working
I'm not sure that it is catching all dropped values,
e.g. plant 64672 (BLUJS).

In [2]:
# import packages
import pandas as pd
# # Tell python where to look for modules.
import sys

sys.path.append("../../src/")
import oge.data_cleaning as data_cleaning
import oge.load_data as load_data
import pudl.analysis.allocate_gen_fuel as allocate_gen_fuel
year = 2024

In [3]:
# Allocate fuel and generation across each generator-pm-energy source.
# The PUDL input tables are loaded in the same order as the names being assigned.
gf, bf, gen, gens, bga = (
    load_data.load_pudl_table(table_name, year)
    for table_name in (
        "out_eia923__monthly_generation_fuel_combined",
        "out_eia923__monthly_boiler_fuel",
        "out_eia923__monthly_generation",
        "out_eia__yearly_generators",
        "core_eia860__assn_boiler_generator",
    )
)

# NOTE: As of 12/7/2024, a bug in the pudl data introduces incorrect generators.
# See: https://github.com/catalyst-cooperative/pudl/issues/3987
# The workaround is to drop `gens` rows with a missing "data_maturity" value.
# As of 11/27/25 the problem still exists in the table, but the workaround
# below is currently disabled (left commented out).
#gens = gens[~gens["data_maturity"].isna()]

gf, bf, gen, bga, gens = allocate_gen_fuel.select_input_data(
    gf=gf,
    bf=bf,
    gen=gen,
    bga=bga,
    gens=gens,
)
# freq="MS" is passed through unchanged to the PUDL allocation routine.
gen_fuel_allocated = allocate_gen_fuel.allocate_gen_fuel_by_generator_energy_source(
    gf, bf, gen, bga, gens, freq="MS"
)


2025-12-03 08:30:43 [    INFO] catalystcoop.pudl.analysis.allocate_gen_fuel:478 The granular data tables contain 61.9% of the fuel and 56.1% of net generation in the higher-coverage generation_fuel_eia923 table.
2025-12-03 08:30:43 [    INFO] catalystcoop.pudl.analysis.allocate_gen_fuel:1604 Distributing 0.1% annually reported records to months.
  .ffill()
  .ffill()
2025-12-03 08:31:27 [    INFO] catalystcoop.pudl.analysis.allocate_gen_fuel:1604 Distributing 0.3% annually reported records to months.
  .ffill()
2025-12-03 08:31:46 [    INFO] catalystcoop.pudl.analysis.allocate_gen_fuel:1061 Associating and allocating 152 (0.0%) records with unexpected prime_mover_code.
2025-12-03 08:31:48 [    INFO] catalystcoop.pudl.analysis.allocate_gen_fuel:1061 Associating and allocating 1039 (0.3%) records with unexpected prime_mover_code.
2025-12-03 08:31:53 [    INFO] catalystcoop.pudl.analysis.allocate_gen_fuel:1266 Ratio calc types: 
   All gens w/in generation table:  85674#, 1.5e+07 MW
   So

In [4]:
%reload_ext autoreload
%autoreload 2
import oge.validation as validation

In [None]:
# Plant 55088: the ST prime mover is getting dropped from the allocation.
# There is no ST prime mover in the gens table, but it looks like the CA prime mover is missing data.
# Similarly for plant 955: it reports HY and GT generation, but only IC is in the gens table. It is owned by the City of Peru, IL, and it looks like they may be reporting generation for the wrong generator.

In [12]:
# Generator records for plant 56615.
gens.loc[gens["plant_id_eia"] == 56615]

Unnamed: 0,report_date,plant_id_eia,generator_id,prime_mover_code,unit_id_pudl,capacity_mw,fuel_type_count,operational_status,generator_retirement_date,energy_source_code_1,energy_source_code_2,energy_source_code_3,energy_source_code_4,energy_source_code_5,energy_source_code_6,planned_energy_source_code_1,startup_source_code_1,startup_source_code_2,startup_source_code_3,startup_source_code_4
18223,2024-01-01,56615,G100,HY,,4.6,1,retired,2022-10-01,WAT,,,,,,,,,,
18224,2024-01-01,56615,G200,HY,,4.3,1,proposed,NaT,WAT,,,,,,,,,,


In [14]:
# Generator-level net generation records for plant 56615.
gen.loc[gen["plant_id_eia"] == 56615]

Unnamed: 0,report_date,plant_id_eia,generator_id,net_generation_mwh


In [13]:
# Generation/fuel records for plant 56615.
gf.loc[gf["plant_id_eia"] == 56615]

Unnamed: 0,report_date,plant_id_eia,energy_source_code,prime_mover_code,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu
7686,2024-01-01,56615,WAT,HY,-12.0,0.0,0.0
24889,2024-02-01,56615,WAT,HY,-11.0,0.0,0.0
42095,2024-03-01,56615,WAT,HY,-11.0,0.0,0.0
59305,2024-04-01,56615,WAT,HY,-11.0,0.0,0.0
76534,2024-05-01,56615,WAT,HY,-11.0,0.0,0.0
93807,2024-06-01,56615,WAT,HY,-12.0,0.0,0.0
111077,2024-07-01,56615,WAT,HY,-14.0,0.0,0.0
128371,2024-08-01,56615,WAT,HY,-15.0,0.0,0.0
145683,2024-09-01,56615,WAT,HY,-13.0,0.0,0.0
163011,2024-10-01,56615,WAT,HY,-12.0,0.0,0.0


In [8]:
# Allocated generation/fuel records for plant 55088.
gen_fuel_allocated.loc[gen_fuel_allocated["plant_id_eia"] == 55088]

Unnamed: 0,report_date,plant_id_eia,generator_id,prime_mover_code,energy_source_code,energy_source_code_num,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu
17200,2024-01-01,55088,GT 1,CT,NG,energy_source_code_1,2223.171361,6704.968,6704.968
17201,2024-01-01,55088,GT2,CT,NG,energy_source_code_1,2223.171361,6704.968,6704.968
17202,2024-01-01,55088,GTP1,GT,NG,energy_source_code_1,395344.475082,3795108.0,2921722.0
17203,2024-01-01,55088,ST1,CA,BFG,energy_source_code_1,134829.375,406635.0,406635.0
17204,2024-01-01,55088,ST1,CA,NG,energy_source_code_2,6727.460568,20289.67,20289.67
51552,2024-02-01,55088,GT 1,CT,NG,energy_source_code_1,2182.540761,7113.308,7113.308
51553,2024-02-01,55088,GT2,CT,NG,energy_source_code_1,2182.540761,7113.308,7113.308
51554,2024-02-01,55088,GTP1,GT,NG,energy_source_code_1,352557.583077,3421179.0,2624045.0
51555,2024-02-01,55088,ST1,CA,BFG,energy_source_code_1,121678.742188,396576.0,396576.0
51556,2024-02-01,55088,ST1,CA,NG,energy_source_code_2,6604.509739,21525.33,21525.33


In [7]:
# Run the project's EIA-923 allocation check on the freshly allocated data.
# NOTE(review): judging by the displayed output, it reports plant/fuel totals
# that differ between the input gf table and the allocation — confirm against
# oge.validation.
validation.check_allocated_gf_matches_input_gf(year, gen_fuel_allocated)

                                 net_generation_mwh_gf  fuel_consumed_mmbtu_gf  fuel_consumed_for_electricity_mmbtu_gf  net_generation_mwh_alloc  fuel_consumed_mmbtu_alloc  fuel_consumed_for_electricity_mmbtu_alloc source_table
plant_id_eia energy_source_code                                                                                                                                                                                                    
955          JF                           7.600000e+01                  1492.0                                  1492.0                       NaN                        NaN                                        NaN     input_gf
             WAT                          1.557600e+04                 53147.0                                 53147.0                       NaN                        NaN                                        NaN     input_gf
55088        NG                           4.716937e+06              41534750.0          

In [None]:
validation.check_allocated_gf_matches_input_gf(year, gen_fuel_allocated)

In [6]:
# Plant/fuel totals of the input generation-fuel data and of the allocated
# output. min_count=1 leaves a group as NaN when it has no non-missing values
# instead of summing it to 0.
total_keys = ["plant_id_eia", "energy_source_code"]
total_columns = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
]

plant_total_gf = gf.groupby(total_keys, dropna=False)[total_columns].sum(min_count=1)

plant_total_alloc = gen_fuel_allocated.groupby(total_keys, dropna=False)[
    total_columns
].sum(min_count=1)



In [5]:
import numpy as np

In [7]:
# Compare plant/fuel totals from the input generation-fuel table (suffix "_gf")
# with the totals after allocation (suffix "_alloc") and display every
# plant/fuel pair whose totals disagree.
compared_columns = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
]

mismatched_allocation = plant_total_gf.merge(
    plant_total_alloc,
    how="outer",
    on=["plant_id_eia", "energy_source_code"],
    validate="1:1",
    suffixes=("_gf", "_alloc"),
    indicator="source_table",
)

# Drop rows whose six totals sum to zero (missing values are skipped by sum).
has_nonzero_total = (
    mismatched_allocation[
        [f"{col}_gf" for col in compared_columns]
        + [f"{col}_alloc" for col in compared_columns]
    ].sum(axis=1)
    != 0
)
# .copy() makes the filtered frame independent of the merge result, so the
# column assignment below no longer triggers pandas' SettingWithCopyWarning.
mismatched_allocation = mismatched_allocation[has_nonzero_total].copy()

# Replace the merge indicator values with the names of the tables they refer to.
mismatched_allocation["source_table"] = mismatched_allocation["source_table"].map(
    {"left_only": "input_gf", "right_only": "allocated_gf", "both": "both"}
)

# Flag a row when any of the three totals differs between the two tables.
# np.isclose never treats NaN as close, so rows present in only one table are
# always flagged.
totals_differ = np.zeros(len(mismatched_allocation), dtype=bool)
for col in compared_columns:
    totals_differ |= ~np.isclose(
        mismatched_allocation[f"{col}_gf"],
        mismatched_allocation[f"{col}_alloc"],
        atol=0.0001,
    )
mismatched_allocation[totals_differ]

Unnamed: 0_level_0,Unnamed: 1_level_0,net_generation_mwh_gf,fuel_consumed_mmbtu_gf,fuel_consumed_for_electricity_mmbtu_gf,net_generation_mwh_alloc,fuel_consumed_mmbtu_alloc,fuel_consumed_for_electricity_mmbtu_alloc,source_table
plant_id_eia,energy_source_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
955,JF,76.0,1492.0,1492.0,,,,input_gf
955,WAT,15576.0,53147.0,53147.0,,,,input_gf
55088,NG,4716937.0,41534750.0,31899452.0,4561549.0,41092520.0,31457220.0,both
56615,WAT,-147.0,0.0,0.0,,,,input_gf


In [None]:
test.columns

In [None]:
# Keep only the plant/fuel groups whose totals do not sum to zero, then collect
# the (plant_id_eia, energy_source_code) keys found in exactly one of the two
# tables.
plant_total_gf = plant_total_gf.loc[plant_total_gf.sum(axis=1) != 0]
plant_total_alloc = plant_total_alloc.loc[plant_total_alloc.sum(axis=1) != 0]
symmetric_difference = set(plant_total_gf.index) ^ set(plant_total_alloc.index)

In [None]:
symmetric_difference

In [None]:
# NOTE: this rebinds `gens` to the core_eia860__scd_generators table, loaded
# with year=None (presumably all report years — confirm in load_data).
gens = load_data.load_pudl_table("core_eia860__scd_generators", year=None)
# Show the identifying columns of the records that have no data_maturity value.
gens.loc[
    gens["data_maturity"].isna(),
    ["plant_id_eia", "generator_id", "report_date", "data_maturity"],
]

In [None]:
gens.dropna()

In [None]:
# Generator records for plant 1.
gens.loc[gens["plant_id_eia"] == 1]

In [None]:
# load intermediate output data as needed
def _read_output_zip(relative_path, **read_csv_options):
    """Read one zipped CSV from the outputs folder into a DataFrame.

    Args:
        relative_path: path of the file, relative to the outputs folder.
        **read_csv_options: extra keyword arguments forwarded to pd.read_csv.
    """
    return pd.read_csv(
        outputs_folder(relative_path), compression="zip", **read_csv_options
    )


# Hourly CEMS outputs carry both an hourly timestamp and a report date.
cems = _read_output_zip(
    f"{year}/cems_subplant_{year}.csv.zip",
    parse_dates=["datetime_utc", "report_date"],
)
partial_cems_subplant = _read_output_zip(
    f"{year}/partial_cems_subplant_{year}.csv.zip",
    parse_dates=["datetime_utc", "report_date"],
)
partial_cems_plant = _read_output_zip(
    f"{year}/partial_cems_plant_{year}.csv.zip",
    parse_dates=["datetime_utc", "report_date"],
)
# The allocated EIA-923 output only has a report date to parse.
eia923_allocated = _read_output_zip(
    f"{year}/eia923_allocated_{year}.csv.zip",
    parse_dates=["report_date"],
)
# Static lookup tables: read without any date parsing.
plant_attributes = _read_output_zip(f"{year}/plant_static_attributes_{year}.csv.zip")
primary_fuel_table = _read_output_zip(f"{year}/primary_fuel_table_{year}.csv.zip")

# Rows of the allocated EIA-923 data whose hourly_data_source is "eia".
monthly_eia_data_to_shape = eia923_allocated.loc[
    eia923_allocated["hourly_data_source"] == "eia"
]

Notes:
- SWPP petroleum data is missing from the profile data in Nov 2024. It looks like a merge error, since the data seems to exist in the raw EIA-930 data.
- There's zero petroleum generation in that month, so it is getting removed.
- This is a bug in the profile calculation.
- In this case, we should be using a flat profile.
- It also seems like the … (TODO: this note was left unfinished)

In [None]:
# Load the hourly profiles output for this year, using the project's column dtypes.
profiles_path = outputs_folder(f"{year}/hourly_profiles_{year}.csv.zip")
profiles = pd.read_csv(
    profiles_path,
    dtype=get_dtypes(),
    parse_dates=["datetime_utc", "report_date"],
    compression="zip",
)
profiles

In [None]:
# November rows of the SWPP petroleum profile.
is_swpp = profiles["ba_code"] == "SWPP"
is_petroleum = profiles["fuel_category"] == "petroleum"
is_november = profiles["datetime_utc"].dt.month == 11
profiles.loc[is_swpp & is_petroleum & is_november]

In [None]:
# Plot the CEMS profile for SWPP petroleum over the November timestamps.
swpp_petroleum_november = profiles.loc[
    (profiles["ba_code"] == "SWPP")
    & (profiles["fuel_category"] == "petroleum")
    & (profiles["datetime_utc"].dt.month == 11)
]
px.line(swpp_petroleum_november, x="datetime_utc", y=["cems_profile"])

In [None]:
# Read the EIA-930 CSV from the outputs folder, using its first column as index.
# NOTE(review): path_prefix already ends with "/", so this path contains "//".
# That is presumably harmless on a local filesystem — confirm before changing.
eia930_elec_path = outputs_folder(f"{path_prefix}/eia930/eia930_elec.csv")
data930 = pd.read_csv(eia930_elec_path, index_col=0)
data930

In [None]:
data930["EBA.SWPP-ALL.NG.OIL.H"]

In [None]:
# Load the cleaned EIA-930 file the same way the pipeline does.
clean_930_file = outputs_folder(f"{path_prefix}/eia930/eia930_elec.csv")
eia930_data = eia930.load_chalendar_for_pipeline(clean_930_file, year=year)

# Until the physics reconciliation is fixed, these post-processing steps have
# to be applied, in this order.
for postprocess_step in (
    eia930.remove_imputed_ones,
    eia930.remove_months_with_zero_data,
):
    eia930_data = postprocess_step(eia930_data)

In [None]:
eia930_data

In [None]:
# November rows of the SWPP petroleum series in the processed EIA-930 data.
is_swpp_930 = eia930_data["ba_code"] == "SWPP"
is_petroleum_930 = eia930_data["fuel_category_eia930"] == "petroleum"
is_november_930 = eia930_data["datetime_utc"].dt.month == 11
eia930_data.loc[is_swpp_930 & is_petroleum_930 & is_november_930]

In [None]:
# Copy the plant ID column, set its first value to pd.NA, and cast the result
# to the nullable "Int32" dtype.
test = partial_cems_subplant["plant_id_eia"].copy()  # work on a copy of the column
test.iloc[0] = pd.NA  # inject a single missing value
test.astype("Int32")

In [None]:
pd.to_numeric(partial_cems_subplant["plant_id_eia"], downcast="integer").memory_usage()

# Compare EIA-923

In [None]:
import pudl.analysis.allocate_gen_fuel as allocate_gen_fuel

In [None]:
# Allocate fuel and generation across each generator-pm-energy source
# Load the PUDL input tables for the allocation, in the same order as the
# names being assigned.
gf, bf, gen, gens, bga = (
    load_data.load_pudl_table(table_name, year)
    for table_name in (
        "out_eia923__monthly_generation_fuel_combined",
        "out_eia923__monthly_boiler_fuel",
        "out_eia923__monthly_generation",
        "out_eia__yearly_generators",
        "core_eia860__assn_boiler_generator",
    )
)

# NOTE: As of 12/7/2024, a bug in the pudl data introduces incorrect generators.
# See: https://github.com/catalyst-cooperative/pudl/issues/3987
# The workaround is to drop `gens` rows with a missing "data_maturity" value.
# As of 11/27/25 the problem still exists in the table, but the workaround
# below is currently disabled (left commented out).
#gens = gens[~gens["data_maturity"].isna()]

gf, bf, gen, bga, gens = allocate_gen_fuel.select_input_data(
    gf=gf,
    bf=bf,
    gen=gen,
    bga=bga,
    gens=gens,
)
# freq="MS" is passed through unchanged to the PUDL allocation routine.
gen_fuel_allocated = allocate_gen_fuel.allocate_gen_fuel_by_generator_energy_source(
    gf, bf, gen, bga, gens, freq="MS"
)


In [None]:
# Plant 64672 is proposed but is not picked up by the filters.
# We need a way to identify gf data that is unique to a generator, e.g. a flag
# indicating that the PM/FUEL combination is unique to that generator.
gens.loc[gens["plant_id_eia"] == 64672]

In [None]:
# Allocated generation/fuel records for plant 64672.
gen_fuel_allocated.loc[gen_fuel_allocated["plant_id_eia"] == 64672]

In [None]:
gen_fuel_allocated_filtered

In [None]:
gen_fuel_allocated.compare(gen_fuel_allocated_filtered)

In [None]:
# NOTE: instead of running the allocation ourselves with the pudl code, we can
# load the already-allocated table directly from pudl. However, we have some
# changes to this code in oge_dev.
# NOTE(review): because of those changes the two results may differ — confirm.
gen_fuel_allocated_pudl = load_data.load_pudl_table(
    "out_eia923__monthly_generation_fuel_by_generator_energy_source", year
)


In [None]:
gen_fuel_allocated

In [None]:
gen_fuel_allocated_pudl

In [None]:
# Outer-join our allocation with the table loaded from PUDL, one row per key.
# Overlapping columns from the PUDL table get the "_pudl" suffix.
allocation_keys = [
    "report_date",
    "plant_id_eia",
    "generator_id",
    "prime_mover_code",
    "energy_source_code",
]
test = gen_fuel_allocated.merge(
    gen_fuel_allocated_pudl,
    on=allocation_keys,
    how="outer",
    suffixes=(None, "_pudl"),
    validate="1:1",
)


In [None]:
# Rows whose net generation differs between our allocation and the PUDL table.
net_generation_differs = ~np.isclose(
    test["net_generation_mwh"], test["net_generation_mwh_pudl"], atol=0.1
)
test[net_generation_differs]

In [None]:
# Generator records for plant 66.
gens.loc[gens["plant_id_eia"] == 66]