In [1]:
# import packages
import numpy as np
import pandas as pd
import os
import plotly.express as px
from IPython.display import display

%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
import sys
sys.path.append('../../../open-grid-emissions/src/')

# import local modules
import load_data
import validate_egrid
from filepaths import *

#from column_checks import get_dtypes

# Specify the year for validation

In [10]:
year = 2021

## Compare Our Results to eGRID

In [11]:
# load our annual plant level data
oge_plant = validate_egrid.load_oge_plant_data(year)

# Load the eGRID plant table
egrid_plant = validate_egrid.load_egrid_plant_file(year)


### Identify plants missing from our results that exist in eGRID


In [12]:
(
    missing_from_calc,
    PLANTS_MISSING_FROM_CALCULATION,
) = validate_egrid.identify_plants_missing_from_oge(
    egrid_plant, oge_plant, year
)
missing_from_calc


Unnamed: 0,plant_id_eia,plant_id_egrid,plant_name_eia,ba_code,state,plant_primary_fuel,chp_flag,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb,nox_mass_lb,so2_mass_lb,co2_mass_lb_for_electricity_adjusted,fuel_consumed_mmbtu_ozone_season,fuel_data_source_annual,fuel_data_source_ozone


### Identify plants missing from eGRID that are in our calculations

In [13]:
(
    missing_from_egrid,
    PLANTS_MISSING_FROM_EGRID,
) = validate_egrid.identify_plants_missing_from_egrid(egrid_plant, oge_plant, year)


missing_from_egrid

Unnamed: 0,plant_id_eia,plant_name_eia,ba_code,plant_primary_fuel,net_generation_mwh,fuel_consumed_for_electricity_mmbtu,fuel_consumed_mmbtu,data_availability,operational_status,current_planned_operating_date,retirement_date
0,2518,Shoreham,NYIS,DFO,3966.0,61689.0,61689.0,eia_only,[existing],[None],[None]
1,7922,North Plant,MISO,DFO,20.8,218.0,218.0,eia_only,[retired],[None],[2011-09-01]
2,10154,Chocolate Bayou Works,ERCO,NG,784064.1,4103418.6,12008364.9,eia_only,[existing],[None],[None]
3,10350,Greenleaf Unit One,CISO,NG,1058.65,10484.0,10484.0,cems_only,"[retired, proposed]","[None, 2022-08-01]","[2019-03-01, None]"
4,10397,Indiana Harbor,MISO,BFG,225367.0,1141549.5,8839609.9,eia_only,,,
5,54995,5 AC Station,MISO,BFG,497881.0,2642313.7,10866629.0,eia_only,,,
6,61852,Eagle Shadow Mountain Solar Farm,NEVP,SUN,31264.0,276467.0,276467.0,eia_only,[proposed],[2023-03-01],[None]
7,64927,Aktina Solar,ERCO,SUN,110147.0,974030.0,974030.0,eia_only,[proposed],[2022-12-01],[None]


### Compare whether totals for plants with EPA IDs that differ from EIA IDs match

In [14]:
validate_egrid.check_plants_with_different_egrid_eia_plant_ids(oge_plant, egrid_plant)


   plant_id_egrid  net_generation_mwh_oge  net_generation_mwh_egrid  percent_diff
5           10474                723248.0                       NaN           NaN


### Identify plants where our BA assignment does not match eGRID

In [15]:
validate_egrid.check_ba_code_assignment(oge_plant, egrid_plant)

                             plant_name_eia ba_code_egrid state_egrid ba_code_oge state_oge
plant_id_eia                                                                               
6456                           Carver Falls          ISNE          NY        NYIS        NY
64715                            Dusenberry          ISNE          NY        NYIS        NY
57600                      Fishers Island 1          ISNE          NY        NYIS        NY
64692                    Fredonia Solar LLC          ISNE          NY        NYIS        NY
64719                         Glenmere Lake          ISNE          NY        NYIS        NY
60463                Greene County Meter #1          ISNE          NY        NYIS        NY
64716                      Knapp East (CSG)          ISNE          NY        NYIS        NY
64720                       Knapp West(CSG)          ISNE          NY        NYIS        NY
65723                 LR Wheatfield Solar 1          ISNE          NY        NYI

### Identify whether the fuel codes of each plant match

In [16]:
validate_egrid.check_primary_fuel_assignment(oge_plant, egrid_plant)

                          plant_name_eia plant_primary_fuel_egrid plant_primary_fuel_oge
plant_id_eia                                                                            
7484                 NSB Nuiqsut Utility                      DFO                     NG
56076                           ABC Coke                      COG                     OG
3                                  Barry                      BIT                     NG
54802              WestRock Coated Board                       NG                    BLQ
160                       Apache Station                      SUB                     NG
...                                  ...                      ...                    ...
54098                Kaukauna Paper Mill                      BIT                    BLQ
50933                   Rhinelander Mill                      BIT                    WAT
10477         Wisconsin Rapids Pulp Mill                       NG                    BLQ
10743         Morgant

# Evaluate Plant-level discrepencies

In [None]:
# evaluate all plants
comparison_count, compared = validate_egrid.compare_plant_level_results_to_egrid(
    annual_plant_results, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
comparison_count

In [None]:
compared[(compared["ba_code"] == "SOCO") & (compared["co2_mass_lb_status"] != "!exact")]

# Split data into different groups based on known discrepencies

There are certain classes of plants where there are known issues that we know will prevent the totals from matching:
- There are certain power plants that are missing altogether from eGRID
- Geothermal power plants: EGRID sometimes has incorrect geotypes that may lead to different emissions estimates
- Nuclear power plants: eGRID is generally missing fuel consumption data
- Fuel cells are assumed by eGRID to have zero emissions
- Plants that only report CEMS data for part of the year are sometimes missing data for the rest of the year in eGRID
- CHP plants use a slightly different methodology
- plants that report data to the bf table or gen table in EIA-923 may be missing data from teh generation and fuel table.
- "proposed" plants sometimes report data before they are officially online and egrid doesn't always reflect that.


We could identify plants where the total fuel doesn't match EIA-923, but matches the total fuel for a subset of prime movers

In [None]:
pudl_out = load_data.initialize_pudl_out(year)
eia923_allocated = pd.read_csv(
    outputs_folder(f"{year}/eia923_allocated_{year}.csv"),
    dtype=get_dtypes(),
    parse_dates=["report_date"],
)

annual_plant_results_segmented = validate_egrid.segment_plants_by_known_issues(
    annual_plant_results,
    egrid_plant,
    eia923_allocated,
    pudl_out,
    PLANTS_MISSING_FROM_EGRID,
)


## Plant Metric

In [None]:
flag_columns = [
    "flag_missing_egrid",
    "flag_geothermal",
    "flag_nuclear",
    "flag_fuel_cell",
    "flag_partial_year",
    "flag_chp",
    "flag_bf_gen_reporter",
    "flag_plant_w_proposed_gen",
    "flag_proposed_plant",
]

# all data without known issues
segment_to_compare = annual_plant_results_segmented[
    annual_plant_results_segmented[flag_columns].sum(axis=1) == 0
]

comparison_count, compared = validate_egrid.compare_plant_level_results_to_egrid(
    segment_to_compare, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
comparison_count


In [None]:
validate_egrid.compare_egrid_fuel_total(segment_to_compare, egrid_plant).sum()


In [None]:
# examine results for a specific data flag
flag = "flag_proposed_plant"

# all data without known issues
segment_to_compare = annual_plant_results_segmented[
    annual_plant_results_segmented[flag] == 1
]

comparison_count, compared = validate_egrid.compare_plant_level_results_to_egrid(
    segment_to_compare, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
comparison_count


In [None]:
# evaluate all plants
comparison_count, compared = validate_egrid.compare_plant_level_results_to_egrid(
    oge_plant, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
comparison_count

In [None]:
compared

### Explore a specific set of plants

In [None]:
# specify the dataframe, metric, and status to explore
comparison_df = compared
metric = "so2_mass_lb"
status = "<50%"

comparison_df = comparison_df.merge(egrid_plant.set_index("plant_id_egrid")[[metric]], how="left", left_index=True, right_index=True, suffixes=(None,"_egrid"))
comparison_df = comparison_df.merge(annual_plant_results.set_index("plant_id_egrid")[[metric]], how="left", left_index=True, right_index=True, suffixes=(None,"_calc"))

# show the data
columns_to_show = ["plant_name_eia", "ba_code", "state", metric, f"{metric}_status", f"{metric}_egrid", f"{metric}_calc"]
comparison_df.loc[(comparison_df[f"{metric}_status"] == status), columns_to_show]


## Compare Annual BA values to eGRID BA file

In [2]:
year = 2021
path_prefix = year

DATA_COLUMNS = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    #"fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
]

In [3]:
# Load the eGRID plant table
egrid_plant = validate_egrid.load_egrid_plant_file(year)

egrid_ba = validate_egrid.load_egrid_ba_file(year)

# aggregate the plant data up to the BA level
egrid_plant_ba_agg = egrid_plant.groupby(["ba_code"]).sum()[DATA_COLUMNS].reset_index()

egrid_plant_ba_agg["generated_co2_rate_lb_per_mwh"] = egrid_plant_ba_agg["co2_mass_lb"] / egrid_plant_ba_agg["net_generation_mwh"]

  egrid_plant_ba_agg = egrid_plant.groupby(["ba_code"]).sum()[DATA_COLUMNS].reset_index()


In [4]:
# load our annual ba data
oge_ba = []

for filename in os.listdir(
    results_folder(f"{path_prefix}/power_sector_data/annual/us_units/")
):
    ba = filename.split(".")[0]
    ba_data = pd.read_csv(
        results_folder(f"{path_prefix}/power_sector_data/annual/us_units/{filename}"),
        usecols=(["fuel_category"] + DATA_COLUMNS),
    )
    ba_data = ba_data[ba_data["fuel_category"] == "total"].drop(
        columns=["fuel_category"]
    )
    ba_data["ba_code"] = ba
    ba_data = ba_data[["ba_code"] + DATA_COLUMNS]
    oge_ba.append(ba_data)

oge_ba = pd.concat(oge_ba, axis=0)

oge_ba["generated_co2_rate_lb_per_mwh"] = oge_ba["co2_mass_lb"] / oge_ba["net_generation_mwh"]


In [5]:
percent_diff_from_egrid = (
    (
        oge_ba.set_index("ba_code").replace(0, 0.1)
        - egrid_plant_ba_agg.set_index("ba_code").replace(0, 0.1)
    )
    / egrid_plant_ba_agg.set_index("ba_code").replace(0, 0.1)
).round(2)


In [8]:
# divide our calculation by the BA totals from eGRID
# if there are 0 values, replace with 0.1, so that div by zero doesn't return missing value
ba_metric = (
    oge_ba.replace(0, 0.1)
    .set_index("ba_code")
    .div(egrid_plant_ba_agg.set_index("ba_code").replace(0, 0.1))
    .sort_values(by="co2_mass_lb")
    .round(3)
)
ba_metric = ba_metric - 1

total = pd.DataFrame(
    oge_ba[DATA_COLUMNS + ["generated_co2_rate_lb_per_mwh"]]
    .sum()
    .div(egrid_plant_ba_agg[DATA_COLUMNS + ["generated_co2_rate_lb_per_mwh"]].sum())
    .rename("Total")
).T
total = total - 1

# calculate the difference in the number of plants in each region
oge_plant_count = oge_plant[
    ~(
        oge_plant[
            [
                "net_generation_mwh",
                "fuel_consumed_mmbtu",
                "fuel_consumed_for_electricity_mmbtu",
                "co2_mass_lb",
            ]
        ].sum(axis=1)
        == 0
    )
].groupby("ba_code", dropna=False)["plant_id_eia"].nunique().rename("num_plants_oge")

egrid_plant_count = egrid_plant[
        ~(
            egrid_plant[
                [
                    "net_generation_mwh",
                    "fuel_consumed_mmbtu",
                    "fuel_consumed_for_electricity_mmbtu",
                    "co2_mass_lb",
                ]
            ].sum(axis=1)
            == 0
        )
    ].groupby("ba_code", dropna=False)["plant_id_eia"].nunique().rename("num_plants_egrid")

ba_metric = ba_metric.merge(
    oge_plant_count, how="left", left_index=True, right_index=True
).sort_index()

ba_metric = ba_metric.merge(
    egrid_plant_count, how="left", left_index=True, right_index=True
).sort_index()

ba_metric = ba_metric.sort_values(by=["generated_co2_rate_lb_per_mwh"], ascending=True)

ba_metric = pd.concat([ba_metric, total], axis=0).round(3)

ba_metric = ba_metric[DATA_COLUMNS + ["generated_co2_rate_lb_per_mwh", "num_plants_oge","num_plants_egrid"]]

columns_to_check = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    #"fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
    "generated_co2_rate_lb_per_mwh"
]

ba_metric[columns_to_check] = ba_metric[columns_to_check] * 100

# ignore miscellaneous BAs
ba_metric = ba_metric[~ba_metric.index.str.contains("MS")]

ba_metric = ba_metric[~np.isclose(ba_metric[columns_to_check], 0, atol=0.5).all(axis=1)]

with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(ba_metric)


Unnamed: 0,net_generation_mwh,fuel_consumed_mmbtu,co2_mass_lb,generated_co2_rate_lb_per_mwh,num_plants_oge,num_plants_egrid
HST,0.7,0.2,-13.0,-13.5,1.0,1.0
PGE,22.5,9.0,12.1,-8.5,73.0,73.0
IID,0.0,0.0,-8.2,-8.2,54.0,54.0
HECO,18.8,18.2,14.3,-3.8,42.0,31.0
CISO,3.9,-0.3,0.7,-3.1,1276.0,1269.0
PACW,0.0,-1.0,-1.9,-1.9,99.0,99.0
FMPP,1.5,0.2,0.1,-1.4,14.0,14.0
PNM,1.3,0.0,0.0,-1.3,75.0,75.0
PACE,0.0,-0.7,-0.4,-0.4,154.0,154.0
SCEG,1.2,1.4,1.5,0.3,63.0,63.0


In [9]:
plant_cause = compared[compared["ba_code"].isin(ba_metric.index)]
plant_cause = plant_cause.loc[
    (
        ~plant_cause[
            [
                "net_generation_mwh_status",
                "fuel_consumed_mmbtu_status",
                "co2_mass_lb_status",
            ]
        ].isin(["!exact", "+/-1%", "egrid_missing_zero_value_from_oge"])
    ).any(axis=1),
    [
        "plant_name_eia",
        "ba_code",
        "state",
        "net_generation_mwh_status",
        "net_generation_mwh_oge",
        "net_generation_mwh_egrid",
        "fuel_consumed_mmbtu_status",
        "fuel_consumed_mmbtu_oge",
        "fuel_consumed_mmbtu_egrid",
        "co2_mass_lb_status",
        "co2_mass_lb_oge",
        "co2_mass_lb_egrid",
    ],
]

eia_gf = load_data.load_pudl_table("generation_fuel_eia923", year=year)
eia_esc = eia_gf[eia_gf["fuel_consumed_mmbtu"] > 0].groupby("plant_id_eia")[["energy_source_code"]].agg(["unique"]).droplevel(level=1, axis=1).add_suffix("_eia923")
eia_gf = eia_gf.groupby("plant_id_eia")[["net_generation_mwh","fuel_consumed_mmbtu"]].sum().add_suffix("_eia923")


egrid_unit_fuel = pd.read_excel(
    downloads_folder(f"egrid/egrid{year}_data.xlsx"),
    sheet_name=f"UNT{str(year)[-2:]}",
    header=1,
    usecols=["ORISPL","FUELU1"],
).rename(columns={"ORISPL":"plant_id_egrid","FUELU1":"energy_source_code_egrid"})
egrid_unit_fuel = egrid_unit_fuel.groupby("plant_id_egrid")[["energy_source_code_egrid"]].agg(["unique"]).droplevel(level=1, axis=1)

plant_cause = plant_cause.merge(eia_gf, how="left", left_index=True, right_index=True)
plant_cause = plant_cause.merge(eia_esc, how="left", left_index=True, right_index=True)
plant_cause = plant_cause.merge(egrid_unit_fuel, how="left", left_index=True, right_index=True)
plant_cause = plant_cause.merge(egrid_plant[["plant_id_egrid", "fuel_data_source_annual","fuel_data_source_ozone"]].set_index("plant_id_egrid"), how="left", left_index=True, right_index=True)

# add oge data source
plant_metadata = pd.read_csv(results_folder(f"{year}/plant_data/plant_metadata.csv"))
plant_metadata = plant_metadata.groupby("plant_id_eia")[["data_source"]].agg(["unique"]).droplevel(level=1, axis=1).add_suffix("_oge")
plant_metadata['data_source_oge'] = [','.join(map(str, l)) for l in plant_metadata['data_source_oge']]

plant_cause = plant_cause.merge(plant_metadata, how="left", left_index=True, right_index=True)

plant_cause  = plant_cause[[
        "plant_name_eia",
        "ba_code",
        "state",
        "net_generation_mwh_status",
        "net_generation_mwh_oge",
        "net_generation_mwh_egrid",
        "net_generation_mwh_eia923",
        "fuel_consumed_mmbtu_status",
        "fuel_consumed_mmbtu_oge",
        "fuel_consumed_mmbtu_egrid",
        "fuel_consumed_mmbtu_eia923",
        "data_source_oge",
        "fuel_data_source_annual",
        "fuel_data_source_ozone",
        "co2_mass_lb_status",
        "co2_mass_lb_oge",
        "co2_mass_lb_egrid",
        "energy_source_code_eia923",
        "energy_source_code_egrid"
    ]]


NameError: name 'compared' is not defined

In [None]:
from itertools import product
import validation

plants_with_missing_months = load_data.load_pudl_table("generation_fuel_eia923", year=year)
# drop plants that only report annually
plants_with_missing_months = validation.identify_reporting_frequency(plants_with_missing_months, year)
plants_with_missing_months = plants_with_missing_months[plants_with_missing_months["eia_data_resolution"] == "monthly"]
# create a dataframe with a complete set of months for each plant
complete_months = pd.DataFrame(list(product(plants_with_missing_months.plant_id_eia.unique(), plants_with_missing_months.report_date.unique())), columns=["plant_id_eia","report_date"])
plants_with_missing_months = complete_months.merge(plants_with_missing_months, how="outer", on=["plant_id_eia","report_date"])
# identify plants that have all missing data for a complete month
plants_with_missing_months = plants_with_missing_months.groupby(["plant_id_eia","report_date"], dropna=False)[["net_generation_mwh"]].sum(min_count=1).reset_index()
plants_with_missing_months = plants_with_missing_months.loc[plants_with_missing_months["net_generation_mwh"].isna(), "plant_id_eia"].unique()
plants_with_missing_months

## TODO
- [ ] Add flags for when plants are in incorrect BA vs proposed/retired plants
- [ ] streamline code and split into separate functions

In [None]:
for ba_code in ba_metric.index:
    # for ba_code in ["TEPC"]:
    print(f"Likely issues in {ba_code}:")
    for col in ["net_generation_mwh", "fuel_consumed_mmbtu", "co2_mass_lb"]:
        # idenfity the percentage difference between the BA-level data
        pct_diff = ba_metric.loc[ba_code, col]

        # filters
        fewer_plants_in_oge = (
            ba_metric.loc[ba_code, "num_plants_oge"]
            < ba_metric.loc[ba_code, "num_plants_egrid"]
        )
        more_plants_in_oge = (
            ba_metric.loc[ba_code, "num_plants_oge"]
            > ba_metric.loc[ba_code, "num_plants_egrid"]
        )
        egrid_value_less_than_oge_value = (
            plant_cause_ba[f"{col}_egrid"] < plant_cause_ba[f"{col}_oge"]
        )
        oge_value_not_close_to_egrid_value = ~plant_cause_ba[f"{col}_status"].isin(
            ["!exact", "+/-1%", "egrid_missing_zero_value_from_oge"]
        )
        oge_fuel_close_to_egrid_fuel = plant_cause_ba[
            f"fuel_consumed_mmbtu_status"
        ].isin(["!exact", "+/-1%", "egrid_missing_zero_value_from_oge"])
        oge_value_close_to_eia923_value = np.isclose(
            plant_cause_ba[f"{col}_oge"],
            plant_cause_ba[f"{col}_eia923"],
            atol=1,
        )
        egrid_value_close_to_eia923_value = np.isclose(
            plant_cause_ba[f"{col}_egrid"],
            plant_cause_ba[f"{col}_eia923"],
            atol=1,
        )

        if (pct_diff < 0.5) & (pct_diff > -0.5):
            pass
        else:
            plant_cause_ba = plant_cause[plant_cause["ba_code"] == ba_code]
            print(f"     {col}: {pct_diff.round(3)}% diff from eGRID")
            # check if it could be affected by the number of plants
            if (pct_diff < -0.5) & fewer_plants_in_oge:
                print("          fewer plants assigned to this BA in OGE")
            if (pct_diff > 0.5) & more_plants_in_oge:
                print("          more plants assigned to this BA in OGE")
            if col != "co2_mass_lb":
                if (
                    len(
                        plant_cause_ba[
                            oge_value_not_close_to_egrid_value
                            & (plant_cause_ba["fuel_data_source_annual"] == "EPA/CAMD")
                            & (plant_cause_ba["fuel_data_source_ozone"] == "EPA/CAMD")
                            & (plant_cause_ba["data_source_oge"].str.contains("EIA"))
                            & egrid_value_less_than_oge_value
                        ]
                    )
                    > 0
                ):
                    print("          eGRID not filling missing CEMS data with EIA")
                # check if rounding error in EIA gf allocation
                if (
                    len(
                        plant_cause_ba[
                            plant_cause_ba[f"{col}_status"].isin(["+/-1%"])
                            & (
                                plant_cause_ba[f"{col}_egrid"].round(0)
                                == plant_cause_ba[f"{col}_eia923"].round(0)
                            )
                            & (
                                plant_cause_ba[f"{col}_oge"].round(0)
                                != plant_cause_ba[f"{col}_eia923"].round(0)
                            )
                        ]
                    )
                    > 0
                ):
                    print("          small error in OGE EIA-923 allocation")
            if col == "net_generation_mwh":
                if (
                    len(
                        plant_cause_ba[
                            oge_value_not_close_to_egrid_value
                            & egrid_value_less_than_oge_value
                            & egrid_value_close_to_eia923_value
                            & ~oge_fuel_close_to_egrid_fuel
                        ]
                    )
                    > 0
                ):
                    print("          OGE potentially double-counting data")
                elif (
                    len(
                        plant_cause_ba[
                            oge_value_not_close_to_egrid_value
                            & oge_fuel_close_to_egrid_fuel
                            & egrid_value_close_to_eia923_value
                            & egrid_value_less_than_oge_value
                            & plant_cause_ba.index.isin(plants_with_missing_months)
                        ]
                    )
                    > 0
                ):
                    print(
                        "          Missing net generation data from EIA-923 not filled with CEMS in eGRID"
                    )
            if col == "fuel_consumed_mmbtu":
                # if (ba_metric.loc[ba_code, col] > 0.5) & (ba_metric.loc[ba_code, "net_generation_mwh"] == 0):
                # check if eGRID is only using CEMS data
                if (
                    len(
                        plant_cause_ba[
                            oge_value_not_close_to_egrid_value
                            & oge_value_close_to_eia923_value
                            & egrid_value_less_than_oge_value
                        ]
                    )
                    > 0
                ):
                    print("          OGE potentially double-counting data")
                # in cases where EIA data is used in both eGRID and OGE, and OGE matches EIA923 but eGRID doesn't
                if (
                    len(
                        plant_cause_ba[
                            oge_value_not_close_to_egrid_value
                            & (
                                plant_cause_ba["fuel_data_source_annual"].str.contains(
                                    "EIA"
                                )
                            )
                            & (plant_cause_ba["data_source_oge"].str.contains("EIA"))
                            & oge_value_close_to_eia923_value
                            & egrid_value_less_than_oge_value
                        ]
                    )
                    > 0
                ):
                    print("          eGRID missing fuel consumption from EIA-923 GF")
            if col == "co2_mass_lb":
                if (
                    len(
                        plant_cause_ba[
                            (
                                plant_cause_ba["energy_source_code_eia923"]
                                .dropna()
                                .apply(set)
                                - plant_cause_ba["energy_source_code_egrid"]
                                .dropna()
                                .apply(set)
                            ).astype(bool)
                            & oge_value_not_close_to_egrid_value
                            & oge_fuel_close_to_egrid_fuel
                        ]
                    )
                    > 0
                ):
                    print(
                        "          Incorrect fuel type assigned to fuel consumption in eGRID"
                    )
                elif (
                    len(
                        plant_cause_ba[
                            oge_value_not_close_to_egrid_value
                            & oge_fuel_close_to_egrid_fuel
                        ]
                    )
                    > 0
                ):
                    print("          Incorrect EF possibly being used")


In [None]:
col = "co2_mass_lb"

plant_cause_ba[
                            ~plant_cause_ba[f"{col}_status"].isin(
                                ["!exact", "+/-1%", "egrid_missing_zero_value_from_oge"]
                            )
                            & plant_cause_ba[f"fuel_consumed_mmbtu_status"].isin(
                                ["!exact", "+/-1%", "egrid_missing_zero_value_from_oge"]
                            )
                        ]

In [None]:
plant_metadata = pd.read_csv(results_folder(f"{year}/plant_data/plant_metadata.csv"))
plant_metadata = plant_metadata.groupby("plant_id_eia")[["data_source"]].agg(["unique"]).droplevel(level=1, axis=1).add_suffix("_oge")
plant_metadata['data_source_oge'] = [','.join(map(str, l)) for l in plant_metadata['data_source_oge']]
plant_metadata

## Explore specific plants


In [None]:
plant_to_explore = 58223


In [None]:
egrid_plant[egrid_plant["plant_id_eia"] == plant_to_explore]


In [None]:
annual_plant_results[annual_plant_results["plant_id_eia"] == plant_to_explore]


In [None]:
eia923_allocated[eia923_allocated["plant_id_eia"] == plant_to_explore]


In [None]:
eia923_allocated.loc[
    eia923_allocated["plant_id_eia"] == plant_to_explore,
    ["generator_id", "subplant_id"],
].drop_duplicates()
