In [1]:
# import packages
import numpy as np
import pandas as pd
import os
import plotly.express as px
from IPython.display import display

%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules.
import sys

sys.path.append("../../src")

# import local modules
import oge.load_data as load_data
import oge.validation as validation
from oge.filepaths import *

from oge.column_checks import get_dtypes

# Specify the year for validation

In [None]:
# Data year to validate; used below to build the results/outputs file paths and to
# select which eGRID tables are loaded.
year = 2021

## Compare Our Results to eGRID

In [None]:
# Locations of the two inputs produced by our pipeline for the chosen year.
plant_data_path = results_folder(f"{year}/plant_data/annual/us_units/plant_data.csv")
plant_attributes_path = outputs_folder(f"{year}/plant_static_attributes_{year}.csv")

# Annual plant-level results and the static plant attributes, both read with the
# project's column dtypes.
annual_plant_results = pd.read_csv(plant_data_path, dtype=get_dtypes())
plant_attributes = pd.read_csv(plant_attributes_path, dtype=get_dtypes())

# Attach the static attributes to every plant in the results (left join keeps all
# result rows), then add the eGRID plant id that corresponds to each EIA plant id.
annual_plant_results = validation.add_egrid_plant_id(
    annual_plant_results.merge(plant_attributes, how="left", on="plant_id_eia"),
    from_id="eia",
    to_id="egrid",
)

# eGRID plant table for the same year, used as the reference throughout.
egrid_plant = validation.load_egrid_plant_file(year)

### Identify plants missing from our results that exist in eGRID


In [None]:
# The helper returns a pair: the table displayed below (plants that are in eGRID but
# not in our results) and a second object stored as PLANTS_MISSING_FROM_CALCULATION.
missing_calc_result = validation.identify_plants_missing_from_our_calculations(
    egrid_plant, annual_plant_results, year
)
missing_from_calc, PLANTS_MISSING_FROM_CALCULATION = missing_calc_result
missing_from_calc

### Identify plants missing from eGRID that are in our calculations

In [None]:
# Plants that appear in our results but not in eGRID. PLANTS_MISSING_FROM_EGRID is
# reused by the plant-level comparison cells further down.
missing_egrid_result = validation.identify_plants_missing_from_egrid(
    egrid_plant, annual_plant_results
)
missing_from_egrid, PLANTS_MISSING_FROM_EGRID = missing_egrid_result

# Show only the plants whose total fuel consumption is not exactly zero, with the
# handful of columns needed to judge how much data eGRID is missing.
has_fuel_data = missing_from_egrid["fuel_consumed_mmbtu"] != 0
summary_columns = [
    "plant_id_eia",
    "plant_name_eia",
    "plant_primary_fuel",
    "net_generation_mwh",
    "fuel_consumed_for_electricity_mmbtu",
    "fuel_consumed_mmbtu",
]
missing_from_egrid.loc[has_fuel_data, summary_columns]

### Compare whether totals for plants with EPA IDs that differ from EIA IDs match

In [None]:
# Identify where a single eGRID plant id is shared by multiple EIA plant ids:
# keep=False marks every row whose eGRID id occurs more than once.
shares_egrid_id = annual_plant_results["plant_id_egrid"].duplicated(keep=False)
double_ids = annual_plant_results[shares_egrid_id]

# Focus on net generation for now. The column is selected *before* aggregating so
# that only it is summed: summing the whole frame first also tries to "sum" the
# string/categorical attribute columns, which is wasted work and fails or
# concatenates text on pandas versions where numeric_only defaults to False.
double_ids = (
    double_ids.groupby("plant_id_egrid")["net_generation_mwh"].sum().reset_index()
)

# Bring in the eGRID value for the same eGRID plant id; the overlapping column name
# is disambiguated as *_calc (ours) and *_egrid (reference).
double_ids = double_ids.merge(
    egrid_plant[["plant_id_egrid", "net_generation_mwh"]],
    how="left",
    on="plant_id_egrid",
    suffixes=("_calc", "_egrid"),
)

# Fractional difference of our total relative to eGRID (0.1 == 10% higher).
double_ids["percent_diff"] = (
    (double_ids["net_generation_mwh_calc"] - double_ids["net_generation_mwh_egrid"])
    / double_ids["net_generation_mwh_egrid"]
).round(3)
double_ids

In [None]:
# Compare eGRID vs EIA plant ids: for every eGRID plant id that is shared by more
# than one row, show the totals contributed by each underlying EIA plant id.
shares_egrid_id = annual_plant_results["plant_id_egrid"].duplicated(keep=False)

# numeric_only=True restricts the aggregation to numeric columns. The frame also
# carries text attributes (plant names, BA codes, fuel codes) for which a sum is
# meaningless and which make a bare .sum() fail or concatenate strings on pandas
# versions where numeric_only defaults to False.
annual_plant_results[shares_egrid_id].groupby(
    ["plant_id_egrid", "plant_id_eia"]
).sum(numeric_only=True)

### Identify plants where our BA assignment does not match eGRID

In [None]:
# BA code per EIA plant id from each source, indexed by plant id so the two can be
# joined on their indexes.
egrid_ba_codes = egrid_plant.set_index("plant_id_eia")[["plant_name_eia", "ba_code"]]
calc_ba_codes = annual_plant_results.set_index("plant_id_eia")[["ba_code"]]

# Inner join: only plants present in both eGRID and our results are compared. The
# shared "ba_code" column becomes ba_code_egrid / ba_code_calc.
ba_code_match = egrid_ba_codes.merge(
    calc_ba_codes,
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=("_egrid", "_calc"),
)

# Alternative view, limited to plants where only our BA code is missing:
# ba_code_match[(ba_code_match['ba_code_calc'].isna()) & ~(ba_code_match['ba_code_egrid'].isna())]
ba_codes_differ = ba_code_match["ba_code_calc"] != ba_code_match["ba_code_egrid"]
ba_code_match[ba_codes_differ]

In [None]:
# Narrow the mismatches down to plants for which eGRID actually reports a BA code,
# i.e. drop the rows where the eGRID side is simply missing.
ba_codes_differ = ba_code_match["ba_code_calc"] != ba_code_match["ba_code_egrid"]
egrid_ba_missing = ba_code_match["ba_code_egrid"].isna()
ba_code_match[ba_codes_differ & ~egrid_ba_missing]

### Identify whether the fuel codes of each plant match

In [None]:
# Primary fuel per EIA plant id from each source, indexed by plant id.
egrid_primary_fuel = egrid_plant.set_index("plant_id_eia")[
    ["plant_name_eia", "plant_primary_fuel"]
]
calc_primary_fuel = annual_plant_results.set_index("plant_id_eia")[
    ["plant_primary_fuel"]
]

# Inner join on the plant id index; the shared column is split into
# plant_primary_fuel_egrid / plant_primary_fuel_calc.
fuel_match = egrid_primary_fuel.merge(
    calc_primary_fuel,
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=("_egrid", "_calc"),
)

# Plants whose primary fuel code differs between eGRID and our results.
fuel_codes_differ = (
    fuel_match["plant_primary_fuel_egrid"] != fuel_match["plant_primary_fuel_calc"]
)
fuel_match[fuel_codes_differ]

# Evaluate Plant-level discrepancies

In [None]:
# Compare every plant in our results against eGRID. The helper returns a pair: the
# summary displayed below and the per-plant comparison table kept as `compared`.
all_plants_comparison = validation.compare_plant_level_results_to_egrid(
    annual_plant_results, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
comparison_count, compared = all_plants_comparison
comparison_count

In [None]:
# Plants in the SOCO balancing authority whose CO2 comparison status is anything
# other than the "!exact" label.
in_soco = compared["ba_code"] == "SOCO"
co2_status_not_exact = compared["co2_mass_lb_status"] != "!exact"
compared[in_soco & co2_status_not_exact]

# Split data into different groups based on known discrepancies

There are certain classes of plants where there are known issues that we know will prevent the totals from matching:
- There are certain power plants that are missing altogether from eGRID
- Geothermal power plants: eGRID sometimes has incorrect geotypes, which may lead to different emissions estimates
- Nuclear power plants: eGRID is generally missing fuel consumption data
- Fuel cells are assumed by eGRID to have zero emissions
- Plants that only report CEMS data for part of the year are sometimes missing data for the rest of the year in eGRID
- CHP plants use a slightly different methodology
- Plants that report data to the bf table or gen table in EIA-923 may be missing data from the generation and fuel table.
- "Proposed" plants sometimes report data before they are officially online, and eGRID doesn't always reflect that.


We could identify plants where the total fuel doesn't match EIA-923, but matches the total fuel for a subset of prime movers

In [None]:
# Inputs needed by the segmentation step: the PUDL output object for the year and
# our allocated EIA-923 table (report_date parsed as a datetime).
pudl_out = load_data.initialize_pudl_out(year)
eia923_allocated_path = outputs_folder(f"{year}/eia923_allocated_{year}.csv")
eia923_allocated = pd.read_csv(
    eia923_allocated_path, dtype=get_dtypes(), parse_dates=["report_date"]
)

# Segment the annual plant results by the known issues listed above; the result is
# filtered on its flag_* columns in the cells that follow.
segmentation_inputs = (
    annual_plant_results,
    egrid_plant,
    eia923_allocated,
    pudl_out,
    PLANTS_MISSING_FROM_EGRID,
)
annual_plant_results_segmented = validation.segment_plants_by_known_issues(
    *segmentation_inputs
)

## Plant Metric

In [None]:
# One indicator column per known issue in the segmented results.
flag_columns = [
    "flag_missing_egrid",
    "flag_geothermal",
    "flag_nuclear",
    "flag_fuel_cell",
    "flag_partial_year",
    "flag_chp",
    "flag_bf_gen_reporter",
    "flag_plant_w_proposed_gen",
    "flag_proposed_plant",
]

# Keep the plants for which the flags add up to zero, i.e. all data without known
# issues.
known_issue_total = annual_plant_results_segmented[flag_columns].sum(axis=1)
segment_to_compare = annual_plant_results_segmented[known_issue_total == 0]

# Compare just that subset against eGRID and show the summary.
clean_segment_comparison = validation.compare_plant_level_results_to_egrid(
    segment_to_compare, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
comparison_count, compared = clean_segment_comparison
comparison_count

In [None]:
# Fuel-total comparison for the currently selected segment, summed for display.
fuel_total_comparison = validation.compare_egrid_fuel_total(
    segment_to_compare, egrid_plant
)
fuel_total_comparison.sum()

In [None]:
# Pick one known-issue flag and examine the comparison for just those plants.
flag = "flag_proposed_plant"

# Only the plants for which the selected flag is set.
has_selected_flag = annual_plant_results_segmented[flag] == 1
segment_to_compare = annual_plant_results_segmented[has_selected_flag]

flagged_segment_comparison = validation.compare_plant_level_results_to_egrid(
    segment_to_compare, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
comparison_count, compared = flagged_segment_comparison
comparison_count

In [None]:
# Re-run the comparison on the full set of plants so that `comparison_count` and
# `compared` once again describe all plants rather than the last segment examined.
full_comparison = validation.compare_plant_level_results_to_egrid(
    annual_plant_results, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
comparison_count, compared = full_comparison
comparison_count

### Explore a specific set of plants

In [None]:
# Choose what to inspect: the comparison table, the metric, and the status bucket.
comparison_df = compared
metric = "so2_mass_lb"
status = "<50%"

# Attach the underlying values of the metric from each source next to the
# comparison result. Both lookups are keyed on the eGRID plant id index; the column
# coming from the right-hand table is renamed with the given suffix while the
# existing column keeps its name.
for value_source, value_suffix in (
    (egrid_plant, "_egrid"),
    (annual_plant_results, "_calc"),
):
    comparison_df = comparison_df.merge(
        value_source.set_index("plant_id_egrid")[[metric]],
        how="left",
        left_index=True,
        right_index=True,
        suffixes=(None, value_suffix),
    )

# show the data
columns_to_show = [
    "plant_name_eia",
    "ba_code",
    "state",
    metric,
    f"{metric}_status",
    f"{metric}_egrid",
    f"{metric}_calc",
]
has_selected_status = comparison_df[f"{metric}_status"] == status
comparison_df.loc[has_selected_status, columns_to_show]

## Compare Annual BA values to eGRID BA file

In [None]:
# Reuse the validation year chosen in the configuration cell near the top of the
# notebook instead of hard-coding it a second time here. Re-assigning `year` in this
# cell silently reset it, so changing the year at the top left the BA-level
# comparison (and every cell re-run afterwards) on a different data year than the
# plant-level comparison.
path_prefix = year

# Columns that are totalled and compared between our BA-level results and eGRID.
DATA_COLUMNS = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
]

In [None]:
# Load the eGRID plant table and the eGRID BA table for the validation year.
egrid_plant = validation.load_egrid_plant_file(year)

egrid_ba = validation.load_egrid_ba_file(year)

# Aggregate the plant data up to the BA level. The data columns are selected
# *before* summing so that only they are aggregated: summing the whole plant table
# first also tries to "sum" its text columns (names, fuel codes, ...), which is
# wasted work and fails or concatenates strings on pandas versions where
# numeric_only defaults to False.
egrid_plant_ba_agg = egrid_plant.groupby(["ba_code"])[DATA_COLUMNS].sum().reset_index()

# Emission rate implied by the BA totals (lb CO2 per MWh of net generation).
egrid_plant_ba_agg["generated_co2_rate_lb_per_mwh"] = (
    egrid_plant_ba_agg["co2_mass_lb"] / egrid_plant_ba_agg["net_generation_mwh"]
)

In [None]:
# Load our annual BA-level results: one file per BA in the annual power-sector
# folder, with the BA code taken from the part of the file name before the first dot.
ba_results_dir = f"{path_prefix}/power_sector_data/annual/us_units/"
ba_frames = []

for ba_file in os.listdir(results_folder(ba_results_dir)):
    ba_rows = pd.read_csv(
        results_folder(f"{ba_results_dir}{ba_file}"),
        usecols=["fuel_category"] + DATA_COLUMNS,
    )
    # Keep only the all-fuel "total" row(s), label them with the BA code, and put
    # the BA code first.
    is_total_row = ba_rows["fuel_category"] == "total"
    ba_frames.append(
        ba_rows.loc[is_total_row, DATA_COLUMNS].assign(ba_code=ba_file.split(".")[0])[
            ["ba_code"] + DATA_COLUMNS
        ]
    )

# Stack the per-BA rows into a single table.
calculated_ba = pd.concat(ba_frames, axis=0)

# Emission rate implied by our BA totals (lb CO2 per MWh of net generation).
calculated_ba["generated_co2_rate_lb_per_mwh"] = (
    calculated_ba["co2_mass_lb"] / calculated_ba["net_generation_mwh"]
)

In [None]:
# Index both BA tables by BA code and swap exact zeros for 0.1 so that a zero in the
# eGRID denominator does not produce a missing/infinite value.
calc_by_ba = calculated_ba.set_index("ba_code").replace(0, 0.1)
egrid_by_ba = egrid_plant_ba_agg.set_index("ba_code").replace(0, 0.1)

# Fractional difference of our values relative to eGRID, aligned on BA code.
percent_diff_from_egrid = ((calc_by_ba - egrid_by_ba) / egrid_by_ba).round(2)

In [None]:
rate_column = "generated_co2_rate_lb_per_mwh"

# Divide our calculation by the BA totals from eGRID and subtract 1, so that each
# cell is the fractional difference from eGRID (0 == exact match).
# If there are 0 values, replace with 0.1, so that div by zero doesn't return a
# missing value. (The rows are ordered further down, after the plant counts have
# been merged in, so no sort is needed at this point.)
ba_metric = (
    calculated_ba.replace(0, 0.1)
    .set_index("ba_code")
    .div(egrid_plant_ba_agg.set_index("ba_code").replace(0, 0.1))
    .round(3)
)
ba_metric = ba_metric - 1

# "Total" row: compare the sums over all BAs. The emission rate of the total has to
# be recomputed from the summed CO2 and summed generation on each side; adding up
# the per-BA rates (as was done before) does not give a meaningful national rate and
# turns into inf/NaN as soon as a single BA has zero net generation.
calc_totals = calculated_ba[DATA_COLUMNS].sum()
egrid_totals = egrid_plant_ba_agg[DATA_COLUMNS].sum()
for totals in (calc_totals, egrid_totals):
    totals[rate_column] = totals["co2_mass_lb"] / totals["net_generation_mwh"]

total = pd.DataFrame(calc_totals.div(egrid_totals).rename("Total")).T
total = total - 1


def _count_plants_with_data(plants):
    """Count, per BA code, the distinct EIA plant ids with any non-zero data.

    A plant is counted when the row-wise sum of DATA_COLUMNS is not zero. Plants
    without a BA code are kept as their own group (dropna=False).
    """
    has_data = ~(plants[DATA_COLUMNS].sum(axis=1) == 0)
    return plants[has_data].groupby("ba_code", dropna=False)["plant_id_eia"].nunique()


# calculate the difference in the number of plants in each region (ours - eGRID)
plant_count = (
    _count_plants_with_data(annual_plant_results) - _count_plants_with_data(egrid_plant)
).rename("num_plants")

ba_metric = ba_metric.merge(
    plant_count, how="left", left_index=True, right_index=True
).sort_index()

# Order the BAs from the largest under-estimate to the largest over-estimate of the
# emission rate, then append the Total row at the bottom.
ba_metric = ba_metric.sort_values(by=[rate_column], ascending=True)

ba_metric = pd.concat([ba_metric, total], axis=0).round(2)

ba_metric = ba_metric[DATA_COLUMNS + [rate_column, "num_plants"]]

# Show every row except those where all of the data columns match eGRID exactly
# (difference of 0 after rounding).
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(ba_metric[~(ba_metric[DATA_COLUMNS] == 0).all(axis=1)])

## Explore specific plants


In [None]:
# EIA plant id used to filter the tables in the cells that follow.
plant_to_explore = 58223

In [None]:
# eGRID's record(s) for the selected plant.
in_egrid_rows = egrid_plant["plant_id_eia"] == plant_to_explore
egrid_plant.loc[in_egrid_rows]

In [None]:
# Our annual result row(s) for the selected plant.
in_result_rows = annual_plant_results["plant_id_eia"] == plant_to_explore
annual_plant_results.loc[in_result_rows]

In [None]:
# Allocated EIA-923 rows for the selected plant.
in_allocated_rows = eia923_allocated["plant_id_eia"] == plant_to_explore
eia923_allocated.loc[in_allocated_rows]

In [None]:
# Distinct generator / subplant combinations reported for the selected plant.
is_selected_plant = eia923_allocated["plant_id_eia"] == plant_to_explore
generator_subplant_pairs = eia923_allocated.loc[
    is_selected_plant, ["generator_id", "subplant_id"]
]
generator_subplant_pairs.drop_duplicates()