In [None]:
# import packages
import numpy as np
import pandas as pd
import os
import plotly.express as px
from IPython.display import display

%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
import sys
sys.path.append('../../hourly-egrid/')

# import local modules
import src.load_data as load_data
import src.validation as validation

from src.column_checks import get_dtypes

# Specify the year for validation

In [None]:
# Data year to validate; it is interpolated into the input file paths and
# passed to the eGRID loader functions in the cells below.
year = 2020

## Compare Our Results to eGRID

In [None]:
# Read the static plant attributes, then read our annual plant-level results
# and left-join the attributes onto them by EIA plant id.
plant_attributes = pd.read_csv(
    f"../data/outputs/{year}/plant_static_attributes_{year}.csv",
    dtype=get_dtypes(),
)
annual_plant_results = pd.read_csv(
    f"../data/results/{year}/plant_data/annual/us_units/plant_data.csv",
    dtype=get_dtypes(),
).merge(plant_attributes, how="left", on="plant_id_eia")

# attach the eGRID plant id that corresponds to each EIA plant id
annual_plant_results = validation.add_egrid_plant_id(
    annual_plant_results, from_id="eia", to_id="egrid"
)



In [None]:
# Load the eGRID plant table for the selected year; the comparison cells below
# check our annual plant results against this table.
egrid_plant = validation.load_egrid_plant_file(year)


### Identify plants missing from our results that exist in eGRID


In [None]:
# Find plants that exist in eGRID but are missing from our annual results.
# The helper returns two objects: the first is displayed below, the second is
# kept under an upper-case name.
(
    missing_from_calc,
    PLANTS_MISSING_FROM_CALCULATION,
) = validation.identify_plants_missing_from_our_calculations(
    egrid_plant, annual_plant_results, year
)
missing_from_calc

### Identify plants missing from eGRID that are in our calculations

In [None]:
# Find plants that are in our calculations but missing from eGRID.
# PLANTS_MISSING_FROM_EGRID is passed to the segmentation and comparison
# helpers further down the notebook.
(
    missing_from_egrid,
    PLANTS_MISSING_FROM_EGRID,
) = validation.identify_plants_missing_from_egrid(egrid_plant, annual_plant_results)


In [None]:
# show the plants missing from eGRID whose fuel consumption is not exactly zero
missing_from_egrid.loc[missing_from_egrid["fuel_consumed_mmbtu"].ne(0)]

### Compare whether totals for plants with EPA IDs that differ from EIA IDs match

In [None]:
# identify where there is a single egrid plant id for multiple eia plant ids
double_ids = annual_plant_results[
    annual_plant_results["plant_id_egrid"].duplicated(keep=False)
]
# Total net generation per eGRID plant id (focus on net generation for now).
# The column is selected *before* aggregating so that no other column
# (including non-numeric ones) is ever summed.
double_ids = (
    double_ids.groupby("plant_id_egrid")["net_generation_mwh"].sum().reset_index()
)
# merge the egrid data; the suffixes distinguish our total from eGRID's value
double_ids = double_ids.merge(
    egrid_plant[["plant_id_egrid", "net_generation_mwh"]],
    how="left",
    on="plant_id_egrid",
    suffixes=("_calc", "_egrid"),
)
# difference relative to eGRID, expressed as a fraction (0.1 means 10%)
double_ids["percent_diff"] = (
    (double_ids["net_generation_mwh_calc"] - double_ids["net_generation_mwh_egrid"])
    / double_ids["net_generation_mwh_egrid"]
).round(3)
double_ids


### Identify plants where our BA assignment does not match eGRID

In [None]:
# Pair each plant's eGRID BA code with the BA code from our results. Both
# sides are indexed by EIA plant id and inner-joined, so only plants present
# in both tables are compared.
ba_code_match = pd.merge(
    egrid_plant.set_index("plant_id_eia")[["plant_name", "ba_code"]],
    annual_plant_results.set_index("plant_id_eia")[["ba_code"]],
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=("_egrid", "_calc"),
)

# alternative view: plants where our BA code is missing but eGRID has one
# ba_code_match[(ba_code_match['ba_code_calc'].isna()) & ~(ba_code_match['ba_code_egrid'].isna())]

# show every plant whose two BA codes differ
ba_code_match.loc[lambda df: df["ba_code_calc"] != df["ba_code_egrid"]]



In [None]:
# restrict the BA mismatches to plants for which eGRID reports a BA code
ba_code_match[
    ba_code_match["ba_code_egrid"].notna()
    & (ba_code_match["ba_code_calc"] != ba_code_match["ba_code_egrid"])
]

### Identify whether the fuel codes of each plant match

In [None]:
# Pair each plant's primary fuel in eGRID with the primary fuel in our
# results (inner join on EIA plant id, so only shared plants are compared).
fuel_match = pd.merge(
    egrid_plant.set_index("plant_id_eia")[["plant_name", "plant_primary_fuel"]],
    annual_plant_results.set_index("plant_id_eia")[["plant_primary_fuel"]],
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=("_egrid", "_calc"),
)

# show the plants whose primary fuel codes differ between the two sources
fuel_match.loc[
    lambda df: df["plant_primary_fuel_egrid"] != df["plant_primary_fuel_calc"]
]



# Split data into different groups based on known discrepancies

There are certain classes of plants where there are known issues that we know will prevent the totals from matching:
- Nuclear power plants: eGRID is generally missing fuel consumption data
- Geothermal power plants: eGRID sometimes has incorrect geotypes that may lead to different emissions estimates
- There are certain power plants for which eGRID is missing fuel data from an entire prime mover
- Plants that only report to CEMS during the ozone season
- There are certain power plants that are missing altogether from eGRID
- Fuel cells are assumed by eGRID to have zero emissions
- Plants where data is missing from the boiler fuel table in EIA-923


We could identify plants where the total fuel doesn't match EIA-923, but matches the total fuel for a subset of prime movers

In [None]:
# Inputs needed by the segmentation helper: the pudl_out object for this year
# and the allocated EIA-923 table (report_date parsed as a date).
pudl_out = load_data.initialize_pudl_out(year)
eia923_allocated = pd.read_csv(
    f"../data/outputs/{year}/eia923_allocated_{year}.csv",
    parse_dates=["report_date"],
    dtype=get_dtypes(),
)

# Split our annual plant results into one group per known issue, plus a
# catch-all group for everything else.
(
    missing_plants,
    geothermal_plants,
    nuclear_plants,
    fuel_cell_plants,
    ozone_season_plants,
    chp_plants,
    all_other_plants,
) = validation.segment_plants_by_known_issues(
    annual_plant_results,
    egrid_plant,
    eia923_allocated,
    pudl_out,
    PLANTS_MISSING_FROM_EGRID,
)

# unique eGRID plant ids that landed in the catch-all group
PLANTS_IN_ALL_OTHER_PLANTS = list(all_other_plants["plant_id_egrid"].unique())

## Identify where eGRID might be missing data
It seems that there are quite a few generators where fuel consumption data is missing

In [None]:
# Where is eGRID missing data?
egrid_eia_comparison = validation.identify_potential_missing_fuel_in_egrid(
    pudl_out, year, egrid_plant
)

# Keep rows whose percent_difference is below -0.01, whose primary fuel is
# not nuclear ("NUC"), whose source is "both", and whose plant belongs to the
# "all other plants" group; list the largest shortfalls first.
plants_missing_more_than_1_percent_fuel = egrid_eia_comparison.loc[
    lambda df: (df["percent_difference"] < -0.01)
    & (df["plant_primary_fuel"] != "NUC")
    & (df["source"] == "both")
    & (df["plant_id_egrid"].isin(PLANTS_IN_ALL_OTHER_PLANTS))
].sort_values(by="percent_difference")
plants_missing_more_than_1_percent_fuel


In [None]:
# how much emissions does this account for?
# Total the numeric columns by primary fuel code. numeric_only=True keeps any
# non-numeric columns out of the aggregation instead of relying on pandas to
# drop (older versions) or concatenate/reject (pandas >= 2) them.
missing_emissions = (
    plants_missing_more_than_1_percent_fuel.groupby("plant_primary_fuel")
    .sum(numeric_only=True)
    .reset_index()
)

# get the CO2 emission factor for each energy source code
emission_factors = load_data.load_ghg_emission_factors()[
    ["energy_source_code", "co2_lb_per_mmbtu"]
]
missing_emissions = missing_emissions.merge(
    emission_factors,
    how="left",
    left_on="plant_primary_fuel",
    right_on="energy_source_code",
)
# CO2 mass implied by the fuel difference (difference x emission factor);
# assumes "difference" is expressed in mmbtu -- TODO confirm
missing_emissions["co2_mass_lb"] = (
    missing_emissions["difference"] * missing_emissions["co2_lb_per_mmbtu"]
)
# column totals across all fuels; the fuel-code label columns are excluded
missing_emissions.sum(numeric_only=True)


## Plant Metric

In [None]:
# Compare the nuclear plant group against eGRID. The first returned object is
# displayed below; the second (compared_nuclear) is kept for inspection.
(
    nuclear_comparison_count,
    compared_nuclear,
) = validation.compare_plant_level_results_to_egrid(
    nuclear_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
nuclear_comparison_count


In [None]:
# Compare the geothermal plant group against eGRID. The first returned object
# is displayed below; the second (compared_geothermal) is kept for inspection.
(
    geothermal_comparison_count,
    compared_geothermal,
) = validation.compare_plant_level_results_to_egrid(
    geothermal_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
geothermal_comparison_count


In [None]:
# Compare the fuel cell plant group against eGRID. The first returned object
# is displayed below; the second (compared_fuel_cell) is kept for inspection.
(
    fuel_cell_comparison_count,
    compared_fuel_cell,
) = validation.compare_plant_level_results_to_egrid(
    fuel_cell_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
fuel_cell_comparison_count


In [None]:
# Compare the "missing plants" group against eGRID. The first returned object
# is displayed below; the second (compared_missing) is kept for inspection.
(
    missing_comparison_count,
    compared_missing,
) = validation.compare_plant_level_results_to_egrid(
    missing_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
missing_comparison_count


In [None]:
# Compare the ozone-season plant group against eGRID. The first returned
# object is displayed below; the second (compared_ozone) is kept for inspection.
(
    ozone_comparison_count,
    compared_ozone,
) = validation.compare_plant_level_results_to_egrid(
    ozone_season_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
ozone_comparison_count


In [None]:
# Compare the CHP plant group against eGRID. The first returned object is
# displayed below; the second (compared_chp) is explored in a later cell.
(
    chp_comparison_count,
    compared_chp,
) = validation.compare_plant_level_results_to_egrid(
    chp_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
chp_comparison_count

In [None]:
# Compare the catch-all "all other plants" group against eGRID. The first
# returned object is displayed below; the second (compared_all_other) is kept
# for inspection.
(
    all_other_comparison_count,
    compared_all_other,
) = validation.compare_plant_level_results_to_egrid(
    all_other_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
all_other_comparison_count


### Explore a specific set of plants

In [None]:
# Pick which comparison frame to inspect, which metric column to look at, and
# which status label to filter on.
comparison_df = compared_chp
metric = "fuel_consumed_for_electricity_mmbtu"
status = "-50% to -10%"

# Show identifying columns plus the metric and its status for every plant
# whose "<metric>_status" value equals the chosen status.
columns_to_show = [
    "plant_name",
    "ba_code",
    "state",
    metric,
    f"{metric}_status",
]
comparison_df.loc[comparison_df[f"{metric}_status"].eq(status), columns_to_show]

## Compare Annual BA values to eGRID BA file

In [None]:
# `year` is deliberately not reassigned here: this section uses the value set
# in the configuration cell near the top of the notebook, so changing the
# validation year in one place applies to every section.
path_prefix = year

# Load the eGRID plant table
egrid_plant = validation.load_egrid_plant_file(year)

# Load the eGRID BA table for the same year
egrid_ba = validation.load_egrid_ba_file(year)

# aggregate the plant data up to the BA level
data_columns = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
    "co2_mass_lb_adjusted",
]
# Select the data columns *before* summing so only those columns are
# aggregated (a blanket sum would also try to add up text columns).
egrid_plant_ba_agg = egrid_plant.groupby("ba_code")[data_columns].sum().reset_index()



In [None]:
# load our annual ba data
DATA_COLUMNS = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
    "co2_mass_lb_adjusted",
]

# directory holding one annual results file per BA
ba_results_dir = f"../data/results/{path_prefix}/power_sector_data/annual/us_units/"

# one single-BA frame per file; concatenated after the loop
ba_frames = []

# sorted() makes the row order reproducible (os.listdir order is arbitrary)
for filename in sorted(os.listdir(ba_results_dir)):
    # skip anything that is not a results CSV (e.g. hidden OS files), which
    # would otherwise make read_csv fail
    if not filename.endswith(".csv"):
        continue
    # the BA code is the part of the file name before the first "."
    ba = filename.split(".")[0]
    ba_data = pd.read_csv(
        os.path.join(ba_results_dir, filename),
        usecols=(["fuel_category"] + DATA_COLUMNS),
    )
    # keep only the all-fuel "total" row for this BA
    ba_data = ba_data[ba_data["fuel_category"] == "total"].drop(
        columns=["fuel_category"]
    )
    ba_data["ba_code"] = ba
    ba_data = ba_data[["ba_code"] + DATA_COLUMNS]
    ba_frames.append(ba_data)

calculated_ba = pd.concat(ba_frames, axis=0)


In [None]:
# Fractional difference between our BA totals and the eGRID plant data
# aggregated to BA: (ours - eGRID) / eGRID. Zeros on either side are replaced
# with 0.1 before the arithmetic.
percent_diff_from_egrid = (
    egrid_plant_ba_agg.set_index("ba_code")
    .replace(0, 0.1)
    .pipe(
        lambda egrid: (calculated_ba.set_index("ba_code").replace(0, 0.1) - egrid)
        / egrid
    )
    .round(2)
)

In [None]:
# list the BAs in ascending order of their net generation difference
percent_diff_from_egrid.sort_values("net_generation_mwh")

In [None]:
# divide our calculation by the BA totals from eGRID
# if there are 0 values, replace with 0.1, so that div by zero doesn't return missing value
# NOTE: no rounding happens here. The ratios are rounded exactly once (to two
# decimals) after the "Total" row is appended; rounding to three decimals
# first and then to two could push a value such as 0.9946 up to 1.00 and hide
# that BA from the mismatch display below.
ba_metric = (
    calculated_ba.replace(0, 0.1)
    .set_index("ba_code")
    .div(egrid_plant_ba_agg.set_index("ba_code").replace(0, 0.1))
    .sort_values(by="co2_mass_lb")
)

# ratio of the national totals, shown as an extra row labelled "Total"
total = pd.DataFrame(
    calculated_ba[data_columns]
    .sum()
    .div(egrid_plant_ba_agg[data_columns].sum())
    .rename("Total")
).T

# calculate the difference in the number of plants in each region
# (disabled: refers to `plant_annual_total`, which is not defined in the visible cells)
# plant_count = (plant_annual_total.groupby('ba_code', dropna=False).count()['plant_id_egrid'] - egrid_plant.groupby('ba_code', dropna=False).count()['plant_id_egrid']).rename('num_plants')
# ba_metric = ba_metric.merge(plant_count, how='left', left_index=True, right_index=True).drop(columns=['plant_id_egrid']).sort_index()

ba_metric = pd.concat([ba_metric, total], axis=0).round(2)

ba_metric = ba_metric[data_columns]

# columns that must all equal 1 (after rounding) for a BA to count as matching
columns_to_check = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
]

# display, without truncation, every row where at least one checked ratio is not 1
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(ba_metric[~(ba_metric[columns_to_check] == 1).all(axis=1)])


## Explore specific plants

### Notes

Net generation < 50%:
 - 2617 has negative net generation
 - 10444, 59395: some plants are hybrid fossil / solar plants, but the primary energy source code is getting listed as PV for certain generators, which is causing an issue

Net generation > 50%:
 - 335 has two extra units (CT1 and CT2) that report to CEMS but not EIA. Likely correct
 - 7288 reports DFO to CEMS, but NG to EIA - there's probably some generators not reporting
 - 60698 double counting generation from CEMS because of 90% heat threshold in a month - might be fixed once we tackle heat input
 - 58256 is a solar/hybrid and in generators_entity_eia the battery portion is associated with a separate generator id (should only be 1)


Other:
 - plant 3754 has heat input in cems and eia that don't match
 - 2401 has generation in both cems and eia
 - 50933 might have allocation issue (doesn't appear in CEMS)

To check
 - 1404 reports generation to CEMS in December, but not to EIA. This is likely correct
 - plant 2504 has three units (120, 121, 122) that don't appear in EIA, and in CEMS only report steam. 

Fuel > 50%
 - Plant 3116 reports much more heat input to CEMS during ozone months than to EIA

BA Totals
 - TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project belonging to TEPC


In [None]:
# NOTE(review): `cems` is not defined in any cell visible in this notebook, so
# this cell fails on a fresh kernel unless the CEMS data is loaded first.
subplant_crosswalk = pd.read_csv(f"../data/outputs/{year}/subplant_crosswalk.csv")

# CEMS totals per plant / unit / report date. numeric_only=True restricts the
# aggregation to numeric columns so that text or date columns are not summed.
cems_unit_monthly = (
    cems.groupby(["plant_id_eia", "unitid", "report_date"], dropna=False)
    .sum(numeric_only=True)
    .reset_index()
)
# CEMS totals per plant / unit over all report dates
cems_unit = (
    cems.groupby(["plant_id_eia", "unitid"], dropna=False)
    .sum(numeric_only=True)
    .reset_index()
)

In [None]:
# EIA plant id to inspect; the cells below filter each table on this value
plant_to_explore = 58223


In [None]:
# eGRID plant table rows for the selected plant
egrid_plant.loc[egrid_plant["plant_id_eia"].eq(plant_to_explore)]



In [None]:
# our annual results for the selected plant
annual_plant_results.loc[annual_plant_results["plant_id_eia"].eq(plant_to_explore)]

In [None]:
# allocated EIA-923 records for the selected plant
eia923_allocated.loc[eia923_allocated["plant_id_eia"].eq(plant_to_explore)]

In [None]:
# Column totals of the allocated EIA-923 records for the selected plant.
# numeric_only=True is required because report_date is parsed as a datetime
# when this table is read, and datetime columns cannot be summed.
eia923_allocated[eia923_allocated["plant_id_eia"] == plant_to_explore].sum(
    numeric_only=True
)

In [None]:
# unit-level CEMS totals for the selected plant
cems_unit.loc[cems_unit["plant_id_eia"].eq(plant_to_explore)]

In [None]:
# Column totals of the unit/report-date CEMS records for the selected plant.
# numeric_only=True keeps the unit id and report date columns out of the sum.
cems_unit_monthly[cems_unit_monthly["plant_id_eia"] == plant_to_explore].sum(
    numeric_only=True
)

In [None]:
# CEMS records for the selected plant
cems.loc[cems["plant_id_eia"].eq(plant_to_explore)]

In [None]:
# CEMS records that report both zero gross generation and zero fuel consumed
cems.loc[cems["gross_generation_mwh"].eq(0) & cems["fuel_consumed_mmbtu"].eq(0)]

In [None]:
# distinct unit / generator / subplant combinations in the crosswalk for the selected plant
subplant_crosswalk.loc[
    subplant_crosswalk["plant_id_eia"].eq(plant_to_explore),
    ["unitid", "generator_id", "subplant_id"],
].drop_duplicates()

In [None]:
# distinct generator / subplant combinations in the allocated EIA-923 data for the selected plant
eia923_allocated.loc[
    eia923_allocated["plant_id_eia"].eq(plant_to_explore),
    ["generator_id", "subplant_id"],
].drop_duplicates()

In [None]:
# distinct unit / subplant combinations in the aggregated CEMS data for the selected plant
cems_unit_monthly.loc[
    cems_unit_monthly["plant_id_eia"].eq(plant_to_explore),
    ["unitid", "subplant_id"],
].drop_duplicates()