In [None]:
# import packages
import numpy as np
import pandas as pd
import os
import plotly.express as px
from IPython.display import display

%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
import sys
sys.path.append('../../hourly-egrid/')

# import local modules
import src.load_data as load_data
from src.data_cleaning import assign_ba_code_to_plant
import src.validation as validation

from src.column_checks import get_dtypes, apply_dtypes

# Specify the year for validation

In [None]:
year = 2020

## Compare Our Results to eGRID

In [None]:
# read the calculated annual plant-level results and the static plant attributes
plant_results_path = f"../data/results/{year}/plant_data/annual/us_units/plant_data.csv"
plant_attributes_path = f"../data/outputs/{year}/plant_static_attributes_{year}.csv"

annual_plant_results = pd.read_csv(plant_results_path, dtype=get_dtypes())
plant_attributes = pd.read_csv(plant_attributes_path, dtype=get_dtypes())

# attach the static attributes to each plant, then add the eGRID plant id
annual_plant_results = validation.add_egrid_plant_id(
    annual_plant_results.merge(plant_attributes, how="left", on="plant_id_eia"),
    from_id='eia',
    to_id='egrid',
)

In [None]:
# Load the eGRID plant table
egrid_plant = validation.load_egrid_plant_file(year)

### Identify plants missing from our results that exist in eGRID


In [None]:
# identify the plants that are in eGRID but are missing from our calculated totals
# NOTE: this list holds EIA plant ids, so it must be matched against `plant_id_eia` below
PLANTS_MISSING_FROM_CALCULATION = list(
    set(egrid_plant['plant_id_eia'].unique())
    - set(annual_plant_results['plant_id_eia'].unique())
)

# Which plants are included in eGRID but are missing from our calculations?
# (previously the EIA ids were compared against the `plant_id_egrid` column, which
# mis-identifies every plant whose eGRID id differs from its EIA id)
missing_from_calc = egrid_plant[egrid_plant['plant_id_eia'].isin(PLANTS_MISSING_FROM_CALCULATION)]

# see if any of these plants are retired
generators_eia860 = load_data.load_pudl_table('generators_eia860', year=year)
missing_from_calc.merge(
    generators_eia860[
        ["plant_id_eia", "operational_status", "current_planned_operating_date", 'retirement_date']
    ].drop_duplicates(),
    how='left',
    on='plant_id_eia',
)

### Identify plants missing from eGRID that are in our calculations

In [None]:
# eGRID plant ids that appear in our calculated results but not in the eGRID plant table
PLANTS_MISSING_FROM_EGRID = list(
    set(annual_plant_results['plant_id_egrid'].unique()).difference(
        set(egrid_plant['plant_id_egrid'].unique())
    )
)

# look up the EIA name and sector of each of those plants
plant_names = load_data.load_pudl_table('plants_entity_eia')[
    ['plant_id_eia', 'plant_name_eia', 'sector_name_eia']
]
is_missing_from_egrid = annual_plant_results['plant_id_egrid'].isin(PLANTS_MISSING_FROM_EGRID)
missing_from_egrid = annual_plant_results[is_missing_from_egrid].merge(
    plant_names, how='left', on='plant_id_eia'
)

missing_from_egrid

In [None]:
# how many of the plants missing from egrid have non-zero data
missing_from_egrid[missing_from_egrid['fuel_consumed_mmbtu'] != 0]

### Compare whether totals for plants with EPA IDs that differ from EIA IDs match

In [None]:
# identify where there is a single egrid plant id for multiple eia plant ids
double_ids = annual_plant_results[annual_plant_results['plant_id_egrid'].duplicated(keep=False)]
# focus on net generation for now
# select the metric BEFORE summing so that non-numeric plant attribute columns are never aggregated
double_ids = double_ids.groupby('plant_id_egrid')['net_generation_mwh'].sum().reset_index()
# merge the egrid data
double_ids = double_ids.merge(
    egrid_plant[['plant_id_egrid', 'net_generation_mwh']],
    how='left',
    on='plant_id_egrid',
    suffixes=('_calc', '_egrid'),
)
# relative difference between our total and the eGRID total
double_ids['percent_diff'] = (
    (double_ids['net_generation_mwh_calc'] - double_ids['net_generation_mwh_egrid'])
    / double_ids['net_generation_mwh_egrid']
).round(3)
double_ids

### Identify plants where our BA assignment does not match eGRID

In [None]:
# line up the eGRID BA code and our BA code for every plant that appears in both tables
ba_code_match = egrid_plant.set_index('plant_id_eia')[['plant_name', 'ba_code']].merge(
    annual_plant_results.set_index('plant_id_eia')[['ba_code']],
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=("_egrid", '_calc'),
)

# plants where the two BA codes differ
# (alternative view, plants with a missing calculated ba code only:
#  ba_code_match[(ba_code_match['ba_code_calc'].isna()) & ~(ba_code_match['ba_code_egrid'].isna())])
ba_code_match.loc[ba_code_match['ba_code_calc'].ne(ba_code_match['ba_code_egrid'])]

In [None]:
# how many of these mismatches are for non-missing bas
ba_code_match[(ba_code_match['ba_code_calc'] != ba_code_match['ba_code_egrid']) & ~(ba_code_match['ba_code_egrid'].isna())]

### Identify whether the fuel codes of each plant match

In [None]:
# line up the eGRID primary fuel and our primary fuel for every plant that appears in both tables
fuel_match = egrid_plant.set_index('plant_id_eia')[['plant_name', 'plant_primary_fuel']].merge(
    annual_plant_results.set_index('plant_id_eia')[['plant_primary_fuel']],
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=("_egrid", '_calc'),
)

# plants where the two primary fuel codes differ
fuel_match.loc[fuel_match['plant_primary_fuel_egrid'].ne(fuel_match['plant_primary_fuel_calc'])]

# Split data into different groups based on known discrepancies

There are certain classes of plants where there are known issues that we know will prevent the totals from matching:
- Nuclear power plants: eGRID is generally missing fuel consumption data
- Geothermal power plants: eGRID sometimes has incorrect geotypes that may lead to different emissions estimates
- There are certain power plants for which eGRID is missing fuel data from an entire prime mover
- Plants that only report to CEMS during the ozone season
- There are certain power plants that are missing altogether from eGRID
- Fuel cells are assumed by eGRID to have zero emissions
- Plants where data is missing from the boiler fuel table in EIA-923


We could identify plants where the total fuel doesn't match EIA-923, but matches the total fuel for a subset of prime movers

In [None]:
pudl_out = load_data.initialize_pudl_out(year)
eia923_allocated = pd.read_csv(f'../data/outputs/{year}/eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])


In [None]:
# Split the annual plant results into groups with known eGRID discrepancies.
# Each group is selected from the full `annual_plant_results` table, and its plants are
# then removed from `all_other_plants`, which ends up holding everything left over.
all_other_plants = annual_plant_results.copy()

# missing plants
# PLANTS_MISSING_FROM_EGRID holds eGRID plant ids, so it must be matched against
# `plant_id_egrid` (matching it against `plant_id_eia` mis-assigns every plant whose two ids differ)
missing_plants = annual_plant_results[annual_plant_results["plant_id_egrid"].isin(PLANTS_MISSING_FROM_EGRID)]
all_other_plants = all_other_plants[~all_other_plants["plant_id_eia"].isin(list(missing_plants.plant_id_eia.unique()))]

# geothermal
geothermal_plants = annual_plant_results[annual_plant_results["plant_primary_fuel"] == "GEO"]
all_other_plants = all_other_plants[~all_other_plants["plant_id_eia"].isin(list(geothermal_plants.plant_id_eia.unique()))]

# nuclear
nuclear_plants = annual_plant_results[annual_plant_results["plant_primary_fuel"] == "NUC"]
all_other_plants = all_other_plants[~all_other_plants["plant_id_eia"].isin(list(nuclear_plants.plant_id_eia.unique()))]

# fuel cells: any plant with at least one generator whose prime mover code is "FC"
gens_eia860 = pudl_out.gens_eia860()
PLANTS_WITH_FUEL_CELLS = list(gens_eia860.loc[gens_eia860["prime_mover_code"] == "FC", "plant_id_eia"].unique())
fuel_cell_plants = annual_plant_results[annual_plant_results["plant_id_eia"].isin(PLANTS_WITH_FUEL_CELLS)]
all_other_plants = all_other_plants[~all_other_plants["plant_id_eia"].isin(list(fuel_cell_plants.plant_id_eia.unique()))]

# ozone season reporters
# identify all of the plants with generators that report data from both EIA and CEMS:
# a generator that still has more than one row after de-duplicating on
# (plant, generator, hourly_data_source) has more than one hourly data source
multi_source_reporters = eia923_allocated[["plant_id_eia", "generator_id", "hourly_data_source"]].drop_duplicates()
MULTI_SOURCE_PLANTS = list(
    multi_source_reporters.loc[
        multi_source_reporters.duplicated(subset=["plant_id_eia", "generator_id"], keep=False),
        "plant_id_eia",
    ].unique()
)
ozone_season_plants = annual_plant_results[annual_plant_results["plant_id_eia"].isin(MULTI_SOURCE_PLANTS)]
all_other_plants = all_other_plants[~all_other_plants["plant_id_eia"].isin(list(ozone_season_plants.plant_id_eia.unique()))]

# missing prime movers



## Identify where eGRID might be missing data
It seems that there are quite a few generators where fuel consumption data is missing

In [None]:
egrid_plant[egrid_plant['energy_source_code'] == 'NUC']

In [None]:
pudl_out = load_data.initialize_pudl_out(year)

# load the EIA generator fuel data
IDX_PM_ESC = ["report_date", "plant_id_eia", "energy_source_code", "prime_mover_code"]
GF_DATA_COLUMNS = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
]
# take an explicit copy so that adding a column below never writes into a slice of the PUDL table
gf = pudl_out.gf_eia923().loc[:, IDX_PM_ESC + GF_DATA_COLUMNS].copy()

# add egrid plant ids
egrid_crosswalk = pd.read_csv(
    "../data/manual/egrid_static_tables/table_C5_crosswalk_of_EIA_ID_to_EPA_ID.csv"
)
eia_to_egrid_id = dict(
    zip(egrid_crosswalk["plant_id_eia"], egrid_crosswalk["plant_id_egrid"])
)
# plants that are not in the crosswalk keep their EIA id as their eGRID id.
# Assign the whole column in one statement: the previous `gf[col].update(...)` was a
# chained assignment, which does not modify `gf` when pandas Copy-on-Write is enabled.
gf["plant_id_egrid"] = (
    gf["plant_id_eia"]
    .map(eia_to_egrid_id)
    .fillna(gf["plant_id_eia"])
    .astype(gf["plant_id_eia"].dtype)
)

# calculate an annual total for each plant
# numeric_only=True keeps the date and string key columns out of the sum
gf_total = gf.groupby(["plant_id_egrid"]).sum(numeric_only=True).reset_index()

# choose a metric to compare
metric = "fuel_consumed_mmbtu"

# merge the annual EIA-923 data into the egrid data
egrid_eia_comparison = (
    egrid_plant[
        ["plant_id_egrid", "plant_name", "ba_code", "energy_source_code", metric]
    ]
    .merge(
        gf_total[["plant_id_egrid", metric]],
        how="outer",
        on="plant_id_egrid",
        suffixes=("_egrid", "_eia923"),
        indicator="source",
    )
    .round(0)
)
# treat a missing eGRID value as zero so that it shows up as a difference
egrid_eia_comparison[f"{metric}_egrid"] = egrid_eia_comparison[
    f"{metric}_egrid"
].fillna(0)
# calculate an absolute difference and percent difference between the two values
egrid_eia_comparison["difference"] = (
    egrid_eia_comparison[f"{metric}_egrid"] - egrid_eia_comparison[f"{metric}_eia923"]
)
egrid_eia_comparison["percent_difference"] = (
    egrid_eia_comparison["difference"] / egrid_eia_comparison[f"{metric}_eia923"]
)
# when both values are equal (including 0 / 0) the percent difference is zero by definition
egrid_eia_comparison.loc[
    egrid_eia_comparison["difference"] == 0, "percent_difference"
] = 0




In [None]:
# add cems data
# NOTE(review): `cems` is not created in any cell shown above in this notebook; it has to be
# loaded before this cell can run on a fresh kernel
cems_with_egrid_id = cems[['plant_id_eia', metric]].copy()
# plants that are not in the crosswalk keep their EIA id as their eGRID id.
# Assign the whole column in one statement: `df[col].update(...)` is a chained assignment,
# which does not modify the frame when pandas Copy-on-Write is enabled.
cems_with_egrid_id["plant_id_egrid"] = (
    cems_with_egrid_id["plant_id_eia"]
    .map(eia_to_egrid_id)
    .fillna(cems_with_egrid_id["plant_id_eia"])
    .astype(cems_with_egrid_id["plant_id_eia"].dtype)
)
cems_total = (
    cems_with_egrid_id.groupby('plant_id_egrid')[metric]
    .sum()
    .reset_index()
    .rename(columns={metric: f"{metric}_cems"})
)

# merge cems data into egrid
# drop any column left over from an earlier run so that re-running this cell
# does not create duplicated `_x` / `_y` columns
egrid_eia_comparison = egrid_eia_comparison.drop(
    columns=[f"{metric}_cems"], errors="ignore"
).merge(cems_total, how='outer', on='plant_id_egrid')

In [None]:
egrid_eia_comparison[egrid_eia_comparison['source'] == 'left_only']

In [None]:
# egrid seems to be missing fuel consumption data for most nuclear power plants
missing_nuclear = egrid_eia_comparison[egrid_eia_comparison['energy_source_code'] == 'NUC']
missing_nuclear.sum()

In [None]:
egrid_eia_comparison[(egrid_eia_comparison['percent_difference']  < - 0.01)]

In [None]:
# where is egrid missing data?
egrid_eia_comparison[(egrid_eia_comparison['percent_difference'] < -0.01) & (egrid_eia_comparison['energy_source_code'] != 'NUC')]#.sort_values(by='percent_difference').head(20)

In [None]:
# how much emissions does this account for?
# non-nuclear plants where the eGRID value is more than 1% below the EIA-923 value
egrid_missing_fuel = egrid_eia_comparison[
    (egrid_eia_comparison['percent_difference'] < -0.01)
    & (egrid_eia_comparison['energy_source_code'] != 'NUC')
]
# group by fuel code
# numeric_only=True keeps the name / BA / merge-indicator columns out of the sum
missing_emissions = (
    egrid_missing_fuel.groupby('energy_source_code').sum(numeric_only=True).reset_index()
)

# get emission factors
emission_factors = load_data.load_ghg_emission_factors()[['energy_source_code', 'co2_lb_per_mmbtu']]
missing_emissions = missing_emissions.merge(emission_factors, how='left', on='energy_source_code')
# convert the fuel difference for each fuel into a CO2 mass using that fuel's emission factor
missing_emissions['co2_mass_lb'] = missing_emissions['difference'] * missing_emissions['co2_lb_per_mmbtu']
missing_emissions.sum()

## Plant Metric

In [None]:
def compare_plant_level_results(plant_data, egrid_plant, PLANTS_MISSING_FROM_EGRID):
    """Count how closely calculated plant totals match eGRID, by metric and by status.

    The calculated data is summed to one row per `plant_id_egrid` and divided by the
    matching eGRID row. For each metric, every plant is then assigned a status label
    that describes the ratio (e.g. '!exact', '+/-1%', '>50%') or explains why no
    ratio exists (e.g. 'missing_in_egrid', 'calc_positive_but_egrid_zero').

    Args:
        plant_data: calculated plant-level data. Must contain `plant_id_egrid`,
            `plant_id_eia` and the five metric columns listed in `metric_columns`.
        egrid_plant: eGRID plant table. Must contain `plant_id_egrid`, `plant_id_eia`,
            `plant_name`, `ba_code`, `state` and the same five metric columns.
        PLANTS_MISSING_FROM_EGRID: iterable of eGRID plant ids to label 'not_in_egrid'.

    Returns:
        DataFrame indexed by status label with one column per metric holding the
        number of plants with that status, plus a final 'Total' row.
    """
    metric_columns = [
        'net_generation_mwh',
        'fuel_consumed_mmbtu',
        'fuel_consumed_for_electricity_mmbtu',
        'co2_mass_lb_adjusted',
        'co2_mass_lb',
    ]

    # standardize column names and index so that the two dfs can be divided
    # numeric_only=True keeps string attribute columns out of the sum (and out of the division)
    calculated_to_compare = (
        plant_data.groupby('plant_id_egrid', dropna=False)
        .sum(numeric_only=True)
        .drop(columns=['plant_id_eia'], errors='ignore')
    )

    # drop the plants that have no data in eGRID
    # the ids collected here are eGRID ids, so they must be matched against `plant_id_egrid`
    has_no_egrid_data = egrid_plant[metric_columns].sum(axis=1) == 0
    plants_with_no_data_in_egrid = list(egrid_plant.loc[has_no_egrid_data, 'plant_id_egrid'])
    egrid_plant = egrid_plant[~egrid_plant['plant_id_egrid'].isin(plants_with_no_data_in_egrid)]

    egrid_to_compare = egrid_plant.set_index(['plant_id_egrid']).drop(
        columns=['ba_code', 'state', 'plant_name', 'plant_id_eia']
    )
    # only keep plants that are in the comparison data
    egrid_to_compare = egrid_to_compare[
        egrid_to_compare.index.isin(list(calculated_to_compare.index.unique()))
    ]

    # divide calculated value by egrid value
    compared = (
        calculated_to_compare.div(egrid_to_compare)
        .merge(
            egrid_plant[['plant_id_egrid', 'plant_name', 'ba_code', 'state']],
            how='left',
            left_index=True,
            right_on='plant_id_egrid',
        )
        .set_index('plant_id_egrid')
    )
    compared['plant_name'] = compared['plant_name'].fillna('unknown')

    # create a dataframe that merges the two sources of data together
    compared_merged = calculated_to_compare.merge(
        egrid_to_compare, how='left', on='plant_id_egrid', suffixes=('_calc', '_egrid')
    )

    def _plants_where(mask):
        """Boolean selector over `compared` for the plants flagged by `mask` in `compared_merged`."""
        return compared.index.isin(list(compared_merged[mask].index))

    for col in metric_columns:
        calc = compared_merged[f'{col}_calc']
        egrid = compared_merged[f'{col}_egrid']
        status_col = f'{col}_status'

        # 0 / 0 is NaN, but two zero values are an exact match, so set the ratio to 1
        compared.loc[_plants_where((calc == 0) & (egrid == 0)), col] = 1

        # categorize the ratio based on how far it is off from egrid
        # (the labels repeat on both sides of 1, which requires ordered=False)
        compared[status_col] = pd.cut(
            x=compared[col],
            bins=[-999999999, 0, 0.5, 0.9, 0.99, 0.9999, 1, 1.0001, 1.01, 1.1, 1.5, 999999999],
            labels=['negative', '<50%', '+/-50%', '+/-10%', '+/-1%', '!exact', '!exact', '+/-1%', '+/-10%', '+/-50%', '>50%'],
            ordered=False,
        )
        # a ratio outside every bin (i.e. NaN) becomes the string 'nan'; relabel it as missing
        compared[status_col] = compared[status_col].astype(str).replace('nan', 'missing')
        compared.loc[compared.index.isin(PLANTS_MISSING_FROM_EGRID), status_col] = 'not_in_egrid'

        # explain why the ratio is missing / invalid; later labels override earlier ones
        # plants that have a positive calculated value but no egrid value
        compared.loc[_plants_where((calc > 0) & (egrid.isna())), status_col] = 'missing_in_egrid'
        # plants that are missing from our calculations
        compared.loc[_plants_where((calc.isna()) & (egrid > 0)), status_col] = 'missing_in_calc'
        # our calculations are missing a zero value
        compared.loc[_plants_where((calc.isna()) & (egrid == 0)), status_col] = 'calc_missing_zero_value_from_egrid'
        # egrid has a missing value instead of a zero
        compared.loc[_plants_where((calc == 0) & (egrid.isna())), status_col] = 'egrid_missing_zero_value_from_calc'
        # egrid has a zero value where we have a positive value
        compared.loc[_plants_where((calc > 0) & (egrid == 0)), status_col] = 'calc_positive_but_egrid_zero'

    # create a dataframe that counts how many plants are in each category
    status_counts = []
    for col in metric_columns:
        count = compared.groupby(f'{col}_status', dropna=False).count()['plant_name'].rename(col)
        count.index = count.index.rename('status')
        status_counts.append(count)

    comparison_count = pd.concat(status_counts, axis=1).fillna(0).astype(int)
    comparison_count = pd.concat(
        [comparison_count, pd.DataFrame(comparison_count.sum().rename('Total')).T], axis=0
    )
    return comparison_count


In [None]:
compare_plant_level_results(nuclear_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID)

In [None]:
compare_plant_level_results(geothermal_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID)

In [None]:
compare_plant_level_results(fuel_cell_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID)

In [None]:
compare_plant_level_results(missing_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID)

In [None]:
compare_plant_level_results(ozone_season_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID)

In [None]:
compare_plant_level_results(all_other_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID)

In [None]:
# `comparison_count` only exists inside compare_plant_level_results(), so capture the
# returned table before printing it
# NOTE(review): assumes the markdown export is meant for the `all_other_plants` group — confirm
comparison_count = compare_plant_level_results(all_other_plants, egrid_plant, PLANTS_MISSING_FROM_EGRID)
print(comparison_count.to_markdown())

## Compare Annual BA values to eGRID BA file

In [None]:
# reuse the validation `year` set near the top of the notebook (instead of hard-coding it
# again here) so that every section always compares the same year
path_prefix = year

# Load the eGRID plant table
egrid_plant = validation.load_egrid_plant_file(year)

# Load the eGRID BA table
egrid_ba = validation.load_egrid_ba_file(year)

# aggregate the plant data up to the BA level
data_columns = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_adjusted']
# select the data columns BEFORE summing so that string columns are never aggregated
egrid_plant_ba_agg = egrid_plant.groupby(['ba_code'])[data_columns].sum().reset_index()

In [None]:
egrid_plant_ba_agg

In [None]:
# load our annual ba data
DATA_COLUMNS = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_adjusted']

ba_results_dir = f"../data/results/{path_prefix}/power_sector_data/annual/us_units/"

# collect one single-row frame per BA, then concatenate once at the end
ba_frames = []

# sorted() makes the row order reproducible; os.listdir() returns entries in arbitrary order
for filename in sorted(os.listdir(ba_results_dir)):
    # skip anything that is not a results file (e.g. hidden OS files)
    if not filename.endswith(".csv"):
        continue
    # the file name (without extension) is the BA code
    ba = filename.split(".")[0]
    ba_data = pd.read_csv(f"{ba_results_dir}{filename}", usecols=(["fuel_category"] + DATA_COLUMNS))
    # keep only the all-fuel total row for this BA
    ba_data = ba_data[ba_data["fuel_category"] == "total"].drop(columns=["fuel_category"])
    ba_data["ba_code"] = ba
    ba_data = ba_data[["ba_code"] + DATA_COLUMNS]
    ba_frames.append(ba_data)

calculated_ba = pd.concat(ba_frames, axis=0)

In [None]:
# index both tables by BA code and swap zeros for 0.1 before computing the relative difference
calculated_by_ba = calculated_ba.set_index("ba_code").replace(0,0.1)
egrid_by_ba = egrid_plant_ba_agg.set_index("ba_code").replace(0,0.1)

# (calculated - egrid) / egrid, rounded to two decimals
percent_diff_from_egrid = calculated_by_ba.sub(egrid_by_ba).div(egrid_by_ba).round(2)

In [None]:
percent_diff_from_egrid.sort_values(by="net_generation_mwh")

In [None]:


# divide our calculation by the BA totals from eGRID
# if there are 0 values, replace with 0.1, so that div by zero doesn't return missing value
# NOTE(review): `plant_annual_total` is not assigned in any cell shown above in this notebook;
# presumably it is the plant-level annual results table with `ba_code` and `plant_id_egrid`
# columns (cf. `annual_plant_results`) — confirm and load it before running this cell
ba_metric = plant_annual_total.groupby('ba_code', dropna=False).sum().drop(columns='plant_id_eia').replace(0,0.1).div(egrid_ba.set_index('ba_code').replace(0,0.1)).sort_values(by='co2_mass_lb').round(3)

# ratio of the national totals (all plants summed) for each data column
total = pd.DataFrame(plant_annual_total[data_columns].sum().div(egrid_ba[data_columns].sum()).rename('Total')).T

# calculate the difference in the number of plants in each region
plant_count = (plant_annual_total.groupby('ba_code', dropna=False).count()['plant_id_egrid'] - egrid_plant.groupby('ba_code', dropna=False).count()['plant_id_egrid']).rename('num_plants')
ba_metric = ba_metric.merge(plant_count, how='left', left_index=True, right_index=True).drop(columns=['plant_id_egrid']).sort_index()

# append the national 'Total' row below the per-BA rows
ba_metric = pd.concat([ba_metric, total], axis=0).round(2) 

ba_metric = ba_metric[data_columns + ['num_plants']]

columns_to_check = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb']

# only display the rows where at least one of the checked ratios is not exactly 1
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ba_metric[~(ba_metric[columns_to_check] == 1).all(axis=1)])

In [None]:
# how much co2 comes from CEMS vs EIA
# NOTE(review): an earlier cell reads the `hourly_data_source` column of this table;
# confirm that a `data_source` column is also present
# aggfunc="sum" replaces np.sum: passing the numpy callable is deprecated in pandas >= 2.1
# zeros are replaced with a tiny value before computing each source's share of the BA total
data_source_by_ba = pd.pivot_table(
    eia923_allocated,
    values='co2_mass_lb',
    index='ba_code',
    columns='data_source',
    dropna=False,
    aggfunc="sum",
).replace(0,0.001)
# convert the totals into the fraction of each BA's co2 that comes from each data source
data_source_by_ba = data_source_by_ba.div(data_source_by_ba.sum(axis=1), axis=0)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(data_source_by_ba.round(3).fillna(0).sort_values(by='cems'))

In [None]:
print(ba_metric.to_markdown())

## Explore specific plants

### Notes

Net generation < 50%:
 - 2617 has negative net generation
 - 10444, 59395: some plants are hybrid fossil / solar plants, but the primary energy source code is getting listed as PV for certain generators, which is causing an issue

Net generation > 50%:
 - 335 has two extra units (CT1 and CT2) that report to CEMS but not EIA. Likely correct
 - 7288 reports DFO to CEMS, but NG to EIA - there's probably some generators not reporting
 - 60698 double counting generation from CEMS because of 90% heat threshold in a month - might be fixed once we tackle heat input
 - 58256 is a solar/hybrid and in generators_entity_eia the battery portion is associated with a separate generator id (should only be 1)


Other:
 - plant 3754 has heat input in cems and eia that don't match
 - 2401 has generation in both cems and eia
 - 50933 might have allocation issue (doesn't appear in CEMS)

To check
 - 1404 reports generation to CEMS in December, but not to EIA. This is likely correct
 - plant 2504 has three units (120, 121, 122) that don't appear in EIA, and in CEMS only report steam. 

Fuel > 50%
 - Plant 3116 reports much more heat input to CEMS during ozone months than to EIA

BA Totals
 - TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project belonging to TEPC


In [None]:
# examine specific plants in a category
# `value` is the metric to inspect and `status` is the status label to filter on
value = 'net_generation_mwh'
status = '>50%'

#compared_merged.loc[64877,:]

# NOTE(review): `compared` (and `compared_merged`) are only created inside
# compare_plant_level_results() and are not returned by it, so they are not available
# at notebook scope in the cells shown above — confirm how they are meant to be exposed
compared[compared[f'{value}_status'] == status].sort_values(by=value)

In [None]:
compared[compared[f'{value}_status'] == status].sort_values(by=value).sample(10)

In [None]:
plant_to_explore = 58380

In [None]:
egrid_plant[egrid_plant['plant_id_eia'] == plant_to_explore]

In [None]:
plant_annual_total[plant_annual_total['plant_id_eia'] == plant_to_explore]

In [None]:
eia923_allocated[eia923_allocated['plant_id_eia'] == plant_to_explore].sum()

In [None]:
cems_unit_monthly = cems.groupby(['plant_id_eia','unitid','report_date']).sum().reset_index()


In [None]:
cems_unit_monthly[cems_unit_monthly['plant_id_eia'] == plant_to_explore].sum()

In [None]:
cems[(cems['plant_id_eia'] == plant_to_explore) & (cems['operating_time_hours'] > 0)]

In [None]:
cems_plant_annual[cems_plant_annual['plant_id_eia'] == plant_to_explore]

In [None]:
# there are some plants that report heat input and co2 in CEMS, but are missing net generation data
# TODO: we should maybe try and fill net generation data using EIA-923?
cems_plant_annual[cems_plant_annual['net_generation_mwh'] == 0]

## Explore BA Matching
### Notes
TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project belonging to TEPC

In [None]:
ba_plant_match = plant_annual_total[plant_annual_total['ba_code'].isna()].merge(egrid_plant[egrid_plant['ba_code'].isna()], how='left', on='plant_id_egrid', suffixes=("_calc",'_egrid'))
ba_plant_match[ba_plant_match['net_generation_mwh_calc'].round(0) != ba_plant_match['net_generation_mwh_egrid'].round(0)]

In [None]:
egrid_plant[egrid_plant['ba_code'] == 'CPLE']

In [None]:
compare_plants_in_ba = egrid_plant.merge(plant_annual_total, how='outer', on='plant_id_egrid', suffixes=('_egrid','_calc'))

In [None]:
ba = 'CPLE'
metric = 'fuel_consumed_mmbtu'

compare_plants_in_ba[((compare_plants_in_ba['ba_code_egrid'] == ba) | (compare_plants_in_ba['ba_code_calc'] == ba)) & (compare_plants_in_ba[f'{metric}_egrid'].round(0) != compare_plants_in_ba[f'{metric}_calc'].round(0))]

In [None]:
plant_annual_total[plant_annual_total['ba_code'] == 'AMPL']