In [1]:
# import packages
import numpy as np
import pandas as pd
import os
import plotly.express as px
from IPython.display import display

%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
import sys
sys.path.append('../../hourly-egrid/')

# import local modules
import src.load_data as load_data
from src.data_cleaning import assign_ba_code_to_plant
import src.validation as validation

from src.column_checks import get_dtypes, apply_dtypes

# Specify the year for validation

In [2]:
# Data year to validate; it is interpolated into the results/outputs file paths
# and passed to the eGRID loader in the cells below.
year = 2020

## Compare Our Results to eGRID

In [6]:
# read the annual plant-level results and the static plant attributes for the chosen year
results_path = f"../data/results/{year}/plant_data/annual/us_units/plant_data.csv"
attributes_path = f"../data/outputs/{year}/plant_static_attributes_{year}.csv"
annual_plant_results = pd.read_csv(results_path, dtype=get_dtypes())
plant_attributes = pd.read_csv(attributes_path, dtype=get_dtypes())

# attach the static attributes to every plant; the left join keeps all rows of the results
annual_plant_results = pd.merge(
    annual_plant_results, plant_attributes, how="left", on="plant_id_eia"
)

# add the eGRID plant id that corresponds to each EIA plant id
annual_plant_results = validation.add_egrid_plant_id(
    annual_plant_results, from_id="eia", to_id="egrid"
)

In [36]:
# Load the eGRID plant table for the selected year via the project's validation helper.
# NOTE(review): later cells read `plant_id_egrid` and `plant_id_eia` from this frame.
egrid_plant = validation.load_egrid_plant_file(year)

### Identify plants missing from our results that exist in eGRID


In [10]:
# Identify plants that eGRID reports but that are absent from our annual results.
# The set difference is taken on `plant_id_eia`, so the filter below has to use
# that same column (it previously filtered on `plant_id_egrid`, which only gives
# the right rows when the two ids happen to be equal).
egrid_eia_ids = set(egrid_plant['plant_id_eia'].unique())
calculated_eia_ids = set(annual_plant_results['plant_id_eia'].unique())
PLANTS_MISSING_FROM_CALCULATION = list(egrid_eia_ids - calculated_eia_ids)

# Which plants are included in eGRID but are missing from our calculations?
missing_from_calc = egrid_plant[egrid_plant['plant_id_eia'].isin(PLANTS_MISSING_FROM_CALCULATION)]

# see if any of these plants are retired: collect the distinct retirement dates
# reported for each plant's generators and show them next to the eGRID record
generators_eia860 = load_data.load_pudl_table('generators_eia860', year=year)
retirement_dates = generators_eia860.groupby('plant_id_eia')['retirement_date'].unique().reset_index()
missing_from_calc.merge(retirement_dates, how='left', on='plant_id_eia')

Unnamed: 0,ba_code,state,plant_id_egrid,plant_name,energy_source_code,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb,co2_mass_lb_adjusted,plant_id_eia,retirement_date
0,CISO,CA,328,Borel,WAT,-204.0,,,0.0,0.0,328,[2017-09-01]
1,MISO,IA,7922,Brooklyn City North Plant,DFO,42.0,,,,,7922,[2011-09-01]
2,SWPP,KS,64074,Cimarron Bend III,WND,52458.0,,,0.0,0.0,64074,[None]
3,MISO,MN,63304,"Allium Community Solar Garden, LLC",SUN,62.0,,,0.0,0.0,63304,[None]
4,MISO,MN,63309,"Honeysuckle Solar, LLC",SUN,47.0,,,0.0,0.0,63309,[None]
5,MISO,MN,63311,"Lantana Solar, LLC",SUN,28.0,,,0.0,0.0,63311,[None]
6,SWPP,ND,63258,Aurora Wind Project,WND,26419.0,,,0.0,0.0,63258,[None]
7,ERCO,TX,62562,"High Lonesome Wind Power, LLC Hybrid",WND,461210.0,,,0.0,0.0,62562,[None]
8,ERCO,TX,62620,Sage Draw Wind,WND,965397.0,,,0.0,0.0,62620,[None]


### Identify plants missing from eGRID that are in our calculations

In [11]:
# eGRID plant ids that appear in our results but not in the eGRID plant table
calculated_egrid_ids = set(annual_plant_results['plant_id_egrid'].unique())
published_egrid_ids = set(egrid_plant['plant_id_egrid'].unique())
PLANTS_MISSING_FROM_EGRID = list(calculated_egrid_ids - published_egrid_ids)

# bring in the EIA plant name and sector so the unmatched plants are easier to recognise
plant_names = load_data.load_pudl_table('plants_entity_eia')[['plant_id_eia','plant_name_eia','sector_name_eia']]
only_in_calculation = annual_plant_results['plant_id_egrid'].isin(PLANTS_MISSING_FROM_EGRID)
missing_from_egrid = annual_plant_results[only_in_calculation].merge(plant_names, how='left', on='plant_id_eia')

missing_from_egrid

Unnamed: 0,plant_id_eia,gross_generation_mwh,net_generation_mwh,steam_load_1000_lb,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb,ch4_mass_lb,n2o_mass_lb,nox_mass_lb,so2_mass_lb,co2_mass_lb_for_electricity,ch4_mass_lb_for_electricity,n2o_mass_lb_for_electricity,nox_mass_lb_for_electricity,so2_mass_lb_for_electricity,co2_mass_lb_adjusted,ch4_mass_lb_adjusted,n2o_mass_lb_adjusted,nox_mass_lb_adjusted,so2_mass_lb_adjusted,plant_primary_fuel,data_availability,ba_code,ba_code_physical,state,distribution_flag,fuel_category,fuel_category_eia930,timezone,plant_id_egrid,plant_name_eia,sector_name_eia
0,72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,WAT,eia_only,CISO,CISO,CA,False,hydro,hydro,America/Los_Angeles,72,Venice Hydro,Electric Utility
1,172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DFO,eia_only,MISO,MISO,AR,False,petroleum,petroleum,America/Chicago,172,Osceola (AR),Electric Utility
2,193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DFO,eia_only,SPA,SPA,AR,False,petroleum,petroleum,America/Chicago,193,Municipal Light,Electric Utility
3,242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,WAT,eia_only,CISO,CISO,CA,True,hydro,hydro,America/Los_Angeles,242,Hamilton Branch,Electric Utility
4,248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,WAT,eia_only,CISO,CISO,CA,True,hydro,hydro,America/Los_Angeles,248,Inskip,Electric Utility
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,64311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SUN,eia_only,PJM,PJM,NC,True,solar,solar,America/New_York,64311,"Sun Farm VIII, LLC",IPP Non-CHP
350,64385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SUN,eia_only,PJM,PJM,NC,True,solar,solar,America/New_York,64385,Gliden (Op Zone),IPP Non-CHP
351,64460,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SUN,eia_only,MISO,MISO,MI,True,solar,solar,America/Detroit,64460,"Jack Francis Solar, LLC",IPP Non-CHP
352,64523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SUN,eia_only,CISO,CISO,CA,True,solar,solar,America/Los_Angeles,64523,Atascadero State Hospital,IPP Non-CHP


In [53]:
# show the plants missing from eGRID whose fuel consumption is anything other than zero
has_nonzero_fuel = missing_from_egrid['fuel_consumed_mmbtu'] != 0
missing_from_egrid[has_nonzero_fuel]

Unnamed: 0,plant_id_eia,gross_generation_mwh,net_generation_mwh,steam_load_1000_lb,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb,ch4_mass_lb,n2o_mass_lb,nox_mass_lb,so2_mass_lb,co2_mass_lb_for_electricity,ch4_mass_lb_for_electricity,n2o_mass_lb_for_electricity,nox_mass_lb_for_electricity,so2_mass_lb_for_electricity,co2_mass_lb_adjusted,ch4_mass_lb_adjusted,n2o_mass_lb_adjusted,nox_mass_lb_adjusted,so2_mass_lb_adjusted,plant_primary_fuel,data_availability,ba_code,ba_code_physical,state,distribution_flag,fuel_category,fuel_category_eia930,timezone,plant_id_egrid,plant_name_eia,sector_name_eia
30,1146,0.0,0.0,0.0,6.0,6.0,980.0,0.0,0.0,18.6,1.8,980.0,0.0,0.0,18.6,1.8,980.0,0.0,0.0,18.6,1.8,DFO,eia_only,MISO,MISO,IA,False,petroleum,petroleum,America/Chicago,1146,Harlan,Electric Utility
32,1224,0.0,0.0,0.0,120.0,120.0,19598.4,0.0,0.0,372.0,34.4,19598.4,0.0,0.0,372.0,34.4,19598.4,0.0,0.0,372.0,34.4,DFO,eia_only,SWPP,SWPP,KS,True,petroleum,petroleum,America/Chicago,1224,Bird City,Electric Utility
35,1324,0.0,0.0,0.0,3769.2,3769.2,447914.5,9.2,0.0,9566.8,57.4,447914.5,9.2,0.0,9566.8,57.4,447914.5,9.2,0.0,9566.8,57.4,NG,eia_only,SWPP,SWPP,KS,True,natural_gas,natural_gas,America/Chicago,1324,Sharon Spring,Electric Utility
48,2090,0.0,0.0,0.0,2185.0,2185.0,356854.1,14.4,2.9,1931.7,1103.5,356854.1,14.4,2.9,1931.7,1103.5,356854.1,14.4,2.9,1931.7,1103.5,DFO,eia_only,SWPP,SWPP,MO,False,petroleum,petroleum,America/Chicago,2090,Nevada,Electric Utility
55,2225,0.0,0.0,0.0,6.0,6.0,979.9,0.0,0.0,18.6,1.7,979.9,0.0,0.0,18.6,1.7,979.9,0.0,0.0,18.6,1.7,DFO,eia_only,SWPP,SWPP,NE,True,petroleum,petroleum,America/Chicago,2225,Campbell,Electric Utility
59,2528,0.0,0.0,0.0,3068.0,3068.0,501065.9,20.2,4.0,10046.4,889.4,501065.9,20.2,4.0,10046.4,889.4,501065.9,20.2,4.0,10046.4,889.4,DFO,eia_only,NYIS,NYIS,NY,True,petroleum,petroleum,America/New_York,2528,Harris Lake,Electric Utility
65,2902,0.0,0.0,0.0,18.0,18.0,2939.8,0.1,0.0,55.9,5.2,2939.8,0.1,0.0,55.9,5.2,2939.8,0.1,0.0,55.9,5.2,DFO,eia_only,PJM,PJM,OH,True,petroleum,petroleum,America/New_York,2902,Arcanum,Electric Utility
66,2909,0.0,0.0,0.0,56.0,56.0,6545.4,0.0,0.0,22.6,0.0,6545.4,0.0,0.0,22.6,0.0,6545.4,0.0,0.0,22.6,0.0,NG,eia_only,PJM,PJM,OH,False,natural_gas,natural_gas,America/New_York,2909,West 41st Street,Electric Utility
79,6281,0.0,0.0,0.0,1242.8,1242.8,202974.1,8.0,0.5,4010.1,360.6,202974.1,8.0,0.5,4010.1,360.6,202974.1,8.0,0.5,4010.1,360.6,DFO,eia_only,AKMS,AKMS,AK,False,petroleum,petroleum,America/Anchorage,6281,Kodiak Microgrid,Electric Utility
90,7723,0.0,0.0,0.0,836.0,836.0,136535.7,5.5,1.2,2684.1,242.2,136535.7,5.5,1.2,2684.1,242.2,136535.7,5.5,1.2,2684.1,242.2,DFO,eia_only,AKMS,AKMS,AK,False,petroleum,petroleum,America/Anchorage,7723,Nymans Plant Microgrid,Electric Utility


### Compare whether totals for plants with EPA IDs that differ from EIA IDs match

In [54]:
# identify where there is a single egrid plant id for multiple eia plant ids
# (keep=False flags every row that shares its plant_id_egrid with another row)
double_ids = annual_plant_results[annual_plant_results['plant_id_egrid'].duplicated(keep=False)]

# focus on net generation for now: select the column BEFORE aggregating so that
# the text attribute columns (fuel, BA code, timezone, ...) are never summed
double_ids = double_ids.groupby('plant_id_egrid')['net_generation_mwh'].sum().reset_index()

# merge the egrid data so the calculated and eGRID totals sit side by side
double_ids = double_ids.merge(
    egrid_plant[['plant_id_egrid', 'net_generation_mwh']],
    how='left',
    on='plant_id_egrid',
    suffixes=('_calc', '_egrid'),
)

# note: despite the column name this is a fraction of the eGRID value (-0.5 == 50% lower)
double_ids['percent_diff'] = (
    (double_ids['net_generation_mwh_calc'] - double_ids['net_generation_mwh_egrid'])
    / double_ids['net_generation_mwh_egrid']
).round(3)
double_ids

Unnamed: 0,plant_id_egrid,net_generation_mwh_calc,net_generation_mwh_egrid,percent_diff
0,562,295534.8,295534.003,0.0
1,762,5274.45,13132.0,-0.598
2,1416,2488371.62,3497026.0,-0.288
3,2709,-60765.26,5829409.996,-1.01
4,3612,806755.7,4045388.003,-0.801
5,4076,110539.3,110539.0,0.0
6,10474,670410.9,670410.997,-0.0
7,55306,2084080.34,8487498.0,-0.754
8,55375,2349720.06,5447419.084,-0.569
9,55481,3633472.54,7276336.0,-0.501


### Identify plants where our BA assignment does not match eGRID

In [21]:
# line up the eGRID BA code and our BA code for every plant present in both sources
egrid_ba_codes = egrid_plant.set_index('plant_id_eia')[['plant_name','ba_code']]
calculated_ba_codes = annual_plant_results.set_index('plant_id_eia')[['ba_code']]
ba_code_match = egrid_ba_codes.merge(
    calculated_ba_codes,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=("_egrid",'_calc'),
)

# plants whose two BA codes differ; a code that is missing on either side also
# compares as different, so missing values show up here too
ba_codes_differ = ba_code_match['ba_code_calc'] != ba_code_match['ba_code_egrid']
ba_code_match[ba_codes_differ]

Unnamed: 0_level_0,plant_name,ba_code_egrid,ba_code_calc
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
58982,Allison Creek Hydro,,AKMS
60243,Ambler,,AKMS
7462,Angoon,,AKMS
7182,Aniak,,AKMS
62,Annex Creek,,AKMS
...,...,...,...
772,W H Hill,,HECO
54827,Wailuku River Hydroelectric,,HECO
768,Waimea,,HECO
10413,Wainiha Hydro,,HIMS


In [23]:
# restrict the mismatches to plants for which eGRID actually reports a BA code
egrid_ba_reported = ba_code_match['ba_code_egrid'].notna()
ba_codes_differ = ba_code_match['ba_code_calc'] != ba_code_match['ba_code_egrid']
ba_code_match[ba_codes_differ & egrid_ba_reported]

Unnamed: 0_level_0,plant_name,ba_code_egrid,ba_code_calc
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


### Identify whether the fuel codes of each plant match

In [37]:
# line up the eGRID primary fuel and our primary fuel for every plant present in both sources
egrid_primary_fuel = egrid_plant.set_index('plant_id_eia')[['plant_name','plant_primary_fuel']]
calculated_primary_fuel = annual_plant_results.set_index('plant_id_eia')[['plant_primary_fuel']]
fuel_match = egrid_primary_fuel.merge(
    calculated_primary_fuel,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=("_egrid",'_calc'),
)

# plants whose primary fuel code differs between the two sources
fuel_codes_differ = fuel_match['plant_primary_fuel_egrid'] != fuel_match['plant_primary_fuel_calc']
fuel_match[fuel_codes_differ]

Unnamed: 0_level_0,plant_name,plant_primary_fuel_egrid,plant_primary_fuel_calc
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7484,NSB Nuiqsut Utility,DFO,NG
3,Barry,BIT,NG
50359,Sloss Industries Corp,NG,OG
54802,WestRock Coated Board,NG,BLQ
202,Carl Bailey,NG,RFO
...,...,...,...
54098,Kaukauna Paper Mill,BIT,BLQ
50476,Packaging of America Tomahawk Mill,WDS,NG
4078,Weston,SUB,RC
10477,Wisconsin Rapids Pulp Mill,NG,BLQ


## Identify where eGRID might be missing data
It seems that there are quite a few generators where fuel consumption data is missing

In [27]:
# eGRID rows whose energy source code is nuclear
is_nuclear = egrid_plant['energy_source_code'] == 'NUC'
egrid_plant[is_nuclear]

Unnamed: 0,ba_code,state,plant_id_egrid,plant_name,energy_source_code,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb,co2_mass_lb_adjusted,plant_id_eia
169,TVA,AL,46,Browns Ferry,NUC,28774850.0,0.0,0.0,0.0,0.0,46
201,SOCO,AL,6001,Joseph M Farley,NUC,14775800.0,0.0,0.0,0.0,0.0,6001
244,MISO,AR,8055,Arkansas Nuclear One,NUC,15063050.0,0.0,0.0,0.0,0.0,8055
425,SRP,AZ,6008,Palo Verde,NUC,31552430.0,0.0,0.0,0.0,0.0,6008
931,CISO,CA,6099,Diablo Canyon,NUC,16258700.0,0.0,0.0,0.0,0.0,6099
2614,ISNE,CT,566,Millstone,NUC,15714860.0,0.0,0.0,0.0,0.0,566
2954,FPL,FL,6045,St Lucie,NUC,16800370.0,0.0,0.0,0.0,0.0,6045
2980,FPL,FL,621,Turkey Point,NUC,19004410.0,43740624.0,43740624.0,5200028000.0,5200028000.0,621
3056,SOCO,GA,6051,Edwin I Hatch,NUC,13967880.0,0.0,0.0,0.0,0.0,6051
3232,SOCO,GA,649,Vogtle,NUC,18857820.0,0.0,0.0,0.0,0.0,649


In [None]:
pudl_out = load_data.initialize_pudl_out(year)

# load the EIA generator fuel data, keeping only the key columns and the three
# quantities compared below; .copy() gives an independent frame so that adding a
# column afterwards never writes into a slice of the PUDL output
IDX_PM_ESC = ["report_date", "plant_id_eia", "energy_source_code", "prime_mover_code"]
gf = (
    pudl_out.gf_eia923()
    .loc[
        :,
        IDX_PM_ESC
        + [
            "net_generation_mwh",
            "fuel_consumed_mmbtu",
            "fuel_consumed_for_electricity_mmbtu",
        ],
    ]
    .copy()
)

# add egrid plant ids: plants listed in the crosswalk get their eGRID id, every
# other plant keeps its EIA id
egrid_crosswalk = pd.read_csv(
    "../data/manual/egrid_static_tables/table_C5_crosswalk_of_EIA_ID_to_EPA_ID.csv"
)
eia_to_egrid_id = dict(
    zip(egrid_crosswalk["plant_id_eia"], egrid_crosswalk["plant_id_egrid"])
)
# Assign the mapped column in one step. The previous
# `gf["plant_id_egrid"].update(...)` modified a temporary Series, which does not
# propagate to the DataFrame when pandas Copy-on-Write is enabled.
gf["plant_id_egrid"] = (
    gf["plant_id_eia"]
    .map(eia_to_egrid_id)
    .fillna(gf["plant_id_eia"])
    .astype(gf["plant_id_eia"].dtype)
)

# calculate an annual total for each plant; numeric_only keeps the date and the
# fuel / prime-mover code columns out of the sum
gf_total = gf.groupby(["plant_id_egrid"]).sum(numeric_only=True).reset_index()

# choose a metric to compare
metric = "fuel_consumed_mmbtu"

# merge the annual EIA-923 data into the egrid data; the outer join keeps plants
# found in only one source and `source` records which side each row came from
egrid_eia_comparison = (
    egrid_plant[
        ["plant_id_egrid", "plant_name", "ba_code", "energy_source_code", metric]
    ]
    .merge(
        gf_total[["plant_id_egrid", metric]],
        how="outer",
        on="plant_id_egrid",
        suffixes=("_egrid", "_eia923"),
        indicator="source",
    )
    .round(0)
)
# treat a missing eGRID value as zero so the difference is defined for those plants
egrid_eia_comparison[f"{metric}_egrid"] = egrid_eia_comparison[
    f"{metric}_egrid"
].fillna(0)

# calculate an absolute difference and percent difference between the two values
# (eGRID minus EIA-923, so negative numbers mean eGRID reports less)
egrid_eia_comparison["difference"] = (
    egrid_eia_comparison[f"{metric}_egrid"] - egrid_eia_comparison[f"{metric}_eia923"]
)
egrid_eia_comparison["percent_difference"] = (
    egrid_eia_comparison["difference"] / egrid_eia_comparison[f"{metric}_eia923"]
)
# 0 / 0 would otherwise be NaN: an exact match is a 0% difference
egrid_eia_comparison.loc[
    egrid_eia_comparison["difference"] == 0, "percent_difference"
] = 0




In [None]:
# add cems data
# NOTE(review): `cems` is not defined in any cell visible in this notebook; it has
# to be loaded before this cell can run on a fresh kernel.
# Select the two needed columns first so that only they are copied.
cems_total = cems[['plant_id_eia', metric]].copy()

# map EIA plant ids to eGRID plant ids, falling back to the EIA id when the
# crosswalk has no entry. This is assigned directly rather than through a chained
# `Series.update`, which does not reach the DataFrame under pandas Copy-on-Write.
cems_total["plant_id_egrid"] = (
    cems_total["plant_id_eia"]
    .map(eia_to_egrid_id)
    .fillna(cems_total["plant_id_eia"])
    .astype(cems_total["plant_id_eia"].dtype)
)

# annual CEMS total of the metric per eGRID plant
cems_total = (
    cems_total.groupby('plant_id_egrid')[metric]
    .sum()
    .reset_index()
    .rename(columns={metric: f"{metric}_cems"})
)

# merge cems data into egrid; the outer join also keeps plants that only appear in CEMS
egrid_eia_comparison = egrid_eia_comparison.merge(cems_total, how='outer', on='plant_id_egrid')

In [None]:
# rows that came only from the left side of the eGRID / EIA-923 merge (i.e. eGRID only)
only_in_egrid = egrid_eia_comparison['source'] == 'left_only'
egrid_eia_comparison[only_in_egrid]

In [None]:
# egrid seems to be missing fuel consumption data for most nuclear power plants:
# total up the comparison columns across all nuclear rows
is_nuclear = egrid_eia_comparison['energy_source_code'] == 'NUC'
missing_nuclear = egrid_eia_comparison[is_nuclear]
missing_nuclear.sum()

In [None]:
# plants where the eGRID value is more than 1% below the EIA-923 value
egrid_is_lower = egrid_eia_comparison['percent_difference'] < -0.01
egrid_eia_comparison[egrid_is_lower]

In [None]:
# where is egrid missing data? same >1% shortfall filter, leaving out nuclear plants
# (chain .sort_values(by='percent_difference').head(20) to see the largest gaps)
egrid_is_lower = egrid_eia_comparison['percent_difference'] < -0.01
not_nuclear = egrid_eia_comparison['energy_source_code'] != 'NUC'
egrid_eia_comparison[egrid_is_lower & not_nuclear]

In [None]:
# how much emissions does this account for?
# keep the non-nuclear plants where eGRID is more than 1% below EIA-923
egrid_is_lower = egrid_eia_comparison['percent_difference'] < -0.01
not_nuclear = egrid_eia_comparison['energy_source_code'] != 'NUC'

# group by fuel code; numeric_only keeps plant names, BA codes and the categorical
# merge indicator out of the sum (summing a categorical column raises a TypeError)
missing_emissions = (
    egrid_eia_comparison[egrid_is_lower & not_nuclear]
    .groupby('energy_source_code')
    .sum(numeric_only=True)
    .reset_index()
)

# get emission factors and convert the fuel difference (mmBtu) into CO2 mass (lb)
emission_factors = load_data.load_ghg_emission_factors()[['energy_source_code', 'co2_lb_per_mmbtu']]
missing_emissions = missing_emissions.merge(emission_factors, how='left', on='energy_source_code')
# `difference` is eGRID minus EIA-923, so these values are negative where eGRID is lower
missing_emissions['co2_mass_lb'] = missing_emissions['difference'] * missing_emissions['co2_lb_per_mmbtu']
missing_emissions.sum()

## Plant Metric

In [45]:
# the quantities that are reported both by our results and by the eGRID plant file
columns_to_compare = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb_adjusted','co2_mass_lb']

# standardize column names and index so that the two dfs can be divided
# (numeric_only keeps the text attribute columns out of the aggregation)
calculated_to_compare = (
    annual_plant_results.groupby('plant_id_egrid', dropna=False)
    .sum(numeric_only=True)
    .drop(columns=['plant_id_eia'])
)

# drop the plants that have no data in eGRID (all five quantities sum to zero).
# The list holds eGRID plant ids, so it has to be matched against
# `plant_id_egrid` (it was previously matched against `plant_id_eia`).
egrid_value_columns = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_adjusted']
plants_with_no_data_in_egrid = list(
    egrid_plant[egrid_plant[egrid_value_columns].sum(axis=1) == 0]['plant_id_egrid']
)
egrid_plant = egrid_plant[~egrid_plant['plant_id_egrid'].isin(plants_with_no_data_in_egrid)]

egrid_to_compare = egrid_plant.set_index(['plant_id_egrid']).drop(columns=['ba_code','state','plant_name','plant_id_eia'])

# divide calculated value by egrid value (1.0 == exact match), then add the plant
# name, BA and state back for readability
compared = (
    calculated_to_compare.div(egrid_to_compare)
    .merge(egrid_plant[['plant_id_egrid','plant_name','ba_code', 'state']], how='left', left_index=True, right_on='plant_id_egrid')
    .set_index('plant_id_egrid')
)
compared['plant_name'] = compared['plant_name'].fillna('unknown')

# create a dataframe that merges the two sources of data together
compared_merged = calculated_to_compare.merge(egrid_to_compare, how='outer', on='plant_id_egrid', suffixes=('_calc','_egrid'))

# for each column, treat 0 / 0 as an exact match (the ratio is only nan b/c divide by zero)
for col in columns_to_compare:
    # identify plants with zero values for both
    plant_ids = list(compared_merged[(compared_merged[f'{col}_calc'] == 0) & (compared_merged[f'{col}_egrid'] == 0)].index)
    compared.loc[compared.index.isin(plant_ids), col] = 1

# bin edges for the calculated / eGRID ratio and the label of each bin; the two
# bins on either side of 1 share the '!exact' label, hence ordered=False below
status_bins = [-999999999,0,0.5,0.9,0.99,0.9999,1,1.0001,1.01,1.1,1.5,999999999]
status_labels = ['negative','<50%','+/-50%','+/-10%','+/-1%','!exact','!exact','+/-1%','+/-10%','+/-50%','>50%']

# for each column, categorize the data based on how far it is off from egrid
for col in columns_to_compare:
    status = pd.cut(x=compared[col], bins=status_bins, labels=status_labels, ordered=False)
    # astype(str) turns missing categories into the string 'nan'; relabel those as 'missing'
    compared[f'{col}_status'] = status.astype(str).replace('nan','missing')
    # flag the plants that do not exist in eGRID at all.
    # NOTE(review): the original referenced `plants_not_in_egrid`, which is never
    # defined in this notebook; PLANTS_MISSING_FROM_EGRID (built in the
    # "missing from eGRID" cell above) is the list that matches that description.
    compared.loc[(compared.index.isin(PLANTS_MISSING_FROM_EGRID)),f'{col}_status'] = 'not_in_egrid'


In [47]:
# display the outer-merged frame of our values (`_calc`) and the eGRID values (`_egrid`),
# indexed by plant_id_egrid
compared_merged

Unnamed: 0_level_0,gross_generation_mwh,net_generation_mwh_calc,steam_load_1000_lb,fuel_consumed_mmbtu_calc,fuel_consumed_for_electricity_mmbtu_calc,co2_mass_lb_calc,ch4_mass_lb,n2o_mass_lb,nox_mass_lb,so2_mass_lb,co2_mass_lb_for_electricity,ch4_mass_lb_for_electricity,n2o_mass_lb_for_electricity,nox_mass_lb_for_electricity,so2_mass_lb_for_electricity,co2_mass_lb_adjusted_calc,ch4_mass_lb_adjusted,n2o_mass_lb_adjusted,nox_mass_lb_adjusted,so2_mass_lb_adjusted,plant_primary_fuel,net_generation_mwh_egrid,fuel_consumed_mmbtu_egrid,fuel_consumed_for_electricity_mmbtu_egrid,co2_mass_lb_egrid,co2_mass_lb_adjusted_egrid
plant_id_egrid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
1,0.00,347.00,0.0,3045.00,3045.00,0.000000e+00,0.00,0.00,0.00,0.00,0.000000e+00,0.00,0.00,0.00,0.00,0.000000e+00,0.00,0.00,0.00,0.00,WND,347.00,3.045000e+03,3.045000e+03,0.000000e+00,0.000000e+00
2,0.00,139170.20,0.0,1220939.00,1220939.00,0.000000e+00,0.00,0.00,0.00,0.00,0.000000e+00,0.00,0.00,0.00,0.00,0.000000e+00,0.00,0.00,0.00,0.00,WAT,139170.00,1.220939e+06,1.220939e+06,0.000000e+00,0.000000e+00
3,11671959.25,11269731.84,0.0,98306042.56,98306042.56,1.493019e+10,1044647.82,145977.25,8077342.14,8773449.13,1.493019e+10,1044647.82,145977.25,8077342.14,8773449.13,1.493019e+10,1044647.82,145977.25,8077342.14,8773449.13,BIT,10499145.97,9.189588e+07,9.189588e+07,1.369345e+10,1.369345e+10
4,0.00,554613.60,0.0,4865620.80,4865620.80,0.000000e+00,0.00,0.00,0.00,0.00,0.000000e+00,0.00,0.00,0.00,0.00,0.000000e+00,0.00,0.00,0.00,0.00,WAT,554613.00,4.865621e+06,4.865621e+06,0.000000e+00,0.000000e+00
7,0.00,50435.00,619549.0,1242320.91,313356.00,1.469500e+08,2733.07,273.29,192922.25,741.46,0.000000e+00,0.00,0.00,0.00,0.00,0.000000e+00,0.00,0.00,0.00,0.00,NG,50435.00,8.867576e+05,6.850165e+05,1.053918e+08,8.141472e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63309,,,,,,,,,,,,,,,,,,,,,SUN,47.00,,,0.000000e+00,0.000000e+00
63311,,,,,,,,,,,,,,,,,,,,,SUN,28.00,,,0.000000e+00,0.000000e+00
63258,,,,,,,,,,,,,,,,,,,,,WND,26419.00,,,0.000000e+00,0.000000e+00
62562,,,,,,,,,,,,,,,,,,,,,WND,461210.00,,,0.000000e+00,0.000000e+00


In [48]:


status_columns = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb_adjusted','co2_mass_lb']

# override the ratio-based status wherever one source is missing or zero
for col in status_columns:
    calc_values = compared_merged[f'{col}_calc']
    egrid_values = compared_merged[f'{col}_egrid']
    # (label, condition) pairs, applied in this order
    overrides = [
        # we have a positive value but egrid has none
        ('missing_in_egrid', (calc_values > 0) & egrid_values.isna()),
        # egrid has a positive value but we have none
        ('missing_in_calc', calc_values.isna() & (egrid_values > 0)),
        # egrid reports a zero that is absent from our calculations
        ('calc_missing_zero_value_from_egrid', calc_values.isna() & (egrid_values == 0)),
        # we report a zero where egrid has a missing value
        ('egrid_missing_zero_value_from_calc', (calc_values == 0) & egrid_values.isna()),
        # egrid reports zero where we have a positive value
        ('calc_positive_but_egrid_zero', (calc_values > 0) & (egrid_values == 0)),
    ]
    for status_label, condition in overrides:
        flagged_plants = list(compared_merged[condition].index)
        compared.loc[compared.index.isin(flagged_plants), f'{col}_status'] = status_label

# count how many plants fall into each status, one column per compared quantity
counts_by_column = []
for col in status_columns:
    per_status = compared.groupby(f'{col}_status', dropna=False)['plant_name'].count().rename(col)
    per_status.index = per_status.index.rename('status')
    counts_by_column.append(per_status)

# a status that never occurs for a column counts as zero; append a grand-total row
comparison_count = pd.concat(counts_by_column, axis=1).fillna(0).astype(int)
total_row = comparison_count.sum().rename('Total').to_frame().T
comparison_count = pd.concat([comparison_count, total_row], axis=0)
comparison_count

Unnamed: 0,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb_adjusted,co2_mass_lb
!exact,9077,8469,7898,7608,8145
+/-1%,771,699,641,900,755
+/-10%,129,423,584,640,551
+/-50%,36,267,588,443,341
<50%,10,11,48,43,9
>50%,9,110,203,138,108
egrid_missing_zero_value_from_calc,354,320,324,333,323
missing,1,9,9,1,1
missing_in_calc,8,0,0,0,0
missing_in_egrid,1,46,42,151,161


In [None]:
# print the status-count table as markdown text
print(comparison_count.to_markdown())

## Compare Annual BA values to eGRID BA file

In [None]:
# year to validate and the sub-folder of ../data/results that holds its outputs
year = 2020
path_prefix = year

# Load the eGRID plant table
egrid_plant = validation.load_egrid_plant_file(year)

# Load the eGRID BA table
egrid_ba = validation.load_egrid_ba_file(year)

# aggregate the plant data up to the BA level; the value columns are selected
# before summing so that names, states and fuel codes are never aggregated
data_columns = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_adjusted']
egrid_plant_ba_agg = egrid_plant.groupby(['ba_code'])[data_columns].sum().reset_index()

In [None]:
# display the eGRID plant data summed to one row per BA code
egrid_plant_ba_agg

In [None]:
# load our annual ba data: one CSV per balancing authority, named after the BA code
DATA_COLUMNS = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_adjusted']
ba_results_dir = f"../data/results/{path_prefix}/power_sector_data/annual/us_units/"

ba_frames = []

# sorted() makes the row order reproducible (os.listdir order is arbitrary)
for filename in sorted(os.listdir(ba_results_dir)):
    # skip anything that is not a results file (e.g. hidden OS files), which
    # would otherwise make read_csv fail
    if not filename.endswith(".csv"):
        continue
    # the BA code is the part of the file name before the first dot
    ba = filename.split(".")[0]
    ba_data = pd.read_csv(os.path.join(ba_results_dir, filename), usecols=(["fuel_category"] + DATA_COLUMNS))
    # keep only the all-fuel total row of each file
    ba_data = ba_data[ba_data["fuel_category"] == "total"].drop(columns=["fuel_category"])
    ba_data["ba_code"] = ba
    ba_data = ba_data[["ba_code"] + DATA_COLUMNS]
    ba_frames.append(ba_data)

calculated_ba = pd.concat(ba_frames, axis=0)

In [None]:
# index both tables by BA code and swap exact zeros for 0.1 before taking the ratio
calculated_by_ba = calculated_ba.set_index("ba_code").replace(0,0.1)
egrid_by_ba = egrid_plant_ba_agg.set_index("ba_code").replace(0,0.1)

# difference relative to the eGRID value, expressed as a fraction of that value
percent_diff_from_egrid = ((calculated_by_ba - egrid_by_ba) / egrid_by_ba).round(2)

In [None]:
# display the BA-level differences ordered by the net generation difference
percent_diff_from_egrid.sort_values(by="net_generation_mwh")

In [None]:


# NOTE(review): `plant_annual_total` is not defined in any cell visible in this
# notebook; it presumably holds the annual plant-level results (compare
# `annual_plant_results`) — confirm and define it before running on a fresh kernel.

# divide our calculation by the BA totals from eGRID
# if there are 0 values, replace with 0.1, so that div by zero doesn't return missing value
ba_metric = plant_annual_total.groupby('ba_code', dropna=False).sum().drop(columns='plant_id_eia').replace(0,0.1).div(egrid_ba.set_index('ba_code').replace(0,0.1)).sort_values(by='co2_mass_lb').round(3)

# ratio of the national totals (our sum over the eGRID BA-file sum), as a one-row frame labelled 'Total'
total = pd.DataFrame(plant_annual_total[data_columns].sum().div(egrid_ba[data_columns].sum()).rename('Total')).T

# calculate the difference in the number of plants in each region (our count minus the eGRID count)
plant_count = (plant_annual_total.groupby('ba_code', dropna=False).count()['plant_id_egrid'] - egrid_plant.groupby('ba_code', dropna=False).count()['plant_id_egrid']).rename('num_plants')
# the summed `plant_id_egrid` column carries no meaning, so it is dropped after adding the plant counts
ba_metric = ba_metric.merge(plant_count, how='left', left_index=True, right_index=True).drop(columns=['plant_id_egrid']).sort_index()

# append the national 'Total' row below the per-BA rows
ba_metric = pd.concat([ba_metric, total], axis=0).round(2) 

# keep only the compared quantities and the plant-count difference
ba_metric = ba_metric[data_columns + ['num_plants']]

columns_to_check = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb']

# show every row in which at least one of the checked ratios is not exactly 1
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ba_metric[~(ba_metric[columns_to_check] == 1).all(axis=1)])

In [None]:
# how much co2 comes from CEMS vs EIA
# NOTE(review): `eia923_allocated` is not defined in any cell visible in this notebook.
# aggfunc="sum" replaces aggfunc=np.sum: passing the NumPy callable is deprecated
# in pandas and the string form selects the same aggregation.
# Exact zeros are replaced by 0.001 before the shares are computed.
data_source_by_ba = pd.pivot_table(
    eia923_allocated,
    values='co2_mass_lb',
    index='ba_code',
    columns='data_source',
    dropna=False,
    aggfunc="sum",
).replace(0,0.001)

# turn each BA's row into shares of that BA's total
data_source_by_ba = data_source_by_ba.div(data_source_by_ba.sum(axis=1), axis=0)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(data_source_by_ba.round(3).fillna(0).sort_values(by='cems'))

In [None]:
# print the BA-level ratio table as markdown text
print(ba_metric.to_markdown())

## Explore specific plants

### Notes

Net generation < 50%:
 - 2617 has negative net generation
 - 10444, 59395: some plants are hybrid fossil / solar plants, but the primary energy source code is getting listed as PV for certain generators, which is causing an issue

Net generation > 50%:
 - 335 has two extra units (CT1 and CT2) that report to CEMS but not EIA. Likely correct
 - 7288 reports DFO to CEMS, but NG to EIA - there's probably some generators not reporting
 - 60698 double counting generation from CEMS because of 90% heat threshold in a month - might be fixed once we tackle heat input
 - 58256 is a solar/hybrid and in generators_entity_eia the battery portion is associated with a separate generator id (should only be 1)


Other:
 - plant 3754 has heat input in cems and eia that don't match
 - 2401 has generation in both cems and eia
 - 50933 might have allocation issue (doesn't appear in CEMS)

To check
 - 1404 reports generation to CEMS in December, but not to EIA. This is likely correct
 - plant 2504 has three units (120, 121, 122) that don't appear in EIA, and in CEMS only report steam. 

Fuel > 50%
 - Plant 3116 reports much more heat input to CEMS during ozone months than to EIA

BA Totals
 - TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project as belonging to TEPC


In [49]:
# examine specific plants in a category: pick the compared quantity and the status label
value = 'net_generation_mwh'
status = '>50%'

# (use compared_merged.loc[<plant_id_egrid>, :] to see the raw values of a single plant)

# plants with that status, ordered by their calculated / eGRID ratio
has_status = compared[f'{value}_status'] == status
compared[has_status].sort_values(by=value)

Unnamed: 0_level_0,ch4_mass_lb,ch4_mass_lb_adjusted,ch4_mass_lb_for_electricity,co2_mass_lb,co2_mass_lb_adjusted,co2_mass_lb_for_electricity,fuel_consumed_for_electricity_mmbtu,fuel_consumed_mmbtu,gross_generation_mwh,n2o_mass_lb,n2o_mass_lb_adjusted,n2o_mass_lb_for_electricity,net_generation_mwh,nox_mass_lb,nox_mass_lb_adjusted,nox_mass_lb_for_electricity,plant_primary_fuel,so2_mass_lb,so2_mass_lb_adjusted,so2_mass_lb_for_electricity,steam_load_1000_lb,plant_name,ba_code,state,net_generation_mwh_status,fuel_consumed_mmbtu_status,fuel_consumed_for_electricity_mmbtu_status,co2_mass_lb_adjusted_status,co2_mass_lb_status
plant_id_egrid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
1599,,,,1.003067,1.003067,,1.00161,1.00161,,,,,1.673042,,,,,,,,,Canal Station,ISNE,MA,>50%,+/-1%,+/-1%,+/-1%,+/-1%
55641,,,,1.0815,1.0815,,1.083786,1.083786,,,,,1.832314,,,,,,,,,Riverside Energy Center,MISO,WI,>50%,+/-10%,+/-10%,+/-10%,+/-10%
54415,,,,1.933183,0.0,,1.933123,1.933123,,,,,1.8801,,,,,,,,,Cadillac Renewable Energy,MISO,MI,>50%,>50%,>50%,negative,>50%
57842,,,,1.948314,1.948314,,1.964999,1.964999,,,,,1.959909,,,,,,,,,Wabash River Highland Plant,MISO,IN,>50%,>50%,>50%,>50%,>50%
60698,,,,0.935923,0.935923,,0.935926,0.935926,,,,,2.057536,,,,,,,,,Stanton Energy Reliability Center,CISO,CA,>50%,+/-10%,+/-10%,+/-10%,+/-10%
7277,,,,1.48226,1.48226,,1.492837,1.492837,,,,,2.142148,,,,,,,,,Lincoln Combustion Turbine,DUK,NC,>50%,+/-50%,+/-50%,+/-50%,+/-50%
58256,,,,1.0,1.0,,1.0,1.0,,,,,2.660714,,,,,,,,,Los Alamos PV Site,PNM,NM,>50%,!exact,!exact,!exact,!exact
315,,,,1.045406,1.045406,,1.04632,1.04632,,,,,3.074543,,,,,,,,,AES Alamitos,CISO,CA,>50%,+/-10%,+/-10%,+/-10%,+/-10%
335,,,,1.023804,1.023804,,1.024413,1.024413,,,,,11.230376,,,,,,,,,AES Huntington Beach,CISO,CA,>50%,+/-10%,+/-10%,+/-10%,+/-10%


In [None]:
# look at (up to) ten randomly chosen plants from the selected category.
# sample(10) raises a ValueError when fewer than ten plants match, so the sample
# size is capped at the number of matching rows; the fixed random_state makes the
# selection reproducible, and the rows are sorted after sampling so the table is ordered.
plants_in_category = compared[compared[f'{value}_status'] == status]
plants_in_category.sample(n=min(10, len(plants_in_category)), random_state=0).sort_values(by=value)

In [None]:
# plant id inspected by the cells below (they match it against `plant_id_eia`)
plant_to_explore = 58380

In [None]:
# the eGRID record(s) of the plant being explored
is_selected_plant = egrid_plant['plant_id_eia'] == plant_to_explore
egrid_plant[is_selected_plant]

In [None]:
# the rows of plant_annual_total for the plant being explored
is_selected_plant = plant_annual_total['plant_id_eia'] == plant_to_explore
plant_annual_total[is_selected_plant]

In [None]:
# column totals of eia923_allocated for the plant being explored
is_selected_plant = eia923_allocated['plant_id_eia'] == plant_to_explore
eia923_allocated[is_selected_plant].sum()

In [None]:
# total the CEMS data for each plant / unit / report date combination
unit_month_keys = ['plant_id_eia','unitid','report_date']
cems_unit_monthly = cems.groupby(unit_month_keys).sum().reset_index()


In [None]:
# column totals of the unit-level CEMS sums for the plant being explored
is_selected_plant = cems_unit_monthly['plant_id_eia'] == plant_to_explore
cems_unit_monthly[is_selected_plant].sum()

In [None]:
# CEMS rows of the plant being explored that have a positive operating time
is_selected_plant = cems['plant_id_eia'] == plant_to_explore
was_operating = cems['operating_time_hours'] > 0
cems[is_selected_plant & was_operating]

In [None]:
# the rows of cems_plant_annual for the plant being explored
is_selected_plant = cems_plant_annual['plant_id_eia'] == plant_to_explore
cems_plant_annual[is_selected_plant]

In [None]:
# there are some plants that report heat input and co2 in CEMS, but are missing net generation data
# TODO: we should maybe try and fill net generation data using EIA-923?
has_zero_net_generation = cems_plant_annual['net_generation_mwh'] == 0
cems_plant_annual[has_zero_net_generation]

## Explore BA Matching
### Notes
TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project as belonging to TEPC

In [None]:
# pair up the plants that have no BA code in our results with the plants that have no BA code in eGRID
calc_without_ba = plant_annual_total[plant_annual_total['ba_code'].isna()]
egrid_without_ba = egrid_plant[egrid_plant['ba_code'].isna()]
ba_plant_match = calc_without_ba.merge(egrid_without_ba, how='left', on='plant_id_egrid', suffixes=("_calc",'_egrid'))

# of those, show the plants whose net generation (rounded to whole MWh) differs between the sources
net_generation_differs = ba_plant_match['net_generation_mwh_calc'].round(0) != ba_plant_match['net_generation_mwh_egrid'].round(0)
ba_plant_match[net_generation_differs]

In [None]:
# eGRID plants assigned to the CPLE balancing authority
in_cple = egrid_plant['ba_code'] == 'CPLE'
egrid_plant[in_cple]

In [None]:
# side-by-side plant table: eGRID columns get the `_egrid` suffix, ours get `_calc`;
# the outer join keeps plants that are present in only one of the two sources
compare_plants_in_ba = pd.merge(
    egrid_plant,
    plant_annual_total,
    how='outer',
    on='plant_id_egrid',
    suffixes=('_egrid','_calc'),
)

In [None]:
# balancing authority and quantity to inspect
ba = 'CPLE'
metric = 'fuel_consumed_mmbtu'

# plants that either source assigns to the BA ...
in_selected_ba = (compare_plants_in_ba['ba_code_egrid'] == ba) | (compare_plants_in_ba['ba_code_calc'] == ba)
# ... and whose values, rounded to whole units, are not equal
values_differ = compare_plants_in_ba[f'{metric}_egrid'].round(0) != compare_plants_in_ba[f'{metric}_calc'].round(0)
compare_plants_in_ba[in_selected_ba & values_differ]

In [None]:
# rows of plant_annual_total assigned to the AMPL balancing authority
in_ampl = plant_annual_total['ba_code'] == 'AMPL'
plant_annual_total[in_ampl]