In [13]:
# import packages
import numpy as np
import pandas as pd
import os
import plotly.express as px
from IPython.display import display

%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
import sys
sys.path.append('../../../open-grid-emissions/src/')

# import local modules
import load_data
import validation
from filepaths import *

from column_checks import get_dtypes

# Specify the year for validation

In [14]:
year = 2021

## Compare Our Results to eGRID

In [15]:
# load our annual plant level data
annual_plant_results = pd.read_csv(
    results_folder(f"{year}/plant_data/annual/us_units/plant_data.csv"),
    dtype=get_dtypes(),
)
plant_attributes = pd.read_csv(
    outputs_folder(f"{year}/plant_static_attributes_{year}.csv"), dtype=get_dtypes()
)
annual_plant_results = annual_plant_results.merge(
    plant_attributes, how="left", on="plant_id_eia"
)

# add a egrid id
annual_plant_results = validation.add_egrid_plant_id(
    annual_plant_results, from_id="eia", to_id="egrid"
)

# Load the eGRID plant table
egrid_plant = validation.load_egrid_plant_file(year)


### Identify plants missing from our results that exist in eGRID


In [16]:
(
    missing_from_calc,
    PLANTS_MISSING_FROM_CALCULATION,
) = validation.identify_plants_missing_from_our_calculations(
    egrid_plant, annual_plant_results, year
)
missing_from_calc


Unnamed: 0,ba_code,state,plant_id_egrid,plant_name_eia,plant_primary_fuel,chp_flag,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb,co2_mass_lb_for_electricity_adjusted,nox_mass_lb,so2_mass_lb,fuel_consumed_mmbtu_ozone_season,fuel_data_source_annual,fuel_data_source_ozone,plant_id_eia


### Identify plants missing from eGRID that are in our calculations

In [17]:
(
    missing_from_egrid,
    PLANTS_MISSING_FROM_EGRID,
) = validation.identify_plants_missing_from_egrid(egrid_plant, annual_plant_results)

# how many of the plants missing from egrid have non-zero data
missing_from_egrid.loc[
    missing_from_egrid["fuel_consumed_mmbtu"] != 0,
    [
        "plant_id_eia",
        "plant_name_eia",
        "plant_primary_fuel",
        "net_generation_mwh",
        "fuel_consumed_for_electricity_mmbtu",
        "fuel_consumed_mmbtu",
    ],
]


Unnamed: 0,plant_id_eia,plant_name_eia,plant_primary_fuel,net_generation_mwh,fuel_consumed_for_electricity_mmbtu,fuel_consumed_mmbtu
1,2518,Shoreham,DFO,3966.0,61689.0,61689.0
3,7922,North Plant,DFO,20.8,218.0,218.0
4,10154,Chocolate Bayou Works,NG,784064.1,4103418.6,12008364.9
5,10350,Greenleaf Unit One,NG,1058.65,10484.0,10484.0
6,10397,Indiana Harbor,BFG,225367.0,1141549.5,8839609.9
8,54995,5 AC Station,BFG,497881.0,2642313.7,10866629.0
9,61852,Eagle Shadow Mountain Solar Farm,SUN,31264.0,276467.0,276467.0
10,64927,Aktina Solar,SUN,110147.0,974030.0,974030.0


### Compare whether totals for plants with EPA IDs that differ from EIA IDs match

In [18]:
# identify where there is a single egrid plant id for multiple eia plant ids
double_ids = annual_plant_results[
    annual_plant_results["plant_id_egrid"].duplicated(keep=False)
]
double_ids = (
    double_ids.groupby("plant_id_egrid").sum()["net_generation_mwh"].reset_index()
)  # focus on net generation for now
# merge the egrid data
double_ids = double_ids.merge(
    egrid_plant[["plant_id_egrid", "net_generation_mwh"]],
    how="left",
    on="plant_id_egrid",
    suffixes=("_calc", "_egrid"),
)
double_ids["percent_diff"] = (
    (double_ids["net_generation_mwh_calc"] - double_ids["net_generation_mwh_egrid"])
    / double_ids["net_generation_mwh_egrid"]
).round(3)
double_ids


  double_ids.groupby("plant_id_egrid").sum()["net_generation_mwh"].reset_index()


Unnamed: 0,plant_id_egrid,net_generation_mwh_calc,net_generation_mwh_egrid,percent_diff
0,562,159772.01,159772.001,0.0
1,1416,2813263.01,2813263.0,0.0
2,2709,5750398.69,5750398.948,-0.0
3,3612,3617639.19,3617639.0,0.0
4,4076,122743.41,122744.0,-0.0
5,10474,723248.0,,
6,55306,7066888.99,7066889.0,-0.0
7,55375,5899776.09,5899775.879,0.0
8,55481,6173339.01,6173339.0,0.0
9,55508,19724.2,19724.002,0.0


In [19]:
# compare egrid vs eia plant ids
annual_plant_results[
    annual_plant_results["plant_id_egrid"].duplicated(keep=False)
].groupby(["plant_id_egrid", "plant_id_eia"]).sum()


  ].groupby(["plant_id_egrid", "plant_id_eia"]).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb,ch4_mass_lb,n2o_mass_lb,co2e_mass_lb,nox_mass_lb,so2_mass_lb,co2_mass_lb_for_electricity,ch4_mass_lb_for_electricity,n2o_mass_lb_for_electricity,co2e_mass_lb_for_electricity,nox_mass_lb_for_electricity,so2_mass_lb_for_electricity,co2_mass_lb_adjusted,ch4_mass_lb_adjusted,n2o_mass_lb_adjusted,co2e_mass_lb_adjusted,nox_mass_lb_adjusted,so2_mass_lb_adjusted,co2_mass_lb_for_electricity_adjusted,ch4_mass_lb_for_electricity_adjusted,n2o_mass_lb_for_electricity_adjusted,co2e_mass_lb_for_electricity_adjusted,nox_mass_lb_for_electricity_adjusted,so2_mass_lb_for_electricity_adjusted,distribution_flag,shaped_plant_id
plant_id_egrid,plant_id_eia,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
562,562,154647.6,1925741.0,1925741.0,233849300.0,4795.19,563.05,234136800.0,249618.98,46079.25,233849300.0,4795.19,563.05,234136800.0,249618.98,46079.25,233849300.0,4795.19,563.05,234136800.0,249618.98,46079.25,233849300.0,4795.19,563.05,234136800.0,249618.98,46079.25,0,911305.0
562,57068,5124.41,109043.18,109043.18,16280370.0,693.22,137.06,16337130.0,13743.41,6634.31,16280370.0,693.22,137.06,16337130.0,13743.41,6634.31,16280370.0,693.22,137.06,16337130.0,13743.41,6634.31,16280370.0,693.22,137.06,16337130.0,13743.41,6634.31,0,0.0
1416,1416,44546.01,572730.44,572730.44,68073870.0,1260.01,126.01,68143420.0,53240.39,343.7,68073870.0,1260.01,126.01,68143420.0,53240.39,343.7,68073870.0,1260.01,126.01,68143420.0,53240.39,343.7,68073870.0,1260.01,126.01,68143420.0,53240.39,343.7,0,918605.0
1416,56565,2768717.0,20098601.69,20098601.69,2388857000.0,44216.91,4421.69,2391297000.0,208389.55,12059.84,2388857000.0,44216.91,4421.69,2391297000.0,208389.55,12059.84,2388857000.0,44216.91,4421.69,2391297000.0,208389.55,12059.84,2388857000.0,44216.91,4421.69,2391297000.0,208389.55,12059.84,0,0.0
2709,7538,271937.69,3411752.69,3411752.69,417040100.0,7600.11,774.03,417463500.0,269415.9,12358.21,417040100.0,7600.11,774.03,417463500.0,269415.9,12358.21,417040100.0,7600.11,774.03,417463500.0,269415.9,12358.21,417040100.0,7600.11,774.03,417463500.0,269415.9,12358.21,0,0.0
2709,58215,5478461.0,38806338.96,38806338.96,4612786000.0,85374.05,8537.48,4617499000.0,1731123.85,24620.13,4612786000.0,85374.05,8537.48,4617499000.0,1731123.85,24620.13,4612786000.0,85374.05,8537.48,4617499000.0,1731123.85,24620.13,4612786000.0,85374.05,8537.48,4617499000.0,1731123.85,24620.13,0,0.0
3612,3612,928715.19,10543213.09,10543213.09,1256248000.0,23198.91,2320.42,1257529000.0,1588862.27,6480.31,1256248000.0,23198.91,2320.42,1257529000.0,1588862.27,6480.31,1256248000.0,23198.91,2320.42,1257529000.0,1588862.27,6480.31,1256248000.0,23198.91,2320.42,1257529000.0,1588862.27,6480.31,0,0.0
3612,7512,2688924.0,19283846.02,19283846.02,2292025000.0,42424.46,4242.45,2294367000.0,462341.55,11571.79,2292025000.0,42424.46,4242.45,2294367000.0,462341.55,11571.79,2292025000.0,42424.46,4242.45,2294367000.0,462341.55,11571.79,2292025000.0,42424.46,4242.45,2294367000.0,462341.55,11571.79,0,0.0
4076,4076,66733.52,1040287.06,1040287.06,123408000.0,2288.66,228.73,123534300.0,216702.57,621.41,123408000.0,2288.66,228.73,123534300.0,216702.57,621.41,123408000.0,2288.66,228.73,123534300.0,216702.57,621.41,123408000.0,2288.66,228.73,123534300.0,216702.57,621.41,0,0.0
4076,7799,56009.89,805736.8,805736.8,95021820.0,1772.58,177.23,95119670.0,69098.3,482.5,95021820.0,1772.58,177.23,95119670.0,69098.3,482.5,95021820.0,1772.58,177.23,95119670.0,69098.3,482.5,95021820.0,1772.58,177.23,95119670.0,69098.3,482.5,0,912905.0


### Identify plants where our BA assignment does not match eGRID

In [20]:
ba_code_match = egrid_plant.set_index("plant_id_eia")[["plant_name_eia", "ba_code"]].merge(
    annual_plant_results.set_index("plant_id_eia")[["ba_code"]],
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=("_egrid", "_calc"),
)

# plants with missing ba code
# ba_code_match[(ba_code_match['ba_code_calc'].isna()) & ~(ba_code_match['ba_code_egrid'].isna())]
ba_code_match[ba_code_match["ba_code_calc"] != ba_code_match["ba_code_egrid"]]


Unnamed: 0_level_0,plant_name_eia,ba_code_egrid,ba_code_calc
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
57053,Alakanuk,,AKMS
58982,Allison Creek Hydro,,AKMS
60243,Ambler,,AKMS
7462,Angoon,,AKMS
7182,Aniak,,AKMS
...,...,...,...
65721,RT405 Westerlo Solar 2,ISNE,NYIS
60115,Town of Halfmoon,ISNE,NYIS
62764,Westtown CSG,ISNE,NYIS
64725,Williams Rd,ISNE,NYIS


In [21]:
# how many of these mismatches are for non-missing bas
ba_code_match[
    (ba_code_match["ba_code_calc"] != ba_code_match["ba_code_egrid"])
    & ~(ba_code_match["ba_code_egrid"].isna())
]


Unnamed: 0_level_0,plant_name_eia,ba_code_egrid,ba_code_calc
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6456,Carver Falls,ISNE,NYIS
64715,Dusenberry,ISNE,NYIS
57600,Fishers Island 1,ISNE,NYIS
64692,Fredonia Solar LLC,ISNE,NYIS
64719,Glenmere Lake,ISNE,NYIS
60463,Greene County Meter #1,ISNE,NYIS
64716,Knapp East (CSG),ISNE,NYIS
64720,Knapp West(CSG),ISNE,NYIS
65723,LR Wheatfield Solar 1,ISNE,NYIS
60114,Oneida County- DPW,ISNE,NYIS


### Identify whether the fuel codes of each plant match

In [22]:
fuel_match = egrid_plant.set_index("plant_id_eia")[
    ["plant_name_eia", "plant_primary_fuel"]
].merge(
    annual_plant_results.set_index("plant_id_eia")[["plant_primary_fuel"]],
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=("_egrid", "_calc"),
)

fuel_match[
    fuel_match["plant_primary_fuel_egrid"] != fuel_match["plant_primary_fuel_calc"]
]


Unnamed: 0_level_0,plant_name_eia,plant_primary_fuel_egrid,plant_primary_fuel_calc
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7484,NSB Nuiqsut Utility,DFO,NG
56076,ABC Coke,COG,OG
3,Barry,BIT,NG
54802,WestRock Coated Board,NG,BLQ
160,Apache Station,SUB,NG
...,...,...,...
50933,Rhinelander Mill,BIT,WAT
10477,Wisconsin Rapids Pulp Mill,NG,BLQ
10743,Morgantown Energy Facility,NG,WC
54472,Simplot Phosphates,OTH,MSB


## Compare data from different sources

# Split data into different groups based on known discrepencies

There are certain classes of plants where there are known issues that we know will prevent the totals from matching:
- There are certain power plants that are missing altogether from eGRID
- Geothermal power plants: EGRID sometimes has incorrect geotypes that may lead to different emissions estimates
- Nuclear power plants: eGRID is generally missing fuel consumption data
- Fuel cells are assumed by eGRID to have zero emissions
- Plants that only report CEMS data for part of the year are sometimes missing data for the rest of the year in eGRID
- CHP plants use a slightly different methodology
- plants that report data to the bf table or gen table in EIA-923 may be missing data from teh generation and fuel table.
- "proposed" plants sometimes report data before they are officially online and egrid doesn't always reflect that.


We could identify plants where the total fuel doesn't match EIA-923, but matches the total fuel for a subset of prime movers

In [None]:
pudl_out = load_data.initialize_pudl_out(year)
eia923_allocated = pd.read_csv(
    outputs_folder(f"{year}/eia923_allocated_{year}.csv"),
    dtype=get_dtypes(),
    parse_dates=["report_date"],
)

annual_plant_results_segmented = validation.segment_plants_by_known_issues(
    annual_plant_results,
    egrid_plant,
    eia923_allocated,
    pudl_out,
    PLANTS_MISSING_FROM_EGRID,
)


## Plant Metric

In [None]:
flag_columns = [
    "flag_missing_egrid",
    "flag_geothermal",
    "flag_nuclear",
    "flag_fuel_cell",
    "flag_partial_year",
    "flag_chp",
    "flag_bf_gen_reporter",
    "flag_plant_w_proposed_gen",
    "flag_proposed_plant",
]

# all data without known issues
segment_to_compare = annual_plant_results_segmented[
    annual_plant_results_segmented[flag_columns].sum(axis=1) == 0
]

comparison_count, compared = validation.compare_plant_level_results_to_egrid(
    segment_to_compare, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
comparison_count


In [None]:
validation.compare_egrid_fuel_total(segment_to_compare, egrid_plant).sum()


In [None]:
# examine results for a specific data flag
flag = "flag_proposed_plant"

# all data without known issues
segment_to_compare = annual_plant_results_segmented[
    annual_plant_results_segmented[flag] == 1
]

comparison_count, compared = validation.compare_plant_level_results_to_egrid(
    segment_to_compare, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
comparison_count


In [None]:
# evaluate all plants
comparison_count, compared = validation.compare_plant_level_results_to_egrid(
    annual_plant_results, egrid_plant, PLANTS_MISSING_FROM_EGRID
)
comparison_count

### Explore a specific set of plants

In [None]:
# specify the dataframe, metric, and status to explore
comparison_df = compared
metric = "so2_mass_lb"
status = "<50%"

comparison_df = comparison_df.merge(egrid_plant.set_index("plant_id_egrid")[[metric]], how="left", left_index=True, right_index=True, suffixes=(None,"_egrid"))
comparison_df = comparison_df.merge(annual_plant_results.set_index("plant_id_egrid")[[metric]], how="left", left_index=True, right_index=True, suffixes=(None,"_calc"))

# show the data
columns_to_show = ["plant_name_eia", "ba_code", "state", metric, f"{metric}_status", f"{metric}_egrid", f"{metric}_calc"]
comparison_df.loc[(comparison_df[f"{metric}_status"] == status), columns_to_show]


## Compare Fleet Totals

In [39]:
oge_comp = annual_plant_results[
    [
        "plant_id_eia",
        "plant_id_egrid",
        "ba_code",
        "plant_primary_fuel",
        "fuel_consumed_mmbtu",
        "fuel_consumed_for_electricity_mmbtu",
        "net_generation_mwh",
        "co2_mass_lb",
    ]
]
oge_comp["source"] = "OGE"
egrid_comp = egrid_plant[
    [
        "plant_id_eia",
        "plant_id_egrid",
        "plant_name_eia",
        "ba_code",
        "plant_primary_fuel",
        "fuel_data_source_annual",
        "fuel_data_source_ozone",
        "fuel_consumed_mmbtu",
        "fuel_consumed_for_electricity_mmbtu",
        "net_generation_mwh",
        "co2_mass_lb",
    ]
]
egrid_comp["source"] = "egrid"

plant_compare = pd.concat([egrid_comp, oge_comp], axis=0)
plant_compare = plant_compare.sort_values(by="plant_id_eia")
plant_compare = plant_compare[
    [   "source",
        "plant_id_eia",
        "plant_id_egrid",
        "plant_name_eia",
        "ba_code",
        "plant_primary_fuel",
        "fuel_data_source_annual",
        "fuel_data_source_ozone",
        "fuel_consumed_mmbtu",
        "fuel_consumed_for_electricity_mmbtu",
        "net_generation_mwh",
        "co2_mass_lb",
    ]
]
plant_compare

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oge_comp["source"] = "OGE"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  egrid_comp["source"] = "egrid"


Unnamed: 0,source,plant_id_eia,plant_id_egrid,plant_name_eia,ba_code,plant_primary_fuel,fuel_data_source_annual,fuel_data_source_ozone,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,net_generation_mwh,co2_mass_lb
107,egrid,1,1,Sand Point,,DFO,EIA,EIA,36674.0,36674.0,2863.00,5.796432e+06
0,OGE,1,1,,AKMS,DFO,,,36674.0,36674.0,2863.10,5.796227e+06
157,egrid,2,2,Bankhead Dam,SOCO,WAT,EIA,EIA,1905189.0,1905189.0,215446.00,0.000000e+00
1,OGE,2,2,,SOCO,WAT,,,1905189.0,1905189.0,215445.90,0.000000e+00
158,egrid,3,3,Barry,SOCO,BIT,EPA/CAMD,EPA/CAMD,97913749.0,97913749.0,11792285.98,1.515480e+10
...,...,...,...,...,...,...,...,...,...,...,...,...
8372,egrid,65815,65815,NY - Mines Press,NYIS,SUN,,,,,,0.000000e+00
6396,egrid,65817,65817,"Clear Solar I, LLC",DUK,SUN,,,,,,0.000000e+00
1971,egrid,65824,65824,"VS LADWPGLP Francisco, LLC",LDWP,SUN,,,,,,0.000000e+00
10398,egrid,65836,65836,Toyah Power Station,ERCO,MWH,,,,,,0.000000e+00


In [40]:
plant_compare[(plant_compare["ba_code"]=="ISNE") & (plant_compare["plant_primary_fuel"].isin(["DFO"]))]

Unnamed: 0,source,plant_id_eia,plant_id_egrid,plant_name_eia,ba_code,plant_primary_fuel,fuel_data_source_annual,fuel_data_source_ozone,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,net_generation_mwh,co2_mass_lb
2342,egrid,544,544,Devon,ISNE,DFO,EPA/CAMD,EPA/CAMD,91501.732,91501.732,-1190.000,14847688.00
2409,egrid,563,563,South Meadow Station,ISNE,DFO,EPA/CAMD,EPA/CAMD,23290.500,23290.500,1138.999,3772600.00
338,OGE,581,581,,ISNE,DFO,,,6527.600,6527.600,248.990,1066087.69
2389,egrid,581,581,Norwich,ISNE,DFO,EPA/CAMD; EIA,EPA/CAMD,6248.600,6248.600,249.000,1020558.00
696,OGE,1484,1484,,ISNE,DFO,,,5085.000,5085.000,251.200,830482.40
...,...,...,...,...,...,...,...,...,...,...,...,...
9682,OGE,63203,63203,,ISNE,DFO,,,7254.000,7254.000,393.000,1184723.40
4642,egrid,64378,64378,Signature Breads Chelsea,ISNE,DFO,EIA,EIA,82.000,82.000,9.000,13392.00
10498,OGE,64378,64378,,ISNE,DFO,,,82.000,82.000,9.100,13392.10
10585,OGE,64515,64515,,ISNE,DFO,,,741.600,741.600,77.200,121118.60


In [None]:
year = 2021
path_prefix = year

# Load the eGRID plant table
egrid_plant = validation.load_egrid_plant_file(year)

# aggregate the plant data up to the BA level
data_columns = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
    "co2_mass_lb_for_electricity_adjusted",
]
egrid_plant_ba_agg = egrid_plant.groupby(["ba_code"]).sum()[data_columns].reset_index()


## Compare Annual BA values to eGRID BA file

In [23]:
year = 2021
path_prefix = year

# Load the eGRID plant table
egrid_plant = validation.load_egrid_plant_file(year)

egrid_ba = validation.load_egrid_ba_file(year)

# aggregate the plant data up to the BA level
data_columns = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
    "co2_mass_lb_for_electricity_adjusted",
]
egrid_plant_ba_agg = egrid_plant.groupby(["ba_code"]).sum()[data_columns].reset_index()


  egrid_plant_ba_agg = egrid_plant.groupby(["ba_code"]).sum()[data_columns].reset_index()


In [24]:
# load our annual ba data
DATA_COLUMNS = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
    "co2_mass_lb_adjusted",
]

calculated_ba = []

for filename in os.listdir(
    results_folder(f"{path_prefix}/power_sector_data/annual/us_units/")
):
    ba = filename.split(".")[0]
    ba_data = pd.read_csv(
        results_folder(f"{path_prefix}/power_sector_data/annual/us_units/{filename}"),
        usecols=(["fuel_category"] + DATA_COLUMNS),
    )
    ba_data = ba_data[ba_data["fuel_category"] == "total"].drop(
        columns=["fuel_category"]
    )
    ba_data["ba_code"] = ba
    ba_data = ba_data[["ba_code"] + DATA_COLUMNS]
    calculated_ba.append(ba_data)

calculated_ba = pd.concat(calculated_ba, axis=0)


In [28]:
calculated_ba

Unnamed: 0,ba_code,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb,co2_mass_lb_adjusted
3,AEC,4636142.51,3.580977e+07,3.580977e+07,5.350280e+09,5.290299e+09
6,AECI,25778308.74,2.292933e+08,2.286906e+08,3.680804e+10,3.679351e+10
5,AKMS,4532224.16,5.054550e+07,4.173276e+07,6.193302e+09,6.193302e+09
5,AVA,8446548.44,9.021210e+07,7.367600e+07,7.184644e+09,2.437270e+09
2,AVRN,6990559.73,5.777564e+07,5.757414e+07,2.786162e+09,2.786162e+09
...,...,...,...,...,...,...
7,WACM,40882095.33,4.380295e+08,4.377169e+08,7.753577e+10,7.750868e+10
4,WALC,16604012.81,1.476798e+08,1.476798e+08,8.291619e+09,8.291619e+09
1,WAUW,3014397.54,2.665632e+07,2.665632e+07,0.000000e+00,0.000000e+00
0,WWA,644950.25,5.703292e+06,5.703292e+06,0.000000e+00,0.000000e+00


In [25]:
percent_diff_from_egrid = (
    (
        calculated_ba.set_index("ba_code").replace(0, 0.1)
        - egrid_plant_ba_agg.set_index("ba_code").replace(0, 0.1)
    )
    / egrid_plant_ba_agg.set_index("ba_code").replace(0, 0.1)
).round(2)


In [26]:
percent_diff_from_egrid.sort_values(by="net_generation_mwh")


Unnamed: 0_level_0,co2_mass_lb,co2_mass_lb_adjusted,co2_mass_lb_for_electricity_adjusted,fuel_consumed_for_electricity_mmbtu,fuel_consumed_mmbtu,net_generation_mwh
ba_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TEPC,-0.21,,,-0.27,-0.27,-0.23
AEC,0.27,,,0.00,0.00,0.00
ISNE,0.01,,,0.00,0.01,-0.00
WWA,0.00,,,-0.00,-0.00,0.00
LDWP,0.00,,,0.00,0.00,-0.00
...,...,...,...,...,...,...
HECO,0.14,,,0.17,0.18,0.19
PGE,0.12,,,0.10,0.09,0.22
AKMS,,,,,,
HIMS,,,,,,


In [27]:
# divide our calculation by the BA totals from eGRID
# if there are 0 values, replace with 0.1, so that div by zero doesn't return missing value
ba_metric = (
    calculated_ba.replace(0, 0.1)
    .set_index("ba_code")
    .div(egrid_plant_ba_agg.set_index("ba_code").replace(0, 0.1))
    .sort_values(by="co2_mass_lb")
    .round(3)
)

total = pd.DataFrame(
    calculated_ba[data_columns]
    .sum()
    .div(egrid_plant_ba_agg[data_columns].sum())
    .rename("Total")
).T

# calculate the difference in the number of plants in each region
# plant_count = (plant_annual_total.groupby('ba_code', dropna=False).count()['plant_id_egrid'] - egrid_plant.groupby('ba_code', dropna=False).count()['plant_id_egrid']).rename('num_plants')
# ba_metric = ba_metric.merge(plant_count, how='left', left_index=True, right_index=True).drop(columns=['plant_id_egrid']).sort_index()

ba_metric = pd.concat([ba_metric, total], axis=0).round(2)

ba_metric = ba_metric[data_columns]

columns_to_check = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
]

with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(ba_metric[~(ba_metric[columns_to_check] == 1).all(axis=1)])


KeyError: "['co2_mass_lb_for_electricity_adjusted'] not in index"

## Explore specific plants

### Notes

BA Totals
 - TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project belonging to TEPC


In [None]:
plant_to_explore = 58223


In [None]:
egrid_plant[egrid_plant["plant_id_eia"] == plant_to_explore]


In [None]:
annual_plant_results[annual_plant_results["plant_id_eia"] == plant_to_explore]


In [None]:
eia923_allocated[eia923_allocated["plant_id_eia"] == plant_to_explore]


In [None]:
eia923_allocated.loc[
    eia923_allocated["plant_id_eia"] == plant_to_explore,
    ["generator_id", "subplant_id"],
].drop_duplicates()
