In [2]:
# import packages
import pandas as pd

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

from column_checks import get_dtypes
import load_data
from filepaths import *

# Data year to analyze; used below to initialize the PUDL output object and to
# build the names of the output files that are loaded.
year = 2021
# Year subfolder (note the trailing slash) that is prepended to the file names
# when building paths inside the outputs folder.
path_prefix = f"{year}/"

# How much of the data reported in EIA-923 comes from annually-reported plants?

In [19]:
pudl_out = load_data.initialize_pudl_out(year)

# One reporting-frequency code per plant, taken from the EIA-860 plants table.
plant_frequency = pudl_out.plants_eia860()[["plant_id_eia", "reporting_frequency_code"]]

# Load the allocated EIA data. The relative path is passed to outputs_folder()
# the same way the plant_static_attributes file is loaded further down, so both
# loads build their paths identically.
eia923_allocated = pd.read_csv(
    outputs_folder(f"{path_prefix}/eia923_allocated_{year}.csv"),
    dtype=get_dtypes(),
    parse_dates=["report_date"],
)

# Attach the reporting frequency to every allocated record; validate="m:1" makes
# the merge fail loudly if a plant has more than one frequency row.
eia923_allocated = eia923_allocated.merge(
    plant_frequency, how="left", on="plant_id_eia", validate="m:1"
)

[32m2023-06-29 09:23:00[0m [1;30m[    INFO][0m [34mcatalystcoop.pudl.transform.eia861:456[0m Started with 323 missing BA Codes out of 13488 records (2.39%)
[32m2023-06-29 09:23:05[0m [1;30m[    INFO][0m [34mcatalystcoop.pudl.transform.eia861:480[0m Ended with 323 missing BA Codes out of 13488 records (2.39%)
[32m2023-06-29 09:23:05[0m [1;30m[    INFO][0m [34mcatalystcoop.pudl.output.eia860:177[0m 97.6% of plant records have consistently reported BA Codes
[32m2023-06-29 09:23:05[0m [1;30m[    INFO][0m [34mcatalystcoop.pudl.output.eia860:227[0m Before any filling treatment has been applied. 2.4% of records have no BA codes
[32m2023-06-29 09:23:05[0m [1;30m[    INFO][0m [34mcatalystcoop.pudl.output.eia860:227[0m Backfilling and consistent value is the same. Filled w/ most consistent BA code. 2.4% of records have no BA codes
[32m2023-06-29 09:23:06[0m [1;30m[    INFO][0m [34mcatalystcoop.pudl.output.eia860:227[0m SWPP is most consistent value. Filled w/ 

In [50]:
# Sum the allocated data to one row per plant and reporting frequency.
# The value columns are selected with a list: indexing a groupby with a bare
# tuple of column names is deprecated and raises in pandas >= 2.0.
# NOTE(review): groupby drops rows whose reporting_frequency_code is missing
# (pandas default dropna=True); any such plant that is present in
# plant_attributes comes back through the outer merge below with NaN values.
# Confirm that this is intended.
plant_value_columns = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
]
miso_data = (
    eia923_allocated.groupby(["plant_id_eia", "reporting_frequency_code"])[
        plant_value_columns
    ]
    .sum()
    .reset_index()
)

# merge BA and data availability
plant_attributes = pd.read_csv(
    outputs_folder(f"{path_prefix}/plant_static_attributes_{year}.csv"),
    dtype=get_dtypes(),
)
miso_data = miso_data.merge(
    plant_attributes[
        ["plant_id_eia", "ba_code", "plant_primary_fuel", "fuel_category", "data_availability"]
    ],
    how="outer",
    on="plant_id_eia",
)

# filter to MISO only plants; take an explicit copy so the column assignments
# below modify this frame rather than a view of the merged frame
miso_data = miso_data[miso_data["ba_code"] == "MISO"].copy()

# add a column for clean vs emitting
miso_data["emissions_category"] = "emitting"
miso_data.loc[
    miso_data["fuel_category"].isin(["hydro", "nuclear", "solar", "wind"]),
    "emissions_category",
] = "non-emitting"

# change AM frequency to A
miso_data.loc[miso_data["reporting_frequency_code"] == "AM", "reporting_frequency_code"] = "A"

# fold the cems_only availability label into cems_and_eia, then drop the
# category that is no longer used so it does not show up in grouped output
miso_data.loc[miso_data["data_availability"] == "cems_only", "data_availability"] = "cems_and_eia"
miso_data["data_availability"] = miso_data["data_availability"].cat.remove_unused_categories()

miso_data

  miso_data = eia923_allocated.groupby(["plant_id_eia", "reporting_frequency_code"])[


Unnamed: 0,plant_id_eia,reporting_frequency_code,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb,ba_code,plant_primary_fuel,fuel_category,data_availability,emissions_category
19,30,A,-200.0,197.3,197.3,3.222260e+04,MISO,DFO,petroleum,eia_only,emitting
26,51,M,1077152.0,13804652.0,13804652.0,2.766411e+09,MISO,LIG,coal,cems_and_eia,emitting
84,166,A,138495.2,1224712.0,1224712.0,0.000000e+00,MISO,WAT,hydro,eia_only,non-emitting
85,170,A,196186.0,2285235.0,2285235.0,2.670983e+08,MISO,NG,natural_gas,cems_and_eia,emitting
86,172,A,0.0,0.0,0.0,0.000000e+00,MISO,DFO,petroleum,eia_only,emitting
...,...,...,...,...,...,...,...,...,...,...,...
11286,65453,,,,,,MISO,SUN,solar,eia_only,non-emitting
11287,65455,,,,,,MISO,SUN,solar,eia_only,non-emitting
11290,65497,,,,,,MISO,SUN,solar,eia_only,non-emitting
11348,65774,,,,,,MISO,SUN,solar,eia_only,non-emitting


In [52]:
# Count plants per data availability / reporting frequency group, with one
# column per emissions category (missing group keys are kept as their own rows).
emissions_count_keys = ["data_availability", "reporting_frequency_code", "emissions_category"]
emissions_plant_counts = miso_data.groupby(emissions_count_keys, dropna=False)[
    "plant_id_eia"
].count()
emissions_plant_counts.reset_index().pivot(
    index=emissions_count_keys[:2],
    columns=emissions_count_keys[2:],
    values="plant_id_eia",
)

Unnamed: 0_level_0,emissions_category,emitting,non-emitting
data_availability,reporting_frequency_code,Unnamed: 2_level_1,Unnamed: 3_level_1
cems_and_eia,,18,0
cems_and_eia,A,104,0
cems_and_eia,M,130,0
eia_only,,20,14
eia_only,A,395,982
eia_only,M,112,168


In [53]:
# Count plants per data availability / reporting frequency group, with one
# column per fuel category (missing group keys are kept as their own rows).
fuel_count_keys = ["data_availability", "reporting_frequency_code", "fuel_category"]
fuel_plant_counts = miso_data.groupby(fuel_count_keys, dropna=False)[
    "plant_id_eia"
].count()
fuel_plant_counts.reset_index().pivot(
    index=fuel_count_keys[:2],
    columns=fuel_count_keys[2:],
    values="plant_id_eia",
)

Unnamed: 0_level_0,fuel_category,biomass,coal,hydro,natural_gas,nuclear,other,petroleum,solar,waste,wind
data_availability,reporting_frequency_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cems_and_eia,,1,2,0,15,0,0,0,0,0,0
cems_and_eia,A,4,0,0,89,0,0,11,0,0,0
cems_and_eia,M,2,62,0,63,0,0,3,0,0,0
eia_only,,4,0,2,8,0,5,3,11,0,1
eia_only,A,81,10,148,108,0,5,190,591,1,243
eia_only,M,18,8,13,44,11,10,27,43,5,101


In [58]:
# Percent of the MISO-wide net generation and CO2 totals contributed by each
# data availability / reporting frequency group. Rows with a missing group key
# are excluded from the groups (dropna=True) but still count in the totals.
share_columns = ["net_generation_mwh", "co2_mass_lb"]
group_totals = miso_data.groupby(
    ["data_availability", "reporting_frequency_code"], dropna=True
)[share_columns].sum()
miso_totals = miso_data[share_columns].sum()
(group_totals / miso_totals * 100).round(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,net_generation_mwh,co2_mass_lb
data_availability,reporting_frequency_code,Unnamed: 2_level_1,Unnamed: 3_level_1
cems_and_eia,A,2.7,2.6
cems_and_eia,M,62.6,82.8
eia_only,A,9.2,8.2
eia_only,M,25.5,6.4


In [None]:
# `data_from_annual` is not defined by any other cell in this notebook, so the
# original cell fails with a NameError on a fresh kernel.
# NOTE(review): the definition below is reconstructed by analogy with
# `annual_eia_used` (same columns, but without the CEMS filter): the percent of
# all allocated EIA-923 data that falls in each reporting frequency. Confirm
# that this matches the intended calculation.
input_value_columns = ["fuel_consumed_mmbtu", "net_generation_mwh", "co2_mass_lb"]
data_from_annual = (
    eia923_allocated.groupby(["reporting_frequency_code"], dropna=False)[
        input_value_columns
    ].sum()
    / eia923_allocated[input_value_columns].sum()
    * 100
)
data_from_annual.loc["A", :].rename("% of EIA-923 input data from EIA annual reporters")

## What percentage of the EIA data we actually use is from annually-reporting plants?
Even though ~10% of the data in EIA-923 is reported annually, we do not directly use all of the EIA-923 data in our final results, since in some cases we have CEMS data that supersedes the EIA data. So what percentage of the EIA data that we actually use comes from annually-reporting plants?

In [None]:
# Percent of the total allocated values that comes from records whose hourly
# data source is not CEMS, broken out by reporting frequency.
used_value_columns = ["fuel_consumed_mmbtu", "net_generation_mwh", "co2_mass_lb"]
non_cems_records = eia923_allocated[eia923_allocated["hourly_data_source"] != "cems"]
non_cems_totals = non_cems_records.groupby(["reporting_frequency_code"], dropna=False)[
    used_value_columns
].sum()
allocated_totals = eia923_allocated[used_value_columns].sum()
annual_eia_used = non_cems_totals / allocated_totals * 100

# Append a row holding the column-wise sum of the percentages above.
annual_eia_used.loc["Total Percent"] = annual_eia_used.sum()
annual_eia_used

In [None]:
# Show only the row for reporting frequency "A", labelled for the summary table.
annual_eia_used.loc["A"].rename("% of output data from EIA annual reporters")

## What percentage of the data that we get from multiple sources is annually reported?
For subplants where all of the data comes from EIA, the annual reporting may lead to lower-quality allocation of the data to each month and hour, but there should not be a risk of double counting. However, for some plants, we draw data for some months from CEMS and other months from EIA-923. If the annually-reported data is not allocated to the correct month (relative to the CEMS data), there is a risk that we may double-count or undercount data from that subplant.

In [None]:
# Identify subplants whose records come from more than one hourly data source,
# then keep only the allocated records that belong to those subplants.
subplant_keys = ["plant_id_eia", "subplant_id"]

# one row per distinct (plant, subplant, hourly data source) combination
subplant_sources = eia923_allocated[subplant_keys + ["hourly_data_source"]].drop_duplicates()

# A subplant key that repeats here has at least two distinct sources.
# duplicated() flags every repeat after the first, so a subplant with three or
# more sources would be listed more than once; de-duplicate the keys so the
# merge below cannot multiply that subplant's records.
multi_source_keys = subplant_sources.loc[
    subplant_sources.duplicated(subset=subplant_keys), subplant_keys
].drop_duplicates()

# validate="m:1" guards against the key list ever containing duplicates again
multi_source_subplants = eia923_allocated.merge(
    multi_source_keys, how="inner", on=subplant_keys, validate="m:1"
)


In [None]:
# What percent of the total EIA-923 data comes from subplants with
# annually-reported data and multiple sources?
# The grouping column is `reporting_frequency_code`: that is the frequency
# column merged onto eia923_allocated when it is loaded and the one used for
# `annual_eia_used`; the previous name `respondent_frequency` is not created
# anywhere in this notebook.
summary_value_columns = ["fuel_consumed_mmbtu", "net_generation_mwh", "co2_mass_lb"]
multi_source_summary = (
    multi_source_subplants.groupby(["reporting_frequency_code"], dropna=False)[
        summary_value_columns
    ].sum()
    / eia923_allocated[summary_value_columns].sum()
    * 100
)

# Append a row holding the column-wise sum of the percentages above.
multi_source_summary.loc["Total Percent"] = multi_source_summary.sum()
multi_source_summary

In [None]:
# Show only the row for reporting frequency "A", labelled for the summary table.
multi_source_summary.loc["A"].rename("% of output data mixing CEMS and annually-reported EIA data")

In [None]:
# Stack the reporting frequency "A" row of each percentage table into a single
# comparison table, one labelled row per table, rounded to two decimals.
annual_rows = [
    (data_from_annual, "% of EIA-923 input data from EIA annual reporters"),
    (annual_eia_used, "% of output data from EIA annual reporters"),
    (multi_source_summary, "% of output data mixing CEMS and annually-reported EIA data"),
]
pd.concat(
    [
        pd.DataFrame(percent_table.loc["A", :].rename(row_label).round(2)).T
        for percent_table, row_label in annual_rows
    ],
    axis=0,
)