In [None]:
# import packages
import pandas as pd

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

from column_checks import get_dtypes
import load_data
from filepaths import *

# Data year to analyze, and the matching per-year subfolder prefix used when
# building paths to files in the outputs folder.
year = 2020
path_prefix = "{}/".format(year)

# How much of the data reported in EIA-923 comes from annually-reporting plants?

In [None]:
# Look up each plant's EIA-860 `respondent_frequency` so that every allocated
# EIA-923 record can be tagged with how often its plant reports.
pudl_out = load_data.initialize_pudl_out(year)
plant_frequency = pudl_out.plants_eia860().loc[
    :, ["plant_id_eia", "respondent_frequency"]
]

# Load the allocated EIA-923 data for the selected year and attach the
# reporting frequency. `validate="m:1"` makes the merge raise if
# `plant_frequency` has more than one row for any plant, which would otherwise
# silently duplicate records.
allocated_path = f'{outputs_folder()}{path_prefix}/eia923_allocated_{year}.csv'
eia923_allocated = pd.read_csv(
    allocated_path,
    dtype=get_dtypes(),
    parse_dates=['report_date'],
).merge(
    plant_frequency,
    how="left",
    on="plant_id_eia",
    validate="m:1",
)

In [None]:
# Percent of total fuel consumption, net generation, and CO2 mass in the
# allocated EIA-923 data, broken out by reporting frequency.
# `dropna=False` keeps records with no reporting frequency as their own group,
# so the "Total Percent" row accounts for every record.
value_columns = ["fuel_consumed_mmbtu", "net_generation_mwh", "co2_mass_lb"]
totals_by_frequency = eia923_allocated.groupby(
    ["respondent_frequency"], dropna=False
)[value_columns].sum()
overall_totals = eia923_allocated[value_columns].sum()

data_from_annual = totals_by_frequency / overall_totals * 100
data_from_annual.loc["Total Percent"] = data_from_annual.sum()
data_from_annual

In [None]:
# Pull out the row for respondent_frequency == "A" and give it a descriptive label.
data_from_annual.loc["A"].rename("% of EIA-923 input data from EIA annual reporters")

## What percentage of the EIA data we actually use is from annually-reporting plants?
Even though ~10% of the data in EIA-923 is reported annually, we do not directly use all of the EIA-923 data in our final results, since in some cases we have CEMS data that supersedes the EIA data. So what percentage of the EIA data that we actually do use comes from annually-reporting plants?

In [None]:
# Keep only the records whose hourly data source is not "cems", then express
# each reporting-frequency group's totals as a percent of the totals across
# ALL allocated EIA-923 records (the denominator is not restricted to the
# non-CEMS records).
value_columns = ["fuel_consumed_mmbtu", "net_generation_mwh", "co2_mass_lb"]
non_cems_records = eia923_allocated[eia923_allocated["hourly_data_source"] != "cems"]
non_cems_totals_by_frequency = non_cems_records.groupby(
    ["respondent_frequency"], dropna=False
)[value_columns].sum()

annual_eia_used = (
    non_cems_totals_by_frequency / eia923_allocated[value_columns].sum() * 100
)
annual_eia_used.loc["Total Percent"] = annual_eia_used.sum()
annual_eia_used

In [None]:
# Pull out the row for respondent_frequency == "A" and give it a descriptive label.
annual_eia_used.loc["A"].rename("% of output data from EIA annual reporters")

## What percentage of the data that we get from multiple sources is annually reported?
For subplants where all of the data comes from EIA, the annual reporting may lead to lower-quality allocation of the data to each month and hour, but there should not be a risk of double counting. However, for some plants, we draw data for some months from CEMS and other months from EIA-923. If the annually-reported data was not allocated to the correct month (relative to the CEMS data), there is a risk that we may double-count or under-count data from that subplant.

In [None]:
# Identify subplants whose allocated EIA-923 records are tagged with more than
# one distinct `hourly_data_source`, then keep every record belonging to them.
subplant_keys = ["plant_id_eia", "subplant_id"]

# One row per (plant, subplant, source) combination that appears in the data.
sources_per_subplant = eia923_allocated[
    subplant_keys + ["hourly_data_source"]
].drop_duplicates()

# A subplant key that appears more than once here has more than one source.
# `keep=False` flags every occurrence, and the trailing `drop_duplicates()`
# collapses them to exactly one row per subplant. With the default
# `keep="first"`, a subplant with three or more sources would be listed more
# than once, and the merge below would then duplicate its records and inflate
# the sums computed from `multi_source_subplants`.
multi_source_keys = sources_per_subplant.loc[
    sources_per_subplant.duplicated(subset=subplant_keys, keep=False),
    subplant_keys,
].drop_duplicates()

# `validate="m:1"` guards against the key table ever containing repeated keys.
multi_source_subplants = eia923_allocated.merge(
    multi_source_keys, how="inner", on=subplant_keys, validate="m:1"
)


In [None]:
# What percent of the total EIA-923 data comes from subplants with multiple
# data sources, broken out by reporting frequency? The denominator is the
# total across all allocated EIA-923 records.
value_columns = ["fuel_consumed_mmbtu", "net_generation_mwh", "co2_mass_lb"]
multi_source_totals_by_frequency = multi_source_subplants.groupby(
    ["respondent_frequency"], dropna=False
)[value_columns].sum()

multi_source_summary = (
    multi_source_totals_by_frequency / eia923_allocated[value_columns].sum() * 100
)
multi_source_summary.loc["Total Percent"] = multi_source_summary.sum()
multi_source_summary

In [None]:
# Pull out the row for respondent_frequency == "A" and give it a descriptive label.
multi_source_summary.loc["A"].rename("% of output data mixing CEMS and annually-reported EIA data")

In [None]:
# Stack the "A" (annual reporter) row from each of the three percentage tables
# into a single summary table, one labelled row per table, rounded to 2 decimals.
pd.concat(
    [
        pd.DataFrame(percent_table.loc["A", :].rename(row_label).round(2)).T
        for percent_table, row_label in (
            (data_from_annual, "% of EIA-923 input data from EIA annual reporters"),
            (annual_eia_used, "% of output data from EIA annual reporters"),
            (multi_source_summary, "% of output data mixing CEMS and annually-reported EIA data"),
        )
    ],
    axis=0,
)