In [68]:
# import packages
import pandas as pd

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

from column_checks import get_dtypes
import load_data
from filepaths import *

year = 2019
path_prefix = f"{year}/"

# How much of the data reported in EIA-923 comes from annually-reported plants?

In [69]:
pudl_out = load_data.initialize_pudl_out(year)

plant_frequency = pudl_out.plants_eia860()[["plant_id_eia","respondent_frequency"]]

# load the allocated EIA data
eia923_allocated = pd.read_csv(f'{outputs_folder()}{path_prefix}/eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])
eia923_allocated = eia923_allocated.merge(plant_frequency, how="left", on="plant_id_eia", validate="m:1")



FileNotFoundError: [Errno 2] No such file or directory: 'A:\\GitHub\\open-grid-emissions\\data\\outputs\\2019//eia923_allocated_2019.csv'

In [None]:
data_from_annual = eia923_allocated.groupby(["respondent_frequency"], dropna=False)[["fuel_consumed_mmbtu", "net_generation_mwh","co2_mass_lb"]].sum() / eia923_allocated[["fuel_consumed_mmbtu", "net_generation_mwh","co2_mass_lb"]].sum() * 100
data_from_annual.loc["Total Percent"] = data_from_annual.sum()
data_from_annual

Unnamed: 0_level_0,fuel_consumed_mmbtu,net_generation_mwh,co2_mass_lb
respondent_frequency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,9.743375,9.744905,3.140016
AM,4.361269,2.881034,7.352079
M,85.574484,87.12955,89.011431
,0.320871,0.244511,0.496474
Total Percent,100.0,100.0,100.0


## What percentage of the EIA data we actually use is from annually-reporting plants?
Even though ~10% of the data in EIA-923 is reported annually, we do not directly use all of the EIA-923 in our final results, since in some cases we have CEMS data that supercedes the EIA data. So what percentage of the EIA data we actually do use comes from annually-reporting plants?

In [66]:
annual_eia_used = eia923_allocated[eia923_allocated["hourly_data_source"] != "cems"].groupby(["respondent_frequency"], dropna=False)[["fuel_consumed_mmbtu", "net_generation_mwh","co2_mass_lb"]].sum() / eia923_allocated[["fuel_consumed_mmbtu", "net_generation_mwh","co2_mass_lb"]].sum() * 100
annual_eia_used.loc["Total Percent"] = annual_eia_used.sum()
annual_eia_used

Unnamed: 0_level_0,fuel_consumed_mmbtu,net_generation_mwh,co2_mass_lb
respondent_frequency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,8.680985,8.874646,1.868628
AM,2.404752,0.870232,4.654741
M,36.387153,33.407622,9.971367
,0.124723,0.070856,0.164541
Total Percent,47.597613,43.223356,16.659277


## What percentage of the data that we get from multiple sources is annually reported?
For subplants where all of the data comes from EIA, the annual reporting may lead to lower-quality allocation of the data to each month and hour, but there should not be a risk of double counting. However, for some plants, we draw data for some months from CEMS and other months from EIA-923. If the annually-reported data was not allocated to the correct month (relative to the CEMS data), there is a risk that we may double-count or under count data from that subplant. 

In [53]:
multi_source_subplants = eia923_allocated[
    ["plant_id_eia", "subplant_id", "hourly_data_source"]
].drop_duplicates().drop(columns="hourly_data_source")

multi_source_subplants = multi_source_subplants[
            multi_source_subplants.duplicated(
                subset=["plant_id_eia", "subplant_id"])]

multi_source_subplants = eia923_allocated.merge(multi_source_subplants, how="inner", on=["plant_id_eia", "subplant_id"])


In [65]:
# what percent of the total EIA-923 data comes from subplants with annually-reported data and multiple sources?
multi_source_summary = (multi_source_subplants.groupby(["respondent_frequency"], dropna=False)[["fuel_consumed_mmbtu", "net_generation_mwh","co2_mass_lb"]].sum() / eia923_allocated[["fuel_consumed_mmbtu", "net_generation_mwh","co2_mass_lb"]].sum() * 100)
multi_source_summary.loc["Total Percent"] = multi_source_summary.sum()
multi_source_summary

Unnamed: 0_level_0,fuel_consumed_mmbtu,net_generation_mwh,co2_mass_lb
respondent_frequency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.724816,0.531644,0.870369
AM,0.904512,0.814791,1.208854
M,15.159077,14.420226,26.512067
,0.115595,0.116639,0.152573
Total,16.904001,15.8833,28.743863
