In [None]:
# import packages
import pandas as pd

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

from column_checks import get_dtypes
import load_data
from filepaths import *

# Data year that the file paths built in this notebook are keyed on.
year = 2021
# Year-specific subfolder prefix (trailing slash included) used when building paths.
path_prefix = str(year) + "/"

# How much of the data reported in EIA-923 comes from annually-reported plants?

In [None]:
# Load the annual plant-level OGE results, keeping only the plant id and the
# generation, fuel, and CO2 columns used in this analysis.
plant_value_columns = [
    "plant_id_eia",
    "net_generation_mwh",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb_for_electricity",
]
annual_results_path = results_folder(
    f"{year}/plant_data/annual/us_units/plant_data.csv"
)
plant_data = pd.read_csv(annual_results_path)[plant_value_columns]

# Attach the static plant attributes. The outer join keeps plants that appear in
# only one of the two tables; validate="1:1" raises if plant_id_eia is repeated
# on either side.
attribute_columns = [
    "plant_id_eia",
    "ba_code",
    "plant_primary_fuel",
    "fuel_category",
    "data_availability",
]
plant_attributes = pd.read_csv(
    outputs_folder(f"{path_prefix}/plant_static_attributes_{year}.csv"),
    dtype=get_dtypes(),
)
plant_data = pd.merge(
    plant_data,
    plant_attributes[attribute_columns],
    on="plant_id_eia",
    how="outer",
    validate="1:1",
)

# Attach the plant name, state, and reporting frequency code returned by
# pudl_out.plants_eia860(). The left join keeps every row already in plant_data.
frequency_columns = [
    "plant_id_eia",
    "plant_name_eia",
    "state",
    "reporting_frequency_code",
]
pudl_out = load_data.initialize_pudl_out(year)
plant_frequency = pudl_out.plants_eia860()[frequency_columns]
plant_data = pd.merge(
    plant_data,
    plant_frequency,
    on="plant_id_eia",
    how="left",
    validate="1:1",
)

plant_data

In [None]:
# Keep only the plants whose ba_code is "MISO". The copy lets the columns below
# be added and edited without modifying plant_data.
in_miso = plant_data["ba_code"] == "MISO"
miso_data = plant_data[in_miso].copy()

# Label each plant as emitting or non-emitting: a plant is non-emitting when its
# fuel category or its primary fuel code appears in the lists below.
non_emitting_fuel_categories = ["hydro", "nuclear", "solar", "wind"]
non_emitting_primary_fuels = ["MWH", "PUR", "WH"]
is_non_emitting = miso_data["fuel_category"].isin(
    non_emitting_fuel_categories
) | miso_data["plant_primary_fuel"].isin(non_emitting_primary_fuels)
miso_data["emissions_category"] = "emitting"
miso_data.loc[is_non_emitting, "emissions_category"] = "non-emitting"

# Treat the "AM" reporting frequency code as annual ("A").
is_am_frequency = miso_data["reporting_frequency_code"] == "AM"
miso_data.loc[is_am_frequency, "reporting_frequency_code"] = "A"

# Fold "cems_only" into "cems_and_eia", then drop the categories that are no
# longer used (data_availability is accessed through .cat, so it is expected to
# be a categorical column).
is_cems_only = miso_data["data_availability"] == "cems_only"
miso_data.loc[is_cems_only, "data_availability"] = "cems_and_eia"
miso_data["data_availability"] = miso_data[
    "data_availability"
].cat.remove_unused_categories()

# Reporting lag assigned to each (data_availability, reporting_frequency_code)
# pair. A frequency of None stands for a missing reporting frequency code.
lag_rules = [
    ("eia_only", "A", "10-22 months"),
    ("eia_only", "M", "2-3 months"),
    ("eia_only", None, "10-22 months"),  # assume worst case
    ("cems_and_eia", "M", "1-3 months"),
    ("cems_and_eia", "A", "1-4 months"),
    ("cems_and_eia", None, "1-4 months"),  # assume worst case
]

# Start from "no_data_reported" so plants matched by no rule keep that label.
miso_data["reporting_lag"] = "no_data_reported"
for availability, frequency, lag in lag_rules:
    availability_matches = miso_data["data_availability"] == availability
    if frequency is None:
        frequency_matches = miso_data["reporting_frequency_code"].isna()
    else:
        frequency_matches = miso_data["reporting_frequency_code"] == frequency
    miso_data.loc[availability_matches & frequency_matches, "reporting_lag"] = lag

# Plants with no generation, fuel, or CO2 value at all are marked as having
# reported no data, overriding any lag assigned by the rules above.
reported_value_columns = [
    "net_generation_mwh",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb_for_electricity",
]
has_no_values = miso_data[reported_value_columns].isnull().all(axis=1)
miso_data.loc[has_no_values, "reporting_lag"] = "no_data_reported"
miso_data["reporting_lag"] = miso_data["reporting_lag"].astype("category")

miso_data

In [None]:
# Count MISO plants for each combination of reporting lag (rows) and emissions
# category (columns).
# `reporting_lag` is a categorical column, so `observed=False` is passed
# explicitly: every lag category stays in the table (combinations with no plants
# are counted as 0) and the result does not depend on the implicit default for
# `observed`, which pandas has deprecated.
plant_counts_by_emissions = (
    miso_data.groupby(
        ["reporting_lag", "emissions_category"], dropna=False, observed=False
    )["plant_id_eia"]
    .count()
    .reset_index()
)
plant_counts_by_emissions.pivot(
    index=["reporting_lag"],
    columns=["emissions_category"],
    values="plant_id_eia",
)

In [None]:
# Count MISO plants for each combination of reporting lag (rows) and fuel
# category (columns).
# `reporting_lag` is a categorical column, so `observed=False` is passed
# explicitly: every lag category stays in the table (combinations with no plants
# are counted as 0) and the result does not depend on the implicit default for
# `observed`, which pandas has deprecated.
plant_counts_by_fuel = (
    miso_data.groupby(
        ["reporting_lag", "fuel_category"], dropna=False, observed=False
    )["plant_id_eia"]
    .count()
    .reset_index()
)
plant_counts_by_fuel.pivot(
    index=["reporting_lag"],
    columns=["fuel_category"],
    values="plant_id_eia",
)

In [None]:
# Total net generation and CO2 for each emissions category / reporting lag pair,
# rounded to one decimal place.
# `reporting_lag` is a categorical column, so `observed=False` is passed
# explicitly to keep every lag category in the output without relying on the
# implicit default for `observed`, which pandas has deprecated.
(
    miso_data.groupby(
        ["emissions_category", "reporting_lag"], dropna=True, observed=False
    )[["net_generation_mwh", "co2_mass_lb_for_electricity"]]
    .sum()
    .round(1)
)

In [None]:
# Build the per-plant report: select the descriptive, reporting, and output
# columns, and rename the reporting frequency column for the exported file.
report_columns = [
    "plant_id_eia",
    "plant_name_eia",
    "state",
    "fuel_category",
    "emissions_category",
    "data_availability",
    "reporting_frequency_code",
    "reporting_lag",
    "net_generation_mwh",
    "co2_mass_lb_for_electricity",
]
plant_report = miso_data[report_columns].rename(
    columns={"reporting_frequency_code": "eia923_reporting_frequency"}
)

# NOTE(review): to_csv also writes the DataFrame index (the row labels carried
# over from plant_data) as an unnamed first column - confirm that is wanted, or
# pass index=False.
plant_report.to_csv(outputs_folder("miso_plant_reporting_analysis.csv"))

## What percentage of the EIA data we actually use is from annually-reporting plants?
Even though ~10% of the data in EIA-923 is reported annually, we do not directly use all of the EIA-923 data in our final results, since in some cases we have CEMS data that supersedes the EIA data. So what percentage of the EIA data we actually do use comes from annually-reporting plants?

In [None]:
# Share (in percent) of all allocated fuel, generation, and CO2 that comes from
# records whose hourly data source is not "cems", split by reporting frequency.
# NOTE(review): `eia923_allocated` is not assigned in any cell visible in this
# notebook - confirm where it is loaded so this cell runs on a fresh kernel.
output_columns = ["fuel_consumed_mmbtu", "net_generation_mwh", "co2_mass_lb"]

not_from_cems = eia923_allocated["hourly_data_source"] != "cems"
non_cems_by_frequency = (
    eia923_allocated[not_from_cems]
    .groupby(["reporting_frequency_code"], dropna=False)[output_columns]
    .sum()
)
overall_totals = eia923_allocated[output_columns].sum()

annual_eia_used = non_cems_by_frequency / overall_totals * 100
# Append a row holding the column-wise sum of the percentages above.
annual_eia_used.loc["Total Percent"] = annual_eia_used.sum()
annual_eia_used

In [None]:
# Show the row for reporting frequency code "A" under a descriptive label.
annual_row_label = "% of output data from EIA annual reporters"
annual_eia_used.loc["A", :].rename(annual_row_label)

## What percentage of the data that we get from multiple sources is annually reported?
For subplants where all of the data comes from EIA, the annual reporting may lead to lower-quality allocation of the data to each month and hour, but there should not be a risk of double counting. However, for some plants, we draw data for some months from CEMS and other months from EIA-923. If the annually-reported data was not allocated to the correct month (relative to the CEMS data), there is a risk that we may double-count or under-count data from that subplant.

In [None]:
# Keep only the allocated EIA-923 records that belong to subplants whose data
# comes from more than one hourly data source.
# NOTE(review): `eia923_allocated` is not assigned in any cell visible in this
# notebook - confirm where it is loaded so this cell runs on a fresh kernel.
subplant_keys = ["plant_id_eia", "subplant_id"]

# One row per (subplant, hourly_data_source) pair present in the data.
subplant_sources = eia923_allocated[
    subplant_keys + ["hourly_data_source"]
].drop_duplicates()

# A subplant that appears on more than one of those rows has multiple sources.
# duplicated() flags every repeat after the first, so a subplant with three or
# more sources would be listed several times; drop_duplicates() reduces the keys
# to one row per subplant so the merge below cannot multiply its records.
multi_source_keys = subplant_sources.loc[
    subplant_sources.duplicated(subset=subplant_keys), subplant_keys
].drop_duplicates()

# validate="m:1" raises if the key table ever contains a repeated subplant.
multi_source_subplants = eia923_allocated.merge(
    multi_source_keys, how="inner", on=subplant_keys, validate="m:1"
)


In [None]:
# what percent of the total EIA-923 data comes from subplants with annually-reported data and multiple sources?
# Grouped by "reporting_frequency_code", the column name used when the same
# frame is grouped for `annual_eia_used` and when the frequency is read from
# pudl_out.plants_eia860(); this cell previously grouped by
# "respondent_frequency", which no other cell in this notebook references.
# NOTE(review): `eia923_allocated` is not assigned in any cell visible in this
# notebook - confirm its column names once the loading cell is restored.
output_columns = ["fuel_consumed_mmbtu", "net_generation_mwh", "co2_mass_lb"]

multi_source_by_frequency = multi_source_subplants.groupby(
    ["reporting_frequency_code"], dropna=False
)[output_columns].sum()

multi_source_summary = (
    multi_source_by_frequency / eia923_allocated[output_columns].sum() * 100
)
# Append a row holding the column-wise sum of the percentages above.
multi_source_summary.loc["Total Percent"] = multi_source_summary.sum()
multi_source_summary

In [None]:
# Show the row for reporting frequency code "A" under a descriptive label.
multi_source_row_label = "% of output data mixing CEMS and annually-reported EIA data"
multi_source_summary.loc["A", :].rename(multi_source_row_label)

In [None]:
# Stack the "A" (annual reporter) row of each percentage table into a single
# comparison table, one labelled row per table, rounded to two decimals.
# NOTE(review): `data_from_annual` is not assigned in any cell visible in this
# notebook - confirm where it is computed so this cell runs on a fresh kernel.
summary_sources = [
    (data_from_annual, "% of EIA-923 input data from EIA annual reporters"),
    (annual_eia_used, "% of output data from EIA annual reporters"),
    (
        multi_source_summary,
        "% of output data mixing CEMS and annually-reported EIA data",
    ),
]
pd.concat(
    [
        pd.DataFrame(table.loc["A", :].rename(label).round(2)).T
        for table, label in summary_sources
    ],
    axis=0,
)