# Evaluate CEMS coverage

1) What plant-months included in 923 are entirely missing from CEMS? 
2) What plant months have underreported CEMS generation compared to their 923-reported generation? 
3) Is the gap between 923 and CEMS consistent with 930 data? 

*Note: use processed CEMS data output by data_pipeline notebook.*

In [10]:
import pandas as pd
import sqlalchemy as sa
import numpy as np

import plotly.express as px
import plotly.graph_objects as go

In [11]:
# Reporting year to analyze; used for the CEMS file name and the EIA-923 query
year = 2019

In [12]:
# Load the processed hourly CEMS table written by the data_pipeline notebook
# (instead of pulling CEMS directly from PUDL).
# NOTE(review): the original comment started to list the advantages of this
# over PUDL but was left unfinished.
cems_path = f"../data/output/cems{year}.csv"
cems = pd.read_csv(cems_path, parse_dates=["operating_datetime_utc"])

  cems = pd.read_csv(f"../data/output/cems{year}.csv", parse_dates=["operating_datetime_utc"])


In [13]:
# Relative location of the PUDL SQLite database, expressed as a SQLAlchemy URL
pudl_db = "sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite"
# Engine used by the pd.read_sql call that loads the EIA-923 table
pudl_engine = sa.create_engine(pudl_db)


# 1. What plant months in 923 are entirely missing from CEMS? 

In [14]:
# Preview the first rows of the processed CEMS table
cems.head()

Unnamed: 0.1,Unnamed: 0,plant_id_eia,operating_datetime_utc,co2_mass_tons,heat_content_mmbtu,gross_generation_mwh,gross_load_mw,report_date,gtn_ratio,net_generation_mwh,plant_id_epa,operating_time_hours,co2_mass_measurement_code,facility_id
0,0,3.0,2019-01-01 06:00:00+00:00,143.40294,7129.9,645.0,666.0,2019-01,0.961957,620.462497,3,1.0,Measured,1
1,1,3.0,2019-01-01 07:00:00+00:00,101.80844,7039.1,638.0,638.0,2019-01,0.961957,613.728795,3,1.0,Measured,1
2,2,3.0,2019-01-01 08:00:00+00:00,84.8182,6966.4004,626.0,626.0,2019-01,0.961957,602.185307,3,1.0,Measured,1
3,3,3.0,2019-01-01 09:00:00+00:00,86.04841,6969.3003,625.0,625.0,2019-01,0.961957,601.22335,3,1.0,Measured,1
4,4,3.0,2019-01-01 10:00:00+00:00,88.1431,6829.8,625.0,625.0,2019-01,0.961957,601.22335,3,1.0,Measured,1


In [15]:
# PUDL reports 923 monthly generation data across two tables, generation_eia923 and generation_fuel_eia923

# Load the generation_fuel_eia923 rows for the selected year.
# NOTE(review): the original comment said PUDL states this "only contains ~55% of reported generation"
# (https://catalystcoop-pudl.readthedocs.io/en/latest/data_sources/eia923.html); that figure appears to
# describe generation_eia923 rather than the table queried here -- confirm.
# Table is large, so filter in SQL before loading.
# The half-open range [Jan 1, next Jan 1) covers the whole year. The previous upper bound
# (<= Dec 30) would silently drop any row dated Dec 31.
gen_923 = pd.read_sql(
    f"""
    SELECT * FROM generation_fuel_eia923
    WHERE report_date >= '{year}-01-01'
    AND report_date < '{year + 1}-01-01'
    """,
    pudl_engine,
)
# Parse the report dates and mark them as UTC so they can be matched against tz-aware timestamps
gen_923["report_date"] = pd.to_datetime(gen_923["report_date"]).dt.tz_localize("UTC")
gen_923.head()

Unnamed: 0,plant_id_eia,report_date,energy_source_code,fuel_type_code_pudl,fuel_type_code_aer,prime_mover_code,fuel_consumed_units,fuel_consumed_for_electricity_units,fuel_mmbtu_per_unit,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,net_generation_mwh
0,1,2019-01-01 00:00:00+00:00,DFO,oil,DFO,IC,351.0,351.0,5.825,2045.0,2045.0,195.676
1,1,2019-02-01 00:00:00+00:00,DFO,oil,DFO,IC,392.0,392.0,5.825,2283.0,2283.0,217.498
2,1,2019-03-01 00:00:00+00:00,DFO,oil,DFO,IC,388.0,388.0,5.825,2260.0,2260.0,215.544
3,1,2019-04-01 00:00:00+00:00,DFO,oil,DFO,IC,357.0,357.0,5.825,2080.0,2080.0,198.082
4,1,2019-05-01 00:00:00+00:00,DFO,oil,DFO,IC,388.0,388.0,5.825,2260.0,2260.0,215.578


In [47]:
# Group 923 by plants because some plants report multiple fuel/prime mover combos in this table.
# numeric_only=True keeps only the numeric columns; without it, pandas >= 2.0 no longer drops the
# string columns (energy source, fuel type, prime mover) and tries to "sum" them instead.
# NOTE: fuel_mmbtu_per_unit is summed along with everything else, so for plants with more than
# one fuel/prime mover row it is no longer a meaningful per-unit heat content.
month_923 = gen_923.groupby(["plant_id_eia", "report_date"]).sum(numeric_only=True)

In [17]:
############################## IRRELEVANT: even annually reporting plants report monthly data. ###########################

# Some 923 data is reported monthly, some annually. To aggregate CEMS correctly, assign report_date to each CEMS 
# I *think* that PUDL converts dates to beginning-of-month: https://catalystcoop-pudl.readthedocs.io/en/latest/_modules/pudl/transform/eia923.html#_yearly_to_monthly_records
# TODO: this is slooooooooow, could maybe find a way to speed it up 
#       eg by assuming relationship between report date 

# cems["report_date"] = np.nan
# cems["report_date"] = pd.to_datetime(cems["report_date"])
# i=0
# for plant in cems.plant_id_eia.unique():
#     if i%100 == 0:
#         print(f"{i/len(cems.plant_id_eia.unique())}...",end="")
#     i += 1
#     if plant not in gen_923.plant_id_eia.index: # 35 CEMS plants missing from EIA-923
#         continue
#     # Collect 923 report dates for this plant 
#     this_plant_dates = gen_923[gen_923.plant_id_eia==plant].report_date
#     # At most we will have monthly resolution
#     for month in cems[cems.plant_id_eia == plant].operating_datetime_utc.dt.month.unique():
        
#         # What is the latest 923 report date before the CEMS report date month? 
#         if len(this_plant_dates[this_plant_dates.dt.month <= month])==0:
#             print(f"month {month}, plant {plant} has no matching report date")
#             continue
#         target_date = max(this_plant_dates[this_plant_dates.dt.month <= month])

#         # Set all rows with this plant and month 
#         cems.loc[(cems.plant_id_eia == plant) & (cems.operating_datetime_utc.dt.month==month),"923_report_date"] = target_date

In [48]:
# Assign report date: first day of the month (in UTC) in which each CEMS hour falls.
# This overwrites any report_date column already present in the loaded CSV.
# NOTE(review): months are cut on UTC boundaries; if 923 months are local-time months, hours
# near a month boundary land in the neighbouring month -- confirm whether this matters.
cems["report_date"] = pd.to_datetime(
    {
        "month": cems.operating_datetime_utc.dt.month,
        "year": cems.operating_datetime_utc.dt.year,
        "day": 1,
    }
).dt.tz_localize("UTC")
# Monthly plant totals. numeric_only=True makes the dropping of string columns
# (e.g. co2_mass_measurement_code) explicit; pandas >= 2.0 no longer drops them silently.
month_cems = cems.groupby(["plant_id_eia", "report_date"]).sum(numeric_only=True)

In [49]:
# Which plants appear in only one of the two data sets?
cems_plants = cems.plant_id_eia
eia_plants = gen_923.plant_id_eia
print(f"# plants in CEMS not in 923 = {np.setdiff1d(cems_plants, eia_plants).size}")
print(f"# plants in 923 not in CEMS = {np.setdiff1d(eia_plants, cems_plants).size}")
# Distinct plant counts in each source, for scale
print(f"\r\nTotal plants in 923 = {eia_plants.unique().size}")
print(f"Total plants in CEMS = {cems_plants.unique().size}")

# plants in CEMS not in 923 = 35
# plants in 923 not in CEMS = 8394

Total plants in 923 = 9755
Total plants in CEMS = 1396


In [50]:
# Add CEMS generation numbers (aggregated by month) to the 923 numbers to compare month-by-month missing data
cems_cols = ["co2_mass_tons", "heat_content_mmbtu", "gross_generation_mwh", "net_generation_mwh"]
# Suffix the CEMS columns so they cannot be confused with the 923 columns after the merge
month_cems_merge = month_cems[cems_cols].rename(columns={col: f"{col}_cems" for col in cems_cols})
# Drop any *_cems columns left over from an earlier run of this cell. Without this, re-running
# the cell merges them a second time and pandas renames the duplicates with _x/_y suffixes,
# which breaks the cells below.
kept_cols = [col for col in month_923.columns if col not in month_cems_merge.columns]
# Left merge on the (plant_id_eia, report_date) index: every 923 plant-month is kept, and
# plant-months without CEMS data get NaN in the *_cems columns
month_923 = month_923[kept_cols].merge(month_cems_merge, how="left", left_index=True, right_index=True)
month_923.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,fuel_consumed_units,fuel_consumed_for_electricity_units,fuel_mmbtu_per_unit,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,net_generation_mwh,co2_mass_tons_cems,heat_content_mmbtu_cems,gross_generation_mwh_cems,net_generation_mwh_cems
plant_id_eia,report_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2019-01-01 00:00:00+00:00,351.0,351.0,5.825,2829.0,2829.0,283.677,,,,
1,2019-02-01 00:00:00+00:00,392.0,392.0,5.825,3000.0,3000.0,298.067,,,,
1,2019-03-01 00:00:00+00:00,388.0,388.0,5.825,3064.0,3064.0,305.876,,,,
1,2019-04-01 00:00:00+00:00,357.0,357.0,5.825,2978.0,2978.0,298.932,,,,
1,2019-05-01 00:00:00+00:00,388.0,388.0,5.825,3052.0,3052.0,304.552,,,,


In [53]:
# Helper flag: True where a 923 plant-month has no usable CEMS data.
# Case 1: no CEMS record was merged in for this plant-month (NaN after the left merge).
no_cems_record = month_923["heat_content_mmbtu_cems"].isna()
# Case 2: CEMS reports zero heat input while 923 reports non-zero fuel consumption
# (33 rows, per the original comment).
zero_in_cems_only = (month_923["heat_content_mmbtu_cems"] == 0) & (month_923["fuel_consumed_mmbtu"] != 0)
month_923["missing_cems"] = no_cems_record | zero_in_cems_only

In [54]:
# How much fuel do plant-months missing from CEMS consume, compared with those present in CEMS?
not_cems_ids = np.setdiff1d(gen_923.plant_id_eia, cems.plant_id_eia)
shared_ids = np.intersect1d(cems.plant_id_eia, gen_923.plant_id_eia)

is_missing = month_923.missing_cems
fig = go.Figure()
# One histogram of 923 fuel consumption per group of plant-months
for row_mask, trace_name in ((~is_missing, "In CEMS and 923"), (is_missing, "Not in CEMS")):
    fig.add_trace(go.Histogram(x=month_923[row_mask]["fuel_consumed_mmbtu"], name=trace_name))

# Draw the two histograms on top of each other, semi-transparent so both stay visible
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
# Log-scale counts on the y axis
fig.update_yaxes(type="log", title_text="# of plant months")
fig.update_xaxes(title_text="Fuel consumed in 923 reporting period (mmbtu)")
fig.show()

In [55]:
# What fraction of 923 plant-months is entirely missing from CEMS?
n_missing = month_923.missing_cems.sum()
print(f"{n_missing/len(month_923)} plant-months in 923 are entirely missing in CEMS")

0.872479780792858 plant-months in 923 are entirely missing in CEMS


# 2. What plant-months have some CEMS data, but less than they report in 923? 

I don't know if this matters: the eGRID documentation says that CEMS-reported values are used in preference to 923 values (section 3.1.1.1 of the 2020 technical guide).

In [67]:
# Restrict to plant-months that do have CEMS data
has_cems = month_923.loc[~month_923.missing_cems].copy()
fuel_923 = has_cems["fuel_consumed_mmbtu"]
# NOTE: despite its name, this column is the relative shortfall (923 - CEMS) / 923:
# 0 means CEMS matches 923, positive means CEMS reports less fuel than 923, negative means more.
has_cems["fraction_reported_cems"] = (fuel_923 - has_cems["heat_content_mmbtu_cems"]) / fuel_923
# Only plot values with magnitude below 3 (this also removes inf/NaN from zero 923 fuel)
plausible = has_cems["fraction_reported_cems"].abs() < 3
px.histogram(has_cems.loc[plausible, "fraction_reported_cems"], log_y=False, range_x=(-1, 1))

In [70]:
# How does the # of plants not reporting cems change over time? 

# Sum the boolean flag so that each True counts one plant-month without usable CEMS data.
# (.count() tallies every non-null value, i.e. all 923 plants in the month, missing or not.)
counts = month_923.groupby("report_date")[["missing_cems"]].sum()
px.line(counts, x=counts.index, y="missing_cems")

In [71]:
# How does the fuel consumption of plants not reporting cems change over time?
# Keep only plant-months flagged as missing from CEMS, then total each column per month
missing_rows = month_923.loc[month_923.missing_cems]
no_cems = missing_rows.groupby("report_date").sum()

px.line(no_cems, x=no_cems.index, y="fuel_consumed_mmbtu")

In [113]:
# Side note: check whether the 923 fuel consumption columns are internally consistent,
# i.e. whether fuel_consumed_mmbtu == fuel_consumed_units * fuel_mmbtu_per_unit.

# Run the check on the row-level table (gen_923), not on month_923: the monthly groupby sums
# fuel_mmbtu_per_unit across a plant's fuel/prime mover rows, so units * per-unit heat content
# computed from the aggregate is not meaningful and shows up as a spurious "inconsistency".
# Rows with zero reported mmbtu are skipped because the relative gap would divide by zero.
fuel_rows = gen_923[gen_923.fuel_consumed_mmbtu != 0]
mmbtu_from_units = fuel_rows.fuel_consumed_units * fuel_rows.fuel_mmbtu_per_unit
mmbtu_gap = (fuel_rows.fuel_consumed_mmbtu - mmbtu_from_units) / fuel_rows.fuel_consumed_mmbtu

px.histogram(
    mmbtu_gap,
    log_y=True,
    labels={"value": "mmBtu gap = (fuel_consumed_mmbtu - fuel_consumed_units*fuel_mmbtu_per_unit)/fuel_consumed_mmbtu"},
    title="Inconsistencies in EIA-923 fuel consumption metrics",
)

In [114]:
# Display the merged 923 / CEMS monthly table, including the missing_cems flag
month_923

Unnamed: 0_level_0,Unnamed: 1_level_0,fuel_consumed_units,fuel_consumed_for_electricity_units,fuel_mmbtu_per_unit,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,net_generation_mwh,co2_mass_tons_cems,heat_content_mmbtu_cems,gross_generation_mwh_cems,net_generation_mwh_cems,missing_cems
plant_id_eia,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,351.0,351.0,5.825,2829.0,2829.0,283.677,,,,,True
1,2,392.0,392.0,5.825,3000.0,3000.0,298.067,,,,,True
1,3,388.0,388.0,5.825,3064.0,3064.0,305.876,,,,,True
1,4,357.0,357.0,5.825,2978.0,2978.0,298.932,,,,,True
1,5,388.0,388.0,5.825,3052.0,3052.0,304.552,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...
63918,9,122858.0,17573.0,1.040,127772.0,18276.0,1606.953,,,,,True
63918,10,101384.0,14502.0,1.040,105439.0,15082.0,1326.080,,,,,True
63918,11,77181.0,11040.0,1.040,80268.0,11482.0,1009.516,,,,,True
63918,12,108184.0,15474.0,1.040,112511.0,16093.0,1415.019,,,,,True


# 3. Does availability of CEMS data explain changes in CEMS relative to 930 over the year? 