# Differences between CEMS-reporting and non-CEMS-reporting plants

Some key characteristics (from Greg):
* Nameplate capacity
* Capacity factor
* Primary fuel type
* Heat rate
* Prime mover type, esp for natural gas plants

In [4]:
import pandas as pd
import numpy as np

import plotly.express as px

In [5]:
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

from src.load_data import load_pudl_table

In [6]:
year = 2020

In [7]:
# gather plant IDs of reporting plants from cems data

cems = pd.read_csv(f"../data/output/cems_{year}_cleaned_20220415.csv")

In [21]:
# load 890 data, 923 data

eia890 = load_pudl_table("generators_eia860", year=year)
eia923 = load_pudl_table("generation_fuel_eia923", year=year)
gen923 = load_pudl_table("generation_eia923", year=year)

In [10]:
eia890.c

Unnamed: 0,plant_id_eia,generator_id,utility_id_eia,report_date,operational_status_code,operational_status,ownership_code,capacity_mw,summer_capacity_mw,summer_capacity_estimate,...,minimum_load_mw,uprate_derate_during_year,uprate_derate_completed_date,current_planned_operating_date,summer_estimated_capability_mw,winter_estimated_capability_mw,retirement_date,owned_by_non_utility,reactive_power_output_mvar,data_source
0,64880,75020,64382.0,2020-01-01,T,proposed,S,1.5,1.5,,...,,,,2021-05-01,,,,,,eia860
1,64879,PV,64366.0,2020-01-01,U,proposed,S,118.0,118.0,,...,,,,2021-12-01,,,,,,eia860
2,64879,BESS,64366.0,2020-01-01,U,proposed,S,66.0,33.0,,...,,,,2021-12-01,,,,,,eia860
3,64878,PV,64366.0,2020-01-01,U,proposed,S,130.0,130.0,,...,,,,2022-05-01,,,,,,eia860
4,64878,BESS,64366.0,2020-01-01,U,proposed,S,72.9,36.0,,...,,,,2022-05-01,,,,,,eia860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30048,1,WT1,63560.0,2020-01-01,OA,existing,S,0.5,0.1,,...,0.1,0.0,,,,,,,,eia860
30049,1,5,63560.0,2020-01-01,OA,existing,S,0.7,0.4,,...,0.3,0.0,,,,,,,,eia860
30050,1,3,63560.0,2020-01-01,OP,existing,S,0.5,0.3,,...,0.3,0.0,,,,,,,,eia860
30051,1,2,63560.0,2020-01-01,OP,existing,S,0.9,0.3,,...,0.3,0.0,,,,,,,,eia860


In [14]:
all_ids = set(eia890.plant_id_eia.unique())
all_ids.update(set(eia923.plant_id_eia.unique()))
all_ids.update(set(cems.plant_id_eia.unique()))

In [24]:
plants = pd.DataFrame(index = all_ids)
plants["in_CEMS"] = False
plants.loc[cems.plant_id_eia.unique(),"in_CEMS"] = True

In [25]:
px.pie(plants, names="in_CEMS")

# Capacity

In [26]:
plants["capacity"] = eia890.groupby("plant_id_eia").sum().capacity_mw

In [39]:
fig = px.histogram(plants, x="capacity", color="in_CEMS", log_y=True)
# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)

# Generation, capacity factor

In [31]:
plants["generation"] = eia923.groupby("plant_id_eia").sum().net_generation_mwh

In [40]:
fig = px.histogram(plants, x="generation", color="in_CEMS", log_y=True)
# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)

In [42]:
# todo I'm sure there's a built in python function for this
if year%4 == 0: 
    n_hours = 366*24
else:
    n_hours = 365*24

plants["capacity_factor"] = (plants["generation"]/n_hours)/plants["capacity"]

In [43]:
fig = px.histogram(plants, x="capacity_factor", color="in_CEMS")
# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)

In [41]:
plants

Unnamed: 0,in_CEMS,capacity,fuel_generation,gen_generation,generation,capacity_factor
1,False,4.0,3.470000e+02,,3.470000e+02,86.750000
2,False,53.9,1.391700e+05,,1.391700e+05,2582.003711
3,True,3615.5,1.049915e+07,10499146.0,1.049915e+07,2903.926431
4,False,225.0,5.546130e+05,,5.546130e+05,2464.946667
7,False,138.0,5.043500e+04,50435.0,5.043500e+04,365.471014
...,...,...,...,...,...,...
64876,False,10.0,,,,
64877,False,36.0,,,,
64878,False,202.9,,,,
64879,False,184.0,,,,


# Heat rate

Heat rate = energy consumed / generation, in mmBtu/MWh

In [47]:
plants["fuel_consumed"] = eia923.groupby("plant_id_eia").sum().fuel_consumed_mmbtu
plants["heat_rate"] = plants["fuel_consumed"]/plants["generation"]
# assume heat_rate = 0 should be NaN, these are plants that didn't consume anything
plants.loc[plants["heat_rate"]==0,"heat_rate"] = np.nan

In [49]:
fig = px.histogram(plants, x="heat_rate", color="in_CEMS", log_y=True)
# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)

# Use eGRID 

This avoids potential data issues with 860, 923 that are fixed by eGRID

In [8]:
egrid_plant = pd.read_excel(f'../data/egrid/egrid{year}_data.xlsx', 
                            sheet_name=f'PLNT{str(year)[-2:]}', 
                            header=1, 
                            usecols=["ORISPL", # Plant code
                            "PLHTRT", # heat rate
                             "CAPFAC",# capacity factor
                             "NAMEPCAP",# nameplate capacity
                             "CHPFLAG", # combined heat and power
                             "ELCALLOC", # CHP electric allocation factor
                             "PLCO2AN", # annual CO2 emissions (tons)
                             "PLPRMFL", # plant primary fuel
                             "PLFUELCT", # plant fuel category 
                             "NUMUNT", # number of units
                             "NUMGEN", # number of generators
                             "PLNGENAN"]) # annual generation 


In [32]:
# Fix eGRID IDs
# TODO move into helper function, this code is reused between here and data_pipeline

egrid_crosswalk = pd.read_csv('../data/egrid/egrid_static_tables/2020/table_C5_crosswalk_of_EIA_ID_to_EPA_ID.csv')
epaid_to_eiaid = dict(zip(list(egrid_crosswalk['EGRID ID']), list(egrid_crosswalk['EIA ID'])))
print(f" Updatating {len(egrid_plant[egrid_plant['ORISPL'].isin(list(egrid_crosswalk['EGRID ID']))])} plant codes from eGRID")

egrid_plant['plant_id_eia'] = egrid_plant['ORISPL'].map(lambda x: epaid_to_eiaid.get(x, x))
egrid_plant = egrid_plant.set_index("plant_id_eia")

 Updatating 5 plant codes from eGRID


In [34]:
cems_reporters = cems.plant_id_eia.unique()
print(f"{len(set(cems_reporters).difference(egrid_plant.index.unique()))} CEMS plants not in eGRID")
cems_reporters = list(set(cems_reporters).intersection(egrid_plant.index.unique()))

egrid_plant["in_CEMS"] = False
egrid_plant.loc[cems_reporters, "in_CEMS"] = True

13 CEMS plants not in eGRID


In [35]:
px.pie(egrid_plant, names="in_CEMS")

In [41]:
fig = px.histogram(egrid_plant, x="NAMEPCAP", color="in_CEMS", log_y=False, title="Capacity", histnorm='probability')
# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)

In [39]:
fig = px.histogram(egrid_plant, x="CAPFAC", color="in_CEMS", log_y=False, title="Capacity factor", histnorm='probability')
# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)

In [42]:
#px.bar(egrid_plant,color="PLPRMFL", x="in_CEMS")

KeyError: (nan, '', '', '', '')

In [43]:
egrid_plant.PLPRMFL

plant_id_eia
60814    WND
54452     NG
57053    DFO
58982    WAT
60243    DFO
        ... 
57967     NG
55479    SUB
56319    SUB
56596    SUB
6101     SUB
Name: PLPRMFL, Length: 12668, dtype: object