# Differences between CEMS-reporting and non-CEMS-reporting plants

Some key characteristics (from Greg):
* Nameplate capacity
* Capacity factor
* Primary fuel type
* Heat rate
* Prime mover type, especially for natural gas plants

In [None]:
import pandas as pd
import numpy as np

import plotly.express as px

In [None]:
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

from src.load_data import load_pudl_table

In [None]:
# Data year; used below to select the CEMS file, the PUDL tables, and the eGRID workbook
year = 2020

In [None]:
# Load the cleaned CEMS output for the chosen year; its plant_id_eia values
# identify the plants that report to CEMS.
cems_path = f"../data/outputs/cems_{year}_cleaned_20220415.csv"
cems = pd.read_csv(cems_path)

In [None]:
# Load the EIA-860 generator table and the two EIA-923 generation tables.
# Note: the variable named eia890 holds the "generators_eia860" table.
eia890, eia923, gen923 = (
    load_pudl_table(table_name, year=year)
    for table_name in (
        "generators_eia860",
        "generation_fuel_eia923",
        "generation_eia923",
    )
)

In [None]:
# List the available columns of the generator table.
# (The original `eia890.c` was a truncated attribute access that raises AttributeError.)
eia890.columns

In [None]:
# Union of every plant ID seen in the generator table, the EIA-923 fuel table, or CEMS.
all_ids = set().union(
    *(table.plant_id_eia.unique() for table in (eia890, eia923, cems))
)

In [None]:
# One row per plant ID, flagged True when the plant appears in the CEMS data.
plants = pd.DataFrame(index=all_ids)
plants["in_CEMS"] = plants.index.isin(cems.plant_id_eia.unique())

In [None]:
# Pie chart of how many plants fall in each in_CEMS category.
px.pie(data_frame=plants, names="in_CEMS")

# Capacity

In [None]:
# Total capacity per plant (sum of capacity_mw over its generators).
# Select the column before aggregating so only capacity_mw is summed; summing the
# whole frame is wasteful and can fail on non-numeric columns in recent pandas.
plants["capacity"] = eia890.groupby("plant_id_eia")["capacity_mw"].sum()

In [None]:
# Capacity distribution, split by CEMS membership (log-scaled counts).
fig = px.histogram(data_frame=plants, x="capacity", color="in_CEMS", log_y=True)
# Overlay the two groups and make them semi-transparent so both remain visible.
fig.update_layout(barmode="overlay").update_traces(opacity=0.75)

# Generation, capacity factor

In [None]:
# Annual net generation per plant (sum of net_generation_mwh over its EIA-923 rows).
# Select the column before aggregating so only net_generation_mwh is summed; summing
# the whole frame is wasteful and can fail on non-numeric columns in recent pandas.
plants["generation"] = eia923.groupby("plant_id_eia")["net_generation_mwh"].sum()

In [None]:
# Net generation distribution, split by CEMS membership (log-scaled counts).
fig = px.histogram(data_frame=plants, x="generation", color="in_CEMS", log_y=True)
# Overlay the two groups and make them semi-transparent so both remain visible.
fig.update_layout(barmode="overlay").update_traces(opacity=0.75)

In [None]:
# Number of hours in the data year. calendar.isleap applies the full Gregorian rule;
# a bare `year % 4` test misclassifies century years such as 1900 or 2100.
import calendar

n_hours = (366 if calendar.isleap(year) else 365) * 24

# Capacity factor = average output over the year (MWh / hours) divided by capacity.
plants["capacity_factor"] = (plants["generation"] / n_hours) / plants["capacity"]

In [None]:
# Capacity factor distribution, split by CEMS membership.
fig = px.histogram(data_frame=plants, x="capacity_factor", color="in_CEMS")
# Overlay the two groups and make them semi-transparent so both remain visible.
fig.update_layout(barmode="overlay").update_traces(opacity=0.75)

In [None]:
# Display the per-plant table built so far for visual inspection
plants

# Heat rate

Heat rate = energy consumed / generation, in mmBtu/MWh

In [None]:
# Total fuel consumed per plant (mmBtu). Select the column before aggregating so only
# fuel_consumed_mmbtu is summed; summing the whole frame is wasteful and can fail on
# non-numeric columns in recent pandas.
plants["fuel_consumed"] = eia923.groupby("plant_id_eia")["fuel_consumed_mmbtu"].sum()
# Heat rate = fuel consumed / net generation
plants["heat_rate"] = plants["fuel_consumed"]/plants["generation"]
# assume heat_rate = 0 should be NaN, these are plants that didn't consume anything
plants.loc[plants["heat_rate"]==0,"heat_rate"] = np.nan

In [None]:
# Heat rate distribution, split by CEMS membership (log-scaled counts).
fig = px.histogram(data_frame=plants, x="heat_rate", color="in_CEMS", log_y=True)
# Overlay the two groups and make them semi-transparent so both remain visible.
fig.update_layout(barmode="overlay").update_traces(opacity=0.75)

# Use eGRID 

Using eGRID avoids potential data issues in the EIA-860 and EIA-923 data that eGRID has already fixed.

In [None]:
# Columns to keep from the eGRID PLNTyy sheet.
egrid_usecols = [
    "ORISPL",  # Plant code
    "PLHTRT",  # heat rate
    "CAPFAC",  # capacity factor
    "NAMEPCAP",  # nameplate capacity
    "CHPFLAG",  # combined heat and power
    "ELCALLOC",  # CHP electric allocation factor
    "PLCO2AN",  # annual CO2 emissions (tons)
    "PLPRMFL",  # plant primary fuel
    "PLFUELCT",  # plant fuel category
    "NUMUNT",  # number of units
    "NUMGEN",  # number of generators
    "PLNGENAN",  # annual generation
]

# Read the sheet for the chosen year; the column names are taken from the second row.
egrid_plant = pd.read_excel(
    f'../data/downloads/egrid/egrid{year}_data.xlsx',
    sheet_name=f'PLNT{str(year)[-2:]}',
    header=1,
    usecols=egrid_usecols,
)


In [None]:
# Fix eGRID IDs
# TODO move into helper function, this code is reused between here and data_pipeline

# Crosswalk table with one 'EGRID ID' and one 'EIA ID' per row.
egrid_crosswalk = pd.read_csv('../data/manual/egrid_static_tables/table_C5_crosswalk_of_EIA_ID_to_EPA_ID.csv')
epaid_to_eiaid = dict(zip(egrid_crosswalk['EGRID ID'], egrid_crosswalk['EIA ID']))

# Count the eGRID rows whose plant code appears in the crosswalk.
n_remapped = egrid_plant['ORISPL'].isin(egrid_crosswalk['EGRID ID']).sum()
print(f" Updating {n_remapped} plant codes from eGRID")

# Translate the codes found in the crosswalk; all other codes are kept unchanged.
egrid_plant['plant_id_eia'] = egrid_plant['ORISPL'].map(lambda x: epaid_to_eiaid.get(x, x))
egrid_plant = egrid_plant.set_index("plant_id_eia")

In [None]:
# Compare the set of CEMS plant IDs with the plant IDs present in eGRID.
cems_ids = set(cems.plant_id_eia.unique())
egrid_ids = set(egrid_plant.index.unique())
print(f"{len(cems_ids - egrid_ids)} CEMS plants not in eGRID")

# Keep only the CEMS plants that eGRID knows about, then flag them.
cems_reporters = list(cems_ids & egrid_ids)
egrid_plant["in_CEMS"] = False
egrid_plant.loc[cems_reporters, "in_CEMS"] = True

In [None]:
# Pie chart of how many eGRID plants fall in each in_CEMS category.
px.pie(data_frame=egrid_plant, names="in_CEMS")

In [None]:
# Nameplate capacity from eGRID, split by CEMS membership and normalized to probabilities.
fig = px.histogram(
    data_frame=egrid_plant,
    x="NAMEPCAP",
    color="in_CEMS",
    log_y=False,
    title="Capacity",
    histnorm="probability",
)
# Overlay the two groups and make them semi-transparent so both remain visible.
fig.update_layout(barmode="overlay").update_traces(opacity=0.75)

In [None]:
# Capacity factor from eGRID, split by CEMS membership and normalized to probabilities.
fig = px.histogram(
    data_frame=egrid_plant,
    x="CAPFAC",
    color="in_CEMS",
    log_y=False,
    title="Capacity factor",
    histnorm="probability",
)
# Overlay the two groups and make them semi-transparent so both remain visible.
fig.update_layout(barmode="overlay").update_traces(opacity=0.75)

In [None]:
#px.bar(egrid_plant,color="PLPRMFL", x="in_CEMS")

In [None]:
# Show the plant primary fuel column.
egrid_plant["PLPRMFL"]