In [1]:
# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

In [2]:
# import the necessary packages
%reload_ext autoreload
%autoreload 2

# import packages
import os
import requests
import tarfile
import sqlalchemy as sa
from pathlib import Path
import pandas as pd
import plotly.express as px
import numpy as np

import src.data_cleaning as data_cleaning
import src.gross_to_net_generation as gross_to_net_generation
import src.load_data as load_data
import src.distribute_eia923 as distribute_eia923

  from pandas import Int64Index as NumericIndex


# Specify the year for analysis

In [4]:
# Data year for the whole notebook: passed to the download/load helpers below
# and embedded in the eGRID input and CEMS output file names.
year = 2020

# 1. Download data

 - Downloads the pre-cleaned PUDL versions of EIA-923, EIA-860, and EPA CEMS data  
 - Downloads EPA eGRID data  
 - Downloads EIA-930 data  
 - Downloads the EPA Power Sector Data Crosswalk

TODO
- [x] The code for downloading the files could probably be made into functions
- [ ] Investigate other packages besides `requests` that would download these files faster

In [6]:
# --- PUDL database (pre-cleaned EIA-923, EIA-860 and EPA CEMS) ---
pudl_zenodo_url = 'https://zenodo.org/record/6349861/files/pudl-v0.6.0-2022-03-12.tgz'
load_data.download_pudl_data(zenodo_url=pudl_zenodo_url)

# --- eGRID data ---
# the 2019 and 2020 workbooks appear to be hosted under different url patterns
egrid_files_to_download = [
    'https://www.epa.gov/sites/default/files/2021-02/egrid2019_data.xlsx',
    'https://www.epa.gov/system/files/documents/2022-01/egrid2020_data.xlsx',
]
load_data.download_egrid_files(egrid_files_to_download)

# --- EIA-930 data (only the analysis year) ---
load_data.download_eia930_data(years_to_download=[year])

# --- EPA Power Sector Data Crosswalk ---
# NOTE: Check for new releases at https://github.com/USEPA/camd-eia-crosswalk
psdc_release_url = 'https://github.com/USEPA/camd-eia-crosswalk/releases/download/v0.2.1/epa_eia_crosswalk.csv'
load_data.download_epa_psdc(psdc_url=psdc_release_url)


PUDL data already downloaded
egrid2019_data.xlsx already downloaded
egrid2020_data.xlsx already downloaded
2020_Jan_Jun data already downloaded
2020_Jul_Dec data already downloaded
epa_eia_crosswalk.csv already downloaded


# Load emissions data reported to CEMS
There are three broad categories of plants based on their CAMD reporting status:
1. Units that report to CAMD year-round (for these plants, emissions data is used directly from CEMS)
2. Units that only report to CAMD during the ozone season (May-Sept) (for these units, non-ozone season data is taken from EIA 923)
3. Units that do not report to CAMD (generally fossil units < 25MW and non-fossil generators)

There are also certain plants that report to CAMD but do not produce electricity for the grid, and need to be removed from the CEMS data:
- Non grid connected plants
- Steam-only plants


In [None]:
# NOTE: all of the functions in this section could be run by calling clean_cems()
#cems = data_cleaning.clean_cems(year)

In [102]:
# Load the hourly CEMS data and run it through the cleaning steps in order.
# TODO: identify and remove any hourly values that appear to be outliers
#       (would slot in just before the missing-CO2 fill)
cems = (
    load_data.load_cems_data(year)
    # drop plants that are not connected to the grid
    .pipe(data_cleaning.remove_non_grid_connected_plants)
    # drop plants that only report steam generation and no electrical generation
    .pipe(data_cleaning.remove_heating_only_plants)
    # add a report date
    .pipe(data_cleaning.add_report_date)
    # identify cems reporting status
    .pipe(data_cleaning.determine_cems_reporting_status)
    # fill in missing hourly emissions data using the fuel type and heat input
    .pipe(data_cleaning.fill_cems_missing_co2, year)
)

Removing 45 plants that are not grid-connected
Removing 77 plants that only produce heat and no power


In [103]:
# Find the hourly records that still have no CO2 value after the fill step
# TODO: Try to identify fuel types
missing_co2_rows = cems[cems['co2_mass_tons'].isnull()]

units_with_no_fuel_type = list(missing_co2_rows['cems_id'].unique())
print(f"Unable to find fuel types for the following plants_units: {units_with_no_fuel_type}")

# display the affected records
missing_co2_rows

Unable to find fuel types for the following plants_units: ['1004_CTG1']


Unnamed: 0,plant_id_eia,unitid,cems_id,operating_datetime_utc,operating_time_hours,gross_load_mw,gross_generation_mwh,steam_load_1000_lbs,heat_content_mmbtu,co2_mass_tons,co2_mass_measurement_code,plant_id_epa,unit_id_epa,report_date,cems_reporting_category,energy_source_code
10900967,1004,CTG1,1004_CTG1,2020-04-10 04:00:00+00:00,0.03,0.0,0.0,,0.0009,,Measured,1004,90673,2020-04-01,full_year,SGC
11492782,1004,CTG1,1004_CTG1,2020-11-02 03:00:00+00:00,0.03,0.0,0.0,,0.0009,,Measured,1004,90673,2020-11-01,full_year,SGC


In [104]:
# For now, drop every record belonging to a unit that still has missing CO2 values
is_unresolved_unit = cems['cems_id'].isin(units_with_no_fuel_type)
cems = cems[~is_unresolved_unit]

In [105]:
# Remove any observations from cems where zero operation is reported for an entire month.
# Although this data could be considered to be accurately reported, we remove it so that
# those months can be double checked against (and filled from) the EIA data instead.
# NOTE(review): the helper's grouping level (unit vs. plant) is not visible here - confirm.
# TODO: check if any of these observations are from geothermal generators
cems = data_cleaning.remove_cems_with_zero_monthly_emissions(cems)

removing 6958152 observations from cems


In [106]:
# Add information about the balancing authority to the CEMS records.
# When re-running this cell on a frame that already has BA columns, uncomment the
# drop below first (presumably to avoid duplicated columns - TODO confirm).
#cems = cems.drop(columns=['ba_code','state'])
cems = data_cleaning.assign_ba_code_to_plant(cems, year)

## Explore outlier detection
We need to come up with a method that filters out observations that are significantly higher than normal operation. The challenge is that some plants only operate a handful of hours each year, so their operation looks spiky, and typical detection methods would flag that operation as an outlier.

In [None]:
# Per-unit statistics of non-zero hourly heat input.
# Zeros are masked to NaN so that idle hours do not drag down the mean / inflate the std.
# The mask is built once and applied to the heat input column only (the original rebuilt
# a zero-masked copy of the whole frame three times), and the results no longer shadow
# the builtin `max`. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
nonzero_heat_by_unit = cems['heat_content_mmbtu'].replace(0, np.nan).groupby(cems['cems_id'])

heat_max = nonzero_heat_by_unit.max()
heat_mean = nonzero_heat_by_unit.mean()
heat_stdev = nonzero_heat_by_unit.std()

# units whose maximum hourly heat input is more than 3 standard deviations above their mean
heat_max[heat_max > heat_mean + (3 * heat_stdev)]

In [None]:
# hourly heat input time series for a single unit
unit_hourly = cems[cems['cems_id'] == '10298_CG803']
px.line(unit_hourly, x='operating_datetime_utc', y='heat_content_mmbtu')

In [None]:
# hourly heat input time series for a single unit
unit_hourly = cems[cems['cems_id'] == '1012_2']
px.line(unit_hourly, x='operating_datetime_utc', y='heat_content_mmbtu')

# 2. Get monthly data for all plants/units where data is missing from CEMS
We have now identified all plants that report the full year to CEMS, and all plants that report a partial year. We will now use the EIA-923 data to fill in the missing pieces.

1. Load EIA-923 data, and standardize heat input and generation data across the tables
2. Identify all plants/months for which we do not have CEMS data

We need to be able to match the EIA data to the CEMS data based on units so we know which data will be used to fill the missing data


In [194]:
# Distribute net generation and heat input data reported by the three different EIA-923 tables
# NOTE: this code was copied and modified from `pudl.analysis.allocate_net_gen`
# NOTE: this code allocates net generation based on the proportion of net generation reported, rather than by nameplate capacity (which eGRID does)
# NOTE: the code was modified to perform the allocation on a monthly basis, rather than an annual basis

# HIGH PRIORITIES
# DONE: remove non-grid connected plants from this dataframe
# DONE: Denormalize data by balancing authority/state. BA assignment from EIA-860
# TODO: look into whether net_gen from generation table should be preserved if available
# TODO: figure out what happens when each table has different values for net gen (allocate difference, or take precedence)
# DONE: add nuclear generators to this list
# TODO: Remove any plants located in Puerto Rico

# FUEL ASSIGNMENT
# DONE: calculate total emissions from gf based on fuel and heat input and distribute in addition to net generation and fuel consumed
# TODO: when aggregating back to generator records, keep the fuel type that accounted for most heat input
# TODO: denormalize data by fuel type both primary fuel type by generator, and primary fuel by plant (assuming that's how reported to ISOs)
# primary fuel type is currently assigned based on the annual primary fuel type. This should be changed to assign base on monthly fuel type

# NEXT PRIORITIES
# TODO: allocate heat input data from boiler_fuel_eia923() See: https://github.com/catalyst-cooperative/pudl/pull/1096
# TODO: Also distribute heat input for electricity consumption

# LOWER PRIORITIES
# TODO: fix allocation of net generation when reported net generation is negative?
# TODO: investigate generators for which frac column is not adding to 1.0

gen_fuel_allocated = distribute_eia923.allocate_gen_fuel_by_gen(year=year)

# flag any generator-months for which we already have cems data
gen_fuel_allocated = data_cleaning.identify_emissions_data_source(cems, gen_fuel_allocated)

# keep, in a separate dataframe, only the generator-months that are not covered by CEMS
# and that actually have a fuel consumption value to distribute
is_eia_only = gen_fuel_allocated['data_source'].eq('eia_only')
has_fuel_data = gen_fuel_allocated['fuel_consumed_mmbtu'].notna()
monthly_eia_data_to_distribute = gen_fuel_allocated[is_eia_only & has_fuel_data]

gen_fuel_allocated.sample(10)

Removing 0 plants that are not grid-connected
Removing 0 plants that are not grid-connected
Removing 1 plants that are not grid-connected


        plant_id_eia prime_mover_code energy_source_code report_date  frac  \
3413             377               CA                 NG  2020-04-01   2.0   
3414             377               CT                 NG  2020-04-01   2.0   
3441             377               CA                 NG  2020-11-01   2.0   
3442             377               CT                 NG  2020-11-01   2.0   
3445             377               CA                 NG  2020-12-01   2.0   
...              ...              ...                ...         ...   ...   
105133         58207               CA                 NG  2020-03-01   4.0   
105137         58207               CA                 NG  2020-04-01   4.0   
105145         58207               CA                 NG  2020-06-01   4.0   
105165         58207               CA                 NG  2020-11-01   4.0   
105169         58207               CA                 NG  2020-12-01   4.0   

        net_generation_mwh_g_tbl  frac_fuel  net_generation_mwh

Unnamed: 0,plant_id_eia,generator_id,report_date,net_generation_mwh,fuel_consumed_mmbtu,co2_mass_tons,ba_code,state,energy_source_code_1,data_source
2166,78,2,2020-03-01,6687.304148,58667.7289,0.0,,AK,WAT,eia_only
215834,58805,INV1,2020-05-01,79.5648,698.0,0.0,DUK,NC,SUN,eia_only
226576,59604,FLS1,2020-04-01,874.19,7669.0,0.0,DUK,NC,SUN,eia_only
104365,8906,3,2020-06-01,29172.0,398383.819864,23304.477789,NYIS,NY,NG,cems
120926,50447,GEN6,2020-11-01,235.11,2062.666667,0.0,ISNE,ME,WAT,eia_only
192965,57443,GEN2,2020-11-01,539.25,5959.125,378.404438,MISO,MI,LFG,eia_only
28145,1330,7,2020-02-01,0.0,0.0,0.0,SWPP,KS,DFO,eia_only
211640,58527,1,2020-11-01,63.827,560.0,0.0,IID,CA,SUN,eia_only
53399,2758,2,2020-11-01,8967.18075,78669.0,0.0,PJM,NC,WAT,eia_only
255524,61722,IRV1W,2020-10-01,-214.398818,0.0,0.0,CISO,CA,MWH,eia_only


In [176]:
# which generator-months consumed fuel but still have no co2 value?
co2_is_missing = gen_fuel_allocated['co2_mass_tons'].isna()
consumed_fuel = gen_fuel_allocated['fuel_consumed_mmbtu'].gt(0)
gen_fuel_allocated[co2_is_missing & consumed_fuel]

Unnamed: 0,plant_id_eia,generator_id,report_date,net_generation_mwh,fuel_consumed_mmbtu,co2_mass_tons,ba_code,state,energy_source_code_1,data_source
123496,50626,GEN1,2020-01-01,742.746,381046.0,,MISO,LA,OTH,eia_only
123497,50626,GEN1,2020-02-01,700.137,359168.0,,MISO,LA,OTH,eia_only
123498,50626,GEN1,2020-03-01,654.929,335974.0,,MISO,LA,OTH,eia_only
123499,50626,GEN1,2020-04-01,628.143,322234.0,,MISO,LA,OTH,eia_only
123500,50626,GEN1,2020-05-01,562.301,288461.0,,MISO,LA,OTH,eia_only
123501,50626,GEN1,2020-06-01,626.327,321326.0,,MISO,LA,OTH,eia_only
123502,50626,GEN1,2020-07-01,702.505,360388.0,,MISO,LA,OTH,eia_only
123503,50626,GEN1,2020-08-01,836.592,429186.0,,MISO,LA,OTH,eia_only
123504,50626,GEN1,2020-09-01,730.017,374505.0,,MISO,LA,OTH,eia_only
123505,50626,GEN1,2020-10-01,829.739,425649.0,,MISO,LA,OTH,eia_only


In [165]:
# look at all allocated records for one plant
gen_fuel_allocated.loc[gen_fuel_allocated['plant_id_eia'].eq(54262)]

Unnamed: 0,plant_id_eia,generator_id,report_date,net_generation_mwh,fuel_consumed_mmbtu,co2_mass_tons,ba_code,state,energy_source_code_1,data_source
132599,54262,1,2020-01-01,,,,MISO,MN,NG,eia_only
132600,54262,1,2020-02-01,,,,MISO,MN,NG,eia_only
132601,54262,1,2020-03-01,,,,MISO,MN,NG,eia_only
132602,54262,1,2020-04-01,,,,MISO,MN,NG,eia_only
132603,54262,1,2020-05-01,,,,MISO,MN,NG,eia_only
...,...,...,...,...,...,...,...,...,...,...
132659,54262,7,2020-08-01,5.608434,59.337349,4.845488,MISO,MN,DFO,eia_only
132660,54262,7,2020-09-01,6.198795,64.759036,5.288223,MISO,MN,DFO,eia_only
132661,54262,7,2020-10-01,4.427711,45.481928,3.714054,MISO,MN,DFO,eia_only
132662,54262,7,2020-11-01,6.789157,64.759036,5.288223,MISO,MN,DFO,eia_only


In [18]:
# Investigate plants that don't have fuel codes.
# Loads the plant entity table and the EIA-923 generation_fuel table for the analysis
# year; the commented-out lines are ad-hoc lookups kept for reuse.
plants = load_data.load_pudl_table("plants_entity_eia")
gf = load_data.load_pudl_table("generation_fuel_eia923", year=year)
#plants[plants['plant_id_eia'] == 50626]
#gf[gf['plant_id_eia'] == 50626]
# look at plants whose name contains 'refin' (i.e. refineries)
#gf[gf['plant_id_eia'].isin(list(plants.fillna('').loc[plants.fillna('')['plant_name_eia'].str.contains('refin', case=False)]['plant_id_eia']))]

In [146]:
# What share of each numeric total comes from generators covered by CEMS vs EIA only?
# `numeric_only=True` is explicit because pandas >= 2.0 no longer silently drops
# non-numeric columns (dates, ids, codes) from groupby sums; the aggregate is also
# computed once instead of twice.
# NOTE: the `plant_id_eia` column of the result is a share of summed ids and has no
# physical meaning; it is only kept so the table matches the previous output.
totals_by_source = gen_fuel_allocated.groupby('data_source').sum(numeric_only=True)
totals_by_source / totals_by_source.sum(axis=0)

Unnamed: 0_level_0,plant_id_eia,net_generation_mwh,fuel_consumed_mmbtu,co2_mass_tons
data_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cems,0.104032,0.559486,0.51684,0.832288
eia_only,0.895968,0.440514,0.48316,0.167712


## Calculate CEMS net generation
Now that we have accurate net generation data from EIA, we can use this to calculate a net generation ratio to convert the CEMS gross generation to hourly net generation

For now, we will calculate the allocation at the plant level. However, in the future, we may want to calculate for each EPA unit, which will require developing a scheme for allocating each EPA unit to EIA generator

In [198]:
# Convert CEMS gross generation to net generation using the allocated EIA-923 data.
# Judging by the commented-out drop below, the call appears to add the columns
# `net_generation_mwh`, `gross_to_net_ratio` and `net_gen_method`; uncomment the drop
# before re-running this cell on a frame that already has them.
#cems = cems.drop(columns=['net_generation_mwh','gross_to_net_ratio','net_gen_method'])
cems = data_cleaning.convert_gross_to_net_generation(cems, gen_fuel_allocated)

  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  return self.resid / sigma / np.sqrt(1 - hii)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return self.resid / sigma / np.sqrt(1 - hii)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - self.ssr/self.centered_tss
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature

In [199]:
# What percent of gross generation was allocated using each method?
# Select the column before summing: summing every column of the hourly frame is wasteful
# and fails on the datetime columns under pandas >= 2.0.
gross_gen_by_method = cems.groupby('net_gen_method', dropna=False)['gross_generation_mwh'].sum()
gross_gen_by_method / cems['gross_generation_mwh'].sum()

net_gen_method
annual_regression    0.032129
monthly_ratio        0.967871
Name: gross_generation_mwh, dtype: float32

# Adjust emissions
We next need to make certain adjustments to the data:
 - [ ] Calculate emissions for Geothermal plants
 - [ ] Adjust heat input/emissions from CHP plants by proportion used for electric generation
 - EPA adjusts biomass emissions, but not sure if we want to do that. Need to look into it more

# Output CEMS data

In [22]:
# Write the cleaned hourly CEMS records that have positive CO2 to csv for others to use
# (update the date stamp below when re-exporting)
export_columns = [
    'plant_id_eia', 'unitid', 'operating_datetime_utc',
    'gross_generation_mwh', 'net_generation_mwh', 'steam_load_1000_lbs',
    'heat_content_mmbtu', 'co2_mass_tons',
    'report_date', 'cems_reporting_category', 'energy_source_code',
    'ba_code', 'state',
]
has_positive_co2 = cems['co2_mass_tons'] > 0
cems_for_export = cems.loc[has_positive_co2, export_columns]

date = '20220415'
cems_for_export.to_csv(f'../data/output/cems_{year}_cleaned_{date}.csv')

# Compare results to eGRID totals

Before we allocate the data to the hourly level, we should double check that the total annual emissions / generation values match the "official" data published in eGRID at the annual level.

In [200]:
# Aggregate total calculated values
###################################

plant_keys = ['ba_code', 'state', 'plant_id_eia']

# Annual plant-level totals from each source. The value columns are selected BEFORE
# summing: pandas >= 2.0 no longer drops non-numeric columns from groupby sums, so
# summing the whole frame fails on the datetime columns.
cems_plant_annual = (
    cems.groupby(plant_keys, dropna=False)[['net_generation_mwh', 'heat_content_mmbtu', 'co2_mass_tons']]
    .sum()
    .rename(columns={'heat_content_mmbtu': 'heat_input_mmbtu'})
    .reset_index()
)
eia_plant_annual = (
    monthly_eia_data_to_distribute.groupby(plant_keys, dropna=False)[['net_generation_mwh', 'fuel_consumed_mmbtu', 'co2_mass_tons']]
    .sum()
    .rename(columns={'fuel_consumed_mmbtu': 'heat_input_mmbtu'})
    .reset_index()
)

# combine cems and eia data
plant_annual_total = pd.concat([cems_plant_annual, eia_plant_annual], axis=0)

# group any plants that have records from both datasets
plant_annual_total = plant_annual_total.groupby(plant_keys, dropna=False).sum().reset_index()

# For plants that have different EPA and EIA plant IDs, the plant ID in eGRID is usually the EPA ID, but sometimes the EIA ID
# however, there are sometimes 2 EIA IDs for a single eGRID ID, so we need to group the data in the EIA table by the egrid id
# We need to update all of the egrid plant IDs to the EIA plant IDs
egrid_crosswalk = pd.read_csv('../data/egrid/egrid_static_tables/2020/table_C5_crosswalk_of_EIA_ID_to_EPA_ID.csv')
eia_to_egrid_id = dict(zip(egrid_crosswalk['plant_id_eia'], egrid_crosswalk['plant_id_egrid']))
egrid_to_eia_id = dict(zip(egrid_crosswalk['plant_id_egrid'], egrid_crosswalk['plant_id_eia']))

# eGRID id = crosswalk value where one exists, otherwise the EIA id itself.
# Built as a plain column assignment: the previous `df[col].update(...)` is a chained
# assignment, which silently does nothing under pandas Copy-on-Write.
calc_eia_ids = plant_annual_total['plant_id_eia']
plant_annual_total['plant_id_egrid'] = (
    calc_eia_ids.map(eia_to_egrid_id).fillna(calc_eia_ids).astype(calc_eia_ids.dtype)
)

# Load the eGRID plant table
############################

# load plant level data from egrid
egrid_plant = pd.read_excel(f'../data/egrid/egrid{year}_data.xlsx', 
                            sheet_name=f'PLNT{str(year)[-2:]}', 
                            header=1, 
                            usecols=['BACODE','PSTATABB', 'PLPRMFL','ORISPL', 'PNAME','PLGENATN', 'PLGENATR', 'PLHTIANT','UNCO2','PLCO2AN'])
# calculate total net generation from reported renewable and nonrenewable generation
egrid_plant['net_generation_mwh'] = egrid_plant['PLGENATN'] + egrid_plant['PLGENATR']
egrid_plant = egrid_plant.drop(columns=['PLGENATN', 'PLGENATR'])
# rename the columns
egrid_plant = egrid_plant.rename(columns={'BACODE':'ba_code',
                                          'PSTATABB':'state',
                                          'PLPRMFL':'energy_source_code',
                                          'ORISPL':'plant_id_egrid',
                                          'PNAME':'plant_name',
                                          'PLHTIANT':'heat_input_mmbtu',
                                          'UNCO2':'co2_mass_tons_unadjusted',
                                          'PLCO2AN':'co2_mass_tons'})

# if egrid has a missing value for co2 for a clean plant, replace with zero
clean_fuels = ['SUN','MWH','WND', 'WAT','WH','PUR','NUC']
is_clean_plant = egrid_plant['energy_source_code'].isin(clean_fuels)
for co2_col in ['co2_mass_tons', 'co2_mass_tons_unadjusted']:
    egrid_plant.loc[is_clean_plant, co2_col] = egrid_plant.loc[is_clean_plant, co2_col].fillna(0)

# reorder the columns                                         
egrid_plant = egrid_plant[['ba_code', 'state', 'plant_id_egrid', 'plant_name', 'net_generation_mwh', 'heat_input_mmbtu', 'co2_mass_tons', 'co2_mass_tons_unadjusted']]

# remove any plants that have no reported data
# NOTE: it seems that egrid includes a lot of proposed projects that are not yet operating, but just has missing data for them
# (the test is "row sum == 0", so it matches rows that are all missing as well as rows whose values sum to zero)
egrid_value_cols = ['net_generation_mwh','heat_input_mmbtu','co2_mass_tons','co2_mass_tons_unadjusted']
has_no_data = egrid_plant[egrid_value_cols].sum(axis=1) == 0
plants_with_no_data_in_egrid = list(egrid_plant.loc[has_no_data, 'plant_id_egrid'])
egrid_plant = egrid_plant[~egrid_plant['plant_id_egrid'].isin(plants_with_no_data_in_egrid)]

# We also want to remove any plants that are located in Puerto Rico
# (.copy() so that the column added below is written to an independent frame, not a slice)
egrid_plant = egrid_plant[(egrid_plant['state'] != 'PR')].copy()

# create a column for eia id: crosswalk value where one exists, otherwise the eGRID id itself
egrid_ids = egrid_plant['plant_id_egrid']
egrid_plant['plant_id_eia'] = egrid_ids.map(egrid_to_eia_id).fillna(egrid_ids).astype(egrid_ids.dtype)

## Identify plants that are missing from each dataset

In [201]:
# identify any plants that are in egrid but not our totals, and any plants that are in our totals, but not egrid
# NOTE: `plant_not_in_calc` holds EIA plant ids, while `plants_not_in_egrid` holds eGRID plant ids
plant_not_in_calc = list(set(egrid_plant['plant_id_eia'].unique()) - set(plant_annual_total['plant_id_eia'].unique()))
plants_not_in_egrid = list(set(plant_annual_total['plant_id_egrid'].unique()) - set(egrid_plant['plant_id_egrid'].unique()))

# Which plants are included in eGRID but are missing from our calculations?
# The list contains EIA ids, so it must be matched against the EIA id column. Matching it
# against `plant_id_egrid` (as before) picks the wrong rows wherever the two ids differ.
missing_from_calc = egrid_plant[egrid_plant['plant_id_eia'].isin(plant_not_in_calc)]

#missing_from_calc.to_csv('../data/temp/plants_missing_from_calcs.csv', index=False)

# see if any of these plants are retired
generators_eia860 = load_data.load_pudl_table('generators_eia860', year=year)
retirement_dates = generators_eia860.groupby('plant_id_eia')['retirement_date'].unique().reset_index()
missing_from_calc.merge(retirement_dates, how='left', on='plant_id_eia')

Unnamed: 0,ba_code,state,plant_id_egrid,plant_name,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,co2_mass_tons_unadjusted,plant_id_eia,retirement_date
0,SOCO,AL,50359,Sloss Industries Corp,0.0,0.0,0.0,96648.513,50359,[None]
1,CISO,CA,328,Borel,-204.0,,0.0,0.0,328,[2017-09-01]
2,MISO,IA,7922,Brooklyn City North Plant,42.0,,,,7922,[2011-09-01]
3,ISNE,ME,10491,Mead Rumford Cogen,0.0,0.0,0.0,58381.352,10491,[None]
4,MISO,MN,1979,Hibbing,0.0,0.0,0.0,54323.0,1979,[None]
5,BPAT,OR,58109,Georgia-Pacific Toledo Mill,0.0,0.0,0.0,678968.418,58109,[None]
6,CPLE,SC,10795,Camden South Carolina,0.0,0.0,0.0,50693.776,10795,"[None, 2019-11-01]"
7,SC,SC,57470,Marlboro Mill,0.0,0.0,0.0,1005072.203,57470,[None]
8,BPAT,WA,58352,McKinley Paper Co. - Washington Mill,0.0,0.0,0.0,72703.25,58352,[None]
9,BPAT,WA,50231,SDS Lumber Gorge Energy Division,0.0,0.0,0.0,32925.781,50231,"[None, 2001-02-01]"


In [None]:
# notes: plant 7922 is being removed in gen_fuel_allocated because it is marked as a retired plant, and only has generation in the gf table
# most of these other plants are being removed because they report 0 net generation, even though they have fuel consumption for electricity
# not sure if these plants are exporting electricity to the grid. If not, it is probably safe to leave them off of the grid average emissions calculation

In [202]:
# Which plants are in our calculations, but are missing from eGRID?
# attach plant names / sectors to the plants that we calculated but eGRID does not list
plant_names = load_data.load_pudl_table('plants_entity_eia')[['plant_id_eia','plant_name_eia','sector_name_eia']]

# `plants_not_in_egrid` was built from eGRID plant ids, so filter on `plant_id_egrid`;
# filtering on `plant_id_eia` mismatches wherever the two ids differ.
is_missing_from_egrid = plant_annual_total['plant_id_egrid'].isin(plants_not_in_egrid)
missing_from_egrid = plant_annual_total[is_missing_from_egrid].merge(plant_names, how='left', on='plant_id_eia')

missing_from_egrid

Unnamed: 0,ba_code,state,plant_id_eia,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,plant_id_egrid,plant_name_eia,sector_name_eia
0,AECI,MO,2127,0.0000,0.0,0.00000,2127,Gallatin (MO),Electric Utility
1,AECI,MO,2141,0.0000,0.0,0.00000,2141,Macon,Electric Utility
2,AECI,MO,2146,0.0000,0.0,0.00000,2146,Monroe (MO),Electric Utility
3,AECI,MO,8110,0.0000,0.0,0.00000,8110,Stanberry,Electric Utility
4,AECI,MO,56126,0.0000,0.0,0.00000,56126,Sub 2 Generating Station,Electric Utility
...,...,...,...,...,...,...,...,...,...
334,,AK,57053,0.0000,45.0,3.67470,57053,Alakanuk,Electric Utility
335,,AK,60250,0.0000,441.0,36.01206,60250,Swampy Acres Microgrid,Electric Utility
336,,HI,774,0.0000,0.0,0.00000,774,Waiau Hydro,Electric Utility
337,,HI,63280,0.0000,0.0,0.00000,63280,"AES Kekaha Solar, LLC Hybrid",IPP Non-CHP


In [204]:
# identify rows of our totals that share an egrid plant id with at least one other row
# (e.g. a single egrid plant id that covers multiple eia plant ids)
double_ids = plant_annual_total[plant_annual_total['plant_id_egrid'].duplicated(keep=False)]
# focus on net generation for now; the column is selected before summing so that the
# string columns (ba_code, state) are never aggregated
double_ids = double_ids.groupby('plant_id_egrid')['net_generation_mwh'].sum().reset_index()
# merge the egrid data
double_ids = double_ids.merge(egrid_plant[['plant_id_egrid','net_generation_mwh']], how='left', on='plant_id_egrid', suffixes=('_calc','_egrid'))
# relative difference of our total vs the egrid total
net_gen_diff = double_ids['net_generation_mwh_calc'] - double_ids['net_generation_mwh_egrid']
double_ids['percent_diff'] = (net_gen_diff / double_ids['net_generation_mwh_egrid']).round(3)
double_ids

Unnamed: 0,plant_id_egrid,net_generation_mwh_calc,net_generation_mwh_egrid,percent_diff
0,562,295575.7,295534.003,0.0
1,1416,3497043.0,3497026.0,0.0
2,2709,5829410.0,5829409.996,0.0
3,3612,4045388.0,4045388.003,-0.0
4,4076,110539.0,110539.0,0.0
5,10474,670411.0,670410.997,0.0
6,55306,8487498.0,8487498.0,0.0
7,55375,5450411.0,5447419.084,0.001
8,55481,7276336.0,7276336.0,0.0
9,55508,22841.0,22841.0,-0.0


## Identify plants for which we are missing a BA assignment
(of the plants not already missing from our calculated totals)

In [205]:
# pair the BA code reported by eGRID with the BA code we assigned (inner join on the EIA plant id)
egrid_ba = egrid_plant.set_index('plant_id_eia')[['plant_name','ba_code']]
calc_ba = plant_annual_total.set_index('plant_id_eia')[['ba_code']]
ba_code_match = egrid_ba.merge(calc_ba, how='inner', left_index=True, right_index=True, suffixes=("_egrid",'_calc'))

# plants with missing ba code: eGRID reports one, but we have none
ba_code_match[ba_code_match['ba_code_calc'].isna() & ba_code_match['ba_code_egrid'].notna()]

Unnamed: 0_level_0,plant_name,ba_code_egrid,ba_code_calc
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


## Identify plants for which we have incorrectly assigned the BA code

In [206]:
# plants where we did assign a BA code, but it differs from the one in eGRID
has_calc_ba = ba_code_match['ba_code_calc'].notna()
differs_from_egrid = ba_code_match['ba_code_calc'].ne(ba_code_match['ba_code_egrid'])
ba_code_match[differs_from_egrid & has_calc_ba]

Unnamed: 0_level_0,plant_name,ba_code_egrid,ba_code_calc
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


## Identify where our calculated totals do not match eGRID's totals

In [207]:
# Compare our plant-level annual totals against eGRID and bucket each plant by how far
# the ratio (calculated / egrid) is from 1.
compare_cols = ['net_generation_mwh','heat_input_mmbtu','co2_mass_tons','co2_mass_tons_unadjusted']

# standardize column names and index so that the two dfs can be divided
# numeric_only=True: pandas >= 2.0 would otherwise also "sum" the string columns
# (ba_code, state), which then breaks the division below
calculated_to_compare = plant_annual_total.groupby('plant_id_egrid').sum(numeric_only=True).drop(columns=['plant_id_eia'])
# our totals have no separate unadjusted value, so the same co2 total is compared against both egrid columns
calculated_to_compare['co2_mass_tons_unadjusted'] = calculated_to_compare['co2_mass_tons']

# drop the plants that have no data in eGRID
# (the list holds eGRID ids, so it is matched against `plant_id_egrid`, not `plant_id_eia`)
egrid_plant = egrid_plant[~egrid_plant['plant_id_egrid'].isin(plants_with_no_data_in_egrid)]

egrid_to_compare = egrid_plant.set_index(['plant_id_egrid']).drop(columns=['ba_code','state','plant_name','plant_id_eia'])

# divide calculated value by egrid value, then attach the plant descriptors
egrid_plant_info = egrid_plant[['plant_id_egrid','plant_name','ba_code', 'state']]
compared = (
    calculated_to_compare.div(egrid_to_compare)
    .merge(egrid_plant_info, how='left', left_index=True, right_on='plant_id_egrid')
    .set_index('plant_id_egrid')
)
compared['plant_name'] = compared['plant_name'].fillna('unknown')

# create a dataframe that merges the two sources of data together
compared_merged = calculated_to_compare.merge(egrid_to_compare, how='outer', on='plant_id_egrid', suffixes=('_calc','_egrid'))

# for each column, set the ratio to 1 if both values are zero (the ratio is only nan b/c divide by zero)
for col in compare_cols:
    both_zero = (compared_merged[f'{col}_calc'] == 0) & (compared_merged[f'{col}_egrid'] == 0)
    compared.loc[compared.index.isin(compared_merged.index[both_zero]), col] = 1

# for each column, categorize the data based on how far it is off from egrid
ratio_bins = [-999999999,0,0.5,0.9,0.99,0.9999,1,1.0001,1.01,1.1,1.5,999999999]
ratio_labels = ['negative','<50%','+/-50%','+/-10%','+/-1%','!exact','!exact','+/-1%','+/-10%','+/-50%','>50%']
for col in compare_cols:
    status_col = f'{col}_status'
    # labels repeat on either side of 1, which pd.cut only allows with ordered=False
    compared[status_col] = pd.cut(x=compared[col], bins=ratio_bins, labels=ratio_labels, ordered=False)
    # replace any missing values with missing
    compared[status_col] = compared[status_col].astype(str)
    compared[status_col] = compared[status_col].fillna('missing')
    compared[status_col] = compared[status_col].replace('nan','missing')
    compared.loc[compared.index.isin(plants_not_in_egrid), status_col] = 'not_in_egrid'

# identify which plants are missing from egrid vs calculated values
for col in compare_cols:
    status_col = f'{col}_status'
    calc_value = compared_merged[f'{col}_calc']
    egrid_value = compared_merged[f'{col}_egrid']
    # (condition, status) pairs, applied in this order
    status_overrides = [
        # egrid has no value where we have a positive value
        ((calc_value > 0) & egrid_value.isna(), 'missing_in_egrid'),
        # our calculations have no value where egrid has a positive value
        (calc_value.isna() & (egrid_value > 0), 'missing_in_calc'),
        # our calculations are missing a zero value
        (calc_value.isna() & (egrid_value == 0), 'missing_zero_in_calc'),
        # egrid has a missing value instead of a zero
        ((calc_value == 0) & egrid_value.isna(), 'missing_zero_in_egrid'),
        # egrid has a zero value where we have a positive value
        ((calc_value > 0) & (egrid_value == 0), '>50%'),
    ]
    for condition, status_label in status_overrides:
        compared.loc[compared.index.isin(compared_merged.index[condition]), status_col] = status_label

# create a dataframe that counts how many plants are in each category
comparison_count = []
for col in compare_cols:
    count = compared.groupby(f'{col}_status', dropna=False)['plant_name'].count().rename(col)
    count.index = count.index.rename('status')
    comparison_count.append(count)

comparison_count = pd.concat(comparison_count, axis=1)
# append a row with the column totals
comparison_count = pd.concat([comparison_count, pd.DataFrame(comparison_count.sum().rename('Total')).T], axis=0)   
comparison_count

Unnamed: 0,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,co2_mass_tons_unadjusted
!exact,9854,8096,7398,8232
+/-1%,28,544,595,730
+/-10%,62,318,399,473
+/-50%,38,282,329,310
<50%,5,6,4,12
>50%,49,776,1184,151
missing,1,2,1,1
missing_in_calc,2,1,1,10
missing_in_egrid,3,56,161,161
missing_zero_in_calc,9,9,10,1


In [208]:
# render the comparison counts as a markdown-formatted text table
print(comparison_count.to_markdown())

|                       |   net_generation_mwh |   heat_input_mmbtu |   co2_mass_tons |   co2_mass_tons_unadjusted |
|:----------------------|---------------------:|-------------------:|----------------:|---------------------------:|
| !exact                |                 9854 |               8096 |            7398 |                       8232 |
| +/-1%                 |                   28 |                544 |             595 |                        730 |
| +/-10%                |                   62 |                318 |             399 |                        473 |
| +/-50%                |                   38 |                282 |             329 |                        310 |
| <50%                  |                    5 |                  6 |               4 |                         12 |
| >50%                  |                   49 |                776 |            1184 |                        151 |
| missing               |                    1 |                

In [209]:
# pick a metric and a status bucket, then list the plants in that bucket,
# ordered by their calculated/egrid ratio for the metric
value = 'net_generation_mwh'
status = '<50%'

#compared_merged.loc[64877,:]

status_column = f'{value}_status'
compared.loc[compared[status_column] == status].sort_values(by=value)

Unnamed: 0_level_0,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,co2_mass_tons_unadjusted,plant_name,ba_code,state,net_generation_mwh_status,heat_input_mmbtu_status,co2_mass_tons_status,co2_mass_tons_unadjusted_status
plant_id_egrid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10444,0.005074,1.371797,0.999974,0.999974,SEGS VIII,CISO,CA,<50%,+/-50%,!exact,!exact
59395,0.198818,1.0,1.0,1.0,City of Tulare Water Facility,CISO,CA,<50%,!exact,!exact,!exact
3754,0.240895,1.122869,1.124058,1.124058,Penny Lane Gas Turbine,ISNE,VT,<50%,+/-50%,+/-50%,+/-50%
2716,0.321054,0.970796,0.972717,0.972717,W H Weatherspoon,CPLE,NC,<50%,+/-10%,+/-10%,+/-10%
2401,0.449478,0.887968,0.883065,0.883065,Essex,PJM,NJ,<50%,+/-50%,+/-50%,+/-50%


In [None]:
# notes: some plants are hybrid fossil / solar plants, but the primary energy source code is getting listed as PV for certain generators, which is causing an issue
# this applies to 10444 59395

# plant 3754 has heat input in cems and eia that don't match
# 2716 has negative net generation
# 2401 has generation in both cems and eia


In [191]:
# rows of egrid_plant for plant_id_eia 2401
egrid_plant.loc[egrid_plant['plant_id_eia'].eq(2401)]

Unnamed: 0,ba_code,state,plant_id_egrid,plant_name,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,co2_mass_tons_unadjusted,plant_id_eia
8192,PJM,NJ,2401,Essex,617.0,15184.006,910.6,910.6,2401


In [192]:
# rows of plant_annual_total for plant_id_eia 2401
plant_annual_total.loc[plant_annual_total['plant_id_eia'].eq(2401)]

Unnamed: 0,ba_code,state,plant_id_eia,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,plant_id_egrid
7625,PJM,NJ,2401,277.328003,13482.90918,804.118787,2401


In [193]:
# generator-level monthly rows of gen_fuel_allocated for plant_id_eia 2401
gen_fuel_allocated.loc[gen_fuel_allocated['plant_id_eia'].eq(2401)]

Unnamed: 0,plant_id_eia,generator_id,report_date,net_generation_mwh,fuel_consumed_mmbtu,co2_mass_tons,ba_code,state,energy_source_code_1,data_source
46941,2401,9,2020-01-01,50.519,1226.0,72.47874,PJM,NJ,NG,cems
46942,2401,9,2020-02-01,44.136,1075.0,63.5076,PJM,NJ,NG,cems
46943,2401,9,2020-03-01,45.208,1101.0,65.29599,PJM,NJ,NG,cems
46944,2401,9,2020-04-01,40.818,993.0,58.71552,PJM,NJ,NG,cems
46945,2401,9,2020-05-01,34.96,849.0,50.44686,PJM,NJ,NG,cems
46946,2401,9,2020-06-01,56.781,1383.0,81.50712,PJM,NJ,NG,cems
46947,2401,9,2020-07-01,86.457,2100.0,123.5553,PJM,NJ,NG,cems
46948,2401,9,2020-08-01,75.743,1844.0,108.59466,PJM,NJ,NG,cems
46949,2401,9,2020-09-01,48.288,1173.0,69.23472,PJM,NJ,NG,cems
46950,2401,9,2020-10-01,52.875,1284.0,75.57486,PJM,NJ,NG,eia_only


In [169]:
# rows of monthly_eia_data_to_distribute for plant_id_eia 3754
monthly_eia_data_to_distribute.loc[monthly_eia_data_to_distribute['plant_id_eia'].eq(3754)]

Unnamed: 0,plant_id_eia,generator_id,report_date,net_generation_mwh,fuel_consumed_mmbtu,co2_mass_tons,ba_code,state,energy_source_code_1,data_source
67471,3754,GT1,2020-02-01,33.246,619.0,50.54754,ISNE,VT,DFO,eia_only
67472,3754,GT1,2020-03-01,25.837,481.0,39.27846,ISNE,VT,DFO,eia_only
67473,3754,GT1,2020-04-01,28.705,533.0,43.52478,ISNE,VT,DFO,eia_only
67479,3754,GT1,2020-10-01,38.923,727.0,59.36682,ISNE,VT,DFO,eia_only


In [174]:
# Some plants report heat input and CO2 in CEMS but show no net generation.
# This lists the rows of cems_plant_annual whose net generation is exactly 0.
# TODO: we should maybe try and fill net generation data using EIA-923?
cems_plant_annual.loc[cems_plant_annual['net_generation_mwh'].eq(0)]

Unnamed: 0,ba_code,state,plant_id_eia,net_generation_mwh,heat_input_mmbtu,co2_mass_tons
108,CISO,CA,57073,0.0,476779.2,28141.999997
109,CISO,CA,57074,0.0,484786.0,28602.299972
110,CISO,CA,57075,0.0,480863.2,28373.799968
125,CPLE,NC,2707,0.0,7067.434,572.020997
127,CPLE,NC,2716,0.0,16812.83,1361.799019
265,FPC,FL,7294,0.0,5400.395,321.151797
319,ISNE,CT,56189,0.0,10511.38,858.359701
374,ISNE,VT,3734,0.0,3513.84,284.729996
375,ISNE,VT,3754,0.0,7887.08,639.309984
747,PJM,DE,591,0.0,1516.64,122.799997


In [172]:
# hourly cems rows for plant_id_eia 57073
cems.loc[cems['plant_id_eia'].eq(57073)]

Unnamed: 0,plant_id_eia,unitid,cems_id,operating_datetime_utc,operating_time_hours,gross_load_mw,gross_generation_mwh,steam_load_1000_lbs,heat_content_mmbtu,co2_mass_tons,...,plant_id_epa,unit_id_epa,report_date,cems_reporting_category,energy_source_code,ba_code,state,net_generation_mwh,gross_to_net_ratio,net_gen_method
1749864,57073,BLR1,57073_BLR1,2020-01-01 08:00:00+00:00,1.0,0.0,0.0,,51.900002,3.1,...,57073,90614,2020-01-01,full_year,SUN,CISO,CA,0.0,inf,monthly_ratio
1749865,57073,BLR1,57073_BLR1,2020-01-01 09:00:00+00:00,1.0,0.0,0.0,,53.000000,3.1,...,57073,90614,2020-01-01,full_year,SUN,CISO,CA,0.0,inf,monthly_ratio
1749866,57073,BLR1,57073_BLR1,2020-01-01 10:00:00+00:00,1.0,0.0,0.0,,51.900002,3.1,...,57073,90614,2020-01-01,full_year,SUN,CISO,CA,0.0,inf,monthly_ratio
1749867,57073,BLR1,57073_BLR1,2020-01-01 11:00:00+00:00,1.0,0.0,0.0,,51.900002,3.1,...,57073,90614,2020-01-01,full_year,SUN,CISO,CA,0.0,inf,monthly_ratio
1749868,57073,BLR1,57073_BLR1,2020-01-01 12:00:00+00:00,1.0,0.0,0.0,,51.900002,3.1,...,57073,90614,2020-01-01,full_year,SUN,CISO,CA,0.0,inf,monthly_ratio
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3542323,57073,BLR1,57073_BLR1,2021-01-01 03:00:00+00:00,0.0,0.0,0.0,,0.000000,0.0,...,57073,90614,2020-12-01,full_year,SUN,CISO,CA,0.0,inf,monthly_ratio
3542324,57073,BLR1,57073_BLR1,2021-01-01 04:00:00+00:00,0.0,0.0,0.0,,0.000000,0.0,...,57073,90614,2020-12-01,full_year,SUN,CISO,CA,0.0,inf,monthly_ratio
3542325,57073,BLR1,57073_BLR1,2021-01-01 05:00:00+00:00,0.0,0.0,0.0,,0.000000,0.0,...,57073,90614,2020-12-01,full_year,SUN,CISO,CA,0.0,inf,monthly_ratio
3542326,57073,BLR1,57073_BLR1,2021-01-01 06:00:00+00:00,0.0,0.0,0.0,,0.000000,0.0,...,57073,90614,2020-12-01,full_year,SUN,CISO,CA,0.0,inf,monthly_ratio


In [None]:
# TODO: Compare the fuel input from CEMS to the input from EIA to see if they are close
# in general, we will trust the CEMS data over the EIA data unless there are significant differences
# may need to aggregate to plant level since there is not a 1:1 match between units and generators

# both sources are compared at the plant-month level
plant_month_keys = ['plant_id_eia', 'report_date']

# monthly plant-level heat input from CEMS
# (select the column BEFORE summing so the other CEMS columns, including datetimes/strings, are never aggregated)
cems_plant_monthly = cems.groupby(plant_month_keys)[['heat_content_mmbtu']].sum().reset_index()

# monthly plant-level totals from gen_fuel_allocated
# numeric_only=True makes the exclusion of string columns explicit instead of relying on pandas silently dropping them
gf_plant_monthly = gen_fuel_allocated.groupby(plant_month_keys).sum(numeric_only=True).reset_index()

# for plants where there is data reported in cems, see how off it is from data reported in eia
gf_plant_monthly = gf_plant_monthly.merge(cems_plant_monthly, how='left', on=plant_month_keys)
# keep only plant-months that matched a CEMS record; .copy() so the column assignment below does not write to a slice
gf_plant_monthly = gf_plant_monthly[gf_plant_monthly['heat_content_mmbtu'].notnull()].copy()

# relative difference (a fraction, not a percentage) of CEMS heat input versus EIA fuel consumption
# NOTE: evaluates to inf or NaN wherever fuel_consumed_mmbtu is 0
gf_plant_monthly['pct_diff'] = (
    gf_plant_monthly['heat_content_mmbtu'] - gf_plant_monthly['fuel_consumed_mmbtu']
) / gf_plant_monthly['fuel_consumed_mmbtu']

# identify where there are plants that report 0 heat input to cems but have data in eia_923
gf_plant_monthly[(gf_plant_monthly['heat_content_mmbtu'] == 0) & (gf_plant_monthly['fuel_consumed_mmbtu'] > 0)]

## Compare data at BA level

In [210]:
# load egrid BA totals (sheet name is 'BA' plus the two-digit year) and rename the columns
egrid_ba = pd.read_excel(
    f'../data/egrid/egrid{year}_data.xlsx',
    sheet_name=f'BA{str(year)[-2:]}',
    header=1,
    usecols=['BANAME', 'BACODE', 'BAHTIANT', 'BANGENAN', 'BACO2AN'],
).rename(columns={'BANAME': 'ba_name',
                  'BACODE': 'ba_code',
                  'BAHTIANT': 'heat_input_mmbtu',
                  'BANGENAN': 'net_generation_mwh',
                  'BACO2AN': 'co2_mass_tons'})

# the only columns that are meaningful to total per BA and compare
# (plant ID columns and string columns are deliberately left out of the aggregation)
ba_metric_columns = ['co2_mass_tons', 'heat_input_mmbtu', 'net_generation_mwh']

calculated_ba_totals = plant_annual_total.groupby('ba_code', dropna=False)[ba_metric_columns].sum()
egrid_ba_totals = egrid_ba.set_index('ba_code')[ba_metric_columns]

# divide our calculation by the BA totals from eGRID
# if there are 0 values, replace with 0.1, so that div by zero doesn't return missing value
ba_metric = (
    calculated_ba_totals.replace(0, 0.1)
    .div(egrid_ba_totals.replace(0, 0.1))
    .sort_values(by='co2_mass_tons')
    .round(3)
)

# calculate the difference in the number of plants in each region (ours minus eGRID)
# NOTE: a BA that is present in only one of the two tables yields NaN here, not a count
calculated_plant_count = plant_annual_total.groupby('ba_code', dropna=False)['plant_id_egrid'].count()
egrid_plant_count = egrid_plant.groupby('ba_code', dropna=False)['plant_id_egrid'].count()
plant_count = (calculated_plant_count - egrid_plant_count).rename('num_plants')
ba_metric = ba_metric.merge(plant_count, how='left', left_index=True, right_index=True)

# show every BA rather than pandas' truncated default view
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ba_metric)

Unnamed: 0_level_0,co2_mass_tons,heat_input_mmbtu,net_generation_mwh,num_plants
ba_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.29,0.339,0.309,8.0
TEPC,0.859,0.818,0.697,0.0
DEAA,0.997,0.997,1.0,0.0
GRIF,0.999,0.999,1.0,0.0
OVEC,1.0,1.0,1.0,0.0
CSTO,1.0,1.0,1.0,0.0
NSB,1.0,1.0,1.0,0.0
SPA,1.0,1.0,1.0,1.0
CEA,1.0,1.0,1.0,0.0
WAUW,1.0,1.0,1.0,0.0


In [211]:
# emit the BA-level comparison table as markdown text (DataFrame.to_markdown) rather than the rich notebook display
print(ba_metric.to_markdown())

| ba_code   |   co2_mass_tons |   heat_input_mmbtu |   net_generation_mwh |   num_plants |
|:----------|----------------:|-------------------:|---------------------:|-------------:|
| nan       |           0.29  |              0.339 |                0.309 |            8 |
| TEPC      |           0.859 |              0.818 |                0.697 |            0 |
| DEAA      |           0.997 |              0.997 |                1     |            0 |
| GRIF      |           0.999 |              0.999 |                1     |            0 |
| OVEC      |           1     |              1     |                1     |            0 |
| CSTO      |           1     |              1     |                1     |            0 |
| NSB       |           1     |              1     |                1     |            0 |
| SPA       |           1     |              1     |                1     |            1 |
| CEA       |           1     |              1     |                1     |            0 |

# Assign monthly data to hourly profile
We now, in theory, have complete data on national-level heat input, net generation, and emissions, from a combination of two sources:

1. hourly data from CEMS
2. monthly data for generators that don't report to CEMS

For the second category of monthly data, we need to figure out how to allocate the monthly level data to each hour. 

In [None]:
# categorize generators by broad fuel categories (clean, geothermal, biofuel, fossil)
# each energy source code should appear in exactly one of these lists
clean_fuels = ['SUN','MWH','WND', 'WAT','WH','PUR','NUC']
# NOTE: 'WDS' (wood / wood waste solids) used to be listed here as well as in bio_fuels;
# it is kept only in bio_fuels so the categories do not overlap
fossil_fuels = ['NG', 'DFO','OG','BIT','OTH','PC','SUB', 'LIG','KER', 'RC', 'WO','RFO', 'WC', 'SGC', 'SGP', 'PG', 'JF','BFG']
bio_fuels = ['AB','BG','BLQ','DG','LFG','MSB','MSW','OBG','OBL','OBS','SLW','WDL','WDS']
geo_fuel = ['GEO']
# TODO: Figure out what to do with MSW

# Output data 

Save the data to a CSV so that generating the hourly eGRID numbers can be separated from analyzing them.