In [1]:
# Tell python where to look for the project's modules (imported below as `src.*`).
# Depending on how your jupyter handles working directories, this may not be needed.
import sys

project_root = '../../hourly-egrid/'
# Guard the append so that re-running this cell does not add a duplicate entry each time.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# import the necessary packages
%reload_ext autoreload
%autoreload 2

# import packages
import os
import requests
import tarfile
import sqlalchemy as sa
from pathlib import Path
import pandas as pd
import plotly.express as px
import numpy as np

import src.data_cleaning as data_cleaning
import src.gross_to_net_generation as gross_to_net_generation
import src.load_data as load_data
import src.distribute_eia923 as distribute_eia923

  from pandas import Int64Index as NumericIndex


# Specify the year for analysis

In [3]:
# Data year passed to the download, load, and allocation helpers below, and used to pick the eGRID workbook/sheet.
year = 2020

# 1. Download data

 - Downloads the pre-cleaned PUDL versions of EIA-923, EIA-860, and EPA CEMS data  
 - Downloads EPA eGRID data  
 - Downloads EIA-930 data  
 - Downloads the EPA Power Sector Data Crosswalk

TODO
- [x] The code for downloading the files could probably be made into functions
- [ ] Investigate other packages besides `requests` that would download these files faster

In [4]:
# Fetch every raw input used by this notebook through the `load_data` download helpers.

############### PUDL Database ######################
pudl_zenodo_url = 'https://zenodo.org/record/6349861/files/pudl-v0.6.0-2022-03-12.tgz'
load_data.download_pudl_data(zenodo_url=pudl_zenodo_url)

################# eGRID data #########################
# the 2019 and 2020 data appear to be hosted on different urls
egrid_files_to_download = [
    'https://www.epa.gov/sites/default/files/2021-02/egrid2019_data.xlsx',
    'https://www.epa.gov/system/files/documents/2022-01/egrid2020_data.xlsx',
]
load_data.download_egrid_files(egrid_files_to_download)

############# EIA-930 data #####################
load_data.download_eia930_data(years_to_download=[year])

########## Power Sector Data Crosswalk #############
# NOTE: Check for new releases at https://github.com/USEPA/camd-eia-crosswalk
psdc_release_url = 'https://github.com/USEPA/camd-eia-crosswalk/releases/download/v0.2.1/epa_eia_crosswalk.csv'
load_data.download_epa_psdc(psdc_url=psdc_release_url)


PUDL data already downloaded
egrid2019_data.xlsx already downloaded
egrid2020_data.xlsx already downloaded
2020_Jan_Jun data already downloaded
2020_Jul_Dec data already downloaded
epa_eia_crosswalk.csv already downloaded


# 1. Clean EIA-923 Generation and Fuel Data at the Monthly Level

In [15]:
# Distribute net generation and heat input data reported by the three different EIA-923 tables
# down to individual generators, for the analysis `year`. The result (`gen_fuel_allocated`) is the
# monthly EIA table that the rest of the notebook compares against CEMS and eGRID.
# NOTE: this code was copied and modified from `pudl.analysis.allocate_net_gen`
# NOTE: this code allocates net generation based on the proportion of net generation reported, rather than by nameplate capacity (which eGRID does)
# NOTE: the code was modified to perform the allocation on a monthly basis, rather than an annual basis

# HIGH PRIORITIES
# DONE: remove non-grid connected plants from this dataframe
# DONE: Denormalize data by balancing authority/state. BA assignment from EIA-860
# TODO: look into whether net_gen from generation table should be preserved if available
# TODO: figure out what happens when each table has different values for net gen (allocate difference, or take precedence)
# DONE: add nuclear generators to this list
# TODO: Remove any plants located in Puerto Rico

# FUEL ASSIGNMENT
# DONE: calculate total emissions from gf based on fuel and heat input and distribute in addition to net generation and fuel consumed
# TODO: when aggregating back to generator records, keep the fuel type that accounted for most heat input
# TODO: denormalize data by fuel type both primary fuel type by generator, and primary fuel by plant (assuming that's how reported to ISOs)
# primary fuel type is currently assigned based on the annual primary fuel type. This should be changed to assign based on monthly fuel type

# NEXT PRIORITIES
# TODO: allocate heat input data from boiler_fuel_eia923() See: https://github.com/catalyst-cooperative/pudl/pull/1096
# TODO: Also distribute heat input for electricity consumption

# LOWER PRIORITIES
# TODO: fix allocation of net generation when reported net generation is negative?
# TODO: investigate generators for which frac column is not adding to 1.0

gen_fuel_allocated = distribute_eia923.allocate_gen_fuel_by_gen(year=year)

Removing 0 plants that are not grid-connected
Removing 0 plants that are not grid-connected
Removing 1 plants that are not grid-connected


        plant_id_eia prime_mover_code energy_source_code report_date  frac  \
1326             141               ST                DFO  2020-01-01   3.0   
1348             141               ST                DFO  2020-12-01   3.0   
1350             141               ST                 NG  2020-01-01   3.0   
1372             141               ST                 NG  2020-12-01   3.0   
3259             350               ST                 NG  2020-03-01   2.0   
...              ...              ...                ...         ...   ...   
133878         60903               CT                 NG  2020-05-01   2.0   
133885         60903               CA                 NG  2020-09-01   2.0   
133886         60903               CT                 NG  2020-09-01   2.0   
133887         60903               CA                 NG  2020-10-01   2.0   
133888         60903               CT                 NG  2020-10-01   2.0   

        net_generation_mwh_g_tbl  net_generation_mwh_gf_tbl  fr

In [5]:
# for which plants are we still missing co2 data?
# (rows that report fuel consumption but have no CO2 value)
co2_is_missing = gen_fuel_allocated['co2_mass_tons'].isna()
fuel_was_consumed = gen_fuel_allocated['fuel_consumed_mmbtu'] > 0
gen_fuel_allocated[co2_is_missing & fuel_was_consumed]

Unnamed: 0,plant_id_eia,generator_id,report_date,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_tons,ba_code,state,energy_source_code


# 2. Clean Hourly Data from CEMS
There are three broad categories of plants based on their CAMD reporting status:
1. Units that report to CAMD year-round (for these plants, emissions data is used directly from CEMS)
2. Units that only report to CAMD during the ozone season (May-Sept) (for these units, non-ozone season data is taken from EIA 923)
3. Units that do not report to CAMD (generally fossil units < 25MW and non-fossil generators)

There are also certain plants that report to CAMD but do not produce electricity for the grid, and need to be removed from the CEMS data:
- Non grid connected plants
- Steam-only plants


In [7]:
# NOTE: all of the functions in this section could be run by calling clean_cems()
# The call is left commented out; the next cell runs the individual cleaning steps one by one instead.
#cems = data_cleaning.clean_cems(year)

In [6]:
# Load the hourly CEMS data and run it through the cleaning steps, in order:
#   1. remove non-grid connected plants
#   2. manually remove steam-only units
#   3. add a report date
#   4. add a fuel type to each observation
#   5. fill in missing hourly emissions data using the fuel type and heat input
#   6. remove any observations where zero operation is reported for an entire month
#      (although this data could be considered to be accurately reported, let's remove it
#      so that we can double check against the eia data)
#
# Steps that are intentionally skipped for now:
#   - remove plants that only report steam generation and no electrical generation
#     NOTE: keeping steam only plants for now
#     #cems = data_cleaning.remove_heating_only_plants(cems)
#   - identify cems reporting status (full year or partial year)
#     NOTE: this information is not really useful yet, so we are not going to run this to save time
#     #cems = data_cleaning.determine_cems_reporting_status(cems)
#
# TODO: identify and remove any hourly values that appear to be outliers
# TODO: check if any of the zero-operation observations are from geothermal generators
cems = (
    load_data.load_cems_data(year)
    .pipe(data_cleaning.remove_non_grid_connected_plants)
    .pipe(data_cleaning.manually_remove_steam_units)
    .pipe(data_cleaning.add_report_date)
    .pipe(data_cleaning.assign_fuel_type_to_cems, year)
    .pipe(data_cleaning.fill_cems_missing_co2, year)
    .pipe(data_cleaning.remove_cems_with_zero_monthly_data)
)

# identify any remaining missing values
# TODO: Try to identify fuel types
co2_is_missing = cems['co2_mass_tons'].isnull()
still_missing_co2_data = list(cems.loc[co2_is_missing, 'cems_id'].unique())
print(f"Unable to calculate emissions for the following plants_units: {still_missing_co2_data}")

# keep the offending rows for inspection before they are dropped
cems_missing_fuel = cems[co2_is_missing]

# For now, lets drop these from the data (every row of an affected unit, not only the rows with missing values)
cems = cems[~cems['cems_id'].isin(still_missing_co2_data)]

cems_missing_fuel

Removing 45 plants that are not grid-connected
removing 7329216 observations from cems for unit-months where no data reported
Unable to calculate emissions for the following plants_units: []


Unnamed: 0,plant_id_eia,unitid,cems_id,operating_datetime_utc,operating_time_hours,gross_load_mw,gross_generation_mwh,steam_load_1000_lbs,heat_content_mmbtu,co2_mass_tons,co2_mass_measurement_code,plant_id_epa,unit_id_epa,report_date,energy_source_code


In [7]:
# identify which units are still missing a fuel type assignment
# NOTE: we will need to fix this before matching to EIA-930 data
# however, we should create a plant_primary_fuel column, which might be different than the unit-specific primary fuel
fuel_type_is_missing = cems['energy_source_code'].isna()
cems.loc[fuel_type_is_missing]

Unnamed: 0,plant_id_eia,unitid,cems_id,operating_datetime_utc,operating_time_hours,gross_load_mw,gross_generation_mwh,steam_load_1000_lbs,heat_content_mmbtu,co2_mass_tons,co2_mass_measurement_code,plant_id_epa,unit_id_epa,report_date,energy_source_code
445104,54096,X026,54096_X026,2020-07-01 06:00:00+00:00,1.0,23.0,23.0,,310.899994,20.164635,,54096,88302,2020-07-01,
445105,54096,X026,54096_X026,2020-07-01 07:00:00+00:00,1.0,23.0,23.0,,311.000000,20.171121,,54096,88302,2020-07-01,
445106,54096,X026,54096_X026,2020-07-01 08:00:00+00:00,1.0,23.0,23.0,,310.600006,20.145178,,54096,88302,2020-07-01,
445107,54096,X026,54096_X026,2020-07-01 09:00:00+00:00,1.0,23.0,23.0,,314.600006,20.404614,,54096,88302,2020-07-01,
445108,54096,X026,54096_X026,2020-07-01 10:00:00+00:00,1.0,23.0,23.0,,316.000000,20.495416,,54096,88302,2020-07-01,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34770643,4125,8,4125_8,2021-01-01 01:00:00+00:00,0.0,0.0,0.0,,0.000000,0.000000,,4125,2621,2020-12-01,
34770644,4125,8,4125_8,2021-01-01 02:00:00+00:00,0.0,0.0,0.0,,0.000000,0.000000,,4125,2621,2020-12-01,
34770645,4125,8,4125_8,2021-01-01 03:00:00+00:00,0.0,0.0,0.0,,0.000000,0.000000,,4125,2621,2020-12-01,
34770646,4125,8,4125_8,2021-01-01 04:00:00+00:00,0.0,0.0,0.0,,0.000000,0.000000,,4125,2621,2020-12-01,


In [18]:
# flag any generator-months for which we already have cems data
# NOTE: there is still an issue identifying for which generators we have cems data because of incomplete mapping
# Drop the flag left by a previous run of this cell so the cell can be re-run safely
# (this replaces the manual drop that used to be toggled by hand).
if 'data_source' in gen_fuel_allocated.columns:
    gen_fuel_allocated = gen_fuel_allocated.drop(columns=['data_source'])
gen_fuel_allocated = data_cleaning.identify_emissions_data_source(cems, gen_fuel_allocated)

# create a separate dataframe containing only the generators for which we do not have CEMS data
# (and which actually report fuel consumption)
monthly_eia_data_to_distribute = gen_fuel_allocated[
    (gen_fuel_allocated['data_source'] == 'eia_only')
    & gen_fuel_allocated['fuel_consumed_mmbtu'].notna()
]

# what share of each reported total comes from generators in CEMS vs not
# numeric_only=True keeps the result identical across pandas versions: text/date columns are never summed.
# NOTE: the plant_id_eia column of this table is a share of summed plant IDs and carries no meaning.
totals_by_source = gen_fuel_allocated.groupby('data_source').sum(numeric_only=True)
(totals_by_source / totals_by_source.sum(axis=0)).round(3)

Unnamed: 0_level_0,plant_id_eia,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_tons
data_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cems,0.12,0.57,0.523,0.545,0.829
eia_only,0.88,0.43,0.477,0.455,0.171


In [19]:
# convert hourly gross generation to net generation
# Drop the columns written by a previous run of this cell so the cell can be re-run safely
# (this replaces the manual drop that used to be toggled by hand).
stale_net_gen_cols = [c for c in ['net_generation_mwh','gross_to_net_ratio','net_gen_method'] if c in cems.columns]
if stale_net_gen_cols:
    cems = cems.drop(columns=stale_net_gen_cols)
cems = data_cleaning.convert_gross_to_net_generation(cems, gen_fuel_allocated)

# for generators where there is heat input but no gross generation reported, impute hourly net generation based on reported EIA values
cems = data_cleaning.impute_missing_hourly_net_generation(cems, gen_fuel_allocated)

# what percent of net generation was allocated using each method?
# Select the column before summing: summing every CEMS column is slow on the hourly table and
# fails on pandas versions that refuse to sum datetime columns.
cems.groupby('net_gen_method', dropna=False)['net_generation_mwh'].sum() / cems['net_generation_mwh'].sum()

net_gen_method
annual_regression            0.002592
imputed_from_heat_content    0.000854
monthly_ratio                0.992664
net_equals_gross             0.003890
Name: net_generation_mwh, dtype: float32

In [20]:
# add information about the balancing authority
# Drop the columns written by a previous run of this cell so the cell can be re-run safely
# (this replaces the manual drop that used to be toggled by hand).
stale_ba_cols = [c for c in ['ba_code','state'] if c in cems.columns]
if stale_ba_cols:
    cems = cems.drop(columns=stale_ba_cols)
cems = data_cleaning.assign_ba_code_to_plant(cems, year)

# 3. Compare Monthly data from CEMS/EIA-923
We have now identified all plants that report the full year to CEMS, and all plants that report a partial year. We will now use the EIA-923 data to fill in the missing pieces.


In [13]:
# TODO: Compare the data reported by both sources
# for plants where there is data reported in cems, see how off it is from data reported in eia
plant_month_keys = ['plant_id_eia','report_date']

# Select the value columns before summing so that only these four columns of the hourly table are aggregated.
cems_plant_monthly = (
    cems.groupby(plant_month_keys)[['gross_generation_mwh','net_generation_mwh','heat_content_mmbtu','co2_mass_tons']]
    .sum()
    .reset_index()
    .rename(columns={'heat_content_mmbtu':'fuel_consumed_mmbtu'})
)
# numeric_only=True keeps the result identical across pandas versions: text columns are never summed.
gf_plant_monthly = gen_fuel_allocated.groupby(plant_month_keys).sum(numeric_only=True).reset_index()
# inner merge: only plant-months present in both sources are compared
gf_plant_monthly = gf_plant_monthly.merge(cems_plant_monthly, how='inner', on=plant_month_keys, suffixes=("_eia",'_cems'))

# Relative difference of CEMS vs EIA: (cems - eia) / eia.
# Zeros are replaced by 0.1 on both sides to avoid dividing by zero, so plant-months where EIA reports
# zero show up as very large ratios, and a negative EIA value flips the sign of the result.
for label, col in [('gen','net_generation_mwh'), ('fuel','fuel_consumed_mmbtu'), ('co2','co2_mass_tons')]:
    cems_values = gf_plant_monthly[f'{col}_cems'].replace(0,0.1)
    eia_values = gf_plant_monthly[f'{col}_eia'].replace(0,0.1)
    gf_plant_monthly[f'pctdiff_{label}'] = ((cems_values - eia_values) / eia_values).round(3)

gf_plant_monthly.sort_values(by='pctdiff_gen')

Unnamed: 0,plant_id_eia,report_date,net_generation_mwh_eia,fuel_consumed_mmbtu_eia,fuel_consumed_for_electricity_mmbtu,co2_mass_tons_eia,gross_generation_mwh,net_generation_mwh_cems,fuel_consumed_mmbtu_cems,co2_mass_tons_cems,pctdiff_gen,pctdiff_fuel,pctdiff_co2
11117,55641,2020-09-01,-505.000,0.0,0.0,0.00000,261056.609375,261056.609375,1.817727e+06,108023.497268,-517.944,1.817727e+07,1080233.973
11118,55641,2020-10-01,-972.000,0.0,0.0,0.00000,370347.375000,370347.375000,2.564216e+06,152387.475818,-382.016,2.564216e+07,1523873.758
11119,55641,2020-11-01,-1002.000,0.0,0.0,0.00000,141516.921875,141516.921875,9.632251e+05,57242.498871,-142.234,9.632250e+06,572423.989
3894,3648,2020-11-01,-65.000,8273.0,8273.0,483.47412,563.590027,274.000000,5.566524e+03,331.034671,-5.215,-3.270000e-01,-0.315
2460,2393,2020-01-01,-197.025,9790.0,9790.0,648.68394,6205.459961,612.974976,6.539593e+04,4537.408885,-4.111,5.680000e+00,5.995
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3944,3804,2020-06-01,0.000,0.0,0.0,0.00000,338518.156250,329370.281250,2.359999e+06,140251.348182,3293701.812,2.359999e+07,1402512.482
4656,6112,2020-01-01,0.000,0.0,0.0,0.00000,348390.593750,337367.343750,2.632619e+06,156451.644281,3373672.438,2.632619e+07,1564515.443
10313,55382,2020-01-01,0.000,0.0,0.0,0.00000,540560.500000,521570.031250,3.892977e+06,231354.493562,5215699.312,3.892977e+07,2313543.936
0,3,2020-01-01,0.000,0.0,0.0,0.00000,806088.250000,770821.312500,6.699430e+06,554474.688740,7708212.125,6.699429e+07,5544745.887


In [14]:
# investigate single plants
plant_to_inspect = 55641
gf_plant_monthly.loc[gf_plant_monthly['plant_id_eia'].eq(plant_to_inspect)]

Unnamed: 0,plant_id_eia,report_date,net_generation_mwh_eia,fuel_consumed_mmbtu_eia,fuel_consumed_for_electricity_mmbtu,co2_mass_tons_eia,gross_generation_mwh,net_generation_mwh_cems,fuel_consumed_mmbtu_cems,co2_mass_tons_cems,pctdiff_gen,pctdiff_fuel,pctdiff_co2
11109,55641,2020-01-01,404844.0,2992734.0,2992734.0,174895.37496,415134.0,404844.0,2795774.0,166148.400093,0.0,-0.066,-0.05
11110,55641,2020-02-01,385147.0,2734786.0,2734786.0,159820.89384,396118.0,385147.0,2694118.0,160105.800156,0.0,-0.015,0.002
11111,55641,2020-03-01,390252.0,2926604.0,2926604.0,171030.73776,400920.4375,390252.0,2731180.0,162311.506564,0.0,-0.067,-0.051
11112,55641,2020-04-01,369304.0,2748952.0,2748952.0,160648.75488,385077.5,369304.0,2662860.0,158251.328952,0.0,-0.031,-0.015
11113,55641,2020-05-01,265800.0,2027648.0,2027648.0,118495.74912,487726.0,265800.0,3636866.0,216132.397752,0.0,0.794,0.824
11114,55641,2020-06-01,280279.0,2151172.0,2151172.0,125714.49168,416998.6875,280279.0,2894332.0,172005.366444,0.0,0.345,0.368
11115,55641,2020-07-01,120332.0,926431.0,926431.0,54140.62764,263503.21875,120331.992188,1821632.0,108257.569021,-0.0,0.966,1.0
11116,55641,2020-08-01,420368.0,2962858.0,2962858.0,173149.42152,429201.65625,420368.0,2883438.0,171358.046279,0.0,-0.027,-0.01
11117,55641,2020-09-01,-505.0,0.0,0.0,0.0,261056.609375,261056.609375,1817727.0,108023.497268,-517.944,18177270.0,1080233.973
11118,55641,2020-10-01,-972.0,0.0,0.0,0.0,370347.375,370347.375,2564216.0,152387.475818,-382.016,25642160.0,1523873.758


# Adjust emissions
We next need to make certain adjustments to the data:
 - [ ] Calculate emissions for Geothermal plants
 - [ ] Adjust heat input/emissions from CHP plants by proportion used for electric generation
 - EPA adjusts biomass emissions, but not sure if we want to do that. Need to look into it more

In [21]:
# try using the method we used to impute net generation - adjust emissions based on proportion of heat used for electricity vs steam

# for now, let's just add a placeholder column
# (the "adjusted" value is an unmodified copy of the reported CO2 until the geothermal / CHP adjustments are implemented)
cems['co2_mass_tons_adjusted'] = cems['co2_mass_tons']

# Compare results to eGRID totals

Before we allocate the data to the hourly level, we should double check that the total annual emissions / generation values match the "official" data published in eGRID at the annual level.

In [122]:
# Aggregate total calculated values
###################################
# Builds `plant_annual_total` (our annual plant totals from CEMS plus EIA-only data) and
# `egrid_plant` (the plant table published in eGRID), both carrying an EIA and an eGRID plant ID.

plant_keys = ['ba_code','state','plant_id_eia']

# combine cems and eia data
# Select the value columns before summing so that only these columns are aggregated.
cems_plant_annual = (
    cems.groupby(plant_keys, dropna=False)[['net_generation_mwh','heat_content_mmbtu','co2_mass_tons']]
    .sum()
    .rename(columns={'heat_content_mmbtu':'heat_input_mmbtu'})
    .reset_index()
)
eia_plant_annual = (
    monthly_eia_data_to_distribute.groupby(plant_keys, dropna=False)[['net_generation_mwh','fuel_consumed_mmbtu','co2_mass_tons']]
    .sum()
    .rename(columns={'fuel_consumed_mmbtu':'heat_input_mmbtu'})
    .reset_index()
)

plant_annual_total = pd.concat([cems_plant_annual,eia_plant_annual], axis=0)

# group any plants that have records from both datasets
plant_annual_total = plant_annual_total.groupby(plant_keys, dropna=False).sum().reset_index()

# For plants that have different EPA and EIA plant IDs, the plant ID in eGRID is usually the EPA ID, but sometimes the EIA ID
# however, there are sometimes 2 EIA IDs for a single eGRID ID, so we need to group the data in the EIA table by the egrid id
# We need to update all of the egrid plant IDs to the EIA plant IDs
# NOTE(review): the crosswalk path is pinned to the 2020 static tables rather than following `year` - confirm before changing `year`
egrid_crosswalk = pd.read_csv('../data/egrid/egrid_static_tables/2020/table_C5_crosswalk_of_EIA_ID_to_EPA_ID.csv')
eia_to_egrid_id = dict(zip(egrid_crosswalk['plant_id_eia'], egrid_crosswalk['plant_id_egrid']))
# NOTE: where several EIA IDs share one eGRID ID, this reverse lookup keeps only the last EIA ID listed
egrid_to_eia_id = dict(zip(egrid_crosswalk['plant_id_egrid'], egrid_crosswalk['plant_id_eia']))

# Use the crosswalk ID where one exists, otherwise keep the EIA ID.
# Assigning the mapped column (instead of calling .update() on a column selection) does not depend on
# in-place mutation reaching the parent frame, which is not guaranteed under pandas Copy-on-Write.
plant_annual_total['plant_id_egrid'] = (
    plant_annual_total['plant_id_eia']
    .map(eia_to_egrid_id)
    .fillna(plant_annual_total['plant_id_eia'])
    .astype(plant_annual_total['plant_id_eia'].dtype)
)

# Load the eGRID plant table
############################

# load plant level data from egrid
egrid_plant = pd.read_excel(f'../data/egrid/egrid{year}_data.xlsx', 
                            sheet_name=f'PLNT{str(year)[-2:]}', 
                            header=1, 
                            usecols=['BACODE','PSTATABB', 'PLPRMFL','ORISPL', 'PNAME','PLGENATN', 'PLGENATR', 'PLHTIANT','UNCO2','PLCO2AN'])
# calculate total net generation from reported renewable and nonrenewable generation
egrid_plant['net_generation_mwh'] = egrid_plant['PLGENATN'] + egrid_plant['PLGENATR']
egrid_plant = egrid_plant.drop(columns=['PLGENATN', 'PLGENATR'])
# rename the columns
egrid_plant = egrid_plant.rename(columns={'BACODE':'ba_code',
                                          'PSTATABB':'state',
                                          'PLPRMFL':'energy_source_code',
                                          'ORISPL':'plant_id_egrid',
                                          'PNAME':'plant_name',
                                          'PLHTIANT':'heat_input_mmbtu',
                                          'UNCO2':'co2_mass_tons',
                                          'PLCO2AN':'co2_mass_tons_adjusted'})

# if egrid has a missing value for co2 for a clean plant, replace with zero
clean_fuels = ['SUN','MWH','WND', 'WAT','WH','PUR','NUC']
is_clean_plant = egrid_plant['energy_source_code'].isin(clean_fuels)
co2_cols = ['co2_mass_tons_adjusted','co2_mass_tons']
egrid_plant.loc[is_clean_plant, co2_cols] = egrid_plant.loc[is_clean_plant, co2_cols].fillna(0)

# reorder the columns                                         
egrid_plant = egrid_plant[['ba_code', 'state', 'plant_id_egrid', 'plant_name', 'net_generation_mwh', 'heat_input_mmbtu', 'co2_mass_tons', 'co2_mass_tons_adjusted']]

# remove any plants that have no reported data
# NOTE: it seems that egrid includes a lot of proposed projects that are not yet operating, but just has missing data for them
# (a row counts as "no data" when its four value columns sum to zero; missing values count as zero in that sum)
value_cols = ['net_generation_mwh','heat_input_mmbtu','co2_mass_tons','co2_mass_tons_adjusted']
plants_with_no_data_in_egrid = list(egrid_plant.loc[egrid_plant[value_cols].sum(axis=1) == 0, 'plant_id_egrid'])
egrid_plant = egrid_plant[~egrid_plant['plant_id_egrid'].isin(plants_with_no_data_in_egrid)]

# We also want to remove any plants that are located in Puerto Rico
# .copy() so the column added below is written to an independent frame rather than a filtered view
egrid_plant = egrid_plant[(egrid_plant['state'] != 'PR')].copy()

# create a column for eia id: use the crosswalk ID where one exists, otherwise keep the eGRID ID
egrid_plant['plant_id_eia'] = (
    egrid_plant['plant_id_egrid']
    .map(egrid_to_eia_id)
    .fillna(egrid_plant['plant_id_egrid'])
    .astype(egrid_plant['plant_id_egrid'].dtype)
)

## Identify plants that are missing from each dataset
Notes as of 4/21/22:
 - plant 7922 is being removed in gen_fuel_allocated because it is marked as a retired plant, and only has generation in the gf table
 - most of these other plants are being removed because they report 0 net generation, even though they have fuel consumption for electricity
 - not sure if these plants are exporting electricity to the grid. If not, it is probably safe to leave them off of the grid average emissions calculation

In [123]:
# identify any plants that are in egrid but not our totals, and any plants that are in our totals, but not egrid
# NOTE: plant_not_in_calc holds EIA plant IDs, while plants_not_in_egrid holds eGRID plant IDs
plant_not_in_calc = list(set(egrid_plant['plant_id_eia'].unique()) - set(plant_annual_total['plant_id_eia'].unique()))
plants_not_in_egrid = list(set(plant_annual_total['plant_id_egrid'].unique()) - set(egrid_plant['plant_id_egrid'].unique()))

# Which plants are included in eGRID but are missing from our calculations?
# Match on plant_id_eia, because that is the ID space plant_not_in_calc was built in.
missing_from_calc = egrid_plant[egrid_plant['plant_id_eia'].isin(plant_not_in_calc)]

#missing_from_calc.to_csv('../data/temp/plants_missing_from_calcs.csv', index=False)

# see if any of these plants are retired
generators_eia860 = load_data.load_pudl_table('generators_eia860', year=year)
retirement_dates = generators_eia860.groupby('plant_id_eia')['retirement_date'].unique().reset_index()
missing_from_calc.merge(retirement_dates, how='left', on='plant_id_eia')

Unnamed: 0,ba_code,state,plant_id_egrid,plant_name,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,co2_mass_tons_adjusted,plant_id_eia,retirement_date
0,CISO,CA,328,Borel,-204.0,,0.0,0.0,328,[2017-09-01]
1,MISO,IA,7922,Brooklyn City North Plant,42.0,,,,7922,[2011-09-01]


In [124]:
# Which plants are in our calculations, but are missing from eGRID?
plant_names = load_data.load_pudl_table('plants_entity_eia')[['plant_id_eia','plant_name_eia','sector_name_eia']]
# plants_not_in_egrid holds eGRID plant IDs, so it must be matched against plant_id_egrid
not_in_egrid = plant_annual_total['plant_id_egrid'].isin(plants_not_in_egrid)
missing_from_egrid = plant_annual_total[not_in_egrid].merge(plant_names, how='left', on='plant_id_eia')

missing_from_egrid

Unnamed: 0,ba_code,state,plant_id_eia,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,plant_id_egrid,plant_name_eia,sector_name_eia
0,AECI,MO,2127,0.0,0.00,0.000000,2127,Gallatin (MO),Electric Utility
1,AECI,MO,2141,0.0,0.00,0.000000,2141,Macon,Electric Utility
2,AECI,MO,2146,0.0,0.00,0.000000,2146,Monroe (MO),Electric Utility
3,AECI,MO,8110,0.0,0.00,0.000000,8110,Stanberry,Electric Utility
4,AECI,MO,56126,0.0,0.00,0.000000,56126,Sub 2 Generating Station,Electric Utility
...,...,...,...,...,...,...,...,...,...
356,,AK,57053,0.0,45.00,3.674700,57053,Alakanuk,Electric Utility
357,,AK,60250,0.0,441.00,36.012060,60250,Swampy Acres Microgrid,Electric Utility
358,,HI,774,0.0,0.00,0.000000,774,Waiau Hydro,Electric Utility
359,,HI,63280,0.0,0.00,0.000000,63280,"AES Kekaha Solar, LLC Hybrid",IPP Non-CHP


In [125]:
# how many of the plants missing from egrid have non-zero data
# Test each value for being non-zero instead of testing the row sum: a plant whose only data is
# negative net generation has a non-positive sum but still reports non-zero data.
reported_cols = ['net_generation_mwh','heat_input_mmbtu','co2_mass_tons']
has_nonzero_data = missing_from_egrid[reported_cols].fillna(0).ne(0).any(axis=1)
missing_from_egrid[has_nonzero_data]

Unnamed: 0,ba_code,state,plant_id_eia,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,plant_id_egrid,plant_name_eia,sector_name_eia
68,CISO,CA,57714,0.0,1.0,0.05844,57714,City of Palo Alto,Electric Utility
79,CISO,CA,61464,0.0,1276.0,104.1982,61464,"CoreSite Real Estate 2972 Stender, L.P.",IPP Non-CHP
80,CISO,CA,61474,0.0,223.0,18.21018,61474,CoreSite Real Estate 55 S. Market Street,Commercial Non-CHP
81,CISO,CA,61475,0.0,1270.0,103.7082,61475,"CoreSite Real Estate 1656 McCarthy, L.P.",Commercial Non-CHP
82,CISO,CA,61476,0.0,584.0,47.68944,61476,"CoreSite Real Estate 2901 Coronado, L.P.",Commercial Non-CHP
83,CISO,CA,61482,0.0,2249.0,183.6533,61482,"CoreSite Real Estate 3032 Coronado, L.P.",Commercial Non-CHP
91,CPLE,NC,54316,0.0,16737.0,981.8023,54316,Southport,Industrial CHP
97,DUK,NC,54363,0.0,204.0,16.65864,54363,PPG Industries Inc Shelby NC W,Industrial Non-CHP
106,FMPP,FL,7997,0.0,3589.0,293.0777,7997,Winston,Electric Utility
145,MISO,IA,1146,0.0,6.0,0.48996,1146,Harlan,Electric Utility


In [126]:
# identify where there is a single egrid plant id for multiple eia plant ids
shares_egrid_id = plant_annual_total['plant_id_egrid'].duplicated(keep=False)
# focus on net generation for now
# (select the column before summing so the text columns ba_code / state are never aggregated)
double_ids = plant_annual_total[shares_egrid_id].groupby('plant_id_egrid')['net_generation_mwh'].sum().reset_index()
# merge the egrid data
double_ids = double_ids.merge(egrid_plant[['plant_id_egrid','net_generation_mwh']], how='left', on='plant_id_egrid', suffixes=('_calc','_egrid'))
# relative difference of our combined total vs the eGRID value
double_ids['percent_diff'] = ((double_ids['net_generation_mwh_calc'] - double_ids['net_generation_mwh_egrid']) / double_ids['net_generation_mwh_egrid']).round(3)
double_ids

Unnamed: 0,plant_id_egrid,net_generation_mwh_calc,net_generation_mwh_egrid,percent_diff
0,562,297153.5,295534.003,0.005
1,762,13132.0,13132.0,-0.0
2,1416,3497026.0,3497026.0,0.0
3,2709,5829410.0,5829409.996,0.0
4,3612,4045388.0,4045388.003,-0.0
5,4076,110539.0,110539.0,0.0
6,10474,670411.0,670410.997,0.0
7,55306,8488507.0,8487498.0,0.0
8,55375,5447419.0,5447419.084,-0.0
9,55481,7276336.0,7276336.0,0.0


## Identify plants for which we are missing a BA assignment
(of the plants not already missing from our calculated totals)

In [127]:
# line up the BA code published in eGRID with the BA code in our totals, one row per matching plant_id_eia
egrid_ba_codes = egrid_plant.set_index('plant_id_eia')[['plant_name','ba_code']]
calculated_ba_codes = plant_annual_total.set_index('plant_id_eia')[['ba_code']]
ba_code_match = egrid_ba_codes.merge(calculated_ba_codes, how='inner', left_index=True, right_index=True, suffixes=("_egrid",'_calc'))

# plants with missing ba code
calc_ba_is_missing = ba_code_match['ba_code_calc'].isna()
egrid_ba_is_known = ba_code_match['ba_code_egrid'].notna()
ba_code_match[calc_ba_is_missing & egrid_ba_is_known]

Unnamed: 0_level_0,plant_name,ba_code_egrid,ba_code_calc
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


## Identify plants for which we have incorrectly assigned the BA code

In [128]:
# plants with incorrect ba code
# (our BA code is present but differs from the one published in eGRID)
ba_codes_differ = ba_code_match['ba_code_calc'].ne(ba_code_match['ba_code_egrid'])
calc_ba_is_known = ba_code_match['ba_code_calc'].notna()
ba_code_match[ba_codes_differ & calc_ba_is_known]

Unnamed: 0_level_0,plant_name,ba_code_egrid,ba_code_calc
plant_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


## Identify where our calculated totals do not match eGRID's totals

In [129]:
# Compare our calculated plant totals against eGRID, plant by plant:
#   compared         -> ratio calculated / eGRID per metric, plus a status label per metric
#   compared_merged  -> calculated and eGRID values side by side
#   comparison_count -> number of plants per status label and metric
metric_cols = ['net_generation_mwh','heat_input_mmbtu','co2_mass_tons_adjusted','co2_mass_tons']

# standardize column names and index so that the two dfs can be divided
# Select the value columns explicitly so that ID and text columns (plant_id_eia, ba_code, state) are never summed.
calculated_to_compare = plant_annual_total.groupby('plant_id_egrid')[['net_generation_mwh','heat_input_mmbtu','co2_mass_tons']].sum()

# create an adjusted co2 column
# TODO: remove this once we calculate adjusted emissions
calculated_to_compare['co2_mass_tons_adjusted'] = calculated_to_compare['co2_mass_tons']

# drop the plants that have no data in eGRID
# plants_with_no_data_in_egrid holds eGRID plant IDs, so it must be matched against plant_id_egrid
egrid_plant = egrid_plant[~egrid_plant['plant_id_egrid'].isin(plants_with_no_data_in_egrid)]

egrid_to_compare = egrid_plant.set_index(['plant_id_egrid']).drop(columns=['ba_code','state','plant_name','plant_id_eia'])

# divide calculated value by egrid value
compared = calculated_to_compare.div(egrid_to_compare).merge(egrid_plant[['plant_id_egrid','plant_name','ba_code', 'state']], how='left', left_index=True, right_on='plant_id_egrid').set_index('plant_id_egrid')
compared['plant_name'] = compared['plant_name'].fillna('unknown')

# create a dataframe that merges the two sources of data together
compared_merged = calculated_to_compare.merge(egrid_to_compare, how='outer', on='plant_id_egrid', suffixes=('_calc','_egrid'))

# for each column, change missing values to zero if both values are zero (only nan b/c divide by zero)
# (the ratio is set to 1, i.e. an exact match)
for col in metric_cols:
    # identify plants with zero values for both
    plant_ids = list(compared_merged[(compared_merged[f'{col}_calc'] == 0) & (compared_merged[f'{col}_egrid'] == 0)].index)
    compared.loc[compared.index.isin(plant_ids), col] = 1

# for each column, categorize the data based on how far it is off from egrid
# Bins are right-closed, so a ratio of exactly 0 lands in the 'negative' bin.
ratio_bins = [-999999999,0,0.5,0.9,0.99,0.9999,1,1.0001,1.01,1.1,1.5,999999999]
ratio_labels = ['negative','<50%','+/-50%','+/-10%','+/-1%','!exact','!exact','+/-1%','+/-10%','+/-50%','>50%']
for col in metric_cols:
    ratio_status = pd.cut(x=compared[col], bins=ratio_bins, labels=ratio_labels, ordered=False)
    # ratios that fall outside every bin (missing or infinite) come out as the string 'nan' after the cast
    compared[f'{col}_status'] = ratio_status.astype(str).replace('nan','missing')
    compared.loc[(compared.index.isin(plants_not_in_egrid)),f'{col}_status'] = 'not_in_egrid'

# identify which plants are missing from egrid vs calculated values
# The overrides are applied in this order, so a later rule wins if a plant matches more than one.
for col in metric_cols:
    calc_values = compared_merged[f'{col}_calc']
    egrid_values = compared_merged[f'{col}_egrid']
    status_overrides = [
        # plants that are missing in egrid
        ('missing_in_egrid', (calc_values > 0) & (egrid_values.isna())),
        # plants that are missing from our calculations
        ('missing_in_calc', (calc_values.isna()) & (egrid_values > 0)),
        # our calculations are missing a zero value
        ('missing_zero_in_calc', (calc_values.isna()) & (egrid_values == 0)),
        # egrid has a missing value instead of a zero
        ('missing_zero_in_egrid', (calc_values == 0) & (egrid_values.isna())),
        # egrid has a zero value where we have a positive value
        ('>50%', (calc_values > 0) & (egrid_values == 0)),
    ]
    for status_label, matches in status_overrides:
        matching_plant_ids = list(compared_merged[matches].index)
        compared.loc[compared.index.isin(matching_plant_ids), f'{col}_status'] = status_label

# create a dataframe that counts how many plants are in each category
status_counts = []
for col in metric_cols:
    count = compared.groupby(f'{col}_status', dropna=False)['plant_name'].count().rename(col)
    count.index = count.index.rename('status')
    status_counts.append(count)

comparison_count = pd.concat(status_counts, axis=1).fillna(0).astype(int)
# append a 'Total' row with the column sums
comparison_count = pd.concat([comparison_count, pd.DataFrame(comparison_count.sum().rename('Total')).T], axis=0)   
comparison_count

Unnamed: 0,net_generation_mwh,heat_input_mmbtu,co2_mass_tons_adjusted,co2_mass_tons
!exact,9827,8106,7404,8271
+/-1%,39,584,629,785
+/-10%,55,316,404,451
+/-50%,56,272,319,285
<50%,14,6,4,10
>50%,9,748,1159,117
missing,1,2,1,1
missing_in_calc,1,0,0,0
missing_in_egrid,1,56,163,163
missing_zero_in_egrid,361,324,328,328


In [130]:
# render the status counts as a markdown table (plain text, so it can be pasted into documentation)
comparison_markdown = comparison_count.to_markdown()
print(comparison_markdown)

|                       |   net_generation_mwh |   heat_input_mmbtu |   co2_mass_tons_adjusted |   co2_mass_tons |
|:----------------------|---------------------:|-------------------:|-------------------------:|----------------:|
| !exact                |                 9827 |               8106 |                     7404 |            8271 |
| +/-1%                 |                   39 |                584 |                      629 |             785 |
| +/-10%                |                   55 |                316 |                      404 |             451 |
| +/-50%                |                   56 |                272 |                      319 |             285 |
| <50%                  |                   14 |                  6 |                        4 |              10 |
| >50%                  |                    9 |                748 |                     1159 |             117 |
| missing               |                    1 |                  2 |           

In [131]:
# examine specific plants in a category
# pick the metric and the status label to inspect
value = 'net_generation_mwh'
status = '>50%'

#compared_merged.loc[64877,:]

# plants whose status for the chosen metric matches, ordered by their calculated / eGRID ratio
in_category = compared[f'{value}_status'] == status
compared.loc[in_category].sort_values(by=value)

Unnamed: 0_level_0,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,co2_mass_tons_adjusted,plant_name,ba_code,state,net_generation_mwh_status,heat_input_mmbtu_status,co2_mass_tons_adjusted_status,co2_mass_tons_status
plant_id_egrid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2707,1.642227,1.004128,1.122389,1.122389,Blewett,CPLE,NC,>50%,+/-1%,+/-50%,+/-50%
1599,1.669652,0.996201,0.995777,0.995777,Canal Station,ISNE,MA,>50%,+/-1%,+/-1%,+/-1%
7277,1.762447,1.605793,1.59336,1.59336,Lincoln Combustion Turbine,DUK,NC,>50%,>50%,>50%,>50%
335,1.856021,0.987818,0.987818,0.987818,AES Huntington Beach,CISO,CA,>50%,+/-10%,+/-10%,+/-10%
7288,2.060279,2.016973,1.99926,1.99926,Sherman Avenue,PJM,NJ,>50%,>50%,>50%,>50%
7790,2.066522,1.964549,1.967934,1.967934,Bonanza,PACE,UT,>50%,>50%,>50%,>50%
60698,2.154015,0.993232,0.992273,0.992273,Stanton Energy Reliability Center,CISO,CA,>50%,+/-1%,+/-1%,+/-1%
58256,2.660714,1.0,1.0,1.0,Los Alamos PV Site,PNM,NM,>50%,!exact,!exact,!exact
52089,5.031182,8.38859,0.999923,8.38794,Celanese Acetate LLC,PJM,VA,>50%,>50%,>50%,!exact


## Notes

Net generation < 50%:
 - 2617 has negative net generation
 - 10444, 59395: some plants are hybrid fossil / solar plants, but the primary energy source code is getting listed as PV for certain generators, which is causing an issue

Net generation > 50%:
 - 335 has two extra units (CT1 and CT2) that report to CEMS but not EIA. Likely correct
 - 7288 reports DFO to CEMS, but NG to EIA - there's probably some generators not reporting
 - 60698 is double counting generation from CEMS because of the 90% heat threshold in a month - might be fixed once we tackle heat input
 - 58256 is a solar/hybrid and in generators_entity_eia the battery portion is associated with a separate generator id (should only be 1)


Other:
 - plant 3754 has heat input in cems and eia that don't match
 - 2401 has generation in both cems and eia
 - 50933 might have allocation issue (doesn't appear in CEMS)

To check
 - 1404 reports generation to CEMS in December, but not to EIA. This is likely correct
 - plant 2504 has three units (120, 121, 122) that don't appear in EIA, and in CEMS only report steam. 


In [61]:
# show the egrid_plant row(s) for plant 1599
egrid_plant[egrid_plant['plant_id_eia'] == 1599]

Unnamed: 0,ba_code,state,plant_id_egrid,plant_name,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,co2_mass_tons_adjusted,plant_id_eia
4780,ISNE,MA,1599,Canal Station,177157.002,3172012.418,192024.506,192024.506,1599


In [73]:
# show the plant_annual_total row(s) for plant 1599
plant_annual_total[plant_annual_total['plant_id_eia'] == 1599]

Unnamed: 0,ba_code,state,plant_id_eia,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,plant_id_egrid
3296,ISNE,MA,1599,290310.714658,3159963.0,191213.588617,1599


In [74]:
# show the gen_fuel_allocated rows for plant 1599
gen_fuel_allocated[gen_fuel_allocated['plant_id_eia'] == 1599]

Unnamed: 0,plant_id_eia,generator_id,report_date,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_tons,ba_code,state,energy_source_code,data_source
31806,1599,1,2020-01-01,-891.620414,0.0,0.0,0.0,ISNE,MA,RFO,eia_only
31807,1599,1,2020-02-01,-785.095633,0.0,0.0,0.0,ISNE,MA,RFO,eia_only
31808,1599,1,2020-03-01,-1519.026378,0.0,0.0,0.0,ISNE,MA,RFO,eia_only
31809,1599,1,2020-04-01,-263.376189,0.0,0.0,0.0,ISNE,MA,RFO,eia_only
31810,1599,1,2020-05-01,-525.913667,0.0,0.0,0.0,ISNE,MA,RFO,eia_only
31811,1599,1,2020-06-01,-558.626,0.0,0.0,0.0,ISNE,MA,RFO,eia_only
31812,1599,1,2020-07-01,-173.003092,-2089.948,-2089.948,-170.51888,ISNE,MA,RFO,eia_only
31813,1599,1,2020-08-01,,,,,ISNE,MA,RFO,cems
31814,1599,1,2020-09-01,-248.278,0.0,0.0,0.0,ISNE,MA,RFO,eia_only
31815,1599,1,2020-10-01,-137.559556,0.0,0.0,0.0,ISNE,MA,RFO,cems


In [28]:
# aggregate the hourly CEMS records to one row per plant, unit, and report_date
# NOTE(review): .sum() is applied to every remaining column, so non-additive columns such as
# plant_id_epa and gross_to_net_ratio are summed as well and should not be read as ids/ratios
cems_unit_monthly = cems.groupby(['plant_id_eia','unitid','report_date']).sum().reset_index()

In [75]:
# show the unit-level monthly CEMS sums for plant 1599
cems_unit_monthly[cems_unit_monthly['plant_id_eia'] == 1599]

Unnamed: 0,plant_id_eia,unitid,report_date,operating_time_hours,gross_load_mw,gross_generation_mwh,steam_load_1000_lbs,heat_content_mmbtu,co2_mass_tons,plant_id_epa,net_generation_mwh,gross_to_net_ratio,co2_mass_tons_adjusted
5131,1599,1,2020-08-01,20.91,3615.0,3598.209961,0.0,40808.41,3311.083799,1189656,3331.033447,688.756055,3311.083799
5132,1599,1,2020-10-01,0.55,0.0,0.0,0.0,55.96125,4.53175,1189656,0.0,692.251767,4.53175
5133,1599,2,2020-06-01,1.02,0.0,0.0,0.0,152.397,9.07344,1151280,0.0,657.679852,9.07344
5134,1599,2,2020-07-01,43.900002,9995.0,9962.599609,0.0,121664.4,8286.40701,1189656,9681.212891,722.986237,8286.40701
5135,1599,2,2020-10-01,11.05,301.0,283.200012,0.0,4835.425,287.108501,1189656,263.502289,692.251767,287.108501
5136,1599,3,2020-01-01,36.889999,9440.0,9119.089844,0.0,90123.08,6212.165991,1189656,8427.0,687.534404,6212.165991
5137,1599,3,2020-02-01,9.21,2752.0,2569.540039,0.0,23856.79,1622.042035,1112904,2549.0,690.43641,1622.042035
5138,1599,3,2020-05-01,1.8,449.0,403.880005,0.0,3293.841,236.048813,1189656,-4.0,-7.368525,236.048813
5139,1599,3,2020-06-01,76.239998,20458.0,19890.650391,0.0,189180.4,11242.900164,1151280,18169.0,657.679852,11242.900164
5140,1599,3,2020-07-01,391.269989,99967.0,99615.351562,0.0,980179.3,58249.766915,1189656,96801.78125,722.986237,58249.766915


In [77]:
# show 10 randomly chosen CEMS rows for plant 1599
# (no random_state is passed, so a different set of rows is drawn on every run)
cems[cems['plant_id_eia'] == 1599].sample(10)

Unnamed: 0,plant_id_eia,unitid,cems_id,operating_datetime_utc,operating_time_hours,gross_load_mw,gross_generation_mwh,steam_load_1000_lbs,heat_content_mmbtu,co2_mass_tons,...,plant_id_epa,unit_id_epa,report_date,energy_source_code,ba_code,state,co2_mass_tons_adjusted,net_generation_mwh,gross_to_net_ratio,net_gen_method
11398584,1599,2,1599_2,2020-07-18 05:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,...,1599,1103,2020-07-01,RFO,ISNE,MA,0.0,0.0,0.973335,monthly_ratio
11445326,1599,1,1599_1,2020-08-12 19:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,...,1599,1102,2020-08-01,RFO,ISNE,MA,0.0,0.0,0.944278,annual_regression
11524452,1599,3,1599_3,2020-10-12 17:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,...,1599,88425,2020-10-01,NG,ISNE,MA,0.0,0.0,0.930446,monthly_ratio
11355876,1599,2,1599_2,2020-06-27 17:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,...,1599,1103,2020-06-01,RFO,ISNE,MA,0.0,0.0,0.941529,monthly_ratio
11355895,1599,2,1599_2,2020-06-28 12:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,...,1599,1103,2020-06-01,RFO,ISNE,MA,0.0,0.0,0.941529,monthly_ratio
11356509,1599,3,1599_3,2020-06-24 02:00:00+00:00,0.6,163.0,97.800003,,389.412018,23.148003,...,1599,88425,2020-06-01,NG,ISNE,MA,23.148003,92.081551,0.941529,monthly_ratio
11589732,1599,3,1599_3,2020-12-18 17:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,...,1599,88425,2020-12-01,,ISNE,MA,0.0,0.0,6.790533,monthly_ratio
11215161,1599,3,1599_3,2020-01-31 14:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,...,1599,88425,2020-01-01,,ISNE,MA,0.0,0.0,0.924105,monthly_ratio
11589568,1599,3,1599_3,2020-12-11 21:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,...,1599,88425,2020-12-01,,ISNE,MA,0.0,0.0,6.790533,monthly_ratio
11524115,1599,2,1599_2,2020-10-29 16:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,...,1599,1103,2020-10-01,RFO,ISNE,MA,0.0,0.0,0.930446,monthly_ratio


In [None]:
# show the cems_plant_annual row(s) for plant 10867
cems_plant_annual[cems_plant_annual['plant_id_eia'] == 10867]

In [None]:
# list the distinct cems_id values that appear for plant 61242
cems[cems['plant_id_eia'] == 61242]['cems_id'].unique()

In [None]:
# there are some plants that report heat input and co2 in CEMS, but are missing net generation data;
# this filter lists the rows of cems_plant_annual whose net_generation_mwh is exactly 0
# TODO: we should maybe try and fill net generation data using EIA-923?
cems_plant_annual[cems_plant_annual['net_generation_mwh'] == 0]

## Compare data at BA level

In [132]:
# load egrid BA totals
# the sheet name is built from the last two digits of `year` (e.g. 'BA20'), and the second row of the sheet is used as the header
egrid_ba = pd.read_excel(f'../data/egrid/egrid{year}_data.xlsx', sheet_name=f'BA{str(year)[-2:]}', header=1, usecols=['BANAME','BACODE','BAHTIANT','BANGENAN','BACO2AN'])
# rename the columns
egrid_ba = egrid_ba.rename(columns={'BANAME':'ba_name',
                                    'BACODE':'ba_code',
                                    'BAHTIANT':'heat_input_mmbtu',
                                    'BANGENAN':'net_generation_mwh',
                                    'BACO2AN':'co2_mass_tons'})

# divide our calculation by the BA totals from eGRID
# if there are 0 values, replace with 0.1, so that div by zero doesn't return missing value
# the result is a ratio per BA and metric: 1.0 means our total equals the eGRID total
ba_metric = plant_annual_total.groupby('ba_code', dropna=False).sum().drop(columns='plant_id_eia').replace(0,0.1).div(egrid_ba.set_index('ba_code').drop(columns='ba_name').replace(0,0.1)).sort_values(by='co2_mass_tons').round(3)

# the same ratio computed over all BAs combined, shaped as a single row labelled 'Total'
total = pd.DataFrame(plant_annual_total[['net_generation_mwh','heat_input_mmbtu','co2_mass_tons']].sum().div(egrid_ba[['net_generation_mwh','heat_input_mmbtu','co2_mass_tons']].sum()).rename('Total')).T

# calculate the difference in the number of plants in each region
# (count in our data minus count in eGRID, so a positive value means we have more plants than eGRID)
plant_count = (plant_annual_total.groupby('ba_code', dropna=False).count()['plant_id_egrid'] - egrid_plant.groupby('ba_code', dropna=False).count()['plant_id_egrid']).rename('num_plants')
# attach the plant-count difference and drop the plant_id_egrid column carried over from the groupby sum above
ba_metric = ba_metric.merge(plant_count, how='left', left_index=True, right_index=True).drop(columns=['plant_id_egrid']).sort_index()

# append the 'Total' row; it has no num_plants value
ba_metric = pd.concat([ba_metric, total], axis=0) 

# keep only the three compared metrics and the plant-count difference, in a fixed column order
ba_metric = ba_metric[['net_generation_mwh','heat_input_mmbtu','co2_mass_tons','num_plants']]

# temporarily lift the display limits so that every BA row is shown
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ba_metric)

Unnamed: 0,net_generation_mwh,heat_input_mmbtu,co2_mass_tons,num_plants
AEC,1.0,1.006,1.018,0.0
AECI,1.0,1.003,1.003,6.0
AMPL,1.0,1.0,1883.532,0.0
AVA,1.0,1.245,3.451,1.0
AVRN,1.0,1.001,1.003,0.0
AZPS,1.005,1.07,1.075,1.0
BANC,0.999,1.04,1.08,2.0
BPAT,1.0,1.205,1.808,8.0
CEA,1.0,1.0,1.0,0.0
CHPD,1.0,1.0,1.0,0.0


In [None]:
# fraction of each BA's total CO2 that comes from each data source
# aggfunc='sum' is set explicitly: pivot_table defaults to the mean, which would make the
# fractions below shares of per-row averages rather than shares of total CO2
data_source_by_ba = pd.pivot_table(gen_fuel_allocated, values='co2_mass_tons', index='ba_code', columns='data_source', aggfunc='sum', dropna=False)
# normalize each row so that the data sources of a BA sum to 1
data_source_by_ba = data_source_by_ba.div(data_source_by_ba.sum(axis=1), axis=0)
# show every BA, ordered by the share of CO2 that comes from CEMS
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(data_source_by_ba.round(3).sort_values(by='cems'))

In [None]:
# print the BA-level comparison as a markdown-formatted table
print(ba_metric.to_markdown())

### Notes
TEPC and SRP are off because the Gila River Generator is shared between SRP and TEPC, and eGRID reports all generation from this project as belonging to TEPC.

In [None]:
# show the plant_annual_total row(s) whose plant_id_egrid is 55306
plant_annual_total[plant_annual_total['plant_id_egrid'] == 55306]

In [None]:
# pair the plants that have no ba_code in our totals with the plants that have no ba_code in eGRID,
# matching on plant_id_egrid; overlapping columns get the suffixes '_calc' (ours) and '_egrid'
ba_plant_match = plant_annual_total[plant_annual_total['ba_code'].isna()].merge(egrid_plant[egrid_plant['ba_code'].isna()], how='left', on='plant_id_egrid', suffixes=("_calc",'_egrid'))
# show only the plants whose net generation, rounded to a whole number, differs between the two sources
# (rows without an eGRID match have NaN on the eGRID side, compare as unequal, and are therefore shown too)
ba_plant_match[ba_plant_match['net_generation_mwh_calc'].round(0) != ba_plant_match['net_generation_mwh_egrid'].round(0)]

In [None]:
# show the egrid_plant rows assigned to the TEPC balancing authority
egrid_plant[egrid_plant['ba_code'] == 'TEPC']

In [None]:
# show the rows of the plant-level comparison table assigned to the TEPC balancing authority
compared[compared['ba_code'] == 'TEPC']

# Assign monthly data to hourly profile
We now, in theory, have complete data on national-level heat input, net generation, and emissions, from a combination of two sources:
    1. hourly data from CEMS
    2. monthly data for generators that don't report to CEMS

For the second category of monthly data, we need to figure out how to allocate the monthly level data to each hour. 

In [209]:
# name of the column in energy_source_groups.csv that holds the fuel grouping to use
fuel_group_name = 'fuel_group_custom'
# load the mapping from energy_source_code to the chosen fuel grouping, exposing that grouping as 'fuel_category'
energy_source_groups = pd.read_csv('../data/manual/energy_source_groups.csv')[['energy_source_code',fuel_group_name]].rename(columns={fuel_group_name:'fuel_category'})

In [208]:
# Until we have shapes to distribute to from EIA-930, create a synthetic flat profile
# (1.0 in every hour) for each BA and fuel category

# create lists of all bas and fuel types
ba_list = list(monthly_eia_data_to_distribute['ba_code'].dropna().unique())
fuel_list = list(energy_source_groups['fuel_category'].unique())

# create an hourly datetime series in local time for each ba/fuel type
hourly_profiles = []

for ba in ba_list:
    # the hourly timestamps depend only on the BA, so build them once per BA
    # instead of once for every BA/fuel combination
    datetime_local = pd.date_range(start=f'{year}-01-01 00:00:00', end=f'{year}-12-31 23:00:00', freq='H', tz=data_cleaning.ba_timezone(ba=ba, format='GMT'), name='datetime_local')
    # report_date is the first day of the month of the local timestamp (timezone-naive)
    report_date = pd.to_datetime(datetime_local.strftime('%Y-%m'))

    for fuel in fuel_list:
        # one row per hour with a constant profile value of 1.0
        df_temp = pd.DataFrame({'datetime_local': datetime_local,
                                'ba_code': ba,
                                'fuel_category': fuel,
                                'profile': 1.0,
                                'report_date': report_date})
        hourly_profiles.append(df_temp)

hourly_profiles = pd.concat(hourly_profiles, axis=0, ignore_index=True)

In [211]:
def distribute_monthly_eia_data_to_hourly(monthly_eia_data_to_distribute, hourly_profiles, energy_source_groups):
    """
    Uses monthly-level EIA data and assigns an hourly profile

    Monthly totals are first summed by BA, fuel category, and month. Each total is then spread over the
    hours of that BA/fuel/month in proportion to the hourly `profile` value, so the hourly values of a
    month add back up to the monthly total.

    Inputs:
        monthly_eia_data_to_distribute: a dataframe that contains monthly total net generation, fuel consumption, and co2 data
            (columns net_generation_mwh, fuel_consumed_mmbtu, co2_mass_tons), along with columns for report_date,
            ba_code, and energy_source_code
        hourly_profiles: a dataframe with one row per hour, containing the columns ba_code, fuel_category,
            report_date, datetime_local, and profile (the relative weight of that hour)
        energy_source_groups: a dataframe that maps each energy_source_code to a fuel_category
    Returns:
        a dataframe with the rows and columns of hourly_profiles, plus the hourly net_generation_mwh,
        fuel_consumed_mmbtu, and co2_mass_tons, an operating_datetime_utc column, and a data_source column set to 'EIA'.
        Hours with no matching monthly data have missing values in the three data columns.
    """
    group_keys = ['ba_code','fuel_category','report_date']
    data_columns = ['net_generation_mwh','fuel_consumed_mmbtu','co2_mass_tons']

    # assign a fuel group to each energy source code in the dataframe
    monthly_eia_ba_fuel = monthly_eia_data_to_distribute.merge(energy_source_groups[['energy_source_code','fuel_category']], how='left', on='energy_source_code')

    # calculate totals by BA, Fuel Group, and Month
    # select the data columns before summing so that id/text columns are never aggregated
    monthly_eia_ba_fuel = monthly_eia_ba_fuel.groupby(group_keys)[data_columns].sum().reset_index()

    # calculate the total monthly net generation profile by BA and fuel group
    # only the profile column is summed: summing the datetime column raises a TypeError in pandas >= 2.0
    monthly_profile_total = hourly_profiles.groupby(group_keys)[['profile']].sum().reset_index()

    # merge the total monthly profile into the monthly totals
    monthly_eia_ba_fuel = monthly_eia_ba_fuel.merge(monthly_profile_total, how='left', on=group_keys)

    # calculate how much net generation, fuel, and co2 should be assigned to each unit of net generation in the profile
    for col in data_columns:
        monthly_eia_ba_fuel[col] = monthly_eia_ba_fuel[col] / monthly_eia_ba_fuel['profile']

    # drop the profile column and merge the hourly generation, fuel, and co2 factors back into the profile timeseries data
    monthly_eia_ba_fuel = monthly_eia_ba_fuel.drop(columns='profile')
    hourly_eia_data = hourly_profiles.merge(monthly_eia_ba_fuel, how='left', on=group_keys)

    # multiply each factor by the profile to calculate the hourly shape
    for col in data_columns:
        hourly_eia_data[col] = hourly_eia_data[col] * hourly_eia_data['profile']

    # add a UTC timestamp column to match the CEMS data
    hourly_eia_data['operating_datetime_utc'] = pd.to_datetime(hourly_eia_data['datetime_local'], utc=True)

    # create a column identifying the source of the data
    hourly_eia_data['data_source'] = 'EIA'

    return hourly_eia_data


In [213]:
# spread the monthly EIA totals across the hours of the profiles built above
hourly_eia_data = distribute_monthly_eia_data_to_hourly(monthly_eia_data_to_distribute, hourly_profiles, energy_source_groups)

# Output data 

Save the data to a CSV so that generating the hourly eGRID numbers and analyzing them can be done separately.

In [216]:
# assign a fuel group to each observation in CEMS
cems_ba_fuel = cems.merge(energy_source_groups[['energy_source_code','fuel_category']], how='left', on='energy_source_code')

# aggregate cems data by BA and fuel type
# the data columns are selected before summing, so id/text/ratio columns in the hourly CEMS frame are
# never aggregated (this avoids wasted work and type errors on non-numeric columns in pandas >= 2.0)
cems_data_columns = ['gross_generation_mwh','net_generation_mwh','heat_content_mmbtu','co2_mass_tons','co2_mass_tons_adjusted']
cems_ba_fuel = cems_ba_fuel.groupby(['ba_code','fuel_category','operating_datetime_utc'])[cems_data_columns].sum().reset_index()
# label the rows so they can be told apart from the EIA-derived data
cems_ba_fuel['data_source'] = 'CEMS'

In [217]:
# stack the hourly CEMS data and the hourly-distributed EIA data into one long table,
# dropping the helper columns that only exist on the EIA side
# NOTE(review): the CEMS frame carries 'heat_content_mmbtu' while the EIA frame carries 'fuel_consumed_mmbtu',
# so the combined table has both columns, each missing for the other data source — confirm this is intended
combined_data = pd.concat([cems_ba_fuel, hourly_eia_data.drop(columns=['datetime_local','profile','report_date'])], axis=0)
combined_data

Unnamed: 0,ba_code,fuel_category,operating_datetime_utc,gross_generation_mwh,net_generation_mwh,heat_content_mmbtu,co2_mass_tons,co2_mass_tons_adjusted,data_source,datetime_local,profile,report_date,fuel_consumed_mmbtu
0,AEC,coal,2020-03-01 06:00:00+00:00,0.0,0.000000,0.0,0.0,0.0,CEMS,,,NaT,
1,AEC,coal,2020-03-01 07:00:00+00:00,0.0,0.000000,0.0,0.0,0.0,CEMS,,,NaT,
2,AEC,coal,2020-03-01 08:00:00+00:00,0.0,0.000000,0.0,0.0,0.0,CEMS,,,NaT,
3,AEC,coal,2020-03-01 09:00:00+00:00,0.0,0.000000,0.0,0.0,0.0,CEMS,,,NaT,
4,AEC,coal,2020-03-01 10:00:00+00:00,0.0,0.000000,0.0,0.0,0.0,CEMS,,,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6667051,CPLW,other,2021-01-01 00:00:00+00:00,,-0.018817,,0.0,,EIA,2020-12-31 19:00:00-05:00,1.0,2020-12-01,0.0
6667052,CPLW,other,2021-01-01 01:00:00+00:00,,-0.018817,,0.0,,EIA,2020-12-31 20:00:00-05:00,1.0,2020-12-01,0.0
6667053,CPLW,other,2021-01-01 02:00:00+00:00,,-0.018817,,0.0,,EIA,2020-12-31 21:00:00-05:00,1.0,2020-12-01,0.0
6667054,CPLW,other,2021-01-01 03:00:00+00:00,,-0.018817,,0.0,,EIA,2020-12-31 22:00:00-05:00,1.0,2020-12-01,0.0


In [220]:
# choose the balancing authority and the data column to plot
ba = 'CISO'
data = 'co2_mass_tons'

# order in which the fuel categories are listed in the chart
fuel_order = ['nuclear','coal','natural_gas','petroleum','other','waste','biomass','geothermal','hydro','solar','wind']

# fixed color for each fuel category
fuel_color = {
    'natural_gas': 'sienna',
    'coal': 'black',
    'nuclear': 'red',
    'biomass': 'green',
    'geothermal': 'orange',
    'wind': 'blue',
    'solar': 'gold',
    'petroleum': 'purple',
    'hydro': 'skyblue',
    'other': 'lightgrey',
    'waste': 'pink',
}

# keep the rows of the selected BA and total them per hour, fuel category, and data source
rows_for_ba = combined_data['ba_code'] == ba
data_to_graph = (
    combined_data[rows_for_ba]
    .groupby(['operating_datetime_utc','fuel_category','data_source'])
    .sum()
    .reset_index()
)

# area chart over time, colored by fuel category, with one panel per data source
area_chart = px.area(
    data_to_graph,
    x='operating_datetime_utc',
    y=data,
    color='fuel_category',
    color_discrete_map=fuel_color,
    facet_col='data_source',
    template='plotly_white',
    title=f'Hourly data for {ba} by fuel type',
    category_orders={'fuel_category': fuel_order},
)
# hide the outline of each area so only the fill is drawn
area_chart.update_traces(line={'width':0})