# eGRID Annual Database

NOTE: Query numbers mentioned throughout the code (e.g. 1g01) refer to the SQL queries in the MS Access database version of eGRID on which this code is based.

In [None]:
# Standard libraries
import calendar
import importlib
import importlib.resources
import logging
import os
import pathlib
import sys

# 3rd party libraries
import numpy as np
import pandas as pd
import sqlalchemy as sa

# Local libraries
import pudl

## Notebook Parameters

In [None]:
# Data year that every table pulled in this notebook is limited to
EGRID_YEAR = 2018

# Connect to the PUDL database described by the default workspace settings
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])

# Output-table accessors, one per aggregation frequency, both bounded to EGRID_YEAR.
# Frequency aliases are listed at
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
year_start = f'{EGRID_YEAR}-01-01'
year_end = f'{EGRID_YEAR}-12-31'
pudl_out_annual = pudl.output.pudltabl.PudlTabl(
    pudl_engine, freq='AS', start_date=year_start, end_date=year_end)  # annual frequency
pudl_out_monthly = pudl.output.pudltabl.PudlTabl(
    pudl_engine, freq='MS', start_date=year_start, end_date=year_end)  # monthly frequency

# Base path used by the (commented-out) CSV data package examples in this notebook
datapkg_dir = pudl_settings['datapkg_dir'] + '/to_parquet/epacems-eia/data/'

In [None]:
# test accessing db: pull one table through the annual output object and list its columns
# NOTE(review): the variable is named eia_860 but it is filled from bf_eia923();
# confirm which table was intended. In the code visible here it is only used to inspect columns.
eia_860 = pudl_out_annual.bf_eia923()
eia_860.columns

In [None]:
# Examples of the two ways a table can be read in this notebook
# 1) from the CSV data package, filtered to EGRID_YEAR (kept for reference, not executed):
#pd.read_csv(datapkg_dir + 'generators_eia860.csv', parse_dates=['report_date']).query('report_date.dt.year == @EGRID_YEAR')

# 2) from the PUDL database, through the monthly output object
pudl_out_monthly.gen_eia923()

# Generator (GEN) File
This file includes generation from steam boilers and nuclear units in the EIA-923, plant prime movers in the EIA-923 that have only one generator in the EIA-860, and the EIA-923 plant prime movers where generation is distributed to the generator level based on nameplate capacity.


### 1. Create EIA-860 Generator Combined table
This table includes operable, proposed, and retired units  
(Queries 1g01, 1g02, 1g03, 1g04, 1g05)

In [None]:
# Load the EIA-860 generator records from the PUDL annual output table.
# Prime mover, state and plant name are selected directly from this table.
eia_860_gen_columns = [
    'plant_id_eia',
    'generator_id',
    'plant_name_eia',
    'state',
    'operational_status_code',
    'prime_mover_code',
    'energy_source_code_1',
    'capacity_mw',
    'planned_retirement_date',
    'retirement_date',
    'report_date',
]
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the PUDL output (avoids SettingWithCopyWarning).
gen_file = pudl_out_annual.gens_eia860()[eia_860_gen_columns].copy()

# parse datetime columns
retirement_date_columns = ['planned_retirement_date', 'retirement_date']
gen_file[retirement_date_columns] = gen_file[retirement_date_columns].apply(pd.to_datetime)

# Add new columns (filled in by later steps)
# np.nan is used because the np.NaN alias was removed in NumPy 2.0
gen_file['sequence_number'] = np.nan
gen_file['CFACT'] = np.nan
gen_file['NUMBLR'] = 0
gen_file['NETGEN'] = np.nan
gen_file['NETGENOZ'] = np.nan
# empty string marks "no generation source assigned yet"; step 4 filters on this value
gen_file['data_source'] = ''

# Combine the actual and planned retirement dates (actual takes precedence),
# keep only the year, and drop the two date columns
gen_file['retirement_year'] = (
    gen_file['retirement_date'].fillna(gen_file['planned_retirement_date']).dt.year
)
gen_file = gen_file.drop(columns=retirement_date_columns)

# drop plants not connected to grid
# NOTE: in 1g04, only Plant 10788 is dropped
# The packaged CSV is opened in a context manager so the handle is closed after reading.
with importlib.resources.open_text(
        'pudl.package_data.epa.egrid',
        'table_4-2_plants_not_connected_to_grid.csv') as plants_not_connected_csv:
    non_grid_connected_plant_ids = pd.read_csv(
        plants_not_connected_csv, usecols=['Plant ID'])['Plant ID'].tolist()
gen_file = gen_file[~gen_file['plant_id_eia'].isin(non_grid_connected_plant_ids)]

# key the table by plant and generator so later steps can use DataFrame.update()
gen_file = gen_file.set_index(['plant_id_eia', 'generator_id'])

gen_file.head(3)

### 2. Count number of boilers per generator
(Queries 1g07, 1g08, 1g09)

In [None]:
# Boiler-generator association records from the PUDL annual output
boiler_generator_assn = pudl_out_annual.bga_eia860()[['plant_id_eia', 'generator_id', 'boiler_id']]

# tally the boiler_id entries recorded against each (plant, generator) pair
number_of_boilers = (
    boiler_generator_assn
    .groupby(['plant_id_eia', 'generator_id'])
    .count()
    .rename(columns={'boiler_id': 'NUMBLR'})
)

# copy the counts into gen_file; generators with no match keep their existing NUMBLR
gen_file.update(number_of_boilers)

gen_file.head(2)

### 3. Update Net Generation Data
(Queries 1g10, 1g11)  
NOTE: 1g11 not necessary because generator ids between EIA-860 and EIA-923 have already been standardized in the PUDL data package

In [None]:
# Monthly generator-level net generation from the EIA-923 generator table
eia_923_generator = pudl_out_monthly.gen_eia923()[['report_date', 'plant_id_eia', 'generator_id', 'net_generation_mwh']]

# Flag the ozone-season months (May - September, inclusive) before report_date is removed
generator_is_ozone_season = eia_923_generator['report_date'].dt.month.between(5, 9)

# report_date is dropped before BOTH sums: newer pandas versions no longer discard
# non-numeric columns silently in groupby sums, and a datetime column cannot be summed.
generator_monthly_netgen = eia_923_generator.drop(columns='report_date')

# sum annual net generation
eia_923_generator_NETGEN = (
    generator_monthly_netgen
    .groupby(['plant_id_eia', 'generator_id'])
    .sum()
    .rename(columns={'net_generation_mwh': 'NETGEN'})
)
eia_923_generator_NETGEN['data_source'] = 'EIA-923 Generator File'

# sum ozone season net generation
eia_923_generator_NETGENOZ = (
    generator_monthly_netgen[generator_is_ozone_season]
    .groupby(['plant_id_eia', 'generator_id'])
    .sum()
    .rename(columns={'net_generation_mwh': 'NETGENOZ'})
)
eia_923_generator_NETGENOZ['data_source'] = 'EIA-923 Generator File'

# merge this data into gen_file (matching on the plant/generator index)
gen_file.update(eia_923_generator_NETGEN)
gen_file.update(eia_923_generator_NETGENOZ)

gen_file.head(2)

### 4. Distribute generation
(Queries 1g12, 1g13, 1g14, 1g15, 1g16, 1g18, 1g19)

In [None]:
# Sum net generation by prime mover (1g12)
net_gen_by_PM = gen_file.reset_index()[['plant_id_eia', 'prime_mover_code', 'NETGEN', 'NETGENOZ']].groupby(['plant_id_eia', 'prime_mover_code']).sum()

# Sum EIA-923 Generation and Fuel by prime mover (1g13)
# Load the monthly EIA-923 Generation and Fuel data
eia_923_gen_fuel = pudl_out_monthly.gf_eia923()[['report_date', 'plant_id_eia', 'prime_mover_code', 'net_generation_mwh']]

# Flag the ozone-season months (May - September, inclusive) before report_date is removed
gf_is_ozone_season = eia_923_gen_fuel['report_date'].dt.month.between(5, 9)

# report_date is dropped before BOTH sums: newer pandas versions no longer discard
# non-numeric columns silently in groupby sums, and a datetime column cannot be summed.
gf_monthly_netgen = eia_923_gen_fuel.drop(columns='report_date')

# Sum annual and ozone season net generation by plant and prime mover
eia_923_gen_fuel_NETGEN = (
    gf_monthly_netgen
    .groupby(['plant_id_eia', 'prime_mover_code'])
    .sum()
    .rename(columns={'net_generation_mwh': 'NETGEN'})
    .reset_index()
)
eia_923_gen_fuel_NETGENOZ = (
    gf_monthly_netgen[gf_is_ozone_season]
    .groupby(['plant_id_eia', 'prime_mover_code'])
    .sum()
    .rename(columns={'net_generation_mwh': 'NETGENOZ'})
    .reset_index()
)

# Calculate the difference between PM net generation from gen_file and EIA-923 Generation and Fuel (1g14)
# Merge the gen_file totals with the Generation and Fuel totals. Each merge suffixes only the
# column name that collides: NETGEN in the first merge, NETGENOZ in the second.
netgen_diff_by_PM = net_gen_by_PM.merge(eia_923_gen_fuel_NETGEN, how='left', on=['plant_id_eia','prime_mover_code'], suffixes=('_gen','_923gf'))
netgen_diff_by_PM = netgen_diff_by_PM.merge(eia_923_gen_fuel_NETGENOZ, how='left', on=['plant_id_eia','prime_mover_code'], suffixes=('_gen','_923gf'))

# Difference between the two sources, after rounding each to whole units;
# a missing gen_file total is treated as zero
netgen_diff_by_PM['netgen_diff_by_PM'] = netgen_diff_by_PM['NETGEN_923gf'].round(decimals=0) - netgen_diff_by_PM['NETGEN_gen'].fillna(0).round(decimals=0)
netgen_diff_by_PM['netgen_diff_by_PM_oz'] = netgen_diff_by_PM['NETGENOZ_923gf'].round(decimals=0) - netgen_diff_by_PM['NETGENOZ_gen'].fillna(0).round(decimals=0)

netgen_diff_by_PM.head(3)

In [None]:
# get units in gen_file without generation (1g15)
np_capacity = gen_file[['prime_mover_code', 'NETGEN', 'NETGENOZ', 'operational_status_code', 'data_source', 'capacity_mw', 'retirement_year']]

# only keep generators that have no data source assigned yet and are either in one of the
# listed operational statuses or were retired in EGRID_YEAR
eligible_status_codes = ["OP", "SB", "OS", "OA", "IP", "TS", "U", "V"]
has_no_data_source = np_capacity['data_source'] == ''
has_eligible_status = np_capacity['operational_status_code'].isin(eligible_status_codes)
retired_in_egrid_year = (
    (np_capacity['operational_status_code'] == 'RE')
    & (np_capacity['retirement_year'] == EGRID_YEAR)
)
np_capacity = np_capacity[has_no_data_source & (has_eligible_status | retired_in_egrid_year)]

# 1g16 (group by prime mover, status and retirement year) is currently disabled:
# np_capacity_by_PM = np_capacity.reset_index().drop(columns=['NETGEN','NETGENOZ']).groupby(['plant_id_eia','prime_mover_code','operational_status_code','retirement_year'], dropna=False).sum()

# total nameplate capacity of these generators per plant and prime mover (1g17)
# Only capacity_mw is summed, and it is selected explicitly: newer pandas versions keep
# string columns (generator_id, status, data_source) in a groupby sum instead of dropping
# them, which would create suffixed duplicates in the merge below.
np_capacity_by_plant = (
    np_capacity.reset_index()
    .groupby(['plant_id_eia', 'prime_mover_code'])[['capacity_mw']]
    .sum()
    .rename(columns={'capacity_mw': 'capacity_mw_by_plant'})
)

# calculate nameplate capacity ratio (1g18): each generator's share of its plant/prime-mover total
np_capacity = np_capacity.reset_index().merge(np_capacity_by_plant.reset_index(), how='left', on=['plant_id_eia','prime_mover_code'])
np_capacity['capacity_ratio'] = np_capacity['capacity_mw'] / np_capacity['capacity_mw_by_plant']
np_capacity = np_capacity.sort_values(by=['plant_id_eia','generator_id'])

np_capacity.head(5)

In [None]:
# Distribute generation by prime mover where missing (1g19)
# Pair every generator lacking generation with the plant/prime-mover difference computed in 1g14
pm_differences = netgen_diff_by_PM[['plant_id_eia', 'prime_mover_code', 'netgen_diff_by_PM', 'netgen_diff_by_PM_oz']]
generation_and_fuel_to_distribute = np_capacity.merge(
    pm_differences, how='inner', on=['plant_id_eia', 'prime_mover_code'])
generation_and_fuel_to_distribute = generation_and_fuel_to_distribute.drop(
    columns=['operational_status_code', 'capacity_mw', 'retirement_year', 'capacity_mw_by_plant'])

# Each generator receives its capacity share of the annual and ozone-season differences
capacity_share = generation_and_fuel_to_distribute['capacity_ratio']
generation_and_fuel_to_distribute['NETGEN_to_use'] = (
    generation_and_fuel_to_distribute['netgen_diff_by_PM'] * capacity_share)
generation_and_fuel_to_distribute['NETGENOZ_to_use'] = (
    generation_and_fuel_to_distribute['netgen_diff_by_PM_oz'] * capacity_share)

# label where these values came from
generation_and_fuel_to_distribute['data_source'] = 'Distributed from 923 Generation And Fuel'

generation_and_fuel_to_distribute.head(2)

In [None]:
# Update gen file (1g20 & 1g21)
# Re-key gen_file by plant, generator and prime mover so it lines up with the distributed values
gen_pm_key = ['plant_id_eia', 'generator_id', 'prime_mover_code']
gen_file = gen_file.reset_index().set_index(gen_pm_key)

distributed_generation = (
    generation_and_fuel_to_distribute
    .set_index(gen_pm_key)[['NETGEN_to_use', 'NETGENOZ_to_use', 'data_source']]
    .rename(columns={'NETGEN_to_use': 'NETGEN', 'NETGENOZ_to_use': 'NETGENOZ'})
)

# Fill generation only where gen_file has no value yet
gen_file.update(distributed_generation[['NETGEN', 'NETGENOZ']], overwrite=False)

# data_source must be updated with overwrite=True: its placeholder is '' (not NA), so
# overwrite=False would never replace it and the source label would be lost.
# Every row here was selected in 1g15 because its data_source was still ''.
gen_file.update(distributed_generation[['data_source']])
gen_file

### 5. Add Capacity Factors
(Query 1g21)

In [None]:
# Hours in EGRID_YEAR: 366 days in a leap year (8784 h), otherwise 365 days (8760 h)
days_in_year = 366 if calendar.isleap(EGRID_YEAR) else 365
hours_in_year = days_in_year * 24

# Capacity factor: annual net generation divided by (capacity x hours in the year)
max_possible_generation = gen_file['capacity_mw'] * hours_in_year
gen_file['CFACT'] = gen_file['NETGEN'] / max_possible_generation

### 6. Update Fuel type of "Other Gas" (OG)
(Query 1g22)  
NOTE: This loads data from a static table; the original source of these data is not known.

In [None]:
# Load the static table of replacement fuel type codes, keyed by plant and generator.
# The packaged CSV is opened in a context manager so the handle is closed after reading.
with importlib.resources.open_text(
        'pudl.package_data.epa.egrid', 'updated_fuel_type_codes.csv') as fuel_type_codes_csv:
    updated_fuel_type_codes = pd.read_csv(
        fuel_type_codes_csv,
        usecols=['plant_id_eia', 'generator_id', 'updated_fuel_type_code'],
        index_col=['plant_id_eia', 'generator_id'],
    ).rename(columns={'updated_fuel_type_code': 'energy_source_code_1'})

# gen_file may carry prime_mover_code as a third index level (set in the step 4 update cell),
# while this table is keyed by plant and generator only. Align the indexes explicitly
# instead of relying on how update() reindexes across MultiIndexes of different depth,
# then restore gen_file's original index.
fuel_update_index_names = list(gen_file.index.names)
gen_file = gen_file.reset_index().set_index(['plant_id_eia', 'generator_id'])
gen_file.update(updated_fuel_type_codes)
gen_file = gen_file.reset_index().set_index(fuel_update_index_names)

### 7. Update net generation data from Ventyx data
(Query 1g23)  
NOTE: The Ventyx data may be confidential so for now I am not including it in this code. This step only updates the net generation total for a single generator (plant_id 58478, generator_id CC01), which does not seem to exist in the gen_file

In [None]:
# Show any gen_file rows for the plant id mentioned in the Ventyx note (first index level)
ventyx_plant_id = 58478
gen_file[gen_file.index.get_level_values(0) == ventyx_plant_id]

### 8. Distribute Generation, Part 2 (can this be part of step 4?)

NOTE: This seems to duplicate step 4 in some ways. Step 4 uses the differences to fill in missing data in the gen file, whereas this step attempts to reconcile differences between the two data sources.  
(Queries 1g24, 1g25, 1g26, 1g27, 1g28, 1g29)

In [None]:
plant_pm_key = ['plant_id_eia', 'prime_mover_code']

# gen_file net generation totalled per plant and prime mover (1g24)
netgen_by_PM_from_gen_file = (
    gen_file.reset_index()[plant_pm_key + ['NETGEN', 'NETGENOZ']]
    .groupby(plant_pm_key)
    .sum()
)

# EIA-923 Generation and Fuel totals per plant and prime mover (1g25)
# NOTE: the annual and ozone-season sums were already computed in step 4, so they are combined here
netgen_by_PM_from_gf_eia923 = eia_923_gen_fuel_NETGEN.merge(
    eia_923_gen_fuel_NETGENOZ, how='outer', on=plant_pm_key)

# put the two sources side by side (1g26)
compare_net_generation = netgen_by_PM_from_gen_file.merge(
    netgen_by_PM_from_gf_eia923, how='inner', on=plant_pm_key, suffixes=("_GenFile", "_923"))

# absolute and relative difference; a missing gen_file total counts as zero
gen_file_total = compare_net_generation['NETGEN_GenFile'].fillna(0)
compare_net_generation['AbsDiff'] = compare_net_generation['NETGEN_923'] - gen_file_total
compare_net_generation['PctDiff'] = compare_net_generation['AbsDiff'] / compare_net_generation['NETGEN_923']

# keep only rows whose relative difference exceeds 0.001 in absolute value
exceeds_tolerance = compare_net_generation['PctDiff'].abs() > 0.001
compare_net_generation = compare_net_generation[exceeds_tolerance].sort_values(by='AbsDiff')
compare_net_generation

In [None]:
# nameplate capacity per plant and prime mover (1g27)
np_capacity = gen_file.reset_index()[['plant_id_eia', 'prime_mover_code', 'operational_status_code', 'capacity_mw', 'retirement_year']]

# keep generators that are in one of the listed operational statuses or were retired in EGRID_YEAR
# (unlike 1g15, there is no filter on missing data here)
status_code = np_capacity['operational_status_code']
has_listed_status = status_code.isin(["OP", "SB", "OS", "OA", "IP", "TS", "U", "V"])
retired_in_egrid_year = (status_code == 'RE') & (np_capacity['retirement_year'] == EGRID_YEAR)
np_capacity = np_capacity[has_listed_status | retired_in_egrid_year]

# sum capacity_mw within each plant / prime mover group (missing keys form their own group)
np_capacity_by_PM = (
    np_capacity
    .groupby(['plant_id_eia', 'prime_mover_code'], dropna=False)[['capacity_mw']]
    .sum()
)

# each generator's capacity as a share of its plant / prime mover total (1g28)
generator_capacity = gen_file.reset_index()[['plant_id_eia', 'prime_mover_code', 'generator_id', 'capacity_mw']]
np_capacity_ratio = generator_capacity.merge(
    np_capacity_by_PM.reset_index(),
    how='inner',
    on=['plant_id_eia', 'prime_mover_code'],
    suffixes=("", "_sum_by_PM"),
)
np_capacity_ratio['ratio'] = np_capacity_ratio['capacity_mw'] / np_capacity_ratio['capacity_mw_sum_by_PM']
np_capacity_ratio = np_capacity_ratio.sort_values(by=['plant_id_eia', 'generator_id'])

np_capacity_ratio

In [None]:
# redistribute generation (1g29)
# Combine the capacity ratios with the EIA-923 Generation and Fuel totals, keeping only the
# plant / prime mover groups that were flagged in compare_net_generation
gf_to_distribute_2 = np_capacity_ratio.merge(netgen_by_PM_from_gf_eia923, how='inner', on=['plant_id_eia','prime_mover_code'])
gf_to_distribute_2 = gf_to_distribute_2.merge(compare_net_generation, how='inner', on=['plant_id_eia','prime_mover_code'])
# .copy() so the column assignments below operate on an independent frame, not a slice
gf_to_distribute_2 = gf_to_distribute_2[['plant_id_eia','generator_id','prime_mover_code','ratio','NETGEN','NETGENOZ']].copy()

# calculate how much generation to distribute based on ratio
gf_to_distribute_2['NETGEN'] = (gf_to_distribute_2['ratio'] * gf_to_distribute_2['NETGEN']).round(3)
gf_to_distribute_2['NETGENOZ'] = (gf_to_distribute_2['ratio'] * gf_to_distribute_2['NETGENOZ']).round(3)

# update data_source
gf_to_distribute_2['data_source'] = 'Data from EIA-923 Generator File overwritten with distributed data from EIA-923 Generation and Fuel'

# prepare for updating: key by plant and generator only
gf_to_distribute_2 = gf_to_distribute_2.drop(columns=['prime_mover_code','ratio'])
gf_to_distribute_2 = gf_to_distribute_2.set_index(['plant_id_eia','generator_id'])

# Update gen_file (1g30)
# gen_file may carry prime_mover_code as a third index level (set in the step 4 update cell),
# while gf_to_distribute_2 is keyed by plant and generator only. Align the indexes explicitly
# instead of relying on how update() reindexes across MultiIndexes of different depth,
# then restore gen_file's original index.
redistribution_index_names = list(gen_file.index.names)
gen_file = gen_file.reset_index().set_index(['plant_id_eia', 'generator_id'])
gen_file.update(gf_to_distribute_2)
gen_file = gen_file.reset_index().set_index(redistribution_index_names)
gen_file

### 9. Distribute Generation, Part 3
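Some generators appear to have reported all of their generation for the year in December. This step finds those generators (1g31) and estimates their ozone-season net generation by distributing the EIA-923 Generation and Fuel ozone-season total according to the nameplate capacity ratio (1g32, 1g33). TODO: confirm that this matches the intent of the original queries.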

In [None]:
# 1g31: find generators whose December generation equals their annual generation
december_generation = (
    eia_923_generator[eia_923_generator['report_date'].dt.month == 12]
    .set_index(['plant_id_eia', 'generator_id'])
)
gens_where_dec_eq_annual = eia_923_generator_NETGEN.merge(
    december_generation, how='left', left_index=True, right_index=True)

# NETGEN holds the annual total; net_generation_mwh holds the December value.
# Keep rows where the two match and the annual total is non-zero.
annual_total = gens_where_dec_eq_annual['NETGEN']
december_value = gens_where_dec_eq_annual['net_generation_mwh']
all_generation_in_december = (annual_total == december_value) & (annual_total != 0)

gens_where_dec_eq_annual = (
    gens_where_dec_eq_annual[all_generation_in_december]
    .rename(columns={'NETGEN': 'net_generation_annual', 'net_generation_mwh': 'net_generation_december'})
    [['net_generation_annual', 'net_generation_december']]
)
gens_where_dec_eq_annual

In [None]:
# 1g32: compute replacement ozone-season generation for the December-only generators
# attach each generator's prime mover and capacity ratio
capacity_share_by_generator = np_capacity_ratio.set_index(['plant_id_eia', 'generator_id'])[['prime_mover_code', 'ratio']]
gen_to_distribute_3 = gens_where_dec_eq_annual.merge(
    capacity_share_by_generator, how='inner', left_index=True, right_index=True)
gen_to_distribute_3 = gen_to_distribute_3.reset_index().set_index(
    ['plant_id_eia', 'generator_id', 'prime_mover_code'])

# bring in the EIA-923 Generation and Fuel totals for the matching plant / prime mover
gf_totals_by_pm = netgen_by_PM_from_gf_eia923.set_index(['plant_id_eia', 'prime_mover_code'])
gen_to_distribute_3 = gen_to_distribute_3.merge(
    gf_totals_by_pm, how='inner', left_index=True, right_index=True)

# capacity share of the plant / prime mover ozone-season total
gen_to_distribute_3['NETGENOZ_update'] = gen_to_distribute_3['ratio'] * gen_to_distribute_3['NETGENOZ']

gen_to_distribute_3

In [None]:
# inspect the generators whose ozone-season net generation is exactly zero
gen_file.loc[gen_file['NETGENOZ'].eq(0)]

In [None]:
# Update gen file with new netgenoz data (1g33)
# Keep only the replacement values and remove index level 1.
# NOTE(review): the previous cell indexes gen_to_distribute_3 by
# (plant_id_eia, generator_id, prime_mover_code), so level 1 is presumably generator_id -
# confirm that dropping the generator key (rather than prime_mover_code) is intended.
gen_to_distribute_3 = gen_to_distribute_3[['NETGENOZ_update']].droplevel(1)

# Attach the replacement values to gen_file as a NETGENOZ_update column, matching on the index.
# NOTE(review): this only adds a column; NETGENOZ itself is not modified in this cell.
gen_file = gen_file.merge(gen_to_distribute_3, how='left', left_index=True, right_index=True)

# inspect the generators whose ozone-season net generation is exactly zero
gen_file[gen_file['NETGENOZ'] == 0]

In [None]:
# inspect the generators whose ozone-season net generation is exactly zero
gen_file.loc[gen_file['NETGENOZ'].eq(0)]

In [None]:
# Rename the replacement column to NETGENOZ and remove index level 1, then display the result.
# NOTE(review): this looks like an exploratory cell. droplevel(1) is also applied in the 1g33
# cell above, so if that cell has already run, a second index level is removed here -
# confirm the intended index before relying on this result.
gen_to_distribute_3 = gen_to_distribute_3[['NETGENOZ_update']].rename(columns={'NETGENOZ_update':'NETGENOZ'}).droplevel(1)
gen_to_distribute_3