# EIA-923 Cleaning/Export Tool

This notebook can be used to export cleaned EIA-923 generation and emissions data.

The emissions data includes unadjusted, adjusted, and "for electricity":
* CO2
* CO2-eq
* NOx
* SO2

In [1]:
import sys
sys.path.append('../../hourly-egrid/')

%reload_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np

import src.data_cleaning as dc
import src.load_data as ld

import src.gross_to_net_generation as gtn



In [2]:
def path_to_top(rel=''):
    """Make a filename relative to the top-level project directory."""
    return os.path.join('..', rel)

def path_to_data(rel=''):
    """Helper function to make a filename relative to the `data` folder."""
    return os.path.join(path_to_top('data'), rel)

def path_to_outputs(rel=''):
    """Helper function to make a filename relative to the `output` folder."""
    return os.path.join(path_to_data('outputs'), rel)

In [11]:
# STEP 1: Generate the subplant crosswalk needed to clean EIA-923 data.
start_year = 2001
end_year = 2020

# This function will internally output the subplant_crosswalk.csv file. Only run this once! It takes a while.
cems_monthly, gen_fuel_allocated = gtn.generate_subplant_ids(
    start_year, end_year, cems_monthly, gen_fuel_allocated)

NameError: name 'gtn' is not defined

In [18]:
subplant_crosswalk = pd.read_csv(path_to_outputs('subplant_crosswalk.csv'))
subplant_crosswalk

Unnamed: 0,plant_id_epa,unitid,plant_id_eia,generator_id,subplant_id,current_planned_operating_date,retirement_date
0,3,1,3,1,0,,
1,3,2,3,2,1,,
2,3,3,3,3,2,,2015-08-01
3,3,4,3,4,3,,
4,3,5,3,5,4,,
...,...,...,...,...,...,...,...
5955,57185,U003,57185,U003,2,2020-04-01,
5956,60927,1A,60927,1A,0,2020-03-01,
5957,60927,1B,60927,1B,1,2020-03-01,
5958,61028,CT1,61028,CTG1,0,2020-05-01,


In [30]:
# Test out a single year of calculations.
df_923_emissions, primary_fuel_table = dc.clean_eia923(2001)

Removing 16 plants that are not grid-connected
Removing 0 plants located in the following states: ['PR']


In [31]:
# df_923_emissions

# print('# events:', len(df_923_emissions))
# print('# events with net generation:', len(df_923_emissions[pd.notna(df_923_emissions['net_generation_mwh'])]))
# print('# events w/ CO2 emissions:', len(df_923_emissions[pd.notna(df_923_emissions['co2_mass_lb'])]))

primary_fuel_table

Unnamed: 0,plant_id_eia,generator_id,energy_source_code,plant_primary_fuel
0,2,1,WAT,WAT
1,3,1,BIT,BIT
2,3,2,BIT,BIT
3,3,3,BIT,BIT
4,3,4,BIT,BIT
...,...,...,...,...
16095,55247,GT5,NG,NG
16096,55247,GT6,NG,NG
16097,55499,CPP1,NG,NG
16098,55512,CPP6,NG,NG


In [35]:
def export_cleaned_eia_923(year):
    """
    Helper function for exporting a year of cleaned EIA-923 data to CSV.
    """
    print(f'[INFO] Cleaning EIA-923 data from {year}.')
    df_923_data, primary_fuel_table = dc.clean_eia923(year)
    print('[INFO] Done cleaning.')
    
    # Calculate CO2-eq emissions using the same GWPs as eGRID.
    if year < 2018:        
        df_923_data = dc.calculate_co2_eq_mass(
            df_923_data, ipcc_version='AR5', gwp_horizon=100, ar5_climate_carbon_feedback=True)
    else:
        df_923_data = dc.calculate_co2_eq_mass(
            df_923_data, ipcc_version='AR4', gwp_horizon=100, ar5_climate_carbon_feedback=False)

    # Export the data to output.
    df_923_data.to_csv(path_to_outputs(f'923/cleaned_eia_923_generation_{year}.csv'))    
    primary_fuel_table.to_csv(path_to_outputs(f'923/cleaned_eia_923_primary_fuel_{year}.csv'))
    print('[INFO] Wrote to CSV.')

In [39]:
# STEP 2: Export EIA-923 data across all available years.
# NOTE: This is slow! Each year takes a few minutes to run.
os.makedirs(path_to_outputs('923'), exist_ok=True)

years_to_export = range(2003, 2021)
for year in years_to_export:
    export_cleaned_eia_923(year)

[INFO] Cleaning EIA-923 data from 2003.
Removing 18 plants that are not grid-connected
Removing 0 plants located in the following states: ['PR']
[INFO] Done cleaning.
[INFO] Wrote to CSV.
[INFO] Cleaning EIA-923 data from 2004.
Removing 17 plants that are not grid-connected
Removing 0 plants located in the following states: ['PR']
[INFO] Done cleaning.
[INFO] Wrote to CSV.
[INFO] Cleaning EIA-923 data from 2005.
Removing 17 plants that are not grid-connected
Removing 0 plants located in the following states: ['PR']
[INFO] Done cleaning.
[INFO] Wrote to CSV.
[INFO] Cleaning EIA-923 data from 2006.
Removing 15 plants that are not grid-connected
Removing 0 plants located in the following states: ['PR']
[INFO] Done cleaning.
[INFO] Wrote to CSV.
[INFO] Cleaning EIA-923 data from 2007.
Removing 14 plants that are not grid-connected
Removing 0 plants located in the following states: ['PR']
[INFO] Done cleaning.
[INFO] Wrote to CSV.
[INFO] Cleaning EIA-923 data from 2008.
Removing 14 plants t

In [44]:
def check_923_primary_fuel_for_plant_and_generator(year):
    """
    Check that all EIA plant IDs found in the 923 data are also in the primary fuel table.
    """
    df_data = pd.read_csv(path_to_outputs(f'923/cleaned_eia_923_generation_{year}.csv'))
    df_fuel = pd.read_csv(path_to_outputs(f'923/cleaned_eia_923_primary_fuel_{year}.csv'))

    # Only check one month of the year for speed. Plants/generators that are missing in one
    # month seem to be missing for all months.
    df_data = df_data[df_data['report_date'] == f'{year}-06-01']

    for i in range(len(df_data)):
        row = df_data.iloc[i]
        plant_id_eia = row['plant_id_eia']
        generator_id = row['generator_id']
        
        fuel_this_plant_and_gen = df_fuel[(df_fuel['plant_id_eia'] == plant_id_eia) & \
                                          (df_fuel['generator_id'] == generator_id)]
        if len(fuel_this_plant_and_gen) == 0:
            print(f'[WARNING] Missing primary fuel for:\n  plant_id_eia={plant_id_eia}\n  generator_id={generator_id}')

In [45]:
# STEP 3: Check that all EIA plant IDs found in the 923 data are also in the primary fuel table.
years_to_export = range(2001, 2021)
for year in years_to_export:
    print(f'[INFO] --- Checking EIA-923 data for {year}')
    check_923_primary_fuel_for_plant_and_generator(year)

[INFO] --- Checking EIA-923 data for 2001
                      plant_id_eia=6058
                      generator_id=2
[INFO] --- Checking EIA-923 data for 2002
[INFO] --- Checking EIA-923 data for 2003
                      plant_id_eia=54224
                      generator_id=GEN6
[INFO] --- Checking EIA-923 data for 2004
                      plant_id_eia=54224
                      generator_id=GEN6
[INFO] --- Checking EIA-923 data for 2005
                      plant_id_eia=6190
                      generator_id=3
                      plant_id_eia=7790
                      generator_id=2
                      plant_id_eia=10612
                      generator_id=GEN2
                      plant_id_eia=54224
                      generator_id=GEN6
                      plant_id_eia=54690
                      generator_id=6000
                      plant_id_eia=55821
                      generator_id=BCT
                      plant_id_eia=55821
                      generator_i