In [137]:
# import the necessary packages
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

import pandas as pd
import sqlalchemy as sa
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import missingno

import src.load_data as load_data
import src.data_cleaning as data_cleaning

In [135]:
# Year of data to analyze; the SQL queries and the eGRID static-table path in later cells are built from this value
year = 2020

# SQLAlchemy URL (relative path) of the PUDL SQLite database, and the engine that later cells pass to pd.read_sql
pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

# Load generator data from 923

In [7]:
# load the generation fuel data for the year
# (report_date is compared as text here; the upper bound is the first day of December)
generation_fuel_eia923 = pd.read_sql(f"SELECT * FROM generation_fuel_eia923 WHERE report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'", pudl_engine)

# create a sorted list of all plant IDs that report to EIA 923
# Reading the unique IDs directly avoids summing every column of the table just to
# recover the group keys (which also trips over non-numeric columns on recent pandas).
# Missing IDs are dropped, matching how groupby ignores missing keys.
plants_in_923 = sorted(generation_fuel_eia923['plant_id_eia'].dropna().unique().tolist())

# Create a complete list of plant/generator IDs

In [27]:
# load the eia 860 generator data, parsing the date columns so the .dt accessor can be used below
generators_eia860 = pd.read_sql(f"SELECT * FROM generators_eia860 WHERE report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'", pudl_engine, parse_dates=['report_date','retirement_date'])

# only keep certain columns
columns_to_keep = ['plant_id_eia', 'generator_id','report_date', 'capacity_mw','deliver_power_transgrid','distributed_generation','technology_description',
                   'energy_source_code_1', 'energy_source_code_2', 'energy_source_code_3','energy_source_code_4', 'energy_source_code_5', 'energy_source_code_6',
                   'multiple_fuels',  'cofire_fuels', 'switch_oil_gas','minimum_load_mw',
                   'current_planned_operating_date','retirement_date','operational_status_code', 'operational_status','data_source']

# get the list of plant_id_eia for non-grid connected plants from the static table
ngc_plants = list(pd.read_csv(f'../data/egrid/egrid{year}_static_tables/table_4-2_plants_not_connected_to_grid.csv')['Plant ID'])

# generators that were active, or that retired during the current year
is_existing = generators_eia860['operational_status'] == 'existing'
retired_this_year = (generators_eia860['operational_status'] == 'retired') & (generators_eia860['retirement_date'].dt.year == year)

# generators at plants that are reported in eia 923
reports_to_923 = generators_eia860['plant_id_eia'].isin(plants_in_923)

# generators at plants that are connected to the grid
grid_connected = ~generators_eia860['plant_id_eia'].isin(ngc_plants)

# apply all filters at once and take an explicit copy, so that adding columns to the
# result does not write to a view of the raw query result (SettingWithCopyWarning)
generators_eia860 = generators_eia860.loc[(is_existing | retired_this_year) & reports_to_923 & grid_connected, columns_to_keep].copy()

# create a unique ID for each plant-generator
generators_eia860['unique_gen_id'] = generators_eia860['plant_id_eia'].astype(str) + "_" + generators_eia860['generator_id'].astype(str)

generators_eia860

Unnamed: 0,plant_id_eia,generator_id,report_date,capacity_mw,deliver_power_transgrid,distributed_generation,technology_description,energy_source_code_1,energy_source_code_2,energy_source_code_3,...,multiple_fuels,cofire_fuels,switch_oil_gas,minimum_load_mw,current_planned_operating_date,retirement_date,operational_status_code,operational_status,data_source,unique_gen_id
213,64756,EQX05,2020-01-01,1.1,,,Other Natural Gas,NG,,,...,,,,1.1,,NaT,OP,existing,eia860,64756_EQX05
215,64753,FDX10,2020-01-01,1.0,,,Other Natural Gas,NG,,,...,,,,1.0,,NaT,OP,existing,eia860,64753_FDX10
220,64749,CR18B,2020-01-01,4.0,,,Other Natural Gas,NG,OG,,...,1.0,,,0.1,,NaT,OP,existing,eia860,64749_CR18B
221,64748,CR18A,2020-01-01,3.5,,,Other Natural Gas,NG,OG,,...,,,,0.1,,NaT,OP,existing,eia860,64748_CR18A
223,64746,GSCS,2020-01-01,1.0,,,Solar Photovoltaic,SUN,,,...,0.0,,,,,NaT,OP,existing,eia860,64746_GSCS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30048,1,WT1,2020-01-01,0.5,,,Onshore Wind Turbine,WND,,,...,,,,0.1,,NaT,OA,existing,eia860,1_WT1
30049,1,5,2020-01-01,0.7,,,Petroleum Liquids,DFO,,,...,0.0,,,0.3,,NaT,OA,existing,eia860,1_5
30050,1,3,2020-01-01,0.5,,,Petroleum Liquids,DFO,,,...,0.0,,,0.3,,NaT,OP,existing,eia860,1_3
30051,1,2,2020-01-01,0.9,,,Petroleum Liquids,DFO,,,...,0.0,,,0.3,,NaT,OP,existing,eia860,1_2


In [151]:
# spot-check: show every generator record for plant_id_eia 10
generators_eia860.loc[generators_eia860['plant_id_eia'].eq(10)]

Unnamed: 0,plant_id_eia,generator_id,report_date,capacity_mw,deliver_power_transgrid,distributed_generation,technology_description,energy_source_code_1,energy_source_code_2,energy_source_code_3,...,switch_oil_gas,minimum_load_mw,current_planned_operating_date,retirement_date,operational_status_code,operational_status,data_source,unique_gen_id,fuel_category,cems_reporting_category
30011,10,GT9,2020-01-01,80.0,,,Natural Gas Fired Combustion Turbine,NG,DFO,,...,1.0,66.0,,NaT,OP,existing,eia860,10_GT9,fossil,DNR
30012,10,GT8,2020-01-01,80.0,,,Natural Gas Fired Combustion Turbine,NG,DFO,,...,1.0,66.0,,NaT,OP,existing,eia860,10_GT8,fossil,DNR
30013,10,GT7,2020-01-01,80.0,,,Natural Gas Fired Combustion Turbine,NG,DFO,,...,1.0,66.0,,NaT,OP,existing,eia860,10_GT7,fossil,DNR
30014,10,GT6,2020-01-01,80.0,,,Natural Gas Fired Combustion Turbine,NG,DFO,,...,1.0,66.0,,NaT,OP,existing,eia860,10_GT6,fossil,DNR
30015,10,GT5,2020-01-01,80.0,,,Natural Gas Fired Combustion Turbine,NG,DFO,,...,1.0,66.0,,NaT,OP,existing,eia860,10_GT5,fossil,DNR
30016,10,GT4,2020-01-01,80.0,,,Natural Gas Fired Combustion Turbine,NG,DFO,,...,1.0,66.0,,NaT,OP,existing,eia860,10_GT4,fossil,DNR
30017,10,GT3,2020-01-01,80.0,,,Natural Gas Fired Combustion Turbine,NG,DFO,,...,1.0,66.0,,NaT,OP,existing,eia860,10_GT3,fossil,DNR
30018,10,GT2,2020-01-01,80.0,,,Natural Gas Fired Combustion Turbine,NG,DFO,,...,1.0,66.0,,NaT,OP,existing,eia860,10_GT2,fossil,DNR
30019,10,GT10,2020-01-01,80.0,,,Natural Gas Fired Combustion Turbine,NG,DFO,,...,1.0,66.0,,NaT,OP,existing,eia860,10_GT10,fossil,DNR
30020,10,2,2020-01-01,269.2,,,Natural Gas Steam Turbine,NG,,,...,,110.0,,NaT,OP,existing,eia860,10_2,fossil,DNR


In [28]:
# categorize generators by broad fuel categories (clean, geothermal, biofuel, fossil)
# using the EIA energy source code of each generator's first listed fuel (energy_source_code_1)
clean_fuels = ['SUN','MWH','WND', 'WAT','WH','PUR','NUC']
# BFG (blast furnace gas) is a by-product of burning coal/coke, so it is grouped with the fossil fuels
fossil_fuels = ['NG', 'DFO','OG','BIT','OTH','PC','SUB', 'LIG','KER', 'RC', 'WO','RFO', 'WC', 'SGC', 'SGP', 'PG', 'JF', 'BFG']
# WDS (wood/wood waste solids) is biomass, like WDL (wood waste liquids)
bio_fuels = ['OBG','LFG','AB','OBL', 'BLQ','OBS','MSW','WDS','WDL']
geothermal_fuels = ['GEO']

fuel_codes_by_category = {
    'clean': clean_fuels,
    'fossil': fossil_fuels,
    'biofuel': bio_fuels,
    'geothermal': geothermal_fuels,
}

# create a column for generator fuel category
# generators whose fuel code is not in any list above keep a fuel_category of None
generators_eia860['fuel_category'] = None
for category, fuel_codes in fuel_codes_by_category.items():
    generators_eia860.loc[generators_eia860['energy_source_code_1'].isin(fuel_codes), 'fuel_category'] = category

In [29]:
# every generator starts out flagged as DNR ("does not report" to CEMS) in a new reporting status column
generators_eia860.loc[:, 'cems_reporting_category'] = 'DNR'

In [30]:
# duplicate the entries for each month
# Each generator record is repeated 12 times, with report_date shifted forward by
# 0 to 11 months. The copies are built first and concatenated once, rather than
# re-copying a growing dataframe on every pass of a loop.
# NOTE(review): this yields one row per calendar month only if every report_date
# is the first day of January — confirm for the year being loaded.
generators_months = pd.concat(
    [
        generators_eia860.assign(report_date=generators_eia860['report_date'] + pd.DateOffset(months=months_ahead))
        for months_ahead in range(12)
    ],
    axis=0,
)

generators_months


Unnamed: 0,plant_id_eia,generator_id,report_date,capacity_mw,deliver_power_transgrid,distributed_generation,technology_description,energy_source_code_1,energy_source_code_2,energy_source_code_3,...,switch_oil_gas,minimum_load_mw,current_planned_operating_date,retirement_date,operational_status_code,operational_status,data_source,unique_gen_id,fuel_category,cems_reporting_category
213,64756,EQX05,2020-01-01,1.1,,,Other Natural Gas,NG,,,...,,1.1,,NaT,OP,existing,eia860,64756_EQX05,fossil,DNR
215,64753,FDX10,2020-01-01,1.0,,,Other Natural Gas,NG,,,...,,1.0,,NaT,OP,existing,eia860,64753_FDX10,fossil,DNR
220,64749,CR18B,2020-01-01,4.0,,,Other Natural Gas,NG,OG,,...,,0.1,,NaT,OP,existing,eia860,64749_CR18B,fossil,DNR
221,64748,CR18A,2020-01-01,3.5,,,Other Natural Gas,NG,OG,,...,,0.1,,NaT,OP,existing,eia860,64748_CR18A,fossil,DNR
223,64746,GSCS,2020-01-01,1.0,,,Solar Photovoltaic,SUN,,,...,,,,NaT,OP,existing,eia860,64746_GSCS,clean,DNR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30048,1,WT1,2020-12-01,0.5,,,Onshore Wind Turbine,WND,,,...,,0.1,,NaT,OA,existing,eia860,1_WT1,clean,DNR
30049,1,5,2020-12-01,0.7,,,Petroleum Liquids,DFO,,,...,,0.3,,NaT,OA,existing,eia860,1_5,fossil,DNR
30050,1,3,2020-12-01,0.5,,,Petroleum Liquids,DFO,,,...,,0.3,,NaT,OP,existing,eia860,1_3,fossil,DNR
30051,1,2,2020-12-01,0.9,,,Petroleum Liquids,DFO,,,...,,0.3,,NaT,OP,existing,eia860,1_2,fossil,DNR


In [88]:
# spot-check: retired generators whose retirement month is anything other than December
is_retired = generators_eia860['operational_status'].eq('retired')
not_retired_in_december = generators_eia860['retirement_date'].dt.month.ne(12)
generators_eia860.loc[is_retired & not_retired_in_december]

Unnamed: 0,plant_id_eia,generator_id,report_date,capacity_mw,deliver_power_transgrid,distributed_generation,technology_description,energy_source_code_1,energy_source_code_2,energy_source_code_3,...,switch_oil_gas,minimum_load_mw,current_planned_operating_date,retirement_date,operational_status_code,operational_status,data_source,unique_gen_id,fuel_category,cems_reporting_category
1459,63673,EU-02,2020-01-01,1.3,,,Natural Gas Fired Combustion Turbine,NG,,,...,,0.7,,2020-03-01,RE,retired,eia860,63673_EU-02,fossil,DNR
2927,62447,2,2020-01-01,0.8,,,Petroleum Liquids,DFO,,,...,,,,2020-02-01,RE,retired,eia860,62447_2,fossil,DNR
2928,62447,1,2020-01-01,1.1,,,Petroleum Liquids,DFO,,,...,,,,2020-02-01,RE,retired,eia860,62447_1,fossil,DNR
3475,62019,GEN 2,2020-01-01,0.5,,,Other Waste Biomass,OBG,,,...,0.0,0.1,,2020-01-01,RE,retired,eia860,62019_GEN 2,biofuel,DNR
3476,62019,GEN 1,2020-01-01,0.5,,,Other Waste Biomass,OBG,,,...,0.0,0.1,,2020-01-01,RE,retired,eia860,62019_GEN 1,biofuel,DNR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29687,127,1,2020-01-01,720.0,,,Conventional Steam Coal,SUB,,,...,,215.0,,2020-09-01,RE,retired,eia860,127_1,fossil,DNR
29826,87,1,2020-01-01,257.0,,,Conventional Steam Coal,SUB,NG,,...,0.0,90.0,,2020-11-01,RE,retired,eia860,87_1,fossil,DNR
29900,56,3,2020-01-01,236.0,,,Conventional Steam Coal,BIT,,,...,,140.0,,2020-10-01,RE,retired,eia860,56_3,fossil,DNR
29901,56,2,2020-01-01,236.0,,,Conventional Steam Coal,BIT,,,...,,140.0,,2020-10-01,RE,retired,eia860,56_2,fossil,DNR


# Classify CEMS units based on how they report to CAMD
There are three broad categories of plants based on their CAMD reporting status:
1. Units that report to CAMD year-round (for these plants, emissions data is used directly from CEMS)
2. Units that only report to CAMD during the ozone season (May-Sept) (for these units, non-ozone season data is taken from EIA 923)
3. Units that do not report to CAMD (generally fossil units < 25MW and non-fossil generators)

There are also certain plants that report to CAMD but do not produce electricity for the grid, and need to be removed from the CEMS data:
- Non grid connected plants
- Steam-only plants

In [138]:
# load the CEMS data for the year and immediately remove non-grid connected plants
cems = data_cleaning.remove_non_grid_connected_plants(load_data.load_cems_data(year), year)

# apply the remaining cleaning steps in order; each takes the CEMS data and returns the updated data
for cleaning_step in (
    data_cleaning.remove_heating_only_plants,       # remove plants that only report steam generation and no electrical generation
    data_cleaning.add_report_date,                  # add a report date
    data_cleaning.determine_cems_reporting_status,  # identify cems reporting status
):
    cems = cleaning_step(cems)

In [None]:
# use the fuel type and heat input to fill in hourly emissions data that is missing
cems = data_cleaning.fill_cems_missing_co2(cems, year)

# show any rows that still have no CO2 value after filling
still_missing_co2 = cems['co2_mass_tons'].isna()
cems.loc[still_missing_co2]

We have now identified all plants that report the full year to CEMS, and all plants that report a partial year.  

Next steps:
1. For plants that have a full year of data, but are missing CO2 data, calculate the missing data based on the reported hourly heat input
2. For plants that report a partial year, calculate the monthly heat input values from EIA-923
    - load the eia 923 data
    - identify the generators/months for which we have data in CEMS, and remove these
    - perform calculations as needed

# Old Stuff

In [None]:
# columns that identify a CEMS unit, and the hourly values that get summed
unit_keys = ['plant_id_eia','unitid']
cems_value_columns = ['operating_time_hours','gross_load_mw','steam_load_1000_lbs','co2_mass_tons','heat_content_mmbtu']

# sum CEMS data by month for each unit
# The plant and unit IDs are used as the grouping keys (rather than a combined ID)
# because the zero-operation check and the merge below both need these columns.
cems_monthly = cems.groupby(unit_keys + ['report_date'])[cems_value_columns].sum().reset_index()

# replace 0 reported CO2 values with missing values if the unit reported non-zero operating hours
# (a unit that operated but shows exactly zero CO2 is treated as missing data, not a true zero)
cems_monthly.loc[(cems_monthly['co2_mass_tons'] == 0) & (cems_monthly['operating_time_hours'] > 0), 'co2_mass_tons'] = np.nan

print(f"There are {len(cems_monthly[unit_keys].drop_duplicates())} unique units that reported to CEMS in {year}")

# are there any units that report zero operation all year
# only the summed value columns are totalled, so that the report date cannot affect the check
annual_totals = cems_monthly.groupby(unit_keys)[cems_value_columns].sum()
zero_operation_rows = annual_totals[annual_totals.sum(axis=1) == 0].reset_index()

# create a unique ID for each plant-unit with zero operation
units_with_zero_operation = list((zero_operation_rows['plant_id_eia'].astype(str) + "_" + zero_operation_rows['unitid'].astype(str)).unique())

# create a unique id for each unit in cems
# (assign returns a new dataframe, so this never writes to a view of an earlier filtered result)
cems = cems.assign(unique_id_cems=cems['plant_id_eia'].astype(str) + "_" + cems['unitid'].astype(str))

# remove the zero-operation units from the cems data
cems = cems[~cems['unique_id_cems'].isin(units_with_zero_operation)]

# merge the monthly CEMS CO2 data into the plant list
# NOTE(review): this joins the CEMS unitid directly to the EIA generator_id — confirm that
# these two identifiers really are equivalent, or map between them before merging.
generators_months = generators_months.merge(cems_monthly[['plant_id_eia','unitid','report_date','co2_mass_tons']], how='left', left_on=['plant_id_eia','generator_id','report_date'], right_on=['plant_id_eia','unitid','report_date'])