In [66]:
# import the necessary packages
%reload_ext autoreload
%autoreload 2

# import packages
import os
import requests
import tarfile
import sqlalchemy as sa
from pathlib import Path
import pandas as pd

import data_cleaning
import gross_to_net_generation

# 1. Download data

 - Downloads the pre-cleaned PUDL versions of EIA-923, EIA-860, and EPA CEMS data  
 - Downloads EPA eGRID data  
 - Downloads EIA-930 data  
 - Downloads the EPA Power Sector Data Crosswalk

TODO
- [ ] The code for downloading the files could probably be made into functions
- [ ] Investigate other packages besides `requests` that would download these files faster

In [23]:
############### PUDL data ###########################
# Downloads the PUDL release archive from Zenodo, extracts it into ../data/,
# and renames the versioned directory to ../data/pudl so downstream paths do
# not depend on the release name.

zenodo_url = 'https://zenodo.org/record/5701406/files/pudl-v0.5.0-2021-11-14.tgz'
pudl_version = zenodo_url.split('/')[-1].replace('.tgz','')

# if the pudl data already exists, do not re-download
if os.path.exists('../data/pudl'):
    print('PUDL data already downloaded')
else:
    os.makedirs('../data', exist_ok=True)

    # stream the archive to disk; the context manager releases the connection when done
    with requests.get(zenodo_url, params={"download":"1"}, stream=True) as r:
        # stop here on an HTTP error instead of saving an error page as pudl.tgz
        r.raise_for_status()
        # specify parameters for progress bar (0 means the server did not report a size)
        total_size_in_bytes = int(r.headers.get('content-length', 0))
        block_size = 1024 * 1024 * 10 # 10 MB
        downloaded = 0
        with open("../data/pudl.tgz", 'wb') as fd:
            for chunk in r.iter_content(chunk_size=block_size):
                fd.write(chunk)
                # count the bytes actually received (the last chunk is usually smaller than block_size)
                downloaded += len(chunk)
                if total_size_in_bytes > 0:
                    print(f'Downloading PUDL. Progress: {(round(downloaded/total_size_in_bytes*100,2))}%   \r', end='')
                else:
                    # total size unknown: report bytes received rather than dividing by zero
                    print(f'Downloading PUDL. Downloaded: {downloaded} bytes   \r', end='')

    # extract the tgz file
    # NOTE: extractall trusts the member paths stored in the archive; only use it on archives from a trusted source
    print('Extracting PUDL data...')
    with tarfile.open("../data/pudl.tgz") as tar:
        tar.extractall('../data/')

    # rename the extracted directory to pudl so that we don't have to update this for future versions
    # (the destination must include ../data/, otherwise the folder is moved into the current working directory)
    os.rename(f'../data/{pudl_version}', '../data/pudl')

    # delete the downloaded tgz file
    os.remove("../data/pudl.tgz")



PUDL data already downloaded


In [16]:
################# eGRID data #########################
# Downloads the eGRID workbooks listed below into ../data/egrid/, skipping any
# file that is already present.

# the 2019 and 2020 data appear to be hosted on different urls
egrid_files_to_download = ['https://www.epa.gov/sites/default/files/2021-02/egrid2019_data.xlsx', 
                           'https://www.epa.gov/system/files/documents/2022-01/egrid2020_data.xlsx']


# if there is not yet a directory for egrid, make it
os.makedirs('../data/egrid', exist_ok=True)

# download the egrid files
for url in egrid_files_to_download:
    filename = url.split("/")[-1]
    # if the file already exists, do not re-download it
    if os.path.exists(f'../data/egrid/{filename}'):
        print(f'{filename} already downloaded')
    else:
        with requests.get(url, stream=True) as r:
            # stop on an HTTP error so an error page is never saved as the data file
            r.raise_for_status()
            # write to a temporary name first so an interrupted download is not
            # mistaken for a complete file by the existence check above
            with open(f'../data/egrid/{filename}.part', 'wb') as fd:
                for chunk in r.iter_content(chunk_size=1024):
                    fd.write(chunk)
        os.replace(f'../data/egrid/{filename}.part', f'../data/egrid/{filename}')

In [26]:
############# EIA-930 data #####################
# Downloads the six-month EIA-930 balance files for each year listed below
# into ../data/eia930/, skipping any file that is already present.

years_to_download = [2019,2020]

# if there is not yet a directory for EIA-930, make it
os.makedirs('../data/eia930', exist_ok=True)

# download the EIA-930 files
for year in years_to_download:
    for period in ['Jan_Jun','Jul_Dec']:
        if os.path.exists(f'../data/eia930/EIA930_BALANCE_{year}_{period}.csv'):
            print(f'{year}_{period} data already downloaded')
        else:
            print(f'downloading {year}_{period} data')
            with requests.get(f"https://www.eia.gov/electricity/gridmonitor/sixMonthFiles/EIA930_BALANCE_{year}_{period}.csv", stream=True) as r:
                # stop on an HTTP error so an error page is never saved as the data file
                r.raise_for_status()
                # write to a temporary name first so an interrupted download is not
                # mistaken for a complete file by the existence check above
                with open(f'../data/eia930/EIA930_BALANCE_{year}_{period}.csv.part', 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        fd.write(chunk)
            os.replace(f'../data/eia930/EIA930_BALANCE_{year}_{period}.csv.part', f'../data/eia930/EIA930_BALANCE_{year}_{period}.csv')


2019_Jan_Jun data already downloaded
2019_Jul_Dec data already downloaded
2020_Jan_Jun data already downloaded
2020_Jul_Dec data already downloaded


In [34]:
########## Power Sector Data Crosswalk #############
# NOTE: Check for new releases at https://github.com/USEPA/camd-eia-crosswalk
# Downloads the crosswalk csv into ../data/epa/ unless it is already present.

psdc_url = 'https://github.com/USEPA/camd-eia-crosswalk/releases/download/v0.2.1/epa_eia_crosswalk.csv'

# if there is not yet a directory for the epa data, make it
os.makedirs('../data/epa', exist_ok=True)

filename = psdc_url.split("/")[-1]
# if the file already exists, do not re-download it
if os.path.exists(f'../data/epa/{filename}'):
    print(f'{filename} already downloaded')
else:
    with requests.get(psdc_url, stream=True) as r:
        # stop on an HTTP error so an error page is never saved as the data file
        r.raise_for_status()
        # write to a temporary name first so an interrupted download is not
        # mistaken for a complete file by the existence check above
        with open(f'../data/epa/{filename}.part', 'wb') as fd:
            for chunk in r.iter_content(chunk_size=1024):
                fd.write(chunk)
    os.replace(f'../data/epa/{filename}.part', f'../data/epa/{filename}')
    

epa_eia_crosswalk.csv already downloaded


# 2. Load data

In [25]:
# analysis year; selects which partition of the CEMS parquet dataset is read
year = 2019

# directory holding the CEMS parquet files for that year
cems_path = f'../data/pudl/pudl_data/parquet/epacems/year={year}' 

# subset of CEMS columns to read
cems_columns = [
    'plant_id_eia',
    'unitid',
    'operating_datetime_utc',
    'operating_time_hours',
    'gross_load_mw',
    'steam_load_1000_lbs',
    'co2_mass_tons',
    'co2_mass_measurement_code',
    'heat_content_mmbtu',
    'facility_id',
    'unit_id_epa',
]

# relative path to the PUDL sqlite database, and the sqlalchemy engine used to query it
pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

In [60]:
# Read the selected CEMS columns and prepare them in a single chain:
#  - keep only observations when the plant was operating
#  - rename plant_id_eia to plant_id_epa (PUDL simply renames the ORISPL_CODE column
#    from the raw CEMS data as 'plant_id_eia' without actually crosswalking to the EIA id)
#  - add gross generation = gross_load_mw * operating_time_hours
cems = (
    pd.read_parquet(cems_path, columns=cems_columns)
    .query("operating_time_hours > 0")
    .rename(columns={'plant_id_eia': 'plant_id_epa'})
    .assign(gross_generation_mwh=lambda df: df['gross_load_mw'] * df['operating_time_hours'])
)

cems.head(5)

Unnamed: 0,plant_id_epa,unitid,operating_datetime_utc,operating_time_hours,gross_load_mw,steam_load_1000_lbs,co2_mass_tons,co2_mass_measurement_code,heat_content_mmbtu,facility_id,unit_id_epa,gross_generation_mwh
1488,3,4,2019-01-01 06:00:00+00:00,1.0,150.0,,168.699997,Measured,1644.5,1,4,150.0
1489,3,4,2019-01-01 07:00:00+00:00,1.0,150.0,,168.199997,Measured,1639.5,1,4,150.0
1490,3,4,2019-01-01 08:00:00+00:00,1.0,150.0,,168.800003,Measured,1644.900024,1,4,150.0
1491,3,4,2019-01-01 09:00:00+00:00,1.0,150.0,,167.5,Measured,1632.300049,1,4,150.0
1492,3,4,2019-01-01 10:00:00+00:00,1.0,150.0,,168.100006,Measured,1638.300049,1,4,150.0


In [None]:
# read the whole plants_entity_eia table from the PUDL sqlite database
plants_entity_eia = pd.read_sql(sql="plants_entity_eia", con=pudl_engine)

In [75]:
# read the EIA-923 boiler fuel table and derive total heat input for each row
# as fuel_consumed_units * fuel_mmbtu_per_unit
boiler_fuel_eia923 = (
    pd.read_sql('boiler_fuel_eia923', pudl_engine)
    .assign(fuel_consumed_mmbtu=lambda df: df['fuel_consumed_units'] * df['fuel_mmbtu_per_unit'])
)

In [76]:
# display the loaded boiler fuel table to inspect its columns and the derived fuel_consumed_mmbtu values
boiler_fuel_eia923

Unnamed: 0,plant_id_eia,boiler_id,energy_source_code,fuel_type_code_pudl,report_date,fuel_consumed_units,fuel_mmbtu_per_unit,sulfur_content_pct,ash_content_pct,fuel_consumed_mmbtu
0,3,1,BIT,coal,2008-01-01,41471.0,22.584,0.49,5.4,936581.064
1,3,1,BIT,coal,2008-02-01,33833.0,22.498,5.00,5.6,761174.834
2,3,1,BIT,coal,2008-03-01,40430.0,22.720,0.49,5.4,918569.600
3,3,1,BIT,coal,2008-04-01,33311.0,22.454,0.50,13.7,747965.194
4,3,1,BIT,coal,2008-05-01,31067.0,22.612,0.52,5.5,702487.004
...,...,...,...,...,...,...,...,...,...,...
1310419,3295,URQ3,NG,gas,2011-08-01,46321.0,1.020,0.00,0.0,47247.420
1310420,3295,URQ3,NG,gas,2011-09-01,10121.0,1.020,0.00,0.0,10323.420
1310421,3295,URQ3,NG,gas,2011-10-01,2568.0,1.010,0.00,0.0,2593.680
1310422,3295,URQ3,NG,gas,2011-11-01,0.0,0.000,0.00,0.0,0.000


# 3. Clean Data

TODO:
- [x] Crosswalk the EPA plant codes to the EIA plant codes. NOTE: I believe that in the PUDL ETL process, the EPA plant code (`ORISPL_CODE`) is simply renamed `plant_id_eia` without actually crosswalking the IDs
- [ ] Outlier detection
- [ ] Ensure consistency of net generation and heat input data in EIA-923
- [x] Apply corrections from egrid static tables
- [ ] Assign primary fuel type to each CEMS observation

### Crosswalk the EPA plant_id to the EIA plant_id

In [62]:
# read only the crosswalk columns that are needed
psdc = pd.read_csv(
    '../data/epa/epa_eia_crosswalk.csv',
    usecols=['CAMD_PLANT_ID','CAMD_UNIT_ID','CAMD_GENERATOR_ID','EIA_PLANT_ID','EIA_GENERATOR_ID','EIA_BOILER_ID','CAMD_FUEL_TYPE','EIA_FUEL_TYPE'],
)

# Build a lookup from (EPA plant, EPA unit) to EIA plant:
#  - one row per distinct plant/unit/EIA-plant combination
#  - keep only the rows where the EPA and EIA plant ids differ
#  - drop rows with a missing value, then store the EIA id as an int
#  - rename the columns to match the format of the cems data
plant_id_crosswalk = (
    psdc[['CAMD_PLANT_ID','CAMD_UNIT_ID','EIA_PLANT_ID']]
    .drop_duplicates()
    .loc[lambda df: df['CAMD_PLANT_ID'] != df['EIA_PLANT_ID']]
    .dropna()
    .astype({'EIA_PLANT_ID': int})
    .rename(columns={'CAMD_PLANT_ID':'plant_id_epa','CAMD_UNIT_ID':'unitid','EIA_PLANT_ID':'plant_id_eia'})
)

plant_id_crosswalk

Unnamed: 0,plant_id_epa,unitid,plant_id_eia
162,302,10,59002
167,302,6,59002
168,302,7,59002
169,302,8,59002
170,302,9,59002
...,...,...,...
5722,55375,CT4,57664
5942,55481,1,58557
5944,55481,2,58557
6823,70454,MAG1,54538


In [63]:
# match plant_id_eia on plant_id_epa and unitid
# NOTE(review): if the crosswalk lists more than one EIA plant for the same (plant_id_epa, unitid)
# pair, this left merge duplicates the matching CEMS rows - confirm the pairs are unique
cems = cems.merge(plant_id_crosswalk, how='left', on=['plant_id_epa','unitid'])

# if the merge resulted in any missing plant_id associations, fill with the plant_id_epa, assuming that they are the same
# the unmatched rows make the merged column float (NaN), so cast back to int once it is filled
# to keep plant_id_eia an integer key like the crosswalk's plant_id_eia
cems['plant_id_eia'] = cems['plant_id_eia'].fillna(cems['plant_id_epa']).astype(int)

### Remove any non-grid connected plants

In [71]:
# plant ids that the eGRID static table lists as not connected to the grid
ngc_table = pd.read_csv(f'../data/egrid/egrid{year}_static_tables/table_4-2_plants_not_connected_to_grid.csv')
ngc_plants = ngc_table['Plant ID'].tolist()
# keep only the cems observations whose plant is not in that list
is_not_grid_connected = cems['plant_id_eia'].isin(ngc_plants)
cems = cems[~is_not_grid_connected]

### Identify missing and outlier CO2 values
Notes
 - whenever there is a non-zero heat input and/or operating time reported, but 0 carbon emissions, we might want to treat that as a missing value

### Assign a primary fuel type to each unit-hour in CEMS
- Assign a primary fuel type code  
- For generators with multiple fuel types, create a heat-input weighted emission factor that can be used to fill missing CO2 data if needed

### Fill missing CO2 data in CEMS
Approach: once a primary fuel type has been assigned, use it to estimate the missing CO2 mass by multiplying `heat_content_mmbtu` by the emission factor for that fuel type, taken from `data/egrid/egrid2019_static_tables/table_C1_emission_factors_for_CO2_CH4_N2O.csv`

In [None]:
# Fill missing CO2 values in cems: pull out the rows with no co2_mass_tons, attach the
# monthly fuel information, let data_cleaning.fill_missing_co2 estimate the values, and
# write the result back into cems.

# check if there is any missing co2 data in the cems
co2_is_missing = cems['co2_mass_tons'].isnull()
if co2_is_missing.any():
    # create a new df with all observations with missing co2 data
    # (copy so the helper calls below never operate on a view of cems)
    missing_co2 = cems[co2_is_missing].copy()
    n_missing = len(missing_co2)
    # add a report date column to the missing data to prepare to merge
    missing_co2 = data_cleaning.add_report_date(missing_co2, plants_entity_eia)
    # fill missing co2 data
    unit_fuel_table = data_cleaning.monthly_fuel_types(missing_co2, boiler_fuel_eia923, plants_entity_eia)
    # merge the unit fuel info into the missing_co2 dataframe
    missing_co2 = missing_co2.merge(unit_fuel_table, how='left', on=[
                                    'plant_id_eia', 'unitid', 'report_date'])
    missing_co2.update(data_cleaning.fill_missing_co2(missing_co2))

    # The merge above returns a frame with a fresh 0..n-1 index, so the filled values can no
    # longer be matched to cems by index label. Write them back by position instead.
    # NOTE(review): this assumes data_cleaning.add_report_date keeps the row order (the left
    # merge does) - confirm against that helper.
    if len(missing_co2) != n_missing:
        raise ValueError(
            f'expected {n_missing} rows with missing co2 but got {len(missing_co2)} after merging '
            'the fuel table; unit_fuel_table probably has duplicate plant_id_eia/unitid/report_date keys')
    # rows that could not be filled are still NaN in missing_co2, so they stay NaN in cems
    cems.loc[co2_is_missing, 'co2_mass_tons'] = missing_co2['co2_mass_tons'].to_numpy()

    # this code could be implemented to calculate the weighted average ef for a plant to help fill missing data
    #
    # weighted_ef = data_cleaning.calculate_heat_input_weighted_ef(boiler_fuel_eia923=boiler_fuel_eia923, level='plant')
    # missing_co2 = cems[cems['co2_mass_tons'] == 0]
    # missing_co2 = missing_co2.merge(weighted_ef, how='left', on=['plant_id_eia','report_date'])
    # missing_co2['co2_mass_tons'] = missing_co2['heat_content_mmbtu'] * missing_co2['fuel_weighted_ef_tons_per_mmbtu']
    # missing_co2 = missing_co2.drop(columns=['fuel_weighted_ef_tons_per_mmbtu'])
    # cems.update(missing_co2)

### Identify CHP Plants
Notes:
 - Generators where gross_load_mw = 0 are likely CHP plants that are only producing heat and no electricity - need to confirm

# Calculate CEMS net generation

In [None]:
# Convert plant-level gross generation from CEMS into net generation using
# gross-to-net ratios.

# calculate parasitic loss factors
# NOTE(review): `generators` is not defined in any cell visible here - confirm it is loaded before this cell runs
gtn_ratios, gtn_fill_values = gross_to_net_generation.gross_to_net_ratios(cems, generators, plants_entity_eia)

print(' aggregating data to plant level')
# keep only the grouping keys and the columns to be summed, then total them per plant and hour
plant_hour_keys = ['plant_id_eia', 'operating_datetime_utc']
columns_to_sum = ['co2_mass_tons', 'heat_content_mmbtu', 'gross_generation_mwh']
cems_gross = (
    cems[plant_hour_keys + columns_to_sum]
    .groupby(plant_hour_keys)
    .sum()
    .reset_index()
)

print(' adding report dates')
# add report_date column
cems_gross = data_cleaning.add_report_date(cems_gross, plants_entity_eia)

print(' calculating net generation')
# attach the ratio for each plant and report date, plus the per-plant fill value
monthly_gtn = gtn_ratios[['plant_id_eia', 'report_date', 'gtn_ratio']]
cems_gross = (
    cems_gross
    .merge(monthly_gtn, how='left', on=['plant_id_eia', 'report_date'])
    .merge(gtn_fill_values, how='left', on=['plant_id_eia'])
)

# where the ratio is missing use the plant's fill value; anything still missing becomes 1
cems_gross['gtn_ratio'] = cems_gross['gtn_ratio'].fillna(cems_gross['gtn_fill']).fillna(1)
cems_gross = cems_gross.drop(columns=['gtn_fill'])

# net generation = gross generation * gross-to-net ratio
cems_gross['net_generation_mwh'] = cems_gross['gross_generation_mwh'] * cems_gross['gtn_ratio']

# merge this data into the main cems dataframe
# need to figure out if aggregating at the plant level is ok


# Determine coverage of CEMS
- Groupby.sum() CEMS data to plant, generator, and month - focusing on net generation and heat input
- Compare to EIA-923 data to determine in which months a generator was active (according to EIA-923), but is missing data from CEMS
- Aggregate up to BA level to determine what percent of generation (mwh) is missing from CEMS

# Calculate Residual Net Generation Profiles

# Aggregate data to Balancing Authorities
The output should be a pandas dataframe with a datetimeindex for each hour of the year (in UTC) and each column is a different BA code. The data will represent the average emission rate

Unnamed: 0,plant_id_eia,plant_name_eia,balancing_authority_code_eia,balancing_authority_name_eia,city,county,ferc_cogen_status,ferc_exempt_wholesale_generator,ferc_small_power_producer,grid_voltage_kv,...,iso_rto_code,latitude,longitude,primary_purpose_id_naics,sector_name_eia,sector_id_eia,state,street_address,zip_code,timezone
0,1,Sand Point,,,Sand Point,Aleutians East,False,False,False,0.48,...,,55.339722,-160.497222,22.0,IPP Non-CHP,2.0,AK,100 Power Plant Way,99661,America/Anchorage
1,2,Bankhead Dam,SOCO,"Southern Company Services, Inc. - Trans",Northport,Tuscaloosa,False,False,False,115.00,...,,33.458665,-87.356820,22.0,Electric Utility,1.0,AL,19001 Lock 17 Road,35476,America/Chicago
2,3,Barry,SOCO,"Southern Company Services, Inc. - Trans",Bucks,Mobile,False,False,False,230.00,...,,31.006900,-88.010300,22.0,Electric Utility,1.0,AL,North Highway 43,36512,America/Chicago
3,4,Walter Bouldin Dam,SOCO,"Southern Company Services, Inc. - Trans",Wetumpka,Elmore,False,False,False,115.00,...,,32.583889,-86.283060,22.0,Electric Utility,1.0,AL,750 Bouldin Dam Road,36092,America/Chicago
4,5,Chickasaw,,,,,,,,,...,,30.763300,-88.060600,,,,AL,,,America/Chicago
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14496,880100,Ravenswood Steam Plant,,,,,,,,,...,,40.760000,-73.750000,,,,NY,,,America/New_York
14497,880101,RockTenn CP LLC Stevenson Mill,,,,,,,,,...,,34.838000,-85.786500,,,,AL,,,America/Chicago
14498,880107,SPMT Marcus Hook Industrial Complex,,,,,,,,,...,,39.807600,-75.423900,,,,PA,,,America/New_York
14499,880108,Grain Processing Corporation,,,,,,,,,...,,38.655200,-87.181400,,,,IN,,,America/Indiana/Vincennes


In [29]:
# lookup of the balancing authority code for each EIA plant id
plant_ba = plants_entity_eia.loc[:, ['plant_id_eia','balancing_authority_code_eia']]

# attach the BA code to every CEMS observation; plants missing from the lookup get NaN
cems = pd.merge(cems, plant_ba, how='left', on='plant_id_eia')
cems.head(5)

Unnamed: 0,plant_id_eia,unitid,operating_datetime_utc,operating_time_hours,gross_load_mw,steam_load_1000_lbs,co2_mass_tons,co2_mass_measurement_code,heat_content_mmbtu,facility_id,unit_id_epa,gross_generation_mwh,balancing_authority_code_eia
0,3,4,2019-01-01 06:00:00+00:00,1.0,150.0,,168.699997,Measured,1644.5,1,4,150.0,SOCO
1,3,4,2019-01-01 07:00:00+00:00,1.0,150.0,,168.199997,Measured,1639.5,1,4,150.0,SOCO
2,3,4,2019-01-01 08:00:00+00:00,1.0,150.0,,168.800003,Measured,1644.900024,1,4,150.0,SOCO
3,3,4,2019-01-01 09:00:00+00:00,1.0,150.0,,167.5,Measured,1632.300049,1,4,150.0,SOCO
4,3,4,2019-01-01 10:00:00+00:00,1.0,150.0,,168.100006,Measured,1638.300049,1,4,150.0,SOCO


In [30]:
# Hourly emission factor (tons CO2 per MWh of gross generation) for each balancing authority.

# total CO2 and gross generation for each BA in each hour
# select the two columns before summing so the other (including non-numeric) columns are never aggregated
hourly_emissions = cems.groupby(['balancing_authority_code_eia','operating_datetime_utc'])[['co2_mass_tons','gross_generation_mwh']].sum()
# divide total emissions by total generation to get the emission factor
# hours with zero generation have no defined emission factor: use NaN rather than inf
hourly_emissions['ef_tons_co2_per_mwh'] = hourly_emissions['co2_mass_tons'] / hourly_emissions['gross_generation_mwh'].replace(0, float('nan'))

# pivot the data so each row is an hour and each column is a BA code
hourly_emissions = hourly_emissions.reset_index().pivot(index='operating_datetime_utc', columns='balancing_authority_code_eia', values='ef_tons_co2_per_mwh')
hourly_emissions

balancing_authority_code_eia,AEC,AECI,AVA,AVRN,AZPS,BANC,BPAT,CISO,CPLE,CSTO,...,SPA,SRP,SWPP,TAL,TEC,TEPC,TIDC,TVA,WACM,WALC
operating_datetime_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 05:00:00+00:00,,,,,,,,,0.576899,,...,,,,0.527125,0.566509,,,1.037195,,
2019-01-01 06:00:00+00:00,0.659398,0.825115,,,,,,,0.579561,,...,0.953725,,0.936583,0.530544,0.574877,,,0.930967,,
2019-01-01 07:00:00+00:00,0.653285,0.828527,,,1.018196,,,,0.577471,,...,0.941797,0.655890,0.919715,0.534728,0.587272,0.983496,,0.931806,1.015533,0.905725
2019-01-01 08:00:00+00:00,0.682231,0.830951,0.406897,0.422533,1.012820,0.438338,0.750742,0.439264,0.579023,0.424760,...,0.949609,0.651747,0.923755,0.542358,0.598842,0.973174,0.479167,0.958827,1.021793,0.958800
2019-01-01 09:00:00+00:00,0.699115,0.830491,0.408140,0.422533,1.004576,0.445128,0.758139,0.434707,0.578672,0.423040,...,0.953333,0.648676,0.919082,0.543946,0.604369,0.978117,0.478302,0.954586,1.026918,0.954202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-01 03:00:00+00:00,0.592540,0.743954,0.409836,0.427830,0.897877,0.465038,0.771250,0.488045,0.499840,0.425168,...,0.966986,0.451155,0.771155,0.514909,0.606840,0.976912,0.489942,0.537522,1.030843,0.601611
2020-01-01 04:00:00+00:00,0.592184,0.738933,0.410700,0.455556,0.885855,0.463407,0.780510,0.488733,0.496308,0.425645,...,0.967150,0.447647,0.776453,0.517778,0.622017,0.983260,0.490674,0.539789,1.035247,0.613097
2020-01-01 05:00:00+00:00,0.586654,0.740158,0.410288,0.431656,0.895115,0.464925,0.784062,0.492253,,0.424421,...,0.963942,0.453557,0.776866,,,0.988778,0.495745,0.536776,1.039944,0.612917
2020-01-01 06:00:00+00:00,,,0.410331,0.431785,0.892498,0.463127,0.852027,0.477131,,0.425125,...,,0.472407,0.533842,,,0.988355,0.486000,,1.039958,0.610431
