In [5]:
# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

## A note on working paths 

The data directories here assume that Jupyter's working directory is hourly-egrid/data. 

In [6]:
# import the necessary packages
%reload_ext autoreload
%autoreload 2

# import packages
import os
import requests
import tarfile
import sqlalchemy as sa
from pathlib import Path
import pandas as pd

import src.data_cleaning as data_cleaning
import src.gross_to_net_generation as gross_to_net_generation

In [50]:
import importlib
importlib.reload(data_cleaning)
importlib.reload(gross_to_net_generation)

<module 'src.gross_to_net_generation' from '/Users/gailin.pease/singularity/hourly-egrid/notebooks/../../hourly-egrid/src/gross_to_net_generation.py'>

# 1. Download data

 - Downloads the pre-cleaned PUDL versions of EIA-923, EIA-860, and EPA CEMS data  
 - Downloads EPA eGRID data  
 - Downloads EIA-930 data  
 - Downloads the EPA Power Sector Data Crosswalk

TODO
- [ ] The code for downloading the files could probably be made into functions
- [ ] Investigate other packages besides `requests` that would download these files faster

In [11]:
############### PUDL data ###########################

zenodo_url = 'https://zenodo.org/record/5701406/files/pudl-v0.5.0-2021-11-14.tgz'
pudl_version = zenodo_url.split('/')[-1].replace('.tgz','')

# if the pudl data already exists, do not re-download
if os.path.exists(f'../data/pudl'):
    print('PUDL data already downloaded')
else:
    r = requests.get(zenodo_url, params={"download":"1"}, stream=True)
    # specify parameters for progress bar
    total_size_in_bytes= int(r.headers.get('content-length', 0))
    block_size = 1024 * 1024 * 10 # 10 MB
    downloaded = 0
    with open("../data/pudl.tgz", 'wb') as fd:
        for chunk in r.iter_content(chunk_size=block_size):
            print(f'Downloading PUDL. Progress: {(round(downloaded/total_size_in_bytes*100,2))}%   \r', end='')
            fd.write(chunk)
            downloaded += block_size

    # extract the tgz file
    print('Extracting PUDL data...')
    with tarfile.open("../data/pudl.tgz") as tar:
        tar.extractall('../data/')

    # rename the extracted directory to pudl so that we don't have to update this for future versions
    os.rename(f'../data/{pudl_version}', 'pudl')

    # delete the downloaded tgz file
    os.remove("../data/pudl.tgz")



PUDL data already downloaded


In [12]:
################# eGRID data #########################

# the 2019 and 2020 data appear to be hosted on different urls
egrid_files_to_download = ['https://www.epa.gov/sites/default/files/2021-02/egrid2019_data.xlsx', 
                           'https://www.epa.gov/system/files/documents/2022-01/egrid2020_data.xlsx']


# if there is not yet a directory for egrid, make it
if not os.path.exists('../data/egrid'):
    os.mkdir('../data/egrid')

# download the egrid files
for url in egrid_files_to_download:
    filename = url.split("/")[-1]
    # if the file already exists, do not re-download it
    if os.path.exists(f'../data/egrid/{filename}'):
        print(f'{filename} already downloaded')
    else:
        r = requests.get(url, stream=True)
        
        with open(f'../data/egrid/{filename}', 'wb') as fd:
            for chunk in r.iter_content(chunk_size=1024):
                fd.write(chunk)

egrid2019_data.xlsx already downloaded
egrid2020_data.xlsx already downloaded


In [13]:
############# EIA-930 data #####################

years_to_download = [2019,2020]

# if there is not yet a directory for EIA-930, make it
if not os.path.exists('../data/eia930'):
    os.mkdir('../data/eia930')

# download the egrid files
for year in years_to_download:
    for period in ['Jan_Jun','Jul_Dec']:
        if os.path.exists(f'../data/eia930/EIA930_BALANCE_{year}_{period}.csv'):
            print(f'{year}_{period} data already downloaded')
        else:
            print(f'downloading {year}_{period} data')
            r = requests.get(f"https://www.eia.gov/electricity/gridmonitor/sixMonthFiles/EIA930_BALANCE_{year}_{period}.csv", stream=True)
        
            with open(f'../data/eia930/EIA930_BALANCE_{year}_{period}.csv', 'wb') as fd:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    fd.write(chunk)


2019_Jan_Jun data already downloaded
2019_Jul_Dec data already downloaded
2020_Jan_Jun data already downloaded
2020_Jul_Dec data already downloaded


In [14]:
########## Power Sector Data Crosswalk #############
# NOTE: Check for new releases at https://github.com/USEPA/camd-eia-crosswalk

psdc_url = 'https://github.com/USEPA/camd-eia-crosswalk/releases/download/v0.2.1/epa_eia_crosswalk.csv'

# if there is not yet a directory for egrid, make it
if not os.path.exists('../data/epa'):
    os.mkdir('../data/epa')

filename = psdc_url.split("/")[-1]
# if the file already exists, do not re-download it
if os.path.exists(f'../data/epa/{filename}'):
    print(f'{filename} already downloaded')
else:
    r = requests.get(psdc_url, stream=True)
    
    with open(f'../data/epa/{filename}', 'wb') as fd:
        for chunk in r.iter_content(chunk_size=1024):
            fd.write(chunk)
    

epa_eia_crosswalk.csv already downloaded


# 2. Load data

In [112]:
# specify the year for analysis
year = 2019

#specify the path to the CEMS data
cems_path = f'../data/pudl/pudl_data/parquet/epacems/year={year}' 

# specify the columns to use from the CEMS database
cems_columns = ['plant_id_eia', 'unitid', 'operating_datetime_utc',
'operating_time_hours', 'gross_load_mw', 'steam_load_1000_lbs',
'co2_mass_tons', 'co2_mass_measurement_code', 'heat_content_mmbtu',
'facility_id','unit_id_epa']

# specify the relative path to the sqllite database, and create an sqalchemy engine
pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

In [113]:
# load the CEMS data
cems = pd.read_parquet(cems_path, columns=cems_columns).query("operating_time_hours > 0")  # only load observations when the plant was operating

# rename cems plant_id_eia to plant_id_epa (PUDL simply renames the ORISPL_CODE column from the raw CEMS data as 'plant_id_eia' without actually crosswalking to the EIA id)
cems = cems.rename(columns={'plant_id_eia': 'plant_id_epa'})

# calculate gross generation by multiplying gross_load_mw by operating_time_hours
cems['gross_generation_mwh'] = cems['gross_load_mw'] * cems['operating_time_hours']

cems.head(5)

Unnamed: 0,plant_id_epa,unitid,operating_datetime_utc,operating_time_hours,gross_load_mw,steam_load_1000_lbs,co2_mass_tons,co2_mass_measurement_code,heat_content_mmbtu,facility_id,unit_id_epa,gross_generation_mwh
1488,3,4,2019-01-01 06:00:00+00:00,1.0,150.0,,168.699997,Measured,1644.5,1,4,150.0
1489,3,4,2019-01-01 07:00:00+00:00,1.0,150.0,,168.199997,Measured,1639.5,1,4,150.0
1490,3,4,2019-01-01 08:00:00+00:00,1.0,150.0,,168.800003,Measured,1644.900024,1,4,150.0
1491,3,4,2019-01-01 09:00:00+00:00,1.0,150.0,,167.5,Measured,1632.300049,1,4,150.0
1492,3,4,2019-01-01 10:00:00+00:00,1.0,150.0,,168.100006,Measured,1638.300049,1,4,150.0


In [114]:
# load the plants_entity_eia data
plants_entity_eia = pd.read_sql("plants_entity_eia", pudl_engine)

In [115]:
#load EIA-923 boiler fuel input data
boiler_fuel_eia923 = pd.read_sql('boiler_fuel_eia923', pudl_engine)
boiler_fuel_eia923['fuel_consumed_mmbtu'] = boiler_fuel_eia923['fuel_consumed_units'] * boiler_fuel_eia923['fuel_mmbtu_per_unit']

In [116]:
boiler_fuel_eia923

Unnamed: 0,plant_id_eia,boiler_id,energy_source_code,fuel_type_code_pudl,report_date,fuel_consumed_units,fuel_mmbtu_per_unit,sulfur_content_pct,ash_content_pct,fuel_consumed_mmbtu
0,3,1,BIT,coal,2008-01-01,41471.0,22.584,0.49,5.4,936581.064
1,3,1,BIT,coal,2008-02-01,33833.0,22.498,5.00,5.6,761174.834
2,3,1,BIT,coal,2008-03-01,40430.0,22.720,0.49,5.4,918569.600
3,3,1,BIT,coal,2008-04-01,33311.0,22.454,0.50,13.7,747965.194
4,3,1,BIT,coal,2008-05-01,31067.0,22.612,0.52,5.5,702487.004
...,...,...,...,...,...,...,...,...,...,...
1310419,3295,URQ3,NG,gas,2011-08-01,46321.0,1.020,0.00,0.0,47247.420
1310420,3295,URQ3,NG,gas,2011-09-01,10121.0,1.020,0.00,0.0,10323.420
1310421,3295,URQ3,NG,gas,2011-10-01,2568.0,1.010,0.00,0.0,2593.680
1310422,3295,URQ3,NG,gas,2011-11-01,0.0,0.000,0.00,0.0,0.000


# 3. Clean Data

TODO:
- [x] Crosswalk the epa plant codes to the EIA plant codes. NOTE: I believe that in the PUDL ETL process, the `plant_id_epa` is simply being renamed `plant_id_eia` without actually crosswalking the ids
- [ ] Outlier detection
- [ ] Ensure consistency of net generation and heat input data in EIA-923
- [x] Apply corrections from egrid static tables
- [ ] Assign primary fuel type to each CEMS observation

### Crosswalk the EPA plant_id to the EIA plant_id

In [117]:
# load the power sector data crosswalk
psdc = pd.read_csv('../data/epa/epa_eia_crosswalk.csv', usecols=['CAMD_PLANT_ID','CAMD_UNIT_ID','CAMD_GENERATOR_ID','EIA_PLANT_ID','EIA_GENERATOR_ID','EIA_BOILER_ID','CAMD_FUEL_TYPE','EIA_FUEL_TYPE'])

# create a table that matches EPA plant and unit IDs to an EIA plant ID
plant_id_crosswalk = psdc[['CAMD_PLANT_ID','CAMD_UNIT_ID','EIA_PLANT_ID']].drop_duplicates()
# only keep plant ids where the two are different
plant_id_crosswalk = plant_id_crosswalk[plant_id_crosswalk['CAMD_PLANT_ID'] != plant_id_crosswalk['EIA_PLANT_ID']].dropna()
# change the id to an int
plant_id_crosswalk['EIA_PLANT_ID'] = plant_id_crosswalk['EIA_PLANT_ID'].astype(int)
# rename the columns to match the format of the cems data
plant_id_crosswalk = plant_id_crosswalk.rename(columns={'CAMD_PLANT_ID':'plant_id_epa','CAMD_UNIT_ID':'unitid','EIA_PLANT_ID':'plant_id_eia'})

plant_id_crosswalk

Unnamed: 0,plant_id_epa,unitid,plant_id_eia
162,302,10,59002
167,302,6,59002
168,302,7,59002
169,302,8,59002
170,302,9,59002
...,...,...,...
5722,55375,CT4,57664
5942,55481,1,58557
5944,55481,2,58557
6823,70454,MAG1,54538


In [118]:
# match plant_id_eia on plant_id_epa and unitid
cems = cems.merge(plant_id_crosswalk, how='left', on=['plant_id_epa','unitid'])

# if the merge resulted in any missing plant_id associations, fill with the plant_id_epa, assuming that they are the same
cems['plant_id_eia'] = cems['plant_id_eia'].fillna(cems['plant_id_epa'])

### Remove any non-grid connected plants

In [119]:
# get the list of plant_id_eia from the static table
ngc_plants = list(pd.read_csv(f'../data/egrid/egrid{year}_static_tables/table_4-2_plants_not_connected_to_grid.csv')['Plant ID'])
# remove these plants from the cems data
cems = cems[~cems['plant_id_eia'].isin(ngc_plants)]

### Identify missing and outlier CO2 values
Notes
 - whenever there is a non-zero heat input and/or operating time reported, but 0 carbon emissions, we might want to treat that as a missing value

### Assign a primary fuel type to each unit-hour in CEMS
- Assign a primary fuel type code  
- For generators with multiple fuel types, create a heat-input weighted emission factor that can be used to fill missing CO2 data if needed

### Fill missing CO2 data in CEMS
Approach: once a primary fuel type has been assigned, use that to estimate a CO2 emission rate by multiplying heat_input_mmbtu by the emission factor for that fuel type, taken from `data\egrid\egrid2019_static_tables\table_C1_emission_factors_for_CO2_CH4_N2O.csv`

In [120]:
len(cems[cems['co2_mass_tons'].isnull()])

1224288

In [121]:
# check if there is any missing co2 data in the cems
if pd.isnull(cems['co2_mass_tons']).sum() != 0:
    #create a new df with all observations with missing co2 data
    missing_co2 = cems[cems['co2_mass_tons'].isnull()]
    #add a report date column to the missing data to prepare to merge
    missing_co2 = data_cleaning.add_report_date(missing_co2, plants_entity_eia)
    # fill missing co2 data
    unit_fuel_table = data_cleaning.monthly_fuel_types(missing_co2, boiler_fuel_eia923, plants_entity_eia)
    #merge the unit fuel info into the missing_co2 dataframe
    missing_co2 = missing_co2.merge(unit_fuel_table, how='left', on=[
                                    'plant_id_eia', 'unitid', 'report_date'])
    missing_co2.update(data_cleaning.fill_missing_co2(missing_co2))
    cems['co2_mass_tons'].update(missing_co2['co2_mass_tons'])

    # this code could be implemented to calculate the weighted average ef for a plant to help fill missing data
    """
    weighted_ef = data_cleaning.calculate_heat_input_weighted_ef(boiler_fuel_eia923=boiler_fuel_eia923, level='plant')
    missing_co2 = cems[cems['co2_mass_tons'] == 0]
    missing_co2 = missing_co2.merge(weighted_ef, how='left', on=['plant_id_eia','report_date'])
    missing_co2['co2_mass_tons'] = missing_co2['heat_content_mmbtu'] * missing_co2['fuel_weighted_ef_tons_per_mmbtu']
    missing_co2 = missing_co2.drop(columns=['fuel_weighted_ef_tons_per_mmbtu'])
    cems.update(missing_co2)
    """

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing['primary_fuel'] = missing.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing['primary_fuel'] = missing.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing['primary_fuel'] = missing.apply(


In [122]:
len(cems[cems['co2_mass_tons'].isnull()])

1201519

### Identify CHP Plants
Notes:
 - Generators where gross_load_mw = 0 are likely CHP plants that are only producing heat and no electricity - need to confirm

# Calculate CEMS net generation

In [123]:
generators = pd.read_sql('generation_eia923', pudl_engine)

In [126]:
cems.head(10)

Unnamed: 0,plant_id_epa,unitid,operating_datetime_utc,operating_time_hours,gross_load_mw,steam_load_1000_lbs,co2_mass_tons,co2_mass_measurement_code,heat_content_mmbtu,facility_id,unit_id_epa,gross_generation_mwh,plant_id_eia
0,3,4,2019-01-01 06:00:00+00:00,1.0,150.0,,29.126123,Measured,1644.5,1,4,150.0,3.0
1,3,4,2019-01-01 07:00:00+00:00,1.0,150.0,,30.600164,Measured,1639.5,1,4,150.0,3.0
2,3,4,2019-01-01 08:00:00+00:00,1.0,150.0,,23.2743,Measured,1644.900024,1,4,150.0,3.0
3,3,4,2019-01-01 09:00:00+00:00,1.0,150.0,,22.853146,Measured,1632.300049,1,4,150.0,3.0
4,3,4,2019-01-01 10:00:00+00:00,1.0,150.0,,22.897478,Measured,1638.300049,1,4,150.0,3.0
5,3,4,2019-01-01 11:00:00+00:00,1.0,150.0,,25.989635,Measured,1644.199951,1,4,150.0,3.0
6,3,4,2019-01-01 12:00:00+00:00,1.0,150.0,,26.942774,Measured,1651.699951,1,4,150.0,3.0
7,3,4,2019-01-01 13:00:00+00:00,1.0,150.0,,27.408258,Measured,1646.900024,1,4,150.0,3.0
8,3,4,2019-01-01 14:00:00+00:00,1.0,150.0,,27.89591,Measured,1654.800049,1,4,150.0,3.0
9,3,4,2019-01-01 15:00:00+00:00,1.0,150.0,,27.53017,Measured,1640.300049,1,4,150.0,3.0


In [127]:
# Some columns (eg, date) do not make sense to aggregate
aggregate_cols = ['plant_id_eia', 'operating_datetime_utc','co2_mass_tons', 'heat_content_mmbtu', 'gross_generation_mwh', 'gross_load_mw']

# calculate parastic loss factors
gtn_ratios, gtn_fill_values = gross_to_net_generation.gross_to_net_ratios(cems, generators, plants_entity_eia)

print(' aggregating data to plant level')
# aggregate to plant level
# drop columns that will not be aggregated
cems_gross = cems[aggregate_cols]
cems_gross = cems_gross.groupby(['plant_id_eia', 'operating_datetime_utc']).sum().reset_index()

print(' adding report dates')
# add report_date column
cems_gross = data_cleaning.add_report_date(cems_gross, plants_entity_eia)

print(' calculating net generation')
# convert gross load to net load
cems_gross = cems_gross.merge(gtn_ratios[['plant_id_eia', 'report_date', 'gtn_ratio']], how='left', on=['plant_id_eia', 'report_date'])

#fillna with average of non-na values for same plant
cems_gross = cems_gross.merge(gtn_fill_values, how='left', on=['plant_id_eia'])
cems_gross['gtn_ratio'] = cems_gross['gtn_ratio'].fillna(cems_gross['gtn_fill'])
cems_gross = cems_gross.drop(columns=['gtn_fill'])
# any other values that are still missing should be filled with 1
cems_gross['gtn_ratio'] = cems_gross['gtn_ratio'].fillna(1)

#calculate net generation
cems_gross['net_generation_mwh'] = cems_gross['gross_generation_mwh'] * cems_gross['gtn_ratio']

# need to figure out if aggregating at the plant level is ok


  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return self.resid / sigma / np.sqrt(1 - hii)
  return 1 - self.ssr/self.centered_tss
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return self.resid / sigma / np.sqrt(1 - hii)
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return self.resid / sigma / np.sqrt(1 - hii)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg

  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return self.resid / sigma / np.sqrt(1 - hii)
  return 1 - self.ssr/self.centered_tss
  return self.resid / sigma / np.sqrt(1 - hii)
  return 1 - self.ssr/self.centered_tss


 aggregating data to plant level
 adding report dates
 calculating net generation


In [89]:
# For what fraction of rows are we using default of 1 as gross-to-net ratio? 
sum(cems_gross["gtn_ratio"] == 1)/len(cems_gross["gtn_ratio"])

0.22809763579979325

In [143]:
# merge this data into the main cems dataframe

# These columns should be the same across units within plant, so get first row of each
# TODO: we don't use these columns currently. if that continues to be true, we could drop them.
non_aggregated_cols = ['plant_id_eia', 'plant_id_epa', 'operating_time_hours', 'co2_mass_measurement_code','facility_id','operating_datetime_utc']
plant_cems = cems.groupby(['plant_id_eia', 'operating_datetime_utc']).head(1)

cems = cems_gross.merge(plant_cems[non_aggregated_cols], how='left', on=[
                                    'plant_id_eia', 'operating_datetime_utc'])
cems.groupby("balancing_authority_code_eia").sum().head()

Unnamed: 0_level_0,plant_id_eia,co2_mass_tons,heat_content_mmbtu,gross_generation_mwh,gross_load_mw,gtn_ratio,net_generation_mwh,plant_id_epa,operating_time_hours,facility_id
balancing_authority_code_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AEC,28274776.0,1692280.0,43678212.0,4268174.0,4291565.0,19616.738018,5214430.0,28274776.0,16269.44043,3731549
AECI,539727252.0,19721530.0,230825888.0,27237248.0,27268988.0,37420.47078,26210100.0,539727252.0,38091.179688,31542575
AVA,434111194.0,853074.5,14354582.0,1999214.75,2005609.0,9690.115169,1973711.0,434111194.0,9585.879883,12133734
AVRN,527068000.0,1537410.0,25869792.0,3500520.25,3511278.0,9316.197443,3397836.0,527068000.0,9193.639648,12600127
AZPS,112040647.0,11989450.0,156645472.0,16092977.0,16141610.0,34872.161949,14817970.0,112040647.0,37570.890625,18046119


# Determine coverage of CEMS
- Groupby.sum() CEMS data to plant, generator, and month - focusing on net generation and heat input
- Compare to EIA-923 data to determine in which months a generator was active (according to EIA-923), but is missing data from CEMS
- Aggregate up to BA level to determine what percent of generation (mwh) is missing from CEMS <- using EIA-930??

# Calculate Residual Net Generation Profiles


# Aggregate data to Balancing Authorities
The output should be a pandas dataframe with a datetimeindex for each hour of the year (in UTC) and each column is a different BA code. The data will represent the average emission rate.

To compare to EIA-930 and annual eGRID, we will also want to compare net generation and emissions.

In [148]:
# identify which BA each plant is in
plant_ba = plants_entity_eia[['plant_id_eia','balancing_authority_code_eia']]

# merge the ba code into the CEMS data
cems = cems.merge(plant_ba, how='left', on='plant_id_eia')
cems.head(5)

Unnamed: 0,plant_id_eia,operating_datetime_utc,co2_mass_tons,heat_content_mmbtu,gross_generation_mwh,gross_load_mw,report_date,gtn_ratio,net_generation_mwh,balancing_authority_code_eia_x,plant_id_epa,operating_time_hours,co2_mass_measurement_code,facility_id,balancing_authority_code_eia_y,balancing_authority_code_eia
0,3.0,2019-01-01 06:00:00+00:00,143.402939,7129.899902,645.0,666.0,2019-01,0.961957,620.462497,SOCO,3,1.0,Measured,1,SOCO,SOCO
1,3.0,2019-01-01 07:00:00+00:00,101.808441,7039.100098,638.0,638.0,2019-01,0.961957,613.728795,SOCO,3,1.0,Measured,1,SOCO,SOCO
2,3.0,2019-01-01 08:00:00+00:00,84.818199,6966.400391,626.0,626.0,2019-01,0.961957,602.185307,SOCO,3,1.0,Measured,1,SOCO,SOCO
3,3.0,2019-01-01 09:00:00+00:00,86.048409,6969.300293,625.0,625.0,2019-01,0.961957,601.22335,SOCO,3,1.0,Measured,1,SOCO,SOCO
4,3.0,2019-01-01 10:00:00+00:00,88.143097,6829.799805,625.0,625.0,2019-01,0.961957,601.22335,SOCO,3,1.0,Measured,1,SOCO,SOCO


In [149]:
hourly_emissions = cems.groupby(['balancing_authority_code_eia','operating_datetime_utc']).sum()[['co2_mass_tons','net_generation_mwh']]
# divide total emissions by total generation to get the emission factor
hourly_emissions['ef_tons_co2_per_mwh'] = hourly_emissions['co2_mass_tons'] / hourly_emissions['net_generation_mwh']

# pivot the data
hourly_emission_rate = hourly_emissions.reset_index().pivot(index='operating_datetime_utc', columns='balancing_authority_code_eia', values='ef_tons_co2_per_mwh')
hourly_emission_rate

balancing_authority_code_eia,AEC,AECI,AVA,AVRN,AZPS,BANC,BPAT,CISO,CPLE,CSTO,...,SPA,SRP,SWPP,TAL,TEC,TEPC,TIDC,TVA,WACM,WALC
operating_datetime_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 05:00:00+00:00,,,,,,,,,0.660468,,...,,,,0.471681,0.534136,,,1.188027,,
2019-01-01 06:00:00+00:00,0.454045,0.859463,,,,,,,0.664060,,...,1.037423,,0.960496,0.477731,0.550416,,,0.985028,,
2019-01-01 07:00:00+00:00,0.449835,0.862902,,,0.800466,,,,0.659645,,...,1.024447,0.190692,0.937146,0.474420,0.563341,0.157935,,0.986587,1.083537,0.539106
2019-01-01 08:00:00+00:00,0.469767,0.865417,0.412470,0.435513,0.834473,0.574896,0.759043,0.329894,0.659514,0.434392,...,1.032945,0.162165,0.941366,0.480163,0.573574,0.163685,0.077800,1.016743,1.092086,0.323829
2019-01-01 09:00:00+00:00,0.481392,0.864889,0.413730,0.435513,0.824236,0.615730,0.765567,0.334543,0.663090,0.432632,...,1.036996,0.161858,0.937645,0.484387,0.578491,0.162700,0.080062,1.011904,1.098681,0.333634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-01 03:00:00+00:00,0.365494,0.777508,0.415681,0.440340,0.787157,0.494792,0.763997,0.486227,0.540658,0.435041,...,1.052937,0.374331,0.763282,0.444443,0.599982,0.597433,0.506816,0.529078,1.091017,0.676953
2020-01-01 04:00:00+00:00,0.365532,0.771026,0.416557,0.468876,0.794313,0.493544,0.773044,0.486494,0.537783,0.435530,...,1.053116,0.383485,0.763401,0.447009,0.614524,0.599765,0.507801,0.531106,1.096773,0.680819
2020-01-01 05:00:00+00:00,0.353210,0.771268,0.416140,0.444278,0.785550,0.494958,0.775959,0.489287,,0.434276,...,1.049623,0.421194,0.760760,,,0.601815,0.513050,0.524029,1.102090,0.697604
2020-01-01 06:00:00+00:00,,,0.416183,0.444410,0.748214,0.493020,0.832311,0.474847,,0.434997,...,,0.459677,0.567430,,,0.596953,0.502965,,1.103613,0.698169


In [150]:
# Net generation and emission 

hourly_net_generation = hourly_emissions.reset_index().pivot(index='operating_datetime_utc', columns='balancing_authority_code_eia', values='net_generation_mwh')
hourly_net_emissions = hourly_emissions.reset_index().pivot(index='operating_datetime_utc', columns='balancing_authority_code_eia', values='co2_mass_tons')

# 4. Output data 

Save data to a CSV so we can separate generation of and analysis of hourly e-grid numbers

In [151]:
if not os.path.exists('../data/output'):
    os.makedirs('../data/output')
    
hourly_emission_rate.to_csv("../data/output/hourly_emission_rate.csv")
hourly_net_generation.to_csv("../data/output/hourly_net_generation.csv")
hourly_net_emissions.to_csv("../data/output/hourly_net_emission.csv")