In [2]:
# Tell python where to look for the project modules.
# Depending on how your jupyter handles working directories, this may not be needed.
import sys

module_search_dir = '../../hourly-egrid/'
# Guard the append so that re-running this cell does not keep adding duplicate entries.
if module_search_dir not in sys.path:
    sys.path.append(module_search_dir)

In [3]:
# import the necessary packages
%reload_ext autoreload
%autoreload 2

# import packages
import os
import requests
import tarfile
import sqlalchemy as sa
from pathlib import Path
import pandas as pd

import src.data_cleaning as data_cleaning
import src.gross_to_net_generation as gross_to_net_generation
import src.load_data as load_data

# Disabled manual-reload snippet: it is wrapped in a bare string literal, so it never runs.
# Because the string is the last expression in the cell, it is echoed as the cell output.
# If executed, it would re-import data_cleaning and gross_to_net_generation after source edits.
# NOTE(review): `%autoreload 2` at the top of this cell presumably makes this redundant —
# confirm and delete the string.
"""
import importlib
importlib.reload(data_cleaning)
importlib.reload(gross_to_net_generation)
"""

  from pandas import Int64Index as NumericIndex


'\nimport importlib\nimportlib.reload(data_cleaning)\nimportlib.reload(gross_to_net_generation)\n'

# Specify the year for analysis

In [4]:
# Data year for the whole analysis: it is passed to the download/load helpers, interpolated
# into the SQL report_date filters, and appended to the output file names.
year = 2020

# 1. Download data

 - Downloads the pre-cleaned PUDL versions of EIA-923, EIA-860, and EPA CEMS data  
 - Downloads EPA eGRID data  
 - Downloads EIA-930 data  
 - Downloads the EPA Power Sector Data Crosswalk

TODO
- [x] The code for downloading the files could probably be made into functions
- [ ] Investigate other packages besides `requests` that would download these files faster

In [4]:
# ---- PUDL database (pre-cleaned EIA-923, EIA-860 and EPA CEMS) ----
load_data.download_pudl_data(
    zenodo_url='https://zenodo.org/record/5701406/files/pudl-v0.5.0-2021-11-14.tgz'
)

# ---- eGRID workbooks ----
# The 2019 and 2020 workbooks appear to be hosted under different URL layouts.
egrid_files_to_download = [
    'https://www.epa.gov/sites/default/files/2021-02/egrid2019_data.xlsx',
    'https://www.epa.gov/system/files/documents/2022-01/egrid2020_data.xlsx',
]
load_data.download_egrid_files(egrid_files_to_download)

# ---- EIA-930 data for the analysis year ----
load_data.download_eia930_data(years_to_download=[year])

# ---- EPA Power Sector Data Crosswalk ----
# NOTE: Check for new releases at https://github.com/USEPA/camd-eia-crosswalk
load_data.download_epa_psdc(
    psdc_url='https://github.com/USEPA/camd-eia-crosswalk/releases/download/v0.2.1/epa_eia_crosswalk.csv'
)


PUDL data already downloaded
egrid2019_data.xlsx already downloaded
egrid2020_data.xlsx already downloaded
2020_Jan_Jun data already downloaded
2020_Jul_Dec data already downloaded
epa_eia_crosswalk.csv already downloaded



# 1. Get emissions data for CEMS-reporting plants
There are three broad categories of plants based on their CAMD reporting status:
1. Units that report to CAMD year-round (for these plants, emissions data is used directly from CEMS)
2. Units that only report to CAMD during the ozone season (May-Sept) (for these units, non-ozone season data is taken from EIA 923)
3. Units that do not report to CAMD (generally fossil units < 25MW and non-fossil generators)

The first step is to gather all of the emissions data for all CEMS-reported data

## Data cleaning steps

- [x] Certain plants report to CAMD but do not produce electricity for the grid; these need to be removed from the CEMS data:
    - Non grid connected plants
    - Steam-only plants
- [ ] Check CEMS for outlier values in gross load, heat input, and co2 emissions
- [ ] Update fuel types of plants based on EPA static table
- [x] In some hours, plants that report to CEMS report heat input but no CO2 emissions. We use the reported heat input and the emission factor for the fuel type associated with the unit to calculate the missing emissions


In [5]:
# Build the cleaned hourly CEMS table for the analysis year.
# Each step receives the frame returned by the previous step.
cems = (
    load_data.load_cems_data(year)
    # drop plants that are not connected to the grid
    .pipe(data_cleaning.remove_non_grid_connected_plants, year)
    # drop plants that only report steam generation and no electrical generation
    .pipe(data_cleaning.remove_heating_only_plants)
    # add the report_date column
    .pipe(data_cleaning.add_report_date)
    # flag whether each unit reports to CEMS for the full year or only part of it
    .pipe(data_cleaning.determine_cems_reporting_status)
    # fill missing hourly CO2 values from the fuel type and heat input
    .pipe(data_cleaning.fill_cems_missing_co2, year)
)

cems

Unnamed: 0,plant_id_eia,unitid,cems_id,operating_datetime_utc,operating_time_hours,gross_load_mw,gross_generation_mwh,steam_load_1000_lbs,heat_content_mmbtu,co2_mass_tons,co2_mass_measurement_code,plant_id_epa,unit_id_epa,report_date,cems_reporting_category
0,3,1,3_1,2020-01-01 06:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,,3,1,2020-01-01,full_year
1,3,1,3_1,2020-01-01 07:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,,3,1,2020-01-01,full_year
2,3,1,3_1,2020-01-01 08:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,,3,1,2020-01-01,full_year
3,3,1,3_1,2020-01-01 09:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,,3,1,2020-01-01,full_year
4,3,1,3_1,2020-01-01 10:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,,3,1,2020-01-01,full_year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34107379,57703,CT03,57703_CT03,2021-01-01 02:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,,57703,90709,2020-12-01,full_year
34107380,57703,CT03,57703_CT03,2021-01-01 03:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,,57703,90709,2020-12-01,full_year
34107381,57703,CT03,57703_CT03,2021-01-01 04:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,,57703,90709,2020-12-01,full_year
34107382,57703,CT03,57703_CT03,2021-01-01 05:00:00+00:00,0.0,0.0,0.0,,0.0,0.0,,57703,90709,2020-12-01,full_year


In [6]:
# Rows whose CO2 mass is still null after the fill step
missing_co2_rows = cems[cems['co2_mass_tons'].isnull()]

# report which plant/unit ids those rows belong to
unresolved_units = list(missing_co2_rows['cems_id'].unique())
print(f"Unable to find fuel types for the following plants_units: {unresolved_units}")

missing_co2_rows

Unable to find fuel types for the following plants_units: ['1004_CTG1', '880109_B001']


Unnamed: 0,plant_id_eia,unitid,cems_id,operating_datetime_utc,operating_time_hours,gross_load_mw,gross_generation_mwh,steam_load_1000_lbs,heat_content_mmbtu,co2_mass_tons,co2_mass_measurement_code,plant_id_epa,unit_id_epa,report_date,cems_reporting_category
10905359,1004,CTG1,1004_CTG1,2020-04-10 04:00:00+00:00,0.03,0.0,0.0,,0.030000,,Measured,1004,90673,2020-04-01,full_year
11497174,1004,CTG1,1004_CTG1,2020-11-02 03:00:00+00:00,0.03,0.0,0.0,,0.030000,,Measured,1004,90673,2020-11-01,full_year
23945496,880109,B001,880109_B001,2020-05-01 05:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-05-01,partial_year
23945497,880109,B001,880109_B001,2020-05-01 06:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-05-01,partial_year
23945498,880109,B001,880109_B001,2020-05-01 07:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-05-01,partial_year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24290251,880109,B001,880109_B001,2020-10-01 00:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-09-01,partial_year
24290252,880109,B001,880109_B001,2020-10-01 01:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-09-01,partial_year
24290253,880109,B001,880109_B001,2020-10-01 02:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-09-01,partial_year
24290254,880109,B001,880109_B001,2020-10-01 03:00:00+00:00,1.00,0.0,0.0,,332.299988,,,880109,91300,2020-09-01,partial_year


# 2. Get monthly data for all plants/units where data is missing from CEMS
We have now identified all plants that report the full year to CEMS, and all plants that report a partial year. We will now use the EIA-923 data to fill in the missing pieces.

1. Load EIA-923 data, and standardize heat input and generation data across the tables
2. Identify all plants/months for which we do not have CEMS data

Other:
- [ ] We need to identify whether there are plants where data is missing for some units but not others, or if we can aggregate at the plant level
- [ ] Double check data for generators that were retiring in the year

We need to be able to match the EIA data to the CEMS data based on units so we know which data will be used to fill the missing data


In [7]:
# Pull the generation_fuel_eia923 records whose report_date falls within the analysis year
gen_fuel_query = (
    "SELECT * FROM generation_fuel_eia923 "
    f"WHERE report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'"
)

# load the table, then drop plants that are not connected to the grid
generation_fuel_eia923 = load_data.load_pudl_table(gen_fuel_query).pipe(
    data_cleaning.remove_non_grid_connected_plants, year
)

generation_fuel_eia923

Unnamed: 0,plant_id_eia,report_date,energy_source_code,fuel_type_code_pudl,fuel_type_code_aer,prime_mover_code,fuel_consumed_units,fuel_consumed_for_electricity_units,fuel_mmbtu_per_unit,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,net_generation_mwh
0,1,2020-01-01,WND,wind,WND,WT,0.0,0.0,0.000,266.0,266.0,30.317
1,1,2020-02-01,WND,wind,WND,WT,0.0,0.0,0.000,273.0,273.0,31.076
2,1,2020-03-01,WND,wind,WND,WT,0.0,0.0,0.000,273.0,273.0,31.133
3,1,2020-04-01,WND,wind,WND,WT,0.0,0.0,0.000,272.0,272.0,30.983
4,1,2020-05-01,WND,wind,WND,WT,0.0,0.0,0.000,258.0,258.0,29.431
...,...,...,...,...,...,...,...,...,...,...,...,...
171622,64756,2020-08-01,NG,gas,NG,FC,7497.0,7497.0,1.036,7767.0,7767.0,1105.754
171623,64756,2020-09-01,NG,gas,NG,FC,6528.0,6528.0,1.036,6763.0,6763.0,962.939
171624,64756,2020-10-01,NG,gas,NG,FC,6780.0,6780.0,1.036,7024.0,7024.0,1000.037
171625,64756,2020-11-01,NG,gas,NG,FC,5416.0,5416.0,1.036,5611.0,5611.0,798.794


In [8]:
# Broad fuel groupings, keyed by energy_source_code (clean, fossil, biofuel, geothermal).
# Only clean_fuels is used in this cell; the other lists are defined for later use.
# NOTE(review): 'SC' shows up later in this notebook among the codes with no emission factor
# and is not in any of these lists — decide which category it belongs to.
# NOTE(review): 'WDS' looks like a wood/biomass code — confirm it belongs in the fossil list.
clean_fuels = ['SUN', 'MWH', 'WND', 'WAT', 'WH', 'PUR', 'NUC']
fossil_fuels = [
    'NG', 'DFO', 'OG', 'WDS', 'BIT', 'OTH', 'PC', 'SUB', 'LIG', 'KER',
    'RC', 'WO', 'RFO', 'WC', 'SGC', 'SGP', 'PG', 'JF', 'BFG',
]
bio_fuels = ['OBG', 'LFG', 'AB', 'OBL', 'BLQ', 'OBS', 'MSW', 'WDL']
geo_fuel = ['GEO']

# Drop every record whose energy source is in the clean (non-emitting) list.
# This filters rows, so a plant that also burns other fuels keeps its remaining rows.
uses_clean_fuel = generation_fuel_eia923['energy_source_code'].isin(clean_fuels)
generation_fuel_eia923 = generation_fuel_eia923[~uses_clean_fuel]

In [9]:
# Identify plants that consumed fuel during the year but used none of it for electricity
# (steam-only plants), and remove all of their records from the data.

# Annual totals per plant. numeric_only=True restricts the sum to numeric columns, so
# text/date columns (e.g. energy_source_code, report_date) are not concatenated and
# cannot raise on newer pandas versions.
gf923_annual = generation_fuel_eia923.groupby(['plant_id_eia']).sum(numeric_only=True)

no_fuel_for_electricity = gf923_annual['fuel_consumed_for_electricity_mmbtu'] == 0
consumed_some_fuel = gf923_annual['fuel_consumed_mmbtu'] > 0
steam_only_plants = list(gf923_annual[no_fuel_for_electricity & consumed_some_fuel].index)

generation_fuel_eia923 = generation_fuel_eia923[~generation_fuel_eia923['plant_id_eia'].isin(steam_only_plants)]

In [26]:
# NOTE(review): this cell was executed out of order (In [26]). The output shown below already
# contains the co2_tons_per_mmbtu / co2_mass_tons columns, which are only added by the
# emissions cell further down.
generation_fuel_eia923

Unnamed: 0,plant_id_eia,report_date,energy_source_code,fuel_type_code_pudl,fuel_type_code_aer,prime_mover_code,fuel_consumed_units,fuel_consumed_for_electricity_units,fuel_mmbtu_per_unit,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,net_generation_mwh,co2_tons_per_mmbtu,co2_mass_tons
0,3,2020-02-01,NG,gas,NG,CA,84692.0,84692.0,1.015,85962.0,85962.0,246804.000,0.05844,5023.61928
1,3,2020-03-01,NG,gas,NG,CA,154529.0,154529.0,1.015,156847.0,156847.0,257145.000,0.05844,9166.13868
2,3,2020-04-01,NG,gas,NG,CA,59835.0,59835.0,1.029,61570.0,61570.0,215636.000,0.05844,3598.15080
3,3,2020-05-01,NG,gas,NG,CA,48570.0,48570.0,1.015,49299.0,49299.0,197459.000,0.05844,2881.03356
4,3,2020-06-01,NG,gas,NG,CA,189421.0,189421.0,1.033,195672.0,195672.0,253075.000,0.05844,11435.07168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93417,64756,2020-08-01,NG,gas,NG,FC,7497.0,7497.0,1.036,7767.0,7767.0,1105.754,0.05844,453.90348
93418,64756,2020-09-01,NG,gas,NG,FC,6528.0,6528.0,1.036,6763.0,6763.0,962.939,0.05844,395.22972
93419,64756,2020-10-01,NG,gas,NG,FC,6780.0,6780.0,1.036,7024.0,7024.0,1000.037,0.05844,410.48256
93420,64756,2020-11-01,NG,gas,NG,FC,5416.0,5416.0,1.036,5611.0,5611.0,798.794,0.05844,327.90684


In [None]:
# Load the EIA-923 generation table (generation_eia923) for the analysis year.
# The query previously selected from generation_fuel_eia923, which only duplicated the
# fuel table loaded earlier under a misleading variable name.
# NOTE(review): assumes generation_eia923 has a report_date column like the fuel table — confirm.
generation_eia923 = load_data.load_pudl_table(f"SELECT * FROM generation_eia923 WHERE report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'")

In [None]:
# data cleaning

In [16]:
# Estimate CO2 emissions for each EIA-923 record from its fuel type.

# emission factor (tons CO2 per mmBtu) for each energy source code
emission_factors = load_data.load_emission_factors(year)[['energy_source_code', 'co2_tons_per_mmbtu']]

# Drop the columns produced by a previous run of this cell before merging. Without this,
# re-running the cell yields co2_tons_per_mmbtu_x / _y columns and the calculation below fails.
generation_fuel_eia923 = (
    generation_fuel_eia923
    .drop(columns=['co2_tons_per_mmbtu', 'co2_mass_tons'], errors='ignore')
    .merge(emission_factors, how='left', on='energy_source_code')
)

# CO2 mass = fuel consumed (mmBtu) x emission factor. Energy sources without a factor
# keep NaN here because of the left merge.
generation_fuel_eia923['co2_mass_tons'] = generation_fuel_eia923['fuel_consumed_mmbtu'] * generation_fuel_eia923['co2_tons_per_mmbtu']
generation_fuel_eia923


In [21]:
# Re-display the frame after the emission-factor merge to inspect the added
# co2_tons_per_mmbtu and co2_mass_tons columns.
generation_fuel_eia923

Unnamed: 0,plant_id_eia,report_date,energy_source_code,fuel_type_code_pudl,fuel_type_code_aer,prime_mover_code,fuel_consumed_units,fuel_consumed_for_electricity_units,fuel_mmbtu_per_unit,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,net_generation_mwh,co2_tons_per_mmbtu,co2_mass_tons
0,3,2020-02-01,NG,gas,NG,CA,84692.0,84692.0,1.015,85962.0,85962.0,246804.000,0.05844,5023.61928
1,3,2020-03-01,NG,gas,NG,CA,154529.0,154529.0,1.015,156847.0,156847.0,257145.000,0.05844,9166.13868
2,3,2020-04-01,NG,gas,NG,CA,59835.0,59835.0,1.029,61570.0,61570.0,215636.000,0.05844,3598.15080
3,3,2020-05-01,NG,gas,NG,CA,48570.0,48570.0,1.015,49299.0,49299.0,197459.000,0.05844,2881.03356
4,3,2020-06-01,NG,gas,NG,CA,189421.0,189421.0,1.033,195672.0,195672.0,253075.000,0.05844,11435.07168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93417,64756,2020-08-01,NG,gas,NG,FC,7497.0,7497.0,1.036,7767.0,7767.0,1105.754,0.05844,453.90348
93418,64756,2020-09-01,NG,gas,NG,FC,6528.0,6528.0,1.036,6763.0,6763.0,962.939,0.05844,395.22972
93419,64756,2020-10-01,NG,gas,NG,FC,6780.0,6780.0,1.036,7024.0,7024.0,1000.037,0.05844,410.48256
93420,64756,2020-11-01,NG,gas,NG,FC,5416.0,5416.0,1.036,5611.0,5611.0,798.794,0.05844,327.90684


In [22]:
# Geothermal records still need emissions calculated for them.
# TODO: find a data source that reports geothermal emissions.
is_geothermal = generation_fuel_eia923['energy_source_code'].eq('GEO')
generation_fuel_eia923.loc[is_geothermal]

Unnamed: 0,plant_id_eia,report_date,energy_source_code,fuel_type_code_pudl,fuel_type_code_aer,prime_mover_code,fuel_consumed_units,fuel_consumed_for_electricity_units,fuel_mmbtu_per_unit,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,net_generation_mwh,co2_tons_per_mmbtu,co2_mass_tons
1482,286,2020-01-01,GEO,other,GEO,ST,0.0,0.0,0.0,2676835.0,2676835.0,305122.0,,
1483,286,2020-02-01,GEO,other,GEO,ST,0.0,0.0,0.0,2715761.0,2715761.0,309559.0,,
1484,286,2020-03-01,GEO,other,GEO,ST,0.0,0.0,0.0,3782093.0,3782093.0,431106.0,,
1485,286,2020-04-01,GEO,other,GEO,ST,0.0,0.0,0.0,3469450.0,3469450.0,395469.0,,
1486,286,2020-05-01,GEO,other,GEO,ST,0.0,0.0,0.0,3433296.0,3433296.0,391348.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92497,63365,2020-08-01,GEO,other,GEO,ST,0.0,0.0,0.0,49015.0,49015.0,5587.0,,
92498,63365,2020-09-01,GEO,other,GEO,ST,0.0,0.0,0.0,67763.0,67763.0,7724.0,,
92499,63365,2020-10-01,GEO,other,GEO,ST,0.0,0.0,0.0,93801.0,93801.0,10692.0,,
92500,63365,2020-11-01,GEO,other,GEO,ST,0.0,0.0,0.0,106039.0,106039.0,12087.0,,


In [20]:
# Energy source codes that did not receive an emission factor in the merge
lacks_emission_factor = generation_fuel_eia923['co2_tons_per_mmbtu'].isna()
generation_fuel_eia923.loc[lacks_emission_factor, 'energy_source_code'].unique()

array(['GEO', 'OTH', 'SGC', 'SC'], dtype=object)

In [13]:
# Ratio of total CEMS heat input to total EIA-923 fuel consumed (a fraction, not a percentage)
cems_total_heat_input = cems['heat_content_mmbtu'].sum()
eia923_total_fuel_consumed = generation_fuel_eia923['fuel_consumed_mmbtu'].sum()
cems_total_heat_input / eia923_total_fuel_consumed

0.8452616192463616

In [18]:
# Ratio of total CEMS CO2 mass to total EIA-923-derived CO2 mass (a fraction, not a percentage)
cems_total_co2 = cems['co2_mass_tons'].sum()
eia923_total_co2 = generation_fuel_eia923['co2_mass_tons'].sum()
cems_total_co2 / eia923_total_co2

0.8405848968979883

# Calculate CEMS net generation

In [21]:
# Load the full generation_eia923 table through the shared PUDL loader used elsewhere in
# this notebook. The previous pd.read_sql call relied on a `pudl_engine` object that is not
# created in any visible cell, so it failed on a fresh kernel.
# NOTE(review): confirm load_pudl_table returns the same column dtypes (e.g. report_date)
# that gross_to_net_ratios expects.
generators = load_data.load_pudl_table("SELECT * FROM generation_eia923")

In [23]:
# Columns worth summing across the units of a plant; others (e.g. dates) do not make
# sense to aggregate and are left out.
aggregate_cols = ['plant_id_eia', 'operating_datetime_utc','co2_mass_tons', 'heat_content_mmbtu', 'gross_generation_mwh', 'gross_load_mw']

# Gross-to-net ratios (parasitic loss factors) plus per-plant fallback values.
# NOTE(review): `plants_entity_eia` is not assigned in any visible cell of this notebook —
# confirm where it is loaded.
gtn_ratios, gtn_fill_values = gross_to_net_generation.gross_to_net_ratios(cems, generators, plants_entity_eia)

print(' aggregating data to plant level')
# keep only the aggregatable columns, then sum them per plant and hour
plant_hour_keys = ['plant_id_eia', 'operating_datetime_utc']
cems_gross = cems[aggregate_cols].groupby(plant_hour_keys).sum().reset_index()

print(' adding report dates')
# NOTE(review): add_report_date is called with a single argument earlier in this notebook —
# confirm the two-argument form is still supported.
cems_gross = data_cleaning.add_report_date(cems_gross, plants_entity_eia)

print(' calculating net generation')
# attach the gross-to-net ratio matching each plant and report_date
ratio_lookup = gtn_ratios[['plant_id_eia', 'report_date', 'gtn_ratio']]
cems_gross = cems_gross.merge(ratio_lookup, how='left', on=['plant_id_eia', 'report_date'])

# Fallback order for missing ratios: first the plant's fill value (average of its
# non-missing ratios), then 1 for anything still missing.
cems_gross = cems_gross.merge(gtn_fill_values, how='left', on=['plant_id_eia'])
cems_gross['gtn_ratio'] = cems_gross['gtn_ratio'].fillna(cems_gross['gtn_fill']).fillna(1)
cems_gross = cems_gross.drop(columns=['gtn_fill'])

# net generation = gross generation x gross-to-net ratio
cems_gross['net_generation_mwh'] = cems_gross['gross_generation_mwh'] * cems_gross['gtn_ratio']

# need to figure out if aggregating at the plant level is ok


  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return self.resid / sigma / np.sqrt(1 - hii)
  return 1 - self.ssr/self.centered_tss
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return self.resid / sigma / np.sqrt(1 - hii)
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return self.resid / sigma / np.sqrt(1 - hii)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg.det(a, signature=signature)
  return np.dot(wresid, wresid) / self.df_resid
  r = _umath_linalg

 aggregating data to plant level
 adding report dates
 calculating net generation


In [24]:
# For what fraction of rows are we using the default of 1 as gross-to-net ratio?
# The mean of the boolean mask is that fraction; it is computed vectorized instead of
# iterating the Series with the builtin sum().
# Note: this counts every row whose ratio equals exactly 1, whether or not it came from the default.
(cems_gross["gtn_ratio"] == 1).mean()

0.22809763579979325

In [25]:
# Merge net generation back into the main cems dataframe (now one row per plant and hour).

# These columns should be the same across units within a plant, so take the first row of each
# plant/hour group.
# TODO: we don't use these columns currently. if that continues to be true, we could drop them.
non_aggregated_cols = ['plant_id_eia', 'plant_id_epa', 'operating_time_hours', 'co2_mass_measurement_code','facility_id','operating_datetime_utc']
plant_cems = cems.groupby(['plant_id_eia', 'operating_datetime_utc']).head(1)

# Carry over only the columns that the loaded CEMS table actually contains. The CEMS frame
# displayed earlier in this notebook has no facility_id column, and selecting a missing
# column would raise a KeyError.
available_cols = [col for col in non_aggregated_cols if col in plant_cems.columns]

# NOTE: this overwrites the unit-level `cems` frame with the plant-level one, so the cell
# is not safe to re-run without re-running the CEMS loading cell first.
cems = cems_gross.merge(plant_cems[available_cols], how='left', on=[
                                    'plant_id_eia', 'operating_datetime_utc'])

# NOTE: the BA data has not yet been merged in
#cems.groupby("balancing_authority_code_eia").sum().head()

# Determine coverage of CEMS
- Groupby.sum() CEMS data to plant, generator, and month - focusing on net generation and heat input
- Compare to EIA-923 data to determine in which months a generator was active (according to EIA-923), but is missing data from CEMS
- Aggregate up to BA level to determine what percent of generation (mwh) is missing from CEMS <- using EIA-930??

# Calculate Residual Net Generation Profiles


# Aggregate data to Balancing Authorities
The output should be a pandas dataframe with a datetimeindex for each hour of the year (in UTC) and each column is a different BA code. The data will represent the average emission rate.

To compare to EIA-930 and annual eGRID, we will also want to compare net generation and emissions.

In [28]:
# Look up the balancing authority (BA) code of each plant.
# NOTE(review): `plants_entity_eia` is not assigned in any visible cell of this notebook —
# confirm where it is loaded.
ba_lookup_cols = ['plant_id_eia', 'balancing_authority_code_eia']
plant_ba = plants_entity_eia[ba_lookup_cols]

# left join, so CEMS rows for plants without a BA entry are kept
cems = cems.merge(plant_ba, on='plant_id_eia', how='left')
cems.head()

Unnamed: 0,plant_id_eia,operating_datetime_utc,co2_mass_tons,heat_content_mmbtu,gross_generation_mwh,gross_load_mw,report_date,gtn_ratio,net_generation_mwh,plant_id_epa,operating_time_hours,co2_mass_measurement_code,facility_id,balancing_authority_code_eia
0,3.0,2019-01-01 06:00:00+00:00,143.402939,7129.899902,645.0,666.0,2019-01,0.961957,620.462497,3,1.0,Measured,1,SOCO
1,3.0,2019-01-01 07:00:00+00:00,101.808441,7039.100098,638.0,638.0,2019-01,0.961957,613.728795,3,1.0,Measured,1,SOCO
2,3.0,2019-01-01 08:00:00+00:00,84.818199,6966.400391,626.0,626.0,2019-01,0.961957,602.185307,3,1.0,Measured,1,SOCO
3,3.0,2019-01-01 09:00:00+00:00,86.048409,6969.300293,625.0,625.0,2019-01,0.961957,601.22335,3,1.0,Measured,1,SOCO
4,3.0,2019-01-01 10:00:00+00:00,88.143097,6829.799805,625.0,625.0,2019-01,0.961957,601.22335,3,1.0,Measured,1,SOCO


In [30]:
# Output CEMS data after data processing steps above.
# Note: this is a big file. If needed could switch to better file format, eg hdf

# The output folder is otherwise only created by the final export cell; create it here too
# so this write does not fail when the folder does not exist yet.
os.makedirs('../data/output', exist_ok=True)

cems.to_csv(f"../data/output/cems{year}.csv")

In [29]:
# Total CO2 and net generation per balancing authority and hour.
# The two value columns are selected BEFORE summing, so the text/date columns in `cems` are
# never summed (slow string concatenation, or an error on newer pandas).
# Note: groupby drops rows whose balancing_authority_code_eia is missing.
ba_hour_keys = ['balancing_authority_code_eia', 'operating_datetime_utc']
hourly_emissions = cems.groupby(ba_hour_keys)[['co2_mass_tons', 'net_generation_mwh']].sum()

# divide total emissions by total generation to get the emission factor
# (hours with zero net generation produce inf or NaN here)
hourly_emissions['ef_tons_co2_per_mwh'] = hourly_emissions['co2_mass_tons'] / hourly_emissions['net_generation_mwh']

# pivot the data: one row per hour, one column per BA
hourly_emission_rate = hourly_emissions.reset_index().pivot(index='operating_datetime_utc', columns='balancing_authority_code_eia', values='ef_tons_co2_per_mwh')
hourly_emission_rate

balancing_authority_code_eia,AEC,AECI,AVA,AVRN,AZPS,BANC,BPAT,CISO,CPLE,CSTO,...,SPA,SRP,SWPP,TAL,TEC,TEPC,TIDC,TVA,WACM,WALC
operating_datetime_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 05:00:00+00:00,,,,,,,,,0.660468,,...,,,,0.471681,0.534136,,,1.188027,,
2019-01-01 06:00:00+00:00,0.454045,0.859463,,,,,,,0.664060,,...,1.037423,,0.960496,0.477731,0.550416,,,0.985028,,
2019-01-01 07:00:00+00:00,0.449835,0.862902,,,0.800466,,,,0.659645,,...,1.024447,0.190692,0.937146,0.474420,0.563341,0.157935,,0.986587,1.083537,0.539106
2019-01-01 08:00:00+00:00,0.469767,0.865417,0.412470,0.435513,0.834473,0.574896,0.759043,0.329894,0.659514,0.434392,...,1.032945,0.162165,0.941366,0.480163,0.573574,0.163685,0.077800,1.016743,1.092086,0.323829
2019-01-01 09:00:00+00:00,0.481392,0.864889,0.413730,0.435513,0.824236,0.615730,0.765567,0.334543,0.663090,0.432632,...,1.036996,0.161858,0.937645,0.484387,0.578491,0.162700,0.080062,1.011904,1.098681,0.333634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-01 03:00:00+00:00,0.365494,0.777508,0.415681,0.440340,0.787157,0.494792,0.763997,0.486227,0.540658,0.435041,...,1.052937,0.374331,0.763282,0.444443,0.599982,0.597433,0.506816,0.529078,1.091017,0.676953
2020-01-01 04:00:00+00:00,0.365532,0.771026,0.416557,0.468876,0.794313,0.493544,0.773044,0.486494,0.537783,0.435530,...,1.053116,0.383485,0.763401,0.447009,0.614524,0.599765,0.507801,0.531106,1.096773,0.680819
2020-01-01 05:00:00+00:00,0.353210,0.771268,0.416140,0.444278,0.785550,0.494958,0.775959,0.489287,,0.434276,...,1.049623,0.421194,0.760760,,,0.601815,0.513050,0.524029,1.102090,0.697604
2020-01-01 06:00:00+00:00,,,0.416183,0.444410,0.748214,0.493020,0.832311,0.474847,,0.434997,...,,0.459677,0.567430,,,0.596953,0.502965,,1.103613,0.698169


In [30]:
# Wide tables (one row per hour, one column per BA) of net generation and CO2 emissions
hourly_totals_long = hourly_emissions.reset_index()
pivot_axes = {'index': 'operating_datetime_utc', 'columns': 'balancing_authority_code_eia'}

hourly_net_generation = hourly_totals_long.pivot(values='net_generation_mwh', **pivot_axes)
hourly_net_emissions = hourly_totals_long.pivot(values='co2_mass_tons', **pivot_axes)

# 4. Output data 

Save the data to CSV files so that generating the hourly eGRID numbers can be separated from analyzing them.

In [31]:
# Make sure the output folder exists. exist_ok=True replaces the separate existence check,
# so there is no gap between checking and creating.
os.makedirs('../data/output', exist_ok=True)

# Write the three hourly BA-level tables; the analysis year is part of each file name.
hourly_emission_rate.to_csv(f"../data/output/hourly_emission_rate{year}.csv")
hourly_net_generation.to_csv(f"../data/output/hourly_net_generation{year}.csv")
hourly_net_emissions.to_csv(f"../data/output/hourly_net_emission{year}.csv")