In [2]:
# import packages
from turtle import shape
import numpy as np
import pandas as pd
import argparse
import os
from collections import namedtuple

%reload_ext autoreload
%autoreload 2

# Tell Python where to look for the project's local modules.
import sys

sys.path.append("../../hourly-egrid/")

# import local modules
import src.download_data as download_data
import src.load_data as load_data
import src.data_cleaning as data_cleaning
import src.gross_to_net_generation as gross_to_net_generation
import src.impute_hourly_profiles as impute_hourly_profiles
import src.column_checks as column_checks
import src.eia930 as eia930
import src.validation as validation
import src.output_data as output_data
import src.consumed as consumed


In [6]:
# Stand-in for the command line arguments of the pipeline script:
#   small     - passed to the cleaning steps to run on a reduced sample
#   gtn_years - number of years handed to the gross-to-net step
#   year      - data year to process
Args = namedtuple("Args", ["small", "gtn_years", "year"])
args = Args(small=True, gtn_years=5, year=2020)

In [7]:
year = args.year

# 0. Set up directory structure
# Outputs of a "small" run are written to their own subfolder so they are kept
# apart from full-run outputs.
path_prefix = "small/" if args.small else ""
for directory in (
    "../data/downloads",
    f"../data/outputs/{path_prefix}",
    f"../data/results/{path_prefix}",
    f"../data/results/{path_prefix}plant_data",
    f"../data/results/{path_prefix}carbon_accounting",
    f"../data/results/{path_prefix}power_sector_data",
    f"../data/results/{path_prefix}validation_metrics",
):
    # exist_ok=True keeps this cell safe to re-run
    os.makedirs(directory, exist_ok=True)

# 1. Download data
# PUDL
pudl_zenodo_url = (
    "https://zenodo.org/record/6349861/files/pudl-v0.6.0-2022-03-12.tgz"
)
download_data.download_pudl_data(zenodo_url=pudl_zenodo_url)

# eGRID
# the 2019 and 2020 data appear to be hosted on different urls
egrid_2019_url = "https://www.epa.gov/sites/default/files/2021-02/egrid2019_data.xlsx"
egrid_2020_url = "https://www.epa.gov/system/files/documents/2022-01/egrid2020_data.xlsx"
egrid_files_to_download = [egrid_2019_url, egrid_2020_url]
download_data.download_egrid_files(egrid_files_to_download)

# EIA-930 (only the year being processed) and the chalendar files
download_data.download_eia930_data(years_to_download=[year])
download_data.download_chalendar_files()

# Power Sector Data Crosswalk
# NOTE: Check for new releases at https://github.com/USEPA/camd-eia-crosswalk
psdc_url = "https://github.com/USEPA/camd-eia-crosswalk/releases/download/v0.2.1/epa_eia_crosswalk.csv"
download_data.download_epa_psdc(psdc_url=psdc_url)

# 2. Identify subplants and gross-to-net ratios
# Computing the GTN ratios is computationally intensive, so the step is skipped
# whenever the saved output folder is already present.
# NOTE(review): the check looks at the folder only; it does not depend on `year`
# or on the small-run prefix - confirm that reusing it across runs is intended.
gtn_output_dir = "../data/outputs/gross_to_net/"
gtn_already_generated = os.path.isdir(gtn_output_dir)
if not gtn_already_generated:
    print("Generating subplant IDs and gross to net calcuations")
    number_of_years = args.gtn_years
    gross_to_net_generation.identify_subplants_and_gtn_conversions(
        year, number_of_years
    )

# 3. Clean EIA-923 Generation and Fuel Data at the Monthly Level
print("Cleaning EIA-923 data")
eia923_allocated, primary_fuel_table = data_cleaning.clean_eia923(year, args.small)

# Add primary fuel data to each generator.
# validate="m:1" makes the merge raise if primary_fuel_table holds more than
# one row for a (plant, generator) pair, so rows cannot be silently duplicated.
generator_keys = ["plant_id_eia", "generator_id"]
eia923_allocated = eia923_allocated.merge(
    primary_fuel_table, on=generator_keys, how="left", validate="m:1"
)

# 4. Clean Hourly Data from CEMS
print("Cleaning CEMS data")
cems = data_cleaning.clean_cems(year, args.small)

# 5. Convert CEMS Hourly Gross Generation to Hourly Net Generation
print("Converting CEMS gross generation to net generation")
cems = data_cleaning.convert_gross_to_net_generation(cems)

# 6. Crosswalk CEMS and EIA data
print("Identifying source for hourly data")
eia923_allocated = data_cleaning.identify_hourly_data_source(
    eia923_allocated, cems, year
)

# 7. Calculate hourly data for partial_cems plants
# The call returns the scaled partial-CEMS frame and an updated eia923_allocated.
print("Scaling partial CEMS data")
partial_cems_scaled, eia923_allocated = impute_hourly_profiles.scale_partial_cems_data(
    cems, eia923_allocated
)

# Export the data cleaned so far for later validation, visualization, analysis
print("Exporting intermediate output files")
output_data.output_intermediate_data(cems, "cems", path_prefix, year)

# plant_primary_fuel is left out of the exported file only; the in-memory
# eia923_allocated keeps the column because step 8 still reads it.
eia923_allocated_for_export = eia923_allocated.drop(columns="plant_primary_fuel")
output_data.output_intermediate_data(
    eia923_allocated_for_export, "eia923_allocated", path_prefix, year
)
del eia923_allocated_for_export  # temporary copy, not needed after the export

output_data.output_intermediate_data(
    partial_cems_scaled, "partial_cems_scaled", path_prefix, year
)

# 8. Assign static characteristics to CEMS and EIA data to aid in aggregation

# Plant-level lookup (one row per plant_id_eia) used to attach a primary fuel
# to the two CEMS-based frames below.
plant_primary_fuel = primary_fuel_table.drop_duplicates(subset="plant_id_eia")[
    ["plant_id_eia", "plant_primary_fuel"]
]

# --- EIA-923 ---
# assign a BA code and state code to each plant
eia923_allocated = data_cleaning.assign_ba_code_to_plant(eia923_allocated, year)
# assign a fuel category to each plant based on what is most likely to match with the category used in EIA-930
# TODO: Add two different fuel categories (one for 930, one that is more specific)
eia923_allocated = data_cleaning.assign_fuel_category_to_ESC(
    df=eia923_allocated, esc_column="plant_primary_fuel"
)
# add a flag about whether the plant is distribution connected
eia923_allocated = data_cleaning.identify_distribution_connected_plants(
    eia923_allocated, year, voltage_threshold_kv=60
)

# --- CEMS: repeat the BA and distribution-flag assignment ---
cems = data_cleaning.assign_ba_code_to_plant(cems, year)
cems = data_cleaning.identify_distribution_connected_plants(
    cems, year, voltage_threshold_kv=60
)
# add a plant primary fuel and a fuel category for eia930
cems = cems.merge(plant_primary_fuel, on="plant_id_eia", how="left")
cems = data_cleaning.assign_fuel_category_to_ESC(
    df=cems, esc_column="plant_primary_fuel"
)

# --- Scaled partial CEMS: BA code and fuel information only ---
partial_cems_scaled = data_cleaning.assign_ba_code_to_plant(
    partial_cems_scaled, year
)
# add a plant primary fuel and a fuel category for eia930
partial_cems_scaled = partial_cems_scaled.merge(
    plant_primary_fuel, on="plant_id_eia", how="left"
)
partial_cems_scaled = data_cleaning.assign_fuel_category_to_ESC(
    df=partial_cems_scaled, esc_column="plant_primary_fuel"
)

# export plant frame
# Columns that describe a plant as a whole (they do not vary by hour or month).
plant_static_columns = [
    "plant_id_eia",
    "plant_primary_fuel",
    "fuel_category",
    "fuel_category_eia930",
    "ba_code",
    "ba_code_physical",
    "state",
    "distribution_flag",
]

# timezone of each plant, taken from the PUDL EIA-860 plants table
pudl_out = load_data.initialize_pudl_out(year=year)
plant_tz = pudl_out.plants_eia860()[["plant_id_eia", "timezone"]]

# keep one row per plant, then add tz info before exporting
plant_frame = (
    eia923_allocated[plant_static_columns]
    .drop_duplicates(subset="plant_id_eia")
    .merge(plant_tz, how="left", on=["plant_id_eia"])
)

# the same table is written to the intermediate outputs and to the results
plant_frame_name = "plant_static_attributes"
output_data.output_intermediate_data(plant_frame, plant_frame_name, path_prefix, year)
output_data.output_to_results(plant_frame, plant_frame_name, "plant_data/", path_prefix)

# 9. Clean and Reconcile EIA-930 data
print("Cleaning EIA-930 data")
# TODO
# Load raw EIA-930 data, fix timestamp issues, perform physics-based reconciliation
# Currently implemented in `notebooks/930_lag` and the `gridemissions` repository
# Output: `data/outputs/EBA_adjusted_elec.csv`
# For now, load data in form it will eventually be in
eia930_file = "../data/downloads/eia930/chalendar/EBA_adjusted_elec.csv"
eia930_dat = eia930.load_chalendar_for_pipeline(eia930_file, year=year)

# 10. Calculate Residual Net Generation Profile
print("Calculating residual net generation profiles from EIA-930")
residual_profiles = impute_hourly_profiles.calculate_residual(
    cems, eia930_dat, plant_frame, year
)
# keep a copy of the residual profiles with the other intermediate outputs
output_data.output_intermediate_data(
    residual_profiles, "residual_profiles", path_prefix, year
)

# 11. Assign hourly profile to monthly data
print("Assigning hourly profile to monthly EIA-923 data")
# create a separate dataframe containing only the generators for which we do not have CEMS data
# (hourly_data_source == "eia") and that report a fuel consumption value
uses_eia_data = eia923_allocated["hourly_data_source"] == "eia"
has_fuel_consumption = eia923_allocated["fuel_consumed_mmbtu"].notna()
monthly_eia_data_to_shape = eia923_allocated[uses_eia_data & has_fuel_consumption]

# load profile data and format for use in the pipeline
# TODO: once this is in the pipeline (step 10), may not need to read file
# The imputed profiles are converted to percentages before being applied.
hourly_profiles = impute_hourly_profiles.convert_profile_to_percent(
    impute_hourly_profiles.impute_missing_hourly_profiles(
        monthly_eia_data_to_shape, residual_profiles, year
    )
)

# TODO: shaped_eia_data is HUGE, consider moving to dask.dataframe
shaped_eia_data = impute_hourly_profiles.shape_monthly_eia_data_as_hourly(
    monthly_eia_data_to_shape, hourly_profiles
)

# Export data
output_data.output_intermediate_data(
    shaped_eia_data, "shaped_eia923_data", path_prefix, year
)

# 12. Export plant files
cems = data_cleaning.filter_unique_cems_data(cems, partial_cems_scaled)

# output data quality metrics
# Each metric is computed from the same three frames and written to the
# validation_metrics results folder under the given file name.
data_quality_metrics = [
    (validation.co2_source_metric, "co2_measurement_source"),
    (validation.net_generation_method_metric, "net_generation_method"),
    (validation.hourly_profile_source_metric, "hourly_profile_method"),
]
for compute_metric, metric_file_name in data_quality_metrics:
    output_data.output_to_results(
        compute_metric(cems, partial_cems_scaled, shaped_eia_data),
        metric_file_name,
        "validation_metrics/",
        path_prefix,
    )

PUDL data already downloaded
egrid2019_data.xlsx already downloaded
egrid2020_data.xlsx already downloaded
BALANCE_2020_Jan_Jun data already downloaded
BALANCE_2020_Jul_Dec data already downloaded
INTERCHANGE_2020_Jan_Jun data already downloaded
INTERCHANGE_2020_Jul_Dec data already downloaded
EBA_elec.csv already downloaded
EBA_raw.csv already downloaded
epa_eia_crosswalk.csv already downloaded
Cleaning EIA-923 data




Randomly selecting 5% of plants for faster test run.
Removing 0 plants that are not grid-connected
Removing 0 plants located in the following states: ['PR']
Cleaning CEMS data
Randomly selecting 5% of plants for faster test run.
Removing 3 plants that are not grid-connected
Removing 0 plants located in the following states: ['PR']
Removing 3 units that only produce steam and do not report to EIA
removing 315978 observations from cems for unit-months where no data reported
Converting CEMS gross generation to net generation
Identifying source for hourly data
Scaling partial CEMS data
Exporting intermediate output files
Cleaning EIA-930 data


  data.loc[data.ba_code == ba, "datetime_local"] = data.loc[


Calculating residual net generation profiles from EIA-930
Assigning hourly profile to monthly EIA-923 data
Summary of methods used to estimate missing hourly profiles:
                    ba_code                                                       
profile_method DIBA_average assumed_flat national_average residual scaled_residual
fuel_category                                                                     
biomass                   0           12                0        1               0
coal                      0            0                0       41               0
geothermal                0            2                0        1               0
hydro                     0            0                0       47               0
natural_gas               0            0                0       52               1
nuclear                   0            0                0       18               0
other                     0            1                0       38               0
pe

In [None]:
# Combine the CEMS, scaled partial-CEMS and shaped EIA data into a single
# plant-level dataset and export it.
# NOTE: a previous run was flagged as breaking at or below this cell; the root
# cause has not been confirmed.
combined_plant_data = data_cleaning.combine_subplant_data(
    cems, partial_cems_scaled, shaped_eia_data
)
# NOTE: the inputs are deleted here, so this cell cannot be re-run on its own;
# re-run the pipeline cell above first to recreate them.
del shaped_eia_data, cems, partial_cems_scaled  # free memory back to python

# Step 0 only creates `plant_data/`; create the folder for the hourly files
# explicitly instead of relying on the writer to do it.
hourly_plant_data_dir = f"../data/results/{path_prefix}plant_data/hourly_plant_data"
os.makedirs(hourly_plant_data_dir, exist_ok=True)

# export to a csv. Dask breaks into per-chunk files of <x>.csv
combined_plant_data.to_csv(f"{hourly_plant_data_dir}/*.csv", index=False)

In [11]:
# 13. Aggregate the combined plant data to BA-fuel and write power sector results
ba_fuel_data = data_cleaning.aggregate_plant_data_to_ba_fuel(
    combined_plant_data, plant_frame
)
# Output intermediate data: produced per-fuel annual averages
output_data.write_generated_averages(ba_fuel_data, path_prefix, year)
# Output final data: per-ba hourly generation and rate
output_data.write_power_sector_results(ba_fuel_data, path_prefix)

# 14. Calculate consumption-based emissions and write carbon accounting results
# Uses the same adjusted EIA-930 file that is loaded in step 9.
eia930_file = "../data/downloads/eia930/chalendar/EBA_adjusted_elec.csv"
hourly_consumed_calc = consumed.HourlyBaDataEmissionsCalc(
    eia930_file, small=args.small
)
hourly_consumed_calc.process()
hourly_consumed_calc.output_data(path_prefix=path_prefix)

Running adjusted, CO2
2022-06-10 09:58:03,546 - clean - INFO - Running BaDataEmissionsCalc for 24 rows
2022-06-10 09:58:03,592 - clean - INFO - Calculating consumption emissions...
2022-06-10 09:58:03,673 - clean - INFO - Elapsed time was 0.0811226 seconds
dropping cols
Running for_electricity, CO2
2022-06-10 09:58:03,813 - clean - INFO - Running BaDataEmissionsCalc for 24 rows
2022-06-10 09:58:03,861 - clean - INFO - Calculating consumption emissions...
2022-06-10 09:58:03,939 - clean - INFO - Elapsed time was 0.07776 seconds
dropping cols
Running adjusted, CH4
2022-06-10 09:58:04,072 - clean - INFO - Running BaDataEmissionsCalc for 24 rows
2022-06-10 09:58:04,113 - clean - INFO - Calculating consumption emissions...
2022-06-10 09:58:04,221 - clean - INFO - Elapsed time was 0.107209 seconds
dropping cols
Running for_electricity, CH4
2022-06-10 09:58:04,353 - clean - INFO - Running BaDataEmissionsCalc for 24 rows
2022-06-10 09:58:04,398 - clean - INFO - Calculating consumption emission