In [1]:
# import packages
import pandas as pd
import numpy as np

%reload_ext autoreload
%autoreload 2

# Tell Python where to look for the project modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import data_cleaning
import emissions
import output_data
from column_checks import get_dtypes
from filepaths import *

# Data year to process; passed to the data_cleaning calls in the cells that follow.
year = 2020
# Year-based path prefix (evaluates to "2020/" here). Not referenced in the visible
# cells — presumably used when reading/writing per-year files; TODO confirm.
path_prefix = f"{year}/"

In [3]:
# --- Step 3: clean the monthly EIA-923 generation and fuel data -------------------
print("3. Cleaning EIA-923 data")
# NOTE(review): the second positional argument (False) is a flag defined by
# data_cleaning.clean_eia923; its meaning is not visible from this notebook.
eia923_allocated, primary_fuel_table = data_cleaning.clean_eia923(year, False)
# Attach the matching primary_fuel_table row to every generator record.
# validate="m:1" makes pandas raise if primary_fuel_table has duplicate
# (plant_id_eia, generator_id) keys.
eia923_allocated = pd.merge(
    eia923_allocated,
    primary_fuel_table,
    on=["plant_id_eia", "generator_id"],
    how="left",
    validate="m:1",
)

# --- Step 4: clean the hourly CEMS data -------------------------------------------
print("4. Cleaning CEMS data")
cems = data_cleaning.clean_cems(year, False)

# Biomass adjustment is applied now, while the CEMS data is still at the unit level.
cems = emissions.adjust_emissions_for_biomass(cems)

# --- Step 5: static plant attributes used to aid aggregation ----------------------
print("5. Loading plant static attributes")
plant_attributes = data_cleaning.create_plant_attributes_table(
    cems,
    eia923_allocated,
    year,
    primary_fuel_table,
)


3. Cleaning EIA-923 data




    Checking that there are no missing energy source codes associated with non-zero fuel consumption...  OK
    Checking that fuel and emissions values are positive...  OK
 
      prime_mover_code energy_source_code boiler_bottom_type boiler_firing_type
20403               FC                LFG                NaN                NaN
13248               FC                 NG                NaN                NaN
19938               FC                OBG                NaN                NaN
 
 
      prime_mover_code energy_source_code boiler_bottom_type boiler_firing_type
20403               FC                LFG                NaN                NaN
13248               FC                 NG                NaN                NaN
19938               FC                OBG                NaN                NaN
 
 
      prime_mover_code energy_source_code boiler_firing_type
20403               FC                LFG                NaN
13248               FC                 NG               

In [6]:
# --- Step 6: crosswalk the CEMS and EIA data --------------------------------------
print("6. Identifying source for hourly data")
# A later cell filters eia923_allocated on its `hourly_data_source` column
# (presumably added by this call — confirm in data_cleaning).
eia923_allocated = data_cleaning.identify_hourly_data_source(eia923_allocated, cems, year)

# --- Step 7: aggregate the CEMS data from unit level to subplant level ------------
print("7. Aggregating CEMS data from unit to subplant")
# NOTE: `cems` is rebound to the aggregated result, so re-running this cell without
# re-running the CEMS cleaning cell feeds already-aggregated data back in.
cems = data_cleaning.aggregate_cems_to_subplant(cems)

6. Identifying source for hourly data
7. Aggregating CEMS data from unit to subplant


In [24]:
import impute_hourly_profiles

In [8]:
from impute_hourly_profiles import DATA_COLUMNS

In [47]:
# Shape the monthly EIA-923 totals of "partial CEMS" plants into hourly values.
#
# Inputs (defined in earlier cells): eia923_allocated, cems, DATA_COLUMNS.
# Output: shaped_partial_plants — one row per subplant and hour, with every column in
# DATA_COLUMNS scaled by that plant's hourly profile. Always defined after this cell
# runs (an empty frame when there are no partial CEMS plants).
SUBPLANT_KEYS = ["report_date", "plant_id_eia", "subplant_id"]

# identify all of the partial cems plants
# (filter first, then copy, so only the selected rows are duplicated)
eia_data_to_shape = eia923_allocated.loc[
    eia923_allocated.hourly_data_source == "partial_cems_plant"
].copy()

# if there is no data in the partial cems dataframe, skip the shaping.
if len(eia_data_to_shape) > 0:

    # group the eia data by subplant-month.
    # Select the data columns BEFORE summing: summing every column and selecting
    # afterwards does needless work and can fail on non-summable columns.
    eia_data_to_shape = (
        eia_data_to_shape.groupby(SUBPLANT_KEYS, dropna=False)[DATA_COLUMNS]
        .sum()
        .reset_index()
    )

    # get a list of plant ids for plants with partial plant data in CEMS
    partial_cems_plant_ids = list(eia_data_to_shape.plant_id_eia.unique())

    # get the hourly cems data for the partial plants and aggregate by plant-hour
    # to use for shaping
    partial_cems_profiles = (
        cems[cems["plant_id_eia"].isin(partial_cems_plant_ids)]
        .groupby(["plant_id_eia", "report_date", "datetime_utc"], dropna=False)[
            ["gross_generation_mwh", "fuel_consumed_mmbtu"]
        ]
        .sum()
        .reset_index()
    )
    # add a column for flat profiles (every hour weighted equally)
    partial_cems_profiles["flat_profile"] = 1
    # convert the profiles to a percent
    partial_cems_profiles = impute_hourly_profiles.convert_profile_to_percent(
        partial_cems_profiles,
        group_keys=["plant_id_eia"],
        columns_to_convert=[
            "gross_generation_mwh",
            "fuel_consumed_mmbtu",
            "flat_profile",
        ],
    )
    partial_cems_profiles = partial_cems_profiles.rename(
        columns={
            "gross_generation_mwh": "generation_profile",
            "fuel_consumed_mmbtu": "fuel_profile",
        }
    )

    # prepare the profiles
    # generation or fuel profiles will be missing for a month if there was zero
    # generation or fuel reported for that month.
    # if we are missing a profile, try filling it with the other profile
    # (e.g. fill generation with fuel profile)
    partial_cems_profiles["generation_profile"] = partial_cems_profiles[
        "generation_profile"
    ].fillna(partial_cems_profiles["fuel_profile"])
    partial_cems_profiles["fuel_profile"] = partial_cems_profiles[
        "fuel_profile"
    ].fillna(partial_cems_profiles["generation_profile"])
    # if the profile is still missing, fill it using a flat profile
    partial_cems_profiles["generation_profile"] = partial_cems_profiles[
        "generation_profile"
    ].fillna(partial_cems_profiles["flat_profile"])
    partial_cems_profiles["fuel_profile"] = partial_cems_profiles[
        "fuel_profile"
    ].fillna(partial_cems_profiles["flat_profile"])

    # merge the profiles into the monthly data.
    # Both sides repeat the (plant_id_eia, report_date) key — several subplants on
    # the left, many hours on the right — so this is intentionally many-to-many.
    shaped_partial_plants = eia_data_to_shape.merge(
        partial_cems_profiles,
        how="left",
        on=["plant_id_eia", "report_date"],
        validate="m:m",
    )

    # where monthly net generation is negative, replace the generation profile
    # with a flat profile
    negative_generation = shaped_partial_plants["net_generation_mwh"] < 0
    shaped_partial_plants.loc[
        negative_generation, "generation_profile"
    ] = shaped_partial_plants.loc[negative_generation, "flat_profile"]

    # check that no profiles contain NA values
    missing_profiles = shaped_partial_plants[
        shaped_partial_plants["generation_profile"].isna()
        | shaped_partial_plants["fuel_profile"].isna()
    ]
    if len(missing_profiles) > 0:
        print("WARNING: Certain partial CEMS plants are missing hourly profile data. This will result in inaccurate results")
    # check that all profiles add to 1 for each subplant-month.
    # dropna=False matches the aggregation above, so rows whose key contains NaN
    # (e.g. a missing subplant_id) are validated instead of silently skipped.
    incorrect_profiles = (
        shaped_partial_plants.groupby(SUBPLANT_KEYS, dropna=False)[
            ["generation_profile", "fuel_profile"]
        ]
        .sum()
        .reset_index()
    )
    incorrect_profiles = incorrect_profiles[
        (~np.isclose(incorrect_profiles["generation_profile"], 1))
        | (~np.isclose(incorrect_profiles["fuel_profile"], 1))
    ]
    if len(incorrect_profiles) > 0:
        print("WARNING: Certain partial CEMS profiles do not add to 100%. This will result in inaccurate results")

    # shape the profiles
    for col in DATA_COLUMNS:
        # use the generation profile to shape net generation data, otherwise use
        # the fuel profile
        if col == "net_generation_mwh":
            profile_to_use = "generation_profile"
        else:
            profile_to_use = "fuel_profile"

        shaped_partial_plants[col] = (
            shaped_partial_plants[col] * shaped_partial_plants[profile_to_use]
        )

    # remove the intermediate columns
    shaped_partial_plants = shaped_partial_plants.drop(
        columns=["generation_profile", "fuel_profile", "flat_profile"]
    )
else:
    # No partial CEMS plants: still define the result, so later cells neither raise
    # a NameError nor silently reuse a stale value left over from an earlier run.
    # The columns mirror the populated branch (keys, data columns, datetime_utc),
    # assuming convert_profile_to_percent adds no columns of its own — TODO confirm.
    # NOTE(review): datetime_utc is created as naive datetime64 here; adjust if the
    # CEMS timestamps are timezone-aware.
    shaped_partial_plants = eia_data_to_shape[
        SUBPLANT_KEYS + list(DATA_COLUMNS)
    ].assign(datetime_utc=pd.NaT)

