This notebook is set up to test code as needed.

In [None]:
# import packages
import pandas as pd
import os

%reload_ext autoreload
%autoreload 2

# Tell Python where to look for the project modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import download_data
import load_data
from column_checks import get_dtypes
from filepaths import *
import impute_hourly_profiles
import data_cleaning
import output_data
import emissions
import validation
import gross_to_net_generation
import eia930

# Data year under test; the pipeline calls and output paths below are keyed off it.
year = 2020
# Year-named prefix (with trailing slash) that is handed to the output helpers.
path_prefix = str(year) + "/"

In [None]:
# 3. Clean EIA-923 Generation and Fuel Data at the Monthly Level
####################################################################################
print("3. Cleaning EIA-923 data")
eia923_allocated, primary_fuel_table = data_cleaning.clean_eia923(year, False)

# Attach the primary fuel columns to every generator row. `validate="m:1"` makes
# pandas raise if `primary_fuel_table` repeats a (plant, generator) key.
generator_keys = ["plant_id_eia", "generator_id"]
eia923_allocated = eia923_allocated.merge(
    primary_fuel_table, on=generator_keys, how="left", validate="m:1"
)

# 4. Clean Hourly Data from CEMS
####################################################################################
print("4. Cleaning CEMS data")
cems = data_cleaning.clean_cems(year, False, primary_fuel_table)

# Summarize measured vs. imputed CEMS values, then write that summary out as a
# data quality metric.
cems_measurement_quality = validation.summarize_cems_measurement_quality(cems)
output_data.output_data_quality_metrics(
    cems_measurement_quality, "cems_pollutant_measurement_quality", path_prefix, True
)

# The biomass adjustment is applied now, while the CEMS data is still at the unit level.
cems = emissions.adjust_emissions_for_biomass(cems)

# 5. Assign static characteristics to CEMS and EIA data to aid in aggregation
####################################################################################
print("5. Loading plant static attributes")
plant_attributes = data_cleaning.create_plant_attributes_table(
    cems, eia923_allocated, year, primary_fuel_table
)

# 6. Crosswalk CEMS and EIA data
####################################################################################
print("6. Identifying source for hourly data")
eia923_allocated = data_cleaning.identify_hourly_data_source(eia923_allocated, cems, year)

# Write a data quality metric describing the annually-reported EIA-923 data.
annually_reported_summary = validation.summarize_annually_reported_eia_data(
    eia923_allocated, year
)
output_data.output_data_quality_metrics(
    annually_reported_summary, "annually_reported_eia_data", path_prefix, True
)

# 7. Aggregating CEMS data to subplant
####################################################################################
print("7. Aggregating CEMS data from unit to subplant")
# from here on `cems` holds subplant-level rather than unit-level rows
cems = data_cleaning.aggregate_cems_to_subplant(cems)

# 8. Calculate hourly data for partial_cems plants
####################################################################################
print("8. Shaping partial CEMS data")

# --- plant-level partial CEMS data: shape, check for duplicate timestamps, export ---
partial_cems_plant = impute_hourly_profiles.shape_partial_cems_plants(
    cems, eia923_allocated
)
validation.validate_unique_datetimes(
    df=partial_cems_plant,
    df_name="partial_cems_plant",
    keys=["plant_id_eia", "subplant_id"],
)
output_data.output_intermediate_data(
    partial_cems_plant, "partial_cems_plant", path_prefix, year, True
)

# --- subplant-level partial CEMS data ---
# The shaping call returns `cems` as well, so both names are rebound here.
cems, partial_cems_subplant = impute_hourly_profiles.shape_partial_cems_subplants(
    cems, eia923_allocated
)

# Export the cleaned EIA-923 data (without `plant_primary_fuel`) for later
# validation, visualization, and analysis.
output_data.output_intermediate_data(
    eia923_allocated.drop(columns=["plant_primary_fuel"]),
    "eia923_allocated",
    path_prefix,
    year,
    True,
)

validation.validate_unique_datetimes(
    df=partial_cems_subplant,
    df_name="partial_cems_subplant",
    keys=["plant_id_eia", "subplant_id"],
)
output_data.output_intermediate_data(
    partial_cems_subplant, "partial_cems_subplant", path_prefix, year, True
)

# 9. Convert CEMS Hourly Gross Generation to Hourly Net Generation
####################################################################################
print("9. Converting CEMS gross generation to net generation")
cems, gtn_conversions = gross_to_net_generation.convert_gross_to_net_generation(
    cems, eia923_allocated, plant_attributes, year
)

# Data quality metric: percent of gross generation converted using each method.
gtn_method_summary = validation.identify_cems_gtn_method(cems)
output_data.output_data_quality_metrics(
    gtn_method_summary, "cems_gross_to_net_methods", path_prefix, True
)

# Keep the conversion data itself as an intermediate output.
output_data.output_intermediate_data(
    gtn_conversions, "gross_to_net_conversions", path_prefix, year, True
)

# 10. Adjust CEMS emission data for CHP
####################################################################################
print("10. Adjusting CEMS emissions for CHP")
cems = data_cleaning.adjust_cems_for_chp(cems, eia923_allocated)
cems = emissions.calculate_co2e_mass(
    cems,
    year,
    gwp_horizon=100,
    ar5_climate_carbon_feedback=True,
)
# Check the adjusted data before it is written out.
validation.test_emissions_adjustments(cems)
validation.validate_unique_datetimes(
    df=cems, df_name="cems", keys=["plant_id_eia", "subplant_id"]
)
output_data.output_intermediate_data(cems, "cems", path_prefix, year, True)

# 11. Export monthly and annual plant-level results
####################################################################################
print("11. Exporting monthly and annual plant-level results")
# Keep only the EIA rows whose hourly data source is "eia" (i.e. not covered by
# CEMS) and that actually report fuel consumption.
sourced_from_eia = eia923_allocated["hourly_data_source"] == "eia"
has_fuel_consumption = eia923_allocated["fuel_consumed_mmbtu"].notna()
monthly_eia_data_to_shape = eia923_allocated[sourced_from_eia & has_fuel_consumption]



In [None]:
# Reset cell: strip the `ba_code` / `eia_data_resolution` columns so the cells below
# can attach them again without producing suffixed duplicates.
# `errors="ignore"` keeps this cell idempotent: on a fresh kernel, or on a second
# run, the columns may not exist and a plain `drop` would raise KeyError.
partial_cems_plant = partial_cems_plant.drop(
    columns=["ba_code", "eia_data_resolution"], errors="ignore"
)
partial_cems_subplant = partial_cems_subplant.drop(
    columns=["ba_code", "eia_data_resolution"], errors="ignore"
)
cems = cems.drop(columns=["ba_code"], errors="ignore")

In [None]:
# Exploratory call to the input-source summary helper in `validation`.
# Work on a copy so the filtered EIA frame built by the pipeline cell is not the
# object handed to the helper.
eia_only_data = monthly_eia_data_to_shape.copy()
# NOTE(review): this call passes `plant_attributes` as a sixth positional argument,
# while the call in the last cell of this notebook passes only five — confirm which
# matches the current signature.
test = validation.identify_percent_of_data_by_input_source(
    cems,
    partial_cems_subplant,
    partial_cems_plant,
    eia_only_data,
    year,
    plant_attributes,
)
# Display the helper's return value as the cell output.
test

In [None]:
def add_ba_code(df, plant_attributes):
    """Return a new frame equal to ``df`` plus a ``ba_code`` column.

    ``ba_code`` is looked up from ``plant_attributes`` by ``plant_id_eia`` with a
    left merge, so rows without a match get a missing ``ba_code``.

    Args:
        df: DataFrame with a ``plant_id_eia`` column. It is not modified.
        plant_attributes: DataFrame with ``plant_id_eia`` and ``ba_code`` columns.

    Returns:
        A new DataFrame with a single ``ba_code`` column appended.

    Raises:
        pandas.errors.MergeError: if ``plant_attributes`` has more than one row for
            a ``plant_id_eia`` (which would silently duplicate rows of ``df``).
    """
    ba_lookup = plant_attributes[["plant_id_eia", "ba_code"]]
    # Dropping any existing `ba_code` first makes the call idempotent: re-running
    # the cell replaces the column instead of creating `ba_code_x` / `ba_code_y`.
    return df.drop(columns=["ba_code"], errors="ignore").merge(
        ba_lookup, how="left", on="plant_id_eia", validate="m:1"
    )


# `add_ba_code` returns a new frame, so `monthly_eia_data_to_shape` itself is untouched.
eia_only_data = add_ba_code(monthly_eia_data_to_shape, plant_attributes)
cems = add_ba_code(cems, plant_attributes)
partial_cems_subplant = add_ba_code(partial_cems_subplant, plant_attributes)
partial_cems_plant = add_ba_code(partial_cems_plant, plant_attributes)

In [None]:
# Cleanup cell: remove the suffixed `eia_data_resolution_x` / `_y` columns (and
# `ba_code`) from the partial CEMS frames.
# `errors="ignore"` lets the cell run when those columns are absent, e.g. on a fresh
# kernel where no suffixed duplicates have been created yet.
# NOTE(review): in top-to-bottom order this also removes the `ba_code` column that
# the preceding cell has just merged in, while later cells group by `ba_code` —
# confirm whether this cell is still needed.
partial_cems_plant = partial_cems_plant.drop(
    columns=["eia_data_resolution_x", "ba_code", "eia_data_resolution_y"],
    errors="ignore",
)
partial_cems_subplant = partial_cems_subplant.drop(
    columns=["eia_data_resolution_x", "ba_code", "eia_data_resolution_y"],
    errors="ignore",
)

In [None]:
# Value columns that get summed for every data source: net generation, then each
# pollutant mass column immediately followed by its `_for_electricity` counterpart.
columns_to_use = ["net_generation_mwh"] + [
    column
    for mass_column in ("co2_mass_lb", "co2e_mass_lb", "nox_mass_lb", "so2_mass_lb")
    for column in (mass_column, f"{mass_column}_for_electricity")
]

# Add the data resolution column to each frame that is based on EIA data.
# NOTE(review): each run rebinds the frames to the helper's output, so re-running
# this cell applies the helper to already-processed frames — confirm that is safe.
eia_only_data, partial_cems_subplant, partial_cems_plant = [
    validation.identify_reporting_frequency(frame, year)
    for frame in (eia_only_data, partial_cems_subplant, partial_cems_plant)
]

# Map each data source label to its dataframe (insertion order is the loop order).
data_sources = dict(
    cems=cems,
    partial_cems_subplant=partial_cems_subplant,
    partial_cems_plant=partial_cems_plant,
    eia=eia_only_data,
)

# Accumulator for the per-source observation counts (subplant-hours).
source_of_input_data = list()

In [None]:

# Labels written to the `source` column for each EIA reporting resolution.
EIA_SOURCE_LABELS = {
    "annual": "eia_annual",
    "monthly": "eia_monthly",
    "multiple": "eia_multiple",
}


def summarize_data_source(name, df, value_columns, year):
    """Total ``value_columns`` and subplant-hours by BA and input source for one frame.

    Args:
        name: Key from ``data_sources``. ``"eia"`` is treated as non-hourly data;
            ``"partial_cems_subplant"`` / ``"partial_cems_plant"`` as hourly data
            labelled by EIA reporting resolution; any other name as hourly CEMS data.
        df: Non-empty DataFrame with ``ba_code``, ``plant_id_eia``, ``subplant_id``
            and ``value_columns``, plus ``datetime_utc`` (hourly sources) and/or
            ``eia_data_resolution`` (EIA-labelled sources).
        value_columns: Names of the columns to sum.
        year: Data year; sets the number of hours credited to each EIA subplant.

    Returns:
        DataFrame with columns ``ba_code``, ``source``, ``subplant_hours`` and then
        ``value_columns``, one row per (``ba_code``, ``source``) pair.
    """
    value_columns = list(value_columns)
    is_eia = name == "eia"
    labelled_by_resolution = is_eia or name in (
        "partial_cems_subplant",
        "partial_cems_plant",
    )

    keys = ["ba_code", "plant_id_eia", "subplant_id"]
    if not is_eia:
        keys.append("datetime_utc")
    if labelled_by_resolution:
        keys.append("eia_data_resolution")

    # Select the value columns *before* summing: summing every column first is
    # wasted work and raises on datetime columns in recent pandas versions.
    per_subplant = df.groupby(keys, dropna=False)[value_columns].sum()

    if is_eia:
        # EIA data is not hourly, so each subplant is credited with every hour of
        # the year. The full Gregorian rule is used (a bare `year % 4` test would
        # misclassify years such as 2100).
        is_leap_year = pd.Timestamp(year=year, month=1, day=1).is_leap_year
        per_subplant["subplant_hours"] = 8784 if is_leap_year else 8760
    else:
        # hourly sources: one row per subplant-hour
        per_subplant["subplant_hours"] = 1

    if labelled_by_resolution:
        source_column = "eia_data_resolution"
    else:
        source_column = "source"
        per_subplant["source"] = "cems_hourly"

    summary = (
        per_subplant.reset_index()
        .groupby(["ba_code", source_column], dropna=False)[
            ["subplant_hours", *value_columns]
        ]
        .sum()
        .reset_index()
    )
    if labelled_by_resolution:
        summary = summary.rename(columns={"eia_data_resolution": "source"})
        summary["source"] = summary["source"].replace(EIA_SOURCE_LABELS)
    return summary


# Collect into a list local to this cell so the cell can be re-run on its own;
# `source_of_input_data` is only bound once, to the concatenated result.
source_summaries = []
for name, df in data_sources.items():
    if len(df) == 0:  # Empty df. May occur when running `small`
        print(f"WARNING: data source {name} has zero entries")
        continue
    source_summaries.append(summarize_data_source(name, df, columns_to_use, year))

# concat the dataframes together
source_of_input_data = pd.concat(source_summaries, axis=0)



In [None]:
# Show the plant-level partial CEMS rows that have no `eia_data_resolution` value.
missing_resolution = partial_cems_plant["eia_data_resolution"].isna()
partial_cems_plant.loc[missing_resolution]

In [None]:
# Spot-check a single balancing authority: rows whose `ba_code` is "CISO".
is_ciso = source_of_input_data["ba_code"] == "CISO"
source_of_input_data.loc[is_ciso]

In [None]:
# Preview (first 20 rows) of the totals per BA and source; rows with a missing
# `ba_code` or `source` are kept as their own groups.
totals_by_ba_and_source = source_of_input_data.groupby(
    ["ba_code", "source"], dropna=False
).sum()
totals_by_ba_and_source.head(20)

In [None]:
# Preview (first 20 rows) of the totals per balancing authority across all sources.
totals_by_ba = source_of_input_data.groupby("ba_code").sum()
totals_by_ba.head(20)

In [None]:
# Convert the totals into each source's share of its balancing authority's total.
# The value columns are selected explicitly so the string `source` column is never
# aggregated in the per-BA totals; otherwise it ends up as an extra column in the
# denominator, which breaks the division and the `reset_index()` below.
share_columns = [
    column
    for column in source_of_input_data.columns
    if column not in ("ba_code", "source")
]
totals_by_ba_and_source = source_of_input_data.groupby(["ba_code", "source"])[
    share_columns
].sum()
totals_by_ba = source_of_input_data.groupby("ba_code")[share_columns].sum()
# `level="ba_code"` broadcasts each BA's total across all of that BA's source rows.
source_of_input_data = totals_by_ba_and_source.div(
    totals_by_ba, level="ba_code"
).reset_index()

In [None]:
# Write the per-BA input-source table to CSV at the path returned by
# `results_folder`. The path is built from `path_prefix` instead of a hard-coded
# "2020/" so the file follows `year` when it is changed (identical for year = 2020).
source_of_input_data.to_csv(
    results_folder(f"{path_prefix}data_quality_metrics/input_data_source_by_ba.csv")
)

In [None]:
# Total by BA and source, then express every column as a fraction of its column sum.
# NOTE(review): run top-to-bottom, `source_of_input_data` already holds per-BA
# shares at this point, so these are fractions of summed shares — confirm intent.
grouped_by_ba_and_source = source_of_input_data.groupby(["ba_code", "source"]).sum()
column_totals = grouped_by_ba_and_source.sum(axis=0)
source_of_input_data = (grouped_by_ba_and_source / column_totals).reset_index()

In [None]:
# Display the current `source_of_input_data` frame as the cell output.
source_of_input_data

In [None]:
# Build the input-source summary and write it out as a data quality metric.
# NOTE(review): the exploratory call earlier in this notebook passes
# `plant_attributes` as a sixth argument to the same helper; confirm which
# signature is current.
input_source_summary = validation.identify_percent_of_data_by_input_source(
    cems, partial_cems_subplant, partial_cems_plant, monthly_eia_data_to_shape, year
)
output_data.output_data_quality_metrics(
    input_source_summary, "input_data_source", path_prefix, True
)

# Combine the plant data once at the monthly level ...
monthly_plant_data = data_cleaning.combine_plant_data(
    cems,
    partial_cems_subplant,
    partial_cems_plant,
    monthly_eia_data_to_shape,
    "monthly",
    True,
)
# ... then export that same frame at the monthly and at the annual level.
for resolution in ("monthly", "annual"):
    output_data.output_plant_data(monthly_plant_data, path_prefix, resolution, True)