In [None]:
# import packages
import pandas as pd
import numpy as np
import os
import plotly.express as px

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import load_data
from column_checks import get_dtypes
from filepaths import *
import output_data
import emissions
import validation


# Load eGRID Emission Factors

The CHP electric allocation factor column must be loaded so that the `_for_electricity` emission totals can be calculated.

In [None]:
# Load the eGRID plant data for each year and calculate annual emission factors
# (lb/MWh) by balancing authority and EIA-930 fuel category. The per-year
# results are stacked into a single `egrid_factors` dataframe.

# egrid 2018-2020 uses the AR4 GWP. The lookup does not depend on the data
# year, so it is done once instead of once per loop iteration.
# code adapted from `emissions.calculate_co2e_mass()`
ipcc_version = "AR4"
gwp_horizon = 100
df_gwp = load_data.load_ipcc_gwp()
gwp_to_use = df_gwp[df_gwp.ipcc_version == ipcc_version]
ch4_gwp = gwp_to_use.loc[
    (gwp_to_use.gwp_horizon == gwp_horizon) & (gwp_to_use.gas == "ch4"),
    "gwp",
].item()
n2o_gwp = gwp_to_use.loc[
    (gwp_to_use.gwp_horizon == gwp_horizon) & (gwp_to_use.gas == "n2o"),
    "gwp",
].item()

# load energy source group mapping (also the same for every year)
energy_source_groups = pd.read_csv(
    manual_folder("energy_source_groups.csv"), dtype=get_dtypes()
)

# columns to sum by BA and fuel category
# NOTE(review): the `_adjusted` columns are not calculated in this cell; they
# are assumed to be returned by `validation.load_egrid_plant_file()` - confirm.
data_columns = [
    "net_generation_mwh",
    "co2_mass_lb_for_electricity",
    "ch4_mass_lb_for_electricity",
    "n2o_mass_lb_for_electricity",
    "co2e_mass_lb_for_electricity",
    "nox_mass_lb_for_electricity",
    "so2_mass_lb_for_electricity",
    "co2_mass_lb_for_electricity_adjusted",
    "co2e_mass_lb_for_electricity_adjusted",
    "nox_mass_lb_for_electricity_adjusted",
    "so2_mass_lb_for_electricity_adjusted",
]

# emission rate columns that are calculated and kept in the output
factor_columns = [
    "generated_co2_rate_lb_per_mwh_for_electricity",
    "generated_co2e_rate_lb_per_mwh_for_electricity",
    "generated_nox_rate_lb_per_mwh_for_electricity",
    "generated_so2_rate_lb_per_mwh_for_electricity",
    "generated_co2_rate_lb_per_mwh_for_electricity_adjusted",
    "generated_co2e_rate_lb_per_mwh_for_electricity_adjusted",
    "generated_nox_rate_lb_per_mwh_for_electricity_adjusted",
    "generated_so2_rate_lb_per_mwh_for_electricity_adjusted",
]

zero_carbon_fuels = ["solar", "wind", "hydro", "nuclear"]

egrid_factors = []
for year in [2018, 2019, 2020]:
    egrid_plant = validation.load_egrid_plant_file(year)

    # if there is a missing value for electric allocation factor, fill with 100%
    egrid_plant["chp_electric_allocation_factor"] = egrid_plant[
        "chp_electric_allocation_factor"
    ].fillna(1)

    # calculate _for_electricity values
    for pol in ["co2", "ch4", "n2o", "nox", "so2"]:
        egrid_plant[f"{pol}_mass_lb_for_electricity"] = (
            egrid_plant[f"{pol}_mass_lb"]
            * egrid_plant["chp_electric_allocation_factor"]
        )

    # calculate co2e, treating missing ch4 / n2o values as zero
    egrid_plant["co2e_mass_lb_for_electricity"] = (
        egrid_plant["co2_mass_lb_for_electricity"]
        + (ch4_gwp * egrid_plant["ch4_mass_lb_for_electricity"].fillna(0))
        + (n2o_gwp * egrid_plant["n2o_mass_lb_for_electricity"].fillna(0))
    )

    # assign a fuel category to each plant based on its primary fuel
    egrid_plant = egrid_plant.merge(
        energy_source_groups[["energy_source_code", "fuel_category_eia930"]],
        how="left",
        left_on="plant_primary_fuel",
        right_on="energy_source_code",
        validate="m:1",
    )

    # groupby and do same calculations as below
    # explicitly drop na ba and fuel values
    egrid_ba_fuel = (
        egrid_plant.groupby(["ba_code", "fuel_category_eia930"], dropna=True)[
            data_columns
        ]
        .sum()
        .reset_index()
    )

    # if there are any negative generation values, replace with zero
    egrid_ba_fuel.loc[
        egrid_ba_fuel["net_generation_mwh"] < 0, "net_generation_mwh"
    ] = 0

    # calculate emission factors
    for pol in ["co2", "co2e", "nox", "so2"]:
        for pol_type in ["for_electricity", "for_electricity_adjusted"]:
            egrid_ba_fuel[f"generated_{pol}_rate_lb_per_mwh_{pol_type}"] = (
                egrid_ba_fuel[f"{pol}_mass_lb_{pol_type}"]
                / egrid_ba_fuel["net_generation_mwh"]
            )

    # dividing non-zero emissions by zero generation gives +/-inf, which
    # `dropna` does not treat as missing; mark those rates as missing so they
    # are handled by the dropna step below instead of reaching the output
    egrid_ba_fuel[factor_columns] = egrid_ba_fuel[factor_columns].replace(
        [np.inf, -np.inf], np.nan
    )

    # label the eGRID vintage (2018 is labelled as v2)
    if year == 2018:
        egrid_ba_fuel["source"] = f"eGRID{year}v2"
    else:
        egrid_ba_fuel["source"] = f"eGRID{year}"

    # only keep the key and rate columns; copy so that the assignment below
    # operates on an independent frame (avoids SettingWithCopyWarning)
    egrid_ba_fuel = egrid_ba_fuel[
        [
            "source",
            "ba_code",
            "fuel_category_eia930",
        ]
        + factor_columns
    ].copy()

    # set all emissions for zero carbon fuels to zero
    egrid_ba_fuel.loc[
        egrid_ba_fuel["fuel_category_eia930"].isin(zero_carbon_fuels),
        factor_columns,
    ] = 0

    # drop any rows where egrid rates are missing
    # (the three key columns are always populated, so thresh=6 keeps rows that
    # have at least three non-missing rate columns)
    egrid_ba_fuel = egrid_ba_fuel.dropna(thresh=6)

    egrid_factors.append(egrid_ba_fuel)

egrid_factors = pd.concat(egrid_factors, axis=0)


In [None]:
# graph the data to spot check:
# box plot of the annual CO2 rate for each fuel category, colored by eGRID
# vintage, with the BA code shown on hover
spot_check_fig = px.box(
    egrid_factors,
    x="fuel_category_eia930",
    y="generated_co2_rate_lb_per_mwh_for_electricity",
    color="source",
    hover_data=["ba_code"],
)
spot_check_fig

In [None]:
# write the annual eGRID emission factors to the outputs folder
egrid_factors_path = outputs_folder("egrid_emission_factors_for_eia930.csv")
egrid_factors.to_csv(egrid_factors_path, index=False)

# Create Emission factors for EIA-930 regions

In [None]:
# Calculate month-hour emission factors (lb/MWh) by fuel category for every
# balancing area that reports to EIA-930, using the hourly plant-level results.
years_to_load = [2019, 2020, 2021]

# get a list of all balancing areas for which there is data in EIA-930
ba_reference = load_data.load_ba_reference()
bas_in_eia930 = ba_reference[
    (~ba_reference["timezone_reporting_eia930"].isna())
    & (ba_reference["us_ba"] == "Yes")
    & (~(ba_reference["ba_category"] == "miscellaneous"))
    & (~(ba_reference["retirement_date"].dt.year < min(years_to_load)))
]
ba_list = list(bas_in_eia930.ba_code.unique())

# `local_tz_to_use` is the ba_reference column used to build the local
# timestamp; `datetime_to_use` is the timestamp column from which the
# year / month / hour keys are derived
local_tz_to_use = "timezone_local"
datetime_to_use = "datetime_utc"

# columns kept in the output
key_columns = [
    "ba_code",
    "fuel_category_eia930",
    "year",
    "month",
    "hour",
]

factor_columns = [
    "generated_co2_rate_lb_per_mwh_for_electricity",
    "generated_co2e_rate_lb_per_mwh_for_electricity",
    "generated_nox_rate_lb_per_mwh_for_electricity",
    "generated_so2_rate_lb_per_mwh_for_electricity",
    "generated_co2_rate_lb_per_mwh_for_electricity_adjusted",
    "generated_co2e_rate_lb_per_mwh_for_electricity_adjusted",
    "generated_nox_rate_lb_per_mwh_for_electricity_adjusted",
    "generated_so2_rate_lb_per_mwh_for_electricity_adjusted",
]

zero_carbon_fuels = ["solar", "wind", "hydro", "nuclear"]

# the plant attributes only depend on the year, so load them once per year
# instead of once for every BA-year combination
plant_fuel_categories = {
    year: pd.read_csv(
        outputs_folder(f"{year}/plant_static_attributes_{year}.csv"),
        dtype=get_dtypes(),
    )[["plant_id_eia", "fuel_category_eia930"]]
    for year in years_to_load
}

all_data = []
for ba in ba_list:
    print(ba)
    # load and concat multiple years of data
    ba_data = []
    for year in years_to_load:
        plant_attributes = plant_fuel_categories[year]

        # start by loading data for a single BA
        try:
            ba_year_data = pd.read_csv(
                results_folder(f"{year}/plant_data/hourly/us_units/{ba}.csv")
            )
        except FileNotFoundError:
            # there are no hourly results for this BA in this year
            continue
        print(f"  {year}")

        # make sure the data frame is not empty
        if len(ba_year_data) == 0:
            continue

        # assign a fuel category to each plant
        ba_year_data = ba_year_data.merge(
            plant_attributes,
            how="left",
            on="plant_id_eia",
            validate="m:1",
        )

        # check that there are no missing fuel categories
        if ba_year_data["fuel_category_eia930"].isna().any():
            print(f"Warning: there are missing fuel categories in {ba}")

        # aggregate by fuel category (only the numeric data columns are summed)
        ba_year_data = (
            ba_year_data.groupby(
                ["fuel_category_eia930", "datetime_utc"], dropna=False
            )
            .sum(numeric_only=True)
            .reset_index()
        )

        # create columns for adjusted emissions for all pollutants
        # biomass adjustment does not affect these pollutants, only co2
        for pol in ["ch4", "n2o", "nox", "so2"]:
            ba_year_data[f"{pol}_mass_lb_for_electricity_adjusted"] = ba_year_data[
                f"{pol}_mass_lb_for_electricity"
            ]

        # calculate co2eq emissions
        ba_year_data = emissions.calculate_co2e_mass(
            ba_year_data,
            year,
            gwp_horizon=100,
            ar5_climate_carbon_feedback=True,
        )

        ba_data.append(ba_year_data)

    # let us know if there was not any data to concatenate
    # (checked explicitly so that an unrelated ValueError raised during
    # processing is not silently reported as missing data)
    if not ba_data:
        print(f"No data available for {ba}")
        continue

    # concat data for all years together
    ba_data = pd.concat(ba_data, axis=0)

    # groupby to make sure we don't have duplicate timestamps
    ba_data = (
        ba_data.groupby(["fuel_category_eia930", "datetime_utc"], dropna=False)
        .sum(numeric_only=True)
        .reset_index()
    )

    # convert the datetime column to a timezone-aware (UTC) datetime dtype,
    # which is required by `tz_convert` below
    ba_data["datetime_utc"] = pd.to_datetime(ba_data["datetime_utc"], utc=True)

    # look up the local timezone of the BA; the mask is built from the same
    # dataframe that is being indexed
    ba_local_tz = bas_in_eia930.loc[
        bas_in_eia930["ba_code"] == ba, local_tz_to_use
    ].values[0]

    # create a local datetime column
    ba_data["datetime_local"] = ba_data["datetime_utc"].dt.tz_convert(ba_local_tz)

    # create columns for year, month and hour
    ba_data["year"] = ba_data[datetime_to_use].dt.year
    ba_data["month"] = ba_data[datetime_to_use].dt.month
    ba_data["hour"] = ba_data[datetime_to_use].dt.hour

    # groupby month-hour; numeric_only=True excludes the datetime columns,
    # which cannot be summed (pandas >= 2.0 raises a TypeError otherwise)
    ba_data = (
        ba_data.groupby(["fuel_category_eia930", "year", "month", "hour"])
        .sum(numeric_only=True)
        .reset_index()
    )

    # calculate emission factors
    for pol in ["co2", "co2e", "nox", "so2"]:
        for pol_type in ["for_electricity", "for_electricity_adjusted"]:
            ba_data[f"generated_{pol}_rate_lb_per_mwh_{pol_type}"] = (
                ba_data[f"{pol}_mass_lb_{pol_type}"] / ba_data["net_generation_mwh"]
            )

    # add a column for ba code
    ba_data["ba_code"] = ba

    # only keep relevant columns; copy so that the assignment below operates
    # on an independent frame (avoids SettingWithCopyWarning)
    ba_data = ba_data[key_columns + factor_columns].copy()

    # set all emissions for zero carbon fuels to zero
    ba_data.loc[
        ba_data["fuel_category_eia930"].isin(zero_carbon_fuels), factor_columns
    ] = 0

    all_data.append(ba_data)

all_data = pd.concat(all_data, axis=0)

# only keep data that are in the data years we want
all_data = all_data[all_data["year"].isin(years_to_load)]


In [None]:
# only keep rows whose year is one of the requested data years
# (the preceding cell already ends with the same filter, so this is a no-op
# when the cells are run in order)
in_requested_years = all_data["year"].isin(years_to_load)
all_data = all_data[in_requested_years]

In [None]:
# write the month-hour emission factors for the EIA-930 regions to csv
# NOTE(review): this path is relative to the notebook's working directory and
# does not use `outputs_folder()` like the eGRID export - confirm the location.
eia930_factors_path = "../../../../time_varying_emission_factors_for_eia_regions.csv"
all_data.to_csv(eia930_factors_path, index=False)

# Create Emission Factors for ISO Regions
 - TODO: create a table that maps our fuel categories to each ISO's fuel categories (being added to `energy_source_groups.csv`)
 - TODO: review how aggregate/special fuel categories should be handled

In [None]:
# Calculate month-hour emission factors (lb/MWh) by ISO-specific fuel category
# for each ISO in `iso_list`, using the hourly plant-level results.
#
# NOTE: this cell uses `bas_in_eia930` from the EIA-930 cell above, and
# `load_plant_fuel_category_for_iso()`, which is defined in a cell further
# down the notebook; that cell must be run before this one.
years_to_load = [2021]

# full list of ISOs: ["BPAT", "CISO", "ERCO", "ISNE", "MISO", "NYIS", "PJM", "SWPP"]
iso_list = ["NYIS"]

# `local_tz_to_use` is the ba_reference column used to build the local
# timestamp; `datetime_to_use` is the timestamp column from which the
# year / month / hour keys are derived
local_tz_to_use = "timezone_local"
datetime_to_use = "datetime_utc"

# columns kept in the output
key_columns = [
    "ba_code",
    "fuel_category",
    "year",
    "month",
    "hour",
]

factor_columns = [
    "generated_co2_rate_lb_per_mwh_for_electricity",
    "generated_co2e_rate_lb_per_mwh_for_electricity",
    "generated_nox_rate_lb_per_mwh_for_electricity",
    "generated_so2_rate_lb_per_mwh_for_electricity",
    "generated_co2_rate_lb_per_mwh_for_electricity_adjusted",
    "generated_co2e_rate_lb_per_mwh_for_electricity_adjusted",
    "generated_nox_rate_lb_per_mwh_for_electricity_adjusted",
    "generated_so2_rate_lb_per_mwh_for_electricity_adjusted",
]

zero_carbon_fuels = [
    "solar",
    "wind",
    "hydro",
    "nuclear",
    "variable_renewables",
    "batteries",
    "power_storage",
]

all_data = []
for ba in iso_list:
    print(ba)
    # load and concat multiple years of data
    ba_data = []
    for year in years_to_load:

        # ISO-specific fuel category of each plant in this BA
        plant_attributes = load_plant_fuel_category_for_iso(ba, year)

        # start by loading data for a single BA
        try:
            ba_year_data = pd.read_csv(
                results_folder(f"{year}/plant_data/hourly/us_units/{ba}.csv")
            )
        except FileNotFoundError:
            # there are no hourly results for this BA in this year
            continue
        print(f"  {year}")

        # make sure the data frame is not empty
        if len(ba_year_data) == 0:
            continue

        # assign a fuel category to each plant
        ba_year_data = ba_year_data.merge(
            plant_attributes[["plant_id_eia", "fuel_category"]],
            how="left",
            on="plant_id_eia",
            validate="m:1",
        )

        # check that there are no missing fuel categories
        if ba_year_data["fuel_category"].isna().any():
            print(f"Warning: there are missing fuel categories in {ba}")

        # aggregate by fuel category (only the numeric data columns are summed)
        ba_year_data = (
            ba_year_data.groupby(["fuel_category", "datetime_utc"], dropna=False)
            .sum(numeric_only=True)
            .reset_index()
        )

        # create columns for adjusted emissions for all pollutants
        # biomass adjustment does not affect these pollutants, only co2
        for pol in ["ch4", "n2o", "nox", "so2"]:
            ba_year_data[f"{pol}_mass_lb_for_electricity_adjusted"] = ba_year_data[
                f"{pol}_mass_lb_for_electricity"
            ]

        # calculate co2eq emissions
        ba_year_data = emissions.calculate_co2e_mass(
            ba_year_data,
            year,
            gwp_horizon=100,
            ar5_climate_carbon_feedback=True,
        )

        ba_data.append(ba_year_data)

    # let us know if there was not any data to concatenate
    # (checked explicitly so that an unrelated ValueError raised during
    # processing is not silently reported as missing data)
    if not ba_data:
        print(f"No data available for {ba}")
        continue

    # concat data for all years together
    ba_data = pd.concat(ba_data, axis=0)

    # groupby to make sure we don't have duplicate timestamps
    ba_data = (
        ba_data.groupby(["fuel_category", "datetime_utc"], dropna=False)
        .sum(numeric_only=True)
        .reset_index()
    )

    # convert the datetime column to a timezone-aware (UTC) datetime dtype,
    # which is required by `tz_convert` below
    ba_data["datetime_utc"] = pd.to_datetime(ba_data["datetime_utc"], utc=True)

    # look up the local timezone of the BA; the mask is built from the same
    # dataframe that is being indexed
    ba_local_tz = bas_in_eia930.loc[
        bas_in_eia930["ba_code"] == ba, local_tz_to_use
    ].values[0]

    # create a local datetime column
    ba_data["datetime_local"] = ba_data["datetime_utc"].dt.tz_convert(ba_local_tz)

    # create columns for year, month and hour
    ba_data["year"] = ba_data[datetime_to_use].dt.year
    ba_data["month"] = ba_data[datetime_to_use].dt.month
    ba_data["hour"] = ba_data[datetime_to_use].dt.hour

    # groupby month-hour; numeric_only=True excludes the datetime columns,
    # which cannot be summed (pandas >= 2.0 raises a TypeError otherwise)
    ba_data = (
        ba_data.groupby(["fuel_category", "year", "month", "hour"])
        .sum(numeric_only=True)
        .reset_index()
    )

    # calculate emission factors
    for pol in ["co2", "co2e", "nox", "so2"]:
        for pol_type in ["for_electricity", "for_electricity_adjusted"]:
            ba_data[f"generated_{pol}_rate_lb_per_mwh_{pol_type}"] = (
                ba_data[f"{pol}_mass_lb_{pol_type}"] / ba_data["net_generation_mwh"]
            )

    # add a column for ba code
    ba_data["ba_code"] = ba

    # only keep relevant columns; copy so that the assignment below operates
    # on an independent frame (avoids SettingWithCopyWarning)
    ba_data = ba_data[key_columns + factor_columns].copy()

    # set all emissions for zero carbon fuels to zero
    ba_data.loc[ba_data["fuel_category"].isin(zero_carbon_fuels), factor_columns] = 0

    all_data.append(ba_data)

all_data = pd.concat(all_data, axis=0)

# only keep data that are in the data years we want
all_data = all_data[all_data["year"].isin(years_to_load)]





In [None]:
def load_plant_fuel_category_for_iso(ba, year):
    """Return the ISO-specific fuel category of every plant located in one BA.

    Args:
        ba: balancing authority code; it selects the plants (via `ba_code`) and
            the `fuel_category_{ba}` column of `energy_source_groups.csv`.
        year: data year of the plant attributes and generator tables to load.

    Returns:
        A dataframe with the columns `plant_id_eia` and `fuel_category`.
    """
    category_column = f"fuel_category_{ba}"

    # plants in this BA, along with their primary fuel
    all_plants = pd.read_csv(
        outputs_folder(f"{year}/plant_static_attributes_{year}.csv"),
        dtype=get_dtypes(),
    )
    ba_plants = all_plants.loc[
        all_plants["ba_code"] == ba,
        ["plant_id_eia", "plant_primary_fuel", "ba_code"],
    ]

    # attach the multi-fuel / combined-cycle flags to each plant
    ba_plants = ba_plants.merge(
        load_special_category_flags(year),
        how="left",
        on="plant_id_eia",
        validate="1:1",
    )

    # map each plant's primary fuel to this ISO's fuel category
    fuel_map = pd.read_csv(
        manual_folder("energy_source_groups.csv"), dtype=get_dtypes()
    )
    fuel_map = fuel_map[["energy_source_code", category_column]]
    ba_plants = ba_plants.merge(
        fuel_map,
        how="left",
        left_on="plant_primary_fuel",
        right_on="energy_source_code",
        validate="m:1",
    )
    ba_plants = ba_plants.drop(columns="energy_source_code")

    # ISO-specific overrides
    multi_fuel_labels = {"PJM": "multi_fuel", "NYIS": "dual_fuel"}
    if ba in multi_fuel_labels:
        # plants flagged as burning multiple fuels get the ISO's own label
        is_multi_fuel = ba_plants["multiple_fuels"] == 1
        ba_plants.loc[is_multi_fuel, category_column] = multi_fuel_labels[ba]
    elif ba == "ERCO":
        # gas plants with a combined cycle generator get their own category
        is_gas_cc = (ba_plants["combined_cycle"] == 1) & (
            ba_plants[category_column] == "gas"
        )
        ba_plants.loc[is_gas_cc, category_column] = "gas_cc"

    # expose the category under a BA-independent column name
    renamed = ba_plants.rename(columns={category_column: "fuel_category"})
    return renamed[["plant_id_eia", "fuel_category"]]


def load_special_category_flags(year):
    """Flag which plants have multi-fuel or combined-cycle generators.

    Args:
        year: data year passed to `load_data.load_pudl_table` when loading the
            `generators_eia860` table.

    Returns:
        A dataframe with one row per `plant_id_eia` and two integer flag
        columns: `multiple_fuels` and `combined_cycle`. A flag is 1 if at
        least one of the plant's generators has that attribute, otherwise 0.
    """
    # identify multi fuel and combined cycle gas plants
    # copy the column subset so that the flag columns below are written to an
    # independent frame rather than a slice of the loaded table
    # (avoids SettingWithCopyWarning)
    gen_types = load_data.load_pudl_table("generators_eia860", year=year)[
        ["plant_id_eia", "generator_id", "multiple_fuels", "prime_mover_code"]
    ].copy()

    # create a column identifying where a generator is part of a combined cycle plant
    cc_codes = ["CA", "CC", "CS", "CT"]
    gen_types["combined_cycle"] = (
        gen_types["prime_mover_code"].isin(cc_codes).astype(int)
    )

    # if there is a missing multiple fuel flag, assume that the generator does not burn multiple fuels
    gen_types["multiple_fuels"] = gen_types["multiple_fuels"].fillna(0).astype(int)

    # count the flagged generators at each plant
    plant_flags = (
        gen_types.groupby(["plant_id_eia"], dropna=False)[
            ["multiple_fuels", "combined_cycle"]
        ]
        .sum()
        .reset_index()
    )

    # replace the summed values with 1 if greater than zero
    plant_flags.loc[plant_flags["multiple_fuels"] > 0, "multiple_fuels"] = 1
    plant_flags.loc[plant_flags["combined_cycle"] > 0, "combined_cycle"] = 1

    return plant_flags


In [None]:
# debugging aid: display whatever `plant_attributes` dataframe is currently in
# the kernel (it is not defined in this cell)
plant_attributes

In [None]:
# Scratch cell: runs a single pass of the per-BA, per-year processing steps on
# whatever `ba_year_data`, `plant_attributes`, `ba`, `year` and `ba_data` are
# currently in the kernel. None of these names is defined in this cell, so it
# cannot run on a fresh kernel by itself.
# NOTE(review): this looks like a leftover from developing the EIA-930 loop
# cell - confirm whether it can be deleted.

# assign a fuel category to each plant
# (requires `plant_attributes` to have a `fuel_category_eia930` column)
ba_year_data = ba_year_data.merge(
    plant_attributes[["plant_id_eia", "fuel_category_eia930"]],
    how="left",
    on="plant_id_eia",
    validate="m:1",
)

# check that there are no missing fuel categories
if len(ba_year_data[ba_year_data["fuel_category_eia930"].isna()]) > 0:
    print(f"Warning: there are missing fuel categories in {ba}")

# aggregate by fuel category
ba_year_data = (
    ba_year_data.groupby(
        ["fuel_category_eia930", "datetime_utc"], dropna=False
    )
    .sum()
    .reset_index()
)

# create columns for adjusted emissions for all pollutants
# biomass adjustment does not affect these pollutants, only co2
for pol in ["ch4", "n2o", "nox", "so2"]:
    ba_year_data[
        f"{pol}_mass_lb_for_electricity_adjusted"
    ] = ba_year_data[f"{pol}_mass_lb_for_electricity"]

# calculate co2eq emissions
ba_year_data = emissions.calculate_co2e_mass(
    ba_year_data,
    year,
    gwp_horizon=100,
    ar5_climate_carbon_feedback=True,
)

# collect the result; assumes `ba_data` is still a list - TODO confirm
ba_data.append(ba_year_data)
