In [None]:
# import packages
import pandas as pd
import numpy as np
import os
import plotly.express as px

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import load_data
from column_checks import get_dtypes
from filepaths import *
import output_data
import emissions
import validation
import emission_factors as efs


## Notes

For each calculation, we need to follow certain steps:
1. Load fuel category mapping from energy_source_groups
2. Load individual plant data
3. Assign each plant to a fuel category and region
4. Calculate Fleet Emission Factors
5. Calculate Backstop national values
6. Merge and format


First load the EIA source, then ISO, then ISO-H.
Reuse the same functions for each source.

# Calculate eGRID Emission Factors

### Calculate factors for EIA, ISO, and ISO-H regions

In [None]:
# eGRID data years to build factors for
egrid_years = [2018, 2019, 2020]

# regions requested for each ISO-based fuel mix source
egrid_iso_regions = ["BPAT", "CISO", "ERCO", "ISNE", "MISO", "NYIS", "PJM", "SWPP"]
egrid_isoh_regions = ["ERCO", "MISO"]

# factors for the EIA regions
egrid_factors_eia = efs.create_egrid_factors_for_eia_regions(egrid_years)

# factors for the "ISO" fuel mix source
egrid_factors_iso = efs.create_egrid_factors_for_iso_regions(
    iso_list=egrid_iso_regions,
    egrid_years=egrid_years,
    fuel_mix_source="ISO",
)

# factors for the "ISO-H" fuel mix source
egrid_factors_isoh = efs.create_egrid_factors_for_iso_regions(
    iso_list=egrid_isoh_regions,
    egrid_years=egrid_years,
    fuel_mix_source="ISO-H",
)


### Combine the eGRID data

In [None]:
# column order used to sort the combined table
egrid_sort_order = [
    "fuel_mix_source",
    "year",
    "region",
    "fuel_category",
    "pollutant",
    "emission_factor_adjustment",
    "emission_factor_source",
]

# stack the EIA, ISO, and ISO-H tables, cast year to int, round the factor
# to 3 decimals, then sort
egrid_factors_combined = (
    pd.concat([egrid_factors_eia, egrid_factors_iso, egrid_factors_isoh], axis=0)
    .assign(
        year=lambda df: df["year"].astype(int),
        emission_factor_lb_per_mwh=lambda df: df["emission_factor_lb_per_mwh"].round(3),
    )
    .sort_values(by=egrid_sort_order)
)

egrid_factors_combined


In [None]:
# check for missing factors for ISO or ISO-H regions since there is no backstop
is_iso_source = egrid_factors_combined["fuel_mix_source"].isin(["ISO", "ISO-H"])
is_missing_factor = egrid_factors_combined["emission_factor_lb_per_mwh"].isna()
egrid_factors_combined[is_iso_source & is_missing_factor]

In [None]:
# identify where there are zero emission rates for non-clean fuels:
# co2/co2e rows, adjusted for electricity, with a factor of exactly 0,
# whose fuel category is not listed in efs.zero_carbon_fuels
is_carbon_pollutant = egrid_factors_combined["pollutant"].isin(["co2", "co2e"])
is_for_electricity = (
    egrid_factors_combined["emission_factor_adjustment"] == "for_electricity"
)
is_zero_rate = egrid_factors_combined["emission_factor_lb_per_mwh"] == 0
is_zero_carbon_fuel = egrid_factors_combined["fuel_category"].isin(
    efs.zero_carbon_fuels
)

egrid_factors_combined[
    is_carbon_pollutant & is_for_electricity & is_zero_rate & ~is_zero_carbon_fuel
]


In [None]:
# run the anomalous-factor screen from `efs` on the combined eGRID factors
# (filter_factor is passed through unchanged; see the helper for its meaning)
anomalous_screen = efs.screen_for_anomalous_factors(
    egrid_factors_combined,
    filter_factor=3,
)
anomalous_screen


In [None]:
# write the combined eGRID factors to the outputs folder, without the row index
egrid_output_path = outputs_folder("egrid_emission_factors.csv")
egrid_factors_combined.to_csv(egrid_output_path, index=False)

In [None]:
# load data if already created (replaces the in-memory frame with the CSV contents)
egrid_input_path = outputs_folder("egrid_emission_factors.csv")
egrid_factors_combined = pd.read_csv(egrid_input_path)
egrid_factors_combined

In [None]:
# selection shown in the box plot
pol = "co2e"
adjustment = "for_electricity"
source = "EIA"

# rows matching the selection, excluding fuel categories in efs.zero_carbon_fuels
matches_selection = (
    (egrid_factors_combined["pollutant"] == pol)
    & (egrid_factors_combined["emission_factor_adjustment"] == adjustment)
    & (egrid_factors_combined["fuel_mix_source"] == source)
)
is_zero_carbon = egrid_factors_combined["fuel_category"].isin(efs.zero_carbon_fuels)
data_to_plot = egrid_factors_combined[matches_selection & ~is_zero_carbon]

# one box per fuel category and year
egrid_box = px.box(
    data_to_plot,
    x="fuel_category",
    y="emission_factor_lb_per_mwh",
    color="year",
    title=f"eGRID {pol}_{adjustment} factors for {source} regions",
)
egrid_box.update_yaxes(dtick=1000)


In [None]:
# Grouped bar chart of eGRID factors by fuel category for a single region,
# with one bar per year.
pol = "co2e"
adjustment = "for_electricity"
source = "ISO"
region = "ERCO"
data_to_plot = egrid_factors_combined[
    (egrid_factors_combined["pollutant"] == pol)
    & (egrid_factors_combined["emission_factor_adjustment"] == adjustment)
    & (egrid_factors_combined["fuel_mix_source"] == source)
    & (egrid_factors_combined["region"] == region)
    # the zero-carbon fuel list is only in scope through the `efs` module;
    # the bare name raised NameError on a fresh kernel
    & (~egrid_factors_combined["fuel_category"].isin(efs.zero_carbon_fuels))
]

px.bar(
    data_to_plot,
    x="fuel_category",
    y="emission_factor_lb_per_mwh",
    # cast year to string so plotly treats it as a discrete color group
    color=data_to_plot.year.astype(str),
    barmode="group",
    width=600,
    title=f"eGRID {pol}_{adjustment} factors for {source} {region}",
).update_yaxes(dtick=1000)


# Calculate annual-average OGE factors

In [None]:
# data years to build annual-average OGE factors for
years_to_load = [2019, 2020, 2021]

# regions requested for each ISO-based fuel mix source
oge_iso_regions = ["BPAT", "CISO", "ERCO", "ISNE", "MISO", "NYIS", "PJM", "SWPP"]
oge_isoh_regions = ["ERCO", "MISO"]

# factors for the EIA regions
oge_factors_eia = efs.create_oge_factors_for_eia_regions(years_to_load)

# factors for the "ISO" fuel mix source
oge_factors_iso = efs.create_oge_factors_for_iso_regions(
    iso_list=oge_iso_regions,
    years_to_load=years_to_load,
    fuel_mix_source="ISO",
)

# factors for the "ISO-H" fuel mix source
oge_factors_isoh = efs.create_oge_factors_for_iso_regions(
    iso_list=oge_isoh_regions,
    years_to_load=years_to_load,
    fuel_mix_source="ISO-H",
)


In [None]:
# column order used to sort the combined table
oge_sort_order = [
    "fuel_mix_source",
    "year",
    "region",
    "fuel_category",
    "pollutant",
    "emission_factor_adjustment",
    "emission_factor_source",
]

# stack the EIA, ISO, and ISO-H tables, cast year to int, round the factor
# to 3 decimals, then sort
oge_factors_combined = (
    pd.concat([oge_factors_eia, oge_factors_iso, oge_factors_isoh], axis=0)
    .assign(
        year=lambda df: df["year"].astype(int),
        emission_factor_lb_per_mwh=lambda df: df["emission_factor_lb_per_mwh"].round(3),
    )
    .sort_values(by=oge_sort_order)
)

oge_factors_combined


In [None]:
# check for missing factors for ISO or ISO-H regions since there is no backstop
is_iso_source = oge_factors_combined["fuel_mix_source"].isin(["ISO", "ISO-H"])
is_missing_factor = oge_factors_combined["emission_factor_lb_per_mwh"].isna()
oge_factors_combined[is_iso_source & is_missing_factor]

In [None]:
# identify where there are zero emission rates for non-clean fuels:
# co2/co2e rows, adjusted for electricity, with a factor of exactly 0,
# whose fuel category is not listed in efs.zero_carbon_fuels
is_carbon_pollutant = oge_factors_combined["pollutant"].isin(["co2", "co2e"])
is_for_electricity = (
    oge_factors_combined["emission_factor_adjustment"] == "for_electricity"
)
is_zero_rate = oge_factors_combined["emission_factor_lb_per_mwh"] == 0
is_zero_carbon_fuel = oge_factors_combined["fuel_category"].isin(
    efs.zero_carbon_fuels
)

oge_factors_combined[
    is_carbon_pollutant & is_for_electricity & is_zero_rate & ~is_zero_carbon_fuel
]


In [None]:
# run the anomalous-factor screen from `efs` on the combined OGE factors
# (filter_factor is passed through unchanged; see the helper for its meaning)
anomalous_screen = efs.screen_for_anomalous_factors(
    oge_factors_combined,
    filter_factor=3,
)
anomalous_screen


In [None]:
# write the annual-average OGE factors to the outputs folder, without the row index
oge_output_path = outputs_folder("oge_emission_factors.csv")
oge_factors_combined.to_csv(oge_output_path, index=False)

In [None]:
# stack the eGRID and OGE factor tables into one frame and export it
emission_factors = pd.concat([egrid_factors_combined, oge_factors_combined], axis=0)
emission_factors_path = outputs_folder("emission_factors.csv")
emission_factors.to_csv(emission_factors_path, index=False)

In [None]:
# selection shown in the chart: one pollutant, adjustment, fuel mix source and year
pol = "co2"
adj = "for_electricity"
f_source = "EIA"
year = 2020

matches_selection = (
    (emission_factors["pollutant"] == pol)
    & (emission_factors["emission_factor_adjustment"] == adj)
    & (emission_factors["fuel_mix_source"] == f_source)
    & (emission_factors["year"] == year)
)
# optional extra filter, currently disabled:
# & (emission_factors["region"].isin(["BPAT", "CISO", "ERCO", "ISNE", "MISO", "NYIS", "PJM", "SWPP"]))
is_zero_carbon = emission_factors["fuel_category"].isin(efs.zero_carbon_fuels)
to_plot = emission_factors[matches_selection & ~is_zero_carbon]

# one facet row per fuel category, bars grouped by emission factor source;
# matches=None gives every facet its own y-axis range
source_comparison = px.bar(
    to_plot,
    x="region",
    y="emission_factor_lb_per_mwh",
    color="emission_factor_source",
    facet_col="fuel_category",
    facet_col_wrap=1,
    height=800,
    barmode="group",
)
source_comparison.update_yaxes(matches=None)


In [None]:
# Compare factors from a single emission factor source across years,
# one facet row per fuel category.
pol = "co2"
adj = "for_electricity"
f_source = "EIA"
# NOTE(review): the month-hour cells later in this notebook label their source
# f"OGE v{oge_version_number}" (with a space); confirm "OGEv0.2.0" matches the
# labels actually present in `emission_factors`.
e_source = "OGEv0.2.0"

to_plot = emission_factors[
    (emission_factors["pollutant"] == pol)
    & (emission_factors["emission_factor_adjustment"] == adj)
    & (emission_factors["fuel_mix_source"] == f_source)
    & (emission_factors["emission_factor_source"] == e_source)
    #& (emission_factors["region"].isin(["BPAT", "CISO", "ERCO", "ISNE", "MISO", "NYIS", "PJM", "SWPP"]))
    & ~(emission_factors["fuel_category"].isin(efs.zero_carbon_fuels))
].assign(
    # cast year to string so plotly colors it as a discrete group; .assign
    # returns a new frame, so nothing is written onto a slice of
    # `emission_factors` (which triggered SettingWithCopyWarning)
    year=lambda df: df["year"].astype(str)
)

px.bar(
    to_plot,
    x="region",
    y="emission_factor_lb_per_mwh",
    color="year",
    facet_col="fuel_category",
    facet_col_wrap=1,
    height=800,
    barmode="group",
).update_yaxes(matches=None)


# Calculate Time-varying emission factors from OGE

### Create Emission factors for EIA-930 regions

In [None]:
years_to_load = [2019, 2020, 2021]

# Arguments passed to `calculate_month_hour_emission_factors` below. They were
# previously only defined in the ISO cell further down, so this cell failed
# with NameError on a fresh kernel.
# NOTE(review): values copied from the ISO cell, the only definition visible in
# this notebook; confirm the EIA regions should use the same settings.
local_tz_to_use = "timezone_local"
datetime_to_use = "datetime_utc"

# NOTE(review): `calculate_emissions_for_ba_year_fuel`,
# `calculate_month_hour_emission_factors`, `add_national_average_oge_values`,
# `format_df_for_output` and `oge_version_number` are not defined or imported in
# the visible part of this notebook; presumably they come from one of the
# project modules. Confirm and import/qualify them.

# get a list of all balancing areas for which there is data in EIA-930
ba_reference = load_data.load_ba_reference()
bas_in_eia930 = ba_reference[
    (~ba_reference["timezone_reporting_eia930"].isna())
    & (ba_reference["us_ba"] == "Yes")
    & (~(ba_reference["ba_category"] == "miscellaneous"))
    # drop BAs retired before the first data year; a missing retirement date
    # compares False and is therefore kept
    & (~(ba_reference["retirement_date"].dt.year < min(years_to_load)))
]
ba_list = list(bas_in_eia930.ba_code.unique())

oge_factors_eia = []
for ba in ba_list:
    print(ba)
    # load and concat multiple years of data
    ba_data = []
    for year in years_to_load:
        # plant -> fuel category mapping, using the EIA-930 fuel categories
        fuel_category_table = pd.read_csv(
            outputs_folder(f"{year}/plant_static_attributes_{year}.csv"),
            dtype=get_dtypes(),
        )[["plant_id_eia", "fuel_category_eia930"]].rename(
            columns={"fuel_category_eia930": "fuel_category"}
        )

        # start by loading data for a single BA
        try:
            ba_year_data = pd.read_csv(
                results_folder(f"{year}/plant_data/hourly/us_units/{ba}.csv")
            )

            # make sure the data frame is not empty
            if len(ba_year_data) > 0:
                ba_data.append(
                    calculate_emissions_for_ba_year_fuel(
                        ba, ba_year_data, fuel_category_table, year
                    )
                )
        except FileNotFoundError:
            # no hourly results file for this BA and year; skip it
            pass

    # let us know if there was not any data to concatenate. Checked explicitly
    # (instead of catching ValueError around the whole step) so that a
    # ValueError raised inside the helper below is not reported as "no data".
    if not ba_data:
        print(f"No data available for {ba}")
        continue

    # concat data for all years together
    ba_data = pd.concat(ba_data, axis=0)

    # for EIA regions the region label is the BA code itself
    region = ba

    ba_data = calculate_month_hour_emission_factors(
        region, ba, ba_data, local_tz_to_use, datetime_to_use, ba_reference
    )

    oge_factors_eia.append(ba_data)

oge_factors_eia = pd.concat(oge_factors_eia, axis=0)

# only keep data that are in the data years we want
oge_factors_eia = oge_factors_eia[oge_factors_eia["year"].isin(years_to_load)]

oge_factors_eia["emission_factor_source"] = f"OGE v{oge_version_number}"

oge_factors_eia = add_national_average_oge_values(oge_factors_eia)

oge_factors_eia = format_df_for_output(
    oge_factors_eia,
    fuel_mix_source="EIA",
    id_columns=["fuel_mix_source", "emission_factor_source", "region", "fuel_category", "year", "month", "hour"],
)


### Create Emission Factors for ISO Regions

In [None]:
years_to_load = [2019, 2020, 2021]
# NOTE(review): "ERCO-H" is processed here and labelled fuel_mix_source="ISO"
# below, while the annual-average cells use a separate "ISO-H" source for
# ERCO and MISO; confirm this is intended.
iso_list = ["BPAT", "CISO", "ERCO", "ERCO-H", "ISNE", "MISO", "NYIS", "PJM", "SWPP"]

ba_reference = load_data.load_ba_reference()

# arguments passed to `calculate_month_hour_emission_factors` below
local_tz_to_use = "timezone_local"
datetime_to_use = "datetime_utc"

# NOTE(review): `load_plant_fuel_category_for_iso`,
# `calculate_emissions_for_ba_year_fuel`, `calculate_month_hour_emission_factors`,
# `format_df_for_output` and `oge_version_number` are not defined or imported in
# the visible part of this notebook; confirm where they come from.

oge_factors_iso = []
for region in iso_list:
    print(region)

    # "ERCO-H" is a second fuel categorization of ERCO; its hourly data is read
    # from the ERCO file
    ba = "ERCO" if region == "ERCO-H" else region

    # load and concat multiple years of data
    ba_data = []
    for year in years_to_load:

        fuel_category_table = load_plant_fuel_category_for_iso(region, year)

        # start by loading data for a single BA
        try:
            ba_year_data = pd.read_csv(
                results_folder(f"{year}/plant_data/hourly/us_units/{ba}.csv")
            )
            # make sure the data frame is not empty
            if len(ba_year_data) > 0:
                ba_data.append(
                    calculate_emissions_for_ba_year_fuel(
                        region, ba_year_data, fuel_category_table, year
                    )
                )
        except FileNotFoundError:
            # no hourly results file for this BA and year; skip it
            pass

    # let us know if there was not any data to concatenate. Checked explicitly
    # (instead of catching ValueError around the whole step) so that a
    # ValueError raised inside the helper below is not reported as "no data".
    if not ba_data:
        # report the region label so "ERCO" and "ERCO-H" can be told apart
        print(f"No data available for {region}")
        continue

    # concat data for all years together
    ba_data = pd.concat(ba_data, axis=0)

    ba_data = calculate_month_hour_emission_factors(
        region, ba, ba_data, local_tz_to_use, datetime_to_use, ba_reference
    )

    oge_factors_iso.append(ba_data)

oge_factors_iso = pd.concat(oge_factors_iso, axis=0)

# only keep data that are in the data years we want
oge_factors_iso = oge_factors_iso[oge_factors_iso["year"].isin(years_to_load)]

oge_factors_iso["emission_factor_source"] = f"OGE v{oge_version_number}"

oge_factors_iso = format_df_for_output(
    oge_factors_iso,
    fuel_mix_source="ISO",
    id_columns=["fuel_mix_source", "emission_factor_source", "region", "fuel_category", "year", "month", "hour"],
)


In [None]:
# column order used to sort the combined month-hour table
month_hour_sort_order = [
    "fuel_mix_source",
    "year",
    "region",
    "fuel_category",
    "pollutant",
    "emission_factor_adjustment",
    "emission_factor_source",
    "month",
    "hour",
]

# stack the EIA and ISO tables, cast the time columns to int, round the factor
# to 3 decimals, then sort
oge_factors_combined = (
    pd.concat([oge_factors_eia, oge_factors_iso], axis=0)
    .astype({"year": int, "month": int, "hour": int})
    .assign(
        emission_factor_lb_per_mwh=lambda df: df["emission_factor_lb_per_mwh"].round(3)
    )
    .sort_values(by=month_hour_sort_order)
)

oge_factors_combined

In [None]:
# write the month-hour OGE factors to the outputs folder, without the row index
# NOTE(review): the annual-average OGE factors earlier in this notebook are
# written to this same path, so running this cell replaces that file; confirm
# this is intended or use a distinct file name (and update the read cell that
# follows accordingly).
month_hour_output_path = outputs_folder("oge_emission_factors.csv")
oge_factors_combined.to_csv(month_hour_output_path, index=False)

In [None]:
# load data if already created (replaces the in-memory frame with the CSV contents)
month_hour_input_path = outputs_folder("oge_emission_factors.csv")
oge_factors_combined = pd.read_csv(month_hour_input_path)
oge_factors_combined

## Plot Data

In [None]:
# compare month-hour averages across years
region = "CISO"
fuel_source = "ISO"
pollutant = "co2e"
adjustment = "for_electricity"

data_to_plot = oge_factors_combined[
    (oge_factors_combined["region"] == region)
    & (oge_factors_combined["fuel_mix_source"] == fuel_source)
    & (oge_factors_combined["pollutant"] == pollutant)
    & (oge_factors_combined["emission_factor_adjustment"] == adjustment)
    # the zero-carbon fuel list is only in scope through the `efs` module;
    # the bare name raised NameError on a fresh kernel
    & (~oge_factors_combined["fuel_category"].isin(efs.zero_carbon_fuels))
].copy()  # independent copy so the label column is not written onto a slice

# x-axis label of the form "<month> <hour>:00"
data_to_plot["month-hour"] = (
    data_to_plot["month"].astype(str) + " " + data_to_plot["hour"].astype(str) + ":00"
)

# one facet row per fuel category, one line per year; matches=None gives every
# facet its own y-axis range
px.line(
    data_to_plot,
    x="month-hour",
    y="emission_factor_lb_per_mwh",
    color="year",
    facet_col="fuel_category",
    facet_col_wrap=1,
    title=f"{fuel_source} {region} {pollutant}_{adjustment}",
    height=800,
).update_yaxes(matches=None)


In [None]:
# compare ISO and EIA categories
region = "SWPP"
pollutant = "co2"
adjustment = "for_electricity"

data_to_plot = oge_factors_combined[
    (oge_factors_combined["region"] == region)
    & (oge_factors_combined["pollutant"] == pollutant)
    & (oge_factors_combined["emission_factor_adjustment"] == adjustment)
    # the zero-carbon fuel list is only in scope through the `efs` module;
    # the bare name raised NameError on a fresh kernel
    & (~oge_factors_combined["fuel_category"].isin(efs.zero_carbon_fuels))
].copy()  # independent copy so the label column is not written onto a slice

# x-axis label of the form "<year>-<month> <hour>:00"
data_to_plot["date"] = (
    data_to_plot["year"].astype(str)
    + "-"
    + data_to_plot["month"].astype(str)
    + " "
    + data_to_plot["hour"].astype(str)
    + ":00"
)

# one facet row per fuel category, one line per fuel mix source; matches=None
# gives every facet its own y-axis range
px.line(
    data_to_plot,
    x="date",
    y="emission_factor_lb_per_mwh",
    color="fuel_mix_source",
    facet_col="fuel_category",
    facet_col_wrap=1,
    height=800,
).update_yaxes(matches=None)


## Explore Emission Rate Curves
We should test this both with CEMS-only data and with all plant data.

In [None]:
# balancing area code and fuel category that define the fleet to explore
# NOTE(review): both are empty strings here, so the plant filter below only
# matches rows whose ba_code and fuel_category equal ""; set them before running.
ba = ""
fuel = ""

# CEMS columns to read
cems_columns = [
    "plant_id_eia",
    "subplant_id",
    "datetime_utc",
    "gross_generation_mwh",
    "net_generation_mwh",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb_for_electricity",
]

all_data = []
for year in [2019, 2020, 2021]:
    plant_attributes = pd.read_csv(
        outputs_folder(f"{year}/plant_static_attributes_{year}.csv"),
        dtype=get_dtypes(),
    )[["plant_id_eia", "fuel_category", "ba_code"]]

    # identify all plants in the ba and fleet
    in_fleet = (plant_attributes["ba_code"] == ba) & (
        plant_attributes["fuel_category"] == fuel
    )
    relevant_plants = list(plant_attributes.loc[in_fleet, "plant_id_eia"].unique())

    # load cleaned CEMS data
    cems_year = pd.read_csv(
        outputs_folder(f"{year}/cems_subplant_{year}.csv"),
        usecols=cems_columns,
    )

    # keep only the CEMS rows for plants in the selected fleet
    all_data.append(cems_year[cems_year["plant_id_eia"].isin(relevant_plants)])

# stack all years into one frame
cems_data = pd.concat(all_data, axis=0)


In [None]:
# columns summed across all rows that share a datetime_utc
fleet_totals = [
    "gross_generation_mwh",
    "net_generation_mwh",
    "co2_mass_lb_for_electricity",
]
cems_fleet = cems_data.groupby(["datetime_utc"])[fleet_totals].sum()

# only keep data where net generation is at least 1
has_net_generation = cems_fleet["net_generation_mwh"] >= 1
cems_fleet = cems_fleet[has_net_generation]

# CO2 mass divided by net and by gross generation
cems_fleet["co2_rate_net"] = cems_fleet["co2_mass_lb_for_electricity"].div(
    cems_fleet["net_generation_mwh"]
)
cems_fleet["co2_rate_gross"] = cems_fleet["co2_mass_lb_for_electricity"].div(
    cems_fleet["gross_generation_mwh"]
)
cems_fleet

In [None]:
# scatter the fleet's gross CO2 rate against its net generation
px.scatter(
    cems_fleet,
    x="net_generation_mwh",
    y="co2_rate_gross",
)

In [None]:
def model_heat_input(df):
    """Fit a quadratic model of CO2 mass against net generation.

    Fits ``co2_mass_lb ~ net_generation_mwh + I(net_generation_mwh**2)`` with
    ``ols`` and returns the fitted parameters and fit statistics.

    Args:
        df: data passed to ``ols`` as ``data``; it must provide the columns
            named in the formula (``co2_mass_lb`` and ``net_generation_mwh``).

    Returns:
        tuple ``(a0, a1, a2, r2, nobs)``: the first three fitted parameters in
        the order the model reports them (presumably intercept, linear and
        quadratic coefficients), the model's ``rsquared`` and its ``nobs``.
    """
    # NOTE(review): `ols` is not imported in the visible part of this notebook;
    # presumably statsmodels.formula.api.ols. Confirm and add the import.
    hr_model = ols(
        "co2_mass_lb ~ net_generation_mwh + I(net_generation_mwh**2)",
        data=df,
    ).fit()

    # Use .iloc for positional access: integer keys on a label-indexed Series
    # (params[0]) are deprecated in pandas and treated as labels in newer versions.
    params = hr_model.params
    a0 = params.iloc[0]
    a1 = params.iloc[1]
    a2 = params.iloc[2]
    r2 = hr_model.rsquared
    nobs = hr_model.nobs

    return a0, a1, a2, r2, nobs


# keep only the columns used for the per-unit regression (this is a column
# selection; no plants are filtered out here)
# NOTE(review): the CEMS load cell earlier in this notebook reads `subplant_id`
# and `co2_mass_lb_for_electricity`, not `emissions_unit_id_epa` / `co2_mass_lb`,
# and `model_heat_input` regresses on `net_generation_mwh`, which is not selected
# here; confirm which frame this cell is meant to run on.
regression_columns = [
    "plant_id_eia",
    "emissions_unit_id_epa",
    "datetime_utc",
    "co2_mass_lb",
    "gross_generation_mwh",
]
data_for_ci_calc = cems_data[regression_columns]

# fit one model per (plant, emissions unit); each result is a tuple
unit_keys = ["plant_id_eia", "emissions_unit_id_epa"]
hr_regression = data_for_ci_calc.groupby(unit_keys).apply(model_heat_input)

# expand the tuples into named columns, keeping the group keys as columns
coefficient_names = ["a0", "a1", "a2", "rsquared", "observations"]
gross_ci = pd.DataFrame(
    hr_regression.tolist(),
    index=hr_regression.index,
    columns=coefficient_names,
).reset_index()

gross_ci
