In [None]:
# import packages
import pandas as pd
import numpy as np
import os
import plotly.express as px

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import download_data
import load_data
from column_checks import get_dtypes
from filepaths import *
import impute_hourly_profiles
import data_cleaning
import output_data
import emissions
import validation
import gross_to_net_generation
import eia930
from logging_util import get_logger, configure_root_logger
# set up logging through the helpers imported from the project's logging_util module
configure_root_logger()
logger = get_logger("test")


# data year for this notebook; path_prefix is the matching "<year>/" folder prefix
year = 2022
path_prefix = f"{year}/"

In [None]:
import sqlalchemy as sa

# connect to the local copy of the PUDL SQLite database
pudl_db = "sqlite:///../../data/downloads/pudl/pudl_data/sqlite/pudl.sqlite"
pudl_engine = sa.create_engine(pudl_db)

# read the annual EIA-860 tables and the entity tables in full,
# one query per table, in the order listed
gens_eia860, plants_eia860, plants_entity, generators_entity = (
    pd.read_sql(table_name, pudl_engine)
    for table_name in (
        "generators_eia860",
        "plants_eia860",
        "plants_entity_eia",
        "generators_entity_eia",
    )
)

In [None]:
# utility entity table, read in full from the same PUDL connection
utilities_entity = pd.read_sql(sql="utilities_entity_eia", con=pudl_engine)

In [None]:
# Build one wide table of generator attributes: annual generator records joined
# to the annual plant records and then to the generator and plant entity tables.
# When a column exists on both sides, the left-hand column keeps its name and
# the right-hand duplicate is suffixed with "_remove" so it can be dropped below.
eia_gens = gens_eia860.merge(
    plants_eia860,
    how="left",
    on=["report_date", "plant_id_eia"],
    suffixes=(None, "_remove"),
    validate="m:1",
)

eia_gens = eia_gens.merge(
    generators_entity,
    how="left",
    on=["plant_id_eia", "generator_id"],
    suffixes=(None, "_remove"),
    validate="m:1",
)

eia_gens = eia_gens.merge(
    plants_entity,
    how="left",
    on=["plant_id_eia"],
    suffixes=(None, "_remove"),
    validate="m:1",
)

# keep only the most recent entry for each generator
eia_gens = eia_gens.sort_values(
    by=["plant_id_eia", "generator_id", "report_date"], ascending=True
)
eia_gens = eia_gens.drop_duplicates(
    subset=["plant_id_eia", "generator_id"], keep="last"
)

# remove duplicate columns from the plant df
eia_gens = eia_gens[[col for col in eia_gens.columns if "_remove" not in col]]
eia_gens = eia_gens[
    [
        "plant_id_eia",
        "plant_name_eia",
        "generator_id",
        "report_date",
        "rto_iso_lmp_node_id",
        "transmission_distribution_owner_name",
        "transmission_distribution_owner_state",
        "balancing_authority_code_eia",
        "balancing_authority_name_eia",
        "iso_rto_code",
        "nerc_region",
        "latitude",
        "longitude",
        "street_address",
        "city",
        "county",
        "state",
        "zip_code",
        "capacity_mw",
        "grid_voltage_1_kv",
        "prime_mover_code",
        "energy_source_code_1",
        "energy_source_code_2",
        "multiple_fuels",
        "cofire_fuels",
        "switch_oil_gas",
        "energy_storage",
        "generator_operating_date",
        "current_planned_generator_operating_date",
        "operational_status",
        "generator_retirement_date",
    ]
]

# NOTE(review): this list is not applied anywhere in this cell; it is kept only
# because other cells may reference it. Confirm whether a state-level filter
# was also intended.
states_to_remove = ["AK","HI","WA","OR","CA","ID","NV","AZ","UT","WY","CO","ME","NH",'RI']

# to filter this a bit, get rid of generators in WECC and Texas (TRE) and those
# with an unknown region. These are NERC region codes, so they are compared
# against `nerc_region`; compared against `state` (presumably two-letter state
# abbreviations, as in states_to_remove) they never match and nothing is removed.
# NOTE(review): Alaska and Hawaii are only dropped here if their nerc_region is
# one of these codes - confirm.
eia_gens = eia_gens[~eia_gens["nerc_region"].isin(["UNK", "TRE", "WECC"])]


eia_gens.to_csv(
    "../../../carbon-flow-analysis/MISO/data/manual/eia860_export.csv", index=False
)

In [None]:
# export for Ryan: the generator table again, this time with utility
# attributes joined on as well
eia_gens = (
    gens_eia860.merge(
        plants_eia860,
        on=["report_date", "plant_id_eia"],
        how="left",
        validate="m:1",
        suffixes=(None, "_remove"),
    )
    .merge(
        generators_entity,
        on=["plant_id_eia", "generator_id"],
        how="left",
        validate="m:1",
        suffixes=(None, "_remove"),
    )
    .merge(
        plants_entity,
        on=["plant_id_eia"],
        how="left",
        validate="m:1",
        suffixes=(None, "_remove"),
    )
    .merge(
        utilities_entity,
        on=["utility_id_eia"],
        how="left",
        validate="m:1",
        suffixes=(None, "_remove"),
    )
    # oldest report first, so keep="last" retains the latest record per generator
    .sort_values(by=["plant_id_eia", "generator_id", "report_date"], ascending=True)
    .drop_duplicates(subset=["plant_id_eia", "generator_id"], keep="last")
)

# throw away the right-hand duplicates that the merges tagged with "_remove"
eia_gens = eia_gens[
    [name for name in eia_gens.columns if not ("_remove" in name)]
]

# final column selection and ordering for the export
eia_gens = eia_gens.loc[
    :,
    [
        "plant_id_eia",
        "plant_name_eia",
        "generator_id",
        "operational_status",
        "generator_operating_date",
        "current_planned_generator_operating_date",
        "generator_retirement_date",
        "capacity_mw",
        "technology_description",
        "energy_source_code_1",
        "energy_source_code_2",
        "multiple_fuels",
        "cofire_fuels",
        "switch_oil_gas",
        "energy_storage",
        "utility_id_eia",
        "utility_name_eia",
        "transmission_distribution_owner_name",
        "balancing_authority_code_eia",
        "balancing_authority_name_eia",
        "iso_rto_code",
        "latitude",
        "longitude",
        "street_address",
        "city",
        "county",
        "state",
        "zip_code",
    ],
]



In [None]:
# write the generator export next to the notebook, without the index column
eia_gens.to_csv(path_or_buf="eia860_export_for_eadm.csv", index=False)

### Explore emission rates for MISO FOTM data

In [None]:
# Year of OGE results summarised in this cell; it drives both the input paths
# and the "year" column of the output table.
oge_year = 2021

plants = pd.read_csv(
    results_folder(f"{oge_year}/plant_data/annual/us_units/plant_data.csv")
)
attributes = pd.read_csv(
    results_folder(f"{oge_year}/plant_data/plant_static_attributes.csv")
)[["plant_id_eia", "ba_code", "state", "fuel_category", "plant_primary_fuel"]]
miso_region_ef = plants.merge(attributes, how="left", on="plant_id_eia")

# only keep plants in miso; take a copy so the column assignments below act on
# an independent frame rather than on a slice of the merged data
miso_region_ef = miso_region_ef[miso_region_ef["ba_code"] == "MISO"].copy()

# map states to miso regions
# NOTE: a state that is not listed here gets a missing region, and the groupby
# below leaves rows with a missing key out of the totals
state_to_region = {
    "AR": "SOUTH",
    "IA": "NORTH",
    "IL": "CENTRAL",
    "IN": "CENTRAL",
    "KY": "CENTRAL",
    "LA": "SOUTH",
    "MI": "CENTRAL",
    "MN": "NORTH",
    "MO": "CENTRAL",
    "MS": "SOUTH",
    "MT": "NORTH",
    "ND": "NORTH",
    "SD": "NORTH",
    "TX": "SOUTH",
    "WI": "NORTH",
    np.nan: "NORTH",  # currently the two plants with missing states are in north
}
miso_region_ef["miso_region"] = miso_region_ef["state"].map(state_to_region)

# update fuel category maps to reflect categories used by MISO
fuel_category_update = {"biomass": "other", "petroleum": "other", "waste": "other"}
miso_region_ef["fuel_category"] = miso_region_ef["fuel_category"].replace(
    fuel_category_update
)

# add a category for storage
miso_region_ef.loc[
    miso_region_ef["plant_primary_fuel"] == "MWH", "fuel_category"
] = "storage"

# group by fuel and region
miso_region_ef = miso_region_ef.groupby(["fuel_category", "miso_region"])[
    [
        "net_generation_mwh",
        "co2_mass_lb_for_electricity",
        "co2e_mass_lb_for_electricity",
        "nox_mass_lb_for_electricity",
        "so2_mass_lb_for_electricity",
        "co2_mass_lb_for_electricity_adjusted",
        "co2e_mass_lb_for_electricity_adjusted",
        "nox_mass_lb_for_electricity_adjusted",
        "so2_mass_lb_for_electricity_adjusted",
    ]
].sum()

# add backstop data for no_region: totals per fuel across all regions
no_region = (
    miso_region_ef.reset_index()
    .groupby(["fuel_category"])
    .sum(numeric_only=True)
    .reset_index()
)
no_region["miso_region"] = "NO_REGION"
no_region = no_region.set_index(["fuel_category", "miso_region"])
miso_region_ef = pd.concat([miso_region_ef, no_region], axis=0)

# emission rate = emitted mass / net generation, for each pollutant and variant
for pol in ["co2", "co2e", "nox", "so2"]:
    for pol_type in ["for_electricity", "for_electricity_adjusted"]:
        miso_region_ef[f"generated_{pol}_rate_lb_per_mwh_{pol_type}"] = (
            miso_region_ef[f"{pol}_mass_lb_{pol_type}"]
            / miso_region_ef["net_generation_mwh"]
        )

data_cols = list(miso_region_ef.columns)

# replace clean fuels with zero emission rate
miso_region_ef.loc[
    miso_region_ef.index.get_level_values(0).isin(
        ["hydro", "nuclear", "solar", "storage", "wind"]
    ),
    data_cols,
] = 0

# only keep rate columns
miso_region_ef = miso_region_ef.filter(like="generated_")

miso_region_ef = miso_region_ef.reset_index()

# add missing rows for South Wind and Storage, but only when the data did not
# already produce them, so a (fuel, region) pair can never appear twice
backstop_rows = pd.DataFrame(
    {
        "fuel_category": ["storage", "wind"],
        "miso_region": ["SOUTH", "SOUTH"],
    }
)
existing_pairs = set(
    zip(miso_region_ef["fuel_category"], miso_region_ef["miso_region"])
)
backstop_rows = backstop_rows.loc[
    [
        pair not in existing_pairs
        for pair in zip(backstop_rows["fuel_category"], backstop_rows["miso_region"])
    ]
]
if not backstop_rows.empty:
    miso_region_ef = pd.concat([miso_region_ef, backstop_rows], axis=0)
# missing rates (the added rows, or 0/0 divisions) are reported as zero
miso_region_ef = miso_region_ef.fillna(0)


# change to long format
miso_region_ef = miso_region_ef.melt(
    id_vars=["fuel_category", "miso_region"],
    var_name="column_name",
    value_name="emission_factor_lb_per_mwh",
)

# column names look like "generated_<pollutant>_rate_...", so the pollutant is
# the second "_"-separated token
miso_region_ef["pollutant"] = miso_region_ef["column_name"].str.split("_", expand=True)[
    1
]
miso_region_ef = miso_region_ef.assign(
    emission_factor_adjustment=lambda x: np.where(
        x.column_name.str.contains("_adjusted"),
        "for_electricity_adjusted",
        "for_electricity",
    )
)

miso_region_ef["emission_factor_lb_per_mwh"] = miso_region_ef[
    "emission_factor_lb_per_mwh"
].round(1)

miso_region_ef["year"] = oge_year
miso_region_ef["fuel_mix_source"] = "ISO"
miso_region_ef["emission_factor_source"] = "OGE"

miso_region_ef = miso_region_ef[
    [
        "year",
        "fuel_mix_source",
        "miso_region",
        "fuel_category",
        "emission_factor_source",
        "pollutant",
        "emission_factor_adjustment",
        "emission_factor_lb_per_mwh",
    ]
]

miso_region_ef = miso_region_ef.sort_values(
    by=[
        "miso_region",
        "fuel_category",
    ]
)

miso_region_ef

In [None]:
# save the regional emission factors next to the notebook, without the index column
miso_region_ef.to_csv(path_or_buf="miso_regional_ef.csv", index=False)

# eGRID factors

In [None]:
# NOTE: this cell calls `load_egrid_plant_data_for_year` and
# `remove_anomalous_plant_values`, which are defined in a later cell of this
# notebook; on a fresh kernel that cell has to be run before this one.

# load energy source group mapping and merge with egrid
eia_fuel_categories = pd.read_csv(
    manual_folder("energy_source_groups.csv"), dtype=get_dtypes()
)[["energy_source_code", "fuel_category_eia930"]].rename(
    columns={"fuel_category_eia930": "fuel_category"}
)

egrid_years = [2019, 2020, 2021]

# map states to miso regions (identical for every year, so built once)
# NOTE: a state that is not listed here gets a missing region, and the groupby
# below leaves rows with a missing key out of the totals
state_to_region = {
    "AR": "SOUTH",
    "IA": "NORTH",
    "IL": "CENTRAL",
    "IN": "CENTRAL",
    "KY": "CENTRAL",
    "LA": "SOUTH",
    "MI": "CENTRAL",
    "MN": "NORTH",
    "MO": "CENTRAL",
    "MS": "SOUTH",
    "MT": "NORTH",
    "ND": "NORTH",
    "SD": "NORTH",
    "TX": "SOUTH",
    "WI": "NORTH",
    np.nan: "NORTH",  # currently the two plants with missing states are in north
}

# update fuel category maps to reflect categories used by MISO
fuel_category_update = {"biomass": "other", "petroleum": "other", "waste": "other"}

egrid_plant_all = []
# the loop variable is deliberately not named `year`, so the notebook-level
# `year` assigned in the first cell is not overwritten by running this cell
for egrid_year in egrid_years:

    # load the plant data
    egrid_plant = load_egrid_plant_data_for_year(
        egrid_year, ipcc_version="AR4", gwp_horizon=100
    )

    # Assign each plant to a fuel category
    egrid_plant = egrid_plant.merge(
        eia_fuel_categories,
        how="left",
        left_on="plant_primary_fuel",
        right_on="energy_source_code",
        validate="m:1",
    )

    egrid_plant = remove_anomalous_plant_values(egrid_plant)

    # only keep plants in miso
    miso_region_ef = egrid_plant[egrid_plant["ba_code"] == "MISO"].copy()

    miso_region_ef["miso_region"] = miso_region_ef["state"].map(state_to_region)

    miso_region_ef["fuel_category"] = miso_region_ef["fuel_category"].replace(
        fuel_category_update
    )

    # add a category for storage
    miso_region_ef.loc[
        miso_region_ef["plant_primary_fuel"] == "MWH", "fuel_category"
    ] = "storage"

    # group by fuel and region
    miso_region_ef = miso_region_ef.groupby(["fuel_category", "miso_region"])[
        [
            "net_generation_mwh",
            "co2_mass_lb_for_electricity",
            "co2e_mass_lb_for_electricity",
            "nox_mass_lb_for_electricity",
            "so2_mass_lb_for_electricity",
            "co2_mass_lb_for_electricity_adjusted",
            "co2e_mass_lb_for_electricity_adjusted",
            "nox_mass_lb_for_electricity_adjusted",
            "so2_mass_lb_for_electricity_adjusted",
        ]
    ].sum()

    # add backstop data for no_region: totals per fuel across all regions
    no_region = (
        miso_region_ef.reset_index()
        .groupby(["fuel_category"])
        .sum(numeric_only=True)
        .reset_index()
    )
    no_region["miso_region"] = "NO_REGION"
    no_region = no_region.set_index(["fuel_category", "miso_region"])
    miso_region_ef = pd.concat([miso_region_ef, no_region], axis=0)

    # emission rate = emitted mass / net generation, per pollutant and variant
    for pol in ["co2", "co2e", "nox", "so2"]:
        for pol_type in ["for_electricity", "for_electricity_adjusted"]:
            miso_region_ef[f"generated_{pol}_rate_lb_per_mwh_{pol_type}"] = (
                miso_region_ef[f"{pol}_mass_lb_{pol_type}"]
                / miso_region_ef["net_generation_mwh"]
            )

    data_cols = list(miso_region_ef.columns)

    # replace clean fuels with zero emission rate
    miso_region_ef.loc[
        miso_region_ef.index.get_level_values(0).isin(
            ["hydro", "nuclear", "solar", "storage", "wind"]
        ),
        data_cols,
    ] = 0

    # only keep rate columns
    miso_region_ef = miso_region_ef.filter(like="generated_")

    miso_region_ef = miso_region_ef.reset_index()

    # add missing rows for South Wind and Storage, but only when this year's
    # data did not already produce them, so a (fuel, region) pair can never
    # appear twice
    backstop_rows = pd.DataFrame(
        {
            "fuel_category": ["storage", "wind"],
            "miso_region": ["SOUTH", "SOUTH"],
        }
    )
    existing_pairs = set(
        zip(miso_region_ef["fuel_category"], miso_region_ef["miso_region"])
    )
    backstop_rows = backstop_rows.loc[
        [
            pair not in existing_pairs
            for pair in zip(
                backstop_rows["fuel_category"], backstop_rows["miso_region"]
            )
        ]
    ]
    if not backstop_rows.empty:
        miso_region_ef = pd.concat([miso_region_ef, backstop_rows], axis=0)
    # missing rates (the added rows, or 0/0 divisions) are reported as zero
    miso_region_ef = miso_region_ef.fillna(0)

    # change to long format
    miso_region_ef = miso_region_ef.melt(
        id_vars=["fuel_category", "miso_region"],
        var_name="column_name",
        value_name="emission_factor_lb_per_mwh",
    )

    # column names look like "generated_<pollutant>_rate_...", so the pollutant
    # is the second "_"-separated token
    miso_region_ef["pollutant"] = miso_region_ef["column_name"].str.split("_", expand=True)[
        1
    ]
    miso_region_ef = miso_region_ef.assign(
        emission_factor_adjustment=lambda x: np.where(
            x.column_name.str.contains("_adjusted"),
            "for_electricity_adjusted",
            "for_electricity",
        )
    )

    miso_region_ef["emission_factor_lb_per_mwh"] = miso_region_ef[
        "emission_factor_lb_per_mwh"
    ].round(1)

    miso_region_ef["year"] = egrid_year
    miso_region_ef["fuel_mix_source"] = "ISO"
    miso_region_ef["emission_factor_source"] = f"eGRID{egrid_year}"

    miso_region_ef = miso_region_ef[
        [
            "year",
            "fuel_mix_source",
            "miso_region",
            "fuel_category",
            "emission_factor_source",
            "pollutant",
            "emission_factor_adjustment",
            "emission_factor_lb_per_mwh",
        ]
    ]

    miso_region_ef = miso_region_ef.sort_values(
        by=[
            "miso_region",
            "fuel_category",
        ]
    )

    # fold the region into the fuel category label ("<REGION>_<fuel>") and drop
    # the separate region column
    miso_region_ef["fuel_category"] = miso_region_ef["miso_region"] + "_" + miso_region_ef["fuel_category"]
    miso_region_ef = miso_region_ef.drop(columns=["miso_region"])

    egrid_plant_all.append(miso_region_ef)

# stack the per-year tables into one frame
miso_region_ef = pd.concat(egrid_plant_all, axis=0)

In [None]:
# label every row with the region identifier "MISO-H"; this adds (or overwrites)
# the column on the existing miso_region_ef frame in place
miso_region_ef['region'] = "MISO-H"

In [None]:
# save the current miso_region_ef table next to the notebook, without the index column
miso_region_ef.to_csv(path_or_buf="miso_region_ef.csv", index=False)

In [None]:
# display the current contents of miso_region_ef as the cell output
miso_region_ef

In [None]:
def load_egrid_plant_data_for_year(year, ipcc_version, gwp_horizon):
    """Load one year of eGRID plant data and add electricity-allocated emissions.

    The plant file is read with `validation.load_egrid_plant_file`, then this
    function adds:

    * `plant_id_eia`: the eGRID plant id translated through the manual
      eGRID-to-EIA crosswalk, falling back to the eGRID id when the crosswalk
      has no (non-missing) entry for it
    * `<pol>_mass_lb_for_electricity` for co2, ch4, n2o, nox and so2: the
      reported mass multiplied by `chp_electric_allocation_factor`
    * `co2e_mass_lb_for_electricity`: co2 plus GWP-weighted ch4 and n2o
    * `year` and `emission_factor_source` labels

    Args:
        year: eGRID data year to load.
        ipcc_version: value matched against the `ipcc_version` column of the
            table returned by `load_data.load_ipcc_gwp()` (e.g. "AR4").
        gwp_horizon: value matched against the `gwp_horizon` column of the
            same table (e.g. 100).

    Returns:
        The eGRID plant DataFrame with the columns described above added.

    Raises:
        ValueError: from `.item()` if the GWP table does not contain exactly
            one row for the requested version, horizon and gas.
    """
    egrid_plant = validation.load_egrid_plant_file(year)

    # add a column for eia plant id
    # create a map of eia plant ids to egrid plant ids
    egrid_crosswalk = pd.read_csv(
        manual_folder("eGRID2020_crosswalk_of_EIA_ID_to_EPA_ID.csv")
    )
    egrid_to_eia_id = dict(
        zip(egrid_crosswalk["plant_id_egrid"], egrid_crosswalk["plant_id_eia"])
    )
    # Translate ids through the crosswalk and keep the eGRID id where there is
    # no translation. This is assigned as a whole column rather than through
    # `egrid_plant["plant_id_eia"].update(...)`: an in-place update on a column
    # selection does not reach the parent frame under pandas Copy-on-Write.
    # The cast keeps the id column's original dtype, as the in-place update did.
    egrid_id = egrid_plant["plant_id_egrid"]
    egrid_plant["plant_id_eia"] = (
        egrid_id.map(egrid_to_eia_id).fillna(egrid_id).astype(egrid_id.dtype)
    )

    # if there is a missing value for electric allocation factor, fill with 100%
    egrid_plant["chp_electric_allocation_factor"] = egrid_plant[
        "chp_electric_allocation_factor"
    ].fillna(1)

    # calculate _for_electricity values
    for pol in ["co2", "ch4", "n2o", "nox", "so2"]:
        egrid_plant[f"{pol}_mass_lb_for_electricity"] = (
            egrid_plant[f"{pol}_mass_lb"]
            * egrid_plant["chp_electric_allocation_factor"]
        )

    # egrid 2018-2020 uses the AR4 GWP
    # code adapted from `emissions.calculate_co2e_mass()`
    df_gwp = load_data.load_ipcc_gwp()
    gwp_to_use = df_gwp[df_gwp.ipcc_version == ipcc_version]
    ch4_gwp = gwp_to_use.loc[
        (gwp_to_use.gwp_horizon == gwp_horizon) & (gwp_to_use.gas == "ch4"),
        "gwp",
    ].item()
    n2o_gwp = gwp_to_use.loc[
        (gwp_to_use.gwp_horizon == gwp_horizon) & (gwp_to_use.gas == "n2o"),
        "gwp",
    ].item()

    # missing ch4 / n2o masses count as zero; a missing co2 mass still yields a
    # missing co2e value
    egrid_plant["co2e_mass_lb_for_electricity"] = (
        egrid_plant["co2_mass_lb_for_electricity"]
        + (ch4_gwp * egrid_plant["ch4_mass_lb_for_electricity"].fillna(0))
        + (n2o_gwp * egrid_plant["n2o_mass_lb_for_electricity"].fillna(0))
    )

    egrid_plant["year"] = year
    # the 2018 data is labelled with a "v2" suffix
    if year == 2018:
        egrid_plant["emission_factor_source"] = f"eGRID{year}v2"
    else:
        egrid_plant["emission_factor_source"] = f"eGRID{year}"

    return egrid_plant

def remove_anomalous_plant_values(df, co2_factor_threshold=20000):
    """Remove plants whose CO2 factor is at or above a threshold (lb/MWh).

    The factor is `co2_mass_lb_for_electricity / net_generation_mwh`. Rows whose
    factor is `>= co2_factor_threshold` (this includes an infinite factor from
    non-zero emissions over zero generation) are printed and removed. Rows
    whose factor cannot be computed (NaN, e.g. 0/0 or missing inputs) are not
    treated as anomalous and are always kept.

    Args:
        df: plant-level DataFrame with at least `co2_mass_lb_for_electricity`
            and `net_generation_mwh`; when anomalies are found it must also
            have `plant_id_eia`, `ba_code`, `fuel_category` and `year`, which
            are used in the printed report.
        co2_factor_threshold: factor (lb/MWh) at or above which a plant is
            dropped. Defaults to 20,000.

    Returns:
        A new DataFrame without the anomalous rows. The input frame is not
        modified and no helper column is left behind on either frame.
    """
    co2_factor = df["co2_mass_lb_for_electricity"] / df["net_generation_mwh"]
    # comparisons with NaN are False, so un-computable factors are never flagged
    is_anomalous = co2_factor >= co2_factor_threshold

    if is_anomalous.any():
        print("Anomalous plant data detected. Removing from data.")
        # attach the factor only to the report, not to the caller's frame
        anomalies = df.loc[is_anomalous].assign(co2_factor=co2_factor[is_anomalous])
        print(
            anomalies[
                [
                    "plant_id_eia",
                    "ba_code",
                    "fuel_category",
                    "year",
                    "co2_factor",
                    "co2_mass_lb_for_electricity",
                    "net_generation_mwh",
                ]
            ].sort_values(
                by=[
                    "year",
                    "ba_code",
                    "fuel_category",
                    "plant_id_eia",
                ]
            )
        )

    # negate the anomaly mask (instead of keeping `factor < threshold`) so rows
    # with a NaN factor are retained whether or not any anomaly was found
    return df.loc[~is_anomalous].copy()