# About this notebook

The EPA's power sector data crosswalk is an important input for this data pipeline. However, it is incomplete and, in some cases, contains incorrect mappings. Ideally, we would contribute fixes directly to the crosswalk's GitHub repository, but that code is written in R rather than Python. As a temporary measure, we explore the crosswalk output here and post-process it to fix some of these issues.

In [None]:
# import packages
import pandas as pd
import numpy as np
import os

%reload_ext autoreload
%autoreload 2

# Tell Python where to look for modules.
import sys

sys.path.append("../../src/")


import oge.load_data as load_data
from oge.filepaths import *
import oge.data_cleaning as data_cleaning

# Data year to examine; passed to the CEMS loader, used to build the subplant
# crosswalk path, and passed to the year-specific PUDL table loads below.
year = 2023

### Identify CEMS Units that are not mapped to EIA

In [None]:
# load cems data
cems = load_data.load_cems_data(year)
# remove non-grid connected plants
# NOTE(review): the False flags presumably mean steam-only and
# distribution-connected plants are kept -- confirm against
# data_cleaning.remove_plants
cems = data_cleaning.remove_plants(
    cems,
    non_grid_connected=True,
    remove_states=["PR"],
    steam_only_plants=False,
    distribution_connected_plants=False,
)

# load the unit to generator mapping from the subplant crosswalk
# NOTE: you will need to run this in the pipeline first
# The generator and unit IDs are read explicitly as strings. With pandas' default
# type inference, IDs that look numeric (e.g. "1") can be parsed as integers, which
# would not match the IDs in CEMS (assumed to be strings -- TODO confirm) and would
# make correctly mapped units show up as unmapped below.
crosswalk_columns = ["plant_id_epa", "generator_id", "emissions_unit_id_epa"]
generator_unit_map = pd.read_csv(
    outputs_folder(f"{year}/subplant_crosswalk_{year}.csv.zip"),
    usecols=crosswalk_columns,
    dtype={"generator_id": str, "emissions_unit_id_epa": str},
)[crosswalk_columns]

# sum operational data by unit and merge the generator map
cems_units = (
    cems.groupby(["plant_id_epa", "plant_id_eia", "emissions_unit_id_epa"])[
        [
            "gross_generation_mwh",
            "fuel_consumed_mmbtu",
            "steam_load_1000_lb",
            "co2_mass_lb",
        ]
    ]
    .sum()
    .reset_index()
)
# left merge keeps every CEMS unit; units missing from the crosswalk end up with a
# missing generator_id, which is what the following cells look for
cems_units = cems_units.merge(
    generator_unit_map, how="left", on=["plant_id_epa", "emissions_unit_id_epa"]
)

In [None]:
# units without a generator match that still report positive gross generation
is_unmapped = cems_units["generator_id"].isna()
has_generation = cems_units["gross_generation_mwh"].gt(0)
cems_units.loc[is_unmapped & has_generation]

In [None]:
# units without a generator match whose summed gross generation is exactly zero
is_unmapped = cems_units["generator_id"].isna()
no_generation = cems_units["gross_generation_mwh"].eq(0)
cems_units.loc[is_unmapped & no_generation]

### Investigate matches

In [None]:
# load data that will help with exploration

# generator attributes from EIA-860 for the selected year
gens860 = load_data.load_pudl_table(
    "core_eia860__scd_generators",
    year,
    columns=[
        "plant_id_eia",
        "generator_id",
        "operational_status",
        "current_planned_generator_operating_date",
        "prime_mover_code",
        "energy_source_code_1",
        "energy_source_code_2",
    ],
)
# plant names and locations
plants_entity_eia = load_data.load_pudl_table(
    "core_eia__entity_plants",
    columns=[
        "plant_id_eia",
        "plant_name_eia",
        "street_address",
        "city",
        "county",
        "state",
    ],
)
# boiler-generator associations
# This table is loaded without a year argument, so the same
# (plant, generator, boiler) association can presumably appear more than once.
# Exact duplicates are dropped here so the left merge below does not multiply the
# generator rows.
gens_to_boilers = load_data.load_pudl_table(
    "core_eia860__assn_boiler_generator",
    columns=["plant_id_eia", "generator_id", "boiler_id"],
).drop_duplicates()

# attach plant attributes and boiler IDs to each generator; a generator linked to
# several boilers appears once per boiler
gens860 = gens860.merge(plants_entity_eia, how="left", on="plant_id_eia").merge(
    gens_to_boilers, how="left", on=["plant_id_eia", "generator_id"]
)

# total the monthly generation and fuel values for each
# generator / prime mover / energy source combination
gen_fuel_allocated = (
    load_data.load_pudl_table(
        "out_eia923__monthly_generation_fuel_by_generator_energy_source", year
    )
    .groupby(
        ["plant_id_eia", "generator_id", "prime_mover_code", "energy_source_code"]
    )[["net_generation_mwh", "fuel_consumed_mmbtu"]]
    .sum()
    .reset_index()
)

#### Match Emission Unit to Boiler

In [None]:
def find_boiler_id_matches(units, generators):
    """Find unmapped CEMS units whose unit ID equals a boiler ID at the same plant.

    The search runs inside a function so that its working variables stay local.
    In particular it does not overwrite the notebook-level `plant_id` used by the
    plant-specific cells further down.

    Args:
        units: DataFrame with `plant_id_eia`, `emissions_unit_id_epa` and
            `generator_id` columns. Rows with a missing `generator_id` are
            treated as unmapped.
        generators: DataFrame with `plant_id_eia` and `boiler_id` columns.

    Returns:
        dict keyed by `(plant_id_eia, emissions_unit_id_epa)`. Each value holds the
        de-duplicated rows of `generators` with the same plant ID and a
        `boiler_id` equal to the unit ID. Units with no such rows are omitted.
    """
    unmapped = units[units["generator_id"].isna()]
    matches = {}
    for unit_plant_id, unit_id in zip(
        unmapped["plant_id_eia"], unmapped["emissions_unit_id_epa"]
    ):
        candidates = generators[
            (generators["plant_id_eia"] == unit_plant_id)
            & (generators["boiler_id"] == unit_id)
        ].drop_duplicates(keep="first")
        if not candidates.empty:
            matches[(unit_plant_id, unit_id)] = candidates
    return matches


match = find_boiler_id_matches(cems_units, gens860)
match

#### Look at Specific Plant

In [None]:
# EIA plant ID inspected by the cells in this section
plant_id = 55386

# CEMS units reported under this plant
cems_units.loc[cems_units["plant_id_eia"].eq(plant_id)]

In [None]:
# generator rows (with plant and boiler info) for the selected plant
gens860.loc[gens860["plant_id_eia"].eq(plant_id)]

In [None]:
# distinct generator IDs and boiler IDs listed for the selected plant
plant_gens = gens860[gens860["plant_id_eia"] == plant_id]
display(plant_gens["generator_id"].unique())
display(plant_gens["boiler_id"].unique())

In [None]:
# look and see if there are plants with similar names in the same county
# sometimes plants split off
in_county = gens860["county"] == "Calhoun"
# na=False: rows with a missing plant name count as non-matches instead of
# putting missing values into the boolean mask
similar_name = gens860["plant_name_eia"].str.contains("Columbia", na=False)
gens860[in_county & similar_name]

In [None]:
# all generator rows whose plant is located in the city of interest
gens860.loc[gens860["city"].eq("Gaston")]

In [None]:
# generation and fuel totals for the selected plant
gen_fuel_allocated.loc[gen_fuel_allocated["plant_id_eia"].eq(plant_id)]