# About this notebook

The EPA's power sector data crosswalk is an important input for this data pipeline. However, it is incomplete and in some cases has incorrect crosswalks. Ideally, we would contribute directly to the github repository, but the code is written in R and not python. As a temporary measure, we want to explore the outputted crosswalk and post-process it to fix some of these issues.

In [1]:
# import packages
import pandas as pd
import numpy as np
import os

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys

sys.path.append("../../src/")


import oge.load_data as load_data
from oge.filepaths import *
import oge.data_cleaning as data_cleaning

year = 2023



### Identify CEMS Units that are not mapped to EIA

In [2]:
# load cems data
cems = load_data.load_cems_data(year)
# remove non-grid connected plants
cems = data_cleaning.remove_plants(
    cems,
    non_grid_connected=True,
    remove_states=["PR"],
    steam_only_plants=False,
    distribution_connected_plants=False,
)

# load the unit to generator mapping from the subplant crosswalk
# NOTE: you will need to run this in the pipeline first
generator_unit_map = pd.read_csv(
    outputs_folder(f"{year}/subplant_crosswalk_{year}.csv.zip"),
)[["plant_id_epa", "generator_id", "emissions_unit_id_epa"]]

# sum operational data by unit and merge the generator map
cems_units = (
    cems.groupby(["plant_id_epa", "plant_id_eia", "emissions_unit_id_epa"])[
        [
            "gross_generation_mwh",
            "fuel_consumed_mmbtu",
            "steam_load_1000_lb",
            "co2_mass_lb",
        ]
    ]
    .sum()
    .reset_index()
)
cems_units = cems_units.merge(
    generator_unit_map, how="left", on=["plant_id_epa", "emissions_unit_id_epa"]
)

2024-11-19 15:48:58 [INFO] oge.oge.data_cleaning:677 Removing 41 plants that are not grid-connected
2024-11-19 15:49:05 [INFO] oge.oge.data_cleaning:641 Removing 1 plants located in the following states: ['PR']


In [3]:
# identify units that are not mapped to a generator and have non-zero generation
cems_units[cems_units["generator_id"].isna() & (cems_units["gross_generation_mwh"] > 0)]

Unnamed: 0,plant_id_epa,plant_id_eia,emissions_unit_id_epa,gross_generation_mwh,fuel_consumed_mmbtu,steam_load_1000_lb,co2_mass_lb,generator_id
12,3,3,8,283475.0,1746748.0,0.0,207614400.0,
31,47,47,CCT10,13866.0,132644.6,0.0,15765640.0,
32,47,47,CCT11,17421.0,169668.1,0.0,20166700.0,
40,47,47,CCT9,15559.0,151002.0,0.0,17948300.0,
50,56,56,CC1,1754974.0,10871310.0,0.0,1292129000.0,
81,136,136,CT1,3069932.0,19363740.0,0.0,2301511000.0,
82,136,136,CT2,3088501.0,19527600.0,0.0,2320999000.0,
207,492,492,A1,23414.0,231120.1,0.0,27773950.0,
208,492,492,A2,26680.0,256672.8,0.0,30706140.0,
209,492,492,A3,21594.0,205127.7,0.0,24572860.0,


In [4]:
# identify units that are not mapped to a generator and have zero generation
cems_units[
    cems_units["generator_id"].isna() & (cems_units["gross_generation_mwh"] == 0)
]

Unnamed: 0,plant_id_epa,plant_id_eia,emissions_unit_id_epa,gross_generation_mwh,fuel_consumed_mmbtu,steam_load_1000_lb,co2_mass_lb,generator_id
745,1702,1702,A,0.0,3.825856e+04,34735.0,4.536492e+06,
746,1702,1702,B,0.0,5.631849e+04,33188.0,6.679372e+06,
1074,2503,2503,BLR114,0.0,1.797387e+06,1245005.0,0.000000e+00,
1075,2503,2503,BLR115,0.0,1.138404e+06,807743.0,0.000000e+00,
1076,2503,2503,BLR116,0.0,1.041446e+05,84140.0,0.000000e+00,
...,...,...,...,...,...,...,...,...
3973,55308,55308,A,0.0,2.856989e+05,339814.0,0.000000e+00,
3974,55308,55308,B,0.0,3.011653e+05,365662.0,0.000000e+00,
4113,55386,55386,B-1,0.0,3.275864e+05,227945.0,0.000000e+00,
4114,55386,55386,B-2,0.0,3.989732e+05,285374.0,0.000000e+00,


### Investigate matches

In [5]:
# load data that will help with exploration
gens860 = load_data.load_pudl_table(
    "core_eia860__scd_generators",
    year,
    columns=[
        "plant_id_eia",
        "generator_id",
        "operational_status",
        "current_planned_generator_operating_date",
        "prime_mover_code",
        "energy_source_code_1",
        "energy_source_code_2",
    ],
)
plants_entity_eia = load_data.load_pudl_table(
    "core_eia__entity_plants",
    columns=[
        "plant_id_eia",
        "plant_name_eia",
        "street_address",
        "city",
        "county",
        "state",
    ],
)
gens_to_boilers = load_data.load_pudl_table(
    "core_eia860__assn_boiler_generator",
    columns=["plant_id_eia", "generator_id", "boiler_id"],
)

gens860 = gens860.merge(plants_entity_eia, how="left", on="plant_id_eia").merge(
    gens_to_boilers, how="left", on=["plant_id_eia", "generator_id"]
)

gen_fuel_allocated = (
    load_data.load_pudl_table(
        "out_eia923__monthly_generation_fuel_by_generator_energy_source", year
    )
    .groupby(
        ["plant_id_eia", "generator_id", "prime_mover_code", "energy_source_code"]
    )[["net_generation_mwh", "fuel_consumed_mmbtu"]]
    .sum()
    .reset_index()
)



#### Match Emission Unit to Boiler

In [None]:
match = {}
for i in cems_units[cems_units["generator_id"].isna()].index:
    plant_id = cems_units.loc[i, "plant_id_eia"]
    emission_unit_id = cems_units.loc[i, "emissions_unit_id_epa"]
    df = gens860.query(
        "plant_id_eia == @plant_id and boiler_id == @emission_unit_id"
    ).drop_duplicates(keep="first")
    if not df.empty:
        match[(plant_id, emission_unit_id)] = df

match

{(3,
  '8'):         plant_id_eia generator_id operational_status current_planned_generator_operating_date prime_mover_code energy_source_code_1 energy_source_code_2 plant_name_eia    street_address   city  county state boiler_id
 177086             3         A3ST           existing                                      NaT               CA                   NG                 <NA>          Barry  North Highway 43  Bucks  Mobile    AL         8
 177088             3         A3C1           existing                                      NaT               CT                   NG                 <NA>          Barry  North Highway 43  Bucks  Mobile    AL         8,
 (136,
  'CT1'):         plant_id_eia generator_id operational_status current_planned_generator_operating_date prime_mover_code energy_source_code_1 energy_source_code_2 plant_name_eia    street_address     city  county state boiler_id
 175751           136           ST           existing                                      NaT   

#### Look at Specific Plant

In [7]:
plant_id = 55386

cems_units[cems_units["plant_id_eia"] == plant_id]

Unnamed: 0,plant_id_epa,plant_id_eia,emissions_unit_id_epa,gross_generation_mwh,fuel_consumed_mmbtu,steam_load_1000_lb,co2_mass_lb,generator_id
4113,55386,55386,B-1,0.0,327586.4,227945.0,0.0,
4114,55386,55386,B-2,0.0,398973.2,285374.0,0.0,
4115,55386,55386,B-3,0.0,712933.0,561154.0,0.0,
4116,55386,55386,CT-1,1437698.0,10809940.0,0.0,1285204000.0,CT1
4117,55386,55386,CT-1,1437698.0,10809940.0,0.0,1285204000.0,ST1
4118,55386,55386,CT-2,1109065.0,8471390.0,0.0,1008796000.0,CT2
4119,55386,55386,CT-2,1109065.0,8471390.0,0.0,1008796000.0,ST1


In [8]:
gens860[gens860["plant_id_eia"] == plant_id]

Unnamed: 0,plant_id_eia,generator_id,operational_status,current_planned_generator_operating_date,prime_mover_code,energy_source_code_1,energy_source_code_2,plant_name_eia,street_address,city,county,state,boiler_id
42579,55386,ST1,existing,NaT,CA,NG,,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB1
42580,55386,ST1,existing,NaT,CA,NG,,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB2
42581,55386,ST1,existing,NaT,CA,NG,,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB1
42582,55386,ST1,existing,NaT,CA,NG,,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB2
42583,55386,ST1,existing,NaT,CA,NG,,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
42704,55386,CT1,existing,NaT,CT,NG,DFO,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB2
42705,55386,CT1,existing,NaT,CT,NG,DFO,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,CT1
42706,55386,CT1,existing,NaT,CT,NG,DFO,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,CT2
42707,55386,CT1,existing,NaT,CT,NG,DFO,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB1


In [9]:
display(gens860[gens860["plant_id_eia"] == plant_id]["generator_id"].unique())
display(gens860[gens860["plant_id_eia"] == plant_id]["boiler_id"].unique())

<StringArray>
['ST1', 'CT2', 'CT1']
Length: 3, dtype: string

<StringArray>
['DB1', 'DB2', 'CT1', 'CT2']
Length: 4, dtype: string

In [None]:
# look and see if there are plants with similar names in the same county
# sometimes plants split off
gens860[
    (gens860["county"] == "Calhoun")
    & gens860["plant_name_eia"].str.contains("Columbia")
]

Unnamed: 0,plant_id_eia,generator_id,operational_status,current_planned_generator_operating_date,prime_mover_code,energy_source_code_1,energy_source_code_2,plant_name_eia,street_address,city,county,state,boiler_id
42579,55386,ST1,existing,NaT,CA,NG,,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB1
42580,55386,ST1,existing,NaT,CA,NG,,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB2
42581,55386,ST1,existing,NaT,CA,NG,,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB1
42582,55386,ST1,existing,NaT,CA,NG,,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB2
42583,55386,ST1,existing,NaT,CA,NG,,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
42704,55386,CT1,existing,NaT,CT,NG,DFO,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB2
42705,55386,CT1,existing,NaT,CT,NG,DFO,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,CT1
42706,55386,CT1,existing,NaT,CT,NG,DFO,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,CT2
42707,55386,CT1,existing,NaT,CT,NG,DFO,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB1


In [11]:
gens860[(gens860["city"] == "Gaston")]

Unnamed: 0,plant_id_eia,generator_id,operational_status,current_planned_generator_operating_date,prime_mover_code,energy_source_code_1,energy_source_code_2,plant_name_eia,street_address,city,county,state,boiler_id
928,67161,OAK,existing,NaT,PV,SUN,,Oak Solar,146 Summit Trail,Gaston,Northampton,NC,
4865,64167,GEN,proposed,2026-06-01,PV,SUN,,Hornet Solar,0 Old Lowesville Rd,Gaston,Gaston,NC,
6078,63235,OCPV1,existing,NaT,PV,SUN,,O'Neill Creek Solar,Old Highway 47,Gaston,Washington,OR,
8251,61530,GEN1,existing,NaT,PV,SUN,,Gaston Solar I - SC,260 George Derrick Road,Gaston,Lexington,SC,
8922,61031,PV1,existing,NaT,PV,SUN,,Floyd Road Solar Farm,4361 NC 46 HW,Gaston,Northampton,NC,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
42704,55386,CT1,existing,NaT,CT,NG,DFO,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB2
42705,55386,CT1,existing,NaT,CT,NG,DFO,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,CT1
42706,55386,CT1,existing,NaT,CT,NG,DFO,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,CT2
42707,55386,CT1,existing,NaT,CT,NG,DFO,Columbia Energy Center,100 Calpine Way,Gaston,Calhoun,SC,DB1


In [12]:
gen_fuel_allocated[gen_fuel_allocated["plant_id_eia"] == plant_id]

Unnamed: 0,plant_id_eia,generator_id,prime_mover_code,energy_source_code,net_generation_mwh,fuel_consumed_mmbtu
21263,55386,CT1,CT,DFO,4962.89,25831.0
21264,55386,CT1,CT,NG,1545602.0,8750625.5
21265,55386,CT2,CT,DFO,115.5981,25831.0
21266,55386,CT2,CT,NG,197614.4,8750625.5
21267,55386,ST1,CA,DFO,2008.272,0.0
21268,55386,ST1,CA,NG,715601.7,131322.0
