# Compare Subplant Crosswalk Across Years 

In [None]:
import os
import pandas as pd

from oge.filepaths import outputs_folder

## Load Data from S3
Drop rows with NAs in `subplant_id`, `plant_id_eia`, or `generator_id` so that `subplant_id` can be compared across years for the same (`plant_id_eia`, `generator_id`) combination.

In [None]:
# Select the S3 data store before any output path is resolved.
# NOTE(review): presumably `outputs_folder` consults OGE_DATA_STORE when it is
# called rather than at import time — confirm.
os.environ["OGE_DATA_STORE"] = "s3"


def _read_subplant_ids(year):
    """Return the `subplant_id` series from one year's subplant crosswalk.

    The crosswalk CSV for `year` is read, rows with a missing `plant_id_eia`,
    `generator_id` or `subplant_id` are dropped, and the remaining
    `subplant_id` values are indexed by (`plant_id_eia`, `generator_id`).
    """
    key_columns = ["plant_id_eia", "generator_id"]
    crosswalk = pd.read_csv(outputs_folder(f"{year}/subplant_crosswalk_{year}.csv"))
    complete_rows = crosswalk.dropna(axis=0, subset=key_columns + ["subplant_id"])
    return complete_rows.set_index(key_columns)["subplant_id"]


# One series per data year, 2019 through 2022 inclusive.
subplant = {year: _read_subplant_ids(year) for year in range(2019, 2023)}

Look at the length of each series. It should differ from year to year as generators come online or are retired. Here it appears to increase from one year to the next.

In [None]:
# Number of rows in each year's series, in the order the years were loaded.
list(map(len, subplant.values()))

## Look at Difference

In [None]:
# Years actually loaded into `subplant`; deriving the year pairs from its keys
# keeps this cell in sync with the load cell if the year range there changes.
years = sorted(subplant)

# For every (earlier, later) pair of years, keep the (plant_id_eia,
# generator_id) combinations present in both years whose subplant_id differs.
# Keys are "<earlier>-<later>" strings, e.g. "2021-2022".
mismatch = {
    f"{i}-{j}": subplant[i]
    .reset_index()
    # Exact duplicate (plant, generator, subplant) rows add no information, but
    # repeated merge keys multiply rows in the merge and would inflate the
    # mismatch counts, so collapse them on both sides first.
    .drop_duplicates()
    .merge(
        subplant[j].reset_index().drop_duplicates(),
        on=["plant_id_eia", "generator_id"],
        how="inner",  # only generators present in both years can be compared
        suffixes=[f"_{i}", f"_{j}"],
    )
    .query(f"subplant_id_{i} - subplant_id_{j} != 0")
    .set_index(["plant_id_eia", "generator_id"])
    for pos, i in enumerate(years)
    for j in years[pos + 1 :]
}

In [None]:
# Report, for each pair of years, how many rows of the comparison table have a
# subplant_id that differs between the two years.
header = "Number of difference in subplant_id for same (plant_id_eia, generator_id) combination"
print(header)
for year_pair in mismatch:
    print(f"{year_pair}: {len(mismatch[year_pair])}")

In [None]:
# Show the (plant_id_eia, generator_id) combinations present in both 2021 and
# 2022 whose subplant_id differs between the two years.
mismatch["2021-2022"]