# Evaluate 930 timestamps and interchanges

### Generation: 

Check lagged correlation (-12 to +12 hours) between 930 fossil generation and CEMS fossil generation for each BA over different time bounds: 
* 2019, 2020, 2021, 2022, 2023, 2024
* Daylight savings vs. non-daylight savings

Run for each of the raw, shifted, and shifted + basic and rolling filtered 930 data. When shifts are correct, the best correlation in the shifted data should be at lag=0. The best correlation in the non-shifted data can indicate what shift might be appropriate. Manual inspection is required to actually decide whether and how much to lag by. 

We run with the rolling-filtered data because in some cases large errors can cause anomalous best correlations in the shifted but not filtered 930 data. 

Note: for correct timestamps, demand data in non-shifted data will be best correlated at lag=-1 because 930 uses end-of-hour timestamps while CEMS uses start-of-hour. 

### Interchange: 

Check lagged correlations between pairs of BAs with shared interchange. If timestamps are consistent, the best correlation should be at lag=0. 

We also check the sign of the best correlation between paired BAs: if they're not negatively correlated, one of the signs may be incorrect. 

### Edge cases: 

In some BAs, the shifted data still shows a best correlation at lag != 0, but inspection of the data doesn't show an obvious fix. In these cases, we do nothing and rely on `gridemissions` to make the data consistent. 




In [None]:
import numpy as np
import pandas as pd
import re

import plotly.express as px
import plotly.graph_objects as go

In [None]:
%reload_ext autoreload
%autoreload 2

from oge.column_checks import get_dtypes
from oge.filepaths import *
import oge.eia930 as eia930

In [None]:
# EIA-930 fossil generation by BA, before ("raw") and after ("shifted") the manual
# timestamp adjustments.
# Note: this is very slow! (~2min per year) because it's pivoting large files.
# Only thermal fuels (coal, natural gas, petroleum) are kept in the EIA-930 data so
# that it is comparable with CEMS.


def _fossil_generation_by_ba(long_930, target_year):
    """Sum COL/NG/OIL generation per (datetime_utc, BA), keeping only *target_year*."""
    thermal = long_930[long_930["fuel"].isin(["COL", "NG", "OIL"])]
    thermal = thermal.rename(columns={"UTC Time at End of Hour": "datetime_utc"})
    totals = thermal.groupby(["datetime_utc", "BA"]).sum()["generation"].reset_index()
    return totals[totals["datetime_utc"].dt.year == target_year]


lraw = []
lshift = []

for year in [2019, 2020, 2021, 2022, 2023, 2024]:
    print(year)

    r = eia930.convert_balance_data_to_gridemissions_format(year)

    # The manual adjustment is applied to the frame before it is reformatted.
    s = eia930.reformat_chalendar(eia930.manual_930_adjust(r))
    r = eia930.reformat_chalendar(r)

    s = _fossil_generation_by_ba(s, year)
    r = _fossil_generation_by_ba(r, year)

    lraw.append(r)
    lshift.append(s)

In [None]:
# Stack the per-year frames into one long frame each
raw, shifted = (pd.concat(yearly_frames, axis=0) for yearly_frames in (lraw, lshift))

In [None]:
# EIA-930 data after timestamp shifting and the rolling filter, one file per year.
# Each file is trimmed to rows whose index falls in its own year before stacking.
all_rolled = []
for y in range(2019, 2025):
    rolled_path = f"{data_folder()}/outputs/{y}/eia930/eia930_rolling.csv"
    rolled_year = pd.read_csv(rolled_path, index_col=0, parse_dates=True)
    all_rolled.append(rolled_year[rolled_year.index.year == y])

rolled_930 = pd.concat(all_rolled)
rolled_930 = eia930.reformat_chalendar(rolled_930)

In [None]:
# Keep only thermal fuels, then total EIA-930 per timestamp and BA
is_thermal = rolled_930["fuel"].isin(["COL", "NG", "OIL"])
rolled_930 = rolled_930[is_thermal]
rolled_930 = rolled_930.groupby(["datetime_utc", "BA"]).sum().reset_index()

In [None]:
# Load cleaned CEMS files, one per year.
# Each year is aggregated by BA as soon as it is loaded to cut down on space, and
# the yearly aggregates are concatenated once at the end rather than growing a
# DataFrame inside the loop (which re-copies all earlier years on every pass).
cems_by_year = []
for y in [2019, 2020, 2021, 2022, 2023, 2024]:
    print(f"loading {y}")
    file = f"{data_folder()}/outputs/{y}/cems_cleaned_{y}.csv.zip"
    plant_meta = pd.read_csv(
        f"{data_folder()}/outputs/{y}/plant_static_attributes_{y}.csv.zip"
    )
    c = pd.read_csv(file, index_col=0, parse_dates=["datetime_utc"], low_memory=False)
    # The CEMS index (first CSV column) is matched against plant_id_eia to attach
    # each plant's primary fuel and BA.
    c = c.merge(
        plant_meta[["plant_id_eia", "plant_primary_fuel", "ba_code"]],
        how="left",
        left_index=True,
        right_on="plant_id_eia",
    )
    # Exclude solar
    c = c[c["plant_primary_fuel"] != "SUN"]
    c = c[["datetime_utc", "ba_code", "gross_generation_mwh"]]

    print("Aggregating")
    # NOTE(review): gross generation is relabelled as "net_generation_mwh" for every
    # year; the rest of the notebook relies on that column name.
    c = c.rename(columns={"gross_generation_mwh": "net_generation_mwh"})
    cems_aggregated = (
        c.groupby(["datetime_utc", "ba_code"]).sum()["net_generation_mwh"].reset_index()
    )
    cems_by_year.append(cems_aggregated)

cems = pd.concat(cems_by_year)

In [None]:
# Static plant attributes for the most recent year, read with the project dtypes
year = 2024
plant_attributes_path = (
    f"{data_folder()}/outputs/{year}/plant_static_attributes_{year}.csv.zip"
)
plant_attributes = pd.read_csv(plant_attributes_path, dtype=get_dtypes())

# Correlation 

In [None]:
# BAs present in both the raw EIA-930 data and CEMS
bas = set(raw["BA"].unique()) & set(cems["ba_code"].unique())

In [None]:
# Report how well the two BA lists overlap and which BAs appear in only one source
ba_930 = set(raw["BA"].unique())
ba_cems = set(cems["ba_code"].unique())

print(
    f"shared BAs: {len(bas)} out of {len(ba_930)} 930 BAs and "
    f"{len(ba_cems)} CEMS BAs."
)

missing_cems = ba_930 - ba_cems
missing_930 = ba_cems - ba_930
print(f"930 BAs missing in CEMS: {missing_cems}")
print(f"CEMS missing 930: {missing_930}")

In [None]:
def _drop_missing_and_zero(series):
    """Return only the entries of *series* that are neither NaN nor zero."""
    return series[series.notna() & (series != 0)]


def find_best_cor(cems, df_eia930):
    """Find, per BA and calendar month, the lag that best aligns EIA-930 with CEMS.

    Args:
        cems: long-format frame with columns ``datetime_utc``, ``ba_code`` and
            ``net_generation_mwh``, one row per (timestamp, BA).
        df_eia930: long-format frame with columns ``datetime_utc``, ``BA`` and
            ``generation``, one row per (timestamp, BA).

        ``datetime_utc`` must be datetime-like in both frames: the month is read
        from it and lags are applied to it as whole hours.

    Returns:
        DataFrame indexed by BA (sorted, so the row order is reproducible) with
        MultiIndex columns ``(month, lag)``. For every month name there is one
        column per lag in -12..12 holding the correlation between CEMS at time t
        and EIA-930 at time ``t - lag`` hours, plus a ``"best"`` column holding
        the lag with the highest correlation, or NaN when that correlation is not
        above 0.5. BAs with no computable value at all are dropped.
    """
    cems = cems.pivot(
        columns="ba_code", index="datetime_utc", values="net_generation_mwh"
    )
    df_eia930 = df_eia930.pivot(columns="BA", index="datetime_utc", values="generation")

    bas = sorted(set(cems.columns).intersection(df_eia930.columns))

    # Create multi-level columns: month names x (lags + best)
    month_names = [
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    ]
    lags = list(range(-12, 13))  # -12 to 12 inclusive
    multi_cols = pd.MultiIndex.from_product(
        [month_names, lags + ["best"]], names=["month", "lag"]
    )

    correlations = pd.DataFrame(index=bas, columns=multi_cols, dtype=float)
    one_hour = pd.Timedelta(hours=1)

    for ba in bas:
        for month_idx, month_name in enumerate(month_names, 1):
            # Usable (present and non-zero) values of each series for this month
            valid_cems = _drop_missing_and_zero(
                cems.loc[cems.index.month == month_idx, ba]
            )
            valid_eia930 = _drop_missing_and_zero(
                df_eia930.loc[df_eia930.index.month == month_idx, ba]
            )

            # Need at least 3 data points on each side for a correlation
            if len(valid_cems) < 3 or len(valid_eia930) < 3:
                continue

            month_correlations = {}
            for lag in lags:
                # Move the timestamps rather than the row positions: rows with
                # missing/zero values were removed above, so a positional shift
                # would no longer correspond to `lag` hours across those gaps.
                shifted_eia930 = pd.Series(
                    valid_eia930.to_numpy(),
                    index=valid_eia930.index + lag * one_hour,
                )
                common_idx = valid_cems.index.intersection(shifted_eia930.index)
                if len(common_idx) < 3:
                    continue

                cems_aligned = valid_cems.loc[common_idx]
                eia930_aligned = shifted_eia930.loc[common_idx]

                # Correlation is undefined for a constant series
                if cems_aligned.std() == 0 or eia930_aligned.std() == 0:
                    continue

                try:
                    corr_val = cems_aligned.corr(eia930_aligned)
                except (ValueError, ZeroDivisionError, RuntimeWarning):
                    # Skip this lag if calculation fails
                    print(f"Skipping {lag}h lag for {ba} in {month_name}")
                    continue

                # Only store if correlation is valid
                if pd.notna(corr_val) and not np.isinf(corr_val):
                    correlations.loc[ba, (month_name, lag)] = corr_val
                    month_correlations[lag] = corr_val

            # Best lag for this month; ties resolve to the smallest lag
            if month_correlations:
                best_lag = max(month_correlations, key=month_correlations.get)
                correlations.loc[ba, (month_name, "best")] = (
                    best_lag if month_correlations[best_lag] > 0.5 else np.nan
                )

    correlations.dropna(how="all", inplace=True)
    return correlations

In [None]:
# find_best_cor pivots on (datetime_utc, ba_code), so each pair must appear only once
cems = cems.drop_duplicates(subset=["datetime_utc", "ba_code"])

In [None]:
# Best lag per BA and month, year by year, for the shifted EIA-930 data
year_dfs = {}
for year in range(2019, 2025):
    print(f"### Year: {year} ###")
    cems_in_year = cems[cems["datetime_utc"].dt.year == year]
    eia930_in_year = shifted[shifted["datetime_utc"].dt.year == year]
    all_lags = find_best_cor(cems_in_year, eia930_in_year)
    year_dfs[str(year)] = all_lags.xs("best", level="lag", axis=1)

cems_930_cors = pd.concat(year_dfs, axis="columns")

# Keep only BAs with at least one best lag that is neither 0 nor missing
uninformative = (cems_930_cors.isna() | (cems_930_cors == 0.0)).all(axis=1)
filtered_cors = cems_930_cors.loc[~uninformative]
filtered_cors.to_csv(f"{data_folder()}/outputs/2024/cems_shifted_eia930_lags.csv")

In [None]:
# Best lag per BA and month, year by year, for the raw (unshifted) EIA-930 data
year_dfs = {}
for year in range(2019, 2025):
    print(f"### Year: {year} ###")
    cems_in_year = cems[cems["datetime_utc"].dt.year == year]
    eia930_in_year = raw[raw["datetime_utc"].dt.year == year]
    all_lags = find_best_cor(cems_in_year, eia930_in_year)
    year_dfs[str(year)] = all_lags.xs("best", level="lag", axis=1)

cems_930_cors = pd.concat(year_dfs, axis="columns")

# Keep only BAs with at least one best lag that is neither -1 nor missing
uninformative = (cems_930_cors.isna() | (cems_930_cors == -1.0)).all(axis=1)
filtered_cors = cems_930_cors.loc[~uninformative]
filtered_cors.to_csv(f"{data_folder()}/outputs/2024/cems_raw_eia930_lags.csv")

In [None]:
# Best lag per BA and month, year by year, for the shifted + rolling-filtered data.
# Unlike the raw/shifted summaries, every BA is written out (no row filtering).
year_dfs = {}
for year in range(2019, 2025):
    print(f"### Year: {year} ###")
    cems_in_year = cems[cems["datetime_utc"].dt.year == year]
    eia930_in_year = rolled_930[rolled_930["datetime_utc"].dt.year == year]
    all_lags = find_best_cor(cems_in_year, eia930_in_year)
    year_dfs[str(year)] = all_lags.xs("best", level="lag", axis=1)

cems_930_cors = pd.concat(year_dfs, axis="columns")

cems_930_cors.to_csv(f"{data_folder()}/outputs/2024/cems_rolled_eia930_lags.csv")

In [None]:
# Plot CEMS against the adjusted EIA-930 series for one BA, for manual inspection
ba = "AVA"

to_plot_930 = shifted.loc[shifted["BA"] == ba].groupby("datetime_utc").sum()

print(f"correlations for {ba}")
print(cems_930_cors.loc[ba])

cems_for_ba = cems.loc[cems["ba_code"] == ba]
cems_trace = go.Scatter(
    x=cems_for_ba["datetime_utc"],
    y=cems_for_ba["net_generation_mwh"],
    name="CEMS",
)
eia930_trace = go.Scatter(
    x=to_plot_930.index,
    y=to_plot_930["generation"],
    name="Adjusted EIA-930",
)

fig = go.Figure()
fig.add_trace(cems_trace)
fig.add_trace(eia930_trace)
fig.update_layout(title=ba, xaxis_title="Date", yaxis_title="Generation")

# Interchange

In [None]:
# Read each year's raw EIA-930 file and keep only the rows stamped within that year
interchanges = []
for year in range(2019, 2025):
    raw_930_path = f"{data_folder()}/outputs/{year}/eia930/eia930_raw.csv"
    interchange = pd.read_csv(raw_930_path, index_col=0, parse_dates=True)
    interchange = interchange[interchange.index.year == year]
    interchanges.append(interchange)

In [None]:
# One frame covering all years, rows stacked in the order the years were loaded
interchange = pd.concat(interchanges, axis="index")

In [None]:
# Every BA named in a column; the BA is the second token once the column name is
# split on "-" and "."
bas930 = {
    tokens[1]
    for tokens in (re.split(r"[-.]", column) for column in interchange.columns)
}

In [None]:
def interchange_cor(
    interchange, interchange_cors: "dict | None" = None, file="", name: str = "cors"
):
    """Find the lag that best aligns each pair of mirrored interchange series.

    For every BA in the module-level ``bas930`` set and every neighbour it reports
    interchange with, the series ``EBA.{other}-{ba}.ID.H`` is correlated with the
    negated, lagged series ``EBA.{ba}-{other}.ID.H`` for lags -12..11 (rows are
    shifted by position), and the lag with the largest absolute correlation is
    recorded.

    Args:
        interchange: frame whose columns are EIA-930 series named like
            ``EBA.{ba}-{other}.ID.H``.
        interchange_cors: dict of per-BA result frames to extend. It is updated
            in place and returned. A fresh dict is created when omitted.
        file: optional path prefix. When non-empty, ``{file}.md`` is overwritten
            with one markdown table per BA and ``{file}_{ba}.csv`` is written for
            each BA.
        name: column label under which this run's best lags are stored.

    Returns:
        dict mapping BA -> DataFrame indexed by neighbouring BA, with one column of
        best lags per call that used the dict.
    """
    # A `{}` default would be created once and shared by every call that omits the
    # argument, silently mixing results from unrelated runs.
    if interchange_cors is None:
        interchange_cors = {}

    if file != "":
        # Start the markdown report from scratch
        with open(file + ".md", "w") as hs:
            hs.write("\n\n")

    # Sorted so that the report sections and dict keys come out in a stable order
    for ba in sorted(bas930):
        print(ba, end="...")
        other_cols = [
            c
            for c in interchange.columns
            if re.split(r"[-.]", c)[1] == ba and re.split(r"[-.]", c)[2] != "ALL"
        ]
        other_bas = [re.split(r"[-.]", c)[2] for c in other_cols]

        out = pd.DataFrame(index=other_bas, columns=range(-12, 12), dtype=float)
        for o_ba in out.index:
            this_way = f"EBA.{o_ba}-{ba}.ID.H"
            other_way = f"EBA.{ba}-{o_ba}.ID.H"
            if (
                other_way not in interchange.columns
                or this_way not in interchange.columns
            ):
                continue

            this_series = interchange[this_way]
            other_series = interchange[other_way]

            # Skip pairs that never report a non-zero value at the same time.
            # NOTE(review): this mask only gates the pair; the correlations below
            # still use every row, zeros included.
            both_reported = (
                this_series.notna()
                & other_series.notna()
                & (this_series != 0)
                & (other_series != 0)
            )
            if not both_reported.any():
                continue

            for lag in out.columns:
                out.loc[o_ba, lag] = abs(
                    this_series.corr(-1 * other_series.shift(lag))
                )

        out.dropna(how="all", inplace=True)
        best = out.idxmax(skipna=True, axis=1).rename("best")

        out = pd.concat([best, out], axis="columns")

        if file != "":
            # add new lines for proper markdown syntax
            with open(file + ".md", "a") as hs:
                hs.write(f"\n\n# {ba}\n\n")

            out.to_markdown(file + ".md", mode="a")

            out.to_csv(f"{file}_{ba}" + ".csv")

        interchange_cors[ba] = pd.concat(
            [interchange_cors.get(ba, pd.DataFrame()), out["best"].rename(name)],
            axis="columns",
        )

    return interchange_cors

In [None]:
int_cors = {}
for year in [2019, 2020, 2021, 2022, 2023, 2024]:
    # Select on the index year so the whole calendar year is used, including the
    # last days of December.
    int_cors = interchange_cor(
        interchange[interchange.index.year == year],
        int_cors,
        name=str(year),
    )

int_cors = interchange_cor(interchange, int_cors, name="all_years")
# April-September and November-February stand in for daylight-saving and standard
# time; March and October are in neither subset.
int_cors = interchange_cor(
    interchange[(interchange.index.month >= 4) & (interchange.index.month <= 9)],
    int_cors,
    name="daylight savings",
)
int_cors = interchange_cor(
    interchange[(interchange.index.month >= 11) | (interchange.index.month <= 2)],
    int_cors,
    name="standard time",
)

In [None]:
# Inspect interchange correlations for one BA: rows are the neighbouring BAs and
# each column holds the best lag found by one interchange_cor run (labelled by the
# `name` passed to that run).
int_cors["PJM"]

In [None]:
# Output to md file because that's an easy way to manually scan through BAs and look for anomalies
file = f"{data_folder()}/outputs/2024/interchange_corr_summary_adjusted.md"

# Start from an empty report; context managers close the handle even on error
with open(file, "w") as hs:
    hs.write("\n\n")

for ba, out in int_cors.items():
    # add new lines for proper markdown syntax
    with open(file, "a") as hs:
        hs.write(f"\n\n# {ba}\n\n")

    # The heading handle is closed before pandas appends the table to the same file
    out.to_markdown(file, mode="a")

# Plot interchange for BA of interest

In [None]:
# Overlay the interchange each BA of a pair reports with the other
ba1 = "IID"
ba2 = "CISO"

forward_col = f"EBA.{ba1}-{ba2}.ID.H"
reverse_col = f"EBA.{ba2}-{ba1}.ID.H"

fig = px.line(interchange[forward_col])
reverse_trace = go.Scatter(
    x=interchange.index,
    y=interchange[reverse_col],
    name=reverse_col,
)
fig.add_trace(reverse_trace)

In [None]:
ba = "PJM"

# Columns `ba` reports for each neighbour, and the mirrored columns the
# neighbours report for `ba`
other_cols = [
    c
    for c in interchange.columns
    if re.split(r"[-.]", c)[1] == ba and re.split(r"[-.]", c)[2] != "ALL"
]
other_bas = [re.split(r"[-.]", c)[2] for c in other_cols]

these_cols = [f"EBA.{o_ba}-{ba}.ID.H" for o_ba in other_bas]

# Long version with just the columns of interest, tagged with the neighbouring BA
# and with which side reported the value. Pieces are collected in a list and
# concatenated once instead of re-copying the growing frame on every iteration.
pieces = []
for other_ba, reported_col, mirrored_col in zip(other_bas, other_cols, these_cols):
    to_add = interchange[reported_col].rename("interchange").to_frame()
    to_add["source"] = ba
    to_add["BA"] = other_ba

    # Negate the neighbour's series so both sides share a sign convention
    to_add_2 = (interchange[mirrored_col] * (-1)).rename("interchange").to_frame()
    to_add_2["source"] = "other BA"
    to_add_2["BA"] = other_ba

    pieces.extend([to_add, to_add_2])

# pd.concat rejects an empty list, so fall back to an empty frame
toplot = pd.concat(pieces, axis="index") if pieces else pd.DataFrame()

In [None]:
# One facet per neighbouring BA, one line per reporting side
facet_options = dict(facet_col="BA", facet_col_wrap=2, color="source")
fig = px.line(toplot, x=toplot.index, y="interchange", **facet_options)

layout_labels = dict(
    title=f"Interchange from {ba}",
    xaxis_title="Date",
    yaxis_title="Interchange",
    legend_title="Source for<br>interchange data",
)
fig.update_layout(**layout_labels)


def _prefix_facet_title(annotation):
    """Prepend "Other " to a facet annotation's text."""
    annotation.update(text="Other " + annotation.text)


fig.for_each_annotation(_prefix_facet_title)

In [None]:
# Both directions of one BA pair, plus the first BA's "ALL" total interchange series
first = "PJM"
second = "MISO"

series_to_plot = [
    f"EBA.{first}-{second}.ID.H",
    f"EBA.{second}-{first}.ID.H",
    f"EBA.{first}-ALL.TI.H",
]
fig = px.line(interchange, x=interchange.index, y=series_to_plot)

layout_labels = dict(
    title=f"{first}/{second} interchange",
    xaxis_title="Date",
    yaxis_title="Interchange",
    legend_title="Series",
)
fig.update_layout(**layout_labels)

In [None]:
# Difference between the D and NG "ALL" series for one BA
ba = "PJM"

demand_minus_generation = (
    interchange[f"EBA.{ba}-ALL.D.H"] - interchange[f"EBA.{ba}-ALL.NG.H"]
)

fig = go.Figure()
fig.add_trace(go.Scatter(x=interchange.index, y=demand_minus_generation))

layout_labels = dict(
    title=f"{ba} demand - generation",
    xaxis_title="Date",
    yaxis_title="Demand - generation",
    legend_title="Series",
)
fig.update_layout(**layout_labels)

# Sign issues across interchange data

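Most interchanges should be negatively correlated with the interchange coming the other way. The code below negates one direction before correlating, so a correctly signed pair shows a *positive* best correlation in the output; a negative value suggests one of the signs may be incorrect. 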

In [None]:
def interchange_sign(interchange, i_sign: "dict | None" = None, name: str = "cors"):
    """Record the best signed correlation between mirrored interchange series.

    For every BA in the module-level ``bas930`` set and every neighbour it reports
    interchange with, the series ``EBA.{other}-{ba}.ID.H`` is correlated with the
    negated, lagged series ``EBA.{ba}-{other}.ID.H`` for lags -12..11 (rows are
    shifted by position). The maximum correlation over those lags is kept, sign
    included.

    Args:
        interchange: frame whose columns are EIA-930 series named like
            ``EBA.{ba}-{other}.ID.H``.
        i_sign: dict of per-BA result frames to extend. It is updated in place
            and returned. A fresh dict is created when omitted.
        name: column label under which this run's values are stored.

    Returns:
        dict mapping BA -> DataFrame indexed by neighbouring BA, with one column of
        maximum correlations per call that used the dict.
    """
    # A `{}` default would be created once and shared by every call that omits the
    # argument, silently mixing results from unrelated runs.
    if i_sign is None:
        i_sign = {}

    # Sorted so that dict keys (and anything written from them) have a stable order
    for ba in sorted(bas930):
        print(ba, end="...")
        other_cols = [
            c
            for c in interchange.columns
            if re.split(r"[-.]", c)[1] == ba and re.split(r"[-.]", c)[2] != "ALL"
        ]
        other_bas = [re.split(r"[-.]", c)[2] for c in other_cols]

        out = pd.DataFrame(index=other_bas, columns=range(-12, 12), dtype=float)
        for o_ba in out.index:
            this_way = f"EBA.{o_ba}-{ba}.ID.H"
            other_way = f"EBA.{ba}-{o_ba}.ID.H"
            # Both directions must be reported for the pair to be comparable
            if other_way not in interchange or this_way not in interchange:
                continue

            this_series = interchange[this_way]
            negated_other = -1 * interchange[other_way]
            for lag in out.columns:
                out.loc[o_ba, lag] = this_series.corr(negated_other.shift(lag))

        best_signed = out.dropna(how="all").max(axis=1).rename(name)
        i_sign[ba] = pd.concat(
            [i_sign.get(ba, pd.DataFrame()), best_signed],
            axis="columns",
        )

    return i_sign

In [None]:
int_sign = {}
for year in [2019, 2020, 2021, 2022, 2023, 2024]:
    # Select on the index year so the whole calendar year is used, including the
    # last days of December.
    int_sign = interchange_sign(
        interchange[interchange.index.year == year],
        int_sign,
        name=str(year),
    )

int_sign = interchange_sign(interchange, int_sign, name="all_years")
# April-September and November-February stand in for daylight-saving and standard
# time; March and October are in neither subset.
int_sign = interchange_sign(
    interchange[(interchange.index.month >= 4) & (interchange.index.month <= 9)],
    int_sign,
    name="daylight savings",
)
int_sign = interchange_sign(
    interchange[(interchange.index.month >= 11) | (interchange.index.month <= 2)],
    int_sign,
    name="standard time",
)

In [None]:
# Write one markdown table per BA so sign anomalies can be scanned by eye
file = f"{outputs_folder('2024')}/interchange_cors_sign.md"

# Start from an empty report; context managers close the handle even on error
with open(file, "w") as hs:
    hs.write("\n\n")

for ba, out in int_sign.items():
    # add new lines for proper markdown syntax
    with open(file, "a") as hs:
        hs.write(f"\n\n# {ba}\n\n")

    # The heading handle is closed before pandas appends the table to the same file
    out.to_markdown(file, mode="a")