# Evaluate 930 timestamps and interchanges

### Generation: 

Check lagged correlation (-11 to +12 hours) between 930 fossil generation and CEMS fossil generation for each BA over different time bounds: 
* 2021, 2022
* Daylight savings vs. non-daylight savings

Run for each of the raw, shifted, and shifted + basic and rolling filtered 930 data. When shifts are correct, the best correlation in the shifted data should be at lag=0. The best correlation in the non-shifted data can indicate what shift might be appropriate. Manual inspection is required to actually decide whether and how much to lag by. 

We run with the rolling-filtered data because in some cases large errors can cause anomalous best correlations in the shifted but not filtered 930 data. 

Note: for correct timestamps, demand data in non-shifted data will be best correlated at lag=-1 because 930 uses end-of-hour timestamps while CEMS uses start-of-hour. 

### Interchange: 

Check lagged correlations between pairs of BAs with shared interchange. If timestamps are consistent, the best correlation should be at lag=0. 

We also check the sign of the best correlation between paired BAs: if they're not negatively correlated, one of the signs may be incorrect. 

### Edge cases: 

In some BAs, the shifted data still shows a best correlation at lag != 0, but inspection of the data doesn't show an obvious fix. In these cases, we do nothing and rely on `gridemissions` to make the data consistent. 




In [None]:
import pandas as pd
import numpy as np
import re
from datetime import timedelta

import plotly.express as px
import plotly.graph_objects as go

In [None]:
%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys

sys.path.append("../../src")

import oge.download_data as download_data
import oge.load_data as load_data
from oge.column_checks import get_dtypes
from oge.filepaths import *
import oge.impute_hourly_profiles as impute_hourly_profiles
import oge.data_cleaning as data_cleaning
import oge.output_data as output_data
import oge.emissions as emissions
import oge.validation as validation
import oge.gross_to_net_generation as gross_to_net_generation
import oge.eia930 as eia930

In [None]:
# Data before and after shifts
# Note: this is very slow! (~30min) because it's pivoting large files.
FOSSIL_FUELS = ["COL", "NG", "OIL"]


def _fossil_generation_by_ba(df_930, year):
    """Sum fossil-fuel generation per timestamp and BA, keeping one year.

    Rows whose ``fuel`` is not in FOSSIL_FUELS are dropped, the
    "UTC Time at End of Hour" column (if present) is renamed to
    ``datetime_utc``, and ``generation`` is summed per
    (datetime_utc, BA). Only rows whose timestamp falls in ``year`` are
    returned.

    The ``generation`` column is selected before summing so the other
    columns are not aggregated and then discarded.
    """
    fossil = df_930[df_930.fuel.isin(FOSSIL_FUELS)]
    fossil = fossil.rename(columns={"UTC Time at End of Hour": "datetime_utc"})
    fossil = fossil.groupby(["datetime_utc", "BA"])["generation"].sum().reset_index()
    return fossil[fossil.datetime_utc.dt.year == year]  # filter for year


lraw = []
lshift = []

for year in [2021, 2022]:
    print(year)

    balance = eia930.convert_balance_file_to_gridemissions_format(year)

    # The manual adjustment runs on the balance data before either frame
    # is reformatted, as in the original call order.
    adjusted = eia930.manual_930_adjust(balance)
    adjusted = eia930.reformat_chalendar(adjusted)
    balance = eia930.reformat_chalendar(balance)

    lshift.append(_fossil_generation_by_ba(adjusted, year))
    lraw.append(_fossil_generation_by_ba(balance, year))

In [None]:
# Stack the per-year frames into one raw table and one shifted table
raw, shifted = (pd.concat(frames, axis=0) for frames in (lraw, lshift))

In [None]:
# Load data after shifting and rolling filter
all_rolled = []
for y in [2021, 2022]:
    rolling_path = f"{data_folder()}/outputs/{y}/eia930/eia930_rolling.csv"
    one_year = pd.read_csv(rolling_path, index_col=0, parse_dates=True)
    # keep only the rows whose index falls in the year being loaded
    all_rolled.append(one_year[one_year.index.year == y])
rolled_930 = eia930.reformat_chalendar(pd.concat(all_rolled))

In [None]:
# Remove renewables before summing 930
is_fossil = rolled_930.fuel.isin(["COL", "NG", "OIL"])
rolled_by_ba = rolled_930[is_fossil].groupby(["datetime_utc", "BA"])
rolled_930 = rolled_by_ba.sum().reset_index()

In [None]:
# Load files
# Aggregate by BA during loading to cut down on space
# Per-year frames are collected in a list and concatenated once, rather than
# growing `cems` from an empty DataFrame on every pass.
cems_by_year = []
for y in [2021, 2022]:
    print(f"loading {y}")
    file = f"{data_folder()}/outputs/{y}/cems_cleaned_{y}.csv"
    plant_meta = pd.read_csv(
        f"{data_folder()}/outputs/{y}/plant_static_attributes_{y}.csv"
    )
    c = pd.read_csv(file, index_col=0, parse_dates=["datetime_utc"])
    # Attach primary fuel and BA code; the CEMS index is matched against
    # plant_id_eia in the plant metadata.
    c = c.merge(
        plant_meta[["plant_id_eia", "plant_primary_fuel", "ba_code"]],
        how="left",
        left_index=True,
        right_on="plant_id_eia",
    )
    # Exclude solar power for CEMS, since we're just going to look at COL + OIL + NG in the 930 data
    c = c[c["plant_primary_fuel"] != "SUN"]
    c = c[["datetime_utc", "ba_code", "gross_generation_mwh"]]

    print("Aggregating")
    # Gross generation is relabelled as net_generation_mwh, the column name
    # the correlation code below reads.
    c = c.rename(columns={"gross_generation_mwh": "net_generation_mwh"})
    cems_aggregated = (
        c.groupby(["datetime_utc", "ba_code"])["net_generation_mwh"].sum().reset_index()
    )
    cems_by_year.append(cems_aggregated)

cems = pd.concat(cems_by_year)

cems.head()

In [None]:
# Plant static attributes for the most recent year, read with the project dtypes
year = 2022
static_attributes_path = (
    f"{data_folder()}/outputs/{year}/plant_static_attributes_{year}.csv"
)
plant_attributes = pd.read_csv(static_attributes_path, dtype=get_dtypes())

# Correlation 

In [None]:
# BAs present in both the 930 data and CEMS
bas = set(raw.BA.unique()) & set(cems.ba_code.unique())

In [None]:
# Report how the two BA lists overlap and which codes appear in only one of them
bas_in_930 = set(raw.BA.unique())
bas_in_cems = set(cems.ba_code.unique())

print(
    f"shared BAs: {len(bas)} out of {len(bas_in_930)} 930 BAs and {len(bas_in_cems)} CEMS BAs."
)

missing_cems = bas_in_930 - bas_in_cems
missing_930 = bas_in_cems - bas_in_930
print(f"930 BAs missing in CEMS: {missing_cems}")
print(f"CEMS missing 930: {missing_930}")

In [None]:
def find_best_cor(cems, df_eia930):
    """Correlate CEMS and 930 generation per BA over lags -12..11.

    Parameters
    ----------
    cems : DataFrame
        Long table with columns ``datetime_utc``, ``ba_code`` and
        ``net_generation_mwh`` (one row per timestamp and BA).
    df_eia930 : DataFrame
        Long table with columns ``datetime_utc``, ``BA`` and ``generation``
        (one row per timestamp and BA).

    Returns
    -------
    DataFrame
        One row per BA found in both inputs. The first column, ``best``, is
        the lag with the highest correlation; the remaining columns
        (-12 .. 11) hold the correlation between the CEMS series and the 930
        series shifted by that many rows. ``best`` is ``<NA>`` for a BA whose
        correlations are all NaN (for example a constant series), instead of
        an arbitrary lag.
    """
    cems = cems.pivot(
        columns="ba_code", index="datetime_utc", values="net_generation_mwh"
    )
    df_eia930 = df_eia930.pivot(columns="BA", index="datetime_utc", values="generation")

    bas = set(cems.columns).intersection(set(df_eia930.columns))

    correlations = pd.DataFrame(index=list(bas), columns=range(-12, 12), dtype=float)

    for ba in correlations.index:
        cems_series = cems[ba]
        eia_series = df_eia930[ba]
        for lag in correlations.columns:
            correlations.loc[ba, lag] = cems_series.corr(eia_series.shift(lag))

    # Pick the best lag only for rows that have at least one finite
    # correlation; argmax over an all-NaN row has no meaningful answer.
    values = correlations.to_numpy(dtype=float)
    has_data = ~np.isnan(values).all(axis=1)
    best_lag = np.full(len(correlations), np.nan)
    if has_data.any():
        lags = correlations.columns.to_numpy()
        best_lag[has_data] = lags[np.nanargmax(values[has_data], axis=1)]
    # Nullable integers keep lags as ints while allowing missing entries
    best = pd.Series(best_lag, index=correlations.index, name="best").astype("Int64")

    correlations = pd.concat([best, correlations], axis="columns")
    return correlations

In [None]:
# Keep one row per (timestamp, BA)
cems = cems.drop_duplicates(subset=["datetime_utc", "ba_code"])
# rolled_930.drop_duplicates(subset=["datetime_utc","BA"], inplace=True)

In [None]:
# Calculate best correlations for shifted (no EBA cleaning) data
cems_year = cems.datetime_utc.dt.year
cems_month = cems.datetime_utc.dt.month
shifted_year = shifted.datetime_utc.dt.year
shifted_month = shifted.datetime_utc.dt.month

# column label -> (CEMS row mask, 930 row mask)
shifted_periods = {
    "2021": (cems_year == 2021, shifted_year == 2021),
    "2022": (cems_year == 2022, shifted_year == 2022),
    "daylight time": (cems_month.between(4, 9), shifted_month.between(4, 9)),
    "standard time": (
        cems_month.isin([11, 12, 1, 2]),
        shifted_month.isin([11, 12, 1, 2]),
    ),
}

best_lags = [find_best_cor(cems, shifted).best.rename("all_years")]
for label, (cems_rows, shifted_rows) in shifted_periods.items():
    best_lags.append(
        find_best_cor(cems[cems_rows], shifted[shifted_rows]).best.rename(label)
    )
cems_930_cors = pd.concat(best_lags, axis="columns")

cems_930_cors.to_csv(f"{data_folder()}/outputs/2022/cems_SHIFTEDeia930_cor_lags.csv")

In [None]:
# Calculate best correlations for raw data
cems_year = cems.datetime_utc.dt.year
cems_month = cems.datetime_utc.dt.month
raw_year = raw.datetime_utc.dt.year
raw_month = raw.datetime_utc.dt.month

# column label -> (CEMS row mask, 930 row mask)
raw_periods = {
    "2021": (cems_year == 2021, raw_year == 2021),
    "2022": (cems_year == 2022, raw_year == 2022),
    "daylight time": (cems_month.between(4, 9), raw_month.between(4, 9)),
    "standard time": (
        cems_month.isin([11, 12, 1, 2]),
        raw_month.isin([11, 12, 1, 2]),
    ),
}

best_lags = [find_best_cor(cems, raw).best.rename("all_years")]
for label, (cems_rows, raw_rows) in raw_periods.items():
    best_lags.append(find_best_cor(cems[cems_rows], raw[raw_rows]).best.rename(label))
cems_930_cors = pd.concat(best_lags, axis="columns")

cems_930_cors.to_csv(f"{data_folder()}/outputs/2022/cems_RAWeia930_cor_lags.csv")

In [None]:
## Calculate best correlations for shifted + rolling-filtered 930 data

cems_930_cors = pd.concat(
    [
        find_best_cor(cems, rolled_930).best.rename("all_years"),
        find_best_cor(
            cems[cems.datetime_utc.dt.year == 2021],
            rolled_930[rolled_930.datetime_utc.dt.year == 2021],
        ).best.rename("2021"),
        find_best_cor(
            cems[cems.datetime_utc.dt.year == 2022],
            rolled_930[rolled_930.datetime_utc.dt.year == 2022],
        ).best.rename("2022"),
        find_best_cor(
            cems[(cems.datetime_utc.dt.month >= 4) & (cems.datetime_utc.dt.month <= 9)],
            rolled_930[
                (rolled_930.datetime_utc.dt.month >= 4)
                & (rolled_930.datetime_utc.dt.month <= 9)
            ],
        ).best.rename("daylight time"),
        find_best_cor(
            cems[
                (cems.datetime_utc.dt.month >= 11) | (cems.datetime_utc.dt.month <= 2)
            ],
            rolled_930[
                (rolled_930.datetime_utc.dt.month >= 11)
                | (rolled_930.datetime_utc.dt.month <= 2)
            ],
        ).best.rename("standard time"),
    ],
    axis="columns",
)

# Written to its own file: the previous name (cems_RAWeia930_cor_lags.csv)
# overwrote the lags computed from the raw 930 data.
cems_930_cors.to_csv(f"{data_folder()}/outputs/2022/cems_ROLLEDeia930_cor_lags.csv")

In [None]:
# Visualize a BA for manual inspection
ba = "TEPC"

cems_ba = cems[cems.ba_code == ba]
# NOTE(review): the 930 trace is labelled "after adjustment and rolling cleaning"
# but is built from `shifted`, not `rolled_930` - confirm which one is intended.
to_plot_930 = shifted[shifted.BA == ba].groupby("datetime_utc").sum()

print(f"correlations for {ba}")
print(cems_930_cors.loc[ba])

traces = [
    go.Scatter(x=cems_ba.datetime_utc, y=cems_ba.net_generation_mwh, name="CEMS"),
    go.Scatter(
        x=to_plot_930.index,
        y=to_plot_930.generation,
        name="EIA 930 (after adjustment and rolling cleaning)",
    ),
]

fig = go.Figure()
for trace in traces:
    fig.add_trace(trace)
fig.update_layout(title=ba, xaxis_title="Date", yaxis_title="Generation")

# Interchange

In [None]:
# Read the raw 930 file for each year and keep only rows stamped in that year
interchanges = []
for year in [2021, 2022]:
    raw_930_path = f"{data_folder()}/outputs/{year}/eia930/eia930_raw.csv"
    interchange = pd.read_csv(raw_930_path, index_col=0, parse_dates=True)
    # limit to after gen was reported by fuel type
    in_year = interchange.index.year == year
    interchange = interchange[in_year]
    interchanges.append(interchange)

In [None]:
# Stack the per-year frames row-wise into a single table
interchange = pd.concat(interchanges, axis="index")

In [None]:
# The BA code is the second token when a column name is split on "-" and "."
column_tokens = (re.split(r"[-.]", c) for c in interchange.columns)
bas930 = {tokens[1] for tokens in column_tokens}

In [None]:
# Helper: lag (column label) with the highest value in each row of a correlation table
def _best_interchange_lag(cors):
    """Return the column label of the row-wise maximum of ``cors`` as a Series named "best".

    Rows that are entirely NaN (BA pairs that were skipped, or series with no
    usable correlation) get ``<NA>`` instead of an arbitrary lag.
    """
    values = cors.to_numpy(dtype=float)
    has_data = ~np.isnan(values).all(axis=1)
    best = np.full(len(cors), np.nan)
    if has_data.any():
        lags = cors.columns.to_numpy()
        best[has_data] = lags[np.nanargmax(values[has_data], axis=1)]
    # Nullable integers keep lags as ints while allowing missing entries
    return pd.Series(best, index=cors.index, name="best").astype("Int64")


# given a df where columns are interchange data, add best correlation between matching BAs to interchange_cors dict
# optionally, write markdown to {file}.md and csvs at {file}_{ba}.csv
def interchange_cor(
    interchange, interchange_cors: "dict | None" = None, file="", name: str = "cors"
):
    """Find, for every BA pair, the lag at which their interchange series agree best.

    For each BA in the notebook-level ``bas930`` set, the series
    ``EBA.<other>-<ba>.ID.H`` is correlated with the negated, lagged series
    ``EBA.<ba>-<other>.ID.H`` for lags -12..11, using the absolute value of
    the correlation. Pairs for which either column is missing are skipped.

    Parameters
    ----------
    interchange : DataFrame
        Columns named like ``EBA.<ba>-<other>.ID.H``.
    interchange_cors : dict, optional
        Results from earlier calls, keyed by BA. It is updated in place and
        returned. A new dict is created when omitted.
    file : str
        If non-empty, all tables are written to ``{file}.md`` and each BA's
        table to ``{file}_{ba}.csv``.
    name : str
        Column name under which this call's best lags are stored.

    Returns
    -------
    dict
        BA -> DataFrame with one row per neighbouring BA and one column of
        best lags per call (``<NA>`` where no correlation could be computed).
    """
    # A literal {} default would be shared between calls
    if interchange_cors is None:
        interchange_cors = {}

    # Start from a fresh markdown file so reruns do not append to old output
    if file != "":
        with open(file + ".md", "w") as hs:
            hs.write("\n\n")

    for ba in bas930:
        print(ba, end="...")
        other_cols = [
            c
            for c in interchange.columns
            if re.split(r"[-.]", c)[1] == ba and re.split(r"[-.]", c)[2] != "ALL"
        ]
        other_bas = [re.split(r"[-.]", c)[2] for c in other_cols]

        out = pd.DataFrame(index=other_bas, columns=range(-12, 12), dtype=float)
        for o_ba in out.index:
            this_way = f"EBA.{o_ba}-{ba}.ID.H"
            other_way = f"EBA.{ba}-{o_ba}.ID.H"
            if (
                other_way not in interchange.columns
                or this_way not in interchange.columns
            ):
                # one direction is not reported: leave the row as NaN
                continue
            for lag in out.columns:
                out.loc[o_ba, lag] = abs(
                    interchange[this_way].corr(-1 * interchange[other_way].shift(lag))
                )

        # where is correlation the best?
        out = pd.concat([out, _best_interchange_lag(out)], axis="columns")

        if file != "":
            # add new lines for proper markdown syntax
            with open(file + ".md", "a") as hs:
                hs.write(f"\n\n# {ba}\n\n")

            out.to_markdown(file + ".md", mode="a")

            out.to_csv(f"{file}_{ba}" + ".csv")

        interchange_cors[ba] = pd.concat(
            [interchange_cors.get(ba, pd.DataFrame()), out.best.rename(name)],
            axis="columns",
        )

    return interchange_cors

In [None]:
# Best interchange lags for the whole record, each year, and the daylight /
# standard time months. Years are selected with a mask on the index year so the
# final days of December are included (a slice ending at "...-12-30T00:00"
# dropped them).
interchange_year = interchange.index.year
interchange_month = interchange.index.month
interchange_subsets = {
    "2021": interchange_year == 2021,
    "2022": interchange_year == 2022,
    "daylight savings": (interchange_month >= 4) & (interchange_month <= 9),
    "standard time": (interchange_month >= 11) | (interchange_month <= 2),
}

int_cors = interchange_cor(interchange, interchange_cors={}, name="all_years")
for subset_name, subset_rows in interchange_subsets.items():
    int_cors = interchange_cor(interchange[subset_rows], int_cors, name=subset_name)

In [None]:
# Inspect interchange correlations for one BA
ba_to_inspect = "PJM"
int_cors[ba_to_inspect]

In [None]:
# Output to md file because that's an easy way to manually scan through BAs and look for anomalies
file = f"{data_folder()}/outputs/2022/interchange_corr_summary_adjusted.md"

# Truncate any previous output; context managers close the handle even if a write fails
with open(file, "w") as hs:
    hs.write("\n\n")

for ba, out in int_cors.items():
    # add new lines for proper markdown syntax
    with open(file, "a") as hs:
        hs.write(f"\n\n# {ba}\n\n")

    # the handle is closed before pandas appends the table to the same file
    out.to_markdown(file, mode="a")

# Plot interchange for BA of interest

In [None]:
ba1 = "IID"
ba2 = "CISO"

# The same interchange as reported in each direction
forward_col = f"EBA.{ba1}-{ba2}.ID.H"
reverse_col = f"EBA.{ba2}-{ba1}.ID.H"

fig = px.line(interchange[forward_col])
reverse_trace = go.Scatter(
    x=interchange.index,
    y=interchange[reverse_col],
    name=reverse_col,
)
fig.add_trace(reverse_trace)

In [None]:
ba = "PJM"

# find cols of mappings in both directions
other_cols = [
    c
    for c in interchange.columns
    if re.split(r"[-.]", c)[1] == ba and re.split(r"[-.]", c)[2] != "ALL"
]
other_bas = [re.split(r"[-.]", c)[2] for c in other_cols]

these_cols = [f"EBA.{o_ba}-{ba}.ID.H" for o_ba in other_bas]

# make long version with just cols of interest, adding BA column and to/from column
# Pieces are collected in a list and concatenated once instead of growing a
# DataFrame inside the loop.
pieces = []
for other_ba, out_col, in_col in zip(other_bas, other_cols, these_cols):
    to_add = (interchange[out_col]).rename("interchange").to_frame()
    to_add["source"] = ba
    to_add["BA"] = other_ba

    # the neighbour's report is negated so both traces share a sign convention
    to_add_2 = (interchange[in_col] * (-1)).rename("interchange").to_frame()
    to_add_2["source"] = "other BA"
    to_add_2["BA"] = other_ba

    pieces.extend([to_add, to_add_2])

# pd.concat rejects an empty list, so fall back to an empty frame as before
toplot = pd.concat(pieces, axis="index") if pieces else pd.DataFrame()

In [None]:
# One facet per neighbouring BA, one line per reporting source
def _prefix_facet_title(annotation):
    return annotation.update(text="Other " + annotation.text)


facet_options = dict(
    x=toplot.index,
    y="interchange",
    facet_col="BA",
    facet_col_wrap=2,
    color="source",
)
fig = px.line(toplot, **facet_options)

facet_layout = dict(
    title=f"Interchange from {ba}",
    xaxis_title="Date",
    yaxis_title="Interchange",
    legend_title="Source for<br>interchange data",
)
fig.update_layout(**facet_layout)
fig.for_each_annotation(_prefix_facet_title)

In [None]:
first = "PJM"
second = "MISO"

# Both directions of the pair plus the first BA's EBA.<ba>-ALL.TI.H series
series_to_plot = [
    f"EBA.{first}-{second}.ID.H",
    f"EBA.{second}-{first}.ID.H",
    f"EBA.{first}-ALL.TI.H",
]
fig = px.line(interchange, x=interchange.index, y=series_to_plot)

pair_layout = dict(
    title=f"{first}/{second} interchange",
    xaxis_title="Date",
    yaxis_title="Interchange",
    legend_title="Series",
)
fig.update_layout(**pair_layout)

In [None]:
ba = "PJM"

# Difference between the BA's D and NG series
demand_minus_generation = (
    interchange[f"EBA.{ba}-ALL.D.H"] - interchange[f"EBA.{ba}-ALL.NG.H"]
)

fig = go.Figure()
fig.add_trace(go.Scatter(x=interchange.index, y=demand_minus_generation))

balance_layout = dict(
    title=f"{ba} demand - generation",
    xaxis_title="Date",
    yaxis_title="Demand - generation",
    legend_title="Series",
)
fig.update_layout(**balance_layout)

# Sign issues across interchange data

Most interchanges should be negatively correlated with the interchange coming the other way. 

In [None]:
# given a df where columns are interchange data, add the signed correlation of largest
# magnitude between matching BAs to the i_sign dict
def interchange_sign(interchange, i_sign: "dict | None" = None, file="", name: str = "cors"):
    """Record, per BA pair, the strongest (signed) correlation over lags -12..11.

    For each BA in the notebook-level ``bas930`` set, the series
    ``EBA.<other>-<ba>.ID.H`` is correlated with the negated, lagged series
    ``EBA.<ba>-<other>.ID.H``. The correlation with the largest absolute
    value is kept with its sign. Pairs for which either column is missing,
    or with no computable correlation, are NaN.

    Parameters
    ----------
    interchange : DataFrame
        Columns named like ``EBA.<ba>-<other>.ID.H``.
    i_sign : dict, optional
        Results from earlier calls, keyed by BA. It is updated in place and
        returned. A new dict is created when omitted.
    file : str
        Accepted for signature compatibility; not used by this function.
    name : str
        Column name under which this call's correlations are stored.

    Returns
    -------
    dict
        BA -> DataFrame with one row per neighbouring BA and one column per call.
    """
    # A literal {} default would be shared between calls
    if i_sign is None:
        i_sign = {}

    for ba in bas930:
        print(ba, end="...")
        other_cols = [
            c
            for c in interchange.columns
            if re.split(r"[-.]", c)[1] == ba and re.split(r"[-.]", c)[2] != "ALL"
        ]
        other_bas = [re.split(r"[-.]", c)[2] for c in other_cols]

        out = pd.DataFrame(index=other_bas, columns=range(-12, 12), dtype=float)
        for o_ba in out.index:
            this_way = f"EBA.{o_ba}-{ba}.ID.H"
            other_way = f"EBA.{ba}-{o_ba}.ID.H"
            if other_way not in interchange or this_way not in interchange:
                # one direction is not reported: leave the row as NaN
                continue
            for lag in out.columns:
                out.loc[o_ba, lag] = interchange[this_way].corr(
                    -1 * interchange[other_way].shift(lag)
                )

        # where is correlation the best? Keep the signed value at the lag with
        # the largest magnitude; all-NaN rows stay NaN without calling argmax.
        values = out.to_numpy(dtype=float)
        has_data = ~np.isnan(values).all(axis=1)
        strongest = np.full(len(out), np.nan)
        if has_data.any():
            rows = values[has_data]
            best_pos = np.nanargmax(np.abs(rows), axis=1)
            strongest[has_data] = rows[np.arange(len(rows)), best_pos]
        out = pd.Series(strongest, index=out.index)

        i_sign[ba] = pd.concat(
            [i_sign.get(ba, pd.DataFrame()), out.rename(name)], axis="columns"
        )

    return i_sign

In [None]:
# Strongest signed correlations for the whole record, each year, and the
# daylight / standard time months. Years are selected with a mask on the index
# year so the final days of December are included (a slice ending at
# "...-12-30T00:00" dropped them).
sign_year = interchange.index.year
sign_month = interchange.index.month
sign_subsets = {
    "2021": sign_year == 2021,
    "2022": sign_year == 2022,
    "daylight savings": (sign_month >= 4) & (sign_month <= 9),
    "standard time": (sign_month >= 11) | (sign_month <= 2),
}

int_sign = interchange_sign(interchange, {}, name="all_years")
for subset_name, subset_rows in sign_subsets.items():
    int_sign = interchange_sign(interchange[subset_rows], int_sign, name=subset_name)

In [None]:
# Write one markdown section per BA with its interchange sign correlations
file = f"{outputs_folder('2022')}/interchange_cors_sign.md"

# Truncate any previous output; context managers close the handle even if a write fails
with open(file, "w") as hs:
    hs.write("\n\n")

for ba, out in int_sign.items():
    # add new lines for proper markdown syntax
    with open(file, "a") as hs:
        hs.write(f"\n\n# {ba}\n\n")

    # the handle is closed before pandas appends the table to the same file
    out.to_markdown(file, mode="a")