# Evaluate 930 lag

Currently we assume that PJM and CISO report start-of-hour to EIA-930 for both generation and interchange, while all other BAs report end-of-hour for both. 

This is based on analysis comparing EIA-930 to ISO-reported data, so the analysis did not consider small western BAs. 

Manual inspection of interchange data between CISO and nearby BAs indicates that CISO is not lagged relative to these BAs, so one of two things is happening: 

1) CISO interchange data is end-of-hour 

2) BAs surrounding CISO report start-of-hour for interchange, like CISO, and maybe also for generation 

### Plan 

1) calculate correlation between CEMS and 930 generation time series in each BA; run lagged correlation (1h in each direction), check which is more correlated 

2) repeat for correlation between interchange and interchange*(-1) for neighboring BAs

3) shift per-BA generation and interchange data as indicated by the above

4) repeat steps 1&2 to ensure that we are now correct


### Algorithm for figuring out how much to lag by 

#### Generation: 

Is the best lag consistent across all checks? If so, use it. 

#### Interchange: 

* Is non-zero lag consistent across my connections? if not, assume I'm fine and the other one is the issue. 
* Is non-zero lag the same across my connections? 


### Checks 
* Do we have the same plan using different years? 
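* Do we have the same plan using April - October (inclusive) vs using December - February (inclusive)? (with / without DST) Note: the code below currently uses April - September and November - February for these two subsets.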
* Is demand (D) correlated with generation (NG) without lag in 930 data? 


In [None]:
import pandas as pd
import numpy as np
import re
from datetime import timedelta

import plotly.express as px
import plotly.graph_objects as go

In [None]:
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../')

from src.download_data import download_chalendar_files
from src.eia930 import reformat_chalendar, manual_930_adjust
from src.load_data import PATH_TO_LOCAL_REPO, data_folder

In [None]:
# Download the chalendar EIA-930 files if they do not exist locally yet
download_chalendar_files()

In [None]:
# Read the raw chalendar export, apply the manual 930 adjustments and cache the
# adjusted copy so the cells below can reload it.
raw = pd.read_csv(
    f"{data_folder()}/downloads/eia930/chalendar/EBA_raw.csv",
    parse_dates=True,
    index_col=0,
)
fixed = manual_930_adjust(raw)
fixed.to_csv(f"{data_folder()}/outputs/EBA_adjusted_raw.csv")

In [None]:
# Load the adjusted 930 data written above
eia930 = pd.read_csv(
    f"{data_folder()}/outputs/EBA_adjusted_raw.csv",
    parse_dates=True,
    index_col=0,
)
# Limit to the period after generation started being reported by fuel type
after_fuel_reporting = eia930.index > "2018-07-01T00:00"
eia930 = eia930[after_fuel_reporting]

In [None]:
# Reformat the unadjusted data too; it is plotted against CEMS further down
eia930_raw = reformat_chalendar(raw)

In [None]:
# Reformat the adjusted data; the cells below read its BA, fuel, generation
# and datetime_utc columns
eia930 = reformat_chalendar(eia930)

In [None]:
# Blank out SEC generation values above 50000: bad values that the rolling
# filter missed
sec_outliers = (eia930.BA == "SEC") & (eia930.generation > 50000)
eia930.loc[sec_outliers, "generation"] = np.nan

In [None]:
# Static plant attributes; the CEMS loading cell uses plant_id_eia,
# plant_primary_fuel and ba_code from this table.
# NOTE(review): this path is hard-coded relative to the notebook, while the
# other reads in this section are built from data_folder() - confirm they
# point at the same data directory.
plant_meta = pd.read_csv("../../data/outputs/2020/plant_static_attributes_2020.csv")

In [None]:

files = [f"{data_folder()}/outputs/2019/cems_2019.csv", f"{data_folder()}/outputs/2020/cems_2020.csv"]

# Load each CEMS file and aggregate it to BA-hour totals straight away to cut
# down on memory; the per-file results are concatenated once at the end.
cems_by_file = []
for path in files:
    print(f"loading {path}")
    c = pd.read_csv(path, index_col=0, parse_dates=['datetime_utc'])
    # The frame index is matched against plant_id_eia to attach fuel and BA
    c = c.merge(
        plant_meta[['plant_id_eia', 'plant_primary_fuel', 'ba_code']],
        how='left',
        left_index=True,
        right_on='plant_id_eia',
    )
    # exclude solar power for CEMS, since we're just going to look at COL + OIL + NG in the 930 data
    c = c[c["plant_primary_fuel"] != "SUN"]
    print("Aggregating")
    # Select the column before summing so no other column is aggregated
    cems_by_file.append(
        c.groupby(["datetime_utc", "ba_code"])["net_generation_mwh"].sum().reset_index()
    )

cems = pd.concat(cems_by_file)


In [None]:
# Edge case when loading CEMS data: for some BAs, the same BA-hour shows up in
# both the 2019 and the 2020 file, so total once more across the concatenated
# frames to merge those rows.
ba_hour = ["datetime_utc", "ba_code"]
cems = cems.groupby(ba_hour).sum()["net_generation_mwh"].reset_index()

In [None]:
# Keep only fossil generation (COL, NG, OIL) and total it per BA and hour
fossil_fuels = ["COL", "NG", "OIL"]
eia930 = eia930[eia930.fuel.isin(fossil_fuels)]
# Select the column before summing so the remaining columns (e.g. the fuel
# labels) are never aggregated
eia930 = eia930.groupby(["datetime_utc", "BA"])["generation"].sum().reset_index()

# Correlation 

In [None]:
# BAs present in both the 930 data and CEMS
bas = set(eia930.BA.unique()) & set(cems.ba_code.unique())

In [None]:
# Report how much the two BA lists overlap and which BAs appear in only one
eia_bas = set(eia930.BA.unique())
cems_bas = set(cems.ba_code.unique())
print(f"shared BAs: {len(bas)} out of {len(eia_bas)} 930 BAs and {len(cems_bas)} CEMS BAs.")

missing_cems = eia_bas - cems_bas
missing_930 = cems_bas - eia_bas
print(f"930 BAs missing in CEMS: {missing_cems}")
print(f"CEMS missing 930: {missing_930}")

In [None]:
def find_best_cor(cems, eia930, lags=None):
    """Find, per BA, the lag at which 930 generation best correlates with CEMS.

    Both inputs are pivoted to one column per BA, indexed by ``datetime_utc``.
    For every BA present in both, the CEMS series is correlated with the 930
    series shifted by each candidate lag.

    Args:
        cems: long-format frame with columns ``datetime_utc``, ``ba_code`` and
            ``net_generation_mwh``; one row per (datetime, BA).
        eia930: long-format frame with columns ``datetime_utc``, ``BA`` and
            ``generation``; one row per (datetime, BA).
        lags: iterable of integer shifts (in rows of the pivoted 930 data) to
            try. Defaults to ``range(-12, 12)``, i.e. -12 through 11.

    Returns:
        DataFrame indexed by the shared BAs (sorted). The first column,
        ``best``, is the lag with the highest correlation; the remaining
        columns hold the correlation at each lag. ``best`` is NaN for a BA
        whose correlation is undefined at every lag.
    """
    lags = list(range(-12, 12)) if lags is None else list(lags)

    cems_wide = cems.pivot(columns="ba_code", index="datetime_utc", values="net_generation_mwh")
    eia_wide = eia930.pivot(columns="BA", index="datetime_utc", values="generation")

    # Use a sorted list, not a set: recent pandas refuses a set as an index,
    # and a fixed order keeps the output reproducible between runs.
    shared_bas = sorted(set(cems_wide.columns).intersection(eia_wide.columns))

    correlations = pd.DataFrame(index=shared_bas, columns=lags, dtype=float)
    for ba in shared_bas:
        cems_series = cems_wide[ba]
        eia_series = eia_wide[ba]
        for lag in lags:
            correlations.loc[ba, lag] = cems_series.corr(eia_series.shift(lag))

    def _best_lag(row):
        # An all-NaN row has no best lag; positional argmax would silently
        # point at the last column instead.
        return row.idxmax() if row.notna().any() else np.nan

    best = pd.Series(
        [_best_lag(row) for _, row in correlations.iterrows()],
        index=correlations.index,
        name="best",
    )
    return pd.concat([best, correlations], axis="columns")

In [None]:
#find_best_cor(cems[(cems.datetime_utc.dt.month>=11)|(cems.datetime_utc.dt.month<=2)],
#        eia930[(eia930.datetime_utc.dt.month>=11)|(eia930.datetime_utc.dt.month<=2)]).loc["WALC"]

In [None]:
## Calculate the best lag using different subsets of the CEMS and 930 data

cems_year, eia_year = cems.datetime_utc.dt.year, eia930.datetime_utc.dt.year
cems_month, eia_month = cems.datetime_utc.dt.month, eia930.datetime_utc.dt.month

# label -> (CEMS subset, 930 subset); each label becomes one output column
subsets = {
    "all_years": (cems, eia930),
    "2019": (cems[cems_year == 2019], eia930[eia_year == 2019]),
    "2020": (cems[cems_year == 2020], eia930[eia_year == 2020]),
    "daylight time": (
        cems[(cems_month >= 4) & (cems_month <= 9)],
        eia930[(eia_month >= 4) & (eia_month <= 9)],
    ),
    "standard time": (
        cems[(cems_month >= 11) | (cems_month <= 2)],
        eia930[(eia_month >= 11) | (eia_month <= 2)],
    ),
}

cems_930_cors = pd.concat(
    [find_best_cor(c_sub, e_sub).best.rename(label) for label, (c_sub, e_sub) in subsets.items()],
    axis='columns',
)

# Optional export:
#cems_930_cors.to_csv("../data/outputs/cems_eia930_cor_lags.csv")
cems_930_cors

In [None]:
# Preview the reformatted, unadjusted 930 data
eia930_raw

In [None]:
# Plot CEMS against the unadjusted 930 generation for one BA
ba = "SC"

cems_ba = cems[cems.ba_code == ba]
to_plot_930 = eia930_raw[eia930_raw.BA == ba].groupby("datetime_utc").sum()

print(f"correlations for {ba}")
print(cems_930_cors.loc[ba])

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=cems_ba.datetime_utc, y=cems_ba.net_generation_mwh, name="CEMS")
)
fig.add_trace(
    go.Scatter(x=to_plot_930.index, y=to_plot_930.generation, name="EIA 930 (before adjustment)")
)
axis_labels = dict(title=ba, xaxis_title="Date", yaxis_title="Generation")
fig.update_layout(**axis_labels)

# Interchange

In [None]:
# Reload the adjusted 930 data in its wide layout (one column per series) from
# the same data_folder() location it was written to earlier in this notebook,
# instead of a hard-coded relative path.
interchange = pd.read_csv(f"{data_folder()}/outputs/EBA_adjusted_raw.csv", index_col=0, parse_dates=True)
interchange = interchange[interchange.index > "2018-07-01T00:00"]  # limit to after gen was reported by fuel type

In [None]:
# BA codes taken from the second token of every column name once it is split
# on "-" and "." (e.g. the "PJM" in "EBA.PJM-MISO.ID.H")
bas930 = {re.split(r"[-.]",c)[1] for c in interchange.columns}

In [None]:
# List the available series columns
interchange.columns

In [None]:
def interchange_cor(interchange, interchange_cors: dict = None, file="", name: str = "cors"):
    """Find the best-matching lag between the two reports of each interchange.

    Column names are split on "-" and "."; the second token is the reporting
    BA and the third is the counterparty ("ALL" columns are ignored). For each
    reporting BA and each counterparty, the counterparty's series
    ``EBA.{other}-{ba}.ID.H`` is correlated with the negated, shifted series
    ``EBA.{ba}-{other}.ID.H`` for shifts of -12 through 11 rows. The lag with
    the largest absolute correlation is recorded.

    Args:
        interchange: wide frame whose columns are series named like
            ``EBA.{ba}-{other}.ID.H``. The set of BAs is derived from these
            columns, so the function does not depend on notebook globals.
        interchange_cors: dict of BA -> DataFrame from an earlier call. A new
            column is appended to each BA's frame and the dict is updated in
            place. A fresh dict is created when omitted.
        file: when non-empty, ``{file}.md`` is truncated and then receives a
            heading and the full lag table for every BA, and each table is
            also written to ``{file}_{ba}.csv``.
        name: column name used for this run's best lags.

    Returns:
        The dict of BA -> DataFrame (counterparties as rows, one column per
        run). A best lag is NaN when the reverse series is missing or the
        correlation is undefined at every lag.
    """
    # A `{}` default would be shared between calls and leak earlier results
    if interchange_cors is None:
        interchange_cors = {}

    lags = list(range(-12, 12))

    def _best_lag(row):
        # An all-NaN row has no best lag; positional argmax would silently
        # point at the last column instead.
        return row.idxmax() if row.notna().any() else np.nan

    # neighbours[ba] lists, in column order, the BAs that `ba` reports an
    # exchange with; BAs that only have "ALL" columns get an empty list.
    neighbours = {}
    for col in interchange.columns:
        parts = re.split(r"[-.]", col)
        if len(parts) < 3:
            continue
        links = neighbours.setdefault(parts[1], [])
        if parts[2] != "ALL":
            links.append(parts[2])

    # Start the markdown report from scratch
    if file != "":
        with open(file + ".md", "w") as handle:
            handle.write("\n\n")

    for ba in sorted(neighbours):
        print(ba, end="...")
        other_bas = neighbours[ba]

        out = pd.DataFrame(index=other_bas, columns=lags, dtype=float)
        for o_ba in other_bas:
            this_way = f"EBA.{o_ba}-{ba}.ID.H"
            other_way = f"EBA.{ba}-{o_ba}.ID.H"
            if this_way not in interchange.columns or other_way not in interchange.columns:
                # One-way link: nothing to compare against, leave the row NaN
                continue
            reported_there = interchange[this_way]
            reported_here = -1 * interchange[other_way]
            for lag in lags:
                out.loc[o_ba, lag] = abs(reported_there.corr(reported_here.shift(lag)))

        # where is correlation the best?
        best_lags = [_best_lag(row) for _, row in out.iterrows()]
        out["best"] = best_lags

        if file != "":
            # add new lines for proper markdown syntax
            with open(file + ".md", "a") as handle:
                handle.write(f"\n\n# {ba}\n\n")
            out.to_markdown(file + ".md", mode="a")
            out.to_csv(f"{file}_{ba}" + ".csv")

        this_run = out["best"].rename(name)
        previous = interchange_cors.get(ba)
        if previous is None:
            interchange_cors[ba] = this_run.to_frame()
        else:
            interchange_cors[ba] = pd.concat([previous, this_run], axis="columns")

    return interchange_cors


In [None]:
# Best interchange lag on several subsets of the data. The year subsets select
# whole calendar years (the earlier slices stopped at 12-30T00:00 and dropped
# the last two days), matching how the generation correlations are subset.
int_cors = interchange_cor(interchange, interchange_cors={}, name="all_years")
int_cors = interchange_cor(interchange[interchange.index.year == 2019], int_cors, name="2019")
int_cors = interchange_cor(interchange[interchange.index.year == 2020], int_cors, name="2020")
int_cors = interchange_cor(interchange[(interchange.index.month >= 4)&(interchange.index.month <=9)], int_cors, name="daylight savings")
int_cors = interchange_cor(interchange[(interchange.index.month >= 11)|(interchange.index.month <=2)], int_cors, name="standard time")


In [None]:
# Inspect the best lags for a single BA
int_cors["PJM"]

In [None]:
# Inspect all (BA, best-lag table) pairs
int_cors.items()

In [None]:
# Write one markdown section per BA with its best-lag table
file = "../data/outputs/interchange_correlations/summary_adjusted.md"

# Start from a fresh file; the context manager closes it even if a write fails
with open(file, "w") as hs:
    hs.write("\n\n")

for ba, out in int_cors.items():
    # add new lines for proper markdown syntax
    with open(file, "a") as hs:
        hs.write(f"\n\n# {ba}\n\n")

    out.to_markdown(file, mode="a")

In [None]:
ba = "PJM"

# find cols of mappings in both directions
parsed_cols = [(c, re.split(r"[-.]", c)) for c in interchange.columns]
other_cols = [c for c, parts in parsed_cols if parts[1] == ba and parts[2] != "ALL"]
other_bas = [re.split(r"[-.]", c)[2] for c in other_cols]

these_cols = [f"EBA.{o_ba}-{ba}.ID.H" for o_ba in other_bas]

# make long version with just cols of interest, adding BA column and to/from column.
# Frames are collected and concatenated once, so the time index keeps its dtype
# instead of being merged with an empty frame's index.
frames = []
for o_ba, own_col, mirror_col in zip(other_bas, other_cols, these_cols):
    # interchange as reported by `ba` itself
    to_add = interchange[own_col].rename("interchange").to_frame()
    to_add["source"] = ba
    to_add["BA"] = o_ba

    # the same flow as reported by the neighbour, negated so both series
    # use `ba`'s sign convention
    to_add_2 = (interchange[mirror_col] * (-1)).rename("interchange").to_frame()
    to_add_2["source"] = "other BA"
    to_add_2["BA"] = o_ba

    frames.extend([to_add, to_add_2])

toplot = pd.concat(frames, axis='index') if frames else pd.DataFrame()


In [None]:
# One facet per neighbouring BA, one line per data source
fig = px.line(
    toplot,
    x=toplot.index,
    y="interchange",
    color="source",
    facet_col="BA",
    facet_col_wrap=2,
)
facet_layout = dict(
    title=f"Interchange from {ba}",
    xaxis_title="Date",
    yaxis_title="Interchange",
    legend_title="Source for<br>interchange data",
)
fig.update_layout(**facet_layout)
# Prefix each facet annotation with "Other "
fig.for_each_annotation(lambda note: note.update(text="Other "+note.text))

In [None]:
# Plot both reports of one BA pair's interchange, plus the first BA's total interchange
first="PJM"
second="MISO"

series_to_plot = [
    f"EBA.{first}-{second}.ID.H",
    f"EBA.{second}-{first}.ID.H",
    f"EBA.{first}-ALL.TI.H",
]
fig = px.line(interchange, x=interchange.index, y=series_to_plot)

pair_layout = dict(
    title=f"{first}/{second} interchange",
    xaxis_title="Date",
    yaxis_title="Interchange",
    legend_title="Series",
)
fig.update_layout(**pair_layout)

In [None]:
# Plot demand minus net generation for one BA
ba = "CFE"

demand = interchange[f"EBA.{ba}-ALL.D.H"]
net_generation = interchange[f"EBA.{ba}-ALL.NG.H"]

fig = go.Figure()
fig.add_trace(go.Scatter(x=interchange.index, y=demand-net_generation))

gap_layout = dict(
    title=f"{ba} demand - generation",
    xaxis_title="Date",
    yaxis_title="Demand - generation",
    legend_title="Series",
)
fig.update_layout(**gap_layout)

# Sign issues across interchange data

Most interchanges should be negatively correlated with the interchange coming the other way. 

In [None]:
def interchange_sign(interchange, i_sign: dict = None, file="", name: str = "cors"):
    """Record the signed correlation at the strongest lag for each interchange.

    Column names are split on "-" and "."; the second token is the reporting
    BA and the third is the counterparty ("ALL" columns are ignored). For each
    reporting BA and each counterparty, the counterparty's series
    ``EBA.{other}-{ba}.ID.H`` is correlated with the negated, shifted series
    ``EBA.{ba}-{other}.ID.H`` for shifts of -12 through 11 rows. The value
    kept is the correlation, sign included, at the lag where its absolute
    value is largest. Because one side is negated first, a negative result
    means the two reports move in the same direction.

    Args:
        interchange: wide frame whose columns are series named like
            ``EBA.{ba}-{other}.ID.H``. The set of BAs is derived from these
            columns, so the function does not depend on notebook globals.
        i_sign: dict of BA -> DataFrame from an earlier call. A new column is
            appended to each BA's frame and the dict is updated in place. A
            fresh dict is created when omitted.
        file: unused; kept so the signature matches ``interchange_cor``.
        name: column name used for this run's values.

    Returns:
        The dict of BA -> DataFrame (counterparties as rows, one column per
        run). A value is NaN when the reverse series is missing or the
        correlation is undefined at every lag.
    """
    # A `{}` default would be shared between calls and leak earlier results
    if i_sign is None:
        i_sign = {}

    lags = list(range(-12, 12))

    def _strongest(row):
        # Signed value at the lag with the largest magnitude; NaN if none is defined
        return row.loc[row.abs().idxmax()] if row.notna().any() else np.nan

    # neighbours[ba] lists, in column order, the BAs that `ba` reports an
    # exchange with; BAs that only have "ALL" columns get an empty list.
    neighbours = {}
    for col in interchange.columns:
        parts = re.split(r"[-.]", col)
        if len(parts) < 3:
            continue
        links = neighbours.setdefault(parts[1], [])
        if parts[2] != "ALL":
            links.append(parts[2])

    for ba in sorted(neighbours):
        print(ba, end="...")
        other_bas = neighbours[ba]

        out = pd.DataFrame(index=other_bas, columns=lags, dtype=float)
        for o_ba in other_bas:
            this_way = f"EBA.{o_ba}-{ba}.ID.H"
            other_way = f"EBA.{ba}-{o_ba}.ID.H"
            if this_way not in interchange.columns or other_way not in interchange.columns:
                # One-way link: nothing to compare against, leave the row NaN
                continue
            reported_there = interchange[this_way]
            reported_here = -1 * interchange[other_way]
            for lag in lags:
                out.loc[o_ba, lag] = reported_there.corr(reported_here.shift(lag))

        strongest = pd.Series(
            [_strongest(row) for _, row in out.iterrows()],
            index=out.index,
            name=name,
            dtype=float,
        )

        previous = i_sign.get(ba)
        if previous is None:
            i_sign[ba] = strongest.to_frame()
        else:
            i_sign[ba] = pd.concat([previous, strongest], axis="columns")

    return i_sign


In [None]:
# Signed correlations on several subsets of the data. The year subsets select
# whole calendar years (the earlier slices stopped at 12-30T00:00 and dropped
# the last two days), matching how the generation correlations are subset.
int_sign = interchange_sign(interchange, {}, name="all_years")
int_sign = interchange_sign(interchange[interchange.index.year == 2019], int_sign, name="2019")
int_sign = interchange_sign(interchange[interchange.index.year == 2020], int_sign, name="2020")
int_sign = interchange_sign(interchange[(interchange.index.month >= 4)&(interchange.index.month <=9)], int_sign, name="daylight savings")
int_sign = interchange_sign(interchange[(interchange.index.month >= 11)|(interchange.index.month <=2)], int_sign, name="standard time")

In [None]:
# Write one markdown section per BA with its signed-correlation table
file = "../data/outputs/interchange_cors_sign.md"

# Start from a fresh file; the context manager closes it even if a write fails
with open(file, "w") as hs:
    hs.write("\n\n")

for ba, out in int_sign.items():
    # add new lines for proper markdown syntax
    with open(file, "a") as hs:
        hs.write(f"\n\n# {ba}\n\n")

    out.to_markdown(file, mode="a")

# Is D = G - I better lagged?

In [None]:
# For every BA and every candidate lag of total interchange (TI), measure how
# far NG - D - TI is from zero. The mean of the absolute residual is used: a
# signed mean lets positive and negative errors cancel, so picking its minimum
# would reward the most negative bias instead of the closest match.
# The index is a sorted list because recent pandas refuses a set as an index.
differences = pd.DataFrame(index=sorted(bas930), columns=range(-2,3), dtype=float)

for ba in differences.index:
    # generation minus demand does not depend on the lag, so compute it once
    imbalance = interchange[f"EBA.{ba}-ALL.NG.H"] - interchange[f"EBA.{ba}-ALL.D.H"]
    total_interchange = interchange[f"EBA.{ba}-ALL.TI.H"]
    for lag in differences.columns:
        residual = imbalance - total_interchange.shift(lag)
        differences.loc[ba, lag] = residual.abs().mean()


In [None]:
# Pick, per BA, the lag with the smallest value. A row that is NaN at every lag
# gets NaN rather than whatever column a positional argmin falls back to.
best_lag = differences.apply(
    lambda row: row.idxmin() if row.notna().any() else np.nan, axis=1
).rename("best")
differences = pd.concat([best_lag, differences], axis='columns')

differences.to_markdown("../data/outputs/lagged_differences.md")

# Adjust

In [None]:
# Apply the manual adjustments to the rolling-filtered chalendar data and save.
# NOTE(review): this rebinds `raw` and writes to an EBA_adjusted_raw.csv path
# again, but from EBA_rolling.csv rather than the EBA_raw.csv used at the top
# of the notebook - confirm that overwriting with a different input is intended.
raw = pd.read_csv("../data/eia930/chalendar/EBA_rolling.csv",index_col=0, parse_dates=True)
fixed = manual_930_adjust(raw)
fixed.to_csv("../data/outputs/EBA_adjusted_raw.csv")

In [None]:
# Re-run the adjustment on the currently loaded `raw` without re-reading the CSV
fixed = manual_930_adjust(raw)

In [None]:
# Save the current `fixed` frame
fixed.to_csv("../data/outputs/EBA_adjusted_raw.csv")

In [None]:
# True for timestamps whose US/Eastern UTC offset is -4 hours
eastern_index = raw.index.tz_convert("US/Eastern")
eastern_offsets = eastern_index.to_series().apply(lambda stamp: stamp.utcoffset())
is_dst = eastern_offsets == timedelta(hours=-4)

In [None]:
# Per-timestamp offset: -3 hours where is_dst is True, -4 hours otherwise
pjm_offset = [timedelta(hours=-3 if dst_flag else -4) for dst_flag in is_dst]

In [None]:
# Preview the per-timestamp offsets
pd.Series(pjm_offset)