# Validate EIA-930 data against net generation outputs

In [None]:
import pandas as pd
import numpy as np
import os

import plotly.express as px

In [None]:
# Data year; interpolated into the results paths built in later cells.
year = 2020

In [None]:
# EIA-930 data after timestamp adjustments but no cleaning.
# The path is built from `year` so this input always matches the year used for
# the results paths (it was previously hard-coded to 2020).
raw = pd.read_csv(f"../data/outputs/{year}/eia930/eia930_raw.csv", index_col=0, parse_dates=True)

In [None]:
# For every per-BA hourly results file, compare the "total" net generation
# against the matching raw EIA-930 column and record three metrics per BA.
GEN_ID = "EBA.{}-ALL.NG.H"  # EIA-930 column-name template; filled with the BA code
path = f"../data/results/{year}/power_sector_data/hourly/us_units/"
cors = {}  # BA -> correlation between the two hourly series
percent_difs = {}  # BA -> median of (EIA-930 - net generation) / net generation
annual_gen = {}  # BA -> sum of hourly net_generation_mwh over the merged hours
for ba_f in os.listdir(path):
    # Only per-BA csv files are expected here; skip anything else (hidden
    # files, subdirectories) instead of trying to parse it.
    if not ba_f.endswith(".csv"):
        continue
    ba = os.path.splitext(ba_f)[0]
    print(ba, end="...")
    col_name = GEN_ID.format(ba)
    # BAs with no EIA-930 column cannot be compared.
    if col_name not in raw.columns:
        continue
    dat = pd.read_csv(os.path.join(path, ba_f), parse_dates=["datetime_utc"])
    dat = dat[dat.fuel_category == "total"]
    # Inner merge: keeps only the hours present in both data sets.
    dat = dat.merge(raw[col_name], left_on="datetime_utc", right_index=True)
    cors[ba] = dat[["net_generation_mwh", col_name]].corr().to_numpy()[0, 1]
    difs = (dat[col_name] - dat["net_generation_mwh"]) / dat["net_generation_mwh"]
    # Division by a zero net generation yields +inf or -inf depending on the
    # sign of the numerator; drop both so they cannot distort the median.
    difs = difs.replace([np.inf, -np.inf], np.nan)
    percent_difs[ba] = difs.median()
    annual_gen[ba] = dat["net_generation_mwh"].sum()

In [None]:
# Assemble the per-BA metrics into one table (one row per BA key), order it
# from largest to smallest annual generation, and write it to the
# validation metrics folder.
out = pd.DataFrame(
    data={
        "Difference as percent of hourly-egrid": percent_difs,
        "Correlation": cors,
        "Annual BA generation": annual_gen,
    }
).sort_values(by="Annual BA generation", ascending=False)
out.to_csv(f"../data/results/{year}/validation_metrics/us_units/compare_930_hourlyegrid.csv")

# Visualize BA of interest

In [None]:
# Pick one BA, load its hourly "total" rows, attach the matching raw EIA-930
# column on the UTC timestamp, and plot both series over time.
ba = "ERCO"
col_name = GEN_ID.format(ba)
dat = (
    pd.read_csv(f"{path}{ba}.csv", parse_dates=["datetime_utc"])
    .loc[lambda df: df["fuel_category"] == "total"]
    .merge(raw[col_name], left_on="datetime_utc", right_index=True)
)

# Kept as the final expression so the notebook renders the figure.
px.line(
    dat,
    x="datetime_utc",
    y=["net_generation_mwh", col_name],
)