# Cleaning of EIA-930 data: analyze how cleaning affects the residual profile

In [None]:
# data source https://gridemissions.s3.us-east-2.amazonaws.com/EBA_elec.csv.gz

In [None]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import datetime as dt

In [None]:
# IPython autoreload magics: with mode 2, imported modules are reloaded before
# each cell runs, so edits to the project's src/ modules are picked up without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

# Project helpers used below: heatmap construction, the fuel-code mapping and
# the chalendar loaders, and the downloader for the raw files.
from src.visualization import day_hour_heatmap
from src.eia930 import fuel_code_map, reformat_chalendar, load_chalendar
from src.load_data import download_chalendar_files

In [None]:
# Year under analysis: passed to load_chalendar and day_hour_heatmap, and
# interpolated into the CEMS and EIA-923 file names read further down.
year = 2020

In [None]:
# Download data if not exists
# NOTE(review): per the comment above this is presumably a no-op when the files
# are already present locally — confirm in src.load_data.
download_chalendar_files()

In [None]:
# Note: cleaned file written by 930_lag.py
# NOTE(review): the note above mentions a *cleaned* file, yet this frame is
# named `original` and is used as the pre-cleaning baseline in the comparisons
# below — confirm which of the two EBA_adjusted_* outputs is which.
original = load_chalendar("../data/outputs/EBA_adjusted_rolling.csv", year=year)

In [None]:
# Load the second adjusted 930 output for the same year; the rest of the
# notebook treats this frame as the cleaned data.
cleaned = load_chalendar("../data/outputs/EBA_adjusted_elec.csv", year=year)

In [None]:
# Read the cleaned CEMS output for the selected year, parsing the timestamp
# column, then rename that column to "datetime_utc" so it shares a name with
# the timestamp column of the 930 frames it is merged with later.
cems = pd.read_csv(
    f"../data/outputs/cems_{year}_cleaned_20220505.csv",
    index_col=0,
    parse_dates=["operating_datetime_utc"],
).rename(columns={"operating_datetime_utc": "datetime_utc"})

# Visualize residual in one large BA

In [None]:
# Peek at the first rows; the cells below rely on the BA, fuel, generation and
# datetime_utc columns of this frame.
cleaned.head()

In [None]:
ba = "CISO"

# One row per (fuel, datetime_utc) of the cleaned data for this BA, with the
# matching original generation attached. The left join keeps every cleaned
# row even when the original data has no counterpart.
ba_dat = (
    cleaned.loc[cleaned["BA"] == ba, ["generation", "fuel", "datetime_utc"]]
    .copy()
    .rename(columns={"generation": "cleaned_gen"})
    .merge(
        original.loc[original["BA"] == ba, ["generation", "fuel", "datetime_utc"]]
        .copy()
        .rename(columns={"generation": "original_gen"}),
        how="left",
        on=["fuel", "datetime_utc"],
    )
)

In [None]:
#px.line(ba_dat, x="datetime_utc", y=["cleaned_gen","original_gen"], color="fuel")

In [None]:
# The 930 data reports seven fuel types: COL, NG, OIL, WAT, SUN, WND, OTH.
# Chalendar adds two more (BIO and GEO) in the cleaned data, for CISO only.
cleaned["fuel"].unique()

In [None]:
# Translate each CEMS energy_source_code through fuel_code_map (imported from
# src.eia930) so CEMS rows carry a `fuel` column comparable to the 930 data.
# NOTE(review): if fuel_code_map is a dict, codes missing from it become NaN —
# verify that every code present in CEMS is covered.
cems["fuel"] = cems["energy_source_code"].map(fuel_code_map)

In [None]:

# Aggregate CEMS net generation by timestamp, BA and fuel. Either the BA or
# the physical BA could be used as the grouping key; whichever is chosen is
# renamed to "ba_code" so downstream cells use one consistent name.
#
# The value column is selected *before* summing. Summing every column and
# selecting afterwards is wasteful, and under pandas >= 2.0 (where
# numeric_only defaults to False) it concatenates strings or raises on
# non-numeric columns.
cems_aggregated = (
    cems.groupby(["datetime_utc", "ba_code_physical", "fuel"])["net_generation_mwh"]
    .sum()
    .reset_index()
    .rename(columns={"ba_code_physical": "ba_code"})
)

# Plot residual 

In [None]:
#cems_fuels = cems_aggregated.fuel.unique() # includes SUN for some reason, don't want that
# Fixed list of fuel codes used for every residual comparison and plot below.
cems_fuels = ["COL","NG","OIL","OTH"]

In [None]:
# List the balancing authorities present in the cleaned data.
cleaned["BA"].unique()

In [None]:
ba = "PJM"

# Cleaned vs. original 930 generation for this BA, joined on fuel and
# timestamp. The left join keeps every cleaned row.
ba_dat = (
    cleaned.loc[cleaned["BA"] == ba, ["generation", "fuel", "datetime_utc"]]
    .copy()
    .rename(columns={"generation": "cleaned_gen"})
    .merge(
        original.loc[original["BA"] == ba, ["generation", "fuel", "datetime_utc"]]
        .copy()
        .rename(columns={"generation": "original_gen"}),
        how="left",
        on=["fuel", "datetime_utc"],
    )
)

# Add CEMS data. cems_aggregated already names its timestamp column
# "datetime_utc", so only the generation and BA columns need renaming. The
# previous mapping for "operating_datetime_utc" matched no column and was
# silently ignored.
cems_ba = (
    cems_aggregated.loc[cems_aggregated["ba_code"] == ba]
    .copy()
    .rename(columns={"net_generation_mwh": "cems_gen", "ba_code": "BA"})
)
ba_dat = ba_dat.merge(cems_ba, how="left", on=["fuel", "datetime_utc"])

# Only want to show fuels in cems
ba_dat = ba_dat[ba_dat.fuel.isin(cems_fuels)]

px.line(
    ba_dat,
    x="datetime_utc",
    y=["cleaned_gen", "original_gen", "cems_gen"],
    facet_col="fuel",
    facet_col_wrap=2,
)

In [None]:
ba = "MISO"
use = cleaned  # 930 frame compared against CEMS in this cell

# One heatmap per fuel of (930 generation - CEMS net generation) for this BA.
# NOTE(review): day_hour_heatmap presumably returns a 2-D day-by-hour array
# plus the labels for its x axis — confirm in src.visualization.
toshow = []
for fuel in cems_fuels:
    cleaned_baf = use.loc[(use["fuel"] == fuel) & (use["BA"] == ba)]
    cleaned_cems = cems_aggregated.loc[
        (cems_aggregated["fuel"] == fuel) & (cems_aggregated["ba_code"] == ba)
    ]
    # Index both series by timestamp so the subtraction aligns hour by hour.
    dif = cleaned_baf.set_index("datetime_utc")["generation"].sub(
        cleaned_cems.set_index("datetime_utc")["net_generation_mwh"]
    )
    fig, times = day_hour_heatmap(dif, year=year)
    toshow.append(fig)

# Stack the per-fuel maps along a new leading axis and facet on it, one row
# per fuel, with a diverging color scale centered on zero.
fig = px.imshow(
    np.stack(toshow),
    facet_col=0,
    facet_col_wrap=1,
    color_continuous_scale="RdBu",
    width=1000,
    height=600,
    x=times,
    color_continuous_midpoint=0,
    title=f"{ba}",
    template='plotly_white',
)

# Set facet names to fuels: each annotation text ends in "=<facet index>",
# which is mapped back to the fuel code at that position.
fig.for_each_annotation(
    lambda a: a.update(text=cems_fuels[int(a.text.rsplit("=", 1)[-1])])
)

# Calculate all residuals

In [None]:
# Calculate all residuals: cleaned 930 generation minus CEMS net generation
# for every (BA, fuel) pair, aligned on timestamp.
#
# Per-pair frames are collected in a list and concatenated once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all accumulated
# rows on every iteration, and depends on concat silently discarding the
# empty seed frame.
residual_frames = []

bas = np.union1d(cleaned.BA.unique(), cems_aggregated.ba_code.unique())
for ba in bas:
    print(f"{ba}...", end="")
    for fuel in cems_fuels:
        cleaned_baf = cleaned[(cleaned.fuel == fuel) & (cleaned.BA == ba)]
        cleaned_cems = cems_aggregated[
            (cems_aggregated.fuel == fuel) & (cems_aggregated.ba_code == ba)
        ]
        # Timestamps present on only one side produce NaN residuals here.
        dif = (
            cleaned_baf.set_index("datetime_utc").generation
            - cleaned_cems.set_index("datetime_utc").net_generation_mwh
        )
        dif = dif.rename("residual").to_frame()
        dif["BA"] = ba
        dif["fuel"] = fuel
        residual_frames.append(dif)

# pd.concat raises on an empty list; fall back to an empty frame as before.
residuals = pd.concat(residual_frames) if residual_frames else pd.DataFrame()

In [None]:
# Re-index the residuals by (BA, fuel, datetime_utc) and discard rows where
# the residual is missing.
r = (
    residuals.reset_index()
    .set_index(["BA", "fuel", "datetime_utc"])
    .dropna()
)

# Metrics: 

* Number of sign changes in the residual profile
* Mean of the residual profile
* % of hourly observations with a negative residual
* Generation we expect in the residual, based on the difference between a BA's annual EIA-923 generation and its CEMS generation

In [None]:
# Load the cleaned EIA-923 output for the selected year.
# NOTE(review): unlike the CEMS file read earlier, this path has no ".csv"
# extension — confirm the file on disk is really named this way.
eia923 = pd.read_csv(f"../data/outputs/eia923_clean_{year}_20220502")

In [None]:
# Calculate difference between EIA-923 (expected annual 930 generation) and
# CEMS: this is the expected magnitude of the residual.
#
# In both aggregations the value column is selected before summing. Summing
# every column first fails under pandas >= 2.0 on non-numeric columns such as
# the datetime_utc column of cems_aggregated.

# EIA-923 annual net generation as a BA x fuel table.
eia923["fuel"] = eia923["energy_source_code"].map(fuel_code_map)
net_gen_923 = (
    eia923.groupby(["ba_code", "fuel"])["net_generation_mwh"]
    .sum()
    .reset_index()
    .pivot(index="ba_code", columns="fuel", values="net_generation_mwh")
    # SUN is excluded from the comparison; tolerate it being absent already.
    .drop(columns=["SUN"], errors="ignore")
)

# CEMS annual net generation in the same BA x fuel layout.
cems_by_fuel = (
    cems_aggregated.groupby(["ba_code", "fuel"])["net_generation_mwh"]
    .sum()
    .reset_index()
    .pivot(index="ba_code", columns="fuel", values="net_generation_mwh")
    .drop(columns="SUN", errors="ignore")
)

# Cells where either table lacks a BA/fuel entry come out as NaN.
expected_residual = net_gen_923 - cems_by_fuel

# Share of each fuel's own 923 generation, and share of the BA's total 923
# generation summed across the fuel columns.
expected_residual_as_percent = expected_residual / net_gen_923
expected_residual_as_percent_of_total = expected_residual.div(
    net_gen_923.sum(axis='columns'), axis='index'
)

In [None]:
# Expected annual residual per BA, one bar series per fuel column.
px.bar(
    expected_residual,
    x=expected_residual.index,
    y=["COL", "NG", "OIL", "OTH"],
    title="Expected residual (annual MWh)",
)

In [None]:
# Expected residual relative to each fuel's own EIA-923 generation, per BA.
px.bar(
    expected_residual_as_percent,
    x=expected_residual_as_percent.index,
    y=["COL", "NG", "OIL", "OTH"],
    title="Expected residual (percent of annual generation by fuel type)",
)

In [None]:
# Expected residual relative to the BA's total EIA-923 generation.
px.bar(
    expected_residual_as_percent_of_total,
    x=expected_residual_as_percent_of_total.index,
    y=["COL", "NG", "OIL", "OTH"],
    title="Expected residual (percent of total annual generation)",
)

In [None]:
# Compare expected and actual residuals.

# Actual: the timestamp-level residuals summed per BA and fuel.
total_residual = (
    residuals.groupby(["BA", "fuel"])
    .sum()
    .reset_index()
    .assign(what="Actual (930-CEMS)")
)

# Expected: the BA x fuel table melted to long form with matching column names.
e = (
    expected_residual.melt(ignore_index=False)
    .reset_index()
    .rename(columns={"ba_code": "BA", "value": "residual"})
    .assign(what="Expected (923-CEMS)")
)

# Stack both so `what` can be used as the bar color, one facet row per fuel.
total_residual = pd.concat([total_residual, e])
px.bar(
    total_residual,
    x="BA",
    y="residual",
    color="what",
    facet_col="fuel",
    facet_col_wrap=1,
    barmode='group',
    width=1500,
    height=1000,
)


In [None]:
# Attach EIA-923 generation to every residual row, both per (BA, fuel) and as
# a BA-wide total, so the residuals can be normalized in the next cell.
# NOTE: re-running this cell merges the same columns again; re-run the
# previous cell first to start from a fresh total_residual.

# Long-form 923 generation: one row per (ba_code, fuel).
ng = (
    net_gen_923.melt(ignore_index=False)
    .reset_index()
    .rename(columns={"value": "923_by_fuel"})
)
total_residual = total_residual.merge(
    ng, how='left', left_on=["BA", "fuel"], right_on=["ba_code", "fuel"]
)

# BA-wide 923 total. Only the numeric column is summed. Summing the whole
# frame under pandas >= 2.0 also "sums" the string `fuel` column, so the merge
# below would emit fuel_x / fuel_y and break every later facet_col="fuel".
ng_tot = (
    ng.groupby("ba_code")[["923_by_fuel"]]
    .sum()
    .rename(columns={"923_by_fuel": "923_total"})
)
total_residual = total_residual.merge(
    ng_tot, how='left', left_on='BA', right_index=True
)

total_residual

In [None]:
# Normalize each residual by the 923 generation of the same fuel, and by the
# BA's total 923 generation.
total_residual["residual_frac_of_fuel_gen"] = total_residual["residual"].div(
    total_residual["923_by_fuel"]
)
total_residual["residual_frac_of_gen"] = total_residual["residual"].div(
    total_residual["923_total"]
)


In [None]:
# Expected vs. actual residuals, each divided by the 923 generation of the
# same fuel; one facet row per fuel, grouped bars per BA.
px.bar(
    total_residual,
    x="BA",
    y="residual_frac_of_fuel_gen",
    color="what",
    facet_col="fuel",
    facet_col_wrap=1,
    barmode='group',
    width=1500,
    height=1000,
    title="Residual as percent of generation by fuel",
)

In [None]:
# Compare expected and actual residuals as a share of the BA's *total* 923
# generation (all fuels combined). The title previously repeated the by-fuel
# wording of the preceding chart although a different column is plotted.

px.bar(
    total_residual,
    x="BA",
    y="residual_frac_of_gen",
    color="what",
    facet_col="fuel",
    facet_col_wrap=1,
    barmode='group',
    width=1500,
    height=1000,
    title="Residual as percent of total generation",
)

# Metrics per-BA and fuel

In [None]:
bas = r.index.get_level_values(0).unique()
fuels = r.index.get_level_values(1).unique()


def _count_sign_changes(profile):
    """Return how many consecutive pairs of values in *profile* differ in sign.

    Moves to or from exactly zero count as a change. The first value has no
    predecessor and is never counted. The earlier
    ``np.sign(x).diff().ne(0).sum()`` form counted the leading NaN produced by
    ``diff()`` and so overstated every result by one.
    """
    signs = np.sign(profile.to_numpy())
    return int(np.count_nonzero(signs[1:] != signs[:-1]))


# BA x fuel table of sign-change counts; pairs with no data stay NaN.
summary_sign_change = pd.DataFrame(index=bas, columns=fuels)  # df of ba, fuel

# Grouping on the first two index levels visits only the (BA, fuel) pairs that
# actually occur, since some BAs don't have all fuels.
for (ba, fuel), profile in r["residual"].groupby(level=[0, 1]):
    summary_sign_change.loc[ba, fuel] = _count_sign_changes(profile)

summary_sign_change

In [None]:
bas = r.index.get_level_values(0).unique()
fuels = r.index.get_level_values(1).unique()

# BA x fuel table of the mean residual; pairs with no data stay NaN.
summary_mean = pd.DataFrame(index=bas, columns=fuels)  # df of ba, fuel

for ba in bas:
    # Some BAs don't have all fuels, so iterate over the ones present.
    fuels_in_ba = r.loc[(ba)].index.get_level_values(0).unique()
    for fuel in fuels_in_ba:
        target = r.loc[(ba, fuel), :]
        summary_mean.loc[ba, fuel] = target["residual"].mean()

summary_mean

In [None]:
bas = r.index.get_level_values(0).unique()
fuels = r.index.get_level_values(1).unique()

# BA x fuel table holding the fraction of residual rows that are negative;
# pairs with no data stay NaN.
summary_lt_0 = pd.DataFrame(index=bas, columns=fuels)  # df of ba, fuel

for ba in bas:
    # Some BAs don't have all fuels, so iterate over the ones present.
    for fuel in r.loc[(ba)].index.get_level_values(0).unique():
        target = r.loc[(ba, fuel), :]
        negative_rows = (target["residual"] < 0).sum()
        summary_lt_0.loc[ba, fuel] = negative_rows / len(target)

summary_lt_0

# Plot some generation residuals

In [None]:
# Note: the figure created here is very large, so leave out of git commit by commenting.

#px.line(residuals, x=residuals.index, y='residual', color='BA', facet_col='fuel', facet_col_wrap=1,width=1500, height=1000)