# Cleaning of 930, analyze how cleaning affects residual profile

In [1]:
# data source https://gridemissions.s3.us-east-2.amazonaws.com/EBA_elec.csv.gz

In [3]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import datetime as dt

In [4]:
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

from src.visualization import day_hour_heatmap
from src.eia930 import fuel_code_map, reformat_chalendar, load_chalendar
from src.load_data import download_chalendar_files



In [6]:
year = 2020

In [7]:
# Download data if not exists
download_chalendar_files()

EBA_elec.csv already downloaded
EBA_raw.csv already downloaded


In [8]:
# Note: cleaned file written by 930_lag.py
original = load_chalendar("../data/eia930/chalendar/EBA_adjusted_rolling.csv", year=year)

Filtering
Expanding cols
Dropping and renaming


In [9]:
cleaned = load_chalendar("../data/eia930/chalendar/EBA_adjusted_elec.csv", year=year)

Filtering
Expanding cols
Dropping and renaming


In [115]:
cems = pd.read_csv(f"../data/output/cems_{year}_cleaned_20220505.csv", index_col=0, parse_dates=['operating_datetime_utc'])
cems = cems.rename(columns={"operating_datetime_utc":"datetime_utc"})


Columns (22) have mixed types. Specify dtype option on import or set low_memory=False.



# Visualize residual in one large BA

In [12]:
cleaned.head()

Unnamed: 0,datetime_utc,variable,generation,BA,fuel
0,2020-01-01 00:00:00+00:00,EBA.AEC-ALL.NG.COL.H,59.449007,AEC,COL
1,2020-01-01 01:00:00+00:00,EBA.AEC-ALL.NG.COL.H,57.441636,AEC,COL
2,2020-01-01 02:00:00+00:00,EBA.AEC-ALL.NG.COL.H,38.705315,AEC,COL
3,2020-01-01 03:00:00+00:00,EBA.AEC-ALL.NG.COL.H,72.99278,AEC,COL
4,2020-01-01 04:00:00+00:00,EBA.AEC-ALL.NG.COL.H,62.330264,AEC,COL


In [13]:
ba = "CISO"

ba_dat = cleaned.loc[cleaned["BA"] == ba, ["generation","fuel","datetime_utc"]].copy().rename(columns={"generation":"cleaned_gen"})
ba_dat = ba_dat.merge(\
    original.loc[original["BA"]==ba, ["generation","fuel","datetime_utc"]].copy().rename(columns={"generation":"original_gen"}),\
    how='left', on=["fuel","datetime_utc"])

In [127]:
#px.line(ba_dat, x="datetime_utc", y=["cleaned_gen","original_gen"], color="fuel")

In [15]:
# 930 fuel types are COL, NG, OIL, WAT, SUN, WND, OTH
# Chalendar adds two others: BIO, GEO in cleaned data, only in CISO
cleaned.fuel.unique()

array(['COL', 'NG', 'NUC', 'OIL', 'OTH', 'SUN', 'WAT', 'WND', 'GEO',
       'BIO'], dtype=object)

In [116]:
cems["fuel"] = cems["energy_source_code"].map(fuel_code_map)

In [122]:

# aggregate by either BA or physical BA, either way, make naming consistent
cems_aggregated = cems.groupby(["datetime_utc","ba_code_physical","fuel"]).sum()["net_generation_mwh"].reset_index().rename(columns={"ba_code_physical":"ba_code"})

# Plot residual 

In [123]:
#cems_fuels = cems_aggregated.fuel.unique() # includes SUN for some reason, dont' want that
cems_fuels = ["COL","NG","OIL","OTH"]

In [120]:
cleaned.BA.unique()

array(['AEC', 'AECI', 'AZPS', 'CISO', 'CPLE', 'CPLW', 'DUK', 'EEI',
       'ERCO', 'FMPP', 'FPC', 'GRID', 'GVL', 'IPCO', 'ISNE', 'JEA',
       'LDWP', 'LGEE', 'MISO', 'NEVP', 'NWMT', 'NYIS', 'OVEC', 'PACE',
       'PACW', 'PGE', 'PJM', 'PNM', 'PSCO', 'PSEI', 'SC', 'SCEG', 'SEC',
       'SOCO', 'SRP', 'SWPP', 'TEC', 'TEPC', 'TVA', 'WACM', 'WALC', 'AVA',
       'AVRN', 'BANC', 'BPAT', 'DEAA', 'EPE', 'FPL', 'GRIF', 'HGMA',
       'HST', 'IID', 'TAL', 'TIDC', 'NSB', 'AESO', 'CEN', 'CFE', 'GRMA',
       'IESO', 'NBSO', 'SPA', 'SPC', 'BCHA', 'CHPD', 'DOPD', 'GCPD',
       'HQT', 'MHEB', 'SCL', 'SEPA', 'TPWR', 'WAUW', 'YAD', 'GWA', 'WWA'],
      dtype=object)

In [125]:
ba = "PJM"

ba_dat = cleaned.loc[cleaned["BA"] == ba, ["generation","fuel","datetime_utc"]].copy().rename(columns={"generation":"cleaned_gen"})
ba_dat = ba_dat.merge(\
    original.loc[original["BA"]==ba, ["generation","fuel","datetime_utc"]].copy().rename(columns={"generation":"original_gen"}),\
    how='left', on=["fuel","datetime_utc"])

# add cems data
cems_ba = cems_aggregated.loc[cems_aggregated["ba_code"]==ba].copy()\
    .rename(columns={"net_generation_mwh":"cems_gen","operating_datetime_utc":"index","ba_code":"BA"})
ba_dat = ba_dat.merge(cems_ba, how='left', on=['fuel','datetime_utc'])

# Only want to show fuels in cems 
ba_dat = ba_dat[ba_dat.fuel.isin(cems_fuels)]

px.line(ba_dat, x="datetime_utc", y=["cleaned_gen","original_gen","cems_gen"], facet_col="fuel", facet_col_wrap=2)

In [126]:
ba = "MISO"
use = cleaned

toshow = []
for fuel in cems_fuels:
    cleaned_baf = use[(use.fuel==fuel) & (use.BA==ba)]
    cleaned_cems = cems_aggregated[(cems_aggregated.fuel==fuel) & (cems_aggregated.ba_code==ba)]
    dif = cleaned_baf.set_index("datetime_utc").generation - cleaned_cems.set_index("datetime_utc").net_generation_mwh
    fig, times = day_hour_heatmap(dif, year=year)
    toshow.append(fig)
    # map, times = day_hour_heatmap(cleaned_baf - cems_baf)

fig = px.imshow(np.stack(toshow), facet_col=0, facet_col_wrap=1,color_continuous_scale="RdBu", \
    width=1000, height=600, x=times, color_continuous_midpoint=0, \
    title=f"{ba}", template='plotly_white')

# Set facet names to fuels
fig.for_each_annotation(lambda a: a.update(text=cems_fuels[int(a.text.split("=")[-1])]))

# Calculate all residuals

In [24]:
residuals = pd.DataFrame()

# Calculate all residuals 
bas = np.union1d(cleaned.BA.unique(), cems_aggregated.ba_code.unique())
for ba in bas:
    print(f"{ba}...", end="")
    for fuel in cems_fuels:
        cleaned_baf = cleaned[(cleaned.fuel==fuel) & (cleaned.BA==ba)]
        cleaned_cems = cems_aggregated[(cems_aggregated.fuel==fuel) & (cems_aggregated.ba_code==ba)]
        dif = cleaned_baf.set_index("datetime_utc").generation - cleaned_cems.set_index("datetime_utc").net_generation_mwh
        dif = dif.rename("residual").to_frame()
        dif["BA"] = ba
        dif["fuel"] = fuel
        residuals = pd.concat([residuals,dif])

AEC...AECI...AESO...AVA...AVRN...AZPS...BANC...BCHA...BPAT...CEN...CFE...CHPD...CISO...CPLE...CPLW...CSTO...DEAA...DOPD...DUK...EEI...EPE...ERCO...FMPP...FPC...FPL...GCPD...GRID...GRIF...GRMA...GVL...GWA...HGMA...HQT...HST...IESO...IID...IPCO...ISNE...JEA...LDWP...LGEE...MHEB...MISO...NBSO...NEVP...NSB...NWMT...NYIS...OVEC...PACE...PACW...PGE...PJM...PNM...PSCO...PSEI...SC...SCEG...SCL...SEC...SEPA...SOCO...SPA...SPC...SRP...SWPP...TAL...TEC...TEPC...TIDC...TPWR...TVA...WACM...WALC...WAUW...WWA...YAD...

In [25]:
r = residuals.reset_index().set_index(["BA","fuel","datetime_utc"])
r = r.dropna()

# Metrics: 

* Number of sign changes in residual profile
* Mean residual profile
* % of days with negative residual
* Generation we expect in residual, based on difference between BA's 923 generation and 930 generation 

In [26]:
eia923 = pd.read_csv(f"../data/output/eia923_clean_{year}_20220502")

In [68]:
# Calculate difference between EIA-923 (expected annual 930 generation) and CEMS: this is the expected magnitude of the residual 
# group by fuel and BA
eia923["fuel"] = eia923["energy_source_code"].map(fuel_code_map)
net_gen_923 = (eia923.groupby(["ba_code","fuel"]).sum()).reset_index()
net_gen_923 = net_gen_923.pivot(index="ba_code",columns="fuel", values="net_generation_mwh").drop(columns=["SUN"])

# difference from CEMS 
cems_by_fuel = cems_aggregated.groupby(["ba_code","fuel"]).sum().reset_index().pivot(index="ba_code",columns="fuel",values="net_generation_mwh").drop(columns="SUN")

expected_residual = net_gen_923 - cems_by_fuel

expected_residual_as_percent = expected_residual/net_gen_923
expected_residual_as_percent_of_total = expected_residual.div(net_gen_923.sum(axis='columns'), axis='index')

In [75]:
px.bar(expected_residual, x=expected_residual.index, y=["COL","NG","OIL","OTH"], title="Expected residual (annual MWh)")

In [72]:
px.bar(expected_residual_as_percent, x=expected_residual_as_percent.index, y=["COL","NG","OIL","OTH"], title="Expected residual (percent of annual generation by fuel type)")

In [76]:
px.bar(expected_residual_as_percent_of_total, x=expected_residual_as_percent_of_total.index,
    y=["COL","NG","OIL","OTH"],title="Expected residual (percent of total annual generation)")

In [106]:
# Compare expected and actual residuals
total_residual = residuals.groupby(["BA","fuel"]).sum().reset_index()
total_residual["what"] = "Actual (930-CEMS)"

e = expected_residual.melt(ignore_index=False).reset_index().rename(columns={"ba_code":"BA","value":"residual"})
e["what"] = "Expected (923-CEMS)"

total_residual = pd.concat([total_residual, e])
px.bar(total_residual, x="BA",y="residual",color="what",facet_col="fuel",facet_col_wrap=1, barmode='group', width=1500, height=1000)


In [107]:
ng = net_gen_923.melt(ignore_index=False).reset_index().rename(columns={"value":"923_by_fuel"})
total_residual = total_residual.merge(ng, how='left',left_on=["BA","fuel"], right_on=["ba_code","fuel"])

ng_tot = ng.groupby("ba_code").sum().rename(columns={"923_by_fuel":"923_total"})
total_residual = total_residual.merge(ng_tot, how='left', left_on='BA', right_index=True)

total_residual

Unnamed: 0,BA,fuel,residual,what,ba_code,923_by_fuel,923_total
0,AEC,COL,-5.909145e+01,Actual (930-CEMS),AEC,3.184340e+05,4.846521e+06
1,AEC,NG,-2.188322e+05,Actual (930-CEMS),AEC,4.528087e+06,4.846521e+06
2,AEC,OTH,0.000000e+00,Actual (930-CEMS),AEC,,4.846521e+06
3,AECI,COL,-1.954827e+04,Actual (930-CEMS),AECI,1.306415e+07,2.340813e+07
4,AECI,NG,-2.751171e+06,Actual (930-CEMS),AECI,1.034343e+07,2.340813e+07
...,...,...,...,...,...,...,...
407,TIDC,OTH,,Expected (923-CEMS),TIDC,,1.501050e+06
408,TPWR,OTH,,Expected (923-CEMS),TPWR,3.114870e+05,3.114870e+05
409,TVA,OTH,,Expected (923-CEMS),TVA,1.277450e+06,7.509361e+07
410,WACM,OTH,,Expected (923-CEMS),WACM,,3.254784e+07


In [108]:
total_residual["residual_frac_of_fuel_gen"]  = total_residual["residual"]/total_residual["923_by_fuel"]
total_residual["residual_frac_of_gen"]  = total_residual["residual"]/total_residual["923_total"]


In [109]:
# Compare expected and actual residuals as percent of  923 generation 

px.bar(total_residual, x="BA",y="residual_frac_of_fuel_gen",color="what",facet_col="fuel",facet_col_wrap=1, barmode='group', 
    width=1500, height=1000, title="Residual as percent of generation by fuel")

In [111]:
# Compare expected and actual residuals as percent of  923 generation 

px.bar(total_residual, x="BA",y="residual_frac_of_gen",color="what",facet_col="fuel",facet_col_wrap=1, barmode='group', 
    width=1500, height=1000, title="Residual as percent of generation by fuel")

# Metrics per-BA and fuel

In [27]:
bas = r.index.get_level_values(0).unique()
fuels = r.index.get_level_values(1).unique()

summary_sign_change = pd.DataFrame(index=bas, columns=fuels) # df of ba, fuel

for ba in bas:
    for fuel in r.loc[(ba)].index.get_level_values(0).unique(): # some bas don't have all fuels
        target = r.loc[(ba,fuel),:]
        val = np.sign(target).diff().ne(0).sum()
        summary_sign_change.loc[ba,fuel] = val.to_numpy()[0]

summary_sign_change

fuel,COL,NG,OTH,OIL
BA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AEC,175.0,812.0,,
AECI,2155.0,453.0,,
AVA,,403.0,,
AVRN,,591.0,,
AZPS,7.0,355.0,,
BANC,,301.0,,
BPAT,,69.0,,
CISO,,256.0,1217.0,
CPLE,340.0,994.0,1.0,1.0
DEAA,,710.0,,


In [27]:
bas = r.index.get_level_values(0).unique()
fuels = r.index.get_level_values(1).unique()

summary_mean = pd.DataFrame(index=bas, columns=fuels) # df of ba, fuel

for ba in bas:
    for fuel in r.loc[(ba)].index.get_level_values(0).unique(): # some bas don't have all fuels
        target = r.loc[(ba,fuel),:]
        val = target.mean()
        summary_mean.loc[ba,fuel] = val.to_numpy()[0]

summary_mean

fuel,COL,NG,OTH,OIL
BA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AEC,0.03782,-24.849583,,
AECI,-2.165411,-324.117276,,
AVA,,6.953406,,
AVRN,,-86.431693,,
AZPS,-888.552165,265.262562,,
BANC,,193.555769,,
BPAT,,-449.967714,,
CISO,,1661.05998,-25.602259,
CPLE,147.588958,-17.947805,890.95558,43.224217
DEAA,,-20.72181,,


In [44]:
bas = r.index.get_level_values(0).unique()
fuels = r.index.get_level_values(1).unique()

summary_lt_0 = pd.DataFrame(index=bas, columns=fuels) # df of ba, fuel

for ba in bas:
    for fuel in r.loc[(ba)].index.get_level_values(0).unique(): # some bas don't have all fuels
        target = r.loc[(ba,fuel),:]
        val = (target < 0).residual.sum()/len(target)
        summary_lt_0.loc[ba,fuel] = val

summary_lt_0

fuel,COL,NG,OTH,OIL
BA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AEC,0.737401,0.617111,,
AECI,0.488722,0.862207,,
AVA,,0.619633,,
AVRN,,0.797173,,
AZPS,0.994861,0.289829,,
BANC,,0.132976,,
BPAT,,0.983751,,
CISO,,0.01242,0.896271,
CPLE,0.326149,0.514751,0.0,0.04
DEAA,,0.775364,,


# Plot some generation residuals

In [128]:
# Note: the figure created here is very large, so leave out of git commit by commenting.

#px.line(residuals, x=residuals.index, y='residual', color='BA', facet_col='fuel', facet_col_wrap=1,width=1500, height=1000)