# Validate EIA-930 data against net generation outputs

In [None]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
import plotly.io as pio
from datetime import datetime
from datetime import timedelta
import json

import requests

In [None]:
import sys
sys.path.append("../../")

import src.load_data as load_data

In [None]:
# Data year to validate; interpolated into the input and output paths built below.
year = 2020

In [None]:
# EIA-930 data after timestamp adjustments but no cleaning.
# The path follows the notebook-level `year` instead of a hard-coded 2020 so that
# changing `year` switches every input consistently.
raw = pd.read_csv(
    f"{load_data.data_folder()}/outputs/{year}/eia930/eia930_raw.csv",
    index_col=0,
    parse_dates=True,
)

In [None]:
# Compare the hourly EIA-930 net generation series with our hourly net generation,
# one balancing authority (BA) file at a time.
GEN_ID = "EBA.{}-ALL.NG.H"  # template for a BA's EIA-930 net generation column name
path = f"{load_data.data_folder()}/results/{year}/power_sector_data/hourly/us_units/"
cors = {}  # BA -> correlation between our series and the EIA-930 series
percent_difs = {}  # BA -> median of (EIA-930 - ours) / ours
annual_gen = {}  # BA -> sum of net_generation_mwh over the merged hours
for ba_f in os.listdir(path):
    ba = ba_f.replace(".csv", "")
    print(ba, end="...")
    col_name = GEN_ID.format(ba)
    # Nothing to compare against when EIA-930 has no column for this BA
    if col_name not in raw.columns:
        continue

    dat = pd.read_csv(path + ba_f, parse_dates=["datetime_utc"])
    dat = dat[dat.fuel_category == "total"]
    # Default inner merge: keep only hours present in both sources
    dat = dat.merge(raw[col_name], left_on="datetime_utc", right_index=True)

    cors[ba] = dat[["net_generation_mwh", col_name]].corr().to_numpy()[0, 1]

    difs = (dat[col_name] - dat["net_generation_mwh"]) / dat["net_generation_mwh"]
    # Hours with zero generation on our side divide to +inf or -inf depending on the
    # sign of the numerator; drop both so they cannot skew the median.
    difs = difs.replace([np.inf, -np.inf], np.nan)
    percent_difs[ba] = difs.median()
    annual_gen[ba] = dat["net_generation_mwh"].sum()

In [None]:
# Gather the per-BA metrics into one table, order it from the largest BA down,
# and write it to the validation metrics folder.
generation_metrics = {
    "Difference as percent of hourly-egrid": percent_difs,
    "Correlation": cors,
    "Annual BA generation": annual_gen,
}
out = pd.DataFrame(data=generation_metrics).sort_values(
    "Annual BA generation", ascending=False
)
out.to_csv(
    f"{load_data.data_folder()}/results/{year}/validation_metrics/us_units/compare_930_hourlyegrid.csv"
)

# Visualize BA of interest

In [None]:
# Overlay our hourly net generation and the EIA-930 series for a single BA.
ba = "BPAT"
col_name = GEN_ID.format(ba)
ba_hourly = pd.read_csv(f"{path}{ba}.csv", parse_dates=["datetime_utc"])
ba_total = ba_hourly[ba_hourly["fuel_category"] == "total"]
dat = ba_total.merge(raw[col_name], left_on="datetime_utc", right_index=True)

# Left as the last expression so the notebook renders the figure
px.line(dat, x="datetime_utc", y=["net_generation_mwh", col_name])

# Calculate real-time rates from EIA-930 + eGRID

In [None]:
# Raw EIA-930 data indexed by timestamp. Resolved through load_data.data_folder(),
# like the other reads in this notebook, instead of a path relative to the
# current working directory.
eia930 = pd.read_csv(
    f"{load_data.data_folder()}/outputs/{year}/eia930/eia930_raw.csv",
    parse_dates=True,
    index_col=0,
)

In [None]:
## Load factors from Singularity API

# Use last year's egrid because that's all we have in real time
# TODO: could expand to other pollutants if we use eGRID download
url = "https://api.singularity.energy/v1/emissions/"
egrid_year = str(year - 1)  # use last year as eGRID year

# The API key is read from the environment; a missing variable raises KeyError here
headers = {
    "X-Api-Key": os.environ["SINGULARITY_API_KEY"],
}

# factors["adjusted" | "unadjusted"] holds the "data" payload of the matching response
factors = {}

for adjustment in ["adjusted", "unadjusted"]:
    # The unadjusted factor set is requested with a "u" in front of the year
    key = f"EGRID_{egrid_year}" if adjustment == "adjusted" else f"EGRID_u{egrid_year}"
    # A timeout keeps the cell from hanging indefinitely on a stalled connection
    response = requests.get(url + key, headers=headers, timeout=60)
    # Surface HTTP errors (bad key, unknown factor set) here rather than as a
    # confusing KeyError on "data" below
    response.raise_for_status()
    factors[adjustment] = response.json()["data"]


In [None]:
## For each BA, use singularity factors to calculate emission rate
# Writes one CSV per BA holding, for both factor sets, the summed
# generation-times-factor value ("*_carbon") and that value divided by the BA's
# EIA-930 net generation ("*_rate").
bas_to_calc = [
    ba.replace(".csv", "")
    for ba in os.listdir(
        f"{load_data.data_folder()}/results/{year}/power_sector_data/hourly/us_units/"
    )
]

# Fuel name as keyed in the Singularity factors -> fuel code used in the EIA-930 column names
fuel_categories = {
    "coal": "COL",
    "natural_gas": "NG",
    "other": "OTH",
    "hydro": "WAT",
    "wind": "WND",
    "solar": "SUN",
    "nuclear": "NUC",
    "petroleum": "OIL",
}

adjustments = ["adjusted", "unadjusted"]

# Create the output folder once, up front, rather than on every iteration
rate_dir = f"{load_data.data_folder()}/outputs/{year}/validation/real_time_rate/"
os.makedirs(rate_dir, exist_ok=True)

for ba in bas_to_calc:
    # NOTE(review): EIA_REGIONS is not defined anywhere in the visible notebook; it must
    # be defined before this cell runs (presumably the BA codes that the Singularity
    # API prefixes with "EIA." -- confirm).
    singularity_ba = "EIA." + ba if ba in EIA_REGIONS else ba
    # Require the BA in both factor sets; checking explicitly avoids depending on a
    # loop variable left over from an earlier cell.
    if any(singularity_ba not in factors[adjustment] for adjustment in adjustments):
        print(f"missing ba {singularity_ba}")
        continue

    out = pd.DataFrame(
        index=eia930.index,
        columns=["adjusted_carbon", "unadjusted_carbon", "adjusted_rate", "unadjusted_rate"],
    )

    for adjustment in adjustments:
        ba_factors = factors[adjustment][singularity_ba]
        s_fuels = list(ba_factors.keys())
        s_factors = [ba_factors[f]["value"] for f in s_fuels]
        fuels = [fuel_categories[f] for f in s_fuels]
        generation_labels = [f"EBA.{ba}-ALL.NG.{f}.H" for f in fuels]

        # Weight each fuel's hourly generation by its factor, then sum across fuels
        out[f"{adjustment}_carbon"] = (
            eia930[generation_labels].mul(s_factors, axis="columns").sum(axis="columns")
        )
        # Rate = summed emissions divided by the BA's total EIA-930 net generation
        out[f"{adjustment}_rate"] = out[f"{adjustment}_carbon"] / eia930[f"EBA.{ba}-ALL.NG.H"]

    out.to_csv(f"{rate_dir}{ba}.csv")


# Rate: correlations and percent differences

Evaluate the real-time emission rates against our hourly generated emission rates.

In [None]:
# Folders holding our hourly results: generation-based and consumption-based
results_root = f"{load_data.data_folder()}/results/{year}"
gen_path = results_root + "/power_sector_data/hourly/us_units/"
consumed_path = results_root + "/carbon_accounting/hourly/us_units/"

In [None]:
# Data year for this section. The path cell just above already interpolates `year`,
# so `year` must be set before that cell is run.
year = 2020

In [None]:
# Compare the real-time adjusted rate with our adjusted generated CO2 rate for every
# BA that has a real-time rate file.
rate_dir = f"{load_data.data_folder()}/outputs/{year}/validation/real_time_rate/"
our_rate_col = "generated_co2_rate_lb_per_mwh_for_electricity_adjusted"
percent_difs = {}  # BA -> median of (real-time - ours) / ours
cors = {}  # BA -> correlation between the two hourly rates
for ba_file in os.listdir(rate_dir):
    # Skip OS artifacts such as .DS_Store and anything else that is not a rate CSV
    if not ba_file.endswith(".csv"):
        continue
    ba = ba_file.replace(".csv", "")
    singularity_dat = pd.read_csv(f"{rate_dir}{ba}.csv", index_col=0, parse_dates=True)
    hourly_generated = pd.read_csv(
        gen_path + ba + ".csv",
        usecols=[
            "datetime_utc",
            "generated_co2_rate_lb_per_mwh_for_electricity",
            "generated_co2_rate_lb_per_mwh_for_electricity_adjusted",
            "co2_mass_lb",
            "fuel_category",
        ],
        index_col="datetime_utc",
        parse_dates=True,
    )
    # Keep only the all-fuel total rows so there is one row per hour
    hourly_generated = hourly_generated.loc[hourly_generated.fuel_category == "total"]
    hourly_generated = hourly_generated.sort_index()
    # Align both sources on their timestamp index
    all_dat = pd.concat([singularity_dat, hourly_generated], axis="columns")
    all_dat = all_dat.sort_index()

    cors[ba] = all_dat[[our_rate_col, "adjusted_rate"]].corr().to_numpy()[0, 1]

    difs = (all_dat["adjusted_rate"] - all_dat[our_rate_col]) / all_dat[our_rate_col]
    # Hours where our rate is zero divide to +/-inf; drop them before the median,
    # as the generation comparison does.
    difs = difs.replace([np.inf, -np.inf], np.nan)
    percent_difs[ba] = difs.median()



In [None]:
# Assemble the per-BA rate metrics, order by BA size (largest first), and save
rate_metrics = {
    "Difference as percent of OGE": percent_difs,
    "Correlation": cors,
    "Annual BA generation": annual_gen,
}
out = pd.DataFrame(data=rate_metrics).sort_values(
    "Annual BA generation", ascending=False
)
out.to_csv(
    f"{load_data.data_folder()}/results/{year}/validation_metrics/us_units/compare_real_time_rates.csv"
)

In [None]:
# Preview the first rows of the metrics table
out.head()

# Visualize emission rate differences

In [None]:
# For one-off interactive plotting
ba_of_interest = "DEAA"

real_time = pd.read_csv(
    f"{load_data.data_folder()}/outputs/{year}/validation/real_time_rate/{ba_of_interest}.csv",
    index_col=0,
    parse_dates=True,
)
# Drop any hours before the start of the data year (follows `year`, not a fixed 2020)
real_time = real_time[f"{year}-01-01T00:00":]

hourly_consumed = pd.read_csv(
    consumed_path + ba_of_interest + ".csv",
    usecols=[
        "datetime_utc",
        "consumed_co2_rate_lb_per_mwh_for_electricity",
        "consumed_co2_rate_lb_per_mwh_for_electricity_adjusted",
    ],
    index_col="datetime_utc",
    parse_dates=True,
)
hourly_generated = pd.read_csv(
    gen_path + ba_of_interest + ".csv",
    usecols=[
        "datetime_utc",
        "generated_co2_rate_lb_per_mwh_for_electricity",
        "generated_co2_rate_lb_per_mwh_for_electricity_adjusted",
        "co2_mass_lb",
        "fuel_category",
    ],
    index_col="datetime_utc",
    parse_dates=True,
)

# Align all three sources on their timestamp index; only the all-fuel total rows of
# the generation file are used
all_dat = pd.concat(
    [real_time, hourly_consumed, hourly_generated.loc[hourly_generated.fuel_category == "total"]],
    axis="columns",
)
all_dat = all_dat.sort_index()

# NOTE(review): the axis label says "Adjusted", but the plotted column from our data is
# the unadjusted generated rate, while the correlation cell compares the *_adjusted
# column against adjusted_rate -- confirm which column is intended here.
fig = px.line(
    all_dat,
    x=all_dat.index,
    y=["generated_co2_rate_lb_per_mwh_for_electricity", "adjusted_rate"],
    title=f"{ba_of_interest} rate comparison",
    labels={
        "value": "Adjusted CO2 emission rate (lb/mwh)",
        "index": "Hour",
    },
)

# Replace the raw column names in the legend with readable labels; left as the last
# expression so the notebook renders the returned figure
newnames = {'generated_co2_rate_lb_per_mwh_for_electricity': 'Our data', 'adjusted_rate': 'Real-time data'}
fig.for_each_trace(lambda t: t.update(name=newnames[t.name]))

In [None]:
# What's happening Jun 16?
# Load every column of the hourly generation file for the BA of interest
investigate_file = f"{gen_path}{ba_of_interest}.csv"
to_investigate = pd.read_csv(investigate_file, parse_dates=True, index_col="datetime_utc")

In [None]:
# Dump what is selected for 2020-06-16T14:00 to a scratch file on the desktop
rows_to_inspect = to_investigate.loc["2020-06-16T14:00"]
rows_to_inspect.to_csv("~/Desktop/plant_")

# Outputs

In [None]:
# Plot and save all BAs
# For every real-time rate file, draw our generated rate against the real-time
# adjusted rate and write the figure to the viz folder as a JPG.
# The listing path must be an f-string: without the prefix, os.listdir receives the
# literal text "{load_data.data_folder()}/..." and fails.
rate_dir = f"{load_data.data_folder()}/outputs/{year}/validation/real_time_rate/"
viz_dir = f"{load_data.data_folder()}/outputs/viz/"
# Make sure the image folder exists before the first write
os.makedirs(viz_dir, exist_ok=True)

newnames = {'generated_co2_rate_lb_per_mwh_for_electricity': 'Our data', 'adjusted_rate': 'Real-time data'}

for rate_file in os.listdir(rate_dir):
    # Skip OS artifacts such as .DS_Store and anything else that is not a rate CSV
    if not rate_file.endswith(".csv"):
        continue
    ba_of_interest = rate_file.replace(".csv", "")

    real_time = pd.read_csv(f"{rate_dir}{ba_of_interest}.csv", index_col=0, parse_dates=True)
    # Drop any hours before the start of the data year
    real_time = real_time[f"{year}-01-01T00:00":]

    hourly_consumed = pd.read_csv(
        consumed_path + ba_of_interest + ".csv",
        usecols=[
            "datetime_utc",
            "consumed_co2_rate_lb_per_mwh_for_electricity",
            "consumed_co2_rate_lb_per_mwh_for_electricity_adjusted",
        ],
        index_col="datetime_utc",
        parse_dates=True,
    )
    hourly_generated = pd.read_csv(
        gen_path + ba_of_interest + ".csv",
        usecols=[
            "datetime_utc",
            "generated_co2_rate_lb_per_mwh_for_electricity",
            "generated_co2_rate_lb_per_mwh_for_electricity_adjusted",
            "co2_mass_lb",
            "fuel_category",
        ],
        index_col="datetime_utc",
        parse_dates=True,
    )

    # Align all sources on their timestamp index; only the all-fuel total rows of the
    # generation file are used
    all_dat = pd.concat(
        [real_time, hourly_consumed, hourly_generated.loc[hourly_generated.fuel_category == "total"]],
        axis="columns",
    )
    all_dat = all_dat.sort_index()

    fig = px.line(
        all_dat,
        x=all_dat.index,
        y=["generated_co2_rate_lb_per_mwh_for_electricity", "adjusted_rate"],
        title=f"{ba_of_interest} rate comparison",
        labels={
            "value": "Adjusted CO2 emission rate (lb/mwh)",
            "index": "Hour",
        },
    )

    # Replace the raw column names in the legend with readable labels
    fig.for_each_trace(lambda t: t.update(name=newnames[t.name]))
    pio.write_image(fig, f"{viz_dir}{ba_of_interest}.jpg", width=1000, height=400, scale=3)