# Cleaning of EIA-930 data: analyze how cleaning affects the residual profile

In [None]:
# data source https://gridemissions.s3.us-east-2.amazonaws.com/EBA_elec.csv.gz


In [None]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import datetime as dt

In [None]:
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules.
# Depending on how your jupyter handles working directories, this may not be needed.
import sys

sys.path.append("../../src")

from oge.visualization import day_hour_heatmap
from oge.eia930 import (
    reformat_chalendar,
    load_chalendar,
    load_chalendar_for_pipeline,
)
from oge.download_data import download_chalendar_files
from oge.data_cleaning import distribute_monthly_eia_data_to_hourly
from oge.impute_hourly_profiles import aggregate_for_residual, calculate_residual

In [None]:
# Data year analyzed in this notebook; used below to pick input files and filter rows.
year = 2020

In [None]:
# Download the chalendar data if it does not already exist.
# NOTE(review): the skip-if-present behavior is taken from the original comment —
# confirm against download_chalendar_files.
download_chalendar_files()

In [None]:
# Load the "original" EIA-930 series for the analysis year from the
# rolling-adjusted chalendar file, then preview the first five rows.
original = load_chalendar_for_pipeline(
    "../data/eia930/chalendar/EBA_adjusted_rolling.csv",
    year=year,
)
original.head()

In [None]:
# Load the "cleaned" EIA-930 series for the analysis year from the
# elec-adjusted chalendar file, then preview the first five rows.
cleaned = load_chalendar_for_pipeline(
    "../data/eia930/chalendar/EBA_adjusted_elec.csv",
    year=year,
)
cleaned.head()

In [None]:
# Name column same as CEMS. TODO: make eia930 output use this name
# NOTE: the mapping below renames "datetime_utc" to itself, so this line currently
# changes no column names.
cleaned = cleaned.rename(columns={"datetime_utc": "datetime_utc"})

In [None]:
# Read the hourly CEMS data written by the main data pipeline, parsing the UTC
# timestamp column as datetimes, and preview the first five rows.
cems = pd.read_csv(f"../data/outputs/cems_{year}.csv", parse_dates=["datetime_utc"])
cems.head()

# Aggregate CEMS data and merge with EIA-930 data

In [None]:
# # combine original and cleaned EIA-930 data
# combined_data = cleaned.merge(
#     original[["ba_code", "fuel_category", "datetime_utc", "net_generation_mwh_930"]],
#     how="left",
#     on=["ba_code", "fuel_category", "datetime_utc"],
#     suffixes=("_clean", "_orig"),
# )
# combined_data


In [None]:
# Load plant attributes (including BA codes)
# This table is merged onto the CEMS and EIA-923 data by "plant_id_eia" further down.
plant_attributes = pd.read_csv("../data/outputs/plant_static_attributes.csv")

In [None]:
# Preview the first rows of the plant attributes table.
plant_attributes.head()

In [None]:
# Attach the static plant attributes to every CEMS row (left join on plant id).
cems = pd.merge(cems, plant_attributes, how="left", on="plant_id_eia")

In [None]:
# Aggregate CEMS by physical BA with the transmission flag set; the result is
# displayed, not stored.
aggregate_for_residual(
    cems,
    time_key="datetime_utc",
    ba_key="ba_code_physical",
    transmission=True,
)

In [None]:
# Perform different groupby operations so that we can compare different ways of
# aggregating the CEMS data.


def _sum_cems_generation(frame, ba_column, output_column):
    """Sum CEMS net generation by BA, EIA-930 fuel category and UTC timestamp.

    Args:
        frame: CEMS rows to aggregate. Must contain ``ba_column``,
            "fuel_category_eia930", "datetime_utc" and "net_generation_mwh".
        ba_column: BA column to group on ("ba_code" or "ba_code_physical").
        output_column: name given to the summed "net_generation_mwh" column.

    Returns:
        DataFrame with columns "ba_code", "fuel_category", "datetime_utc" and
        ``output_column``.
    """
    return (
        frame.groupby([ba_column, "fuel_category_eia930", "datetime_utc"])[
            "net_generation_mwh"
        ]
        .sum()
        .reset_index()
        .rename(
            columns={
                ba_column: "ba_code",
                "fuel_category_eia930": "fuel_category",
                "net_generation_mwh": output_column,
            }
        )
    )


# Rows where distribution_flag is False. The comparison must be elementwise:
# `series is False` is an identity test that evaluates to the scalar False, so
# indexing with it raises a KeyError instead of filtering rows.
cems_transmission = cems[cems["distribution_flag"] == False]  # noqa: E712

# aggregate all generation by commercial BA
cems_bac_all = _sum_cems_generation(cems, "ba_code", "net_generation_mwh_bac_all")

# aggregate all generation by physical BA
cems_bap_all = _sum_cems_generation(
    cems, "ba_code_physical", "net_generation_mwh_bap_all"
)

# Aggregate transmission-connected generation by commercial BA
cems_bac_trans = _sum_cems_generation(
    cems_transmission, "ba_code", "net_generation_mwh_bac_trans"
)

# Aggregate transmission-connected generation by physical BA
cems_bap_trans = _sum_cems_generation(
    cems_transmission, "ba_code_physical", "net_generation_mwh_bap_trans"
)

In [None]:
# Combine the cleaned and original EIA-930 series first. Both frames carry a
# "net_generation_mwh_930" column, so the suffixes produce
# "net_generation_mwh_930_clean" and "net_generation_mwh_930_orig" -- the column
# names that the plotting, residual and scaling cells further down read.
hourly_keys = ["ba_code", "fuel_category", "datetime_utc"]
combined_data = cleaned.merge(
    original[hourly_keys + ["net_generation_mwh_930"]],
    how="left",
    on=hourly_keys,
    suffixes=("_clean", "_orig"),
)

# merge the aggregated data into the EIA data, filling unmatched rows with 0
for cems_aggregation in (cems_bac_all, cems_bap_all, cems_bac_trans, cems_bap_trans):
    combined_data = combined_data.merge(
        cems_aggregation, how="left", on=hourly_keys
    ).fillna(0)
combined_data

In [None]:
# Display the cleaned EIA-930 frame for reference.
cleaned

In [None]:
# Restrict the data to rows whose local timestamp falls in the analysis year.
local_year = combined_data["datetime_local"].apply(lambda ts: ts.year)
combined_data = combined_data[local_year == year]

# Evaluate BA mappings

Mapping options: 
Physical or commercial BA; include or exclude distribution-connected generation 

Metric: 923 data aggregated to BA should be close to 930 data aggregated to month. 
For each BA, which aggregation metric minimizes difference? 
How different are the aggregation metrics?  

In [None]:
# Read the allocated EIA-923 data for the analysis year and convert the
# report_date column to datetimes.
eia923 = pd.read_csv(f"../data/outputs/eia923_allocated_{year}.csv")
# TODO why is this not a date already?
eia923["report_date"] = pd.to_datetime(eia923["report_date"])

In [None]:
# Attach the static plant attributes to every EIA-923 row (left join on plant id).
eia923 = pd.merge(eia923, plant_attributes, how="left", on="plant_id_eia")

In [None]:
# Sum the cleaned EIA-930 data to one row per BA, fuel category and report month.
# numeric_only=True restricts the sum to numeric columns; without it, pandas >= 2.0
# raises on columns that cannot be summed (e.g. datetimes) instead of dropping them.
eia930_agg = (
    cleaned.groupby(["ba_code", "fuel_category", "report_date"])
    .sum(numeric_only=True)
    .reset_index()
)

In [None]:
# Preview the EIA-923 data after the plant attributes have been merged in.
eia923.head()

In [None]:
# Preview the EIA-930 data aggregated by BA, fuel category and report_date.
eia930_agg.head()

In [None]:
eia923_agg = eia930_agg.copy()
# Aggregate the EIA-923 data four ways (commercial vs. physical BA key, flagged
# "transmission" vs. "all") and compare each against the aggregated EIA-930 totals.
monthly_keys = ["ba_code", "fuel_category", "report_date"]
for ba_key in ["ba_code", "ba_code_physical"]:
    for transmission_only in ["transmission", "all"]:
        transmission_key = transmission_only == "transmission"
        mwh_column = f"mwh_{ba_key}_{transmission_only}"
        aggregated_gen = aggregate_for_residual(
            eia923, time_key="report_date", ba_key=ba_key, transmission=transmission_key
        ).rename(columns={"net_generation_mwh": mwh_column})
        eia923_agg = eia923_agg.merge(aggregated_gen, how="left", on=monthly_keys)
        # Take the difference only after the merge. Subtracting columns of the two
        # separate frames aligns on their unrelated row indexes, which pairs values
        # by position rather than by BA / fuel / report_date.
        eia923_agg[f"difference_{ba_key}_{transmission_only}"] = (
            eia923_agg["net_generation_mwh_930"] - eia923_agg[mwh_column]
        )

In [None]:
# Preview the EIA-930 totals alongside each EIA-923 aggregation and its difference.
eia923_agg.head()

In [None]:
# Histogram of the 930-minus-923 differences, one trace per aggregation option.
difference_columns = [
    "difference_ba_code_transmission",
    "difference_ba_code_all",
    "difference_ba_code_physical_transmission",
    "difference_ba_code_physical_all",
]
fig = px.histogram(eia923_agg, x=difference_columns)

# Overlay the histograms and reduce opacity so every trace stays visible.
fig.update_layout(barmode="overlay").update_traces(opacity=0.25)
fig.show()

# Visualize net generation data from each source in a single BA
Only visualize non-renewable data

In [None]:
ba = "MISO"

# Keep the selected BA and drop the hydro / solar / wind fuel categories.
is_selected_ba = combined_data["ba_code"] == ba
is_excluded_fuel = combined_data["fuel_category"].isin(["hydro", "solar", "wind"])
data_to_visualize = combined_data[is_selected_ba & ~is_excluded_fuel]

# One facet row per fuel category, each with an independent y-axis.
generation_columns = [
    "net_generation_mwh_930_clean",
    "net_generation_mwh_930_orig",
    "net_generation_mwh_bac_all",
    "net_generation_mwh_bap_all",
    "net_generation_mwh_bac_trans",
    "net_generation_mwh_bap_trans",
]
generation_fig = px.line(
    data_to_visualize,
    x="datetime_local",
    y=generation_columns,
    facet_col="fuel_category",
    height=1000,
    facet_col_wrap=1,
)
generation_fig.update_yaxes(matches=None)

# Calculate the residual based on a single CEMS aggregation

In [None]:
# Use a single CEMS aggregation for now when computing the residual.
cems_data_column = "net_generation_mwh_bac_all"

# Residual = cleaned EIA-930 net generation minus the chosen CEMS aggregation.
eia930_clean = combined_data["net_generation_mwh_930_clean"]
combined_data["residual"] = eia930_clean - combined_data[cems_data_column]

# Visualize residual for the BA

In [None]:
# Keep the selected BA and drop the hydro / solar / wind fuel categories.
is_selected_ba = combined_data["ba_code"] == ba
is_excluded_fuel = combined_data["fuel_category"].isin(["hydro", "solar", "wind"])
data_to_visualize = combined_data[is_selected_ba & ~is_excluded_fuel]

# Plot the 930 series, the chosen CEMS aggregation and their residual per fuel.
residual_fig = px.line(
    data_to_visualize,
    x="datetime_local",
    y=["net_generation_mwh_930_clean", cems_data_column, "residual"],
    facet_col="fuel_category",
    height=1000,
    facet_col_wrap=1,
)
residual_fig.update_yaxes(matches=None)

# Test Scaling Strategy
If the residual is ever negative, we want to scale the cems net generation data to always be less than or equal to the 930 net generation. 

To do this, we'll try scaling the data as a percentage:
1. For each hour, calculate the ratio between 930 NG and CEMS NG.
2. For each BA-fuel, find the minimum ratio. If the minimum ratio is >= 1, it means that 930 is always greater than CEMS and doesn't need to be scaled. For any BA-fuels where the ratio is < 1, we will use this as a scaling factor to scale the CEMS data such that the scaled data is always <= the 930 data
3. Multiply all hourly CEMS values by the scaling factor

In [None]:
# Only keep hours where the CEMS data is greater than zero: a zero makes the ratio
# undefined, and a negative value would yield a negative ratio that could become the
# group minimum and flip the sign of the scaled data. Filter first, then copy, so
# the column assignment below operates on an independent frame.
scaling_factors = combined_data[combined_data[cems_data_column] > 0].copy()

# calculate the ratio of 930 net generation to cems net generation
# if correct, ratio should be >=1
scaling_factors["scaling_factor"] = (
    scaling_factors["net_generation_mwh_930_clean"] / scaling_factors[cems_data_column]
)

# find the minimum ratio for each ba-fuel
scaling_factors = (
    scaling_factors.groupby(["ba_code", "fuel_category"])["scaling_factor"]
    .min()
    .reset_index()
)

# A minimum ratio >= 1 means the 930 data is never below CEMS, so that BA-fuel needs
# no scaling: cap the factor at 1 so CEMS is only ever scaled down, never up.
scaling_factors["scaling_factor"] = scaling_factors["scaling_factor"].clip(upper=1)
scaling_factors

In [None]:
# merge the scaling factor into the combined data
combined_data = combined_data.merge(
    scaling_factors, how="left", on=["ba_code", "fuel_category"]
)
# For any BA-fuels without a scaling factor, fill with 1 (scale to 100% of the
# original data). Only the scaling_factor column is filled, so missing values in any
# other column are not silently replaced with 1.
combined_data["scaling_factor"] = combined_data["scaling_factor"].fillna(1)

# calculate the scaled cems data
combined_data["cems_scaled"] = (
    combined_data[cems_data_column] * combined_data["scaling_factor"]
)

# calculate a scaled residual
combined_data["residual_scaled"] = (
    combined_data["net_generation_mwh_930_clean"] - combined_data["cems_scaled"]
)

# Plot scaled residuals

In [None]:
ba = "PJM"

# Keep the selected BA and drop the hydro / solar / wind fuel categories.
is_selected_ba = combined_data["ba_code"] == ba
is_excluded_fuel = combined_data["fuel_category"].isin(["hydro", "solar", "wind"])
data_to_visualize = combined_data[is_selected_ba & ~is_excluded_fuel]

# Compare the raw and scaled CEMS series and residuals against the 930 data.
scaled_columns = [
    "net_generation_mwh_930_clean",
    cems_data_column,
    "cems_scaled",
    "residual",
    "residual_scaled",
]
scaled_fig = px.line(
    data_to_visualize,
    x="datetime_local",
    y=scaled_columns,
    facet_col="fuel_category",
    height=1000,
    facet_col_wrap=1,
)
scaled_fig.update_yaxes(matches=None)

In [None]:
# Replace the index with a fresh default one. Because drop=True is not passed, the
# previous index is kept as a new column.
combined_data = combined_data.reset_index()

In [None]:
# broken

ba = "MISO"
fuel = "natural_gas"

# Filter first and then copy: this copies only the selected rows and gives an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
is_selected = (combined_data["ba_code"] == ba) & (
    combined_data["fuel_category"] == fuel
)
data_to_visualize = combined_data[is_selected].copy()
data_to_visualize["datetime_local"] = pd.to_datetime(
    data_to_visualize["datetime_local"]
)
# Split the local timestamp into date and hour for a day-by-hour pivot.
data_to_visualize["date"] = data_to_visualize["datetime_local"].dt.date
data_to_visualize["hour"] = data_to_visualize["datetime_local"].dt.hour

# data_to_visualize = data_to_visualize.pivot(index='hour', columns='date', values='residual_scaled')

# px.imshow(data_to_visualize, color_continuous_scale="RdBu", width=1000, height=400, color_continuous_midpoint=0,)

# Export the profile

In [None]:
# Export the identifying columns plus the scaled residual profile.
export_columns = [
    "ba_code",
    "fuel_category",
    "datetime_utc",
    "datetime_local",
    "report_date",
    "residual_scaled",
]
data_to_export = combined_data[export_columns]
# Write to "../data/outputs/", the folder the following cell reads
# residual_profiles.csv back from (the previous "../data/output/" path did not match).
data_to_export.to_csv("../data/outputs/residual_profiles.csv", index=False)

# Evaluate profile quality


In [None]:
# Read only the first line of the exported profile and print its comma-separated
# fields.
with open("../data/outputs/residual_profiles.csv") as profile_file:
    header_line = profile_file.readline()
print(header_line.split(","))

In [None]:
# Preview the CEMS data.
cems.head()

In [None]:
# Produced by data_pipeline
# NOTE(review): this reads from "../data/output/", while other pipeline files in this
# notebook are read from "../data/outputs/" — confirm the intended folder.
eia = pd.read_csv("../data/output/eia923_for_residual.csv")

In [None]:
# Produced by plant_class_differences
# TODO use output plant data to find smallest plants after cleaning -- some of these are in "no cems" data categories
# NOTE(review): this reads from "../data/output/", while other pipeline files in this
# notebook are read from "../data/outputs/" — confirm the intended folder.
validation_plants = pd.read_csv("../data/output/validation_plants.csv")

In [None]:
# Keep only the EIA rows belonging to the validation plants.
is_validation_plant = eia["plant_id_eia"].isin(validation_plants["plant_id_eia"])
to_distribute = eia[is_validation_plant]

In [None]:
# Preview the EIA rows selected for distribution.
to_distribute.head()

In [None]:
# Distribute the selected monthly EIA data to hourly values, passing the combined
# data and the "residual_scaled" column name as the profile.
# NOTE(review): exact argument semantics live in distribute_monthly_eia_data_to_hourly — confirm there.
distributed = distribute_monthly_eia_data_to_hourly(
    to_distribute, combined_data, "residual_scaled"
)

In [None]:
# Reload the hourly CEMS data and the cleaned EIA-930 data from disk, replacing the
# frames that earlier cells merged into.
cems = pd.read_csv(f"../data/outputs/cems_{year}.csv", parse_dates=["datetime_utc"])
cleaned = load_chalendar_for_pipeline(
    "../data/eia930/chalendar/EBA_adjusted_elec.csv",
    year=year,
)

In [None]:
# NOTE: both mappings rename "datetime_utc" to itself, so these lines currently change
# no column names.
cleaned = cleaned.rename(columns={"datetime_utc": "datetime_utc"})
cems = cems.rename(columns={"datetime_utc": "datetime_utc"})

In [None]:
# Reload the plant attributes table from the pipeline outputs folder.
plant_attributes = pd.read_csv("../data/outputs/plant_static_attributes.csv")

In [None]:
# Preview the reloaded cleaned EIA-930 data.
cleaned.head()

In [None]:
# Run the packaged residual calculation on the freshly loaded inputs. Pass the
# notebook-level `year` instead of a hard-coded 2020 so the call always matches the
# year the input files were loaded for.
calculate_residual(cems, cleaned, plant_attributes, year)

In [None]:
# Display the CEMS frame after the residual calculation.
cems