# Uncertainty analysis 

* Compare results to results using a flat profile 
* Calculate maximum uncertainty bands: between CEMS hourly data and CEMS + total non-CEMS capacity 

gailin - 7/21/2022

In [None]:
# Calculate min and max possible at each hour
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os

In [None]:
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules.
# Depending on how your jupyter handles working directories, this may not be needed.
import sys

sys.path.append("../../hourly-egrid/")

import src.load_data as load_data
import src.column_checks as column_checks

In [None]:
# Data year for the analysis; interpolated into result file paths and passed to
# load_data.initialize_pudl_out later in the notebook.
year = 2020

In [None]:
# BA code to visualize; selects which per-BA hourly result CSVs are read below.
ba = "MISO"

# Visualize flat vs. base result

In [None]:
# Read this BA's hourly power-sector results from the "flat" results tree and
# from the default ("base") results tree, parsing the UTC timestamp column.
flat, base = (
    pd.read_csv(csv_path, parse_dates=["datetime_utc"])
    for csv_path in (
        f"../data/results/flat/{year}/power_sector_data/hourly/us_units/{ba}.csv",
        f"../data/results/{year}/power_sector_data/hourly/us_units/{ba}.csv",
    )
)

In [None]:
# Keep only the rows whose fuel_category is "total".
flat = flat.loc[flat["fuel_category"] == "total"]
base = base.loc[base["fuel_category"] == "total"]

In [None]:
# Line the two runs up hour by hour, keeping every flat row; columns present in
# both frames are disambiguated with "_flat" / "_base" suffixes.
both = pd.merge(
    flat,
    base,
    how="left",
    on="datetime_utc",
    suffixes=("_flat", "_base"),
)

In [None]:
# Display the merged column names, including the "_flat" / "_base" suffixed ones.
both.columns

In [None]:
# Overlay the adjusted CO2e rate columns (lb/MWh) from the flat and base runs.
fig = px.line(
    data_frame=both,
    x="datetime_utc",
    y=[
        f"generated_co2e_rate_lb_per_mwh_for_electricity_adjusted_{run}"
        for run in ("flat", "base")
    ],
    title=f"{ba}, carbon intensity using flat vs. base generation",
)
fig.show()
# Optional legend renaming, left disabled.
# NOTE(review): the keys below do not match the trace names plotted above — update before enabling.
# newnames = {'generated_co2_rate_lb_per_mwh_adjusted': 'Our data', 'generated_co2_rate_lb_per_mwh_for_electricity_adjusted': 'Real-time data'}
# fig.for_each_trace(lambda t: t.update(name = newnames[t.name]))

# Draw min (CEMS-only) and max (CEMS + total non-CEMS capacity)

In [None]:
# Hourly plant-level results (used as the CEMS data in this notebook) and the
# static plant attributes table. Paths are built from `year`, matching the
# flat/base loads above, instead of a hard-coded 2020.
cems = pd.read_csv(
    f"../data/results/{year}/plant_data/hourly/us_units/individual_plant_data.csv",
    parse_dates=["datetime_utc", "report_date"],
)
plant_meta = pd.read_csv(
    f"../data/results/{year}/plant_data/plant_static_attributes.csv"
)

In [None]:
# Attach the static plant attributes to every hourly row (all hourly rows kept).
cems = pd.merge(cems, plant_meta, how="left", on="plant_id_eia")

In [None]:
# Build the PUDL output object for the analysis year via the project helper in
# src.load_data; its gens_eia860() table supplies generator capacities below.
pudl_reader = load_data.initialize_pudl_out(year)

## Calculate maximum and minimum hourly emissions and generation 

### Generation 

* Min generation is CEMS generation 
* Max generation is CEMS generation + (plant capacity for EIA-only plants with positive 923 generation in this month)

### Emissions 

* Min emissions is CEMS emissions alone 
* Max emissions is CEMS emissions + sum[(plant capacity for each EIA-only plant with positive 923 generation in this month) × (monthly emission rate for that plant)] 

Note that this assumes (and asserts) that there are no EIA-only plants with negative 923 generation. 

In [None]:
# For each generator take the largest of its nameplate / summer / winter
# capacity, then total those per plant. Result: a Series named "max_capacity"
# indexed by plant_id_eia.
caps = (
    pudl_reader.gens_eia860()
    .loc[
        :,
        ["plant_id_eia", "capacity_mw", "summer_capacity_mw", "winter_capacity_mw"],
    ]
    .assign(
        max_capacity=lambda gens: gens[
            ["capacity_mw", "summer_capacity_mw", "winter_capacity_mw"]
        ].max(axis=1)
    )
    .groupby("plant_id_eia")["max_capacity"]
    .sum()
)

In [None]:
# Monthly plant-level results for the analysis year. The path is built from
# `year` (set near the top of the notebook) rather than a hard-coded 2020.
monthly_rates = pd.read_csv(
    f"../data/results/{year}/plant_data/monthly/us_units/plant_data.csv",
    parse_dates=["report_date"],
)

In [None]:
# Attach each plant's max capacity (plant_id_eia matched against the index of
# `caps`), then its static attributes. Both are left joins, so every monthly
# row is kept.
monthly_rates = monthly_rates.merge(
    caps, how="left", left_on="plant_id_eia", right_index=True
).merge(plant_meta, how="left", on="plant_id_eia")

In [None]:
# Per plant and month, the most a plant could emit in one hour:
#   max hourly net generation = plant capacity
#   max hourly emissions      = plant capacity * (monthly mass / monthly net generation)
pols = ["co2", "ch4", "n2o", "co2e", "nox", "so2"]
for pol in pols:
    monthly_rates[f"max_hourly_{pol}"] = monthly_rates["max_capacity"].mul(
        monthly_rates[f"{pol}_mass_lb_for_electricity_adjusted"].div(
            monthly_rates["net_generation_mwh"]
        )
    )

In [None]:
# Keep only EIA-only (non-CEMS) plant-months with positive net generation: a
# plant that generated nothing in a month should not add its capacity to that
# month's hourly maximum.
# Then total the per-plant maxima by month and BA.
pol_cols = [*(f"max_hourly_{p}" for p in pols), "max_capacity"]
filtered_monthly_rates = monthly_rates.loc[
    monthly_rates["data_availability"].eq("eia_only")
    & monthly_rates["net_generation_mwh"].gt(0)
]
non_cems_maxes = (
    filtered_monthly_rates.groupby(["report_date", "ba_code"])[pol_cols]
    .sum()
    .reset_index()
)

In [None]:
# Identify EIA-only plant-months with negative net generation.
# The bounds computed in this notebook do not handle such plants (they would
# have to lower the net generation minimum and raise the emissions maximum),
# so verify that none exist and fail with a useful message otherwise.
negative_monthly = monthly_rates[
    (monthly_rates.data_availability == "eia_only")
    & (monthly_rates.net_generation_mwh < 0)
]
assert len(negative_monthly) == 0, (
    f"{len(negative_monthly)} EIA-only plant-month rows have negative net "
    "generation; the min/max bounds below do not account for them"
)


# ########### FAKE NEGATIVE PLANT FOR TESTING
# negative_monthly = monthly_rates[(monthly_rates.plant_id_eia == 3)].copy()
# negative_monthly.loc[:,"net_generation_mwh"] = -300

In [None]:
# Lower bound: per hour and BA, the summed emissions and net generation found
# in the hourly plant-level data.
cols = [*(f"{p}_mass_lb_for_electricity_adjusted" for p in pols), "net_generation_mwh"]
minimum_bound = (
    cems.groupby(["datetime_utc", "ba_code"])[cols]
    .sum()
    .reset_index()
)

In [None]:
# The groupby above dropped report_date; recover it by taking the first
# report_date seen for each (hour, BA) and joining it back on.
report_dates = (
    cems.groupby(["datetime_utc", "ba_code"])["report_date"]
    .first()
    .reset_index()
)
minimum_bound = pd.merge(
    minimum_bound,
    report_dates,
    how="left",
    on=["datetime_utc", "ba_code"],
)

In [None]:
# Start the upper bound from the lower bound and bring in, for each hour's
# (month, BA), the monthly non-CEMS maxima; the addition happens in the next cell.
maximum_bound = pd.merge(
    minimum_bound,
    non_cems_maxes,
    how="left",
    on=["report_date", "ba_code"],
)

In [None]:
# Upper bound = lower bound + maximum possible non-CEMS contribution.
# Hours whose (report_date, ba_code) found no match in the left merge carry NaN
# in the max_hourly_* / max_capacity columns. Treat that missing contribution
# as zero so the upper bound falls back to the lower bound instead of turning
# into NaN.
for pol in pols:
    maximum_bound[f"{pol}_mass_lb_for_electricity_adjusted"] = maximum_bound[
        f"{pol}_mass_lb_for_electricity_adjusted"
    ] + maximum_bound[f"max_hourly_{pol}"].fillna(0)
maximum_bound["net_generation_mwh"] = maximum_bound[
    "net_generation_mwh"
] + maximum_bound["max_capacity"].fillna(0)
# Keep the same value columns as minimum_bound, plus the hour and BA keys.
maximum_bound = maximum_bound[cols + ["datetime_utc", "ba_code"]]

# Plot max and min bounds

In [None]:
# Switch to a different BA for the bounds plots below (overrides the earlier value).
ba = "DEAA"

In [None]:
# Reload the flat and base hourly results for the newly selected BA. Paths use
# `year`, consistent with the first flat/base load, instead of a hard-coded 2020.
flat = pd.read_csv(
    f"../data/results/flat/{year}/power_sector_data/hourly/us_units/{ba}.csv",
    parse_dates=["datetime_utc"],
)
base = pd.read_csv(
    f"../data/results/{year}/power_sector_data/hourly/us_units/{ba}.csv",
    parse_dates=["datetime_utc"],
)

In [None]:
# Restrict both runs to the rows whose fuel_category is "total".
flat = flat.loc[flat["fuel_category"] == "total"]
base = base.loc[base["fuel_category"] == "total"]

In [None]:
# Net generation for the selected BA: a shaded band between the lower and upper
# bounds, with the flat and base ("best guess") estimates drawn on top.
this_max = maximum_bound.loc[maximum_bound["ba_code"] == ba]
this_min = minimum_bound.loc[minimum_bound["ba_code"] == ba]

fig = go.Figure(
    data=[
        # Lower edge of the band; kept out of the legend.
        go.Scatter(
            x=this_min["datetime_utc"],
            y=this_min["net_generation_mwh"],
            fill=None,
            mode="lines",
            line={"color": "indigo"},
            showlegend=False,
        ),
        # Upper edge; "tonexty" shades the area down to the previous trace.
        go.Scatter(
            x=this_max["datetime_utc"],
            y=this_max["net_generation_mwh"],
            fill="tonexty",
            mode="lines",
            line={"color": "indigo"},
            name="Min/max possible",
        ),
        go.Scatter(
            x=flat["datetime_utc"],
            y=flat["net_generation_mwh"],
            mode="lines",
            line={"color": "brown"},
            name="Flat",
        ),
        go.Scatter(
            x=base["datetime_utc"],
            y=base["net_generation_mwh"],
            mode="lines",
            line={"color": "blue"},
            name="Best guess",
        ),
    ]
)

fig.update_layout(
    title=f"{ba} total net generation estimates",
    xaxis_title="Hour",
    yaxis_title="Net generation (MWh)",
    legend_title="Estimate type",
)

fig.show()

In [None]:
# CO2 mass for the selected BA: a shaded band between the lower and upper
# bounds, with the flat and base ("best guess") estimates drawn on top.
this_max = maximum_bound.loc[maximum_bound["ba_code"] == ba]
this_min = minimum_bound.loc[minimum_bound["ba_code"] == ba]

fig = go.Figure(
    data=[
        # Lower edge of the band; kept out of the legend.
        go.Scatter(
            x=this_min["datetime_utc"],
            y=this_min["co2_mass_lb_for_electricity_adjusted"],
            fill=None,
            mode="lines",
            line={"color": "indigo"},
            showlegend=False,
        ),
        # Upper edge; "tonexty" shades the area down to the previous trace.
        go.Scatter(
            x=this_max["datetime_utc"],
            y=this_max["co2_mass_lb_for_electricity_adjusted"],
            fill="tonexty",
            mode="lines",
            line={"color": "indigo"},
            name="Min/max possible",
        ),
        go.Scatter(
            x=flat["datetime_utc"],
            y=flat["co2_mass_lb_for_electricity_adjusted"],
            mode="lines",
            line={"color": "brown"},
            name="Flat",
        ),
        go.Scatter(
            x=base["datetime_utc"],
            y=base["co2_mass_lb_for_electricity_adjusted"],
            mode="lines",
            line={"color": "blue"},
            name="Best guess",
        ),
    ]
)

fig.update_layout(
    title=f"{ba} total carbon emissions",
    xaxis_title="Hour",
    yaxis_title="CO2 (lb)",
    legend_title="Estimate type",
)

fig.show()

# Where do our hourly generation / emissions estimates fall outside the min/max bounds? 

In [None]:
# "net_generation_mwh" or "co2_mass_lb_for_electricity_adjusted"
col_to_check = "net_generation_mwh"

# Directory holding one hourly result CSV per BA for the analysis year.
results_dir = f"../data/results/{year}/power_sector_data/hourly/us_units/"
compare_cols = [
    "datetime_utc",
    "co2_mass_lb_for_electricity_adjusted",
    "net_generation_mwh",
]

# For every BA, flag the hours where our estimate of `col_to_check` lies below
# the minimum bound or above the maximum bound. `issues` collects one boolean
# Series per BA (True = out of bounds), indexed by datetime_utc and named after
# the BA.
issues = []
# Sorted so the result does not depend on directory listing order.
for file_name in sorted(os.listdir(results_dir)):
    # Only per-BA CSVs are results; skips .DS_Store and any other stray entries.
    if not file_name.endswith(".csv"):
        continue
    # Separate name so the notebook-level `ba` used by the plots is not clobbered.
    ba_code = os.path.splitext(file_name)[0]
    our_guess = pd.read_csv(
        os.path.join(results_dir, file_name),
        parse_dates=["datetime_utc"],
    )
    our_guess = our_guess[our_guess.fuel_category == "total"]
    # After both merges each compared column exists as <col>_best (our
    # estimate), <col>_min and <col>_max.
    to_compare = our_guess[compare_cols].merge(
        minimum_bound.loc[minimum_bound.ba_code == ba_code, compare_cols],
        how="left",
        on="datetime_utc",
        suffixes=("", "_min"),
    )
    to_compare = to_compare.merge(
        maximum_bound.loc[maximum_bound.ba_code == ba_code, compare_cols],
        how="left",
        on="datetime_utc",
        suffixes=("_best", "_max"),
    )
    # Comparisons against NaN bounds evaluate to False, so hours without a
    # bound are never flagged.
    to_compare["out_of_bounds"] = (
        to_compare[f"{col_to_check}_min"] > to_compare[f"{col_to_check}_best"]
    ) | (to_compare[f"{col_to_check}_best"] > to_compare[f"{col_to_check}_max"])
    # Select the column directly (not .squeeze()) so a one-row frame still
    # yields a Series.
    issues.append(
        to_compare.set_index("datetime_utc")["out_of_bounds"].rename(ba_code)
    )

In [None]:
# Put the per-BA flag Series side by side: one column per BA, rows aligned on
# the datetime index.
all_issues = pd.concat(issues, axis="columns")

In [None]:
# Sum the True flags in each BA column and show the 20 largest totals.
all_issues.sum(axis="index").sort_values(ascending=False).iloc[:20]

In [None]:
# Inspect the monthly plant rows whose ba_code is DEAA.
monthly_rates.loc[monthly_rates["ba_code"] == "DEAA"]