In [None]:
# import packages
import numpy as np
import pandas as pd
import plotly.express as px

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys

sys.path.append("../../hourly-egrid/")

# import local modules
import src.load_data as load_data
import src.data_cleaning as data_cleaning
import src.gross_to_net_generation as gross_to_net_generation
import src.impute_hourly_profiles as impute_hourly_profiles


from src.column_checks import get_dtypes, apply_dtypes

# data year for this analysis; it is interpolated into the output file paths read later in the notebook
year = 2020

In [None]:
# validate method:
# compare the monthly net generation allocated from EIA-923 against the net generation
# calculated from CEMS, then total both to the plant level and compute the percent error.

netgen_merge_keys = ["plant_id_eia", "subplant_id", "report_date"]
netgen_columns = ["net_generation_mwh_eia", "net_generation_mwh_calc"]

# merge together monthly subplant totals from EIA and calculated from CEMS.
# The value column is selected BEFORE aggregating so that only net generation is summed;
# summing every column first also tries to add up datetime/string columns, which raises
# in pandas >= 2.0.
# min_count=1 leaves all-missing groups as NaN so they can be dropped rather than
# being reported as zero generation.
eia_netgen = (
    eia923_allocated.groupby(netgen_merge_keys, dropna=False)["net_generation_mwh"]
    .sum(min_count=1)
    .reset_index()
    .dropna(subset=["net_generation_mwh"])
)
calculated_netgen = (
    cems.groupby(netgen_merge_keys, dropna=False)["net_generation_mwh"]
    .sum()
    .reset_index()
)
validated_ng = eia_netgen.merge(
    calculated_netgen,
    how="inner",
    on=netgen_merge_keys,
    suffixes=("_eia", "_calc"),
)

# total to the plant level; only the two net generation columns are summed
# (summing subplant_id or report_date is meaningless / unsupported)
validated_ng = (
    validated_ng.groupby("plant_id_eia")[netgen_columns].sum().reset_index()
)

validated_ng = validated_ng.round(3)
# drop plants where BOTH totals are zero. Testing each column separately (instead of
# testing whether their sum is zero) keeps plants whose two totals are equal and opposite,
# which is a validation failure that should stay visible.
validated_ng = validated_ng[(validated_ng[netgen_columns] != 0).any(axis=1)]

# NOTE: pct_error is +/-inf when the EIA total is exactly zero and the CEMS total is not
validated_ng["pct_error"] = (
    validated_ng["net_generation_mwh_calc"] - validated_ng["net_generation_mwh_eia"]
) / validated_ng["net_generation_mwh_eia"]
validated_ng.sort_values(by="pct_error")

In [None]:
# the 20 rows at the high end of the pct_error sort (missing values sort last)
validated_ng.sort_values(by="pct_error").iloc[-20:]

In [None]:
# which generator-months have the most negative average hourly net generation?
negative_hourly = (
    eia923_allocated[
        ["plant_id_eia", "generator_id", "report_date", "net_generation_mwh"]
    ]
    .dropna()
    .assign(
        # number of hours in the reporting month
        hours=lambda df: df["report_date"].dt.daysinmonth * 24,
        # monthly net generation spread evenly over every hour of the month
        avg_generation_mw=lambda df: df["net_generation_mwh"] / df["hours"],
    )
)
negative_hourly.sort_values(by="avg_generation_mw")

In [None]:
# plant used to step through the gross-to-net workflow in the cells below
plant_to_test = 59073

# restrict both datasets to the records for that plant
cems_subplant_test = cems.loc[cems["plant_id_eia"] == plant_to_test]
eia_test = eia923_allocated.loc[eia923_allocated["plant_id_eia"] == plant_to_test]

In [None]:
# identify data source:
# tag each EIA record of the test plant with where its hourly data would come from
all_data = eia_test.copy()

# a record counts as reported in EIA-923 only when all of these columns are non-null
columns_to_test = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
    "co2_mass_lb_adjusted",
]
# binary (1/0) flag for whether the record was reported in 923
all_data["reported_eia923"] = np.where(
    all_data[columns_to_test].notnull().all(axis=1), 1, 0
)

# one row per plant / subplant / month that appears in the CEMS data for the test plant
cems_month_keys = ["plant_id_eia", "subplant_id", "report_date"]
cems_monthly = (
    cems_subplant_test.groupby(cems_month_keys, dropna=False)["datetime_utc"]
    .count()
    .reset_index()
)
cems_monthly["hourly_data_source"] = "cems"

# bring the CEMS data source flag onto the EIA records; many EIA rows may share one
# CEMS subplant-month, which validate="m:1" enforces
all_data = all_data.merge(
    cems_monthly[cems_month_keys + ["hourly_data_source"]],
    how="left",
    on=cems_month_keys,
    validate="m:1",
)

# records that did not match any CEMS subplant-month fall back to EIA
all_data["hourly_data_source"] = all_data["hourly_data_source"].fillna("eia")

# records with CEMS data AND a non-missing EIA net generation value are marked "both"
matched_cems = all_data["hourly_data_source"] == "cems"
has_eia_netgen = all_data["net_generation_mwh"].notna()
all_data.loc[matched_cems & has_eia_netgen, "hourly_data_source"] = "both"

all_data

### Calculate GTN Conversions

Hierarchy
1. Subplant regression
2. Plant regression
3. Subplant shift
4. Plant shift
5. Subplant ratio
6. Plant ratio


When calculating the annual totals, only use data that is inner joined.
However, the resulting factors should be applied to all of the CEMS data.

Because the factors are annual, they are merged back into the CEMS data on plant and subplant.

In [None]:
# calculate the gross-to-net conversion factors for the test plant
gtn_conversions = gross_to_net_generation.calculate_gross_to_net_conversion_factors(
    cems_subplant_test,
    eia_test,
    plant_attributes,
    year,
)

# run the calculated factors through the module's filter before applying them
factors_to_use = gross_to_net_generation.filter_gtn_conversion_factors(
    gtn_conversions
)

In [None]:
# attach the annual conversion factors to every hourly CEMS record of the test plant,
# keeping hourly rows that have no matching factor (left join)
cems_subplant_test = pd.merge(
    cems_subplant_test,
    factors_to_use[
        [
            "plant_id_eia",
            "subplant_id",
            "report_date",
            "data_source",
            "annual_subplant_shift_mw",
            "annual_plant_shift_mw",
            "annual_subplant_ratio",
            "annual_plant_ratio",
            "annual_fuel_ratio",
        ]
    ],
    how="left",
    on=["plant_id_eia", "subplant_id", "report_date"],
)

In [None]:
# Candidate net generation estimates for the test plant, ordered from most to least
# preferred. Shift factors are added to gross generation; ratios are multiplied by it.
# (A plant-regression estimate, gross * plant_regression_ratio + plant_regression_shift_mw,
# was tried here previously and is not applied.)
test_gross_generation = cems_subplant_test["gross_generation_mwh"]
test_gtn_hierarchy = [
    (
        "1_annual_subplant_shift_factor",
        test_gross_generation + cems_subplant_test["annual_subplant_shift_mw"],
    ),
    (
        "2_annual_subplant_ratio",
        test_gross_generation * cems_subplant_test["annual_subplant_ratio"],
    ),
    (
        "3_annual_plant_shift_factor",
        test_gross_generation + cems_subplant_test["annual_plant_shift_mw"],
    ),
    (
        "4_annual_plant_ratio",
        test_gross_generation * cems_subplant_test["annual_plant_ratio"],
    ),
    (
        "5_annual_fuel_ratio",
        test_gross_generation * cems_subplant_test["annual_fuel_ratio"],
    ),
]

# every row starts out on the most preferred method
preferred_label, preferred_estimate = test_gtn_hierarchy[0]
cems_subplant_test["gtn_method"] = preferred_label
cems_subplant_test["net_generation_mwh"] = preferred_estimate

# rows whose net generation is still missing fall through to the next method in turn.
# Rows that are still missing after the last method keep the last method's label.
for method_label, method_estimate in test_gtn_hierarchy[1:]:
    still_missing = cems_subplant_test["net_generation_mwh"].isna()
    cems_subplant_test.loc[still_missing, "gtn_method"] = method_label
    cems_subplant_test["net_generation_mwh"] = cems_subplant_test[
        "net_generation_mwh"
    ].fillna(method_estimate)

# the factor columns are no longer needed once net generation has been calculated
# TODO: add "data_source",
cems_subplant_test = cems_subplant_test.drop(
    columns=[
        "annual_subplant_shift_mw",
        "annual_plant_shift_mw",
        "annual_subplant_ratio",
        "annual_plant_ratio",
        "annual_fuel_ratio",
    ]
)

In [None]:
# now, we want to check whether there is a complete set of subplant factors of at least one type
incomplete_subplant_factors = factors_to_use[
    [
        "plant_id_eia",
        "subplant_id",
        "data_source",
        "report_date",
        "annual_subplant_shift_mw",
        "annual_subplant_ratio",
        "net_generation_mwh",
    ]
].copy()

# a record has "any" subplant factor when the shift factor or, failing that, the ratio is
# non-missing. (Filling one column from the other replaces the original pattern of
# seeding the column with `np.NaN`, an alias that was removed in NumPy 2.0.)
incomplete_subplant_factors["any_subplant_factor"] = incomplete_subplant_factors[
    "annual_subplant_shift_mw"
].fillna(incomplete_subplant_factors["annual_subplant_ratio"])

# count the non-missing factors and the non-missing net generation values for each
# plant / data source; only the two columns being compared are counted
incomplete_subplant_factors = (
    incomplete_subplant_factors.groupby(["plant_id_eia", "data_source"], dropna=False)[
        ["any_subplant_factor", "net_generation_mwh"]
    ]
    .count()
    .reset_index()
)

# keep the groups that have fewer factors than net generation records, i.e. the groups
# where at least one record with net generation has no subplant factor
incomplete_subplant_factors = incomplete_subplant_factors[
    incomplete_subplant_factors["any_subplant_factor"]
    < incomplete_subplant_factors["net_generation_mwh"]
]

incomplete_subplant_factors

In [None]:
# replace all of the subplant factors with NA for the entire year if some are missing.
# The indicator column records whether each row's plant / data source also appears in
# incomplete_subplant_factors ("both") or only in factors_to_use ("left_only").
factors_to_use = factors_to_use.merge(
    incomplete_subplant_factors[["plant_id_eia", "data_source"]],
    how="outer",
    on=["plant_id_eia", "data_source"],
    indicator="incomplete_flag",
)
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0
factors_to_use.loc[
    factors_to_use["incomplete_flag"] == "both",
    ["annual_subplant_shift_mw", "annual_subplant_ratio"],
] = np.nan
factors_to_use = factors_to_use.drop(columns=["incomplete_flag"])


In [None]:
# inspect the conversion factors after the incomplete subplant factors were set to missing
factors_to_use

In [None]:
# total net generation by data source and subplant.
# The value column is selected before summing so that only net generation is aggregated;
# summing every column first fails on datetime/string columns in pandas >= 2.0.
factors_to_use.groupby(["data_source", "subplant_id"], dropna=False)[
    "net_generation_mwh"
].sum().reset_index()

In [None]:
# compare hourly gross and calculated net generation for subplant 1 of the test plant
px.line(
    cems_subplant_test.loc[cems_subplant_test["subplant_id"] == 1],
    x="datetime_utc",
    y=["gross_generation_mwh", "net_generation_mwh"],
)

In [None]:
# hourly gross generation and fuel consumption for unit "1" of plant 60903
px.line(
    cems.loc[(cems["plant_id_eia"] == 60903) & (cems["unitid"] == "1")],
    x="datetime_utc",
    y=["gross_generation_mwh", "fuel_consumed_mmbtu"],
)

In [None]:
# factor records that have no net generation value
factors_to_use.loc[factors_to_use["net_generation_mwh"].isna()]

In [None]:
"""
We want to filter out incomplete conversion factors so that we are not using a mix of factors for a single subplant.
If there are any monthly subplant shift factors missing, then we want to default to using the annual subplant shift factor for all months
If there are any annual subplant shift factors missing, then we 

If there are any subplants in a plant that can't use a subplant-specific factor, then all subplants should use a plant-specific factor

However we might want to filter out records where the factor is missing because the net generation is missing


If a month is missing both shift factor and scaling factor, then we want to use the annual factor 


Order of factors:
1. Monthly subplant shift factor
2. Monthly subplant scaling factor

3. Annual subplant shift factor
4. Annual subplant scaling factor

5. Monthly plant scaling factor
6. Annual plant scaling factor

7. Annual Fuel ratio
8. Gross = net
"""

# For each subplant-level factor type in turn, blank out the factor for every record of a
# subplant that does not have that factor for all of its net generation records, so that
# a single subplant never uses a mix of factor types.
for factor in ['hourly_shift_mw_monthly',"monthly_subplant_ratio","hourly_shift_mw_annual","annual_subplant_ratio"]:

    # get a list of subplants where the number of non-missing factors is less than the
    # number of non-missing net generation records (only those two columns are counted)
    incomplete_factors = (
        factors_to_use.groupby(["plant_id_eia", "subplant_id"])[[factor, "net_generation_mwh"]]
        .count()
        .reset_index()
    )
    incomplete_factors = incomplete_factors[(incomplete_factors[factor] < incomplete_factors['net_generation_mwh'])]

    # replace this factor with NA for all records of these incomplete subplants.
    # `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
    factors_to_use = factors_to_use.merge(incomplete_factors[["plant_id_eia","subplant_id"]], how="outer", on=["plant_id_eia","subplant_id"], indicator="source")
    factors_to_use.loc[factors_to_use["source"] == "both", factor] = np.nan
    factors_to_use = factors_to_use.drop(columns=["source"])

In [None]:
# remove the columns added by an earlier factor merge and net generation calculation,
# and restore the original name of the gross generation column
cems = cems.drop(
    columns=[
        "gross_generation_mwh_y",
        "minimum_gross_generation_mwh",
        "net_generation_mwh",
        "hourly_shift_mw_monthly",
        "hourly_shift_mw_annual",
        "monthly_subplant_ratio",
        "annual_subplant_ratio",
        "monthly_plant_ratio",
        "annual_plant_ratio",
        "annual_fuel_ratio",
        "units_in_subplant",
        "gtn_method",
    ]
).rename(columns={"gross_generation_mwh_x": "gross_generation_mwh"})

In [None]:
# attach the filtered conversion factors to every hourly CEMS record, keeping hourly
# records that have no matching factor (left join)
cems = pd.merge(
    cems,
    factors_to_use,
    how="left",
    on=["plant_id_eia", "subplant_id", "report_date"],
)

In [None]:
# Candidate hourly net generation estimates, ordered from most to least preferred.
# Each row uses the first estimate in this order that is not missing.
hourly_gross_generation = cems["gross_generation_mwh"]
gtn_hierarchy = [
    # monthly subplant shift factor, divided by the number of units in the subplant
    (
        "1_monthly_subplant_shift_factor",
        hourly_gross_generation
        + (cems["hourly_shift_mw_monthly"] / cems["units_in_subplant"]),
    ),
    # monthly subplant scaling factor
    (
        "2_monthly_subplant_scaling_factor",
        hourly_gross_generation * cems["monthly_subplant_ratio"],
    ),
    # annual subplant shift factor, divided by the number of units in the subplant
    (
        "3_annual_subplant_shift_factor",
        hourly_gross_generation
        + (cems["hourly_shift_mw_annual"] / cems["units_in_subplant"]),
    ),
    # annual subplant scaling factor
    (
        "4_annual_subplant_scaling_factor",
        hourly_gross_generation * cems["annual_subplant_ratio"],
    ),
    # monthly plant scaling factor
    (
        "5_monthly_plant_scaling_factor",
        hourly_gross_generation * cems["monthly_plant_ratio"],
    ),
    # annual plant scaling factor
    (
        "6_annual_plant_scaling_factor",
        hourly_gross_generation * cems["annual_plant_ratio"],
    ),
    # annual fuel ratio
    (
        "7_annual_fuel_scaling_factor",
        hourly_gross_generation * cems["annual_fuel_ratio"],
    ),
    # if nothing else is available, use the gross generation value
    ("8_gross_as_net", hourly_gross_generation),
]

# calculate net generation using the most preferred method for every row
first_method_label, first_method_estimate = gtn_hierarchy[0]
cems["gtn_method"] = first_method_label
cems["net_generation_mwh"] = first_method_estimate

# rows whose net generation is still missing fall through to the next method in turn
for method_label, method_estimate in gtn_hierarchy[1:]:
    still_missing = cems["net_generation_mwh"].isna()
    cems.loc[still_missing, "gtn_method"] = method_label
    cems["net_generation_mwh"] = cems["net_generation_mwh"].fillna(method_estimate)

In [None]:
# whenever there is not a full 12 monthly factors, we should use the annual factors for all values.
# Count the missing monthly shift factors in each group. The grouping keys must be passed
# together as ONE list: passed as separate positional arguments, the second and third
# Series are taken as other `groupby` parameters instead of as keys.
# NOTE(review): report_date is kept as a grouping key as in the original; if the goal is
# the number of missing months per subplant, it should probably be removed - confirm.
missing_monthly_shift = factors_to_use["hourly_shift_mw_monthly"].isna()
missing_monthly_shift.groupby(
    [
        factors_to_use["plant_id_eia"],
        factors_to_use["subplant_id"],
        factors_to_use["report_date"],
    ]
).sum()

In [None]:
# inspect the full table of conversion factors
factors_to_use

In [None]:
# filter out factors that appear to be incorrect

In [None]:
# conversion records whose monthly hourly shift factor is greater than 500
gtn_conversions.loc[gtn_conversions["hourly_shift_mw_monthly"] > 500]

In [None]:
# hourly gross generation and fuel consumption for subplant 2 of plant 2712
data_to_graph = cems.loc[
    (cems["plant_id_eia"] == 2712) & (cems["subplant_id"] == 2),
    ["datetime_utc", "gross_generation_mwh", "fuel_consumed_mmbtu"],
]
px.line(
    data_to_graph,
    x="datetime_utc",
    y=["gross_generation_mwh", "fuel_consumed_mmbtu"],
)

In [None]:
####################################################################################
# 6. Convert CEMS Hourly Gross Generation to Hourly Net Generation
#
# Runs the packaged conversion on the full CEMS data; `cems` is replaced by the first
# returned value and the second is stored as `gtn_conversions`.
####################################################################################
print("6. Converting CEMS gross generation to net generation")
cems, gtn_conversions = data_cleaning.convert_gross_to_net_generation(
    cems,
    eia923_allocated,
    plant_attributes,
)

# Examine hourly profiles

In [None]:
# load the hourly profiles output for the analysis year
hourly_profiles = pd.read_csv(
    f"../data/outputs/{year}/hourly_profiles_{year}.csv",
)

In [None]:
# load the CEMS output for the analysis year (this replaces any `cems` frame already in memory).
# NOTE(review): no dtypes are passed here, unlike the eia923_allocated load, while a later
# cell compares unitid to the string "1" - confirm the column is read as text.
cems = pd.read_csv(
    f"../data/outputs/{year}/cems_{year}.csv",
)

In [None]:
# load the allocated EIA-923 output for the analysis year, using the project's column dtypes
year = 2020
eia923_allocated = pd.read_csv(
    f"../data/outputs/{year}/eia923_allocated_{year}.csv",
    dtype=get_dtypes(),
)

In [None]:
# hourly profile records with a negative profile value
hourly_profiles.loc[hourly_profiles["profile"] < 0]

In [None]:
# plot every profile column for petroleum in the TVA balancing authority
data_to_graph = hourly_profiles.loc[
    (hourly_profiles["fuel_category"] == "petroleum")
    & (hourly_profiles["ba_code"] == "TVA")
]

px.line(
    data_to_graph,
    x="datetime_utc",
    y=[
        "eia930_profile",
        "cems_profile",
        "residual_profile",
        "scaled_residual_profile",
        "shifted_residual_profile",
        "imputed_profile",
        "profile",
    ],
)

In [None]:
# hourly CEMS records with net generation below -200
cems.loc[cems["net_generation_mwh"] < -200]

In [None]:
# hourly gross versus net generation for unit "1" of plant 3399
px.line(
    cems.loc[(cems["plant_id_eia"] == 3399) & (cems["unitid"] == "1")],
    x="datetime_utc",
    y=["gross_generation_mwh", "net_generation_mwh"],
)

In [None]:
# the record(s) holding the minimum net generation value in the allocated EIA-923 data
eia923_allocated.loc[
    eia923_allocated["net_generation_mwh"]
    == eia923_allocated["net_generation_mwh"].min()
]

In [None]:
# all allocated EIA-923 records for plant 3
eia923_allocated.loc[eia923_allocated["plant_id_eia"] == 3]