In [None]:
import pandas as pd
import sys
import plotly.express as px

sys.path.append('../../../hourly-egrid/')
%reload_ext autoreload
%autoreload 2
from src.column_checks import get_dtypes, apply_dtypes
import src.impute_hourly_profiles as impute_hourly_profiles
import src.data_cleaning as data_cleaning
import src.load_data as load_data
import src.validation as validation
import src.eia930 as eia930
import numpy as np

# Year of pipeline outputs that this notebook validates.
year = 2020

# Name of the folder under ../data/outputs/ that the cells below read from:
# an (empty by default) prefix immediately followed by the year.
path_prefix = ''
path_prefix = f"{path_prefix}{year}"

# About 
The purpose of this notebook is to validate the methods that we use for imputing missing wind and solar generation profiles when data is not available from EIA-930. 

Our current method uses two approaches:
1. Take the average profile for that resource across all of the directly interconnected balancing authorities (DIBAs) located in the same time zone.
2. Take the average of all profiles nationally, in local time.

# Which version of the EIA-930 data should we use?

In [None]:
# Load three versions of the EIA-930 data for the analysis year (the "raw",
# "rolling" and "elec" files). Each one is passed through the same two filters,
# in the same order: remove_imputed_ones, then remove_months_with_zero_data.
eia930_data_raw = (
    eia930.load_chalendar_for_pipeline(
        f"../data/outputs/{path_prefix}/eia930/eia930_raw.csv", year=year
    )
    .pipe(eia930.remove_imputed_ones)
    .pipe(eia930.remove_months_with_zero_data)
)
eia930_data_roll = (
    eia930.load_chalendar_for_pipeline(
        f"../data/outputs/{path_prefix}/eia930/eia930_rolling.csv", year=year
    )
    .pipe(eia930.remove_imputed_ones)
    .pipe(eia930.remove_months_with_zero_data)
)
eia930_data_cleaned = (
    eia930.load_chalendar_for_pipeline(
        f"../data/outputs/{path_prefix}/eia930/eia930_elec.csv", year=year
    )
    .pipe(eia930.remove_imputed_ones)
    .pipe(eia930.remove_months_with_zero_data)
)

In [None]:
# Read the pipeline outputs that calculate_hourly_profiles takes as inputs.
cems = pd.read_csv(
    f'../data/outputs/{path_prefix}/cems_{year}.csv',
    dtype=get_dtypes(),
    parse_dates=['datetime_utc', 'report_date'],
)
partial_cems_scaled = pd.read_csv(
    f'../data/outputs/{path_prefix}/partial_cems_scaled_{year}.csv',
    dtype=get_dtypes(),
    parse_dates=['datetime_utc', 'report_date'],
)
eia923_allocated = pd.read_csv(
    f'../data/outputs/{path_prefix}/eia923_allocated_{year}.csv',
    dtype=get_dtypes(),
    parse_dates=['report_date'],
)
plant_attributes = pd.read_csv(
    f"../data/outputs/{path_prefix}/plant_static_attributes_{year}.csv"
)

# Primary fuel lookup: the first row found for each plant_id_eia.
primary_fuel_table = plant_attributes[
    ["plant_id_eia", "plant_primary_fuel"]
].drop_duplicates(subset="plant_id_eia")

# Aggregate the CEMS data to the subplant level, then drop the data that is
# now in partial_cems_scaled.
cems = data_cleaning.filter_unique_cems_data(
    data_cleaning.aggregate_cems_to_subplant(cems), partial_cems_scaled
)

# Separate dataframe holding only the EIA data that is missing from CEMS:
# rows whose hourly data source is "eia" and whose fuel consumption is not null.
monthly_eia_data_to_shape = eia923_allocated[
    (eia923_allocated["hourly_data_source"] == "eia")
    & eia923_allocated["fuel_consumed_mmbtu"].notna()
]


In [None]:
# Plot solar generation from the eia930_rolling.csv version of the data for the
# July report_date of the analysis year, one line per ba_code.
# The date is built from `year` instead of being hard-coded to 2020, so the
# filter keeps matching rows when the configured year is changed.
data_to_graph = eia930_data_roll[
    (eia930_data_roll["fuel_category_eia930"] == "solar")
    & (eia930_data_roll["report_date"] == f"{year}-07-01")
]

px.line(data_to_graph, x="datetime_local", y="net_generation_mwh_930", color="ba_code")

In [None]:
# Build hourly profiles once per EIA-930 version (raw, rolling, cleaned).
# Every other argument is identical across the three runs, so the call is
# written once and evaluated for each version in that order.
hourly_profiles_raw, hourly_profiles_roll, hourly_profiles_cleaned = (
    impute_hourly_profiles.calculate_hourly_profiles(
        cems,
        eia930_version,
        plant_attributes,
        monthly_eia_data_to_shape,
        year,
        transmission_only=False,
        ba_column_name="ba_code",
    )
    for eia930_version in (eia930_data_raw, eia930_data_roll, eia930_data_cleaned)
)

### Examine Raw Data

In [None]:
# For the raw-data profiles: count the unique (ba_code, report_date)
# combinations that use each profile method, broken out by fuel category.
(
    hourly_profiles_raw[["ba_code", "fuel_category", "report_date", "profile_method"]]
    .drop_duplicates()
    .drop(columns=["ba_code"])
    .pivot_table(index="fuel_category", columns="profile_method", aggfunc="count")
    .fillna(0)
    .astype(int)
)

In [None]:
# Run validate_diba_imputation_method on the raw-data profiles and show the
# mean of `imputed_profile` for each fuel category.
# The column is selected before aggregating so that only `imputed_profile` is
# averaged: calling .mean() on the whole grouped frame raises a TypeError on
# pandas >= 2.0 whenever another column is non-numeric.
diba_validation = validation.validate_diba_imputation_method(hourly_profiles_raw, year)
diba_validation.groupby(["fuel_category"])["imputed_profile"].mean()

In [None]:
# Run validate_national_imputation_method on the raw-data profiles and show the
# mean of `imputed_profile` for each fuel category.
# The column is selected before aggregating so that only `imputed_profile` is
# averaged: calling .mean() on the whole grouped frame raises a TypeError on
# pandas >= 2.0 whenever another column is non-numeric.
national_validation = validation.validate_national_imputation_method(hourly_profiles_raw, year)
national_validation.groupby(["fuel_category"])["imputed_profile"].mean()

### Examine Rolling Data

In [None]:
# For the rolling-data profiles: count the unique (ba_code, report_date)
# combinations that use each profile method, broken out by fuel category.
(
    hourly_profiles_roll[["ba_code", "fuel_category", "report_date", "profile_method"]]
    .drop_duplicates()
    .drop(columns=["ba_code"])
    .pivot_table(index="fuel_category", columns="profile_method", aggfunc="count")
    .fillna(0)
    .astype(int)
)

In [None]:
# Run validate_diba_imputation_method on the rolling-data profiles and show the
# mean of `imputed_profile` for each fuel category.
# The column is selected before aggregating so that only `imputed_profile` is
# averaged: calling .mean() on the whole grouped frame raises a TypeError on
# pandas >= 2.0 whenever another column is non-numeric.
diba_validation = validation.validate_diba_imputation_method(hourly_profiles_roll, year)
diba_validation.groupby(["fuel_category"])["imputed_profile"].mean()

In [None]:
# Run validate_national_imputation_method on the rolling-data profiles and show
# the mean of `imputed_profile` for each fuel category.
# The column is selected before aggregating so that only `imputed_profile` is
# averaged: calling .mean() on the whole grouped frame raises a TypeError on
# pandas >= 2.0 whenever another column is non-numeric.
national_validation = validation.validate_national_imputation_method(hourly_profiles_roll, year)
national_validation.groupby(["fuel_category"])["imputed_profile"].mean()

### Examine Cleaned Data

In [None]:
# For the cleaned-data profiles: count the unique (ba_code, report_date)
# combinations that use each profile method, broken out by fuel category.
(
    hourly_profiles_cleaned[["ba_code", "fuel_category", "report_date", "profile_method"]]
    .drop_duplicates()
    .drop(columns=["ba_code"])
    .pivot_table(index="fuel_category", columns="profile_method", aggfunc="count")
    .fillna(0)
    .astype(int)
)

In [None]:
# Run validate_diba_imputation_method on the cleaned-data profiles and show the
# mean of `imputed_profile` for each fuel category.
# The column is selected before aggregating so that only `imputed_profile` is
# averaged: calling .mean() on the whole grouped frame raises a TypeError on
# pandas >= 2.0 whenever another column is non-numeric.
diba_validation = validation.validate_diba_imputation_method(hourly_profiles_cleaned, year)
diba_validation.groupby(["fuel_category"])["imputed_profile"].mean()

In [None]:
# Run validate_national_imputation_method on the cleaned-data profiles and show
# the mean of `imputed_profile` for each fuel category.
# The column is selected before aggregating so that only `imputed_profile` is
# averaged: calling .mean() on the whole grouped frame raises a TypeError on
# pandas >= 2.0 whenever another column is non-numeric.
national_validation = validation.validate_national_imputation_method(hourly_profiles_cleaned, year)
national_validation.groupby(["fuel_category"])["imputed_profile"].mean()

# Validate the data from the pipeline

In [None]:
# Load the hourly profiles and the shaped EIA-923 data written by the pipeline.
# NOTE(review): neither read passes parse_dates, and hourly_profiles is read
# without dtype=get_dtypes() — confirm the validation functions and plots below
# do not need datetime-typed columns.
hourly_profiles = pd.read_csv(
    f"../data/outputs/{path_prefix}/hourly_profiles_{year}.csv"
)
shaped_eia923_data = pd.read_csv(
    f'../data/outputs/{path_prefix}/shaped_eia923_data_{year}.csv',
    dtype=get_dtypes(),
)

In [None]:
# Show the rows of `data_to_graph` for the April report_date of the analysis
# year (the date is built from `year` rather than hard-coded to 2020).
# NOTE(review): on a top-to-bottom run, `data_to_graph` is still the frame from
# the earlier solar plot cell, which was already filtered to a single July
# report_date, so this filter returns no rows. Presumably this cell was meant to
# run after a later cell reassigns `data_to_graph` — TODO confirm, move, or remove.
data_to_graph[data_to_graph["report_date"] == f"{year}-04-01"]

In [None]:
# Plot the four profile columns from the pipeline's hourly profiles for
# natural gas in the ERCO balancing authority.
data_to_graph = hourly_profiles[
    hourly_profiles["fuel_category"].eq("natural_gas")
    & hourly_profiles["ba_code"].eq("ERCO")
]

px.line(
    data_to_graph,
    x="datetime_local",
    y=["eia930_profile", "cems_profile", "residual_profile", "scaled_residual_profile"],
)

In [None]:
# Plot net generation from the shaped EIA-923 data for natural gas in the
# ERCO balancing authority.
data_to_graph = shaped_eia923_data[
    shaped_eia923_data["fuel_category"].eq("natural_gas")
    & shaped_eia923_data["ba_code"].eq("ERCO")
]

px.line(
    data_to_graph,
    x="datetime_utc",
    y=["net_generation_mwh"],
)

In [None]:
# Run validate_diba_imputation_method on the hourly profiles read from the
# pipeline output and show the mean of `imputed_profile` for each fuel category.
# The column is selected before aggregating so that only `imputed_profile` is
# averaged: calling .mean() on the whole grouped frame raises a TypeError on
# pandas >= 2.0 whenever another column is non-numeric.
diba_validation = validation.validate_diba_imputation_method(hourly_profiles, year)
diba_validation.groupby(["fuel_category"])["imputed_profile"].mean()

In [None]:
# Run validate_national_imputation_method on the hourly profiles read from the
# pipeline output and show the mean of `imputed_profile` for each fuel category.
# The column is selected before aggregating so that only `imputed_profile` is
# averaged: calling .mean() on the whole grouped frame raises a TypeError on
# pandas >= 2.0 whenever another column is non-numeric.
national_validation = validation.validate_national_imputation_method(hourly_profiles, year)
national_validation.groupby(["fuel_category"])["imputed_profile"].mean()

In [None]:
# How often is each profile method used in the pipeline output?
# Counts the unique (ba_code, report_date) combinations per fuel category
# and profile method.
(
    hourly_profiles[["ba_code", "fuel_category", "report_date", "profile_method"]]
    .drop_duplicates()
    .drop(columns=["ba_code"])
    .pivot_table(index="fuel_category", columns="profile_method", aggfunc="count")
    .fillna(0)
    .astype(int)
)