In [None]:
# import packages
import numpy as np
import pandas as pd
import plotly.express as px

%reload_ext autoreload
%autoreload 2

# Tell Python where to look for the local project modules.
import sys

sys.path.append("../../hourly-egrid/")

# import local modules
import src.data_cleaning as data_cleaning
import src.load_data as load_data
import src.impute_hourly_profiles as impute_hourly_profiles
import src.eia930 as eia930
import src.output_data as output_data

from src.column_checks import get_dtypes, apply_dtypes


In [None]:
# Load the pipeline outputs for the selected year from csv
year = 2020
path_prefix = ""

eia923_allocated = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/eia923_allocated_{year}.csv",
    dtype=get_dtypes(),
    parse_dates=["report_date"],
)
cems = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/cems_{year}.csv",
    dtype=get_dtypes(),
    parse_dates=["report_date", "datetime_utc"],
)
partial_cems_scaled = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/partial_cems_scaled_{year}.csv",
    dtype=get_dtypes(),
    parse_dates=["report_date"],
)
plant_attributes = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/plant_static_attributes_{year}.csv"
)

# Load the cleaned EIA-930 file for the same year
clean_930_file = f"../data/outputs/{path_prefix}{year}/eia930/eia930_elec.csv"
eia930_data = eia930.load_chalendar_for_pipeline(clean_930_file, year=year)

# Post-processing steps that are needed until the physics reconciliation is fixed
eia930_data = eia930.remove_imputed_ones(eia930_data)
eia930_data = eia930.remove_months_with_zero_data(eia930_data)

# Aggregate the cems data to the subplant level, then drop the cems data
# that is now in partial_cems
cems = data_cleaning.aggregate_cems_to_subplant(cems)
cems = data_cleaning.filter_unique_cems_data(cems, partial_cems_scaled)

# Separate dataframe containing only the EIA data that is missing from cems:
# rows whose hourly data source is "eia" and whose fuel consumption is not null
monthly_eia_data_to_shape = eia923_allocated[
    (eia923_allocated["hourly_data_source"] == "eia")
    & eia923_allocated["fuel_consumed_mmbtu"].notna()
]


In [None]:
# Compute the residual profiles from the cems and EIA-930 data, keyed on the
# "ba_code" column and with transmission_only disabled.
residual_profiles = impute_hourly_profiles.calculate_residual(
    cems, eia930_data, plant_attributes, year,
    transmission_only=False, ba_column_name="ba_code",
)

In [None]:
# Build the hourly profiles used to shape the monthly EIA data, using the same
# options as the residual calculation ("ba_code" key, transmission_only off).
hourly_profiles = impute_hourly_profiles.calculate_hourly_profiles(
    cems, eia930_data, plant_attributes, monthly_eia_data_to_shape, year,
    transmission_only=False, ba_column_name="ba_code",
)

# Re-express the profiles via convert_profile_to_percent, replacing the raw result
hourly_profiles = impute_hourly_profiles.convert_profile_to_percent(hourly_profiles)

In [None]:
# Fuel category and balancing authority to inspect
fuel = "nuclear"
ba = "CISO"

# Keep only the hourly profile rows for the selected BA and fuel
data_to_graph = hourly_profiles.loc[
    (hourly_profiles["ba_code"] == ba) & (hourly_profiles["fuel_category"] == fuel)
]

# Overlay every profile column on one time axis; as the last expression of the
# cell the figure is rendered as the cell output
px.line(
    data_to_graph,
    x="datetime_local",
    y=[
        "eia930_profile",
        "cems_profile",
        "residual_profile",
        "scaled_residual_profile",
        "shifted_residual_profile",
        "profile",
    ],
    title=f"{ba}, {fuel}",
)

In [None]:
# Run aggregate_eia_data_to_ba_fuel and rebind both inputs to its two results.
# NOTE: because the inputs are overwritten, re-running this cell passes the
# already-processed frames back in; re-run the load cell first to start clean.
monthly_eia_data_to_shape, plant_attributes = (
    impute_hourly_profiles.aggregate_eia_data_to_ba_fuel(
        monthly_eia_data_to_shape, plant_attributes
    )
)

# Shape the monthly EIA data as hourly data using the hourly profiles
shaped_eia_data = impute_hourly_profiles.shape_monthly_eia_data_as_hourly(
    monthly_eia_data_to_shape, hourly_profiles
)

In [None]:
# Keep only the shaped rows for the BA and fuel selected earlier in the notebook
data_to_graph = shaped_eia_data.loc[
    (shaped_eia_data["ba_code"] == ba) & (shaped_eia_data["fuel_category"] == fuel)
]

# Plot the shaped hourly net generation; as the last expression of the cell the
# figure is rendered as the cell output
px.line(
    data_to_graph,
    x="datetime_utc",
    y="net_generation_mwh",
    title=f"{ba}, {fuel}",
)

# Validate that the shaping isn't changing the monthly data

In [None]:
# Reload the inputs from disk if necessary (uses `year` and `path_prefix`
# set in the load cell near the top of the notebook)
shaped_eia_data = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/shaped_eia923_data_{year}.csv",
    dtype=get_dtypes(),
    parse_dates=["report_date"],
)
eia923_allocated = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/eia923_allocated_{year}.csv",
    dtype=get_dtypes(),
    parse_dates=["report_date"],
)

# Separate dataframe containing only the EIA data that is missing from cems:
# rows whose hourly data source is "eia" and whose fuel consumption is not null
monthly_eia_data_to_shape = eia923_allocated[
    (eia923_allocated["hourly_data_source"] == "eia")
    & eia923_allocated["fuel_consumed_mmbtu"].notna()
]

# Attach the plant attributes to every monthly record.
# NOTE(review): a left merge repeats a monthly row for each matching attribute
# row, so this assumes plant_id_eia is unique in the attributes file - confirm.
plant_attributes = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/plant_attributes_with_synthetic_{year}.csv"
)
monthly_eia_data_to_shape = monthly_eia_data_to_shape.merge(
    plant_attributes, how="left", on="plant_id_eia"
)


In [None]:
# aggregate data to ba fuel month
# The two value columns are selected BEFORE calling .sum(). Summing every
# column and subsetting afterwards makes pandas aggregate all the other columns
# too; since pandas 2.0 (numeric_only defaults to False) that includes the
# non-numeric ones, which is wasted work and can raise a TypeError.
group_keys = ["ba_code", "fuel_category", "report_date"]
value_columns = ["net_generation_mwh", "fuel_consumed_mmbtu"]

shaped_data_agg = shaped_eia_data.groupby(group_keys, dropna=False)[value_columns].sum()
eia_data_agg = monthly_eia_data_to_shape.groupby(group_keys, dropna=False)[value_columns].sum()

# calculate the difference between the two datasets
# The subtraction aligns on the (ba_code, fuel_category, report_date) index, so
# a group that exists in only one of the two frames comes out as NaN.
compare = (shaped_data_agg - eia_data_agg).round(0)

# Total rounded discrepancy across both columns; 0 means the shaped data matches
# the monthly data for every group present in both frames. NaN entries are
# skipped by .sum(), so groups missing from one side do not show up here.
compare.sum().sum()


In [None]:
# Show every ba/fuel/month group whose shaped totals differ from the monthly
# totals, i.e. drop only the rows where both differences are exactly zero
# (rows containing NaN are kept, since NaN never compares equal to 0)
compare[
    ~(
        (compare["net_generation_mwh"] == 0)
        & (compare["fuel_consumed_mmbtu"] == 0)
    )
]