In [None]:
# import packages
import pandas as pd
import numpy as np
import os
import plotly.express as px

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys

sys.path.append("../../src/")

import oge.download_data as download_data
import oge.load_data as load_data
from oge.column_checks import get_dtypes
from oge.filepaths import *
import oge.impute_hourly_profiles as impute_hourly_profiles
import oge.data_cleaning as data_cleaning
import oge.output_data as output_data
import oge.emissions as emissions
import oge.validation as validation
import oge.gross_to_net_generation as gross_to_net_generation
import oge.eia930 as eia930
from oge.logging_util import get_logger, configure_root_logger

configure_root_logger()
logger = get_logger("test")


year = 2022
path_prefix = f"{year}/"

## Evaluate plants with 8761 data

In [None]:
data = pd.read_csv(results_folder(f"{year}/plant_data/hourly/us_units/MISO.csv"))

In [None]:
# Reshape to one row per UTC hour and one column per plant; hours for which a plant
# has no record show up as NaN in that plant's column.
test = pd.pivot(
    data,
    index="datetime_utc",
    columns="plant_id_eia",
    values="plant_id_eia",
)
test

In [None]:
test.loc["2023-01-01 06:00:00+00:00", :].dropna()

In [None]:
test[50240]

In [None]:
# Plant 50240 is located in ET, whereas MISO spans MT, CT, and ET. Plant 50240 has data
# starting at 5am UTC (expected for EST) but also ending at 5am UTC (it should end at 4am).
# It looks like this plant has data for December from both CEMS and EIA.
# It is possible that we are using CT to shape the EIA data since this plant is in MISO;
# we need to make sure that we are using plant-specific time zones.

In [None]:
# Load the cleaned subplant-level CEMS output for the configured `year`
# (path built from `year` rather than a hard-coded "2022", matching the results paths above).
cems_cleaned = pd.read_csv(outputs_folder(f"{year}/cems_subplant_{year}.csv"))

In [None]:
# cleaned CEMS records for plant 50240
cems_cleaned.loc[cems_cleaned["plant_id_eia"].eq(50240)]

In [None]:
# It looks like the CEMS data is correct - goes until 4am on 2023-01-01

In [None]:
# Load the hourly profiles output for the configured `year`, parsing both timestamp
# columns as datetimes. The path is built from `year` instead of a hard-coded "2022".
profiles = pd.read_csv(
    outputs_folder(f"{year}/hourly_profiles_{year}.csv"),
    parse_dates=["datetime_utc", "datetime_local"],
)

In [None]:
# hourly profile rows for natural gas in MISO
profiles.loc[
    profiles["ba_code"].eq("MISO") & profiles["fuel_category"].eq("natural_gas")
]

### Evaluate 8759 plants

In [None]:
# Load the hourly plant-level results for ISNE and reshape them to one row per UTC hour
# and one column per plant, so that missing hours appear as NaN.
data = pd.read_csv(results_folder(f"{year}/plant_data/hourly/us_units/ISNE.csv"))
test = pd.pivot(
    data,
    index="datetime_utc",
    columns="plant_id_eia",
    values="plant_id_eia",
)

In [None]:
# hours for which plant 542 has no record
test.loc[test[542].isna()]

In [None]:
# it appears that multiple plants are missing data at this 4/1 4am timestamp
# they all appear to be petroleum - this is likely an issue with the hourly EIA-930 profile being used.
# let's look at those

In [None]:
# Load the hourly profiles output for the configured `year`, parsing both timestamp
# columns as datetimes. The path is built from `year` instead of a hard-coded "2022".
profiles = pd.read_csv(
    outputs_folder(f"{year}/hourly_profiles_{year}.csv"),
    parse_dates=["datetime_utc", "datetime_local"],
)

In [None]:
# ISNE petroleum profile rows with a UTC timestamp on 2022-04-01
profiles.loc[
    profiles["ba_code"].eq("ISNE")
    & profiles["fuel_category"].eq("petroleum")
    & profiles["datetime_utc"].ge("2022-04-01")
    & profiles["datetime_utc"].lt("2022-04-02")
]

In [None]:
# it looks like this profile is non-missing for that entire day, which means it's not an issue with the profile itself
# looking at the plant metadata, we switch from using the 930 profile in march to the CEMS profile in April
# is the CEMS profile missing some data here? Is there an incorrect transition between DST where we are missing an hour?
# did this hour exist in the original CEMS data but we dropped it when removing standalone plant hours?

In [None]:
# let's look at our cleaned cems data first

In [None]:
# Load the cleaned subplant-level CEMS output for the configured `year`
# (path built from `year` rather than a hard-coded "2022").
cems_cleaned = pd.read_csv(outputs_folder(f"{year}/cems_subplant_{year}.csv"))

In [None]:
# cleaned CEMS records for plant 540
cems_cleaned.loc[cems_cleaned["plant_id_eia"].eq(540)]

In [None]:
# the earliest datetime in our cleaned CEMS data is 2022-04-01 05:00:00+00:00
# at this point in the year, we are in EDT, which is UTC -4, so midnight local time on 4/1 should be 4am UTC time.
# it looks like we are missing the first hour of this month

In [None]:
# next, let's look at the "raw" CEMS data from PUDL
# it looks like on the CEMS website, there is no data available prior to 2022-04-01 at hour 0

In [None]:
# Columns to read from the PUDL CEMS parquet file; nothing else is loaded.
cems_columns = [
    # plant / unit identifiers
    "plant_id_epa",  # requested to make sure it has been converted to plant_id_eia
    "plant_id_eia",
    "emissions_unit_id_epa",
    # operating hour
    "operating_datetime_utc",
    "operating_time_hours",
    # load
    "gross_load_mw",
    "steam_load_1000_lbs",
    # emissions and their measurement codes
    "co2_mass_tons",
    "co2_mass_measurement_code",
    "nox_mass_lbs",
    "nox_mass_measurement_code",
    "so2_mass_lbs",
    "so2_mass_measurement_code",
    # heat input
    "heat_content_mmbtu",
]

# Read only the rows for `year` and only the columns listed above.
cems = pd.read_parquet(
    downloads_folder("pudl/hourly_emissions_epacems.parquet"),
    columns=cems_columns,
    filters=[("year", "==", year)],
)

In [None]:
# raw CEMS records for plant 540
cems.loc[cems["plant_id_eia"].eq(540)]

In [None]:
# this data is also available at 5am UTC at the earliest

In [None]:
# independent copy of the cleaned CEMS rows for plant 540, so it can be modified freely
cems_540 = cems_cleaned.loc[cems_cleaned["plant_id_eia"].eq(540)].copy()

In [None]:
# convert report_date to a datetime column so the .dt accessor can be used on it
cems_540["report_date"] = pd.to_datetime(cems_540["report_date"])

In [None]:
# number of days in each record's report month
cems_540["report_date"].dt.days_in_month

In [None]:
# For every plant / subplant / report month: how many hourly records exist, and the
# first and last timestamp among them.
test = (
    cems_540.groupby(["plant_id_eia", "subplant_id", "report_date"])["datetime_utc"]
    .agg(["count", "min", "max"])
    .reset_index()
)

In [None]:
# expected number of hourly records in a complete month: 24 per calendar day
test = test.assign(hours_in_month=test["report_date"].dt.days_in_month.mul(24))
# TODO: adjust March and November for DST

In [None]:
test

In [None]:
# We want to ensure a complete hourly timeseries for all plants in CEMS.
# We want to test and make sure this works for both hourly and monthly data.

# import necessary modules
from itertools import product

# define what will be function inputs
df = cems_cleaned[cems_cleaned["plant_id_eia"].isin([3, 540])]
key_columns = ["plant_id_eia", "subplant_id"]
timeseries_column = ["datetime_utc"]
year = 2022

# Planned steps that are not implemented in this cell yet:
# - add timezones to each CEMS plant
# - for each timezone, create a complete hourly timeseries
#   (we want to do this for each timezone so that we are not adding more than 8760
#   values for any plant)
# - for each group of plants in each timezone, create a complete index, concating all
#   together

col_list = key_columns + timeseries_column

# get a list of lists of unique values in each column
unique_column_values = [list(df[col].unique()) for col in col_list]

# get the cartesian product of all possible combinations
unique_combinations = list(product(*unique_column_values))

# create a complete timeseries based on this
# (column names come from col_list so they always agree with the merge keys below)
complete_timeseries_df = pd.DataFrame(unique_combinations, columns=col_list)

# make sure the datetime dtypes match before merging
# `timeseries_column` is a list, so `df[timeseries_column]` is a DataFrame, which has no
# `.dtype` attribute; cast column by column using each Series' own dtype instead.
for col in timeseries_column:
    complete_timeseries_df[col] = complete_timeseries_df[col].astype(df[col].dtype)

# complete the report dates
# the outer merge keeps every existing row and adds a row for every key/timestamp
# combination not yet present; validate="1:1" raises if the keys are duplicated on
# either side
df = df.merge(
    complete_timeseries_df,
    how="outer",
    on=col_list,
    validate="1:1",
).sort_values(by=col_list)