The purpose of this notebook is to evaluate how similar plants that report to CEMS are to plants that do not. We will look at:
 - Correlation in hourly profiles
 - Variation in generation
 - Capacity factors?
 - Heat rates?

In [None]:
# import packages
import pandas as pd
import os
import plotly.express as px

%reload_ext autoreload
%autoreload 2

# Tell Python where to look for the open-grid-emissions source modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import download_data
import load_data
from column_checks import get_dtypes
from filepaths import *
import impute_hourly_profiles
import data_cleaning
import output_data
import emissions
import validation
import gross_to_net_generation
import eia930

# data year to analyze; used to build the path of the hourly profiles file that is loaded below
year = 2020
# year-specific path prefix (not referenced by the other code cells visible in this notebook)
path_prefix = f"{year}/"

In [None]:
# load the hourly profiles for the selected year, using the project's column dtypes
profiles = pd.read_csv(
    outputs_folder(f"{year}/hourly_profiles_{year}.csv"), dtype=get_dtypes()
)

# only keep the fuel categories that would report to CEMS
cems_fuel_categories = ["coal", "natural_gas", "petroleum", "other", "biomass", "waste"]
profiles = profiles[profiles["fuel_category"].isin(cems_fuel_categories)]

# only keep data for fleet-months that report some data to CEMS
# NOTE: the aggregation is named with the string "sum" instead of passing the builtin
# `sum`; pandas deprecated passing builtin callables to groupby transform, and the
# string keeps the NaN-skipping grouped sum regardless of pandas version
fleet_month_cems_total = profiles.groupby(
    ["ba_code", "fuel_category", "report_date"]
)["cems_profile"].transform("sum")
profiles = profiles[fleet_month_cems_total > 0]

# only keep data where the residual profile is used for non-cems plants
residual_methods = ["shifted_residual_profile", "residual_profile"]
profiles = profiles[profiles["profile_method"].isin(residual_methods)]

In [None]:
# correlation matrix between the two profile columns for every BA / fuel category / month;
# reset_index() exposes the unnamed inner matrix level as the column "level_3"
pairwise_corr = (
    profiles.groupby(["ba_code", "fuel_category", "report_date"])[["cems_profile", "profile"]]
    .corr()
    .reset_index()
)

# keep only the "cems_profile" row of each matrix and drop the helper columns, so that the
# remaining "profile" column holds the correlation between cems_profile and profile
is_cems_row = pairwise_corr["level_3"] == "cems_profile"
correlations = pairwise_corr[is_cems_row].drop(columns=["level_3", "cems_profile"])
correlations

In [None]:
# box plot of the values in the "profile" column of `correlations` for each BA,
# with one facet per fuel category
px.box(
    data_frame=correlations,
    x="ba_code",
    y="profile",
    facet_col="fuel_category",
)

In [None]:
# show the rows of `correlations` for a single BA and fuel category
ba, fuel = "ISNE", "petroleum"

matches_ba = correlations["ba_code"] == ba
matches_fuel = correlations["fuel_category"] == fuel
correlations[matches_ba & matches_fuel]

In [None]:
# plot the three hourly profile columns for a single BA, fuel category, and month
ba, fuel, month = "ISNE", "petroleum", "2020-02-01"

in_selected_fleet_month = (
    (profiles["ba_code"] == ba)
    & (profiles["fuel_category"] == fuel)
    & (profiles["report_date"] == month)
)
data_to_graph = profiles[in_selected_fleet_month]

px.line(
    data_frame=data_to_graph,
    x="datetime_local",
    y=["cems_profile", "profile", "eia930_profile"],
)

In [None]:
def cov(df, groupby_columns: list, value_column: str):
    """
    Calculates the coefficient of variation (population standard deviation divided by
    the mean) for data grouped by specific columns.

    Note that "cov" here means coefficient of variation, not covariance.

    Args:
        df: pandas dataframe. It is not modified.
        groupby_columns: list (or other sequence) of column names to group the data by
        value_column: string name of column containing the values for which you want cov calculated
    Returns:
        result: a pandas df indexed by the groupby columns, with the columns `count`,
            `mean`, `std_p` (population standard deviation, ddof=0), and `cov`
            (`std_p / mean`). Where a group's mean is zero, `cov` is inf or NaN.
    """
    # Group only the value column: no copy of the full dataframe is needed, and the
    # aggregated result gets flat column names without dropping a column level.
    grouped_values = df.groupby(list(groupby_columns))[value_column]

    result = grouped_values.agg(["count", "mean"])

    # population standard deviation (ddof=0) using the built-in grouped std
    result["std_p"] = grouped_values.std(ddof=0)

    result["cov"] = result["std_p"] / result["mean"]

    return result

In [None]:
# coefficient of variation of the cems_profile column for each BA / fuel category / month
cems_cov = cov(
    profiles,
    groupby_columns=["ba_code", "fuel_category", "report_date"],
    value_column="cems_profile",
)
# turn the grouping keys back into regular columns
cems_cov = cems_cov.reset_index()
cems_cov

In [None]:
# coefficient of variation of the profile column for each BA / fuel category / month
profile_cov = cov(
    profiles,
    groupby_columns=["ba_code", "fuel_category", "report_date"],
    value_column="profile",
)
# turn the grouping keys back into regular columns
profile_cov = profile_cov.reset_index()
profile_cov

In [None]:
# average the per-month cems_profile statistics for ISNE within each fuel category
# numeric_only=True restricts the mean to the numeric statistic columns; without it,
# pandas >= 2.0 raises a TypeError on the non-numeric columns (e.g. ba_code)
cems_cov[cems_cov["ba_code"] == "ISNE"].groupby("fuel_category").mean(numeric_only=True)

In [None]:
# average the per-month profile statistics for ISNE within each fuel category
# numeric_only=True restricts the mean to the numeric statistic columns; without it,
# pandas >= 2.0 raises a TypeError on the non-numeric columns (e.g. ba_code)
profile_cov[profile_cov["ba_code"] == "ISNE"].groupby("fuel_category").mean(numeric_only=True)