# Perform a regression on gross and net generation data from multiple years

We want to run a regression on multiple years of CEMS and EIA data. We can do this by month, and also on an annual basis.

We should probably aggregate by plant-prime mover-environmental equipment.

Steps:
1. Load and clean gross generation data for multiple years
2. Load and distribute net generation data from EIA for multiple years
3. Aggregate / map data from both sources

In [70]:
# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

# import the necessary packages
%reload_ext autoreload
%autoreload 2

# import packages
import sqlalchemy as sa
from pathlib import Path
import pandas as pd
import plotly.express as px
import numpy as np
import statsmodels.formula.api as smf
import warnings

import pudl.analysis.allocate_net_gen as allocate_gen_fuel
import pudl.output.pudltabl

import src.data_cleaning as data_cleaning
import src.gross_to_net_generation as gross_to_net_generation
import src.load_data as load_data


# Load Data
We need to load net generation data from EIA-923 and gross generation data from CEMS

In [2]:
# inclusive range of report years to load
# (kept to two years for now to test the multi-year functionality)
start_year, end_year = 2019, 2020

# connect to the local PUDL sqlite database
pudl_db = "sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite"
pudl_engine = sa.create_engine(pudl_db)

# create pudl_out, spanning Jan 1 of the first year through Dec 31 of the last year
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine,
    freq="MS",
    start_date=f"{start_year}-01-01",
    end_date=f"{end_year}-12-31",
)

In [3]:
# allocate net generation and heat input to each generator-fuel grouping (inner call),
# then aggregate the allocated data to the generator level, summing net generation (outer call)
gen_fuel_allocated = allocate_gen_fuel.agg_by_generator(
    allocate_gen_fuel.allocate_gen_fuel_by_generator_energy_source(
        pudl_out, drop_interim_cols=True
    ),
    sum_cols=["net_generation_mwh"],
)

In [13]:
# columns to use from the CEMS database (identical for every year)
cems_columns = [
    "plant_id_eia",
    "unitid",
    "operating_datetime_utc",
    "operating_time_hours",
    "gross_load_mw"]

cems_all = []
for year in range(start_year, end_year+1):
    print(f'loading {year}')

    # load this year's partition of the CEMS parquet data
    cems_path = f"../data/pudl/pudl_data/parquet/epacems/year={year}"
    cems_year = pd.read_parquet(cems_path, columns=cems_columns)

    # only keep observations where gross load is greater than zero, and
    # rename cems plant_id_eia to plant_id_epa (PUDL simply renames the ORISPL_CODE column
    # from the raw CEMS data as 'plant_id_eia' without actually crosswalking to the EIA id)
    cems_year = (
        cems_year
        .loc[cems_year['gross_load_mw'] > 0]
        .rename(columns={"plant_id_eia": "plant_id_epa",})
    )

    # if the unitid has any leading zeros, remove them
    cems_year["unitid"] = cems_year["unitid"].str.lstrip("0")

    # crosswalk the plant IDs and add a plant_id_eia column
    cems_year = data_cleaning.crosswalk_epa_eia_plant_ids(cems_year, year)

    # treat missing operating time as zero hours, then calculate gross generation
    # by multiplying gross_load_mw by operating_time_hours
    cems_year["operating_time_hours"] = cems_year["operating_time_hours"].fillna(0)
    cems_year["gross_generation_mwh"] = cems_year["gross_load_mw"] * cems_year["operating_time_hours"]

    cems_all.append(cems_year)

# stack all years into a single dataframe
cems = pd.concat(cems_all, axis=0)

# add a report date
print('adding report date')
cems = data_cleaning.add_report_date(cems)

loading 2019
loading 2020
adding report date


# Aggregate the data to the monthly level
For now we will aggregate to plant-month, but in the future we probably want to aggregate at a sub-plant level

In [105]:
groupby_columns = ['plant_id_eia','report_date']

# Select the value column BEFORE aggregating. Calling .sum() on the whole grouped frame
# would also try to sum every other column (e.g. the hourly datetime and string unit id
# columns in cems), which is slow and raises a TypeError for datetime columns in pandas >= 2.0.
# min_count=1 keeps a plant-month as NaN (rather than 0) when all of its net generation values are missing
net_gen = gen_fuel_allocated.groupby(groupby_columns)['net_generation_mwh'].sum(min_count=1).reset_index()
gross_gen = cems.groupby(groupby_columns)['gross_generation_mwh'].sum().reset_index()

# outer merge so that plant-months that appear in only one of the two sources are retained
gen_data = gross_gen.merge(net_gen, how='outer', on=groupby_columns)

# add a column for the number of hours in each month
gen_data['hours_in_month'] = gen_data['report_date'].dt.daysinmonth * 24

In [106]:
# display the merged plant-month gross and net generation data
gen_data

Unnamed: 0,plant_id_eia,report_date,gross_generation_mwh,net_generation_mwh,hours_in_month
0,3,2019-01-01,1164732.50,1119662.004,744
1,3,2019-02-01,1013578.50,978682.003,672
2,3,2019-03-01,1098591.25,1055741.002,744
3,3,2019-04-01,896556.00,858151.999,720
4,3,2019-05-01,1098707.00,1057926.000,744
...,...,...,...,...,...
245368,64837,2020-08-01,,,744
245369,64837,2020-09-01,,,720
245370,64837,2020-10-01,,,744
245371,64837,2020-11-01,,,720


In [95]:
# plant to inspect (named `plant_id` rather than `id` so that the builtin id() is not shadowed)
plant_id = 2504
data_to_plot = gen_data[gen_data['plant_id_eia'] == plant_id]

# smallest and largest value across both generation columns, used as the
# end points of a 1:1 (net == gross) reference line
generation_columns = ['gross_generation_mwh','net_generation_mwh']
corner_0 = data_to_plot[generation_columns].min().min()
corner_1 = data_to_plot[generation_columns].max().max()

# scatter of monthly net vs gross generation with an OLS trendline
fig = px.scatter(data_to_plot,
                 x='gross_generation_mwh',
                 y='net_generation_mwh',
                 hover_data=['report_date'],
                 width=600,
                 height=600,
                 trendline='ols')

# overlay the 1:1 reference line
fig.add_shape(type="line", x0=corner_0, y0=corner_0, x1=corner_1, y1=corner_1, line=dict(color="Black", width=1))
fig

# Calculate the regressions

In [108]:
# test regression
id = 3
test_data =  gen_data[gen_data['plant_id_eia'] == id]
model = smf.ols('net_generation_mwh ~ gross_generation_mwh', data=test_data).fit()
model.summary()

0,1,2,3
Dep. Variable:,net_generation_mwh,R-squared:,0.999
Model:,OLS,Adj. R-squared:,0.999
Method:,Least Squares,F-statistic:,7492.0
Date:,"Fri, 06 May 2022",Prob (F-statistic):,1.7700000000000001e-29
Time:,15:52:22,Log-Likelihood:,-227.19
No. Observations:,23,AIC:,460.4
Df Residuals:,20,BIC:,463.8
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.417e+04,4.17e+04,1.539,0.139,-2.28e+04,1.51e+05
gross_generation_mwh,0.9574,0.008,120.328,0.000,0.941,0.974
hours_in_month,-78.9712,58.157,-1.358,0.190,-200.285,42.342

0,1,2,3
Omnibus:,3.933,Durbin-Watson:,1.826
Prob(Omnibus):,0.14,Jarque-Bera (JB):,2.263
Skew:,0.722,Prob(JB):,0.322
Kurtosis:,3.525,Cond. No.,41300000.0


In [71]:
def model_gross_to_net(df, outlier_pvalue=0.5):
    """
    Create a linear regression model of monthly gross to net generation

    An ordinary least squares model `net_generation_mwh ~ gross_generation_mwh` is fit to `df`.
    Any observations flagged by the model's outlier test are then removed and the model is
    re-fit on the remaining observations. If the outlier test or the re-fit raises a
    ValueError, the initial fit is used instead.

    All warnings raised while fitting are suppressed.

    Args:
        df: dataframe containing `net_generation_mwh` and `gross_generation_mwh` columns
        outlier_pvalue: observations whose Bonferroni-adjusted outlier test p-value
            ('bonf(p)') is below this threshold are dropped before re-fitting. Default 0.5.
    Returns:
        tuple of (slope, intercept, rsquared, rsquared_adj, number_observations) for the final model.
        NOTE: for groups with very few observations these statistics can be degenerate
        (e.g. an rsquared of -inf or a missing rsquared_adj), so callers should filter on
        number_observations and/or rsquared_adj before using the results.
    """
    formula = 'net_generation_mwh ~ gross_generation_mwh'

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # get a linear model for the data points
        model = smf.ols(formula, data=df).fit()

        # find and remove any outliers
        try:
            outliers = model.outlier_test()
            corrected = df[~df.index.isin(
                outliers[outliers['bonf(p)'] < outlier_pvalue].index)]

            # get a linear model of the corrected data
            model = smf.ols(formula, data=corrected).fit()
        except ValueError:
            # deliberate best effort: keep the model fit to the uncorrected data
            pass

        # look up the coefficients by label instead of by position: `model.params` is a
        # label-indexed Series, and integer keys on such a Series are deprecated in pandas
        slope = model.params['gross_generation_mwh']
        intercept = model.params['Intercept']
        # the statistics are read inside the warnings context so that any warnings
        # emitted while they are computed stay suppressed as well
        rsquared = model.rsquared
        rsquared_adj = model.rsquared_adj
        number_observations = model.nobs

    return slope, intercept, rsquared, rsquared_adj, number_observations

In [75]:
# calculate the ratio for each plant and create a dataframe
gtn_regression = gen_data.dropna().groupby('plant_id_eia').apply(model_gross_to_net)
gtn_regression = pd.DataFrame(gtn_regression.tolist(), index=gtn_regression.index, columns=['slope','intercept', 'rsquared','rsquared_adj','observations']).reset_index()

In [100]:
# show the per-plant regression results ordered from lowest to highest r-squared
gtn_regression.sort_values(by='rsquared')

Unnamed: 0,plant_id_eia,slope,intercept,rsquared,rsquared_adj,observations
657,8008,0.084477,0.001408,-inf,,1.0
484,6181,-540.621593,-107.266190,-inf,,1.0
353,3344,0.336085,0.000676,-inf,,1.0
159,1382,0.772641,0.000044,-inf,,1.0
1102,56015,0.077852,0.000917,-inf,,1.0
...,...,...,...,...,...,...
139,1235,0.877425,-57.755562,1.0,,2.0
277,2503,0.264500,-4.684000,1.0,,2.0
697,10350,0.877368,3.055006,1.0,,2.0
1012,55486,0.871724,682.465537,1.0,,2.0


In [97]:
# mean of every column across plants, rounded to two decimals
# NOTE(review): this also averages the plant_id_eia identifier column, and any -inf
# rsquared values (seen for plants with a single observation) make the mean rsquared -inf
gtn_regression.mean().round(2)

plant_id_eia    26565.27
slope              49.85
intercept        5078.48
rsquared            -inf
rsquared_adj        0.76
observations       19.80
dtype: float64

# Old Code for reference

In [None]:


def convert_gross_to_net_generation(cems, gen_fuel_allocated):
    """
    Converts hourly gross generation in CEMS to hourly net generation by calculating a gross to net generation ratio

    The ratio applied to each hour is chosen per plant-month, in order of preference:
        1. 'monthly_ratio': EIA net generation / CEMS gross generation for that plant-month
        2. 'annual_regression': the slope of the plant-level regression from `model_gross_to_net`,
           used when the monthly ratio is missing or negative and the regression has an
           adjusted r-squared of at least 0.7
        3. 'net_equals_gross': a 1:1 ratio, used when neither of the above is available

    Inputs:
        cems: hourly CEMS dataframe containing `plant_id_eia`, `report_date`, and `gross_generation_mwh` columns.
            NOTE: a `net_generation_mwh` placeholder column is added to this dataframe in place.
        gen_fuel_allocated: allocated EIA dataframe containing `plant_id_eia`, `report_date`,
            `net_generation_mwh`, and `data_source` columns

    Returns: 
        cems df with an added column for net_generation_mwh and a column indicated the method used to calculate net generation
        (the `gross_to_net_ratio` that was applied is also included)
    """
    plant_month = ['plant_id_eia','report_date']

    # add a placeholder column that assumes a 1:1 gross to net generation ratio
    # if for some reason we are not able to calculate a gross to net generation ratio, this will be used as the default assumption
    cems['net_generation_mwh'] = cems['gross_generation_mwh']

    # load the allocated eia data for each month where there is corresponding cems data
    eia_plant_month_net_gen = gen_fuel_allocated[(gen_fuel_allocated['data_source'] == 'cems') & gen_fuel_allocated['net_generation_mwh'].notna()]
    # aggregate at the plant month level
    # (the value column is selected before summing so that no other column is ever summed)
    eia_plant_month_net_gen = eia_plant_month_net_gen.groupby(plant_month)['net_generation_mwh'].sum().reset_index()

    # calculate the total gross generation for each plant month in cems
    cems_plant_month_gross_gen = cems.groupby(plant_month)['gross_generation_mwh'].sum().reset_index()

    # merge the net generation data into the gross generation data
    monthly_gtn_ratio = cems_plant_month_gross_gen.merge(eia_plant_month_net_gen, how='left', on=plant_month)

    # calculate the gtn
    monthly_gtn_ratio['gross_to_net_ratio'] = monthly_gtn_ratio['net_generation_mwh'] / monthly_gtn_ratio['gross_generation_mwh']

    # discard any negative monthly ratios (np.nan rather than np.NaN, which was removed in NumPy 2.0)
    monthly_gtn_ratio.loc[(monthly_gtn_ratio['gross_to_net_ratio'] < 0), 'gross_to_net_ratio'] = np.nan

    # Set up the regression analysis for missing values

    # only keep values where there are not missing values
    gtn_regression = monthly_gtn_ratio[monthly_gtn_ratio['gross_to_net_ratio'].notna()]
    # calculate the ratio for each plant and create a dataframe
    gtn_regression = gtn_regression.groupby('plant_id_eia').apply(model_gross_to_net)
    # model_gross_to_net returns five values (slope, intercept, rsquared, rsquared_adj, observations),
    # so five column names are required; the slope is used as the linear gross to net ratio
    gtn_regression = pd.DataFrame(gtn_regression.tolist(), index=gtn_regression.index, columns=['gtn_linear','intercept','rsquared','rsquared_adj','observations']).reset_index()
    # only keep the results with adjusted rsquared values greater than 0.70
    gtn_regression = gtn_regression[gtn_regression['rsquared_adj'] >= 0.7]

    # merge in regression results
    monthly_gtn_ratio = monthly_gtn_ratio.merge(gtn_regression[['plant_id_eia','gtn_linear']], how='left', on='plant_id_eia')

    # add a status column for how the net generation was calculated 
    missing_monthly_ratio = monthly_gtn_ratio['gross_to_net_ratio'].isna()
    missing_regression = monthly_gtn_ratio['gtn_linear'].isna()
    monthly_gtn_ratio['net_gen_method'] = 'monthly_ratio'
    monthly_gtn_ratio.loc[missing_monthly_ratio & ~missing_regression, 'net_gen_method'] = 'annual_regression'
    monthly_gtn_ratio.loc[missing_monthly_ratio & missing_regression, 'net_gen_method'] = 'net_equals_gross'

    # fill missing values using the ratio from the regression results
    monthly_gtn_ratio['gross_to_net_ratio'] = monthly_gtn_ratio['gross_to_net_ratio'].fillna(monthly_gtn_ratio['gtn_linear'])

    # merge the gtn ratio into the cems data
    cems = cems.merge(monthly_gtn_ratio[['plant_id_eia','report_date','gross_to_net_ratio','net_gen_method']], how='left', on=plant_month)

    # calculate hourly net generation, falling back to the 1:1 placeholder wherever no ratio is available
    # (assigned as a whole column: an in-place `.update()` on a selected column is not
    # guaranteed to write back to the dataframe)
    net_generation_mwh_calculated = cems['gross_generation_mwh'] * cems['gross_to_net_ratio']
    cems['net_generation_mwh'] = net_generation_mwh_calculated.fillna(cems['net_generation_mwh'])

    # update the method column to indicate which used the default assumption
    cems['net_gen_method'] = cems['net_gen_method'].fillna('net_equals_gross')

    return cems

In [None]:
# TESTING

# NOTE: throughout this cell the value column is selected BEFORE aggregating, so that
# datetime / string columns are never passed to sum() (a TypeError in pandas >= 2.0)
generator_columns = ['plant_id_eia','generator_id']
generator_month_columns = generator_columns + ['report_date']

# 1a: Try monthly GTN by generator
# load the allocated eia data for each month where there is corresponding cems data
eia_gen_month_net_gen = gen_fuel_allocated[(gen_fuel_allocated['data_source'] == 'cems') & ~(gen_fuel_allocated['net_generation_mwh'].isna())]
# aggregate at the generator month level
eia_gen_month_net_gen = eia_gen_month_net_gen.groupby(generator_month_columns)['net_generation_mwh'].sum().reset_index()

# match unit id to generator id
cems_gen_month_gross_gen = data_cleaning.crosswalk_epa_unit_to_eia_generator_id(cems, unique_gen_match=True)
# drop any observations where there is not a match to a generator id
cems_gen_month_gross_gen = cems_gen_month_gross_gen[~cems_gen_month_gross_gen['generator_id'].isna()]

# calculate the total gross generation for each generator month in cems
cems_gen_month_gross_gen = cems_gen_month_gross_gen.groupby(generator_month_columns)['gross_generation_mwh'].sum().reset_index()

# merge the net generation data into the gross generation data
monthly_gtn_ratio = cems_gen_month_gross_gen.merge(eia_gen_month_net_gen, how='left', on=generator_month_columns)

# calculate the gtn
monthly_gtn_ratio['gtn_ratio_gen_month'] = monthly_gtn_ratio['net_generation_mwh'] / monthly_gtn_ratio['gross_generation_mwh']

#### Calculate annual values by gen

# identify whether any individual month GTNs are very large
annual_gtn_ratio = monthly_gtn_ratio.groupby(generator_columns)['gtn_ratio_gen_month'].max().reset_index()
# only keep values for which the maximum GTN values are unrealistic
annual_gtn_ratio = annual_gtn_ratio[(annual_gtn_ratio['gtn_ratio_gen_month'] > 1.1) | (annual_gtn_ratio['gtn_ratio_gen_month'] < 0)]

# calculate annual net gen values
# (only the generation column is summed; the monthly frames also hold a report_date column that cannot be summed)
eia_gen_year_net_gen = eia_gen_month_net_gen.groupby(generator_columns)['net_generation_mwh'].sum().reset_index()
cems_gen_year_gross_gen = cems_gen_month_gross_gen.groupby(generator_columns)['gross_generation_mwh'].sum().reset_index()

# merge in the annual gross generation and net generation data
annual_gtn_ratio = annual_gtn_ratio.merge(cems_gen_year_gross_gen, how='left', on=generator_columns)
annual_gtn_ratio = annual_gtn_ratio.merge(eia_gen_year_net_gen, how='left', on=generator_columns)

# calculate the gtn
annual_gtn_ratio['gtn_ratio_gen_year'] = annual_gtn_ratio['net_generation_mwh'] / annual_gtn_ratio['gross_generation_mwh']

# merge this back into the monthly data
monthly_gtn_ratio = monthly_gtn_ratio.merge(annual_gtn_ratio[['plant_id_eia','generator_id','gtn_ratio_gen_year']], how='left', on=generator_columns)

# NOTE
# In some cases, using the annual gtn value may lead to overestimating monthly generation
# it might depend on how the generation is getting allocated

monthly_gtn_ratio[monthly_gtn_ratio['plant_id_eia'] == 61242]