In [1]:
# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

# import the necessary packages
%reload_ext autoreload
%autoreload 2

# import packages
import sqlalchemy as sa
from pathlib import Path
import pandas as pd
import plotly.express as px
import numpy as np
import statsmodels.formula.api as smf
import warnings

# PUDL
import pudl.analysis.allocate_net_gen as allocate_gen_fuel
import pudl.analysis.epa_crosswalk as epa_crosswalk
import pudl.output.pudltabl

# local packages
import src.data_cleaning as data_cleaning
from src.gross_to_net_generation import *
import src.load_data as load_data

from src.column_checks import get_dtypes

year = 2020


# Test updated approach to GTN conversion

In [15]:
# load data from csv
year = 2020
path_prefix = ''

cems = pd.read_csv(f'../data/outputs/{path_prefix}{year}/cems_{year}.csv', dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])
eia923_allocated = pd.read_csv(f'../data/outputs/{path_prefix}{year}/eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])
plant_attributes = pd.read_csv(f"../data/outputs/{path_prefix}{year}/plant_static_attributes_{year}.csv")


In [34]:
# validate method

# merge together monthly subplant totals from EIA and calculated from CEMS
subplant_month_keys = ["plant_id_eia", "subplant_id", "report_date"]

# Select the column BEFORE aggregating so that only net generation is summed;
# summing the whole frame also tries to add up datetime and string columns.
# min_count=1 keeps all-missing EIA groups as NaN so they can be dropped
# instead of being reported as zero.
eia_netgen = (
    eia923_allocated.groupby(subplant_month_keys, dropna=False)["net_generation_mwh"]
    .sum(min_count=1)
    .reset_index()
    .dropna(subset=["net_generation_mwh"])
)
calculated_netgen = (
    cems.groupby(subplant_month_keys, dropna=False)["net_generation_mwh"]
    .sum()
    .reset_index()
)
validated_ng = eia_netgen.merge(
    calculated_netgen,
    how="inner",
    on=subplant_month_keys,
    suffixes=("_eia", "_calc"),
)

validated_ng = validated_ng.round(3)

# Drop subplant-months where BOTH sources report zero. Each column is tested
# separately: a row sum would also discard rows whose two values are equal and
# opposite, which are genuine mismatches.
netgen_columns = ["net_generation_mwh_eia", "net_generation_mwh_calc"]
validated_ng = validated_ng[(validated_ng[netgen_columns] != 0).any(axis=1)].copy()

# relative error of the calculated value against EIA (inf where EIA reports 0)
validated_ng["pct_error"] = (
    validated_ng["net_generation_mwh_calc"] - validated_ng["net_generation_mwh_eia"]
) / validated_ng["net_generation_mwh_eia"]
validated_ng.sort_values(by="pct_error")

Unnamed: 0,plant_id_eia,subplant_id,report_date,net_generation_mwh_eia,net_generation_mwh_calc,pct_error
17525,50397,,2020-10-01,31857.8,10647.813,-0.665771
17856,50900,,2020-10-01,40936.8,20495.911,-0.499328
0,3,0,2020-02-01,1758.0,1758.000,0.000000
19566,55124,0,2020-02-01,703.0,703.000,0.000000
19565,55124,0,2020-01-01,329339.0,329339.000,0.000000
...,...,...,...,...,...,...
9840,3797,4,2020-04-01,121254.0,121254.000,0.000000
9839,3797,4,2020-03-01,156298.0,156298.000,0.000000
9838,3797,4,2020-02-01,288544.0,288544.000,0.000000
9930,3943,0,2020-08-01,272916.0,272916.000,0.000000


In [38]:
validated_ng[validated_ng["plant_id_eia"] == 50397]

Unnamed: 0,plant_id_eia,subplant_id,report_date,net_generation_mwh_eia,net_generation_mwh_calc,pct_error
17516,50397,,2020-01-01,38111.6,38111.6,0.0
17517,50397,,2020-02-01,34898.3,34898.3,0.0
17518,50397,,2020-03-01,37348.0,37348.0,0.0
17519,50397,,2020-04-01,30793.7,30793.7,0.0
17520,50397,,2020-05-01,31794.8,31794.8,0.0
17521,50397,,2020-06-01,26706.9,26706.9,0.0
17522,50397,,2020-07-01,35122.9,35122.9,0.0
17523,50397,,2020-08-01,33474.2,33474.2,0.0
17524,50397,,2020-09-01,35886.5,35886.5,0.0
17525,50397,,2020-10-01,31857.8,10647.813,-0.665771


In [30]:
# create a new version of cems before net generation was calculated
cems_gross = cems.drop(columns=["net_generation_mwh", "gtn_method"])

In [31]:
gtn_conversions = data_cleaning.calculate_gross_to_net_conversion_factors(
        cems_gross, eia923_allocated, plant_attributes
    )

In [44]:
# Count the non-null values of every remaining column for each
# plant / subplant / month / unit combination.
# NOTE(review): "unitid" is one of the grouping keys, so after the rename the
# "units_in_subplant" column holds unit IDs rather than a count of units.
# Confirm whether this cell is superseded by the cell that follows it.
units_in_subplant = (
    cems.groupby(
        ["plant_id_eia", "subplant_id", "report_date","unitid"], dropna=False
    )
    .count()
    .reset_index()
    .rename(columns={"unitid": "units_in_subplant"})
)

In [61]:
units_in_subplant = cems[
    ["plant_id_eia", "subplant_id", "report_date", "unitid","datetime_utc"]
]
# get a count of how many hours are reported in each month for each unit
units_in_subplant = (
    units_in_subplant.groupby(
        ["plant_id_eia", "subplant_id", "report_date","unitid"], dropna=False
    )
    .count()
    .reset_index())
# remove any units where there is a single hour reported for a month.
# this is likely due to an error in assigning the report date
"""units_in_subplant = units_in_subplant[units_in_subplant["datetime_utc"] > 1]
# now get a count of the number of units in each subplant-month
units_in_subplant = (
    units_in_subplant.groupby(
        ["plant_id_eia", "subplant_id", "report_date"], dropna=False
    )
    .count()
    .reset_index()
    .rename(columns={"unitid": "units_in_subplant"})).drop(columns="datetime_utc")"""

'units_in_subplant = units_in_subplant[units_in_subplant["datetime_utc"] > 1]\n# now get a count of the number of units in each subplant-month\nunits_in_subplant = (\n    units_in_subplant.groupby(\n        ["plant_id_eia", "subplant_id", "report_date"], dropna=False\n    )\n    .count()\n    .reset_index()\n    .rename(columns={"unitid": "units_in_subplant"})).drop(columns="datetime_utc")'

In [62]:
units_in_subplant[units_in_subplant["plant_id_eia"] == 55096]

Unnamed: 0,plant_id_eia,subplant_id,report_date,unitid,datetime_utc
24170,55096,0,2020-05-01,CT,744
24171,55096,0,2020-06-01,CT,720
24172,55096,0,2020-07-01,CT,744
24173,55096,0,2020-08-01,CT,744
24174,55096,0,2020-09-01,CT,720
24175,55096,0,2020-10-01,CT,1
24176,55096,1,2020-05-01,BLR1,744
24177,55096,1,2020-05-01,BLR2,744
24178,55096,1,2020-06-01,BLR1,720
24179,55096,1,2020-06-01,BLR2,720


In [39]:
gtn_conversions[gtn_conversions["plant_id_eia"] == 50397]

Unnamed: 0,plant_id_eia,subplant_id,report_date,hours_in_month,gross_generation_mwh,net_generation_mwh,source,monthly_subplant_ratio,hourly_shift_mw_monthly,annual_subplant_ratio,hourly_shift_mw_annual,monthly_plant_ratio,annual_plant_ratio,plant_primary_fuel,annual_fuel_ratio
17702,50397,,2020-01-01,744.0,0.0,38111.6,both,,51.225269,,46.798554,,,BLQ,5.493358
17703,50397,,2020-02-01,696.0,0.0,34898.3,both,,50.141236,,46.798554,,,BLQ,5.493358
17704,50397,,2020-03-01,743.0,0.0,37348.0,both,,50.266487,,46.798554,,,BLQ,5.493358
17705,50397,,2020-04-01,720.0,0.0,30793.7,both,,42.769028,,46.798554,,,BLQ,5.493358
17706,50397,,2020-05-01,744.0,0.0,31794.8,both,,42.734946,,46.798554,,,BLQ,5.493358
17707,50397,,2020-06-01,720.0,0.0,26706.9,both,,37.092917,,46.798554,,,BLQ,5.493358
17708,50397,,2020-07-01,744.0,0.0,35122.9,both,,47.208199,,46.798554,,,BLQ,5.493358
17709,50397,,2020-08-01,744.0,0.0,33474.2,both,,44.992204,,46.798554,,,BLQ,5.493358
17710,50397,,2020-09-01,720.0,0.0,35886.5,both,,49.842361,,46.798554,,,BLQ,5.493358
17711,50397,,2020-10-01,744.0,0.0,31857.8,both,,42.819624,,46.798554,,,BLQ,5.493358


## recalculate the net generation

In [71]:
# load data from csv
year = 2020
path_prefix = ''

cems = pd.read_csv(f'../data/outputs/{path_prefix}{year}/cems_{year}.csv', dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])
eia923_allocated = pd.read_csv(f'../data/outputs/{path_prefix}{year}/eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])
plant_attributes = pd.read_csv(f"../data/outputs/{path_prefix}{year}/plant_static_attributes_{year}.csv")

In [72]:
# create a new version of cems before net generation was calculated
cems_gross = cems.drop(columns=["net_generation_mwh", "gtn_method"])

cems_gross = data_cleaning.remove_incomplete_unit_months(cems_gross)

   Removing 75 unit-months with incomplete hourly data


In [74]:
cems, gtn_conversions = data_cleaning.convert_gross_to_net_generation(cems_gross, eia923_allocated, plant_attributes)

In [75]:
# validate method

# merge together monthly subplant totals from EIA and calculated from CEMS
subplant_month_keys = ["plant_id_eia", "subplant_id", "report_date"]

# Select the column BEFORE aggregating so that only net generation is summed;
# summing the whole frame also tries to add up datetime and string columns.
# min_count=1 keeps all-missing EIA groups as NaN so they can be dropped
# instead of being reported as zero.
eia_netgen = (
    eia923_allocated.groupby(subplant_month_keys, dropna=False)["net_generation_mwh"]
    .sum(min_count=1)
    .reset_index()
    .dropna(subset=["net_generation_mwh"])
)
calculated_netgen = (
    cems.groupby(subplant_month_keys, dropna=False)["net_generation_mwh"]
    .sum()
    .reset_index()
)
validated_ng = eia_netgen.merge(
    calculated_netgen,
    how="inner",
    on=subplant_month_keys,
    suffixes=("_eia", "_calc"),
)

validated_ng = validated_ng.round(3)

# Drop subplant-months where BOTH sources report zero. Each column is tested
# separately: a row sum would also discard rows whose two values are equal and
# opposite, which are genuine mismatches.
netgen_columns = ["net_generation_mwh_eia", "net_generation_mwh_calc"]
validated_ng = validated_ng[(validated_ng[netgen_columns] != 0).any(axis=1)].copy()

# relative error of the calculated value against EIA (inf where EIA reports 0)
validated_ng["pct_error"] = (
    validated_ng["net_generation_mwh_calc"] - validated_ng["net_generation_mwh_eia"]
) / validated_ng["net_generation_mwh_eia"]
validated_ng.sort_values(by="pct_error")

Unnamed: 0,plant_id_eia,subplant_id,report_date,net_generation_mwh_eia,net_generation_mwh_calc,pct_error
0,3,0,2020-02-01,1758.0,1758.0,0.0
19540,55126,0,2020-03-01,144187.0,144187.0,0.0
19539,55126,0,2020-02-01,164520.0,164520.0,0.0
19538,55126,0,2020-01-01,176816.0,176816.0,0.0
19537,55124,0,2020-12-01,353019.0,353019.0,0.0
...,...,...,...,...,...,...
9826,3797,2,2020-09-01,32029.0,32029.0,0.0
9825,3797,2,2020-08-01,173893.0,173893.0,0.0
9824,3797,2,2020-07-01,163793.0,163793.0,0.0
9916,3935,2,2020-05-01,725095.0,725095.0,0.0


In [None]:
# share of hourly CEMS records assigned to each gross-to-net method
# (records with a missing net_generation_mwh are not counted in the numerator)
records_by_method = cems.groupby("gtn_method", dropna=False)
records_by_method["net_generation_mwh"].count() / len(cems)

In [None]:
# fraction of total net generation attributed to each gross-to-net method;
# the column is selected before summing so that datetime and string columns
# in cems are never aggregated
(
    cems.groupby("gtn_method", dropna=False)["net_generation_mwh"].sum()
    / cems["net_generation_mwh"].sum()
)

In [None]:
gtn_conversions[(gtn_conversions['plant_id_eia'] == 3) & (gtn_conversions['subplant_id'] == 4)]

In [None]:
cems[(cems['plant_id_eia'] == 60) & (cems['subplant_id'] == 0) & (cems['report_date'] == "2020-03-01")]

In [None]:
# hourly gross and net generation for plant 2953, summed across its units
generation_columns = ["gross_generation_mwh", "net_generation_mwh"]
data_to_graph = (
    cems[cems["plant_id_eia"] == 2953]
    # select the two generation columns before summing so the remaining
    # datetime / string columns are not aggregated
    .groupby(["plant_id_eia", "datetime_utc"])[generation_columns]
    .sum()
    .reset_index()
)
px.line(data_to_graph, x="datetime_utc", y=generation_columns)

# Refine Assumption for assumed gross to net generation ratio

In [None]:
# load data from csv
year = 2020
path_prefix = ''
cems = pd.read_csv(f'../data/outputs/{path_prefix}cems_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])
plant_attributes = pd.read_csv(f"../data/outputs/{path_prefix}plant_static_attributes_{year}.csv", dtype=get_dtypes())

In [None]:
eia923_allocated = pd.read_csv(f'../data/outputs/{path_prefix}eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])
# monthly EIA net generation per subplant; the column is selected before
# summing so that only net generation is aggregated
eia_ng_by_plant = (
    eia923_allocated.groupby(
        ["plant_id_eia", "subplant_id", "report_date"], dropna=False
    )["net_generation_mwh"]
    .sum()
    .reset_index()
)

In [None]:
# remove net generation columns from cems
cems = cems.drop(columns=["net_generation_mwh","gtn_method"])
# Recompute net generation, trying the methods in the listed order.
# NOTE(review): an earlier cell in this notebook calls the same function as
# convert_gross_to_net_generation(cems_gross, eia923_allocated, plant_attributes)
# and unpacks two return values. Confirm which signature is current before
# re-running this cell.
cems = data_cleaning.convert_gross_to_net_generation(cems, plant_attributes, year, method_order=[
        "subplant_ratio",
        "subplant_regression",
        "plant_ratio",
        "plant_regression",
    ])

# fraction of total net generation produced by each conversion method
cems.groupby("gtn_method").sum()["net_generation_mwh"] / cems[['net_generation_mwh']].sum().item()

In [None]:
# fraction of total net generation attributed to each gross-to-net method;
# the column is selected before summing so that datetime and string columns
# in cems are never aggregated
(
    cems.groupby("gtn_method")["net_generation_mwh"].sum()
    / cems["net_generation_mwh"].sum()
)

In [None]:
subplant_month_keys = ["plant_id_eia", "subplant_id", "report_date"]

# calculated net generation per subplant-month and conversion method
calculated_ng_by_plant = (
    cems.groupby(subplant_month_keys + ["gtn_method"], dropna=False)["net_generation_mwh"]
    .sum()
    .reset_index()
)
# Both frames are keyed by subplant, so subplant_id must be part of the merge
# key; merging on plant and month alone pairs every EIA subplant with every
# calculated subplant of the same plant-month.
validated_ng = eia_ng_by_plant.merge(
    calculated_ng_by_plant,
    how="inner",
    on=subplant_month_keys,
    suffixes=("_eia", "_calc"),
)
validated_ng

In [None]:

# error metrics of calculated net generation relative to EIA
validated_ng['squared_error'] = (validated_ng['net_generation_mwh_eia'] - validated_ng['net_generation_mwh_calc'])**2
validated_ng['pct_error'] = (validated_ng['net_generation_mwh_calc'] - validated_ng['net_generation_mwh_eia']) / validated_ng['net_generation_mwh_eia']
validated_ng['abs_pct_error'] = validated_ng['pct_error'].abs()
validated_ng = validated_ng.round(2)
# rows where EIA reports zero are excluded because their percent error is
# undefined; numeric_only=True keeps date and string columns out of the average
validated_ng[validated_ng['net_generation_mwh_eia'] != 0].mean(numeric_only=True).round(2)

In [None]:
# average error metrics per conversion method, excluding rows where EIA
# reports zero; numeric_only=True keeps date and string columns out of the mean
validated_ng[validated_ng['net_generation_mwh_eia'] != 0].groupby('gtn_method', dropna=False).mean(numeric_only=True)

In [None]:
# remove net generation columns from cems
#cems = cems.drop(columns=["net_generation_mwh","gtn_method"])
cems = data_cleaning.convert_gross_to_net_generation(cems, plant_attributes, year, method_order=[
        "subplant_ratio",
        "subplant_regression",
        "plant_ratio",
        "plant_regression",
    ])

cems.groupby("gtn_method").sum()["net_generation_mwh"] / cems[['net_generation_mwh']].sum().item()

In [None]:
# Compare plant-level totals: collapse both sides to one row per plant before
# merging. Merging the more detailed frames on plant_id_eia alone is a
# many-to-many join that pairs unrelated subplants and months.
eia_plant_total = eia_ng_by_plant.groupby('plant_id_eia', as_index=False)['net_generation_mwh'].sum()
calc_plant_total = calculated_ng_by_plant.groupby('plant_id_eia', as_index=False)['net_generation_mwh'].sum()
validated_ng = eia_plant_total.merge(calc_plant_total, how="inner", on='plant_id_eia', suffixes=("_eia","_calc"))
validated_ng['squared_error'] = (validated_ng['net_generation_mwh_eia'] - validated_ng['net_generation_mwh_calc'])**2
validated_ng['pct_error'] = (validated_ng['net_generation_mwh_calc'] - validated_ng['net_generation_mwh_eia']) / validated_ng['net_generation_mwh_eia']
validated_ng['abs_pct_error'] = validated_ng['pct_error'].abs()
validated_ng = validated_ng.round(2)
# exclude plants where EIA reports zero (percent error undefined)
validated_ng[validated_ng['net_generation_mwh_eia'] != 0].mean(numeric_only=True).round(2)

In [None]:
# remove net generation columns from cems
cems = cems.drop(columns=["net_generation_mwh","gtn_method"])
cems = data_cleaning.convert_gross_to_net_generation(cems, plant_attributes, year, method_order=[
        "plant_ratio",
        "plant_regression",
        "subplant_ratio",
        "subplant_regression",
    ])

calculated_ng_by_plant = cems.groupby(['plant_id_eia']).sum()['net_generation_mwh'].reset_index()

cems.groupby("gtn_method").sum()["net_generation_mwh"] / cems[['net_generation_mwh']].sum().item()

In [None]:
# Compare plant-level totals: collapse both sides to one row per plant before
# merging. eia_ng_by_plant holds one row per subplant-month, so merging it
# directly against plant totals compares each monthly value with a plant total.
eia_plant_total = eia_ng_by_plant.groupby('plant_id_eia', as_index=False)['net_generation_mwh'].sum()
calc_plant_total = calculated_ng_by_plant.groupby('plant_id_eia', as_index=False)['net_generation_mwh'].sum()
validated_ng = eia_plant_total.merge(calc_plant_total, how="inner", on='plant_id_eia', suffixes=("_eia","_calc"))
validated_ng['squared_error'] = (validated_ng['net_generation_mwh_eia'] - validated_ng['net_generation_mwh_calc'])**2
validated_ng['pct_error'] = (validated_ng['net_generation_mwh_calc'] - validated_ng['net_generation_mwh_eia']) / validated_ng['net_generation_mwh_eia']
validated_ng['abs_pct_error'] = validated_ng['pct_error'].abs()
validated_ng = validated_ng.round(2)
# exclude plants where EIA reports zero (percent error undefined)
validated_ng[validated_ng['net_generation_mwh_eia'] != 0].mean(numeric_only=True).round(2)

In [None]:
# remove net generation columns from cems
cems = cems.drop(columns=["net_generation_mwh","gtn_method"])
cems = data_cleaning.convert_gross_to_net_generation(cems, plant_attributes, year, method_order=[
        "subplant_ratio",
        "plant_ratio",
        "subplant_regression",
        "plant_regression",
    ])

calculated_ng_by_plant = cems.groupby(['plant_id_eia']).sum()['net_generation_mwh'].reset_index()

cems.groupby("gtn_method").sum()["net_generation_mwh"] / cems[['net_generation_mwh']].sum().item()

In [None]:
# Compare plant-level totals: collapse both sides to one row per plant before
# merging. eia_ng_by_plant holds one row per subplant-month, so merging it
# directly against plant totals compares each monthly value with a plant total.
eia_plant_total = eia_ng_by_plant.groupby('plant_id_eia', as_index=False)['net_generation_mwh'].sum()
calc_plant_total = calculated_ng_by_plant.groupby('plant_id_eia', as_index=False)['net_generation_mwh'].sum()
validated_ng = eia_plant_total.merge(calc_plant_total, how="inner", on='plant_id_eia', suffixes=("_eia","_calc"))
validated_ng['squared_error'] = (validated_ng['net_generation_mwh_eia'] - validated_ng['net_generation_mwh_calc'])**2
validated_ng['pct_error'] = (validated_ng['net_generation_mwh_calc'] - validated_ng['net_generation_mwh_eia']) / validated_ng['net_generation_mwh_eia']
validated_ng['abs_pct_error'] = validated_ng['pct_error'].abs()
validated_ng = validated_ng.round(2)
# exclude plants where EIA reports zero (percent error undefined)
validated_ng[validated_ng['net_generation_mwh_eia'] != 0].mean(numeric_only=True).round(2)

# Perform a regression on gross and net generation data from multiple years

We want to run a regression on multiple years of CEMS and EIA data. We can do this by month, and also on an annual basis.

We should probably aggregate by plant-prime mover-environmental equipment.

Steps:
1. Load and clean gross generation data for multiple years
2. Load and distribute net generation data from EIA for multiple years
3. Aggregate / map data from both sources

In [None]:
# year = 2020
# number_of_years = 2

# start_year = year - (number_of_years - 1)
# end_year = year
start_year = 2001
end_year = 2020

In [None]:
# load monthly data from CEMS and EIA-923 for start_year through end_year
cems_monthly, gen_fuel_allocated = load_monthly_gross_and_net_generation(
    start_year, end_year
)

# add subplant ids to the data
print("Creating subplant IDs")
cems_monthly, gen_fuel_allocated = generate_subplant_ids(
    start_year, end_year, cems_monthly, gen_fuel_allocated
)


In [None]:
gen_data, plant_aggregation_columns = combine_gross_and_net_generation_data(
    cems_monthly, gen_fuel_allocated, 'subplant'
)

# calculate the hourly average generation values
gen_data["hours_in_month"] = gen_data["report_date"].dt.daysinmonth * 24
gen_data["gross_generation_mw"] = (
    gen_data["gross_generation_mwh"] / gen_data["hours_in_month"]
)
gen_data["net_generation_mw"] = (
    gen_data["net_generation_mwh"] / gen_data["hours_in_month"]
)

gen_data

In [None]:
# test regression: fit a no-intercept model ("- 1") for a single subplant
# (named plant_id rather than id so the Python builtin id() is not shadowed)
plant_id = 3
subplant = 4
test_data = gen_data[(gen_data['plant_id_eia'] == plant_id) & (gen_data['subplant_id'] == subplant)]
model = smf.ols('net_generation_mw ~ gross_generation_mw -1', data=test_data).fit()
outliers = model.outlier_test()
model.summary()

In [None]:
model.params

In [None]:
# get outputs of final adjusted model
# Coefficients are looked up by name rather than by position: when the formula
# has no intercept term (e.g. "... - 1") params holds only the slope, so
# params[1] does not exist and params[0] is the slope, not the intercept.
slope = model.params["gross_generation_mw"]
# a model fitted without an intercept has an implicit intercept of zero
intercept = model.params.get("Intercept", 0.0)
rsquared = model.rsquared
rsquared_adj = model.rsquared_adj
number_observations = model.nobs

In [None]:
# calculate the ratio for each plant and create a dataframe
gtn_regression = (
    gen_data.dropna().groupby(plant_aggregation_columns).apply(model_gross_to_net)
)

# Load Data
We need to load net generation data from EIA-923 and gross generation data from CEMS

In [None]:
crosswalk = pudl.output.epacems.epa_crosswalk()

In [None]:


# TODO: move the following code to a separate function so that it does not hold these dataframes in memory after calculation

# load monthly data from CEMS and EIA-923 for start_year through end_year
# The loader is brought in by `from src.gross_to_net_generation import *` in the
# import cell, so it is called by its bare name; the module name itself is not
# imported there and referencing it raises NameError.
cems_monthly, gen_fuel_allocated = load_monthly_gross_and_net_generation(
    start_year, end_year
)

# Test Subplant assignment

In [None]:
cems_monthly[cems_monthly['plant_id_eia'] == 7063]

In [None]:
ids = cems_monthly[["plant_id_eia", "unitid"]].drop_duplicates()

In [None]:
ids[ids['plant_id_eia'] == 7063]

In [None]:
crosswalk = pudl.output.epacems.epa_crosswalk()

In [None]:
crosswalk[crosswalk['EIA_PLANT_ID'] == 7063]

In [None]:
# identify epa unit ids missing from the crosswalk

ids.merge(crosswalk[['EIA_PLANT_ID','CAMD_UNIT_ID']], how='outer', left_on=['plant_id_eia','unitid'], right_on=['EIA_PLANT_ID','CAMD_UNIT_ID'], indicator='source')

In [None]:
unique_eia_ids = gen_fuel_allocated[
        ["plant_id_eia", "generator_id"]
    ].drop_duplicates()

In [None]:
unique_eia_ids[unique_eia_ids['plant_id_eia'] == 7063]

# Aggregate the data to the monthly level
For now we will aggregate to plant-month, but in the future we probably want to aggregate at a sub-plant level

## Identify subplants

In [None]:
subplant_crosswalk = identify_subplants(start_year, end_year, gen_fuel_allocated, cems_monthly)
# export the crosswalk to csv
subplant_crosswalk.to_csv('../data/outputs/subplant_crosswalk.csv', index=False)
subplant_crosswalk

In [None]:
# merge the subplant ids into each dataframe
gen_fuel_allocated = gen_fuel_allocated.merge(subplant_crosswalk[['plant_id_eia','generator_id','subplant_id']], how='left', on=['plant_id_eia','generator_id'])
cems_monthly = cems_monthly.merge(subplant_crosswalk[['plant_id_eia','unitid','subplant_id']], how='left', on=['plant_id_eia','unitid'])

## Investigate relationships between units and gens
This is just exploration and not used in this pipeline

In [None]:
crosswalk = load_data.load_epa_eia_crosswalk(year=2020)[['plant_id_epa','unitid','plant_id_eia','generator_id']]

# drop duplicated rows (might have had multiple entries for boilers, which we do not care about)
crosswalk = crosswalk.drop_duplicates()
# drop rows with missing crosswalks
crosswalk = crosswalk[~crosswalk['plant_id_eia'].isna()]

unit_key = ['plant_id_epa', 'unitid']
generator_key = ['plant_id_eia', 'generator_id']

# Every filtered frame below is copied before the 'relationship' column is
# added, so the assignment never targets a view of `crosswalk`.

# one-to-one relationships: neither the unit nor the generator appears twice
one_to_one = crosswalk[
    ~crosswalk.duplicated(subset=unit_key, keep=False)
    & ~crosswalk.duplicated(subset=generator_key, keep=False)
].copy()
one_to_one['relationship'] = '1:1'

# many-to-one relationships: units that appear once, mapped to a shared generator
many_to_one = crosswalk.drop_duplicates(subset=unit_key, keep=False)
many_to_one = many_to_one[many_to_one.duplicated(subset=generator_key, keep=False)].copy()
many_to_one['relationship'] = 'm:1'

# one_to_many: generators that appear once, mapped to a shared unit
one_to_many = crosswalk.drop_duplicates(subset=generator_key, keep=False)
one_to_many = one_to_many[one_to_many.duplicated(subset=unit_key, keep=False)].copy()
one_to_many['relationship'] = '1:m'

# many-to-many
# NOTE: this isn't working entirely correctly
# find all 1:m
many_to_many = crosswalk[crosswalk.duplicated(subset=unit_key, keep=False)]
# create a dataframe that has all epa plant-units in this list that are associated with multiple generators
many_to_many_units = many_to_many[many_to_many.duplicated(subset=generator_key, keep=False)][unit_key].copy()
many_to_many_units['relationship'] = 'm:m'
# merge this back into the 1:m frame to identify all plant-units that are m:m
many_to_many = many_to_many.merge(many_to_many_units.drop_duplicates(), how='left', on=unit_key)
# remove any rows where the relationship column is na (meaning not m:m)
many_to_many = many_to_many[many_to_many['relationship'] == 'm:m']

relationship = pd.concat([one_to_one, one_to_many, many_to_one, many_to_many], axis=0)

# crosswalk rows that were not assigned to any of the four categories
missing = crosswalk.merge(relationship, how='left', on=unit_key + generator_key)
missing[missing['relationship'].isna()]

In [None]:
missing[missing['relationship'].isna()]

## Identify for which months we have complete reporting from all units/generators in a subplant

In [None]:
# for each subplant-month, get a list of all unitids that reported data in CEMS
missing_units_in_subplant = (
    cems.sort_values(by=["plant_id_eia", "subplant_id", "unitid"])
    .groupby(["plant_id_eia", "subplant_id", "report_date"])["unitid"]
    .unique()
    .apply(list)
    .reset_index()
)
# get a complete list of all of the cems units that belong in each subplant
units_in_subplant = (
    subplant_crosswalk.sort_values(by=["plant_id_eia", "subplant_id", "unitid"])
    .groupby(["plant_id_eia", "subplant_id"])["unitid"]
    .unique()
    .apply(list)
    .reset_index()
)
# merge the list of complete units into the list of monthly reporting units
missing_units_in_subplant = missing_units_in_subplant.merge(
    units_in_subplant,
    how="left",
    on=["plant_id_eia", "subplant_id"],
    suffixes=("_reported", "_complete"),
)
# where is there missing unit reporting
missing_units_in_subplant = missing_units_in_subplant[
    (
        missing_units_in_subplant["unitid_reported"]
        != missing_units_in_subplant["unitid_complete"]
    )
    & (missing_units_in_subplant["unitid_complete"].notnull())
]
missing_units_in_subplant


In [None]:
# remove unit-months where there is incomplete data
cems = cems.merge(missing_units_in_subplant[['plant_id_eia','subplant_id','report_date']], how='left', indicator=True)
cems = cems[cems['_merge'] != 'both'].drop(columns='_merge')

In [None]:
# for each subplant-month, get a list of all generator_ids that appear in gen_fuel_allocated
missing_gens_in_subplant = gen_fuel_allocated.sort_values(by=['plant_id_eia','subplant_id','generator_id']).groupby(['plant_id_eia','subplant_id','report_date'])['generator_id'].unique().apply(list).reset_index()
# get a complete list of all of the generators that belong in each subplant according to subplant_crosswalk
# (the name units_in_subplant is reused here, but it holds generator_ids in this cell)
units_in_subplant = subplant_crosswalk.sort_values(by=['plant_id_eia','subplant_id','generator_id']).groupby(['plant_id_eia','subplant_id'])['generator_id'].unique().apply(list).reset_index()
# merge the complete generator list into the list of generators reported each month
missing_gens_in_subplant = missing_gens_in_subplant.merge(units_in_subplant, how='left', on=['plant_id_eia','subplant_id'], suffixes=('_reported','_complete'))
# keep subplant-months whose reported generator list differs from the complete list
# (subplants with no complete list from the crosswalk are ignored)
missing_gens_in_subplant = missing_gens_in_subplant[(missing_gens_in_subplant['generator_id_reported'] != missing_gens_in_subplant['generator_id_complete']) & (missing_gens_in_subplant['generator_id_complete'].notnull())]
missing_gens_in_subplant

In [None]:
gen_fuel_allocated[(gen_fuel_allocated['plant_id_eia'] == 3982) & (gen_fuel_allocated['generator_id'] == '4')]

In [None]:
# remove generator-months where there is incomplete data
gen_fuel_allocated = gen_fuel_allocated.merge(missing_gens_in_subplant[['plant_id_eia','subplant_id','report_date']], how='left', indicator=True)
gen_fuel_allocated = gen_fuel_allocated[gen_fuel_allocated['_merge'] != 'both'].drop(columns='_merge')

## Aggregate Data

In [None]:
groupby_columns = ['plant_id_eia','subplant_id','report_date']

# Select the generation column before summing so that no other column is
# aggregated. min_count=1 keeps subplant-months with only missing net
# generation as NaN instead of turning them into zero.
net_gen = gen_fuel_allocated.groupby(groupby_columns)['net_generation_mwh'].sum(min_count=1).reset_index()
gross_gen = cems_monthly.groupby(groupby_columns)['gross_generation_mwh'].sum().reset_index()
gen_data = gross_gen.merge(net_gen, how='outer', on=groupby_columns)

# calculate the hourly average generation values
gen_data['hours_in_month'] = gen_data['report_date'].dt.daysinmonth * 24
gen_data['gross_generation_mw'] = gen_data['gross_generation_mwh'] / gen_data['hours_in_month']
gen_data['net_generation_mw'] = gen_data['net_generation_mwh'] / gen_data['hours_in_month']
gen_data

In [None]:
# scatter of average gross vs. net generation for one subplant
# (named plant_id rather than id so the Python builtin id() is not shadowed)
plant_id = 54
subplant = 3
data_to_plot = gen_data[(gen_data['plant_id_eia'] == plant_id) & (gen_data['subplant_id'] == subplant)]
# end points of a 1:1 reference line spanning the full range of both series
corner_0 = data_to_plot[['gross_generation_mw','net_generation_mw']].min().min()
corner_1 = data_to_plot[['gross_generation_mw','net_generation_mw']].max().max()

px.scatter(data_to_plot, 
           x='gross_generation_mw', 
           y='net_generation_mw',
           hover_data=['report_date'],
           width=600,
           height=600,
           trendline='ols').add_shape(type="line", x0=corner_0, y0=corner_0, x1=corner_1, y1=corner_1, line=dict(color="Black", width=1))

# Calculate the regressions

In [None]:
# test regression: fit a model with an intercept for a single subplant
# (named plant_id rather than id so the Python builtin id() is not shadowed)
plant_id = 3
subplant = 4
test_data = gen_data[(gen_data['plant_id_eia'] == plant_id) & (gen_data['subplant_id'] == subplant)]
model = smf.ols('net_generation_mw ~ gross_generation_mw', data=test_data).fit()
outliers = model.outlier_test()
model.summary()

In [None]:
def model_gross_to_net(df):
    """
    Fits a linear regression of monthly net generation on gross generation.

    The model is ``net_generation_mw ~ gross_generation_mw`` (with an intercept).
    After the initial fit, observations whose absolute studentized residual is
    greater than 3 are removed and the model is refit. This is done at most two
    times, so that outliers masked by the first removal can also be excluded.

    Args:
        df: dataframe containing all values of gross and net generation that
            should be regressed (columns ``gross_generation_mw`` and
            ``net_generation_mw``)
    Returns:
        tuple of (slope, intercept, rsquared, rsquared_adj, number_observations)
        taken from the last model that was fit successfully
    """
    formula = "net_generation_mw ~ gross_generation_mw"
    max_outlier_passes = 2
    outlier_threshold = 3

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # get a linear model for the data points
        model = smf.ols(formula, data=df).fit()

        # find and remove any outliers, refitting after each removal
        for _ in range(max_outlier_passes):
            outliers = model.outlier_test()
            is_outlier = outliers["student_resid"].abs() > outlier_threshold
            if not is_outlier.any():
                break

            # remove any outlier values
            df = df[~df.index.isin(outliers[is_outlier].index)]

            # get a linear model of the corrected data
            try:
                model = smf.ols(formula, data=df).fit()
            except ValueError:
                # The refit failed (presumably too few observations remain),
                # so keep the most recent model that was fit successfully.
                break

        # Look coefficients up by name: positional integer access on the
        # label-indexed params Series is deprecated in pandas.
        slope = model.params["gross_generation_mw"]
        intercept = model.params["Intercept"]
        rsquared = model.rsquared
        rsquared_adj = model.rsquared_adj
        number_observations = model.nobs

    return slope, intercept, rsquared, rsquared_adj, number_observations

In [None]:
# calculate the ratio for each plant and create a dataframe
gtn_regression = gen_data.dropna().groupby(['plant_id_eia','subplant_id']).apply(model_gross_to_net)
gtn_regression = pd.DataFrame(gtn_regression.tolist(), index=gtn_regression.index, columns=['slope','intercept', 'rsquared','rsquared_adj','observations']).reset_index()
gtn_regression

In [None]:
gtn_regression[(gtn_regression['rsquared_adj'] < 0.9) & (gtn_regression['rsquared_adj'] > 0.8)]

In [None]:
# remove results with a negative rsquared adjusted
gtn_regression = gtn_regression[gtn_regression['rsquared_adj'] > 0]

In [None]:
gtn_regression.to_csv('../data/outputs/gross_to_net_regression.csv', index=False)

In [None]:
gtn_regression.mean().round(2)

# Test Applying the values to the hourly CEMS data

In [None]:
# hourly CEMS records for a single plant, used to test applying the regression
# (named plant_id rather than id so the Python builtin id() is not shadowed)
plant_id = 3
test_cems = cems[cems['plant_id_eia'] == plant_id]
test_cems

In [None]:
test_cems = test_cems.merge(gtn_regression[['plant_id_eia','subplant_id','slope','intercept']], how='left', on=['plant_id_eia','subplant_id'])
# divide the house load by the number of hours in each month
#test_cems['intercept'] = test_cems['intercept'] / (test_cems.report_date.dt.daysinmonth * 24)
# divide the house load by the number of units in each subplant
units_in_subplant = subplant_crosswalk.groupby(['plant_id_eia','subplant_id'])['unitid'].count().reset_index().rename(columns={'unitid':'units_in_subplant'})
test_cems = test_cems.merge(units_in_subplant, how='left', on=['plant_id_eia','subplant_id'])
test_cems['intercept'] = test_cems['intercept'] / test_cems['units_in_subplant']
# calculate net generation
test_cems['net_generation_mwh'] = test_cems['gross_generation_mwh'] * test_cems['slope'] + test_cems['intercept']
test_cems

In [None]:
px.line(test_cems, x='datetime_utc', y=['gross_generation_mwh','net_generation_mwh'], facet_col='unitid')

In [None]:
test_cems.groupby(['plant_id_eia','subplant_id','report_date'])['net_generation_mwh'].sum()

In [None]:
gf_to_compare = gen_fuel_allocated[gen_fuel_allocated['plant_id_eia'] == 3]
gf_to_compare = gf_to_compare.groupby(['plant_id_eia','subplant_id','report_date'])['net_generation_mwh'].sum()
gf_to_compare

# Old Code for reference

In [None]:


def convert_gross_to_net_generation(cems, gen_fuel_allocated):
    """
    Converts hourly gross generation in CEMS to hourly net generation by calculating a gross to net generation ratio

    For each plant-month, the ratio is the net generation in gen_fuel_allocated divided by the gross
    generation in cems. Plant-months without a ratio (or with a negative ratio) use the plant-level
    value returned by model_gross_to_net when its adjusted r-squared is at least 0.7, and otherwise
    net generation is assumed to equal gross generation.

    Inputs:
        cems: dataframe with at least the columns plant_id_eia, report_date and gross_generation_mwh.
            The dataframe passed in is not modified.
        gen_fuel_allocated: dataframe with at least the columns plant_id_eia, report_date, data_source
            and net_generation_mwh. Only rows where data_source == 'cems' and net_generation_mwh
            is not missing are used.

    Returns: 
        new cems df with added columns for net_generation_mwh and gross_to_net_ratio, and a column
        (net_gen_method) indicating the method used to calculate net generation:
        'monthly_ratio', 'annual_regression' or 'net_equals_gross'
    """
    plant_month = ['plant_id_eia', 'report_date']

    # add a placeholder column that assumes a 1:1 gross to net generation ratio
    # if for some reason we are not able to calculate a gross to net generation ratio, this will be used as the default assumption
    # assign() returns a new dataframe, so the dataframe owned by the caller is left untouched
    cems = cems.assign(net_generation_mwh=cems['gross_generation_mwh'])

    # load the allocated eia data for each month where there is corresponding cems data
    has_cems_net_gen = (gen_fuel_allocated['data_source'] == 'cems') & gen_fuel_allocated['net_generation_mwh'].notna()
    # aggregate at the plant month level
    # the column is selected before summing so that no other column (e.g. dates or strings) is aggregated
    eia_plant_month_net_gen = gen_fuel_allocated[has_cems_net_gen].groupby(plant_month)['net_generation_mwh'].sum().reset_index()

    # calculate the total gross generation for each plant month in cems
    cems_plant_month_gross_gen = cems.groupby(plant_month)['gross_generation_mwh'].sum().reset_index()

    # merge the net generation data into the gross generation data
    monthly_gtn_ratio = cems_plant_month_gross_gen.merge(eia_plant_month_net_gen, how='left', on=plant_month)

    # calculate the gtn
    monthly_gtn_ratio['gross_to_net_ratio'] = monthly_gtn_ratio['net_generation_mwh'] / monthly_gtn_ratio['gross_generation_mwh']

    # treat negative monthly ratios as missing
    # NOTE(review): a plant-month with zero gross generation gives an infinite (or NaN) ratio, and
    # infinite ratios are kept here - confirm whether they should also be treated as missing
    monthly_gtn_ratio.loc[monthly_gtn_ratio['gross_to_net_ratio'] < 0, 'gross_to_net_ratio'] = np.nan

    # Set up the regression analysis for missing values

    # only keep values where there are not missing values
    gtn_regression = monthly_gtn_ratio[monthly_gtn_ratio['gross_to_net_ratio'].notna()].copy()
    # calculate the ratio for each plant and create a dataframe
    # model_gross_to_net must return four values per plant, matching the column names below
    # NOTE(review): the regression results elsewhere in this notebook have five columns
    # (slope, intercept, ...) - confirm which version of model_gross_to_net is in scope
    gtn_regression = gtn_regression.groupby('plant_id_eia').apply(model_gross_to_net)
    gtn_regression = pd.DataFrame(gtn_regression.tolist(), index=gtn_regression.index, columns=['gtn_linear', 'rsquared','rsquared_adj','observations']).reset_index()
    # only keep the results with adjusted rsquared values greater than or equal to 0.70
    gtn_regression = gtn_regression[gtn_regression['rsquared_adj'] >= 0.7]

    # merge in regression results
    monthly_gtn_ratio = monthly_gtn_ratio.merge(gtn_regression[['plant_id_eia','gtn_linear']], how='left', on='plant_id_eia')

    # add a status column for how the net generation was calculated 
    ratio_missing = monthly_gtn_ratio['gross_to_net_ratio'].isna()
    regression_missing = monthly_gtn_ratio['gtn_linear'].isna()
    monthly_gtn_ratio['net_gen_method'] = 'monthly_ratio'
    monthly_gtn_ratio.loc[ratio_missing & ~regression_missing, 'net_gen_method'] = 'annual_regression'
    monthly_gtn_ratio.loc[ratio_missing & regression_missing, 'net_gen_method'] = 'net_equals_gross'

    # fill missing values using the ratio from the regression results
    monthly_gtn_ratio['gross_to_net_ratio'] = monthly_gtn_ratio['gross_to_net_ratio'].fillna(monthly_gtn_ratio['gtn_linear'])

    # merge the gtn ratio into the cems data
    cems = cems.merge(monthly_gtn_ratio[['plant_id_eia','report_date','gross_to_net_ratio','net_gen_method']], how='left', on=plant_month)
    # calculate hourly net generation
    net_generation_calculated = cems['gross_generation_mwh'] * cems['gross_to_net_ratio']

    # update the net generation column using the calculated values, keeping the 1:1 placeholder
    # wherever no value could be calculated
    # (explicit assignment instead of Series.update() on a selected column, which relies on
    # modifying the dataframe through an intermediate object)
    cems['net_generation_mwh'] = net_generation_calculated.fillna(cems['net_generation_mwh'])

    # update the method column to indicate which used the default assumption
    cems['net_gen_method'] = cems['net_gen_method'].fillna('net_equals_gross')

    return cems

In [None]:
# TESTING

# grouping keys used throughout this cell
plant_gen_keys = ['plant_id_eia','generator_id']
plant_gen_month_keys = plant_gen_keys + ['report_date']

# 1a: Try monthly GTN by generator
# load the allocated eia data for each month where there is corresponding cems data
# NOTE(review): gen_fuel_allocated is not created in the cells visible here - confirm it is defined before this cell runs
eia_gen_month_net_gen = gen_fuel_allocated[(gen_fuel_allocated['data_source'] == 'cems') & gen_fuel_allocated['net_generation_mwh'].notna()]
# aggregate at the generator month level
# (the column is selected before summing so that non-numeric columns are never aggregated)
eia_gen_month_net_gen = eia_gen_month_net_gen.groupby(plant_gen_month_keys)['net_generation_mwh'].sum().reset_index()

# match unit id to generator id
cems_with_generator_id = data_cleaning.crosswalk_epa_unit_to_eia_generator_id(cems, unique_gen_match=True)
# drop any observations where there is not a match to a generator id
cems_with_generator_id = cems_with_generator_id[cems_with_generator_id['generator_id'].notna()]

# calculate the total gross generation for each generator month in cems
cems_gen_month_gross_gen = cems_with_generator_id.groupby(plant_gen_month_keys)['gross_generation_mwh'].sum().reset_index()

# merge the net generation data into the gross generation data
monthly_gtn_ratio = cems_gen_month_gross_gen.merge(eia_gen_month_net_gen, how='left', on=plant_gen_month_keys)

# calculate the gtn
monthly_gtn_ratio['gtn_ratio_gen_month'] = monthly_gtn_ratio['net_generation_mwh'] / monthly_gtn_ratio['gross_generation_mwh']

#### Calculate annual values by gen

# identify whether any individual month GTNs are very large
annual_gtn_ratio = monthly_gtn_ratio.groupby(plant_gen_keys)['gtn_ratio_gen_month'].max().reset_index()
# only keep values for which the maximum GTN values are unrealistic
# NOTE(review): because only the maximum is checked, a generator is flagged for negative ratios
# only when its largest monthly ratio is negative - confirm this is intended
annual_gtn_ratio = annual_gtn_ratio[(annual_gtn_ratio['gtn_ratio_gen_month'] > 1.1) | (annual_gtn_ratio['gtn_ratio_gen_month'] < 0)]

# calculate annual net gen values
# (only the generation column is summed; report_date is a date and cannot be added up)
eia_gen_year_net_gen = eia_gen_month_net_gen.groupby(plant_gen_keys)['net_generation_mwh'].sum().reset_index()
cems_gen_year_gross_gen = cems_gen_month_gross_gen.groupby(plant_gen_keys)['gross_generation_mwh'].sum().reset_index()

# merge in the annual gross generation and net generation data
annual_gtn_ratio = annual_gtn_ratio.merge(cems_gen_year_gross_gen, how='left', on=plant_gen_keys)
annual_gtn_ratio = annual_gtn_ratio.merge(eia_gen_year_net_gen, how='left', on=plant_gen_keys)

# calculate the gtn
annual_gtn_ratio['gtn_ratio_gen_year'] = annual_gtn_ratio['net_generation_mwh'] / annual_gtn_ratio['gross_generation_mwh']

# merge this back into the monthly data
monthly_gtn_ratio = monthly_gtn_ratio.merge(annual_gtn_ratio[['plant_id_eia','generator_id','gtn_ratio_gen_year']], how='left', on=plant_gen_keys)

# NOTE
# In some cases, using the annual gtn value may lead to overestimating monthly generation
# it might depend on how the generation is getting allocated

monthly_gtn_ratio[monthly_gtn_ratio['plant_id_eia'] == 61242]