In [1]:
# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

# import the necessary packages
%reload_ext autoreload
%autoreload 2

# import packages
import sqlalchemy as sa
from pathlib import Path
import pandas as pd
import plotly.express as px
import numpy as np
import statsmodels.formula.api as smf
import warnings

# PUDL
import pudl.analysis.allocate_net_gen as allocate_gen_fuel
import pudl.analysis.epa_crosswalk as epa_crosswalk
import pudl.output.pudltabl

# local packages
import src.data_cleaning as data_cleaning
from src.gross_to_net_generation import *
import src.load_data as load_data

from src.column_checks import get_dtypes

year = 2020


### Where is data getting duplicated in CEMS?

In [29]:
# load the CEMS data
cems = load_data.load_cems_data(year)

In [30]:
cems[cems.duplicated(subset=["plant_id_eia","unitid","datetime_utc"])]

Unnamed: 0,plant_id_eia,unitid,datetime_utc,operating_time_hours,gross_generation_mwh,steam_load_1000_lb,fuel_consumed_mmbtu,co2_mass_lb,nox_mass_lb,so2_mass_lb,plant_id_epa,co2_mass_measurement_code,nox_mass_measurement_code,so2_mass_measurement_code
1216393,59338,1CTGA,2020-01-01 07:00:00+00:00,0.0,0.0,0.0,0.000000,,,,55306,,,
1216395,59338,1CTGA,2020-01-01 08:00:00+00:00,0.0,0.0,0.0,0.000000,,,,55306,,,
1216397,59338,1CTGA,2020-01-01 09:00:00+00:00,0.0,0.0,0.0,0.000000,,,,55306,,,
1216399,59338,1CTGA,2020-01-01 10:00:00+00:00,0.0,0.0,0.0,0.000000,,,,55306,,,
1216401,59338,1CTGA,2020-01-01 11:00:00+00:00,0.0,0.0,0.0,0.000000,,,,55306,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32997519,10789,CTG-2,2021-01-01 01:00:00+00:00,1.0,224.0,0.0,1643.599976,217508.0,31.200001,0.986,55120,Other,Measured,Measured
32997521,10789,CTG-2,2021-01-01 02:00:00+00:00,1.0,224.0,0.0,1642.199951,217508.0,31.200001,0.985,55120,Other,Measured,Measured
32997523,10789,CTG-2,2021-01-01 03:00:00+00:00,1.0,224.0,0.0,1646.000000,217508.0,31.299999,0.988,55120,Other,Measured,Measured
32997525,10789,CTG-2,2021-01-01 04:00:00+00:00,1.0,224.0,0.0,1646.900024,217508.0,31.299999,0.988,55120,Other,Measured,Measured


In [None]:
# remove non-grid connected plants
cems = remove_plants(
    cems,
    non_grid_connected=True,
    remove_states=["PR"],
    steam_only_plants=False,
    distribution_connected_plants=False,
)

# manually remove steam-only units
cems = manually_remove_steam_units(cems)

# Test updated approach to GTN conversion

In [2]:
# load data from csv
year = 2020
path_prefix = ''

cems = pd.read_csv(f'../data/outputs/{path_prefix}{year}/cems_{year}.csv', dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])
eia923_allocated = pd.read_csv(f'../data/outputs/{path_prefix}{year}/eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])
plant_attributes = pd.read_csv(f"../data/outputs/{path_prefix}{year}/plant_static_attributes_{year}.csv")


In [13]:
# create a new version of cems before net generation was calculated
cems_gross = cems.drop(columns=["net_generation_mwh", "gtn_method"])

In [14]:
gtn_conversions = data_cleaning.calculate_gross_to_net_conversion_factors(
        cems_gross, eia923_allocated, plant_attributes
    )

In [17]:
units_in_subplant = cems[
    ["plant_id_eia", "subplant_id", "report_date", "unitid"]
].drop_duplicates()
units_in_subplant = (
    units_in_subplant.groupby(
        ["plant_id_eia", "subplant_id", "report_date"], dropna=False
    )
    .count()
    .reset_index()
    .rename(columns={"unitid": "units_in_subplant"})
)

In [19]:
units_in_subplant[units_in_subplant["plant_id_eia"] == 58215]

Unnamed: 0,plant_id_eia,subplant_id,report_date,units_in_subplant
28562,58215,,2020-01-01,3
28563,58215,,2020-02-01,3
28564,58215,,2020-03-01,3
28565,58215,,2020-04-01,3
28566,58215,,2020-05-01,3
28567,58215,,2020-06-01,3
28568,58215,,2020-07-01,3
28569,58215,,2020-08-01,3
28570,58215,,2020-09-01,3
28571,58215,,2020-10-01,3


In [15]:
gtn_conversions[gtn_conversions["plant_id_eia"] == 58215]

Unnamed: 0,plant_id_eia,subplant_id,report_date,hours_in_month,gross_generation_mwh,net_generation_mwh,source,monthly_subplant_ratio,hourly_shift_mw_monthly,annual_subplant_ratio,hourly_shift_mw_annual,monthly_plant_ratio,annual_plant_ratio,plant_primary_fuel,annual_fuel_ratio
28562,58215,,2020-01-01,744.0,1179421.0,579322.0,both,0.491192,-806.584247,0.489771,-670.557225,0.491192,0.489771,NG,0.918472
28563,58215,,2020-02-01,696.0,1190622.0,585188.0,both,0.491498,-869.876437,0.489771,-670.557225,0.491498,0.489771,NG,0.918472
28564,58215,,2020-03-01,743.0,1221494.0,599973.0,both,0.49118,-836.502019,0.489771,-670.557225,0.49118,0.489771,NG,0.918472
28565,58215,,2020-04-01,720.0,829448.2,405234.0,both,0.488559,-589.186444,0.489771,-670.557225,0.488559,0.489771,NG,0.918472
28566,58215,,2020-05-01,744.0,837429.2,409560.0,both,0.489068,-575.092984,0.489771,-670.557225,0.489068,0.489771,NG,0.918472
28567,58215,,2020-06-01,720.0,496297.7,241903.0,both,0.487415,-353.325972,0.489771,-670.557225,0.487415,0.489771,NG,0.918472
28568,58215,,2020-07-01,744.0,1233484.0,604632.0,both,0.490182,-845.231183,0.489771,-670.557225,0.490182,0.489771,NG,0.918472
28569,58215,,2020-08-01,744.0,1096368.0,536576.0,both,0.489412,-752.409113,0.489771,-670.557225,0.489412,0.489771,NG,0.918472
28570,58215,,2020-09-01,720.0,978421.7,478772.0,both,0.489331,-693.957917,0.489771,-670.557225,0.489331,0.489771,NG,0.918472
28571,58215,,2020-10-01,744.0,462719.8,224722.0,both,0.485655,-319.889489,0.489771,-670.557225,0.485655,0.489771,NG,0.918472


In [22]:
cems[cems["plant_id_eia"] == 58215]

Unnamed: 0,plant_id_eia,unitid,datetime_utc,operating_time_hours,gross_generation_mwh,steam_load_1000_lb,fuel_consumed_mmbtu,co2_mass_lb,nox_mass_lb,so2_mass_lb,plant_id_epa,co2_mass_measurement_code,nox_mass_measurement_code,so2_mass_measurement_code,report_date,energy_source_code,ch4_mass_lb,n2o_mass_lb,fuel_consumed_for_electricity_mmbtu,co2_mass_lb_for_electricity,ch4_mass_lb_for_electricity,n2o_mass_lb_for_electricity,nox_mass_lb_for_electricity,so2_mass_lb_for_electricity,co2_mass_lb_adjusted,ch4_mass_lb_adjusted,n2o_mass_lb_adjusted,nox_mass_lb_adjusted,so2_mass_lb_adjusted,subplant_id,gtn_method,net_generation_mwh
14485933,58215,1A,2020-01-01 05:00:00+00:00,1.0,347.0,0.0,2363.699951,281000.0,82.699997,1.4,2709,Measured,Measured,Measured,2020-01-01,NG,5.20014,0.520014,2363.699951,281000.0,5.20014,0.520014,82.699997,1.4,281000.0,5.20014,0.520014,82.699997,1.4,,monthly_shift_factor,78.138584
14485934,58215,1A,2020-01-01 05:00:00+00:00,1.0,347.0,0.0,2363.699951,281000.0,82.699997,1.4,2709,Measured,Measured,Measured,2020-01-01,NG,5.20014,0.520014,2363.699951,281000.0,5.20014,0.520014,82.699997,1.4,281000.0,5.20014,0.520014,82.699997,1.4,,monthly_shift_factor,78.138584
14485935,58215,1A,2020-01-01 06:00:00+00:00,1.0,348.0,0.0,2370.699951,281800.0,83.000000,1.4,2709,Measured,Measured,Measured,2020-01-01,NG,5.21554,0.521554,2370.699951,281800.0,5.21554,0.521554,83.000000,1.4,281800.0,5.21554,0.521554,83.000000,1.4,,monthly_shift_factor,79.138584
14485936,58215,1A,2020-01-01 06:00:00+00:00,1.0,348.0,0.0,2370.699951,281800.0,83.000000,1.4,2709,Measured,Measured,Measured,2020-01-01,NG,5.21554,0.521554,2370.699951,281800.0,5.21554,0.521554,83.000000,1.4,281800.0,5.21554,0.521554,83.000000,1.4,,monthly_shift_factor,79.138584
14485937,58215,1A,2020-01-01 07:00:00+00:00,1.0,349.0,0.0,2382.300049,283200.0,83.400002,1.4,2709,Measured,Measured,Measured,2020-01-01,NG,5.24106,0.524106,2382.300049,283200.0,5.24106,0.524106,83.400002,1.4,283200.0,5.24106,0.524106,83.400002,1.4,,monthly_shift_factor,80.138584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14538632,58215,1C,2021-01-01 02:00:00+00:00,1.0,134.0,0.0,1055.599976,125400.0,126.699997,0.6,2709,Measured,Measured,Measured,2020-12-01,NG,2.32232,0.232232,1055.599976,125400.0,2.32232,0.232232,126.699997,0.6,125400.0,2.32232,0.232232,126.699997,0.6,,monthly_shift_factor,-112.273790
14538633,58215,1C,2021-01-01 03:00:00+00:00,1.0,127.0,0.0,1022.900024,121600.0,113.500000,0.6,2709,Measured,Measured,Measured,2020-12-01,NG,2.25038,0.225038,1022.900024,121600.0,2.25038,0.225038,113.500000,0.6,121600.0,2.25038,0.225038,113.500000,0.6,,monthly_shift_factor,-119.273790
14538634,58215,1C,2021-01-01 03:00:00+00:00,1.0,127.0,0.0,1022.900024,121600.0,113.500000,0.6,2709,Measured,Measured,Measured,2020-12-01,NG,2.25038,0.225038,1022.900024,121600.0,2.25038,0.225038,113.500000,0.6,121600.0,2.25038,0.225038,113.500000,0.6,,monthly_shift_factor,-119.273790
14538635,58215,1C,2021-01-01 04:00:00+00:00,1.0,93.0,0.0,866.500000,103000.0,81.500000,0.5,2709,Measured,Measured,Measured,2020-12-01,NG,1.90630,0.190630,866.500000,103000.0,1.90630,0.190630,81.500000,0.5,103000.0,1.90630,0.190630,81.500000,0.5,,monthly_shift_factor,-153.273790


In [24]:
cems[cems.duplicated(subset=["plant_id_eia","unitid","datetime_utc"])]

Unnamed: 0,plant_id_eia,unitid,datetime_utc,operating_time_hours,gross_generation_mwh,steam_load_1000_lb,fuel_consumed_mmbtu,co2_mass_lb,nox_mass_lb,so2_mass_lb,plant_id_epa,co2_mass_measurement_code,nox_mass_measurement_code,so2_mass_measurement_code,report_date,energy_source_code,ch4_mass_lb,n2o_mass_lb,fuel_consumed_for_electricity_mmbtu,co2_mass_lb_for_electricity,ch4_mass_lb_for_electricity,n2o_mass_lb_for_electricity,nox_mass_lb_for_electricity,so2_mass_lb_for_electricity,co2_mass_lb_adjusted,ch4_mass_lb_adjusted,n2o_mass_lb_adjusted,nox_mass_lb_adjusted,so2_mass_lb_adjusted,subplant_id,gtn_method,net_generation_mwh
1245078,59338,1CTGA,2020-01-01 07:00:00+00:00,0.0,0.0,0.0,0.000000,0.0,,,55306,,,,2020-01-01,NG,0.00000,0.000000,0.000000,0.0,0.00000,0.000000,,,0.0,0.00000,0.000000,,,,monthly_shift_factor,-85.378253
1245080,59338,1CTGA,2020-01-01 08:00:00+00:00,0.0,0.0,0.0,0.000000,0.0,,,55306,,,,2020-01-01,NG,0.00000,0.000000,0.000000,0.0,0.00000,0.000000,,,0.0,0.00000,0.000000,,,,monthly_shift_factor,-85.378253
1245082,59338,1CTGA,2020-01-01 09:00:00+00:00,0.0,0.0,0.0,0.000000,0.0,,,55306,,,,2020-01-01,NG,0.00000,0.000000,0.000000,0.0,0.00000,0.000000,,,0.0,0.00000,0.000000,,,,monthly_shift_factor,-85.378253
1245084,59338,1CTGA,2020-01-01 10:00:00+00:00,0.0,0.0,0.0,0.000000,0.0,,,55306,,,,2020-01-01,NG,0.00000,0.000000,0.000000,0.0,0.00000,0.000000,,,0.0,0.00000,0.000000,,,,monthly_shift_factor,-85.378253
1245086,59338,1CTGA,2020-01-01 11:00:00+00:00,0.0,0.0,0.0,0.000000,0.0,,,55306,,,,2020-01-01,NG,0.00000,0.000000,0.000000,0.0,0.00000,0.000000,,,0.0,0.00000,0.000000,,,,monthly_shift_factor,-85.378253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24668767,10789,CTG-2,2021-01-01 01:00:00+00:00,1.0,224.0,0.0,1643.599976,217508.0,31.200001,0.986,55120,Other,Measured,Measured,2020-12-01,NG,3.61592,0.361592,1643.599976,217508.0,3.61592,0.361592,31.200001,0.986,217508.0,3.61592,0.361592,31.200001,0.986,,monthly_shift_factor,9.227204
24668769,10789,CTG-2,2021-01-01 02:00:00+00:00,1.0,224.0,0.0,1642.199951,217508.0,31.200001,0.985,55120,Other,Measured,Measured,2020-12-01,NG,3.61284,0.361284,1642.199951,217508.0,3.61284,0.361284,31.200001,0.985,217508.0,3.61284,0.361284,31.200001,0.985,,monthly_shift_factor,9.227204
24668771,10789,CTG-2,2021-01-01 03:00:00+00:00,1.0,224.0,0.0,1646.000000,217508.0,31.299999,0.988,55120,Other,Measured,Measured,2020-12-01,NG,3.62120,0.362120,1646.000000,217508.0,3.62120,0.362120,31.299999,0.988,217508.0,3.62120,0.362120,31.299999,0.988,,monthly_shift_factor,9.227204
24668773,10789,CTG-2,2021-01-01 04:00:00+00:00,1.0,224.0,0.0,1646.900024,217508.0,31.299999,0.988,55120,Other,Measured,Measured,2020-12-01,NG,3.62318,0.362318,1646.900024,217508.0,3.62318,0.362318,31.299999,0.988,217508.0,3.62318,0.362318,31.299999,0.988,,monthly_shift_factor,9.227204


In [25]:
cems[(cems["plant_id_eia"] == 58215) & (cems["unitid"] == "1A") & (cems["datetime_utc"] == "2020-01-01 06:00:00+00:00")]

Unnamed: 0,plant_id_eia,unitid,datetime_utc,operating_time_hours,gross_generation_mwh,steam_load_1000_lb,fuel_consumed_mmbtu,co2_mass_lb,nox_mass_lb,so2_mass_lb,plant_id_epa,co2_mass_measurement_code,nox_mass_measurement_code,so2_mass_measurement_code,report_date,energy_source_code,ch4_mass_lb,n2o_mass_lb,fuel_consumed_for_electricity_mmbtu,co2_mass_lb_for_electricity,ch4_mass_lb_for_electricity,n2o_mass_lb_for_electricity,nox_mass_lb_for_electricity,so2_mass_lb_for_electricity,co2_mass_lb_adjusted,ch4_mass_lb_adjusted,n2o_mass_lb_adjusted,nox_mass_lb_adjusted,so2_mass_lb_adjusted,subplant_id,gtn_method,net_generation_mwh
14485935,58215,1A,2020-01-01 06:00:00+00:00,1.0,348.0,0.0,2370.699951,281800.0,83.0,1.4,2709,Measured,Measured,Measured,2020-01-01,NG,5.21554,0.521554,2370.699951,281800.0,5.21554,0.521554,83.0,1.4,281800.0,5.21554,0.521554,83.0,1.4,,monthly_shift_factor,79.138584
14485936,58215,1A,2020-01-01 06:00:00+00:00,1.0,348.0,0.0,2370.699951,281800.0,83.0,1.4,2709,Measured,Measured,Measured,2020-01-01,NG,5.21554,0.521554,2370.699951,281800.0,5.21554,0.521554,83.0,1.4,281800.0,5.21554,0.521554,83.0,1.4,,monthly_shift_factor,79.138584


In [None]:
cems, gtn_conversions = data_cleaning.convert_gross_to_net_generation(cems, eia923_allocated, plant_attributes)

In [None]:
# what percentage of hours 
cems.groupby("gtn_method", dropna=False).count()['net_generation_mwh'] / len(cems)

In [None]:
cems.groupby("gtn_method", dropna=False).sum()['net_generation_mwh'] / cems[['net_generation_mwh']].sum().item()

In [9]:
# validate method

# merge together monthly subplant totals from EIA and calculated from CEMS
eia_ng_by_subplant = eia923_allocated.groupby(['plant_id_eia',"report_date"], dropna=False).sum(min_count=1)['net_generation_mwh'].reset_index().dropna(subset="net_generation_mwh")
calculated_ng_by_subplant = cems.groupby(['plant_id_eia',"report_date"], dropna=False).sum()['net_generation_mwh'].reset_index()
validated_ng = eia_ng_by_subplant.merge(calculated_ng_by_subplant, how="inner", on=['plant_id_eia',"report_date"], suffixes=("_eia","_calc"))




In [10]:
validated_ng[validated_ng["plant_id_eia"] == 58215]

Unnamed: 0,plant_id_eia,report_date,net_generation_mwh_eia,net_generation_mwh_calc
13172,58215,2020-01-01,579322.0,-20776.680015
13173,58215,2020-02-01,585188.0,-20246.0
13174,58215,2020-03-01,599973.0,-21548.0
13175,58215,2020-04-01,405234.0,-18980.240004
13176,58215,2020-05-01,409560.0,-18309.180001
13177,58215,2020-06-01,241903.0,-12491.700018
13178,58215,2020-07-01,604632.0,-24220.0
13179,58215,2020-08-01,536576.0,-23216.380011
13180,58215,2020-09-01,478772.0,-20877.7
13181,58215,2020-10-01,224722.0,-13275.78


In [5]:
# validate method

# merge together monthly subplant totals from EIA and calculated from CEMS
eia_ng_by_subplant = eia923_allocated.groupby(['plant_id_eia',"subplant_id","report_date"], dropna=False).sum(min_count=1)['net_generation_mwh'].reset_index().dropna(subset="net_generation_mwh")
calculated_ng_by_subplant = cems.groupby(['plant_id_eia',"subplant_id","report_date","gtn_method"]).sum()['net_generation_mwh'].reset_index()
validated_ng = eia_ng_by_subplant.merge(calculated_ng_by_subplant, how="inner", on=['plant_id_eia',"subplant_id","report_date"], suffixes=("_eia","_calc"))

validated_ng['squared_error'] = (validated_ng['net_generation_mwh_eia'] - validated_ng['net_generation_mwh_calc'])**2
validated_ng['pct_error'] = (validated_ng['net_generation_mwh_calc'] - validated_ng['net_generation_mwh_eia']) / validated_ng['net_generation_mwh_eia']
validated_ng['abs_pct_error'] = abs(validated_ng['pct_error'])
validated_ng = validated_ng.round(2)
validated_ng


Unnamed: 0,plant_id_eia,subplant_id,report_date,net_generation_mwh_eia,gtn_method,net_generation_mwh_calc,squared_error,pct_error,abs_pct_error
0,3,0,2020-02-01,1758.0,annual_plant_ratio,0.0,3090564.00,-1.0,1.0
1,3,0,2020-02-01,1758.0,annual_shift_factor,0.0,3090564.00,-1.0,1.0
2,3,0,2020-02-01,1758.0,monthly_shift_factor,1758.0,0.00,0.0,0.0
3,3,0,2020-02-01,1758.0,annual_fuel_ratio,0.0,3090564.00,-1.0,1.0
4,3,0,2020-02-01,1758.0,gross_as_net,0.0,3090564.00,-1.0,1.0
...,...,...,...,...,...,...,...,...,...
181650,61242,1,2020-12-01,2501.1,annual_plant_ratio,0.0,6255501.21,-1.0,1.0
181651,61242,1,2020-12-01,2501.1,annual_shift_factor,0.0,6255501.21,-1.0,1.0
181652,61242,1,2020-12-01,2501.1,monthly_shift_factor,0.0,6255501.21,-1.0,1.0
181653,61242,1,2020-12-01,2501.1,annual_fuel_ratio,0.0,6255501.21,-1.0,1.0


In [None]:
validated_ng[(validated_ng["pct_error"] != 0) & (validated_ng["net_generation_mwh_calc"] != 0) & (validated_ng["net_generation_mwh_eia"] != 0)]

In [None]:
gtn_conversions[(gtn_conversions['plant_id_eia'] == 3) & (gtn_conversions['subplant_id'] == 4)]

In [None]:
cems[(cems['plant_id_eia'] == 60) & (cems['subplant_id'] == 0) & (cems['report_date'] == "2020-03-01")]

In [None]:
data_to_graph = cems[cems['plant_id_eia'] == 2953].groupby(["plant_id_eia","datetime_utc"]).sum()[["gross_generation_mwh","net_generation_mwh"]].reset_index()
px.line(data_to_graph, x="datetime_utc", y=["gross_generation_mwh","net_generation_mwh"])

# Refine Assumption for assumed gross to net generation ratio

In [None]:
# load data from csv
year = 2020
path_prefix = ''
cems = pd.read_csv(f'../data/outputs/{path_prefix}cems_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])
plant_attributes = pd.read_csv(f"../data/outputs/{path_prefix}plant_static_attributes_{year}.csv", dtype=get_dtypes())

In [None]:
eia923_allocated = pd.read_csv(f'../data/outputs/{path_prefix}eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])
eia_ng_by_plant = eia923_allocated.groupby(['plant_id_eia',"subplant_id","report_date"], dropna=False).sum()['net_generation_mwh'].reset_index()

In [None]:
# remove net generation columns from cems
cems = cems.drop(columns=["net_generation_mwh","gtn_method"])
cems = data_cleaning.convert_gross_to_net_generation(cems, plant_attributes, year, method_order=[
        "subplant_ratio",
        "subplant_regression",
        "plant_ratio",
        "plant_regression",
    ])

cems.groupby("gtn_method").sum()["net_generation_mwh"] / cems[['net_generation_mwh']].sum().item()

In [None]:
cems.groupby("gtn_method").sum()["net_generation_mwh"] / cems[['net_generation_mwh']].sum().item()

In [None]:
calculated_ng_by_plant = cems.groupby(['plant_id_eia',"subplant_id","report_date",'gtn_method'], dropna=False).sum()['net_generation_mwh'].reset_index()
validated_ng = eia_ng_by_plant.merge(calculated_ng_by_plant, how="inner", on=['plant_id_eia',"report_date"], suffixes=("_eia","_calc"))
validated_ng

In [None]:

validated_ng['squared_error'] = (validated_ng['net_generation_mwh_eia'] - validated_ng['net_generation_mwh_calc'])**2
validated_ng['pct_error'] = (validated_ng['net_generation_mwh_calc'] - validated_ng['net_generation_mwh_eia']) / validated_ng['net_generation_mwh_eia']
validated_ng['abs_pct_error'] = abs(validated_ng['pct_error'])
validated_ng = validated_ng.round(2)
validated_ng[validated_ng['net_generation_mwh_eia'] != 0].mean().round(2)

In [None]:
validated_ng[validated_ng['net_generation_mwh_eia'] != 0].groupby('gtn_method', dropna=False).mean()

In [None]:
# remove net generation columns from cems
#cems = cems.drop(columns=["net_generation_mwh","gtn_method"])
cems = data_cleaning.convert_gross_to_net_generation(cems, plant_attributes, year, method_order=[
        "subplant_ratio",
        "subplant_regression",
        "plant_ratio",
        "plant_regression",
    ])

cems.groupby("gtn_method").sum()["net_generation_mwh"] / cems[['net_generation_mwh']].sum().item()

In [None]:
validated_ng = eia_ng_by_plant.merge(calculated_ng_by_plant, how="inner", on='plant_id_eia', suffixes=("_eia","_calc"))
validated_ng['squared_error'] = (validated_ng['net_generation_mwh_eia'] - validated_ng['net_generation_mwh_calc'])**2
validated_ng['pct_error'] = (validated_ng['net_generation_mwh_calc'] - validated_ng['net_generation_mwh_eia']) / validated_ng['net_generation_mwh_eia']
validated_ng['abs_pct_error'] = abs(validated_ng['pct_error'])
validated_ng = validated_ng.round(2)
validated_ng[validated_ng['net_generation_mwh_eia'] != 0].mean().round(2)

In [None]:
# remove net generation columns from cems
cems = cems.drop(columns=["net_generation_mwh","gtn_method"])
cems = data_cleaning.convert_gross_to_net_generation(cems, plant_attributes, year, method_order=[
        "plant_ratio",
        "plant_regression",
        "subplant_ratio",
        "subplant_regression",
    ])

calculated_ng_by_plant = cems.groupby(['plant_id_eia']).sum()['net_generation_mwh'].reset_index()

cems.groupby("gtn_method").sum()["net_generation_mwh"] / cems[['net_generation_mwh']].sum().item()

In [None]:
validated_ng = eia_ng_by_plant.merge(calculated_ng_by_plant, how="inner", on='plant_id_eia', suffixes=("_eia","_calc"))
validated_ng['squared_error'] = (validated_ng['net_generation_mwh_eia'] - validated_ng['net_generation_mwh_calc'])**2
validated_ng['pct_error'] = (validated_ng['net_generation_mwh_calc'] - validated_ng['net_generation_mwh_eia']) / validated_ng['net_generation_mwh_eia']
validated_ng['abs_pct_error'] = abs(validated_ng['pct_error'])
validated_ng = validated_ng.round(2)
validated_ng[validated_ng['net_generation_mwh_eia'] != 0].mean().round(2)

In [None]:
# remove net generation columns from cems
cems = cems.drop(columns=["net_generation_mwh","gtn_method"])
cems = data_cleaning.convert_gross_to_net_generation(cems, plant_attributes, year, method_order=[
        "subplant_ratio",
        "plant_ratio",
        "subplant_regression",
        "plant_regression",
    ])

calculated_ng_by_plant = cems.groupby(['plant_id_eia']).sum()['net_generation_mwh'].reset_index()

cems.groupby("gtn_method").sum()["net_generation_mwh"] / cems[['net_generation_mwh']].sum().item()

In [None]:
validated_ng = eia_ng_by_plant.merge(calculated_ng_by_plant, how="inner", on='plant_id_eia', suffixes=("_eia","_calc"))
validated_ng['squared_error'] = (validated_ng['net_generation_mwh_eia'] - validated_ng['net_generation_mwh_calc'])**2
validated_ng['pct_error'] = (validated_ng['net_generation_mwh_calc'] - validated_ng['net_generation_mwh_eia']) / validated_ng['net_generation_mwh_eia']
validated_ng['abs_pct_error'] = abs(validated_ng['pct_error'])
validated_ng = validated_ng.round(2)
validated_ng[validated_ng['net_generation_mwh_eia'] != 0].mean().round(2)

# Perform a regression on gross and net generation data from multiple years

We want to run a regression on multiple years of CEMS and EIA data. We can do by month, and also on an annual basis

We should probably aggregate by plant-prime mover-environmental equipment.

Steps:
1. Load and clean gross generation data for multiple years
2. Load and distribute net generation data from EIA for multiple years
3. Aggregate / map data from both source

In [None]:
# year = 2020
# number_of_years = 2

# start_year = year - (number_of_years - 1)
# end_year = year
start_year = 2001
end_year = 2020

In [None]:
# load 5 years of monthly data from CEMS and EIA-923
cems_monthly, gen_fuel_allocated = load_monthly_gross_and_net_generation(
    start_year, end_year
)

# add subplant ids to the data
print("Creating subplant IDs")
cems_monthly, gen_fuel_allocated = generate_subplant_ids(
    start_year, end_year, cems_monthly, gen_fuel_allocated
)


In [None]:
gen_data, plant_aggregation_columns = combine_gross_and_net_generation_data(
    cems_monthly, gen_fuel_allocated, 'subplant'
)

# calculate the hourly average generation values
gen_data["hours_in_month"] = gen_data["report_date"].dt.daysinmonth * 24
gen_data["gross_generation_mw"] = (
    gen_data["gross_generation_mwh"] / gen_data["hours_in_month"]
)
gen_data["net_generation_mw"] = (
    gen_data["net_generation_mwh"] / gen_data["hours_in_month"]
)

gen_data

In [None]:
# test regression
id = 3
subplant = 4
test_data = gen_data[(gen_data['plant_id_eia'] == id) & (gen_data['subplant_id'] == subplant)]
model = smf.ols('net_generation_mw ~ gross_generation_mw -1', data=test_data).fit()
outliers = model.outlier_test()
model.summary()

In [None]:
model.params

In [None]:
# get outputs of final adjusted model
slope = model.params[1]
intercept = model.params[0]
rsquared = model.rsquared
rsquared_adj = model.rsquared_adj
number_observations = model.nobs

In [None]:
# calculate the ratio for each plant and create a dataframe
gtn_regression = (
    gen_data.dropna().groupby(plant_aggregation_columns).apply(model_gross_to_net)
)

# Load Data
We need to load net generation data from EIA-923 and gross generation data from CEMS

In [None]:
crosswalk = pudl.output.epacems.epa_crosswalk()

In [None]:


# TODO: move the following code to a separate function so that it does not hold these dataframes in memory after calculation

# load 5 years of monthly data from CEMS and EIA-923
cems_monthly, gen_fuel_allocated = gross_to_net_generation.load_monthly_gross_and_net_generation(
    start_year, end_year
)

# Test Subplant assignment

In [None]:
cems_monthly[cems_monthly['plant_id_eia'] == 7063]

In [None]:
ids = cems_monthly[["plant_id_eia", "unitid"]].drop_duplicates()

In [None]:
ids[ids['plant_id_eia'] == 7063]

In [None]:
crosswalk = pudl.output.epacems.epa_crosswalk()

In [None]:
crosswalk[crosswalk['EIA_PLANT_ID'] == 7063]

In [None]:
# identify epa unit ids missing from the crosswalk

ids.merge(crosswalk[['EIA_PLANT_ID','CAMD_UNIT_ID']], how='outer', left_on=['plant_id_eia','unitid'], right_on=['EIA_PLANT_ID','CAMD_UNIT_ID'], indicator='source')

In [None]:
unique_eia_ids = gen_fuel_allocated[
        ["plant_id_eia", "generator_id"]
    ].drop_duplicates()

In [None]:
unique_eia_ids[unique_eia_ids['plant_id_eia'] == 7063]

# Aggregate the data to the monthly level
For now we will aggregate to plant-month, but in the future we probably want to aggregate at a sub-plant level

## Identify subplants

In [None]:
subplant_crosswalk = identify_subplants(start_year, end_year, gen_fuel_allocated, cems_monthly)
# export the crosswalk to csv
subplant_crosswalk.to_csv('../data/outputs/subplant_crosswalk.csv', index=False)
subplant_crosswalk

In [None]:
# merge the subplant ids into each dataframe
gen_fuel_allocated = gen_fuel_allocated.merge(subplant_crosswalk[['plant_id_eia','generator_id','subplant_id']], how='left', on=['plant_id_eia','generator_id'])
cems_monthly = cems_monthly.merge(subplant_crosswalk[['plant_id_eia','unitid','subplant_id']], how='left', on=['plant_id_eia','unitid'])

## Investigate relationships between units and gens
This is just exploration and not used in this pipeline

In [None]:
crosswalk = load_data.load_epa_eia_crosswalk(year=2020)[['plant_id_epa','unitid','plant_id_eia','generator_id']]

# drop duplicated rows (might have had multiple entries for boilers, which we do not care about)
crosswalk = crosswalk.drop_duplicates()
# drop rows with missing crosswalks
crosswalk = crosswalk[~crosswalk['plant_id_eia'].isna()]

# one-to-one relationships
one_to_one = crosswalk.copy()[(~crosswalk.duplicated(subset=['plant_id_epa','unitid'], keep=False)) & (~crosswalk.duplicated(subset=['plant_id_eia','generator_id'], keep=False))]
#crosswalk.drop_duplicates(subset=['plant_id_epa','unitid'], keep=False).drop_duplicates(subset=['plant_id_eia','generator_id'], keep=False)
one_to_one['relationship'] = '1:1'

# many-to-one relationships
many_to_one = crosswalk.drop_duplicates(subset=['plant_id_epa','unitid'], keep=False)
many_to_one = many_to_one[many_to_one.duplicated(subset=['plant_id_eia','generator_id'], keep=False)]
many_to_one['relationship'] = 'm:1'

# one_to_many
one_to_many = crosswalk.drop_duplicates(subset=['plant_id_eia','generator_id'], keep=False)
one_to_many = one_to_many[one_to_many.duplicated(subset=['plant_id_epa','unitid'], keep=False)]
one_to_many['relationship'] = '1:m'

# many-to-many
# NOTE: this isn't working entirely correctly
# find all 1:m
many_to_many = crosswalk[crosswalk.duplicated(subset=['plant_id_epa','unitid'], keep=False)]
# create a dataframe that has all epa plant-units in this list that are associated with multiple generators
many_to_many_units = many_to_many[many_to_many.duplicated(subset=['plant_id_eia','generator_id'], keep=False)][['plant_id_epa','unitid']]
many_to_many_units['relationship'] = 'm:m'
# merge this back into the 1:m frame to identify all plant-units that are m:m
many_to_many = many_to_many.merge(many_to_many_units.drop_duplicates(), how='left', on=['plant_id_epa','unitid'])
#remove any rows where the relationship column is na (meaning not m:m)
many_to_many = many_to_many[many_to_many['relationship'] == 'm:m']

relationship = pd.concat([one_to_one,one_to_many,many_to_one,many_to_many], axis=0)

missing = crosswalk.merge(relationship, how='left', on=['plant_id_epa','unitid','plant_id_eia','generator_id'])
missing[missing['relationship'].isna()]

In [None]:
missing[missing['relationship'].isna()]

## Identify for which months we have complete reporting from all units/generators in a subplant

In [None]:
# for each subplant-month, get a list of all unitids that reported data in CEMS
missing_units_in_subplant = (
    cems.sort_values(by=["plant_id_eia", "subplant_id", "unitid"])
    .groupby(["plant_id_eia", "subplant_id", "report_date"])["unitid"]
    .unique()
    .apply(list)
    .reset_index()
)
# get a complete list of all of the cems units that belong in each subplant
units_in_subplant = (
    subplant_crosswalk.sort_values(by=["plant_id_eia", "subplant_id", "unitid"])
    .groupby(["plant_id_eia", "subplant_id"])["unitid"]
    .unique()
    .apply(list)
    .reset_index()
)
# merge the list of complete units into the list of monthly reporting units
missing_units_in_subplant = missing_units_in_subplant.merge(
    units_in_subplant,
    how="left",
    on=["plant_id_eia", "subplant_id"],
    suffixes=("_reported", "_complete"),
)
# where is there missing unit reporting
missing_units_in_subplant = missing_units_in_subplant[
    (
        missing_units_in_subplant["unitid_reported"]
        != missing_units_in_subplant["unitid_complete"]
    )
    & (missing_units_in_subplant["unitid_complete"].notnull())
]
missing_units_in_subplant


In [None]:
# remove unit-months where there is incomplete data
cems = cems.merge(missing_units_in_subplant[['plant_id_eia','subplant_id','report_date']], how='left', indicator=True)
cems = cems[cems['_merge'] != 'both'].drop(columns='_merge')

In [None]:
# for each subplant-month, get a list of all generator_ids that reported data in CEMS
missing_gens_in_subplant = gen_fuel_allocated.sort_values(by=['plant_id_eia','subplant_id','generator_id']).groupby(['plant_id_eia','subplant_id','report_date'])['generator_id'].unique().apply(list).reset_index()
# get a complete list of all of the cems units that belong in each subplant
units_in_subplant = subplant_crosswalk.sort_values(by=['plant_id_eia','subplant_id','generator_id']).groupby(['plant_id_eia','subplant_id'])['generator_id'].unique().apply(list).reset_index()
# merge the list of complete units into the list of monthly reporting units
missing_gens_in_subplant = missing_gens_in_subplant.merge(units_in_subplant, how='left', on=['plant_id_eia','subplant_id'], suffixes=('_reported','_complete'))
# where is there missing unit reporting
missing_gens_in_subplant = missing_gens_in_subplant[(missing_gens_in_subplant['generator_id_reported'] != missing_gens_in_subplant['generator_id_complete']) & (missing_gens_in_subplant['generator_id_complete'].notnull())]
missing_gens_in_subplant

In [None]:
gen_fuel_allocated[(gen_fuel_allocated['plant_id_eia'] == 3982) & (gen_fuel_allocated['generator_id'] == '4')]

In [None]:
# remove generator-months where there is incomplete data
gen_fuel_allocated = gen_fuel_allocated.merge(missing_gens_in_subplant[['plant_id_eia','subplant_id','report_date']], how='left', indicator=True)
gen_fuel_allocated = gen_fuel_allocated[gen_fuel_allocated['_merge'] != 'both'].drop(columns='_merge')

## Aggregate Data

In [None]:
groupby_columns = ['plant_id_eia','subplant_id','report_date']

net_gen = gen_fuel_allocated.groupby(groupby_columns).sum(min_count=1)['net_generation_mwh'].reset_index()
gross_gen = cems_monthly.groupby(groupby_columns).sum()['gross_generation_mwh'].reset_index()
gen_data = gross_gen.merge(net_gen, how='outer', on=groupby_columns)

# calculate the hourly average generation values
gen_data['hours_in_month'] = gen_data['report_date'].dt.daysinmonth * 24
gen_data['gross_generation_mw'] = gen_data['gross_generation_mwh'] / gen_data['hours_in_month']
gen_data['net_generation_mw'] = gen_data['net_generation_mwh'] / gen_data['hours_in_month']
gen_data

In [None]:
id = 54
subplant = 3
data_to_plot = gen_data[(gen_data['plant_id_eia'] == id) & (gen_data['subplant_id'] == subplant)]
corner_0 = data_to_plot[['gross_generation_mw','net_generation_mw']].min().min()
corner_1 = data_to_plot[['gross_generation_mw','net_generation_mw']].max().max()

px.scatter(data_to_plot, 
           x='gross_generation_mw', 
           y='net_generation_mw',
           hover_data=['report_date'],
           width=600,
           height=600,
           trendline='ols').add_shape(type="line", x0=corner_0, y0=corner_0, x1=corner_1, y1=corner_1, line=dict(color="Black", width=1))

# Calculate the regressions

In [None]:
# test regression
id = 3
subplant = 4
test_data = gen_data[(gen_data['plant_id_eia'] == id) & (gen_data['subplant_id'] == subplant)]
model = smf.ols('net_generation_mw ~ gross_generation_mw', data=test_data).fit()
outliers = model.outlier_test()
model.summary()

In [None]:
def model_gross_to_net(df):
    """
    Performs a linear regression model of monthly gross to net generation.

    Performs recursive outlier removal up to two times if the absolute value of 
    the studentizes residual > 3

    Args:
        df: dataframe containing all values of gross and net generation that should be regressed
    Returns:
        various model parameters
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # get a linear model for the data points
        model = smf.ols('net_generation_mw ~ gross_generation_mw', data=df).fit()

        # find and remove any outliers recursively up to two times
        outliers = model.outlier_test()
        if abs(outliers['student_resid']).max() > 3:
            # remove any outlier values
            df = df[~df.index.isin(outliers[abs(outliers['student_resid']) > 3].index)]

            # get a linear model of the corrected data
            model = smf.ols(
                    'net_generation_mw ~ gross_generation_mw', data=df).fit()
            
            outliers = model.outlier_test()

            # perform this removal one more time in case any outliers were masked by the first outlier(s)
            if abs(outliers['student_resid']).max() > 3:
                # remove any outlier values
                df = df[~df.index.isin(outliers[abs(outliers['student_resid']) > 3].index)]

                # get a linear model of the corrected data
                try:
                    model = smf.ols(
                            'net_generation_mw ~ gross_generation_mw', data=df).fit()
                except ValueError:
                    pass
        slope = model.params[1]
        intercept = model.params[0]
        rsquared = model.rsquared
        rsquared_adj = model.rsquared_adj
        number_observations = model.nobs

    return slope, intercept, rsquared, rsquared_adj, number_observations

In [None]:
# calculate the ratio for each plant and create a dataframe
gtn_regression = gen_data.dropna().groupby(['plant_id_eia','subplant_id']).apply(model_gross_to_net)
gtn_regression = pd.DataFrame(gtn_regression.tolist(), index=gtn_regression.index, columns=['slope','intercept', 'rsquared','rsquared_adj','observations']).reset_index()
gtn_regression

In [None]:
gtn_regression[(gtn_regression['rsquared_adj'] < 0.9) & (gtn_regression['rsquared_adj'] > 0.8)]

In [None]:
# remove results with a negative rsquared adjusted
gtn_regression = gtn_regression[gtn_regression['rsquared_adj'] > 0]

In [None]:
gtn_regression.to_csv('../data/outputs/gross_to_net_regression.csv', index=False)

In [None]:
gtn_regression.mean().round(2)

# Test Applying the values to the hourly CEMS data

In [None]:
id = 3
test_cems = cems[cems['plant_id_eia'] == id]
test_cems

In [None]:
test_cems = test_cems.merge(gtn_regression[['plant_id_eia','subplant_id','slope','intercept']], how='left', on=['plant_id_eia','subplant_id'])
# divide the house load by the number of hours in each month
#test_cems['intercept'] = test_cems['intercept'] / (test_cems.report_date.dt.daysinmonth * 24)
# divide the house load by the number of units in each subplant
units_in_subplant = subplant_crosswalk.groupby(['plant_id_eia','subplant_id'])['unitid'].count().reset_index().rename(columns={'unitid':'units_in_subplant'})
test_cems = test_cems.merge(units_in_subplant, how='left', on=['plant_id_eia','subplant_id'])
test_cems['intercept'] = test_cems['intercept'] / test_cems['units_in_subplant']
# calculate net generation
test_cems['net_generation_mwh'] = test_cems['gross_generation_mwh'] * test_cems['slope'] + test_cems['intercept']
test_cems

In [None]:
px.line(test_cems, x='datetime_utc', y=['gross_generation_mwh','net_generation_mwh'], facet_col='unitid')

In [None]:
test_cems.groupby(['plant_id_eia','subplant_id','report_date'])['net_generation_mwh'].sum()

In [None]:
gf_to_compare = gen_fuel_allocated[gen_fuel_allocated['plant_id_eia'] == 3]
gf_to_compare = gf_to_compare.groupby(['plant_id_eia','subplant_id','report_date'])['net_generation_mwh'].sum()
gf_to_compare

# Old Code for reference

In [None]:


def convert_gross_to_net_generation(cems, gen_fuel_allocated):
    """
    Converts hourly gross generation in CEMS to hourly net generation by calculating a gross to net generation ratio
    Inputs:

    Returns: 
        cems df with an added column for net_generation_mwh and a column indicated the method used to calculate net generation
    """

    # add a placeholder column that assumes a 1:1 gross to net generation ratio
    # if for some reason we are not able to calculate a gross to net generation ratio, this will be used as the default assumption
    cems['net_generation_mwh'] = cems['gross_generation_mwh']

    # load the allocated eia data for each month where there is corresponding cems data
    eia_plant_month_net_gen = gen_fuel_allocated[(gen_fuel_allocated['data_source'] == 'cems') & ~(gen_fuel_allocated['net_generation_mwh'].isna())]
    # aggregate at the plant month level
    eia_plant_month_net_gen = eia_plant_month_net_gen.groupby(['plant_id_eia','report_date']).sum()['net_generation_mwh'].reset_index()

    # calculate the total gross generation for each plant month in cems
    cems_plant_month_gross_gen = cems.groupby(['plant_id_eia','report_date']).sum()['gross_generation_mwh'].reset_index()

    # merge the net generation data into the gross generation data
    monthly_gtn_ratio = cems_plant_month_gross_gen.merge(eia_plant_month_net_gen, how='left', on=['plant_id_eia','report_date'])

    # calculate the gtn
    monthly_gtn_ratio['gross_to_net_ratio'] = monthly_gtn_ratio['net_generation_mwh'] / monthly_gtn_ratio['gross_generation_mwh']

    # only keep values where the monthly ratio is greater than zero
    monthly_gtn_ratio.loc[(monthly_gtn_ratio['gross_to_net_ratio'] < 0), 'gross_to_net_ratio'] = np.NaN

    # Set up the regression analysis for missing values

    # only keep values where there are not missing values
    gtn_regression = monthly_gtn_ratio.copy()[~(monthly_gtn_ratio['gross_to_net_ratio'].isna())]
    # calculate the ratio for each plant and create a dataframe
    gtn_regression = gtn_regression.groupby('plant_id_eia').apply(model_gross_to_net)
    gtn_regression = pd.DataFrame(gtn_regression.tolist(), index=gtn_regression.index, columns=['gtn_linear', 'rsquared','rsquared_adj','observations']).reset_index()
    # only keep the results with adjusted rsquared values greater than 0.70
    gtn_regression = gtn_regression[gtn_regression['rsquared_adj'] >= 0.7]

    # merge in regression results
    monthly_gtn_ratio = monthly_gtn_ratio.merge(gtn_regression[['plant_id_eia','gtn_linear']], how='left', on='plant_id_eia')

    # add a status column for how the net generation was calculated 
    monthly_gtn_ratio['net_gen_method'] = 'monthly_ratio'
    monthly_gtn_ratio.loc[(monthly_gtn_ratio['gross_to_net_ratio'].isna()) & ~(monthly_gtn_ratio['gtn_linear'].isna()), 'net_gen_method'] = 'annual_regression'
    monthly_gtn_ratio.loc[(monthly_gtn_ratio['gross_to_net_ratio'].isna()) & (monthly_gtn_ratio['gtn_linear'].isna()), 'net_gen_method'] = 'net_equals_gross'

    # fill missing values using the ratio from the regression results
    monthly_gtn_ratio['gross_to_net_ratio'] = monthly_gtn_ratio['gross_to_net_ratio'].fillna(monthly_gtn_ratio['gtn_linear'])

    # merge the gtn ratio into the cems data
    cems = cems.merge(monthly_gtn_ratio[['plant_id_eia','report_date','gross_to_net_ratio','net_gen_method']], how='left', on=['plant_id_eia','report_date'])
    # calculate hourly net generation
    cems['net_generation_mwh_calculated'] = cems['gross_generation_mwh'] * cems['gross_to_net_ratio']
    
    # update the net generation column using the calculated values
    cems['net_generation_mwh'].update(cems['net_generation_mwh_calculated'])

    # update the method column to indicate which used the default assumption
    cems['net_gen_method'] = cems['net_gen_method'].fillna('net_equals_gross')

    # drop the calculated column
    cems = cems.drop(columns=['net_generation_mwh_calculated'])

    return cems

In [None]:
# TESTING

# 1a: Try monthly GTN by generator
# load the allocated eia data for each month where there is corresponding cems data
eia_gen_month_net_gen = gen_fuel_allocated[(gen_fuel_allocated['data_source'] == 'cems') & ~(gen_fuel_allocated['net_generation_mwh'].isna())]
# aggregate at the generator month level
eia_gen_month_net_gen = eia_gen_month_net_gen.groupby(['plant_id_eia','generator_id','report_date']).sum()['net_generation_mwh'].reset_index()

# match unit id to generator id
cems_gen_month_gross_gen = data_cleaning.crosswalk_epa_unit_to_eia_generator_id(cems, unique_gen_match=True)
# drop any observations where there is not a match to a generator id
cems_gen_month_gross_gen = cems_gen_month_gross_gen[~cems_gen_month_gross_gen['generator_id'].isna()]

# calculate the total gross generation for each generator month in cems
cems_gen_month_gross_gen = cems_gen_month_gross_gen.groupby(['plant_id_eia','generator_id','report_date']).sum()['gross_generation_mwh'].reset_index()

# merge the net generation data into the gross generation data
monthly_gtn_ratio = cems_gen_month_gross_gen.merge(eia_gen_month_net_gen, how='left', on=['plant_id_eia','generator_id','report_date'])

# calculate the gtn
monthly_gtn_ratio['gtn_ratio_gen_month'] = monthly_gtn_ratio['net_generation_mwh'] / monthly_gtn_ratio['gross_generation_mwh']

#### Calculate annual values by gen

# identify whether any individual month GTNs are very large
annual_gtn_ratio = monthly_gtn_ratio.groupby(['plant_id_eia','generator_id']).max()['gtn_ratio_gen_month'].reset_index()
# only keep values for which the maximum GTN values are unrealistic
annual_gtn_ratio = annual_gtn_ratio[(annual_gtn_ratio['gtn_ratio_gen_month'] > 1.1) | (annual_gtn_ratio['gtn_ratio_gen_month'] < 0)]

# calculate annual net gen values
eia_gen_year_net_gen = eia_gen_month_net_gen.groupby(['plant_id_eia','generator_id']).sum().reset_index()
cems_gen_year_gross_gen = cems_gen_month_gross_gen.groupby(['plant_id_eia','generator_id']).sum().reset_index()

# merge in the annual gross generation and net generation data
annual_gtn_ratio = annual_gtn_ratio.merge(cems_gen_year_gross_gen, how='left', on=['plant_id_eia','generator_id'])
annual_gtn_ratio = annual_gtn_ratio.merge(eia_gen_year_net_gen, how='left', on=['plant_id_eia','generator_id'])

# calculate the gtn
annual_gtn_ratio['gtn_ratio_gen_year'] = annual_gtn_ratio['net_generation_mwh'] / annual_gtn_ratio['gross_generation_mwh']

# merge this back into the monthly data
monthly_gtn_ratio = monthly_gtn_ratio.merge(annual_gtn_ratio[['plant_id_eia','generator_id','gtn_ratio_gen_year']], how='left', on=['plant_id_eia','generator_id'])

# NOTE
# In some cases, using the annual gtn value may lead to overestimating monthly generation
# it might depend on how the generation is getting allocated

monthly_gtn_ratio[monthly_gtn_ratio['plant_id_eia'] == 61242]