In [1]:
# import packages
import pandas as pd
import numpy as np
import os
import plotly.express as px



# import packages
import argparse
import shutil

%reload_ext autoreload
%autoreload 2

# Tell Python where to look for the local open-grid-emissions modules.
import sys

OGE_SRC_PATH = '../../../open-grid-emissions/src/'
# Only append when the entry is missing, so that re-running this cell does not
# pile up duplicate copies of the same path on sys.path.
if OGE_SRC_PATH not in sys.path:
    sys.path.append(OGE_SRC_PATH)

# import local modules
import download_data
import load_data
import data_cleaning
import emissions
import gross_to_net_generation
import impute_hourly_profiles
import eia930
import validation
import output_data
import consumed
from filepaths import downloads_folder, outputs_folder, results_folder

(CVXPY) Dec 10 01:53:42 PM: Encountered unexpected exception importing solver OSQP:
ImportError('DLL load failed while importing qdldl: The specified module could not be found.')


In [2]:
# Run configuration shared by the cells below.
# `year` is passed to nearly every loading / cleaning / emissions call.
year: int = 2020
# Passed together with `year` to `data_cleaning.identify_subplants`
# (presumably the number of years of data to consider — confirm in that function).
number_of_years: int = 2

In [4]:
# [x] test subplant assignment
# Runs subplant identification with `year` and `number_of_years` as positional
# arguments; the return value is not captured in this notebook.
# NOTE(review): a later cell reads `subplant_crosswalk_{year}.csv` from the outputs
# folder — presumably this call is what writes that file; confirm in `data_cleaning`.
data_cleaning.identify_subplants(year, number_of_years)

    loading CEMS ids
    identifying unique subplants


In [35]:
# NOTE(review): among the cells visible in this notebook, `primary_fuel_table` is
# first assigned by the `data_cleaning.clean_eia923` cell further down. This cell
# was executed (In [35]) after that one (In [5]), so on a fresh
# Restart Kernel -> Run All it is expected to raise NameError here.
# Move this preview below the `clean_eia923` cell.
primary_fuel_table

Unnamed: 0,plant_id_eia,generator_id,subplant_id,energy_source_code,plant_primary_fuel,subplant_primary_fuel
0,1,1,0,DFO,WND,DFO
1,1,2,1,DFO,WND,DFO
2,1,3,2,DFO,WND,DFO
3,1,5,3,DFO,WND,DFO
4,1,WT1,4,WND,WND,WND
...,...,...,...,...,...,...
23716,62620,SD,0,WND,WND,WND
23717,62562,HILO,1,WND,WND,WND
23718,62562,HILO2,2,WND,WND,WND
23719,63258,AURWP,0,WND,WND,WND


In [5]:
# [x] test eia923 cleaning
# `clean_eia923` returns three objects, unpacked here in the order it returns them.
eia923_allocated, primary_fuel_table, subplant_emission_factors = (
    data_cleaning.clean_eia923(year, False)
)

# Add primary fuel data to each generator: left-join on the plant / subplant /
# generator keys. `validate="m:1"` makes pandas raise if those keys are not
# unique in `primary_fuel_table`.
eia923_allocated = pd.merge(
    eia923_allocated,
    primary_fuel_table,
    how="left",
    on=["plant_id_eia", "subplant_id", "generator_id"],
    validate="m:1",
)

    Checking that there are no missing energy source codes associated with non-zero fuel consumption...  OK
    Checking that fuel and emissions values are positive...  OK
 
Missing factors for FC prime movers are currently expected
      prime_mover_code energy_source_code boiler_bottom_type boiler_firing_type
20404               FC                LFG                NaN                NaN
13250               FC                 NG                NaN                NaN
19929               FC                OBG                NaN                NaN
 
 
Missing factors for FC prime movers are currently expected
      prime_mover_code energy_source_code boiler_bottom_type boiler_firing_type
20404               FC                LFG                NaN                NaN
13250               FC                 NG                NaN                NaN
19929               FC                OBG                NaN                NaN
 
 
Missing factors for FC prime movers are currently expected
 

In [30]:
from column_checks import get_dtypes, apply_dtypes

In [7]:
# Load the CEMS data for `year` and run it through the initial cleaning steps.
# The steps run in the same order as before; each one receives the previous
# step's result as its first positional argument.
cems = (
    load_data.load_cems_data(year)
    # remove non-grid connected plants (and any plants in the listed states)
    .pipe(
        data_cleaning.remove_plants,
        non_grid_connected=True,
        remove_states=["PR"],
        steam_only_plants=False,
        distribution_connected_plants=False,
    )
    # manually remove steam-only units
    .pipe(data_cleaning.manually_remove_steam_units)
    # add a report date
    .pipe(load_data.add_report_date)
    # remove data for any unit-months where there are incomplete data reported
    # this is generally when there is a single observation reported for an entire month
    .pipe(data_cleaning.remove_incomplete_unit_months)
)

# TODO: identify and remove any hourly values that appear to be outliers

# add subplant id
# Step 1: read this year's subplant crosswalk from the outputs folder.
subplant_crosswalk = pd.read_csv(
    outputs_folder(f"{year}/subplant_crosswalk_{year}.csv"),
    dtype=get_dtypes(),
)
# Step 2: keep only the unit -> subplant mapping, without repeated rows and
# without rows that have no EPA unit id (they cannot be matched to CEMS units).
subplant_crosswalk = (
    subplant_crosswalk[["plant_id_eia", "emissions_unit_id_epa", "subplant_id"]]
    .drop_duplicates()
    .dropna(subset="emissions_unit_id_epa")
)
# Step 3: left-join onto the hourly data. `validate="m:1"` makes pandas raise if a
# plant/unit key appears more than once in the crosswalk.
cems = pd.merge(
    cems,
    subplant_crosswalk,
    how="left",
    on=["plant_id_eia", "emissions_unit_id_epa"],
    validate="m:1",
)
# Last expression of the cell, so whatever the check returns is displayed.
validation.test_for_missing_subplant_id(cems)

    Removing 46 plants that are not grid-connected
    Removing 0 plants located in the following states: ['PR']
    Removing 3 units that only produce steam and do not report to EIA
    Removing 273 unit-months with incomplete hourly data
    Checking that all data has an associated `subplant_id`...  OK


Unnamed: 0,plant_id_eia,emissions_unit_id_epa,datetime_utc,operating_time_hours,gross_generation_mwh,steam_load_1000_lb,fuel_consumed_mmbtu,co2_mass_lb,nox_mass_lb,so2_mass_lb,plant_id_epa,co2_mass_measurement_code,nox_mass_measurement_code,so2_mass_measurement_code,report_date,subplant_id


In [25]:
# add a fuel type to each observation
# `cems` is rebound to the returned object, so this cell is not safe to re-run on
# its own unless the helper is idempotent (not verifiable from this notebook).
# `primary_fuel_table` is the table returned by the `clean_eia923` cell.
cems = data_cleaning.assign_fuel_type_to_cems(cems, year, primary_fuel_table)


In [26]:
# Run the energy-source-code check on `cems`, then display each distinct
# plant / unit pair present in the frame it returns (empty in the recorded run).
test = validation.test_for_missing_energy_source_code(cems)
test.loc[:, ["plant_id_eia", "emissions_unit_id_epa"]].drop_duplicates()

    Checking that there are no missing energy source codes associated with non-zero fuel consumption...  OK


Unnamed: 0,plant_id_eia,emissions_unit_id_epa


In [29]:
# fill in missing hourly emissions data using the fuel type and heat input
# `subplant_emission_factors` is the third object returned by the `clean_eia923`
# cell; `cems` is rebound to the result of the call.
cems = emissions.fill_cems_missing_co2(cems, year, subplant_emission_factors)

In [31]:
# calculate ch4 and n2o emissions from fuel consumption (include_co2 is False here)
cems = emissions.calculate_ghg_emissions_from_fuel_consumption(
    df=cems,
    year=year,
    include_co2=False,
    include_ch4=True,
    include_n2o=True,
)

# Drop observations for unit-months in which zero operation is reported for the
# entire month. This data could be considered accurately reported, but it is
# removed so that it can be double checked against the eia data.
# TODO: check if any of these observations are from geothermal generators
cems = data_cleaning.remove_cems_with_zero_monthly_data(cems)

# Sanity check on the cleaned frame; its return value is not used.
validation.test_for_negative_values(cems)

# Apply the project's column dtypes to the cleaned frame.
cems = apply_dtypes(cems)

    Removing 7286205 observations from cems for unit-months where no data reported
    Checking that fuel and emissions values are positive...  OK


In [32]:
# calculate biomass-adjusted emissions while cems data is at the unit level
# (this runs before the later cell that aggregates `cems` from unit to subplant)
cems = emissions.adjust_emissions_for_biomass(cems)

In [33]:
# Build the plant attributes table from the CEMS data, the allocated EIA-923 data
# and the primary fuel table. The result is passed to the gross-to-net generation
# conversion in a later cell.
plant_attributes = data_cleaning.create_plant_attributes_table(
    cems, eia923_allocated, year, primary_fuel_table
)

In [39]:
# 6. Crosswalk CEMS and EIA data
####################################################################################
# `eia923_allocated` is rebound to the result; both the EIA-923 data and the
# unit-level `cems` frame are passed in, so this cell must run before `cems` is
# aggregated to the subplant level in the next step.
print("6. Identifying source for hourly data")
eia923_allocated = data_cleaning.identify_hourly_data_source(
    eia923_allocated, cems, year
)


6. Identifying source for hourly data


In [40]:
print("7. Aggregating CEMS data from unit to subplant")
# aggregate cems data to subplant level
# NOTE(review): `cems` is rebound in place, so re-running this cell feeds already
# aggregated data back into the helper — re-run from the CEMS loading cell instead.
cems = data_cleaning.aggregate_cems_to_subplant(cems)

7. Aggregating CEMS data from unit to subplant


In [41]:
# Step between "7." and "9." of the printed progress messages: shape partial CEMS data.

# shape partial CEMS plant data
# This must run before the subplant shaping below, which rebinds `cems`.
partial_cems_plant = impute_hourly_profiles.shape_partial_cems_plants(
    cems, eia923_allocated
)
# Check the plant-level result for unique datetimes per plant / subplant key.
validation.validate_unique_datetimes(
    df=partial_cems_plant,
    df_name="partial_cems_plant",
    keys=["plant_id_eia", "subplant_id"],
)

# shape partial CEMS subplant data
# The helper returns a pair: the updated `cems` frame and the shaped subplant data.
cems, partial_cems_subplant = impute_hourly_profiles.shape_partial_cems_subplants(
    cems, eia923_allocated
)

# Same uniqueness check for the subplant-level result.
validation.validate_unique_datetimes(
    df=partial_cems_subplant,
    df_name="partial_cems_subplant",
    keys=["plant_id_eia", "subplant_id"],
)

    Checking that shaped hourly data matches monthly totals...  OK
    Checking that shaped hourly data matches monthly totals...  OK


In [42]:
print("9. Converting CEMS gross generation to net generation")
# Returns the updated `cems` frame plus `gtn_conversions`, which is not used again
# in the cells visible here. The call also prints a comparison of calculated net
# generation against EIA-923 (see the recorded output for this cell).
cems, gtn_conversions = gross_to_net_generation.convert_gross_to_net_generation(
    cems, eia923_allocated, plant_attributes, year
)

9. Converting CEMS gross generation to net generation
    Checking that calculated net generation matches reported net generation in EIA-923...   
              net_generation_mwh_eia  net_generation_mwh_calc  pct_error
plant_id_eia                                                            
1391                       3007932.0              3154707.385   0.048796


In [43]:
print("10. Adjusting CEMS emissions for CHP")
# Adjust the CEMS data for CHP using the allocated EIA-923 data.
cems = data_cleaning.adjust_cems_for_chp(cems, eia923_allocated)

# Compute CO2e mass with a 100-year GWP horizon and the AR5 climate-carbon
# feedback option switched on.
cems = emissions.calculate_co2e_mass(
    cems,
    year,
    gwp_horizon=100,
    ar5_climate_carbon_feedback=True,
)

# Sanity checks on the adjusted data: the emissions adjustments, then unique
# datetimes per plant / subplant key.
validation.test_emissions_adjustments(cems)
validation.validate_unique_datetimes(
    df=cems,
    df_name="cems",
    keys=["plant_id_eia", "subplant_id"],
)


10. Adjusting CEMS emissions for CHP
    Checking that total fuel consumed >= fuel consumed for electricity...  OK
    Checking that adjusted emission values are less than total emissions...  OK
