In [None]:
# import packages
import pandas as pd
import numpy as np

%reload_ext autoreload
%autoreload 2

# Tell Python where to look for modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import data_cleaning
import emissions
import output_data
from column_checks import get_dtypes
from filepaths import *

# Data year to process; passed as an argument to the data_cleaning calls below.
year = 2020
# Year-specific prefix string ("2020/" for the year above). It is not referenced
# in the cells visible here — presumably used to build year-specific file paths
# elsewhere; verify before removing.
path_prefix = f"{year}/"

In [None]:
# Step 3: clean the monthly EIA-923 generation and fuel data
####################################################################################
print("3. Cleaning EIA-923 data")
# NOTE(review): the second positional argument (False) is an unnamed flag whose
# meaning is not visible from this notebook — consider passing it by keyword.
eia923_allocated, primary_fuel_table = data_cleaning.clean_eia923(year, False)
# Attach the primary fuel record to every generator row. validate="m:1" makes
# pandas raise if primary_fuel_table has duplicate (plant, generator) keys.
eia923_allocated = pd.merge(
    eia923_allocated,
    primary_fuel_table,
    on=["plant_id_eia", "generator_id"],
    how="left",
    validate="m:1",
)

# Step 4: clean the hourly CEMS data
####################################################################################
print("4. Cleaning CEMS data")
cems = data_cleaning.clean_cems(year, False)

# Apply the biomass adjustment now, while the CEMS data is still at the unit level.
cems = emissions.adjust_emissions_for_biomass(cems)

# Step 5: build the plant attributes table used to aid aggregation of CEMS and EIA data
####################################################################################
print("5. Loading plant static attributes")
plant_attributes = data_cleaning.create_plant_attributes_table(
    cems, eia923_allocated, year, primary_fuel_table
)


In [None]:
# Work on a copy so the allocated EIA-923 frame itself is left untouched.
all_data = eia923_allocated.copy()

# Temporary 0/1 flag: 1 only when every column listed below is non-null.
# NOTE(review): the previous comment described this as "any data was reported
# in 923", but the test is .all(axis=1) — a row missing just one of these values
# is flagged 0 and can be dropped below. Confirm whether .any() was intended.
columns_to_test = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
]
all_data["reported_eia923"] = np.where(
    all_data[columns_to_test].notnull().all(axis=1), 1, 0
)

# Work out which CEMS data only represents part of a subplant.
cems_status = data_cleaning.identify_partial_cems_subplants(year, cems, eia923_allocated)

# Join the CEMS status onto each generator-month; this is expected to supply the
# hourly_data_source column used below. validate="m:1" makes pandas raise if
# cems_status has duplicate (plant, subplant, month) keys.
all_data = pd.merge(
    all_data,
    cems_status,
    on=["plant_id_eia", "subplant_id", "report_date"],
    how="left",
    validate="m:1",
)

# Rows without a CEMS match have no source yet: label them as EIA.
all_data["hourly_data_source"] = all_data["hourly_data_source"].fillna("eia")

# Keep a generator-month if it is flagged as reported in EIA-923 or its hourly
# source is something other than "eia" (i.e. drop rows with data in neither
# source), then discard the temporary flag column.
all_data = all_data.loc[
    (all_data["reported_eia923"] != 0) | (all_data["hourly_data_source"] != "eia")
].drop(columns=["reported_eia923"])



In [None]:
# Inspect the generator-months whose hourly data source is "partial_cems_subplant".
all_data.loc[all_data["hourly_data_source"].eq("partial_cems_subplant")]

In [None]:
# identify the partial cems plants
# NOTE(review): all_data is overwritten with a function of itself, so re-running
# this cell feeds the already-processed frame back in. Whether the helper is
# idempotent is not visible from here — re-run the previous cells first if in doubt.
all_data = data_cleaning.identify_partial_cems_plants(all_data)

In [None]:
# Inspect the generator-months whose hourly data source is "partial_cems_plant".
all_data.loc[all_data["hourly_data_source"].eq("partial_cems_plant")]

In [None]:
# Spot-check all rows for a single plant (plant_id_eia 60926).
# NOTE(review): the reason this particular plant was chosen is not recorded here.
all_data.loc[all_data["plant_id_eia"].eq(60926)]