In [None]:
# import packages
import pandas as pd

%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules.
import sys

sys.path.append("../../../open-grid-emissions/")

# import local modules
import src.validation as validation

from src.column_checks import get_dtypes

# data year whose pipeline outputs are validated in this notebook
year = 2020

# optional label placed in front of the year to form the outputs sub-folder name;
# leave empty to use the folder named after the year alone
folder_label = ""
path_prefix = f"{folder_label}{year}"

# Validate Cleaned EIA-923 Data
Notes:
- When net generation is positive but no fuel consumption is reported, this could be due to several reasons:
    - The generator uses a clean fuel, like WAT (water), and reports 0 mmbtu per physical unit of fuel consumed
    - The generator is part of a combined cycle unit and all of the fuel is reported on the other cycle
- Records where all data is zero could just mean that the generator didn't operate in that month

In [None]:
# location of the allocated EIA-923 output for the configured year / prefix
eia923_allocated_path = f"../data/outputs/{path_prefix}/eia923_allocated_{year}.csv"

# load with the project's column dtypes and parse report_date as a datetime
eia923_allocated = pd.read_csv(
    eia923_allocated_path, dtype=get_dtypes(), parse_dates=["report_date"]
)

In [None]:
# Run the validation checks on the allocated EIA-923 data.
# Every result is stored in its own variable so it can be inspected in a later cell.

# fuel consumption and co2 emission columns
fuel_and_co2_columns = [
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
    "co2_mass_lb_for_electricity",
    "co2_mass_lb_adjusted",
]
# net generation followed by the fuel and co2 columns
generation_fuel_and_co2_columns = ["net_generation_mwh", *fuel_and_co2_columns]

# fuel consumption and co2 emissions should be positive
# (list(...) hands each helper its own fresh list of column names)
negative_test = validation.test_for_negative_values(
    eia923_allocated, list(fuel_and_co2_columns)
)

# if net generation is positive, fuel consumption should be non zero
missing_fuel_test = validation.test_for_missing_fuel(
    eia923_allocated,
    "net_generation_mwh",
)

# fuel consumed for electricity should be less than fuel consumed
chp_allocation_test = validation.test_chp_allocation(eia923_allocated)

# check for missing co2 data
missing_co2_test = validation.test_for_missing_co2(eia923_allocated)

# check for generators with no data
missing_data_test = validation.test_for_missing_data(
    eia923_allocated, list(generation_fuel_and_co2_columns)
)

# check for generators with all data = 0
zero_data_test = validation.test_for_zero_data(
    eia923_allocated, list(generation_fuel_and_co2_columns)
)

# check for missing energy source code
missing_esc_test = validation.test_for_missing_energy_source_code(eia923_allocated)

# check for missing and incorrect prime movers (two results are returned)
(
    incorrect_pm_test,
    missing_pm_test,
) = validation.test_for_missing_incorrect_prime_movers(eia923_allocated, year)

# check for missing subplant ids
eia_missing_subplant_test = validation.test_for_missing_subplant_id(eia923_allocated)

In [None]:
# display the result returned by validation.test_for_missing_data in the previous cell
missing_data_test

In [None]:
# check heat rates: run the outlier heat rate test on the allocated EIA-923 data
# and keep the returned result for inspection
heat_rate_test = validation.test_for_outlier_heat_rates(eia923_allocated)

In [None]:
# what percent of emissions is reported in CEMS vs EIA
# NOTE: This does not include emissions only reported by CEMS, so the % may be higher
# The values shown are fractions of the column total (0-1), rounded to 3 decimals.
share_columns = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "co2_mass_lb",
    "co2_mass_lb_for_electricity",
]

# aggregate once per hourly data source and reuse the result for both the
# numerator and the denominator instead of running the same groupby twice
totals_by_source = eia923_allocated.groupby("hourly_data_source")[share_columns].sum()

# dividing by the column-wise grand total gives each source's share of every column
(totals_by_source / totals_by_source.sum(axis=0)).round(3)

### Run validation tests on cleaned CEMS data

In [None]:
# location of the subplant-level CEMS output for the configured year / prefix
cems_path = f"../data/outputs/{path_prefix}/cems_subplant_{year}.csv"

# load with the project's column dtypes and parse report_date as a datetime
cems = pd.read_csv(cems_path, dtype=get_dtypes(), parse_dates=["report_date"])

In [None]:
# Run the validation checks on the cleaned CEMS data.
# Every result is stored in its own variable so it can be inspected afterwards.

# fuel consumption and co2 emissions should be positive
# NOTE(review): the EIA-923 call to this helper earlier in the notebook also passes a
# list of columns, while only the dataframe is passed here - confirm that the second
# argument is optional in src.validation
cems_negative_test = validation.test_for_negative_values(cems)

# if generation is positive, fuel consumption should be non zero
# (for CEMS the generation column passed to the helper is gross_generation_mwh)
cems_missing_fuel_test = validation.test_for_missing_fuel(cems, "gross_generation_mwh")

# fuel consumed for electricity should be less than fuel consumed
cems_chp_allocation_test = validation.test_chp_allocation(cems)

# check for missing co2 data
cems_missing_co2_test = validation.test_for_missing_co2(cems)

# check for missing energy source code
# NOTE(review): this check is disabled for CEMS; the reason is not recorded here
# cems_missing_esc_test = validation.test_for_missing_energy_source_code(cems)

# test to make sure that there is a complete subplant mapping
cems_missing_subplant_test = validation.test_for_missing_subplant_id(cems)

# test to see if there are any net generation values greater than gross generation
gtn_test = validation.test_gtn_results(cems)

# Make sure that each subplant has been identified from a unique source