In [None]:
# import packages
import pandas as pd

# reload imported modules automatically before each cell is executed
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules.
# NOTE: this path is relative, so it depends on the directory the notebook runs from.
import sys
sys.path.append('../../../open-grid-emissions/src/')

# NOTE(review): the modules below are presumably found through the sys.path
# entry appended above. The two star imports hide where names used later in
# the notebook (e.g. outputs_folder, clean_eia923, remove_plants) come from;
# confirm their origin before replacing the star imports with explicit ones.
from column_checks import get_dtypes
from filepaths import *
import load_data
from data_cleaning import *
import validation
import emissions

# data year used for the file paths and pipeline calls in the cells below
year = 2021

## What does the cleaned CEMS data look like?

In [None]:
# load the cleaned CEMS output from csv
# `year` comes from the setup cell; re-assigning it here would silently
# override any change made there.
# path_prefix already ends with "/", so it is joined to the file name without
# a second separator (the previous f-string produced "2021//cems_cleaned_...").
path_prefix = f"{year}/"

cems = pd.read_csv(
    outputs_folder(f"{path_prefix}cems_cleaned_{year}.csv"),
    dtype=get_dtypes(),
)

In [None]:
# cleaned CEMS: total of every numeric column per emissions unit, plant_id_eia 3 only
(
    cems.loc[cems["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

In [None]:
# cleaned CEMS: column totals of the numeric fields for plant_id_eia 3
cems.loc[cems["plant_id_eia"] == 3].sum(numeric_only=True)

In [None]:
# load the allocated EIA-923 output; path_prefix already ends with "/", so no
# extra separator is inserted (the previous f-string produced "2021//...")
eia923_allocated = pd.read_csv(
    outputs_folder(f"{path_prefix}eia923_allocated_{year}.csv"),
    dtype=get_dtypes(),
)

# first 20 rows of the numeric totals for plant_id_eia 3, grouped by subplant
# and report_date
(
    eia923_allocated[eia923_allocated["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "subplant_id", "report_date"])
    .sum(numeric_only=True)
    .head(20)
)

## Test where data is being dropped

In [None]:
# run the EIA-923 cleaning step and keep its three outputs; this replaces the
# eia923_allocated frame that was read from csv above
eia923_allocated, primary_fuel_table, subplant_emission_factors = clean_eia923(
    year, False
)

In [None]:
# load the raw CEMS data to check whether it matches the cleaned totals above
cems_raw = load_data.load_cems_data(
    year,
)

In [None]:
# raw CEMS as loaded: per-unit totals of the numeric columns for plant_id_eia 3
(
    cems_raw.loc[cems_raw["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

In [None]:
# `barry` is only created in the "Investigate emissions filling" section, so
# referencing it here fails on a fresh kernel. Total the raw CEMS rows for
# plant_id_eia 3 directly, mirroring the check made on the cleaned data.
cems_raw[cems_raw["plant_id_eia"] == 3].sum(numeric_only=True)

In [None]:
# remove non-grid connected plants and the states listed in remove_states;
# the steam-only and distribution-connected options are switched off
cems_raw = cems_raw.pipe(
    remove_plants,
    non_grid_connected=True,
    remove_states=["PR"],
    steam_only_plants=False,
    distribution_connected_plants=False,
)

In [None]:
# plant_id_eia 3 per-unit totals after remove_plants
(
    cems_raw.loc[cems_raw["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

In [None]:
# apply the manual removal of steam-only units
cems_raw = cems_raw.pipe(manually_remove_steam_units)

In [None]:
# plant_id_eia 3 per-unit totals after manually_remove_steam_units
(
    cems_raw.loc[cems_raw["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

In [None]:
# attach a report date to the CEMS observations
cems_raw = cems_raw.pipe(load_data.add_report_date)

In [None]:
# plant_id_eia 3 per-unit totals after add_report_date
(
    cems_raw.loc[cems_raw["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

In [None]:
# drop unit-months with incomplete reported data
# (generally a month in which only a single observation was reported)
cems_raw = cems_raw.pipe(remove_incomplete_unit_months)

In [None]:
# plant_id_eia 3 per-unit totals after remove_incomplete_unit_months
(
    cems_raw.loc[cems_raw["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

In [None]:
# build the plant/unit -> subplant lookup from the saved crosswalk: keep the
# three key columns, de-duplicate, and drop rows without an EPA unit id
subplant_crosswalk = pd.read_csv(
    outputs_folder(f"{year}/subplant_crosswalk_{year}.csv"),
    dtype=get_dtypes(),
)
subplant_crosswalk = (
    subplant_crosswalk.loc[:, ["plant_id_eia", "emissions_unit_id_epa", "subplant_id"]]
    .drop_duplicates()
    .dropna(subset="emissions_unit_id_epa")
)

# left-join the subplant id onto every CEMS row; validate="m:1" makes pandas
# raise if the lookup has more than one row per plant/unit key
cems_raw = pd.merge(
    cems_raw,
    subplant_crosswalk,
    how="left",
    on=["plant_id_eia", "emissions_unit_id_epa"],
    validate="m:1",
)
validation.test_for_missing_subplant_id(cems_raw)

In [None]:
# plant_id_eia 3 per-unit totals after the subplant_id merge
(
    cems_raw.loc[cems_raw["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

In [None]:
# assign a fuel type to each CEMS observation using the primary fuel table
cems_raw = cems_raw.pipe(assign_fuel_type_to_cems, year, primary_fuel_table)

In [None]:
# plant_id_eia 3 per-unit totals after assign_fuel_type_to_cems
(
    cems_raw.loc[cems_raw["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

In [None]:
# run the validation check for missing energy source codes (per the function
# name; presumably it reports observations without a fuel type — confirm).
# The fill of missing hourly emissions data using the fuel type and heat input
# is done by emissions.fill_cems_missing_co2 in a later cell, not here.
validation.test_for_missing_energy_source_code(cems_raw)

In [None]:
# plant_id_eia 3 per-unit totals after the energy source code validation
(
    cems_raw.loc[cems_raw["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

In [None]:
# snapshot the CEMS data before any CO2 filling so it can be tested later
cems_test = cems_raw.copy(deep=True)

In [None]:
# Run the CO2 fill on a copy so that `cems_test` stays the untouched pre-fill
# snapshot that the "Investigate emissions filling" section starts from.
# NOTE(review): assumes fill_cems_missing_co2 may modify its input in place —
# confirm against the implementation in emissions.py.
cems_raw = emissions.fill_cems_missing_co2(
    cems_test.copy(), year, subplant_emission_factors
)

In [None]:
# plant_id_eia 3 per-unit totals after fill_cems_missing_co2
(
    cems_raw.loc[cems_raw["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

In [None]:
# calculate CH4 and N2O from fuel consumption; CO2 is excluded here
# (include_co2=False)
cems_raw = emissions.calculate_ghg_emissions_from_fuel_consumption(
    df=cems_raw,
    year=year,
    include_co2=False,
    include_ch4=True,
    include_n2o=True,
)

In [None]:
# plant_id_eia 3 per-unit totals after the CH4 / N2O calculation
(
    cems_raw.loc[cems_raw["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

In [None]:
# apply remove_cems_with_zero_monthly_data
# NOTE(review): presumably drops unit-months whose reported totals are all zero — confirm
cems_raw = cems_raw.pipe(remove_cems_with_zero_monthly_data)

In [None]:
# plant_id_eia 3 per-unit totals at the end of the step-by-step pipeline
(
    cems_raw.loc[cems_raw["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

## Investigate emissions filling

In [None]:
# pre-fill snapshot: per-unit totals of the numeric columns for plant_id_eia 3
(
    cems_test.loc[cems_test["plant_id_eia"] == 3]
    .groupby(["plant_id_eia", "emissions_unit_id_epa"])
    .sum(numeric_only=True)
)

In [None]:
# working copy of the saved pre-fill CEMS snapshot for the manual walk-through
# below. The plant filter is left commented out, so `barry` holds every row of
# cems_test, not only plant_id_eia 3.
barry = cems_test.copy() #[(cems_test["plant_id_eia"] == 3)]
barry

In [None]:
import numpy as np

In [None]:
# add a new categorical option to the mass measurement code
# add_categories raises a ValueError when the category already exists, so the
# guard keeps this cell safe to re-run
if "Imputed" not in barry["co2_mass_measurement_code"].cat.categories:
    barry["co2_mass_measurement_code"] = barry[
        "co2_mass_measurement_code"
    ].cat.add_categories("Imputed")

# replace all "missing" CO2 values with zero
barry["co2_mass_lb"] = barry["co2_mass_lb"].fillna(0)



In [None]:
# preview only: rows where the reported CO2 is 0 although heat input was
# reported (fuel_consumed_mmbtu > 0); nothing is modified in this cell —
# the replacement with missing values happens in the next cell
barry.loc[
    (barry["co2_mass_lb"] == 0) & (barry["fuel_consumed_mmbtu"] > 0),
]

In [None]:
# replace 0 reported CO2 values with missing values, if there was reported heat input
# (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
barry.loc[
    (barry["co2_mass_lb"] == 0) & (barry["fuel_consumed_mmbtu"] > 0),
    "co2_mass_lb",
] = np.nan

In [None]:
# collect every observation whose co2_mass_lb is currently missing
missing_co2 = barry.loc[barry["co2_mass_lb"].isna()]
missing_co2

In [None]:
# distinct plant / unit / report_date combinations that have missing CO2
unit_months_missing_co2 = missing_co2.loc[
    :, ["plant_id_eia", "emissions_unit_id_epa", "report_date"]
].drop_duplicates()
unit_months_missing_co2

In [None]:
# bring in the CO2 and fuel observations from barry for each flagged unit-month;
# validate="1:m" makes pandas raise if the left keys are not unique
unit_months_missing_co2 = pd.merge(
    unit_months_missing_co2,
    barry[
        [
            "plant_id_eia",
            "emissions_unit_id_epa",
            "report_date",
            "co2_mass_lb",
            "fuel_consumed_mmbtu",
        ]
    ],
    how="left",
    on=["plant_id_eia", "emissions_unit_id_epa", "report_date"],
    validate="1:m",
)
unit_months_missing_co2

In [None]:
# keep only the observations that report both positive fuel consumption and
# positive CO2 (rows with missing CO2 compare as False and drop out)
unit_months_missing_co2 = unit_months_missing_co2.loc[
    (unit_months_missing_co2["fuel_consumed_mmbtu"] > 0)
    & (unit_months_missing_co2["co2_mass_lb"] > 0)
]
unit_months_missing_co2

In [None]:
# total fuel consumption and CO2 per plant / unit / report_date, then derive
# the emission factor as total CO2 divided by total fuel
unit_month_efs = (
    unit_months_missing_co2.groupby(
        ["plant_id_eia", "emissions_unit_id_epa", "report_date"], dropna=False
    )
    .sum()
    .reset_index()
    .assign(
        co2_lb_per_mmbtu=lambda totals: totals["co2_mass_lb"]
        / totals["fuel_consumed_mmbtu"]
    )
)
unit_month_efs

In [None]:
# attach the unit-month emission factor to each row with missing CO2;
# the merge resets the index, so the original index is restored afterwards to
# keep the rows aligned with barry
missing_co2 = pd.merge(
    missing_co2,
    unit_month_efs[
        ["plant_id_eia", "report_date", "emissions_unit_id_epa", "co2_lb_per_mmbtu"]
    ],
    how="left",
    on=["plant_id_eia", "report_date", "emissions_unit_id_epa"],
    validate="m:1",
).set_index(missing_co2.index)
missing_co2

In [None]:
# only keep observations where there is a non-missing ef
# .copy() makes the filtered frame independent, so the column assignment below
# does not raise pandas' SettingWithCopyWarning
missing_co2 = missing_co2[missing_co2["co2_lb_per_mmbtu"].notna()].copy()

# calculate missing co2 data
missing_co2["co2_mass_lb"] = (
    missing_co2["fuel_consumed_mmbtu"] * missing_co2["co2_lb_per_mmbtu"]
)
missing_co2

In [None]:
# write the calculated CO2 back into barry; update() aligns on the index and
# modifies barry in place
barry.update(missing_co2.loc[:, ["co2_mass_lb"]])

In [None]:
# mark the rows that were just filled as "Imputed"
barry.loc[missing_co2.index, "co2_mass_measurement_code"] = "Imputed"

# re-select the observations whose CO2 is still missing after this fill
missing_co2 = barry.loc[barry["co2_mass_lb"].isna()]
missing_co2

In [None]:
# attach the subplant emission factor to the rows still missing CO2; the
# original index is restored after the merge to keep rows aligned with barry
missing_co2 = pd.merge(
    missing_co2,
    subplant_emission_factors[
        ["plant_id_eia", "report_date", "subplant_id", "co2_lb_per_mmbtu"]
    ],
    how="left",
    on=["plant_id_eia", "report_date", "subplant_id"],
    validate="m:1",
).set_index(missing_co2.index)
missing_co2

In [None]:
# only keep observations where there is a non-missing ef
# .copy() makes the filtered frame independent, so the column assignment below
# does not raise pandas' SettingWithCopyWarning
missing_co2 = missing_co2[missing_co2["co2_lb_per_mmbtu"].notna()].copy()

# calculate missing co2 data
missing_co2["co2_mass_lb"] = (
    missing_co2["fuel_consumed_mmbtu"] * missing_co2["co2_lb_per_mmbtu"]
)
missing_co2

In [None]:
# write the subplant-EF based CO2 back into barry (in place, aligned on index)
barry.update(missing_co2.loc[:, ["co2_mass_lb"]])

# mark those rows as "Imputed"
barry.loc[missing_co2.index, "co2_mass_measurement_code"] = "Imputed"

# re-select the observations whose CO2 is still missing
missing_co2 = barry.loc[barry["co2_mass_lb"].isna()]

In [None]:
# rows still missing co2_mass_lb after the unit-month and subplant EF fills
missing_co2

In [None]:
# for rows that have a successful fuel code match, move to a temporary dataframe to hold the data
# filter first and copy the result: this avoids copying the whole frame and
# yields an independent frame rather than a slice of a temporary copy
co2_to_fill = missing_co2[missing_co2["energy_source_code"].notna()].copy()
# remember the original index so it can be restored after the calculation
fill_index = co2_to_fill.index
co2_to_fill

In [None]:
# calculate CO2 only (CH4 and N2O switched off) from fuel consumption
co2_to_fill = emissions.calculate_ghg_emissions_from_fuel_consumption(
    df=co2_to_fill, year=year, include_co2=True, include_ch4=False, include_n2o=False
)
# restore the index saved before the calculation so rows line up with barry
co2_to_fill = co2_to_fill.set_index(fill_index)

co2_to_fill

In [None]:
# write the fuel-based CO2 values back into barry (in place, aligned on index)
barry.update(co2_to_fill.loc[:, ["co2_mass_lb"]])

In [None]:
# identifying columns plus the post-fill CO2 value
barry.loc[
    :, ["plant_id_eia", "emissions_unit_id_epa", "datetime_utc", "co2_mass_lb"]
]

In [None]:
# baseline: observations in the pre-fill snapshot that already had positive CO2
test_fill = cems_test.loc[
    cems_test["co2_mass_lb"] > 0,
    ["plant_id_eia", "emissions_unit_id_epa", "datetime_utc", "co2_mass_lb"],
]

# attach the post-fill value for the same plant, unit and datetime_utc
test_fill = pd.merge(
    test_fill,
    barry[["plant_id_eia", "emissions_unit_id_epa", "datetime_utc", "co2_mass_lb"]],
    how="left",
    on=["plant_id_eia", "emissions_unit_id_epa", "datetime_utc"],
    validate="1:1",
    suffixes=("_original", "_postfill"),
)

# show every row whose value differs between the snapshot and the filled data
test_fill["diff"] = (
    test_fill["co2_mass_lb_postfill"] - test_fill["co2_mass_lb_original"]
)
test_fill[test_fill["diff"] != 0]

## Why are non-missing values being removed?

In [None]:
# re-run the whole cleaning sequence in a single cell, starting from the
# EIA-923 cleaning step
eia923_allocated, primary_fuel_table, subplant_emission_factors = clean_eia923(
    year, False
)

cems = (
    # load the CEMS data
    load_data.load_cems_data(year)
    # remove non-grid connected plants
    .pipe(
        remove_plants,
        non_grid_connected=True,
        remove_states=["PR"],
        steam_only_plants=False,
        distribution_connected_plants=False,
    )
    # manually remove steam-only units
    .pipe(manually_remove_steam_units)
    # add a report date
    .pipe(load_data.add_report_date)
    # remove data for any unit-months where there are incomplete data reported
    # this is generally when there is a single observation reported for an entire month
    .pipe(remove_incomplete_unit_months)
)

# TODO: identify and remove any hourly values that appear to be outliers
# See: https://github.com/singularity-energy/open-grid-emissions/issues/50

# add subplant id: read the crosswalk, keep the key columns, de-duplicate and
# drop rows without an EPA unit id, then left-join it onto the CEMS rows
subplant_crosswalk = pd.read_csv(
    outputs_folder(f"{year}/subplant_crosswalk_{year}.csv"),
    dtype=get_dtypes(),
)
subplant_crosswalk = (
    subplant_crosswalk.loc[:, ["plant_id_eia", "emissions_unit_id_epa", "subplant_id"]]
    .drop_duplicates()
    .dropna(subset="emissions_unit_id_epa")
)
cems = pd.merge(
    cems,
    subplant_crosswalk,
    how="left",
    on=["plant_id_eia", "emissions_unit_id_epa"],
    validate="m:1",
)
validation.test_for_missing_subplant_id(cems)

# add a fuel type to each observation
cems = cems.pipe(assign_fuel_type_to_cems, year, primary_fuel_table)

# run the validation check for missing energy source codes; the fill of
# missing hourly emissions from fuel type and heat input is done in the cells
# that follow
validation.test_for_missing_energy_source_code(cems)

In [None]:
# independent copy of the prepared CEMS data for the fill test below
cems_test = cems.copy(deep=True)

In [None]:
# run the CO2 fill function on the copy and keep its result
cems_test = emissions.fill_cems_missing_co2(
    cems_test,
    year,
    subplant_emission_factors,
)