In [1]:
# import packages
import pandas as pd
import numpy as np
import os
import plotly.express as px



# import packages
import argparse
import shutil

%reload_ext autoreload
%autoreload 2

# import local modules
import sys
sys.path.append('../../../open-grid-emissions/src/')
# import local modules
# # # Tell python where to look for modules.
import download_data
import load_data
import data_cleaning
import emissions
import gross_to_net_generation
import impute_hourly_profiles
import eia930
import validation
import output_data
import consumed
from filepaths import downloads_folder, outputs_folder, results_folder



In [None]:
year = 2019
number_of_years = 2

In [None]:
# 3. Clean EIA-923 Generation and Fuel Data at the Monthly Level
####################################################################################
print("3. Cleaning EIA-923 data")
(
    eia923_allocated,
    primary_fuel_table,
    subplant_emission_factors,
) = data_cleaning.clean_eia923(year, False)
# Add primary fuel data to each generator
eia923_allocated = eia923_allocated.merge(
    primary_fuel_table,
    how="left",
    on=["plant_id_eia", "subplant_id", "generator_id"],
    validate="m:1",
)

# 4. Clean Hourly Data from CEMS
####################################################################################
print("4. Cleaning CEMS data")
cems = data_cleaning.clean_cems(
    year, False, primary_fuel_table, subplant_emission_factors
)

In [None]:
cems.to_csv(outputs_folder(f"{year}/cems_cleaned.csv"), index=False)

In [3]:
year = 2021
cems = pd.read_csv(outputs_folder(f"{year}/cems_cleaned_{year}.csv"))


  cems = pd.read_csv(outputs_folder(f"{year}/cems_cleaned_{year}.csv"))


In [5]:
px.line(cems[cems["plant_id_eia"] == 57865], x="datetime_utc", y="gross_generation_mwh", color="emissions_unit_id_epa")

In [None]:
start_year = year - (number_of_years - 1)
end_year = year

# load 5 years of monthly data from CEMS
print("    loading CEMS ids")
cems_ids = load_data.load_cems_ids(start_year, end_year)

In [None]:
import pudl.analysis.epacamd_eia as epacamd_eia_crosswalk

In [None]:
# load the crosswalk and filter it by the data that actually exists in cems
crosswalk = load_data.load_epa_eia_crosswalk(end_year)

# filter the crosswalk to drop any units that don't exist in CEMS
filtered_crosswalk = epacamd_eia_crosswalk.filter_crosswalk(crosswalk, cems_ids)

# use graph analysis to identify subplants
crosswalk_with_subplant_ids = epacamd_eia_crosswalk.make_subplant_ids(
    filtered_crosswalk
)

In [None]:
crosswalk_with_subplant_ids

In [None]:
# [x] test subplant assignment
data_cleaning.identify_subplants(year, number_of_years)

In [None]:
primary_fuel_table

In [None]:
# [x] test eia923 cleaning
(
    eia923_allocated,
    primary_fuel_table,
    subplant_emission_factors,
) = data_cleaning.clean_eia923(year, False)
# Add primary fuel data to each generator
eia923_allocated = eia923_allocated.merge(
    primary_fuel_table,
    how="left",
    on=["plant_id_eia", "subplant_id", "generator_id"],
    validate="m:1",
)

In [None]:
from column_checks import get_dtypes, apply_dtypes

In [None]:
# load the CEMS data
cems = load_data.load_cems_data(year)

# remove non-grid connected plants
cems = data_cleaning.remove_plants(
    cems,
    non_grid_connected=True,
    remove_states=["PR"],
    steam_only_plants=False,
    distribution_connected_plants=False,
)

# manually remove steam-only units
cems = data_cleaning.manually_remove_steam_units(cems)

# add a report date
cems = load_data.add_report_date(cems)

# remove data for any unit-months where there are incomplete data reported
# this is generally when there is a single observation reported for an entire month
cems = data_cleaning.remove_incomplete_unit_months(cems)

# TODO: identify and remove any hourly values that appear to be outliers

# add subplant id
subplant_crosswalk = (
    pd.read_csv(
        outputs_folder(f"{year}/subplant_crosswalk_{year}.csv"),
        dtype=get_dtypes(),
    )[["plant_id_eia", "emissions_unit_id_epa", "subplant_id"]]
    .drop_duplicates()
    .dropna(subset="emissions_unit_id_epa")
)
cems = cems.merge(
    subplant_crosswalk,
    how="left",
    on=["plant_id_eia", "emissions_unit_id_epa"],
    validate="m:1",
)
validation.test_for_missing_subplant_id(cems)

# add a fuel type to each observation
cems = data_cleaning.assign_fuel_type_to_cems(cems, year, primary_fuel_table)

test = validation.test_for_missing_energy_source_code(cems)
test[["plant_id_eia", "emissions_unit_id_epa"]].drop_duplicates()

In [None]:
sc = pd.read_csv(outputs_folder(f"{year}/subplant_crosswalk_{year}.csv"))
sc

In [None]:
crosswalk = load_data.load_epa_eia_crosswalk_from_raw(year)[
        ["plant_id_eia", "emissions_unit_id_epa", "energy_source_code_epa"]
    ].drop_duplicates(subset=["plant_id_eia", "emissions_unit_id_epa"])

In [None]:
crosswalk[crosswalk["plant_id_eia"] == 59073]

In [None]:
# fill in missing hourly emissions data using the fuel type and heat input
cems = emissions.fill_cems_missing_co2(cems, year, subplant_emission_factors)

In [None]:
# calculate ch4 and n2o emissions
cems = emissions.calculate_ghg_emissions_from_fuel_consumption(
    df=cems, year=year, include_co2=False, include_ch4=True, include_n2o=True
)

# remove any observations from cems where zero operation is reported for an entire month
# although this data could be considered to be accurately reported, let's remove it so that we can double check against the eia data
# TODO: check if any of these observations are from geothermal generators
cems = data_cleaning.remove_cems_with_zero_monthly_data(cems)

validation.test_for_negative_values(cems)

cems = apply_dtypes(cems)

In [None]:
# calculate biomass-adjusted emissions while cems data is at the unit level
cems = emissions.adjust_emissions_for_biomass(cems)

In [None]:
plant_attributes = data_cleaning.create_plant_attributes_table(
    cems, eia923_allocated, year, primary_fuel_table
)

In [None]:
# 6. Crosswalk CEMS and EIA data
####################################################################################
print("6. Identifying source for hourly data")
eia923_allocated = data_cleaning.identify_hourly_data_source(
    eia923_allocated, cems, year
)


In [None]:
print("7. Aggregating CEMS data from unit to subplant")
# aggregate cems data to subplant level
cems = data_cleaning.aggregate_cems_to_subplant(cems)

In [None]:
partial_cems_plant = impute_hourly_profiles.shape_partial_cems_plants(
    cems, eia923_allocated
)
validation.validate_unique_datetimes(
    df=partial_cems_plant,
    df_name="partial_cems_plant",
    keys=["plant_id_eia", "subplant_id"],
)
# shape partial CEMS subplant data
(
    cems,
    partial_cems_subplant,
) = impute_hourly_profiles.shape_partial_cems_subplants(cems, eia923_allocated)

validation.validate_unique_datetimes(
    df=partial_cems_subplant,
    df_name="partial_cems_subplant",
    keys=["plant_id_eia", "subplant_id"],
)

In [None]:
print("9. Converting CEMS gross generation to net generation")
cems, gtn_conversions = gross_to_net_generation.convert_gross_to_net_generation(
    cems, eia923_allocated, plant_attributes, year
)

In [None]:
print("10. Adjusting CEMS emissions for CHP")
cems = data_cleaning.adjust_cems_for_chp(cems, eia923_allocated)
cems = emissions.calculate_co2e_mass(
    cems, year, gwp_horizon=100, ar5_climate_carbon_feedback=True
)
validation.test_emissions_adjustments(cems)
validation.validate_unique_datetimes(
    df=cems,
    df_name="cems",
    keys=["plant_id_eia", "subplant_id"],
)


In [None]:
import cvxpy

In [None]:
cvxpy.settings.SOLVERS

In [None]:
cvxpy.installed_solvers()

In [None]:
cvxpy

In [None]:
%reload_ext autoreload
%autoreload 2

# import local modules
import sys
sys.path.append('../../../open-grid-emissions/src/')
# import local modules
# # # Tell python where to look for modules.
import eia930


In [None]:
year = 2020
path_prefix = f"{year}/"
eia930.clean_930(year, small=False, path_prefix=path_prefix)


In [None]:
year = 2019
path_prefix = f"{year}/"

clean_930_file = outputs_folder(f"{path_prefix}/eia930/eia930_elec.csv")

hourly_consumed_calc = consumed.HourlyConsumed(
    clean_930_file,
    path_prefix,
    year,
    small=False,
    skip_outputs=True,
)
hourly_consumed_calc.run()