Merge pull request #367 from singularity-energy/ben/dates
Add operating and retirement dates to plant static attributes
grgmiller committed May 24, 2024
2 parents 43ff1df + c225312 commit 6f4c9e3
Showing 2 changed files with 170 additions and 16 deletions.
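
At a high level, the new plant-level dates are derived from PUDL's generator-level records: the plant operating date is the earliest generator operating date, and the plant retirement date is the latest generator retirement date. A minimal sketch of that aggregation with made-up data (an illustration only, not the OGE code itself):

import pandas as pd

# Toy generator-level records for one plant (hypothetical values)
generators = pd.DataFrame(
    {
        "plant_id_eia": [1, 1, 1],
        "generator_id": ["G1", "G2", "G3"],
        "generator_operating_date": pd.to_datetime(["1995-06-01", "2001-03-01", "2010-09-01"]),
        "generator_retirement_date": pd.to_datetime(["2015-12-31", pd.NaT, pd.NaT]),
    }
)

plant_dates = generators.groupby("plant_id_eia").agg(
    plant_operating_date=("generator_operating_date", "min"),
    # skipna=False: the plant only gets a retirement date once every generator has one
    plant_retirement_date=("generator_retirement_date", lambda x: x.max(skipna=False)),
)
# plant_operating_date -> 1995-06-01, plant_retirement_date -> NaT (still operating)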
23 changes: 17 additions & 6 deletions src/oge/column_checks.py
@@ -203,6 +203,8 @@
"city",
"plant_name_eia",
"capacity_mw",
"plant_operating_date",
"plant_retirement_date",
},
"plant_metadata": {
"plant_id_eia",
@@ -422,6 +424,7 @@ def get_dtypes():
"ch4_mass_lb_adjusted": "float64",
"ch4_mass_lb_for_electricity": "float64",
"ch4_mass_lb_for_electricity_adjusted": "float64",
"city": "str",
"co2_mass_lb": "float64",
"co2_mass_lb_adjusted": "float64",
"co2_mass_lb_for_electricity": "float64",
@@ -431,6 +434,7 @@ def get_dtypes():
"co2e_mass_lb_adjusted": "float64",
"co2e_mass_lb_for_electricity": "float64",
"co2e_mass_lb_for_electricity_adjusted": "float64",
"county": "str",
"data_availability": "category",
"distribution_flag": "bool",
"eia930_profile": "float64",
Expand All @@ -455,6 +459,8 @@ def get_dtypes():
"hourly_data_source": "category",
"hours_in_service": "float64",
"imputed_profile": "float64",
"latitude": "float64",
"longitude": "float64",
"mercury_control_id_eia": "str",
"mercury_emission_rate_lb_per_trillion_btu": "float64",
"mercury_removal_efficiency": "float64",
@@ -478,6 +484,7 @@ def get_dtypes():
"particulate_removal_efficiency_at_full_load": "float64",
"plant_id_eia": "Int32",
"plant_id_epa": "Int32",
"plant_name_eia": "str",
"plant_primary_fuel": "str",
"plant_primary_fuel_from_capacity_mw": "str",
"plant_primary_fuel_from_fuel_consumed_for_electricity_mmbtu": "str",
@@ -507,11 +514,6 @@ def get_dtypes():
"subplant_primary_fuel_from_net_generation_mwh": "str",
"timezone": "str",
"wet_dry_bottom": "str",
"latitude": "float64",
"longitude": "float64",
"county": "str",
"city": "str",
"plant_name_eia": "str",
}

return dtypes_to_use
@@ -528,7 +530,16 @@ def apply_dtypes(df: pd.DataFrame) -> pd.DataFrame:
pd.DataFrame: original data frame with type converted columns.
"""
dtypes = get_dtypes()
datetime_columns = ["datetime_utc", "datetime_local", "report_date"]
datetime_columns = [
"datetime_utc",
"datetime_local",
"report_date",
"generator_operating_date",
"generator_retirement_date",
"current_planned_generator_operating_date",
"plant_operating_date",
"plant_retirement_date",
]
cols_missing_dtypes = [
col
for col in df.columns
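
The rest of apply_dtypes is truncated in this diff. As a rough sketch (an assumption about the surrounding code, not the actual OGE implementation), a datetime_columns list like the one added above is typically used to parse date columns separately from the dtype mapping:

import pandas as pd

def apply_dtypes_sketch(df: pd.DataFrame, dtypes: dict, datetime_columns: list) -> pd.DataFrame:
    # Illustrative only: parse known date columns, then cast the remaining known columns
    for col in datetime_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col])
    known = {col: dtype for col, dtype in dtypes.items() if col in df.columns}
    return df.astype(known)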
163 changes: 153 additions & 10 deletions src/oge/helpers.py
@@ -2,7 +2,7 @@
import pandas as pd

from oge.column_checks import get_dtypes, apply_dtypes
from oge.constants import latest_validated_year
from oge.constants import earliest_data_year, latest_validated_year
from oge.filepaths import reference_table_folder, outputs_folder
import oge.load_data as load_data
from oge.logging_util import get_logger
@@ -121,10 +121,37 @@ def create_plant_attributes_table(
plant_attributes = add_plant_entity(plant_attributes)

# add nameplate capacity
plant_attributes = add_plant_nameplate_capacity(plant_attributes)
plant_attributes = add_plant_nameplate_capacity(year, plant_attributes)

# add operating and retirement dates
plant_attributes = add_plant_operating_and_retirement_dates(plant_attributes)

# convert types
plant_attributes = apply_dtypes(plant_attributes)

# change order of columns
new_column_ordering = [
"plant_id_eia",
"plant_name_eia",
"capacity_mw",
"plant_primary_fuel",
"fuel_category",
"fuel_category_eia930",
"state",
"county",
"city",
"ba_code",
"ba_code_physical",
"latitude",
"longitude",
"plant_operating_date",
"plant_retirement_date",
"distribution_flag",
"timezone",
"data_availability",
]
plant_attributes = plant_attributes[new_column_ordering]

return plant_attributes


@@ -178,12 +205,45 @@ def create_plant_ba_table(year: int) -> pd.DataFrame:
],
)

# for some earlier years, the plants data is missing BA codes.
# backfill and forwardfill to make sure that we have complete data for all years, if
# data is available for any year
for col in [
"balancing_authority_code_eia",
"utility_id_eia",
"balancing_authority_name_eia",
"transmission_distribution_owner_name",
]:
plant_ba[col] = plant_ba.groupby(["plant_id_eia"])[col].bfill()
plant_ba[col] = plant_ba.groupby(["plant_id_eia"])[col].ffill()

# some plants only have a record for years after the current year. To help ensure
# that we have complete BA codes, create a dataframe containing only those plants
# whose first record is after the current year, so that we can add these plants back
# to plant_ba after filtering
plant_ba_only_data_after_year = plant_ba[
plant_ba.groupby(["plant_id_eia"])["report_date"].transform("min").dt.year
> year
]
# only keep the oldest record
plant_ba_only_data_after_year = plant_ba_only_data_after_year[
plant_ba_only_data_after_year["report_date"]
== plant_ba_only_data_after_year.groupby(["plant_id_eia"])[
"report_date"
].transform("min")
]

# remove report dates newer than the current year
plant_ba = plant_ba[plant_ba["report_date"].dt.year <= year]

# sort the data from newest to oldest
plant_ba = plant_ba.sort_values(by=["plant_id_eia", "report_date"], ascending=False)

# add back plants that only have records after the current year
# if for some reason this adds a duplicate plant, this will be dropped in the next
# step since these records will be added to the end of the dataframe
plant_ba = pd.concat([plant_ba, plant_ba_only_data_after_year], axis=0)

# only keep the most recent row of data
plant_ba = plant_ba.drop_duplicates(subset=["plant_id_eia"], keep="first")
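
A toy illustration of the back/forward fill above (made-up records, not OGE data): a plant that reports no BA code in early years inherits it from a later year, so the code survives the filter to report dates at or before the target year.

import pandas as pd

plant_ba = pd.DataFrame(
    {
        "plant_id_eia": [10, 10, 10],
        "report_date": pd.to_datetime(["2005-01-01", "2010-01-01", "2015-01-01"]),
        "balancing_authority_code_eia": [None, None, "CISO"],
    }
)
col = "balancing_authority_code_eia"
plant_ba[col] = plant_ba.groupby("plant_id_eia")[col].bfill()
plant_ba[col] = plant_ba.groupby("plant_id_eia")[col].ffill()
# all three report years now carry "CISO", so the BA code is retained even when
# only an earlier report year is kept for the target year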

@@ -287,30 +347,113 @@ def create_plant_ba_table(year: int) -> pd.DataFrame:
return plant_ba


def add_plant_nameplate_capacity(df: pd.DataFrame) -> pd.DataFrame:
def add_plant_operating_and_retirement_dates(df: pd.DataFrame) -> pd.DataFrame:
"""Adds the operating and retirement dates of a plant to input data frame. The
operating date of a plant is taken as the earliest date among all generators'
operating date over all report dates. Likewise, the retirement date of a plant is
taken as the latest date among all generators' retirement date over all report
dates.
Note that the operating date is the date the generator began commercial operation.
The retirement date is the date of the scheduled or effected retirement of the
generator.
Args:
df (pd.DataFrame): table with a 'plant_id_eia' column.
Returns:
pd.DataFrame: original data frame with additional 'plant_operating_date' and
'plant_retirement_date' columns.
"""
generator_dates = load_data.load_pudl_table(
"denorm_generators_eia",
year=earliest_data_year,
end_year=latest_validated_year,
columns=[
"plant_id_eia",
"generator_id",
"report_date",
"generator_operating_date",
"generator_retirement_date",
],
).sort_values(by=["plant_id_eia", "generator_id", "report_date"], ascending=True)

# fill missing dates
date_columns = ["generator_operating_date", "generator_retirement_date"]

for col in date_columns:
generator_dates[col] = generator_dates.groupby(
["plant_id_eia", "generator_id"]
)[col].bfill()
generator_dates[col] = generator_dates.groupby(
["plant_id_eia", "generator_id"]
)[col].ffill()

# keep only the most recent year of data
generator_dates = generator_dates.drop_duplicates(
subset=["plant_id_eia", "generator_id"], keep="last"
)

plant_dates = (
generator_dates.groupby("plant_id_eia")[
["generator_operating_date", "generator_retirement_date"]
]
.agg(
{
"generator_operating_date": "min",
"generator_retirement_date": lambda x: x.max(skipna=False),
}
)
.rename(
columns={
"generator_operating_date": "plant_operating_date",
"generator_retirement_date": "plant_retirement_date",
}
)
)

df = df.merge(plant_dates, how="left", on=["plant_id_eia"], validate="1:1")

return df
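
The skipna=False above is the subtle part of the retirement aggregation: with pandas' default skipna=True, a plant with one retired generator and one still-operating generator (retirement date NaT) would wrongly appear retired. A toy comparison (illustrative values only):

import pandas as pd

retirements = pd.Series(pd.to_datetime(["2015-12-31", pd.NaT]))
retirements.max()              # Timestamp('2015-12-31'): plant would look fully retired
retirements.max(skipna=False)  # NaT: plant correctly treated as still operating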


def add_plant_nameplate_capacity(year: int, df: pd.DataFrame) -> pd.DataFrame:
"""Adds nameplate capacity to input data frame.
Args:
year (int): a four-digit year.
df (pd.DataFrame): table with a 'plant_id_eia' column.
Returns:
pd.DataFrame: original data frame with an additional 'capacity_mw' column.
"""
generators_capacity = load_data.load_pudl_table(
generator_capacity = load_data.load_pudl_table(
"generators_eia860",
year=earliest_data_year,
end_year=latest_validated_year,
columns=["plant_id_eia", "generator_id", "report_date", "capacity_mw"],
)
generators_capacity[
generators_capacity["report_date"] == generators_capacity["report_date"].max()
).sort_values(by=["plant_id_eia", "generator_id", "report_date"], ascending=True)

generator_capacity["capacity_mw"] = generator_capacity.groupby(
["plant_id_eia", "generator_id"]
)["capacity_mw"].bfill()
generator_capacity["capacity_mw"] = generator_capacity.groupby(
["plant_id_eia", "generator_id"]
)["capacity_mw"].ffill()

# keep only the specified year of data
generator_capacity = generator_capacity[
generator_capacity["report_date"].dt.year == year
]
plants_capacity = (
generators_capacity.groupby(["plant_id_eia"])["capacity_mw"]

plant_capacity = (
generator_capacity.groupby(["plant_id_eia"])["capacity_mw"]
.sum()
.round(2)
.reset_index()
)

df = df.merge(plants_capacity, how="left", on=["plant_id_eia"], validate="1:1")
df = df.merge(plant_capacity, how="left", on=["plant_id_eia"], validate="1:1")

return df
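
A toy example of the capacity aggregation above (made-up generator records, not OGE data): fill missing capacity within each generator, keep the requested report year, then sum and round to the plant level.

import pandas as pd

year = 2020  # illustrative
generator_capacity = pd.DataFrame(
    {
        "plant_id_eia": [1, 1, 1, 1],
        "generator_id": ["G1", "G1", "G2", "G2"],
        "report_date": pd.to_datetime(["2019-01-01", "2020-01-01"] * 2),
        "capacity_mw": [50.0, 50.0, None, 25.5],
    }
)
generator_capacity["capacity_mw"] = generator_capacity.groupby(
    ["plant_id_eia", "generator_id"]
)["capacity_mw"].bfill()
generator_capacity["capacity_mw"] = generator_capacity.groupby(
    ["plant_id_eia", "generator_id"]
)["capacity_mw"].ffill()
generator_capacity = generator_capacity[generator_capacity["report_date"].dt.year == year]
plant_capacity = (
    generator_capacity.groupby("plant_id_eia")["capacity_mw"].sum().round(2).reset_index()
)
# plant 1 reports 75.5 MW of nameplate capacity for 2020 (G1: 50.0 + G2: 25.5)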
