feat: add information to plant static attributes data frame

singularity-energy · May 21, 2024 · a52a227 · a52a227
1 parent 1363803
commit a52a227
Show file tree

Hide file tree

Showing 2 changed files with 99 additions and 30 deletions.
diff --git a/src/oge/column_checks.py b/src/oge/column_checks.py
@@ -190,11 +190,18 @@
         "fuel_category_eia930",
         "ba_code",
         "ba_code_physical",
-        "state",
         "distribution_flag",
         "timezone",
         "data_availability",
         "shaped_plant_id",
+        "latitude",
+        "longitude",
+        "state",
+        "county",
+        "city",
+        "zip_code",
+        "street_address",
+        "capacity_mw",
     },
     "plant_metadata": {
         "plant_id_eia",
@@ -491,6 +498,12 @@ def get_dtypes():
         "subplant_primary_fuel_from_net_generation_mwh": "str",
         "timezone": "str",
         "wet_dry_bottom": "str",
+        "latitude": "float64",
+        "longitude": "float64",
+        "county": "str",
+        "city": "str",
+        "zip_code": "Int64",
+        "street_address": "str",
     }
 
     return dtypes_to_use

diff --git a/src/oge/helpers.py b/src/oge/helpers.py
@@ -2,6 +2,7 @@
 import pandas as pd
 
 from oge.column_checks import get_dtypes, apply_dtypes
+from oge.constants import latest_validated_year
 from oge.filepaths import reference_table_folder
 import oge.load_data as load_data
 from oge.logging_util import get_logger
@@ -18,10 +19,10 @@ def create_plant_attributes_table(
     """Creates the plant attributes table.
 
     Args:
-        cems (pd.DataFrame): _description_
-        eia923_allocated (pd.DataFrame): _description_
-        year (int): _description_
-        primary_fuel_table (pd.DataFrame): _description_
+        cems (pd.DataFrame): CEMS table.
+        eia923_allocated (pd.DataFrame): allocated EIA-923 data.
+        year (int): a four-digit year.
+        primary_fuel_table (pd.DataFrame): primary fuel table.
 
     Returns:
         pd.DataFrame: the plants attributes table. Timezone, geographical and fuel
@@ -100,7 +101,7 @@ def create_plant_attributes_table(
         }
     )
 
-    # assign a BA code and state code to each plant
+    # assign a BA code to each plant
     plant_attributes = assign_ba_code_to_plant(plant_attributes, year)
 
     # add a flag about whether the plant is distribution connected
@@ -115,27 +116,29 @@ def create_plant_attributes_table(
         esc_column="plant_primary_fuel",
     )
 
-    # add timezone info
-    plant_attributes = add_plant_local_timezone(plant_attributes, year)
+    # add geographical info
+    plant_attributes = add_plant_entity(plant_attributes)
+
+    # add nameplate capacity
+    plant_attributes = add_plant_nameplate_capacity(plant_attributes)
 
     plant_attributes = apply_dtypes(plant_attributes)
 
     return plant_attributes
 
 
 def assign_ba_code_to_plant(df: pd.DataFrame, year: int) -> pd.DataFrame:
-    """Assigns a balancing authority code and state to each plant based on the plant id.
+    """Assigns a balancing authority code to each plant based on the plant id.
 
     Args:
          df (pd.DataFrame): data frame containing a 'plant_id_eia' column.
          year (int): a four-digit year.
 
      Returns:
-         pd.DataFrame: original data frame with additional 'ba_code' and 'state'
-            columns.
+         pd.DataFrame: original data frame with additional 'ba_code' column.
     """
     plant_ba = create_plant_ba_table(year)[
-        ["plant_id_eia", "ba_code", "ba_code_physical", "state"]
+        ["plant_id_eia", "ba_code", "ba_code_physical"]
     ]
 
     # merge the ba code into the dataframe
@@ -198,17 +201,11 @@ def create_plant_ba_table(year: int) -> pd.DataFrame:
         plant_states, how="left", on="plant_id_eia", validate="m:1"
     )
 
-    # convert the dtype of the balancing authority code column from string to object
-    # this will allow for missing values to be filled
-    plant_ba["balancing_authority_code_eia"] = plant_ba[
-        "balancing_authority_code_eia"
-    ].astype(object)
-    plant_ba["balancing_authority_code_eia"] = plant_ba[
-        "balancing_authority_code_eia"
-    ].fillna(value=np.NaN)
-
     # load the ba name reference
-    ba_name_to_ba_code = pd.read_csv(reference_table_folder("ba_reference.csv"))
+    ba_name_to_ba_code = pd.read_csv(
+        reference_table_folder("ba_reference.csv"),
+        dtype={"ba_name": "string", "ba_code": "string"},
+    )
     ba_name_to_ba_code = dict(
         zip(
             ba_name_to_ba_code["ba_name"],
@@ -218,7 +215,8 @@ def create_plant_ba_table(year: int) -> pd.DataFrame:
 
     # specify a ba code for certain utilities
     utility_as_ba_code = pd.read_csv(
-        reference_table_folder("utility_name_ba_code_map.csv")
+        reference_table_folder("utility_name_ba_code_map.csv"),
+        dtype={"name": "string", "ba_code": "string"},
     )
     utility_as_ba_code = dict(
         zip(
@@ -283,10 +281,39 @@ def create_plant_ba_table(year: int) -> pd.DataFrame:
         validate="m:1",
     )
     plant_ba.update({"ba_code_physical": plant_ba["ba_code_physical_map"]})
+    plant_ba = plant_ba.drop(columns="state")
 
     return plant_ba
 
 
+def add_plant_nameplate_capacity(df: pd.DataFrame) -> pd.DataFrame:
+    """Adds nameplate capacity to input data frame.
+
+    Args:
+        df (pd.DataFrame): table with a 'plant_id_eia' column.
+
+    Returns:
+        pd.DataFrame: original data frame with additional 'capacity_mw' column.
+    """
+    generators_capacity = load_data.load_pudl_table(
+        "generators_eia860",
+        columns=["plant_id_eia", "generator_id", "report_date", "capacity_mw"],
+    )
+    plants_capacity = (
+        generators_capacity.query(
+            "report_date == @generators_capacity['report_date'].max()"
+        )
+        .groupby(["plant_id_eia"])["capacity_mw"]
+        .sum()
+        .round(2)
+        .reset_index()
+    )
+
+    df = df.merge(plants_capacity, how="left", on=["plant_id_eia"], validate="1:1")
+
+    return df
+
+
 def identify_distribution_connected_plants(
     df: pd.DataFrame, year: int, voltage_threshold_kv: int = 60
 ) -> pd.DataFrame:
@@ -361,19 +388,48 @@ def assign_fuel_category_to_ESC(
     return df
 
 
-def add_plant_local_timezone(df: pd.DataFrame, year: int) -> pd.DataFrame:
-    """Adds timezone in which plant operates
+def add_plant_entity(df: pd.DataFrame) -> pd.DataFrame:
+    """Adds timezone and geographical information to input data frame.
 
     Args:
         df (pd.DataFrame): table with a 'plant_id_eia' column.
-        year (int): a four-digit year.
 
     Returns:
-        pd.DataFrame: original data frame with additional 'timezone' column
+        pd.DataFrame: original data frame with added columns 'timezone', 'latitude',
+            'longitude', 'state', 'county', 'city', 'zip_code' and 'street_address'.
     """
-    plant_tz = load_data.load_pudl_table(
-        "plants_entity_eia", columns=["plant_id_eia", "timezone"]
+    geographical_info = [
+        "latitude",
+        "longitude",
+        "state",
+        "county",
+        "city",
+        "zip_code",
+        "street_address",
+    ]
+    plants_entity = load_data.load_pudl_table(
+        "plants_entity_eia", columns=["plant_id_eia", "timezone"] + geographical_info
+    ).astype({"zip_code": "Int64"})
+    plants_entity_from_eia860 = load_data.load_raw_eia860_plant_geographical_info(
+        latest_validated_year
+    )
+    complete_plants_entity = plants_entity.merge(
+        plants_entity_from_eia860,
+        how="left",
+        on=["plant_id_eia"],
+        validate="1:1",
+        suffixes=["", "_eia"],
+    )
+
+    for c in geographical_info:
+        if complete_plants_entity[c].isna().sum() > 0:
+            complete_plants_entity[c] = complete_plants_entity[c].fillna(
+                complete_plants_entity[f"{c}_eia"]
+            )
+        complete_plants_entity = complete_plants_entity.drop(columns=f"{c}_eia")
+
+    df = df.merge(
+        complete_plants_entity, how="left", on=["plant_id_eia"], validate="m:1"
     )
-    df = df.merge(plant_tz, how="left", on=["plant_id_eia"], validate="m:1")
 
     return df