Skip to content

Commit

Permalink
feat: add information to plant static attributes data frame
Browse files Browse the repository at this point in the history
  • Loading branch information
rouille committed May 21, 2024
1 parent 1363803 commit a52a227
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 30 deletions.
15 changes: 14 additions & 1 deletion src/oge/column_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,11 +190,18 @@
"fuel_category_eia930",
"ba_code",
"ba_code_physical",
"state",
"distribution_flag",
"timezone",
"data_availability",
"shaped_plant_id",
"latitude",
"longitude",
"state",
"county",
"city",
"zip_code",
"street_address",
"capacity_mw",
},
"plant_metadata": {
"plant_id_eia",
Expand Down Expand Up @@ -491,6 +498,12 @@ def get_dtypes():
"subplant_primary_fuel_from_net_generation_mwh": "str",
"timezone": "str",
"wet_dry_bottom": "str",
"latitude": "float64",
"longitude": "float64",
"county": "str",
"city": "str",
"zip_code": "Int64",
"street_adress": "str",
}

return dtypes_to_use
Expand Down
114 changes: 85 additions & 29 deletions src/oge/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd

from oge.column_checks import get_dtypes, apply_dtypes
from oge.constants import latest_validated_year
from oge.filepaths import reference_table_folder
import oge.load_data as load_data
from oge.logging_util import get_logger
Expand All @@ -18,10 +19,10 @@ def create_plant_attributes_table(
"""Creates the plant attributes table.
Args:
cems (pd.DataFrame): _description_
eia923_allocated (pd.DataFrame): _description_
year (int): _description_
primary_fuel_table (pd.DataFrame): _description_
cems (pd.DataFrame): CEMS table.
eia923_allocated (pd.DataFrame): allocated EIA-923 data.
year (int): a four-digit year
primary_fuel_table (pd.DataFrame): primary fuel table.
Returns:
pd.DataFrame: the plants attributes table. Timezone, geographical and fuel
Expand Down Expand Up @@ -100,7 +101,7 @@ def create_plant_attributes_table(
}
)

# assign a BA code and state code to each plant
# assign a BA code to each plant
plant_attributes = assign_ba_code_to_plant(plant_attributes, year)

# add a flag about whether the plant is distribution connected
Expand All @@ -115,27 +116,29 @@ def create_plant_attributes_table(
esc_column="plant_primary_fuel",
)

# add timezone info
plant_attributes = add_plant_local_timezone(plant_attributes, year)
# add geographical info
plant_attributes = add_plant_entity(plant_attributes)

# add nameplate capacity
plant_attributes = add_plant_nameplate_capacity(plant_attributes)

plant_attributes = apply_dtypes(plant_attributes)

return plant_attributes


def assign_ba_code_to_plant(df: pd.DataFrame, year: int) -> pd.DataFrame:
"""Assigns a balancing authority code and state to each plant based on the plant id.
"""Assigns a balancing authority code to each plant based on the plant id.
Args:
df (pd.DataFrame): data frame containing a 'plant_id_eia' column.
year (int): a four-digit year.
Returns:
pd.DataFrame: original data frame with additional 'ba_code' and 'state'
columns.
pd.DataFrame: original data frame with additional 'ba_code' column.
"""
plant_ba = create_plant_ba_table(year)[
["plant_id_eia", "ba_code", "ba_code_physical", "state"]
["plant_id_eia", "ba_code", "ba_code_physical"]
]

# merge the ba code into the dataframe
Expand Down Expand Up @@ -198,17 +201,11 @@ def create_plant_ba_table(year: int) -> pd.DataFrame:
plant_states, how="left", on="plant_id_eia", validate="m:1"
)

# convert the dtype of the balancing authority code column from string to object
# this will allow for missing values to be filled
plant_ba["balancing_authority_code_eia"] = plant_ba[
"balancing_authority_code_eia"
].astype(object)
plant_ba["balancing_authority_code_eia"] = plant_ba[
"balancing_authority_code_eia"
].fillna(value=np.NaN)

# load the ba name reference
ba_name_to_ba_code = pd.read_csv(reference_table_folder("ba_reference.csv"))
ba_name_to_ba_code = pd.read_csv(
reference_table_folder("ba_reference.csv"),
dtype={"ba_name": "string", "ba_code": "string"},
)
ba_name_to_ba_code = dict(
zip(
ba_name_to_ba_code["ba_name"],
Expand All @@ -218,7 +215,8 @@ def create_plant_ba_table(year: int) -> pd.DataFrame:

# specify a ba code for certain utilities
utility_as_ba_code = pd.read_csv(
reference_table_folder("utility_name_ba_code_map.csv")
reference_table_folder("utility_name_ba_code_map.csv"),
dtype={"name": "string", "ba_code": "string"},
)
utility_as_ba_code = dict(
zip(
Expand Down Expand Up @@ -283,10 +281,39 @@ def create_plant_ba_table(year: int) -> pd.DataFrame:
validate="m:1",
)
plant_ba.update({"ba_code_physical": plant_ba["ba_code_physical_map"]})
plant_ba.drop(columns="state")

return plant_ba


def add_plant_nameplate_capacity(df: pd.DataFrame) -> pd.DataFrame:
    """Attach the total nameplate capacity of each plant to the input data frame.

    The capacity comes from the 'generators_eia860' PUDL table. Only the rows
    carrying the most recent 'report_date' found in that table are kept; their
    'capacity_mw' values are summed per 'plant_id_eia' and rounded to two decimals.

    Args:
        df (pd.DataFrame): table with a 'plant_id_eia' column.

    Returns:
        pd.DataFrame: original data frame with additional 'capacity_mw' column.
    """
    generators = load_data.load_pudl_table(
        "generators_eia860",
        columns=["plant_id_eia", "generator_id", "report_date", "capacity_mw"],
    )

    # keep only the generators reported at the latest date available in the table
    latest_report_date = generators["report_date"].max()
    latest_generators = generators[generators["report_date"] == latest_report_date]

    # plant-level capacity: sum over the plant's generators, rounded to 2 decimals
    capacity_by_plant = latest_generators.groupby(["plant_id_eia"])[
        "capacity_mw"
    ].sum()
    capacity_by_plant = capacity_by_plant.round(2).reset_index()

    # left merge keeps every row of df; validate="1:1" makes pandas raise if
    # 'plant_id_eia' is duplicated on either side of the merge
    return df.merge(
        capacity_by_plant, how="left", on=["plant_id_eia"], validate="1:1"
    )


def identify_distribution_connected_plants(
df: pd.DataFrame, year: int, voltage_threshold_kv: int = 60
) -> pd.DataFrame:
Expand Down Expand Up @@ -361,19 +388,48 @@ def assign_fuel_category_to_ESC(
return df


def add_plant_local_timezone(df: pd.DataFrame, year: int) -> pd.DataFrame:
"""Adds timezone in which plant operates
def add_plant_entity(df: pd.DataFrame) -> pd.DataFrame:
"""Adds timezone and geographical information to input data frame.
Args:
df (pd.DataFrame): table with a 'plant_id_eia' column.
year (int): a four-digit year.
Returns:
pd.DataFrame: original data frame with additional 'timezone' column
pd.DataFrame: original data frame with additional 'timezone', 'latitude',
'longitude', 'state', 'county', 'city', 'zip_code' and 'street_address' columns
"""
plant_tz = load_data.load_pudl_table(
"plants_entity_eia", columns=["plant_id_eia", "timezone"]
geographical_info = [
"latitude",
"longitude",
"state",
"county",
"city",
"zip_code",
"street_address",
]
plants_entity = load_data.load_pudl_table(
"plants_entity_eia", columns=["plant_id_eia", "timezone"] + geographical_info
).astype({"zip_code": "Int64"})
plants_entity_from_eia860 = load_data.load_raw_eia860_plant_geographical_info(
latest_validated_year
)
complete_plants_entity = plants_entity.merge(
plants_entity_from_eia860,
how="left",
on=["plant_id_eia"],
validate="1:1",
suffixes=["", "_eia"],
)

for c in geographical_info:
if complete_plants_entity[c].isna().sum() > 0:
complete_plants_entity[c] = complete_plants_entity[c].fillna(
complete_plants_entity[f"{c}_eia"]
)
complete_plants_entity = complete_plants_entity.drop(columns=f"{c}_eia")

df = df.merge(
complete_plants_entity, how="left", on=["plant_id_eia"], validate="m:1"
)
df = df.merge(plant_tz, how="left", on=["plant_id_eia"], validate="m:1")

return df

0 comments on commit a52a227

Please sign in to comment.