Merge pull request #367 from singularity-energy/ben/dates
Add operating and retirement dates to plant static attributes
grgmiller committed May 24, 2024
2 parents 43ff1df + c225312 commit 6f4c9e3
Showing 2 changed files with 170 additions and 16 deletions.
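
At a high level, the new plant-level dates are derived from PUDL's generator-level records: the plant operating date is the earliest generator operating date, and the plant retirement date is the latest generator retirement date. A minimal sketch of that aggregation with made-up data (an illustration only, not the OGE code itself):

import pandas as pd

# Toy generator-level records for one plant (hypothetical values)
generators = pd.DataFrame(
    {
        "plant_id_eia": [1, 1, 1],
        "generator_id": ["G1", "G2", "G3"],
        "generator_operating_date": pd.to_datetime(["1995-06-01", "2001-03-01", "2010-09-01"]),
        "generator_retirement_date": pd.to_datetime(["2015-12-31", pd.NaT, pd.NaT]),
    }
)

plant_dates = generators.groupby("plant_id_eia").agg(
    plant_operating_date=("generator_operating_date", "min"),
    # skipna=False: the plant only gets a retirement date once every generator has one
    plant_retirement_date=("generator_retirement_date", lambda x: x.max(skipna=False)),
)
# plant_operating_date -> 1995-06-01, plant_retirement_date -> NaT (still operating)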
23 changes: 17 additions & 6 deletions src/oge/column_checks.py
@@ -203,6 +203,8 @@
"city",
"plant_name_eia",
"capacity_mw",
"plant_operating_date",
"plant_retirement_date",
},
"plant_metadata": {
"plant_id_eia",
@@ -422,6 +424,7 @@ def get_dtypes():
"ch4_mass_lb_adjusted": "float64",
"ch4_mass_lb_for_electricity": "float64",
"ch4_mass_lb_for_electricity_adjusted": "float64",
"city": "str",
"co2_mass_lb": "float64",
"co2_mass_lb_adjusted": "float64",
"co2_mass_lb_for_electricity": "float64",
@@ -431,6 +434,7 @@ def get_dtypes():
"co2e_mass_lb_adjusted": "float64",
"co2e_mass_lb_for_electricity": "float64",
"co2e_mass_lb_for_electricity_adjusted": "float64",
"county": "str",
"data_availability": "category",
"distribution_flag": "bool",
"eia930_profile": "float64",
Expand All @@ -455,6 +459,8 @@ def get_dtypes():
"hourly_data_source": "category",
"hours_in_service": "float64",
"imputed_profile": "float64",
"latitude": "float64",
"longitude": "float64",
"mercury_control_id_eia": "str",
"mercury_emission_rate_lb_per_trillion_btu": "float64",
"mercury_removal_efficiency": "float64",
@@ -478,6 +484,7 @@ def get_dtypes():
"particulate_removal_efficiency_at_full_load": "float64",
"plant_id_eia": "Int32",
"plant_id_epa": "Int32",
"plant_name_eia": "str",
"plant_primary_fuel": "str",
"plant_primary_fuel_from_capacity_mw": "str",
"plant_primary_fuel_from_fuel_consumed_for_electricity_mmbtu": "str",
@@ -507,11 +514,6 @@ def get_dtypes():
"subplant_primary_fuel_from_net_generation_mwh": "str",
"timezone": "str",
"wet_dry_bottom": "str",
"latitude": "float64",
"longitude": "float64",
"county": "str",
"city": "str",
"plant_name_eia": "str",
}

return dtypes_to_use
@@ -528,7 +530,16 @@ def apply_dtypes(df: pd.DataFrame) -> pd.DataFrame:
pd.DataFrame: original data frame with type converted columns.
"""
dtypes = get_dtypes()
datetime_columns = ["datetime_utc", "datetime_local", "report_date"]
datetime_columns = [
"datetime_utc",
"datetime_local",
"report_date",
"generator_operating_date",
"generator_retirement_date",
"current_planned_generator_operating_date",
"plant_operating_date",
"plant_retirement_date",
]
cols_missing_dtypes = [
col
for col in df.columns
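
The rest of apply_dtypes is truncated in this diff. As a rough sketch (an assumption about the surrounding code, not the actual OGE implementation), a datetime_columns list like the one added above is typically used to parse date columns separately from the dtype mapping:

import pandas as pd

def apply_dtypes_sketch(df: pd.DataFrame, dtypes: dict, datetime_columns: list) -> pd.DataFrame:
    # Illustrative only: parse known date columns, then cast the remaining known columns
    for col in datetime_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col])
    known = {col: dtype for col, dtype in dtypes.items() if col in df.columns}
    return df.astype(known)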
163 changes: 153 additions & 10 deletions src/oge/helpers.py
@@ -2,7 +2,7 @@
import pandas as pd

from oge.column_checks import get_dtypes, apply_dtypes
from oge.constants import latest_validated_year
from oge.constants import earliest_data_year, latest_validated_year
from oge.filepaths import reference_table_folder, outputs_folder
import oge.load_data as load_data
from oge.logging_util import get_logger
@@ -121,10 +121,37 @@ def create_plant_attributes_table(
plant_attributes = add_plant_entity(plant_attributes)

# add nameplate capacity
plant_attributes = add_plant_nameplate_capacity(plant_attributes)
plant_attributes = add_plant_nameplate_capacity(year, plant_attributes)

# add operating and retirement dates
plant_attributes = add_plant_operating_and_retirement_dates(plant_attributes)

# convert types
plant_attributes = apply_dtypes(plant_attributes)

# change order of columns
new_column_ordering = [
"plant_id_eia",
"plant_name_eia",
"capacity_mw",
"plant_primary_fuel",
"fuel_category",
"fuel_category_eia930",
"state",
"county",
"city",
"ba_code",
"ba_code_physical",
"latitude",
"longitude",
"plant_operating_date",
"plant_retirement_date",
"distribution_flag",
"timezone",
"data_availability",
]
plant_attributes = plant_attributes[new_column_ordering]

return plant_attributes


@@ -178,12 +205,45 @@ def create_plant_ba_table(year: int) -> pd.DataFrame:
],
)

# for some earlier years, the plants data is missing BA codes.
# backfill and forwardfill to make sure that we have complete data for all years, if
# data is available for any year
for col in [
"balancing_authority_code_eia",
"utility_id_eia",
"balancing_authority_name_eia",
"transmission_distribution_owner_name",
]:
plant_ba[col] = plant_ba.groupby(["plant_id_eia"])[col].bfill()
plant_ba[col] = plant_ba.groupby(["plant_id_eia"])[col].ffill()

# some plants only have a record for years after the current year. To help ensure
# that we have complete BA codes, create a dataframe containing only those plants
# whose first record is after the current year, so that we can add these plants back
# to plant_ba after filtering
plant_ba_only_data_after_year = plant_ba[
plant_ba.groupby(["plant_id_eia"])["report_date"].transform("min").dt.year
> year
]
# only keep the oldest record
plant_ba_only_data_after_year = plant_ba_only_data_after_year[
plant_ba_only_data_after_year["report_date"]
== plant_ba_only_data_after_year.groupby(["plant_id_eia"])[
"report_date"
].transform("min")
]

# remove report dates newer than the current year
plant_ba = plant_ba[plant_ba["report_date"].dt.year <= year]

# sort the data from newest to oldest
plant_ba = plant_ba.sort_values(by=["plant_id_eia", "report_date"], ascending=False)

# add back plants that only have records after the current year
# if for some reason this adds a duplicate plant, this will be dropped in the next
# step since these records will be added to the end of the dataframe
plant_ba = pd.concat([plant_ba, plant_ba_only_data_after_year], axis=0)

# only keep the most recent row of data
plant_ba = plant_ba.drop_duplicates(subset=["plant_id_eia"], keep="first")
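
A toy illustration of the back/forward fill above (made-up records, not OGE data): a plant that reports no BA code in early years inherits it from a later year, so the code survives the filter to report dates at or before the target year.

import pandas as pd

plant_ba = pd.DataFrame(
    {
        "plant_id_eia": [10, 10, 10],
        "report_date": pd.to_datetime(["2005-01-01", "2010-01-01", "2015-01-01"]),
        "balancing_authority_code_eia": [None, None, "CISO"],
    }
)
col = "balancing_authority_code_eia"
plant_ba[col] = plant_ba.groupby("plant_id_eia")[col].bfill()
plant_ba[col] = plant_ba.groupby("plant_id_eia")[col].ffill()
# all three report years now carry "CISO", so the BA code is retained even when
# only an earlier report year is kept for the target year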

@@ -287,30 +347,113 @@ def create_plant_ba_table(year: int) -> pd.DataFrame:
return plant_ba


def add_plant_nameplate_capacity(df: pd.DataFrame) -> pd.DataFrame:
def add_plant_operating_and_retirement_dates(df: pd.DataFrame) -> pd.DataFrame:
"""Adds the operating and retirement dates of a plant to input data frame. The
operating date of a plant is taken as the earliest date among all generators'
operating date over all report dates. Likewise, the retirement date of a plant is
taken as the latest date among all generators' retirement date over all report
dates.
Note that the operating date is the date the generator began commercial operation.
The retirement date is the date of the scheduled or effected retirement of the
generator.
Args:
df (pd.DataFrame): table with a 'plant_id_eia' column.
Returns:
pd.DataFrame: original data frame with additional 'plant_operating_date' and
'plant_retirement_date' columns.
"""
generator_dates = load_data.load_pudl_table(
"denorm_generators_eia",
year=earliest_data_year,
end_year=latest_validated_year,
columns=[
"plant_id_eia",
"generator_id",
"report_date",
"generator_operating_date",
"generator_retirement_date",
],
).sort_values(by=["plant_id_eia", "generator_id", "report_date"], ascending=True)

# fill missing dates
date_columns = ["generator_operating_date", "generator_retirement_date"]

for col in date_columns:
generator_dates[col] = generator_dates.groupby(
["plant_id_eia", "generator_id"]
)[col].bfill()
generator_dates[col] = generator_dates.groupby(
["plant_id_eia", "generator_id"]
)[col].ffill()

# keep only the most recent year of data
generator_dates = generator_dates.drop_duplicates(
subset=["plant_id_eia", "generator_id"], keep="last"
)

plant_dates = (
generator_dates.groupby("plant_id_eia")[
["generator_operating_date", "generator_retirement_date"]
]
.agg(
{
"generator_operating_date": "min",
"generator_retirement_date": lambda x: x.max(skipna=False),
}
)
.rename(
columns={
"generator_operating_date": "plant_operating_date",
"generator_retirement_date": "plant_retirement_date",
}
)
)

df = df.merge(plant_dates, how="left", on=["plant_id_eia"], validate="1:1")

return df
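
The skipna=False above is the subtle part of the retirement aggregation: with pandas' default skipna=True, a plant with one retired generator and one still-operating generator (retirement date NaT) would wrongly appear retired. A toy comparison (illustrative values only):

import pandas as pd

retirements = pd.Series(pd.to_datetime(["2015-12-31", pd.NaT]))
retirements.max()              # Timestamp('2015-12-31'): plant would look fully retired
retirements.max(skipna=False)  # NaT: plant correctly treated as still operating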


def add_plant_nameplate_capacity(year: int, df: pd.DataFrame) -> pd.DataFrame:
"""Adds nameplate capacity to input data frame.
Args:
year (int): a four-digit year.
df (pd.DataFrame): table with a 'plant_id_eia' column.
Returns:
pd.DataFrame: original data frame with an additional 'capacity_mw' column.
"""
generators_capacity = load_data.load_pudl_table(
generator_capacity = load_data.load_pudl_table(
"generators_eia860",
year=earliest_data_year,
end_year=latest_validated_year,
columns=["plant_id_eia", "generator_id", "report_date", "capacity_mw"],
)
generators_capacity[
generators_capacity["report_date"] == generators_capacity["report_date"].max()
).sort_values(by=["plant_id_eia", "generator_id", "report_date"], ascending=True)

generator_capacity["capacity_mw"] = generator_capacity.groupby(
["plant_id_eia", "generator_id"]
)["capacity_mw"].bfill()
generator_capacity["capacity_mw"] = generator_capacity.groupby(
["plant_id_eia", "generator_id"]
)["capacity_mw"].ffill()

# keep only the specified year of data
generator_capacity = generator_capacity[
generator_capacity["report_date"].dt.year == year
]
plants_capacity = (
generators_capacity.groupby(["plant_id_eia"])["capacity_mw"]

plant_capacity = (
generator_capacity.groupby(["plant_id_eia"])["capacity_mw"]
.sum()
.round(2)
.reset_index()
)

df = df.merge(plants_capacity, how="left", on=["plant_id_eia"], validate="1:1")
df = df.merge(plant_capacity, how="left", on=["plant_id_eia"], validate="1:1")

return df
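
A toy example of the capacity aggregation above (made-up generator records, not OGE data): fill missing capacity within each generator, keep the requested report year, then sum and round to the plant level.

import pandas as pd

year = 2020  # illustrative
generator_capacity = pd.DataFrame(
    {
        "plant_id_eia": [1, 1, 1, 1],
        "generator_id": ["G1", "G1", "G2", "G2"],
        "report_date": pd.to_datetime(["2019-01-01", "2020-01-01"] * 2),
        "capacity_mw": [50.0, 50.0, None, 25.5],
    }
)
generator_capacity["capacity_mw"] = generator_capacity.groupby(
    ["plant_id_eia", "generator_id"]
)["capacity_mw"].bfill()
generator_capacity["capacity_mw"] = generator_capacity.groupby(
    ["plant_id_eia", "generator_id"]
)["capacity_mw"].ffill()
generator_capacity = generator_capacity[generator_capacity["report_date"].dt.year == year]
plant_capacity = (
    generator_capacity.groupby("plant_id_eia")["capacity_mw"].sum().round(2).reset_index()
)
# plant 1 reports 75.5 MW of nameplate capacity for 2020 (G1: 50.0 + G2: 25.5)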
