In [None]:
# Currently unused code for exploring gross to net conversions over multiple years
########################################################################################


def calculate_multiyear_gtn_factors(year, number_of_years):
    """Coordinate loading data and calculating subplant IDs, GTN regressions, and GTN ratios.

    The data window covers ``number_of_years`` consecutive years ending at ``year``
    (inclusive), i.e. ``year - (number_of_years - 1)`` through ``year``.

    Args:
        year: Last year of the window. Also forwarded to ``gross_to_net_ratio`` as
            its ``year`` argument.
        number_of_years: Number of years of monthly data to load. Must be at least 1.

    Returns:
        None. The return values of the regression and ratio functions are not used
        here.

    Raises:
        ValueError: If ``number_of_years`` is less than 1, which would otherwise
            produce a start year later than the end year.
    """
    if number_of_years < 1:
        raise ValueError("number_of_years must be at least 1")

    start_year = year - (number_of_years - 1)
    end_year = year

    # load `number_of_years` years of monthly data from CEMS and EIA-923
    cems_monthly, gen_fuel_allocated = load_monthly_gross_and_net_generation(
        start_year, end_year
    )

    # add subplant ids to the data
    logger.info("Creating subplant IDs")
    cems_monthly, gen_fuel_allocated = data_cleaning.generate_subplant_ids(
        start_year, end_year, cems_monthly, gen_fuel_allocated
    )

    logger.info("Calculating Gross to Net regressions and ratios")
    aggregation_levels = ("subplant", "plant")

    # perform the regressions first: subplant level, then plant level
    for agg_level in aggregation_levels:
        gross_to_net_regression(
            gross_gen_data=cems_monthly,
            net_gen_data=gen_fuel_allocated,
            agg_level=agg_level,
        )

    # then calculate monthly ratios: subplant level, then plant level
    for agg_level in aggregation_levels:
        gross_to_net_ratio(
            gross_gen_data=cems_monthly,
            net_gen_data=gen_fuel_allocated,
            agg_level=agg_level,
            year=year,
        )


def load_monthly_gross_and_net_generation(start_year, end_year):
    """Load gross generation (CEMS) and net generation (EIA-923) for a range of years.

    Args:
        start_year: First year passed to the loaders.
        end_year: Last year passed to the loaders.

    Returns:
        A 2-tuple ``(cems_monthly, gen_fuel_allocated)``: the result of
        ``load_data.load_cems_gross_generation`` and the EIA-923 generator/energy-source
        table after ``allocate_gen_fuel.agg_by_generator`` has summed
        ``net_generation_mwh``.
    """
    # gross generation from CEMS
    gross_generation = load_data.load_cems_gross_generation(start_year, end_year)

    # net generation already allocated to each generator / energy source in EIA-923
    logger.info("Allocating EIA-923 generation data")
    pudl_table_name = "generation_fuel_by_generator_energy_source_monthly_eia923"
    allocated_by_energy_source = load_data.load_pudl_table(
        pudl_table_name,
        year=start_year,
        columns=None,
        end_year=end_year,
    )

    # roll the allocated data up to the generator level, summing net generation only
    net_generation = allocate_gen_fuel.agg_by_generator(
        allocated_by_energy_source, sum_cols=["net_generation_mwh"]
    )

    return gross_generation, net_generation


def gross_to_net_ratio(gross_gen_data, net_gen_data, agg_level, year):
    """Calculate monthly gross-to-net generation ratios and write them to a CSV file.

    Gross and net generation are outer-merged on plant, subplant and report date.
    Rows missing either value mark their aggregation group (plant or subplant, per
    report date) as incomplete, unless the subplant crosswalk shows the generator had
    already retired or was not yet planned to operate at the report date. Incomplete
    groups are excluded; the remaining rows are summed per group and the ratio
    ``net_generation_mwh / gross_generation_mwh`` is written to
    ``gross_to_net/{agg_level}_gross_to_net_ratio.csv`` under the outputs folder.

    Args:
        gross_gen_data: DataFrame containing ``plant_id_eia``, ``subplant_id`` and
            ``report_date``. Together with ``net_gen_data`` it must supply
            ``gross_generation_mwh``, ``net_generation_mwh``,
            ``emissions_unit_id_epa`` and ``generator_id``.
        net_gen_data: DataFrame containing ``plant_id_eia``, ``subplant_id`` and
            ``report_date`` (see ``gross_gen_data`` for the other required columns).
        agg_level: Either ``"plant"`` or ``"subplant"``.
        year: Year used to locate ``{year}/subplant_crosswalk_{year}.csv`` in the
            outputs folder.

    Returns:
        None. The result is written to disk.

    Raises:
        ValueError: If ``agg_level`` is not ``"plant"`` or ``"subplant"``.
    """
    if agg_level == "plant":
        plant_aggregation_columns = ["plant_id_eia"]
    elif agg_level == "subplant":
        plant_aggregation_columns = ["plant_id_eia", "subplant_id"]
    else:
        raise ValueError("agg_level must be either 'plant' or 'subplant'")

    groupby_columns = plant_aggregation_columns + ["report_date"]

    # NOTE(review): this is a many-to-many merge, so a subplant-month with several
    # rows on both sides yields repeated generation values that are later summed -
    # confirm that this cannot inflate the totals for the input data used here.
    gen_data = gross_gen_data.merge(
        net_gen_data,
        how="outer",
        on=["plant_id_eia", "subplant_id", "report_date"],
        validate="m:m",
    )

    # identify any rows where gross or net generation are missing
    incomplete_data = gen_data[
        gen_data[["gross_generation_mwh", "net_generation_mwh"]].isnull().any(axis=1)
    ]

    # load the activation and retirement dates into the data
    # NOTE(review): no parse_dates is passed, so the date comparisons below rely on
    # get_dtypes() (not visible here) yielding dtypes comparable with report_date -
    # confirm.
    subplant_crosswalk = pd.read_csv(
        outputs_folder(f"{year}/subplant_crosswalk_{year}.csv"),
        dtype=get_dtypes(),
    ).dropna(subset=["emissions_unit_id_epa"])
    incomplete_data = incomplete_data.merge(
        subplant_crosswalk,
        how="left",
        on=["plant_id_eia", "subplant_id", "emissions_unit_id_epa", "generator_id"],
        validate="m:1",
    ).drop(columns="plant_id_epa")

    # a generator that retired before the report date is not expected to report, so
    # its rows should not flag the group as incomplete (only applies if net
    # generation is missing)
    incomplete_data = incomplete_data[
        ~(incomplete_data["generator_retirement_date"] < incomplete_data["report_date"])
    ]

    # likewise ignore rows where the report date is before the planned operating date
    incomplete_data = incomplete_data[
        ~(
            incomplete_data["report_date"]
            < incomplete_data["current_planned_generator_operating_date"]
        )
    ]

    # get the unique groups (aggregation columns + report date) with missing data -
    # these are the groups we shouldn't calculate a ratio for
    incomplete_data = incomplete_data.drop_duplicates(subset=groupby_columns)[
        groupby_columns
    ]

    # only keep gen data that is not incomplete (anti-join). gen_data can hold many
    # rows per group while incomplete_data is unique per group, so the relationship
    # is many-to-one; every incomplete key originates from gen_data, so a left join
    # is sufficient.
    gtn_ratio = gen_data.merge(
        incomplete_data,
        how="left",
        on=groupby_columns,
        indicator="source",
        validate="m:1",
    )
    gtn_ratio = gtn_ratio[gtn_ratio["source"] == "left_only"].drop(columns="source")

    # sum only the generation columns per group; the other columns are identifiers
    # or dates that are discarded below and are not meaningful to sum
    gtn_ratio = (
        gtn_ratio.groupby(groupby_columns, dropna=False)[
            ["net_generation_mwh", "gross_generation_mwh"]
        ]
        .sum()
        .reset_index()
    )

    # calculate gross to net ratios for the remaining data
    gtn_ratio["gtn_ratio"] = (
        gtn_ratio["net_generation_mwh"] / gtn_ratio["gross_generation_mwh"]
    )

    gtn_ratio = gtn_ratio[groupby_columns + ["gtn_ratio"]]

    os.makedirs(outputs_folder("gross_to_net"), exist_ok=True)

    gtn_ratio.to_csv(
        outputs_folder(f"gross_to_net/{agg_level}_gross_to_net_ratio.csv"),
        index=False,
    )