In [1]:
import pandas as pd
import math
%run get_merged_blooms.ipynb

def _aggregate_columns(frame, column_prefix, aggregate_size):
    """Collapse suffix-numbered columns into fixed-size window averages.

    Every column whose name contains ``column_prefix`` is selected. The
    integer after the last underscore of its name, minus one, is used as a
    zero-based offset, and offsets are grouped into consecutive windows of
    ``aggregate_size``. Each window becomes one column named
    ``{column_prefix}_{start}_{start + aggregate_size}`` holding the sum of
    its source columns, each divided by ``aggregate_size``.

    Args:
        frame: DataFrame holding the columns to aggregate; it is not modified.
        column_prefix: Substring identifying the columns to aggregate.
        aggregate_size: Number of consecutive offsets per window.

    Returns:
        A new DataFrame with the selected columns removed and the window
        columns appended (in order of first appearance). If no column
        matches, ``frame`` itself is returned.

    Raises:
        ValueError: If a selected column's last ``_``-separated token is not
            an integer.
    """
    # NOTE(review): matching is by substring (not prefix), so a short key such
    # as "T" selects every column containing "T" -- kept as in the original.
    selected = [column for column in frame.columns if column_prefix in column]
    if not selected:
        return frame

    windows = {}
    for column in selected:
        offset = int(column.split("_")[-1]) - 1
        start = (offset // aggregate_size) * aggregate_size
        name = f"{column_prefix}_{start}_{start + aggregate_size}"
        # NOTE(review): a window with fewer than ``aggregate_size`` source
        # columns is still divided by ``aggregate_size`` (original behaviour).
        share = frame[column] / aggregate_size
        # Plain ``+`` keeps NaN propagation identical to the original ``+=``.
        windows[name] = windows[name] + share if name in windows else share

    # Single concat instead of repeated insert/drop avoids a fragmented frame.
    return pd.concat([frame.drop(columns=selected), pd.DataFrame(windows)], axis=1)


def get_enriched_data(locations=all_locations):
    """Build the merged bloom/climate feature table for ``locations``.

    Steps:
      1. Read the station data parquet and merge it with the output of
         ``get_merged_blooms(locations)`` on ``location`` and ``year``.
      2. Add ``prev_bloom_1`` .. ``prev_bloom_39``: ``bloom_doy`` lagged by
         1..39 rows.
      3. Fill missing values with the per-column mean of numeric columns.
      4. Drop ``bloom_date`` and keep rows with ``bloom_doy > 60`` and
         ``year > 1950``.
      5. Aggregate the ``PRCP``, ``T`` and ``prev_bloom`` columns into windows
         of 6, 5 and 4 respectively.

    Args:
        locations: Passed through to ``get_merged_blooms``.

    Returns:
        The enriched ``pandas.DataFrame``.
    """
    climate_df = pd.read_parquet(r"../preprocessing_data/full_station_data.parquet")
    bloom_df = get_merged_blooms(locations)

    enriched = pd.merge(bloom_df, climate_df, on=["location", "year"])

    # A column-on-column merge yields a fresh 0..n-1 index, so "index - i" in
    # the original lookup is simply the row i positions earlier: a shift.
    # NOTE(review): the lag is row-based, so it crosses location boundaries and
    # ignores gaps in years -- confirm this is intended (a per-location lag
    # would need groupby("location")).
    lags = {
        f"prev_bloom_{i}": enriched["bloom_doy"].shift(i) for i in range(1, 40)
    }
    enriched = pd.concat([enriched, pd.DataFrame(lags)], axis=1)

    # numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g.
    # "location") instead of silently skipping them.
    enriched = enriched.fillna(enriched.mean(numeric_only=True))
    enriched = enriched.drop(["bloom_date"], axis=1)

    keep = (enriched["bloom_doy"] > 60) & (enriched["year"] > 1950)
    enriched = enriched[keep]

    for prefix, window in (("PRCP", 6), ("T", 5), ("prev_bloom", 4)):
        enriched = _aggregate_columns(enriched, prefix, window)

    return enriched