In [None]:
import pandas as pd
import os
from geopy import distance
from pathlib import Path
%run get_station_data.ipynb
%run get_merged_blooms.ipynb

# Create the on-disk cache directories up front (no-op when they already exist).
# "best_station" is where get_best_station_data writes its per-location results;
# "stations" is presumably used by get_station_data (loaded via %run) -- verify there.
for cache_dir in ("../preprocessing_data/best_station", "../preprocessing_data/stations"):
  Path(cache_dir).mkdir(parents=True, exist_ok=True)

# Station ids hard-coded for specific locations (keys are lower-cased location names).
_PINNED_STATIONS = {
  "washingtondc": "USC00186350",
  "liestal": "GME00127786",
  # "kyoto": "JA000047759",
  "vancouver": "CA001108395",
}
# Measurements a station must provide to be usable.
_REQUIRED_COLUMNS = ["PRCP", "TMIN", "TMAX"]
# Only stations strictly closer than this to the bloom site are considered.
_MAX_STATION_DISTANCE_KM = 20
# Only days of year strictly below this value are kept in the output features.
_DOY_CUTOFF = 90


def _find_nearby_station(bloom_coord, first_year):
  """Return the daily data of the best station near `bloom_coord`, or None.

  Reads "../preprocessing_data/stations.parquet", whose index values are passed
  to `get_station_data` and whose "coord" column is handed to geopy together
  with `bloom_coord` (assumed to be (lat, long) pairs -- confirm against the
  code that writes stations.parquet).

  The best station is the one with the most rows, dated in `first_year` or
  later, in which PRCP, TMIN and TMAX are all present. Candidates are visited
  from nearest to farthest and only a strictly larger count wins, so ties go
  to the nearer station.
  """
  stations = pd.read_parquet("../preprocessing_data/stations.parquet")
  stations["dist (km)"] = stations["coord"].apply(
    lambda coord: distance.distance(coord, bloom_coord).km
  )
  nearby = stations.sort_values("dist (km)")
  nearby = nearby.loc[nearby["dist (km)"] < _MAX_STATION_DISTANCE_KM]

  best_station = None
  best_row_count = 0
  for station_id in nearby.index:
    candidate = get_station_data(station_id)
    if not all(column in candidate.columns for column in _REQUIRED_COLUMNS):
      continue
    # Count complete daily rows (not years) from first_year onwards.
    recent = candidate.index.year >= first_year
    row_count = candidate.loc[recent, _REQUIRED_COLUMNS].dropna().shape[0]
    if best_row_count < row_count:
      best_row_count = row_count
      best_station = candidate
  return best_station


def _daily_to_yearly_features(station, location):
  """Pivot daily station data into one row per year with per-day columns.

  `station` must have a DatetimeIndex and TMIN, TMAX and PRCP columns. It is
  not modified: the derived columns are added to a copy via `assign`.
  """
  daily = station.assign(
    year=station.index.year,
    doy=station.index.dayofyear,
    T=(station["TMIN"] + station["TMAX"]) / 2,  # daily mean of min and max
  )
  daily = daily.loc[daily["doy"] < _DOY_CUTOFF, ["year", "doy", "T", "PRCP"]]

  # pivot_table averages duplicate (year, doy) entries (default aggfunc).
  features = daily.pivot_table(["T", "PRCP"], "year", "doy")
  # Flatten the (measurement, doy) column MultiIndex into names like "T_12".
  features.columns = ['_'.join([str(x) for x in col]) for col in features.columns.values]
  features["location"] = location
  return features


def get_best_station_data(location, bloom_coord, first_year):
  """Return per-year early-season weather features for a bloom location.

  The result is cached at
  "../preprocessing_data/best_station/{location}.parquet" and read back from
  there on later calls. The cache is keyed by `location` only, so a cached
  result is returned even if `bloom_coord` or `first_year` differ from the
  call that produced it.

  Parameters:
    location: Location name. Used for the cache file name, for the pinned
      station lookup (case-insensitive) and as the "location" column value.
    bloom_coord: Coordinate of the bloom site, passed to geopy when searching
      for nearby stations. Unused for pinned locations.
    first_year: Earliest year of interest; values below 1950 are raised to
      1950. Only used to rank nearby stations, not to trim the output.

  Returns:
    A DataFrame indexed by year with columns "T_<doy>" and "PRCP_<doy>" for
    days of year below 90, plus a "location" column. An empty DataFrame is
    returned (and cached) when no usable station is found.
  """
  first_year = max(first_year, 1950)
  save_path = f"../preprocessing_data/best_station/{location}.parquet"
  if os.path.exists(save_path):
    return pd.read_parquet(save_path)

  pinned_id = _PINNED_STATIONS.get(location.lower())
  if pinned_id is not None:
    station = get_station_data(pinned_id)
  else:
    station = _find_nearby_station(bloom_coord, first_year)

  if station is None or station.empty:
    # Cache the miss too, so the station search is not repeated next time.
    pd.DataFrame().to_parquet(save_path)
    return pd.DataFrame()

  features = _daily_to_yearly_features(station, location)
  features.to_parquet(save_path)
  return features

In [None]:
# Build the station features for every bloom location.
bloom_df = get_merged_blooms().sort_values("year")
locations = pd.unique(bloom_df["location"].values)

# bloom_df is sorted by year, so the first row kept for each location is its
# earliest record; that row supplies the site coordinate and the first year.
first_records = bloom_df.drop_duplicates("location")

# Collect the per-location frames and concatenate once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration.
station_frames = [
  get_best_station_data(location, (lat, long), year)
  for location, lat, long, year in zip(
    first_records["location"],
    first_records["lat"],
    first_records["long"],
    first_records["year"],
  )
]
# pd.concat raises on an empty list, so fall back to an empty frame.
full_station_data = pd.concat(station_frames) if station_frames else pd.DataFrame()


In [None]:
# Write the combined per-location station features to disk.
full_station_data_path = "../preprocessing_data/full_station_data.parquet"
full_station_data.to_parquet(full_station_data_path)