In [157]:
import pandas as pd
import math
from sklearn.preprocessing import LabelEncoder
import joblib
# Forecast data from https://www.qweather.com/en/weather30d/vancouver-EF058.html

# From https://www.qweather.com/en/weather30d/vancouver-EF058.html at 8:30 pm PST
liestal = [47.48, 7.73, 350.0]
vancouver = [49.22, -123.16, 24.0]
kyoto = [35.01, 135.67, 44.0]
washingtondc = [38.88, 77.03, 0.0]

t = {
  "vancouver": [95, 80, 70, 60, 55, 45, 45, 55, 45, 50, 40, 50, 50, 60, 60, 55, 55, 50, 50, 55, 70, 75, 65, 60, 65, 65, 75, 60, 60, 60],
  "liestal": [25, 40, 40, 35, 35, 40, 25, 40, 40, 55, 70, 75, 80, 35, 35, 30, 30, 25, 20, 25, 25, 25, 30, 45, 60, 60, 50, 60, 75, 85],
  "kyoto": [85, 75, 55, 75, 80, 50, 50, 50, 65, 90, 100, 110, 120, 60, 55, 70, 90, 80, 60, 55, 55, 55, 55, 60, 70, 105, 85, 105, 120, 105],
  "washingtondc": [95, 80, 70, 60, 55, 45, 45, 55, 45, 50, 40, 50, 50, 60, 60, 55, 55, 50, 50, 55, 70, 75, 65, 60, 65, 65, 75, 60, 60, 60],
}

p = {
  "vancouver": [96, 45, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 107, 99, 97, 50, 51, 30, 98, 59, 81, 0, 0, 0, 15, 57, 15, 15],
  "liestal": [0, 0, 0, 0, 0, 27, 0, 7, 0, 0, 41, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  "kyoto": [101, 5, 0, 0, 22, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 93, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 83, 14],
  "washingtondc": [96, 45, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 107, 99, 97, 50, 51, 30, 98, 59, 81, 0, 0, 0, 15, 57, 15, 15],
}

locations = ["kyoto", "liestal", "vancouver", "washingtondc"]
climate_df = pd.DataFrame()
for location in locations:
  data = pd.read_parquet(f"../preprocessing_data/prediction/{location}.parquet")

  for i in range (1, 31):
    data.loc[2022, f"T_{60 + i}"] = t[location][i - 1]
    data.loc[2022, f"PRCP_{60 + i}"] = p[location][i - 1]
  climate_df  = pd.concat([climate_df , data])

climate_df = climate_df.loc[climate_df.index == 2022]
climate_df


Unnamed: 0_level_0,PRCP_1,PRCP_2,PRCP_3,PRCP_4,PRCP_5,PRCP_6,PRCP_7,PRCP_8,PRCP_9,PRCP_10,...,T_83,T_84,T_85,T_86,T_87,T_88,T_89,location,T_90,PRCP_90
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022,10.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,55.0,60.0,70.0,105.0,85.0,105.0,120.0,kyoto,105.0,14.0
2022,0.0,0.0,25.0,135.0,3.0,10.0,32.0,22.0,35.0,0.0,...,30.0,45.0,60.0,60.0,50.0,60.0,75.0,liestal,85.0,0.0
2022,0.0,86.0,76.0,70.0,9.0,164.0,76.0,30.0,0.0,46.0,...,65.0,60.0,65.0,65.0,75.0,60.0,60.0,vancouver,60.0,15.0
2022,165.1,17.78,185.42,0.0,0.0,0.0,66.04,0.0,81.28,0.0,...,65.0,60.0,65.0,65.0,75.0,60.0,60.0,washingtondc,60.0,15.0


In [158]:
%run get_merged_blooms.ipynb
bloom_df = get_merged_blooms(locations)
bloom_df.head()

Unnamed: 0,location,lat,long,alt,year,bloom_date,bloom_doy
0,kyoto,35.011983,135.676114,44,812,0812-04-01,92
1,kyoto,35.011983,135.676114,44,815,0815-04-15,105
2,kyoto,35.011983,135.676114,44,831,0831-04-06,96
3,kyoto,35.011983,135.676114,44,851,0851-04-18,108
4,kyoto,35.011983,135.676114,44,853,0853-04-14,104


In [159]:
enriched = pd.merge(bloom_df, climate_df, on=["location", "year"], how='left')

def filter_doy(initial_doy, last_doy):
  columns = enriched.columns.copy()
  for column in columns:
      doy = int(column.split('_')[-1]) - 1
      if doy < initial_doy or doy > last_doy:
        enriched.drop(column, axis=1, inplace=True)

def aggregate_columns(column_preffix, aggregate_size):
  selected_columns = [column for column in enriched.columns if column_preffix in column]

  for column in selected_columns:
      doy = int(column.split('_')[-1]) - 1
      batch = doy//aggregate_size
      new_column = f"{column_preffix}_{batch*aggregate_size}_{batch*aggregate_size + aggregate_size}"

      if new_column not in enriched.columns:
        enriched[new_column] = 0

      enriched[new_column] += enriched[column]/aggregate_size
      enriched.drop(column, axis=1, inplace=True)

for i in range(1, 40):
  enriched[f"prev_bloom_{i}"] = enriched.index.to_series().apply(lambda year: enriched.loc[year - i, "bloom_doy"] if year - i >= enriched.index.min() else math.nan)
enriched = enriched.fillna(enriched.mean())
enriched = enriched.drop(["bloom_date"], axis=1)
enriched = enriched[(enriched["bloom_doy"] > 60) & (enriched["year"] > 1950)]

aggregate_columns("PRCP", 6)
aggregate_columns("T", 5)
aggregate_columns("prev_bloom", 4)

enriched = enriched.loc[enriched["year"] == 2022].drop("bloom_doy", axis=1)

enriched.head()

  enriched = enriched.fillna(enriched.mean())


Unnamed: 0,location,lat,long,alt,year,PRCP_0_6,PRCP_6_12,PRCP_12_18,PRCP_18_24,PRCP_24_30,...,prev_bloom_0_4,prev_bloom_4_8,prev_bloom_8_12,prev_bloom_12_16,prev_bloom_16_20,prev_bloom_20_24,prev_bloom_24_28,prev_bloom_28_32,prev_bloom_32_36,prev_bloom_36_40
833,kyoto,35.011983,135.676114,44,2022,1.666667,18.166667,8.5,41.333333,0.0,...,90.25,95.25,97.0,96.25,95.0,95.25,99.5,94.0,99.0,76.75
962,liestal,47.4814,7.730519,350,2022,28.833333,14.833333,0.0,3.166667,4.333333,...,87.0,91.75,97.0,98.5,89.0,91.25,91.25,93.0,107.0,79.25
963,vancouver,49.2237,-123.1636,24,2022,67.5,103.666667,8.666667,31.666667,34.0,...,84.25,95.25,93.5,95.25,97.5,88.0,93.25,94.75,95.5,84.25
1065,washingtondc,38.88535,-77.038628,0,2022,61.383333,24.553333,50.376667,23.283333,5.503333,...,88.25,92.25,89.25,89.25,93.5,88.5,91.75,90.25,89.5,72.0


In [160]:
data = [[0.0, 0.0, 0.0, 0.0] for i in range (2022, 2032)]
index = range(2022, 2032)
result = pd.DataFrame(data=data, columns=["kyoto", "liestal", "washingtondc", "vancouver"], index=index)
result.index.name = "year"

In [161]:
best_washingtondc = joblib.load("../best_washingtondc.joblib")
row = enriched.loc[enriched["location"] == "washingtondc"]
row["location"] = 1

for i in range(2022, 2032):
  row["year"] = i
  result_washingtondc = best_washingtondc.predict(row)[0]
  result.loc[i]["washingtondc"] = result_washingtondc


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["location"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["year"] = i


In [162]:
best_kyoto = joblib.load("../best_kyoto.joblib")
row = enriched.loc[enriched["location"] == "kyoto"]
row["location"] = 106

for i in range(2022, 2032):
  row["year"] = i
  result_kyoto = best_kyoto.predict(row)[0]
  result.loc[i]["kyoto"] = result_kyoto

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["location"] = 106
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["year"] = i


In [166]:
best_liestal = joblib.load("../best_liestal.joblib")
row = enriched.loc[enriched["location"] == "liestal"]
row["location"] = 0

for i in range(2022, 2032):
  row["year"] = i
  result_liestal = best_liestal.predict(row)[0]
  result.loc[i]["liestal"] = result_liestal

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["location"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["year"] = i


In [163]:
best_vancouver = joblib.load("../best_vancouver.joblib")
row = enriched.loc[enriched["location"] == location, best_vancouver.feature_name_]

for i in range(2022, 2032):
  row["year"] = i
  result_vancouver = best_vancouver.predict(row)[0]
  result.loc[i]["vancouver"] = result_vancouver

In [164]:
result.round(0).astype(int).to_csv("../cherry-predictions.csv")

In [167]:
result

Unnamed: 0_level_0,kyoto,liestal,washingtondc,vancouver
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022,91.837296,103.719054,94.973525,95.790611
2023,91.837296,103.719054,94.973525,95.790611
2024,91.837296,103.719054,94.973525,95.790611
2025,91.837296,103.719054,94.973525,95.790611
2026,91.837296,103.719054,94.973525,95.790611
2027,91.837296,103.719054,94.973525,95.790611
2028,91.837296,103.719054,94.973525,95.790611
2029,91.837296,103.719054,94.973525,95.790611
2030,91.837296,103.719054,94.973525,95.790611
2031,91.837296,103.719054,94.973525,95.790611
