## Categorical values

In [19]:
import copy
import warnings
import numbers
import json
import calendar
import math
import datetime as dt
import numpy as np
from numpy import power
from math import inf

from spotPython.spot import spot
from spotPython.utils.convert import class_for_name
from spotPython.hyperparameters.values import (modify_hyper_parameter_levels,
    modify_hyper_parameter_bounds, get_default_values, get_var_name, get_var_type, get_bound_values,
    get_dict_with_levels_and_types)
from spotPython.utils.transform import transform_hyper_parameter_values

from spotRiver.fun.hyperriver import HyperRiver
from spotRiver.utils.selectors import select_leaf_prediction, select_leaf_model
from spotRiver import data
from spotRiver.data.bike_sharing import get_bike_sharing_data
from spotRiver.data.river_hyper_dict import HyperDict
from spotRiver.utils.assignments import assign_values, iterate_dict_values, convert_keys

from scipy.optimize import differential_evolution
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder , MinMaxScaler
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline , Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate
from sklearn.datasets import fetch_openml
from sklearn.metrics import mean_absolute_error

import river.stream as river_stream
from river.tree.splitter import EBSTSplitter, QOSplitter, TEBSTSplitter, GaussianSplitter, HistogramSplitter
from river.linear_model import LinearRegression, PARegressor, Perceptron
from river.tree import HoeffdingAdaptiveTreeRegressor
from river.preprocessing import StandardScaler
from river.compose import Pipeline
from river import compose
from river import datasets, time_series, utils, compose, linear_model, optim, preprocessing, evaluate, metrics, tree 
from river.datasets import synth
from river import feature_extraction
from river import stats, compose, preprocessing, tree
from river import metrics
# Silence every warning category for the rest of the session. This keeps the
# cell outputs compact, but it also hides deprecation and convergence warnings.
warnings.filterwarnings("ignore")

## Bike Sharing

In [20]:
# Load the bike-sharing data. `train` and `test` are passed to spot through
# `fun_control` further below; `df` is only used for the sample count here.
df, train, test = get_bike_sharing_data()
target_column = "count"
n_samples = df.shape[0]

# Split a deep copy of the training frame into features and target so that
# `train` itself keeps the target column.
X = copy.deepcopy(train)
y = X.pop(target_column)  # use target_column instead of repeating the literal "count"

# NOTE: `data` rebinds the name imported via `from spotRiver import data`.
data = river_stream.iter_pandas(X, y)
dataset = list(data)

categorical_columns = [
    "weather",
    "season",
    "holiday",
    "workingday",
]
# Presumably the levels of `categorical_columns`, listed in the same order —
# TODO confirm against the data set.
categories = [
    ["clear", "misty", "rain"],
    ["spring", "summer", "fall", "winter"],
    ["False", "True"],
    ["False", "True"],
]

# Index window of 100 rows that ends at the midpoint of the test set.
m = test.shape[0]
a = m // 2 - 100
b = m // 2

## Load default `hyper_dict` for all algorithms, hyperparameters and levels

In [21]:
# Alternative (packaged defaults): river_hyper_dict = HyperDict().load()
# Load the local hyper_dict. The file is read as UTF-8 explicitly so that the
# result does not depend on the platform's default text encoding.
with open("river_hyper_dict.json", "r", encoding="utf-8") as f:
    river_hyper_dict = json.load(f)

In [22]:
# Start the control dictionary with the river estimator class that is tuned.
core_model = HoeffdingAdaptiveTreeRegressor
fun_control = {"core_model": core_model}

## Select `algorithm` and `core_model_hyper_dict`

In [23]:
# Attach the hyperparameter description that belongs to the selected model;
# the entry is looked up by the model's class name.
fun_control["core_model_hyper_dict"] = river_hyper_dict[core_model.__name__]

## Modify hyperparameters of type factor

In [24]:
# Restrict both factor hyperparameters to a single level each.
for factor_name, factor_levels in (
    ("leaf_prediction", ["model"]),
    ("leaf_model", ["LinearRegression"]),
):
    fun_control = modify_hyper_parameter_levels(fun_control, factor_name, factor_levels)
# Inspect the result with: fun_control["core_model_hyper_dict"]

## Modify hyperparameter of type numeric

In [25]:
# Set the bounds of `min_samples_split` to 3 and 11.
fun_control = modify_hyper_parameter_bounds(
    fun_control,
    "min_samples_split",
    bounds=[3, 11],
)

## Preprocessing model `prep_model`

In [26]:
# Preprocessing pipeline: select the listed features, add the target mean
# aggregated by `hour` as an extra feature, then standardize the union of both.
prep_model = (
    compose.Select("weekday", "month", "temp", "feel_temp", "humidity", "windspeed")
    + feature_extraction.TargetAgg(by=["hour"], how=stats.Mean())
) | preprocessing.StandardScaler()
fun_control.update({"prep_model": prep_model})
# A bare expression is only rendered when it is the last statement of a cell,
# so the pipeline is displayed here instead of in the middle of the cell.
prep_model

## Get Type and Variable names

In [27]:
# Hyperparameter types and names as derived from `fun_control`.
var_type, var_name = get_var_type(fun_control), get_var_name(fun_control)
var_type


['int',
 'int',
 'float',
 'float',
 'factor',
 'factor',
 'float',
 'factor',
 'int',
 'factor',
 'int',
 'float',
 'factor',
 'float',
 'int',
 'factor',
 'factor',
 'factor']

## Get lower and upper bounds

In [28]:
# Lower and upper search bounds, read from `fun_control`.
lower, upper = (get_bound_values(fun_control, side) for side in ("lower", "upper"))


## Compile fun_control for spot

* **TODO:** Verify that `fun_control` is set up correctly (see the test section below).

In [29]:
# Objective function handed to spot, plus the settings it reads from `fun_control`.
horizon = 7 * 24
oml_grace_period = 2
fun = HyperRiver(seed=123, log_level=50).fun_oml_horizon

fun_control.update(
    data=None,  # dataset,
    train=train,
    test=test,
    target_column=target_column,
    horizon=horizon,
    oml_grace_period=oml_grace_period,
    n_samples=n_samples,
    weights=np.array([1, 1/1000, 1/1000])*10_000.0,
    step=100,
    log_level=50,
    weight_coeff=1.0,
    metric=metrics.MAE(),
    metric_sklearn=mean_absolute_error,
    var_name=var_name,
    var_type=var_type,
    # Same object that was already stored under this key in the prep_model cell.
    prep_model=prep_model,
)

In [30]:
# Display the assembled control dictionary for a visual check.
fun_control

{'core_model': river.tree.hoeffding_adaptive_tree_regressor.HoeffdingAdaptiveTreeRegressor,
 'core_model_hyper_dict': {'grace_period': {'type': 'int',
   'default': 200,
   'transform': 'None',
   'lower': 10,
   'upper': 1000},
  'max_depth': {'type': 'int',
   'default': 20,
   'transform': 'transform_power_2_int',
   'lower': 2,
   'upper': 20},
  'delta': {'type': 'float',
   'default': 1e-07,
   'transform': 'None',
   'lower': 1e-08,
   'upper': 1e-06},
  'tau': {'type': 'float',
   'default': 0.05,
   'transform': 'None',
   'lower': 0.01,
   'upper': 0.1},
  'leaf_prediction': {'levels': ['model'],
   'type': 'factor',
   'default': 'mean',
   'transform': 'None',
   'core_model_parameter_type': 'str',
   'lower': 0,
   'upper': 0},
  'leaf_model': {'levels': ['LinearRegression'],
   'type': 'factor',
   'default': 'LinearRegression',
   'transform': 'None',
   'class_name': 'river.linear_model',
   'core_model_parameter_type': 'instance()',
   'lower': 0,
   'upper': 0},
  'mo

# Test if fun_control is correct

## fun_hyperriver: fun_oml_horizon

* Generate a (simulated) hyperparameter configuration `X`, as `spot()` would provide it:

In [31]:
# Turn the default hyperparameter values into a numeric vector.
# Numeric defaults are cast to float; every non-numeric default (e.g. a string) becomes 0.
X = get_default_values(fun_control)
X = np.array(
    [float(default) if isinstance(default, numbers.Number) else 0 for default in X.values()]
)
X

array([2.0e+02, 2.0e+01, 1.0e-07, 5.0e-02, 0.0e+00, 0.0e+00, 9.5e-01,
       0.0e+00, 5.0e+00, 0.0e+00, 3.0e+02, 5.0e-02, 0.0e+00, 5.0e+02,
       1.0e+06, 0.0e+00, 0.0e+00, 0.0e+00])

In [32]:
# Make `X` a 2-D array with one row per configuration.
# np.atleast_2d leaves an array that is already 2-D untouched, so re-running
# this cell does not keep adding dimensions (np.array([X]) did).
X = np.atleast_2d(X)
# Map each column of `X` to its hyperparameter name.
var_dict = assign_values(X, fun_control["var_name"])
var_dict

{'grace_period': array([200.]),
 'max_depth': array([20.]),
 'delta': array([1.e-07]),
 'tau': array([0.05]),
 'leaf_prediction': array([0.]),
 'leaf_model': array([0.]),
 'model_selector_decay': array([0.95]),
 'splitter': array([0.]),
 'min_samples_split': array([5.]),
 'bootstrap_sampling': array([0.]),
 'drift_window_threshold': array([300.]),
 'switch_significance': array([0.05]),
 'binary_split': array([0.]),
 'max_size': array([500.]),
 'memory_estimate_period': array([1000000.]),
 'stop_mem_management': array([0.]),
 'remove_poor_attrs': array([0.]),
 'merit_preprune': array([0.])}

In [33]:
# Display the hyperparameter types stored in the control dictionary.
fun_control["var_type"]

['int',
 'int',
 'float',
 'float',
 'factor',
 'factor',
 'float',
 'factor',
 'int',
 'factor',
 'int',
 'float',
 'factor',
 'float',
 'int',
 'factor',
 'factor',
 'factor']

In [34]:
# For every configuration in `var_dict`: convert the raw numbers according to
# the variable types, resolve levels/types and apply the transformations, then
# build the pipeline from the preprocessing model and the configured core model.
# `values` and `model` keep the result of the last configuration.
for raw_config in iterate_dict_values(var_dict):
    typed_config = convert_keys(raw_config, fun_control["var_type"])
    print(typed_config)
    values = transform_hyper_parameter_values(
        fun_control=fun_control,
        hyper_parameter_values=get_dict_with_levels_and_types(fun_control=fun_control, v=typed_config),
    )
    print(values)
    core_model_instance = fun_control["core_model"](**values)
    model = compose.Pipeline(fun_control["prep_model"], core_model_instance)

{'grace_period': 200, 'max_depth': 20, 'delta': 1e-07, 'tau': 0.05, 'leaf_prediction': 0, 'leaf_model': 0, 'model_selector_decay': 0.95, 'splitter': 0, 'min_samples_split': 5, 'bootstrap_sampling': 0, 'drift_window_threshold': 300, 'switch_significance': 0.05, 'binary_split': 0, 'max_size': 500.0, 'memory_estimate_period': 1000000, 'stop_mem_management': 0, 'remove_poor_attrs': 0, 'merit_preprune': 0}
{'grace_period': 200, 'max_depth': 1048576, 'delta': 1e-07, 'tau': 0.05, 'leaf_prediction': 'model', 'leaf_model': LinearRegression (
  optimizer=SGD (
    lr=Constant (
      learning_rate=0.01
    )
  )
  loss=Squared ()
  l2=0.
  l1=0.
  intercept_init=0.
  intercept_lr=Constant (
    learning_rate=0.01
  )
  clip_gradient=1e+12
  initializer=Zeros ()
), 'model_selector_decay': 0.95, 'splitter': EBSTSplitter (), 'min_samples_split': 5, 'bootstrap_sampling': 0, 'drift_window_threshold': 300, 'switch_significance': 0.05, 'binary_split': 0, 'max_size': 500.0, 'memory_estimate_period': 100

In [35]:
# Display the hyperparameter dictionary left over from the loop in the previous cell.
values

{'grace_period': 200,
 'max_depth': 1048576,
 'delta': 1e-07,
 'tau': 0.05,
 'leaf_prediction': 'model',
 'leaf_model': LinearRegression (
   optimizer=SGD (
     lr=Constant (
       learning_rate=0.01
     )
   )
   loss=Squared ()
   l2=0.
   l1=0.
   intercept_init=0.
   intercept_lr=Constant (
     learning_rate=0.01
   )
   clip_gradient=1e+12
   initializer=Zeros ()
 ),
 'model_selector_decay': 0.95,
 'splitter': EBSTSplitter (),
 'min_samples_split': 5,
 'bootstrap_sampling': 0,
 'drift_window_threshold': 300,
 'switch_significance': 0.05,
 'binary_split': 0,
 'max_size': 500.0,
 'memory_estimate_period': 1000000,
 'stop_mem_management': 0,
 'remove_poor_attrs': 0,
 'merit_preprune': 0}

In [36]:
from spotRiver.evaluation.eval_bml import eval_oml_horizon

# Evaluate the pipeline built above with the settings stored in `fun_control`.
# These arguments are passed through under their own key names.
horizon_eval_kwargs = {
    key: fun_control[key]
    for key in ("train", "test", "target_column", "horizon", "oml_grace_period")
}
eval_oml_horizon(
    model=model,
    metric=fun_control["metric_sklearn"],
    **horizon_eval_kwargs,
)

(      Metric  Memory (MB)  CompTime (s)
 0        NaN     0.046823      0.002856
 1   0.127892     0.604399      0.064493
 2   0.078384     0.672849      0.270318
 3   0.093580     0.483761      0.259147
 4   0.070639     0.686518      0.120415
 5   0.080238     0.848624      0.320748
 6   0.078098     0.595728      0.296026
 7   0.076007     0.736459      0.101649
 8   0.080057     0.690916      0.338796
 9   0.079941     0.727486      0.116066
 10  0.080650     0.743726      0.377021
 11  0.086142     0.610918      0.337487
 12  0.103953     0.686293      0.235779
 13  0.085606     0.673301      0.351958
 14  0.081359     0.748987      0.323849
 15  0.082154     0.636775      0.263053
 16  0.078334     0.716377      0.092020
 17  0.075139     0.731103      0.252574
 18  0.084469     0.730888      0.147047
 19  0.077674     0.792536      0.118348
 20  0.079568     0.867287      0.469078
 21  0.074213     0.772905      0.379349
 22  0.078001     0.765746      0.117334
 23  0.073521   

# Run SPOT

In [37]:
# Settings for the initial design and for the surrogate model.
design_control = {"init_size": 20, "repeats": 1}
surrogate_control = {
    "noise": True,
    "cod_type": "norm",
    "min_theta": -4,
    "max_theta": 3,
    "n_theta": len(var_name),
    "model_optimizer": differential_evolution,
    "model_fun_evals": 10_000,
    "log_level": 50,
}

# NOTE(review): fun_evals=inf together with max_time=1 presumably means the run
# is bounded by time only — confirm the unit of max_time in the spotPython docs.
spot_htr = spot.Spot(
    fun=fun,
    lower=lower,
    upper=upper,
    fun_evals=inf,
    fun_repeats=1,
    max_time=1,
    noise=False,
    tolerance_x=np.sqrt(np.spacing(1)),
    var_type=var_type,
    var_name=var_name,
    infill_criterion="y",
    n_points=1,
    seed=123,
    log_level=10,
    show_models=False,
    show_progress=True,
    fun_control=fun_control,
    design_control=design_control,
    surrogate_control=surrogate_control,
)
spot_htr.run()

{'grace_period': 125, 'max_depth': 11, 'delta': 9.395921862977558e-07, 'tau': 0.05867032685185599, 'leaf_prediction': 0, 'leaf_model': 0, 'model_selector_decay': 0.9757084234451173, 'splitter': 1, 'min_samples_split': 4, 'bootstrap_sampling': 1, 'drift_window_threshold': 344, 'switch_significance': 0.06899548288099967, 'binary_split': 0, 'max_size': 628.9765929519042, 'memory_estimate_period': 782909, 'stop_mem_management': 1, 'remove_poor_attrs': 1, 'merit_preprune': 1}
{'grace_period': 125, 'max_depth': 2048, 'delta': 9.395921862977558e-07, 'tau': 0.05867032685185599, 'leaf_prediction': 'model', 'leaf_model': LinearRegression (
  optimizer=SGD (
    lr=Constant (
      learning_rate=0.01
    )
  )
  loss=Squared ()
  l2=0.
  l1=0.
  intercept_init=0.
  intercept_lr=Constant (
    learning_rate=0.01
  )
  clip_gradient=1e+12
  initializer=Zeros ()
), 'model_selector_decay': 0.9757084234451173, 'splitter': TEBSTSplitter (
  digits=1
), 'min_samples_split': 4, 'bootstrap_sampling': 1, '

In [None]:
# Plot the progress of the tuning run; log_y=True presumably selects a
# logarithmic y-axis — confirm in the spotPython docs.
spot_htr.plot_progress(log_y=True)

In [None]:
# Print the results of the tuning run.
spot_htr.print_results()

In [None]:
# Print the hyperparameter importance reported by the tuner.
spot_htr.print_importance()