# Benchmarking with sktime

The benchmarking module allows you to easily orchestrate benchmarking experiments in which you want to compare the performance of one or more algorithms over one or more data sets. It also provides a number of statistical tests to check if observed performance differences are statistically significant.


## Preliminaries

In [1]:
# import required functions and classes
import warnings

from sktime.benchmarking.forecasting_new import (
    ForecastingBenchmark as ForecastingBenchmarkNew,
)
from sktime.datasets import load_airline
from sktime.forecasting.moirai_forecaster import MOIRAIForecaster
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.reconcile import ReconcilerForecaster
from sktime.forecasting.ttm import TinyTimeMixerForecaster
from sktime.performance_metrics.forecasting import (
    MeanAbsoluteError, MeanSquaredError
)
from sktime.split import SlidingWindowSplitter

# hide warnings
warnings.filterwarnings("ignore")



### Univariate Time Series Forecasting Benchmarking

In [2]:
# Start from an empty benchmark; estimators and tasks are registered on it below.
benchmark = ForecastingBenchmarkNew()


## Add estimators
ttm_forecaster = TinyTimeMixerForecaster()
moirai_forecaster = MOIRAIForecaster("sktime/moirai-1.0-R-small")
benchmark.add_estimator(ttm_forecaster)
benchmark.add_estimator(moirai_forecaster)


# TODO need rerun if new metric is added
# TODO need to handle metrics with same name
# TODO handle global forecasting
# TODO Add other tasks (classification, anomaly detection, etc)
# TODO ResultObject to dataframe method.

# Metrics handed to the task; the two differ in metric class and in `by_index`.
scorers = [
    MeanAbsoluteError(multilevel="raw_values"),
    MeanSquaredError(multilevel="raw_values", by_index=True),
]

# The dataset is given as the loader callable `load_airline`, not as loaded data.
airline_cv = SlidingWindowSplitter(
    fh=range(1, 12),
    window_length=108,
    step_length=12,
)
benchmark.add_task(load_airline, airline_cv, scorers)

# NOTE(review): the JSON path is presumably where results are persisted - confirm.
benchmark_result = benchmark.run("./benchmarking_results.json")

# One entry per registered estimator: its id, its mean scores, then a separator.
for result in benchmark_result.results:
    print(result.model_id, result.means, "---", sep="\n")


TinyTimeMixerForecaster
[ScoreResult(name='MeanAbsoluteError', score=array([60.63170647])), ScoreResult(name='fit_time', score=0.9096122640088046), ScoreResult(name='pred_time', score=0.008292305671299497), ScoreResult(name='MeanSquaredError', score=array([[  722.77371458],
       [ 1981.52833039],
       [  340.57998238],
       [  292.40980806],
       [  725.22844361],
       [ 7601.74786264],
       [27253.51022393],
       [28027.72359794],
       [ 5546.53886084],
       [ 1621.03561193],
       [  377.11442601]]))]
---
MOIRAIForecaster
[ScoreResult(name='MeanAbsoluteError', score=array([51.64224891])), ScoreResult(name='fit_time', score=0.24815930565819144), ScoreResult(name='pred_time', score=0.036989958345657215), ScoreResult(name='MeanSquaredError', score=array([[  408.61350485],
       [ 2965.37288656],
       [ 1207.73995765],
       [ 1216.85289357],
       [  743.90950801],
       [ 1972.86096729],
       [15208.95814726],
       [16816.04782807],
       [ 1783.51137201],

In [3]:
# Extend the existing benchmark with a last-value naive forecaster.
naive_last = NaiveForecaster(strategy="last")
benchmark.add_estimator(naive_last)

# Run again with the same results path that the earlier run used.
benchmark_result = benchmark.run("./benchmarking_results.json")

# Print id and mean scores for every estimator in the result, with a separator.
for result in benchmark_result.results:
    print(result.model_id, result.means, "---", sep="\n")

TinyTimeMixerForecaster
[ScoreResult(name='MeanAbsoluteError', score=array([60.63170647])), ScoreResult(name='fit_time', score=0.9096122640088046), ScoreResult(name='pred_time', score=0.008292305671299497), ScoreResult(name='MeanSquaredError', score=array([[  722.77371458],
       [ 1981.52833039],
       [  340.57998238],
       [  292.40980806],
       [  725.22844361],
       [ 7601.74786264],
       [27253.51022393],
       [28027.72359794],
       [ 5546.53886084],
       [ 1621.03561193],
       [  377.11442601]]))]
---
MOIRAIForecaster
[ScoreResult(name='MeanAbsoluteError', score=array([51.64224891])), ScoreResult(name='fit_time', score=0.24815930565819144), ScoreResult(name='pred_time', score=0.036989958345657215), ScoreResult(name='MeanSquaredError', score=array([[  408.61350485],
       [ 2965.37288656],
       [ 1207.73995765],
       [ 1216.85289357],
       [  743.90950801],
       [ 1972.86096729],
       [15208.95814726],
       [16816.04782807],
       [ 1783.51137201],

# The following cell shows that data can also be passed directly, either as a tuple containing y and X or as a y object alone

In [8]:
# Fresh benchmark for the "data passed directly" example.
benchmark = ForecastingBenchmarkNew()


## Add estimators
ttm_forecaster = TinyTimeMixerForecaster()
# TODO: using MOIRAI fails here; probably an issue with MOIRAI.
moirai_forecaster = MOIRAIForecaster("sktime/moirai-1.0-R-small")
naive_last = NaiveForecaster(strategy="last")
benchmark.add_estimator(ttm_forecaster)
benchmark.add_estimator(moirai_forecaster)
benchmark.add_estimator(naive_last)


# TODO need rerun if new metric is added
# TODO need to handle metrics with same name
# TODO handle global forecasting
# TODO Add other tasks (classification, anomaly detection, etc)
# TODO ResultObject to dataframe method.

scorers = [
    MeanAbsoluteError(multilevel="raw_values"),
    MeanSquaredError(multilevel="raw_values", by_index=True),
]

# Variant kept for reference: pass only the loaded series as the dataset.
# benchmark.add_task(
#     load_airline(),
#     SlidingWindowSplitter(range(1, 12), 108, 12),
#     scorers,
# )

# Variant in use: pass a two-element tuple; both elements come from separate
# load_airline() calls, hence the task id "Y=X".
y = load_airline()
X = load_airline()
benchmark.add_task(
    (y, X),
    SlidingWindowSplitter(fh=range(1, 12), window_length=108, step_length=12),
    scorers,
    task_id="Y=X",
)

# This run uses a results path different from the one used in the earlier cells.
benchmark_result = benchmark.run("./benchmarking_results_additional.json")

for result in benchmark_result.results:
    print(result.model_id, result.means, "---", sep="\n")


KeyboardInterrupt: 

## Hierarchical Forecasting Benchmark

In [5]:
benchmark = ForecastingBenchmarkNew()

# First candidate: "mint_shrink" reconciliation around a last-value naive
# forecaster, registered without an explicit id.
forecaster = NaiveForecaster(strategy="last")
reconciler_1 = ReconcilerForecaster(forecaster=forecaster, method="mint_shrink")
benchmark.add_estimator(reconciler_1)

# Second candidate: same reconciliation method around a drift forecaster.
# NOTE(review): the explicit estimator_id presumably keeps the two
# ReconcilerForecaster entries distinguishable in the results - confirm.
forecaster = NaiveForecaster(strategy="drift")
reconciler_2 = ReconcilerForecaster(forecaster=forecaster, method="mint_shrink")
benchmark.add_estimator(reconciler_2, estimator_id="Reconciler_2")


# TODO Data generation needs to be prettier
from sktime.transformations.hierarchical.aggregate import Aggregator
from sktime.utils._testing.hierarchical import _bottom_hier_datagen


def get_data():
    """Return the hierarchical example data used by the benchmark task.

    Bottom-level data is generated with ``_bottom_hier_datagen`` (3 bottom
    nodes, 1 level, 9 time points, fixed seed 123) and then passed through
    ``Aggregator().fit_transform``.

    Returns
    -------
    The output of ``Aggregator.fit_transform`` on the generated data.
    """
    aggregator = Aggregator()
    bottom_level = _bottom_hier_datagen(
        no_bottom_nodes=3,
        no_levels=1,
        random_seed=123,
        length=9,
    )
    return aggregator.fit_transform(bottom_level)


# NOTE(review): both scorers are MeanAbsoluteError and differ only in
# `by_index`; metrics sharing a name may collide in the results - confirm that
# both are actually reported.
scorers = [
    MeanAbsoluteError(multilevel="raw_values"),
    MeanAbsoluteError(multilevel="raw_values", by_index=True),
]

splitter = SlidingWindowSplitter(
    fh=[1, 2, 3],
    window_length=4,
    step_length=2,
)

# The task receives the `get_data` callable itself rather than its return value.
benchmark.add_task(get_data, splitter, scorers)

benchmark_result = benchmark.run("./hierachical_benchmark.json")

# Print id and mean scores for every estimator in the result, with a separator.
for result in benchmark_result.results:
    print(result.model_id, result.means, "---", sep="\n")

ReconcilerForecaster
[ScoreResult(name='MeanAbsoluteError', score=array([199.95165371, 180.90826979, 190.42977317,  10.5       ,
         9.5       ,  10.        , 143.24260979, 129.60019271,
       136.42126655,  46.20904392,  41.80807708,  44.00850662])), ScoreResult(name='fit_time', score=0.03578204198856838), ScoreResult(name='pred_time', score=0.019599416496930644)]
---
Reconciler_2
[ScoreResult(name='MeanAbsoluteError', score=array([244.38584522, 155.51837348,  38.08419188,  12.83333333,
         8.16666667,   2.        , 175.07465135, 111.41121915,
        27.2829942 ,  56.47786054,  35.94048766,   8.80119768])), ScoreResult(name='fit_time', score=0.09586581200710498), ScoreResult(name='pred_time', score=0.009146374504780397)]
---
