# Benchmarking with sktime

The benchmarking module allows you to easily orchestrate benchmarking experiments in which you compare the performance of one or more algorithms over one or more data sets. It also provides a number of statistical tests to check whether observed performance differences are statistically significant.


## Preliminaries

In [1]:
# import required functions and classes
import warnings

from sktime.benchmarking.forecasting_new import (
    ForecastingBenchmark as ForecastingBenchmarkNew,
)
from sktime.datasets import load_airline
from sktime.forecasting.moirai_forecaster import MOIRAIForecaster
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.reconcile import ReconcilerForecaster
from sktime.forecasting.ttm import TinyTimeMixerForecaster
from sktime.performance_metrics.forecasting import (
    MeanAbsoluteError,
)
from sktime.split import SlidingWindowSplitter

# hide warnings: install a global "ignore" filter so that no Python warnings
# are shown from this point on (this also hides deprecation warnings)
warnings.filterwarnings("ignore")



### Univariate Time Series Forecasting Benchmarking

In [2]:
# Start from an empty benchmark for the univariate experiment.
benchmark = ForecastingBenchmarkNew()

# Register the forecasters to compare; no explicit estimator id is given,
# so the benchmark chooses the id that is printed as `model_id` below.
ttm_forecaster = TinyTimeMixerForecaster()
benchmark.add_estimator(ttm_forecaster)
moirai_forecaster = MOIRAIForecaster("sktime/moirai-1.0-R-small")
benchmark.add_estimator(moirai_forecaster)

# Metric(s) evaluated on every cross-validation fold.
scorers = [MeanAbsoluteError(multioutput="raw_values")]

# Single task: airline data evaluated with a sliding-window splitter
# (horizons 1..11, window of 108 observations, step of 12).
airline_cv = SlidingWindowSplitter(
    fh=range(1, 12),
    window_length=108,
    step_length=12,
)
benchmark.add_task(load_airline, airline_cv, scorers)

# Run the benchmark, handing it the path of the JSON results file.
benchmark_result = benchmark.run("./benchmarking_results.json")

# One summary per model: its id, the mean scores, then a separator line.
for result in benchmark_result.results:
    print(result.model_id, result.means, "---", sep="\n")

TinyTimeMixerForecaster
[ScoreResult(name='MeanAbsoluteError', score=array([60.63170647])), ScoreResult(name='fit_time', score=0.8591277083323803), ScoreResult(name='pred_time', score=0.006748305330499231)]
---
MOIRAIForecaster
[ScoreResult(name='MeanAbsoluteError', score=array([52.09134558])), ScoreResult(name='fit_time', score=0.23429156933707418), ScoreResult(name='pred_time', score=0.03190738899623587)]
---
NaiveForecaster
[ScoreResult(name='MeanAbsoluteError', score=array([76.96969697])), ScoreResult(name='fit_time', score=0.0010273609999179218), ScoreResult(name='pred_time', score=0.004412986333287942)]
---


In [3]:
# Extend the benchmark from the previous cell with a NaiveForecaster
# configured with strategy="last".
naive_baseline = NaiveForecaster(strategy="last")
benchmark.add_estimator(naive_baseline)

# Run again with the same JSON results path as in the previous cell.
benchmark_result = benchmark.run("./benchmarking_results.json")

# One summary per model: its id, the mean scores, then a separator line.
for result in benchmark_result.results:
    print(result.model_id, result.means, "---", sep="\n")

TinyTimeMixerForecaster
[ScoreResult(name='MeanAbsoluteError', score=array([60.63170647])), ScoreResult(name='fit_time', score=0.8591277083323803), ScoreResult(name='pred_time', score=0.006748305330499231)]
---
MOIRAIForecaster
[ScoreResult(name='MeanAbsoluteError', score=array([52.09134558])), ScoreResult(name='fit_time', score=0.23429156933707418), ScoreResult(name='pred_time', score=0.03190738899623587)]
---
NaiveForecaster
[ScoreResult(name='MeanAbsoluteError', score=array([76.96969697])), ScoreResult(name='fit_time', score=0.0010273609999179218), ScoreResult(name='pred_time', score=0.004412986333287942)]
---


## Hierarchical Forecasting Benchmark

In [None]:
# Fresh benchmark object for the hierarchical experiment.
benchmark = ForecastingBenchmarkNew()

# Two reconcilers sharing method="mint_shrink"; they differ only in the
# strategy of the wrapped NaiveForecaster ("last" vs. "drift").
reconciler_1 = ReconcilerForecaster(
    NaiveForecaster(strategy="last"),
    method="mint_shrink",
)
forecaster = NaiveForecaster(strategy="drift")
reconciler_2 = ReconcilerForecaster(forecaster, method="mint_shrink")

# The first estimator is registered without an explicit id; the second gets
# an explicit id so the two reconcilers can be told apart in the results.
benchmark.add_estimator(reconciler_1)
benchmark.add_estimator(reconciler_2, estimator_id="Reconciler_2")


# TODO Data generation needs to be prettier
from sktime.transformations.hierarchical.aggregate import Aggregator
from sktime.utils._testing.hierarchical import _bottom_hier_datagen


def get_data():
    """Return the dataset used by the hierarchical benchmark task.

    Bottom-level data is generated with ``_bottom_hier_datagen`` (3 bottom
    nodes, 1 level, length 9, fixed seed 123) and then passed through
    ``Aggregator().fit_transform``.

    NOTE(review): the aggregator presumably adds the aggregated (total)
    levels on top of the bottom-level series -- confirm against sktime docs.

    Returns
    -------
    y
        The output of ``Aggregator.fit_transform`` on the generated data.
    """
    bottom_level = _bottom_hier_datagen(
        no_levels=1,
        no_bottom_nodes=3,
        length=9,
        random_seed=123,
    )
    return Aggregator().fit_transform(bottom_level)


# MAE with multilevel="raw_values"; the printed scores below are arrays with
# one row per series (presumably one per hierarchy node -- confirm).
scorers = [MeanAbsoluteError(multilevel="raw_values")]

# Sliding windows of 4 observations, moved by 2, forecasting 1-3 steps ahead.
splitter = SlidingWindowSplitter(fh=[1, 2, 3], window_length=4, step_length=2)

# The data loader is passed as a callable, not as already-loaded data.
benchmark.add_task(get_data, splitter, scorers)

# Run the benchmark, handing it the path of the JSON results file.
benchmark_result = benchmark.run("./hierachical_benchmark.json")

# One summary per model: its id, the mean scores, then a separator line.
for result in benchmark_result.results:
    print(result.model_id, result.means, "---", sep="\n")

ReconcilerForecaster
[ScoreResult(name='MeanAbsoluteError', score=array([[190.42989889],
       [ 10.        ],
       [136.42135635],
       [ 44.00854254]])), ScoreResult(name='fit_time', score=0.03225533350268961), ScoreResult(name='pred_time', score=0.01662606250101817)]
---
Reconciler_2
[ScoreResult(name='MeanAbsoluteError', score=array([[145.99613686],
       [  7.66666667],
       [104.58962157],
       [ 33.73984863]])), ScoreResult(name='fit_time', score=0.08565895849824301), ScoreResult(name='pred_time', score=0.008330624998052372)]
---
