In [1]:
%load_ext autoreload
%autoreload 2
import tempfile
import time
import warnings
from pathlib import Path

import datasets
from tqdm.auto import tqdm

import fev

# Silence all Python warnings so that the tutorial output stays readable.
warnings.simplefilter("ignore")
# Turn off the progress bars of the Hugging Face `datasets` library.
datasets.disable_progress_bars()

## Dataset format
We store time series datasets using the Hugging Face `datasets` library.

We assume that all time series datasets obey the following schema:
- each dataset entry (=row) represents a single (univariate/multivariate) time series
- each entry contains
    - 1/ a field of type `Sequence(timestamp)` that contains the timestamps of observations
    - 2/ at least one field of type `Sequence(float)` that can be used as the target time series
    - 3/ a field of type `string` that contains the unique ID of each time series
- within each entry, all fields of type `Sequence` have the same length

In [2]:
# Load the "monash_kdd_cup_2018" config of the `autogluon/chronos_datasets` repository
ds = datasets.load_dataset("autogluon/chronos_datasets", "monash_kdd_cup_2018", split="train")
# Return NumPy arrays / scalars (instead of Python lists) when rows are accessed
ds.set_format("numpy")
ds

Dataset({
    features: ['id', 'timestamp', 'target', 'city', 'station', 'measurement'],
    num_rows: 270
})

In [3]:
# Each entry (=row) corresponds to a single time series
ds[0]

{'id': np.str_('T000000'),
 'timestamp': array(['2017-01-01T14:00:00.000', '2017-01-01T15:00:00.000',
        '2017-01-01T16:00:00.000', ..., '2018-03-31T13:00:00.000',
        '2018-03-31T14:00:00.000', '2018-03-31T15:00:00.000'],
       dtype='datetime64[ms]'),
 'target': array([453., 417., 395., ..., 132., 158., 118.], dtype=float32),
 'city': np.str_('Beijing'),
 'station': np.str_('aotizhongxin_aq'),
 'measurement': np.str_('PM2.5')}

In [4]:
# Feature metadata is stored in the datasets
ds.features

{'id': Value(dtype='string', id=None),
 'timestamp': Sequence(feature=Value(dtype='timestamp[ms]', id=None), length=-1, id=None),
 'target': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),
 'city': Value(dtype='string', id=None),
 'station': Value(dtype='string', id=None),
 'measurement': Value(dtype='string', id=None)}

## Evaluation on a single Task
A `fev.Task` object contains all information that uniquely identifies a time series forecasting task.

### Data sources
Dataset stored on Hugging Face Hub: https://huggingface.co/datasets/autogluon/chronos_datasets

In [5]:
task = fev.Task(
    dataset_path="autogluon/chronos_datasets",
    dataset_config="monash_cif_2016",
    horizon=12,
)

Dataset stored on S3

In [6]:
# Dataset consisting of a single parquet / arrow file
task = fev.Task(
    dataset_path="s3://autogluon/datasets/timeseries/m1_monthly/data.parquet",
    horizon=12,
)
# Dataset consisting of multiple parquet / arrow files
task = fev.Task(
    dataset_path="s3://autogluon/datasets/timeseries/m1_monthly/*.parquet",
    horizon=12,
)

Dataset stored locally

In [7]:
# Download dataset from HF Hub and save it locally
ds = datasets.load_dataset("autogluon/chronos_datasets", name="m4_hourly", split="train")
# Build the target path from the platform's temp directory instead of hard-coding "/tmp",
# and make sure the parent directory exists before writing.
local_path = Path(tempfile.gettempdir()) / "m4_hourly" / "data.parquet"
local_path.parent.mkdir(parents=True, exist_ok=True)
ds.to_parquet(str(local_path))

# A task can point directly at the local parquet file
task = fev.Task(
    dataset_path=str(local_path),
    horizon=48,
)

### Covariates
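By default, all columns of type `Sequence` (except the timestamp and target columns) are interpreted as known covariates, i.e., their future values are included in `future_data`. All remaining columns (except the ID column) are interpreted as static covariates.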

In [8]:
task = fev.Task(
    dataset_path="autogluon/chronos_datasets_extra",
    dataset_config="ETTh",
    horizon=24,
    target_column="OT",
)
past_data, future_data = task.get_input_data()
print(past_data)
print(future_data)

Dataset({
    features: ['id', 'timestamp', 'HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL', 'OT'],
    num_rows: 2
})
Dataset({
    features: ['id', 'timestamp', 'HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL'],
    num_rows: 2
})


We can configure how the covariates are used as part of the task definition.

For example, here we specify that:
- columns `HUFL` and `HULL` are known only in the past
- columns `MUFL` and `MULL` are excluded from the dataset

In [9]:
task = fev.Task(
    dataset_path="autogluon/chronos_datasets_extra",
    dataset_config="ETTh",
    horizon=24,
    target_column="OT",
    # Known only in the past: kept in `past_data`, absent from `future_data`
    past_dynamic_columns=["HUFL", "HULL"],
    # Removed from both `past_data` and `future_data`
    excluded_columns=["MUFL", "MULL"],
)

# `past_data` holds the history (incl. target); `future_data` holds the forecast-horizon inputs
past_data, future_data = task.get_input_data()
print(past_data)
print(future_data)

Dataset({
    features: ['id', 'timestamp', 'HUFL', 'HULL', 'LUFL', 'LULL', 'OT'],
    num_rows: 2
})
Dataset({
    features: ['id', 'timestamp', 'LUFL', 'LULL'],
    num_rows: 2
})


### Predictions format
Each task expects predictions to follow a certain format that is specified by `task.predictions_schema`.

For point forecasting tasks (i.e., if `quantile_levels=None`), predictions must contain a single array of length `horizon` for each time series.

In [10]:
task = fev.Task(
    dataset_path="autogluon/chronos_datasets",
    dataset_config="m4_hourly",
    horizon=48,
    eval_metric="MASE",
    seasonality=24,
)

In [11]:
task.predictions_schema

{'predictions': Sequence(feature=Value(dtype='float64', id=None), length=48, id=None)}

For probabilistic forecasting tasks (i.e., if `quantile_levels` is provided), predictions must additionally contain a prediction for each quantile level.

In [12]:
task = fev.Task(
    dataset_path="autogluon/chronos_datasets",
    dataset_config="m4_hourly",
    horizon=48,
    seasonality=24,
    quantile_levels=[0.1, 0.5, 0.9],
    eval_metric="WQL",
)

In [13]:
task.predictions_schema

{'predictions': Sequence(feature=Value(dtype='float64', id=None), length=48, id=None),
 '0.1': Sequence(feature=Value(dtype='float64', id=None), length=48, id=None),
 '0.5': Sequence(feature=Value(dtype='float64', id=None), length=48, id=None),
 '0.9': Sequence(feature=Value(dtype='float64', id=None), length=48, id=None)}

## Backtesting & custom cutoffs
By default, the train/test split is generated as follows:
- test set contains the last `horizon` time steps of each time series
- train set contains everything up to the last `horizon` time steps of each time series

We can create the train/test splits at custom points in the time series using the `cutoff` argument.

The default behavior corresponds to setting `cutoff = -horizon`:

In [14]:
task = fev.Task(
    dataset_path="autogluon/chronos_datasets_extra",
    dataset_config="ETTh",
    horizon=24,
    target_column="OT",
    cutoff=-24,
)

We can set `cutoff` to a positive or negative integer. In this case, the training data will correspond to `y[:cutoff]` and the test set will be `y[cutoff : cutoff + horizon]`.

We can also set `cutoff` to a datetime-like string. In this case, `cutoff` will be the last timestamp in the training data.

In [15]:
task = fev.Task(
    dataset_path="autogluon/chronos_datasets_extra",
    dataset_config="ETTh",
    horizon=24,
    target_column="OT",
    # Datetime-like cutoff: the last timestamp that still belongs to the training data
    cutoff="2017-01-01",
)
past_data, future_data = task.get_input_data()
# Inspect the train/test boundary of the first time series
print(f"Last train timestamp: {past_data[0]['timestamp'][-1]}")
print(f"First test timestamp: {future_data[0]['timestamp'][0]}")

Last train timestamp: 2017-01-01T00:00:00.000000000
First test timestamp: 2017-01-01T01:00:00.000000000


We can create tasks corresponding to multiple backtests by providing different values for the `cutoff`:

In [16]:
# Three backtest tasks on the same dataset; only the cutoff date differs between them.
tasks = [
    fev.Task(
        dataset_path="autogluon/chronos_datasets_extra",
        dataset_config="ETTh",
        horizon=24,
        target_column="OT",
        cutoff=cutoff_date,
    )
    for cutoff_date in ("2017-01-01", "2017-02-07", "2017-06-03")
]

The `fev.TaskGenerator` class provides a more concise way to create multiple related configurations, e.g., for backtesting:

In [17]:
# Shared task parameters are given once; each dict in `variants` yields one task
task_generator = fev.TaskGenerator(
    dataset_path="autogluon/chronos_datasets_extra",
    dataset_config="ETTh",
    horizon=24,
    target_column="OT",
    variants=[
        {"cutoff": "2017-01-01"},
        {"cutoff": "2017-02-07"},
        {"cutoff": "2017-06-03"},
    ]
)
tasks = task_generator.generate_tasks()
# Show the train/test boundary of the first time series for every generated task
for i, task in enumerate(tasks):
    print(f"Task {i}")
    past_data, future_data = task.get_input_data()
    print(f"\tLast train timestamp: {past_data[0]['timestamp'][-1]}")
    print(f"\tFirst test timestamp: {future_data[0]['timestamp'][0]}")

Task 0
	Last train timestamp: 2017-01-01T00:00:00.000000000
	First test timestamp: 2017-01-01T01:00:00.000000000
Task 1
	Last train timestamp: 2017-02-07T00:00:00.000000000
	First test timestamp: 2017-02-07T01:00:00.000000000
Task 2
	Last train timestamp: 2017-06-03T00:00:00.000000000
	First test timestamp: 2017-06-03T01:00:00.000000000


If we don't specify `variants`, then `TaskGenerator.generate_tasks()` will produce a single `Task`.

In [18]:
task_generator = fev.TaskGenerator(
    dataset_path="my_dataset",
    dataset_config="my_config",
    horizon=12,
)
task_generator.generate_tasks()

[Task(dataset_path='my_dataset', dataset_config='my_config', horizon=12, cutoff=-12, lead_time=1, min_ts_length=13, max_context_length=None, seasonality=1, eval_metric='MASE', extra_metrics=[], quantile_levels=None, id_column='id', timestamp_column='timestamp', target_column='target', multiple_target_columns=None, past_dynamic_columns=[], excluded_columns=[])]

If we do specify `variants`, then `TaskGenerator.generate_tasks()` will produce one `Task` for each variant in `variants`.

For each variant, the dict provided in `variants` overrides the default parameters of the task.

In [19]:
task_generator = fev.TaskGenerator(
    dataset_path="my_dataset",
    dataset_config="my_config",
    variants=[
        {"horizon": 12},
        {"horizon": 24},
    ]
)
task_generator.generate_tasks()

[Task(dataset_path='my_dataset', dataset_config='my_config', horizon=12, cutoff=-12, lead_time=1, min_ts_length=13, max_context_length=None, seasonality=1, eval_metric='MASE', extra_metrics=[], quantile_levels=None, id_column='id', timestamp_column='timestamp', target_column='target', multiple_target_columns=None, past_dynamic_columns=[], excluded_columns=[]),
 Task(dataset_path='my_dataset', dataset_config='my_config', horizon=24, cutoff=-24, lead_time=1, min_ts_length=25, max_context_length=None, seasonality=1, eval_metric='MASE', extra_metrics=[], quantile_levels=None, id_column='id', timestamp_column='timestamp', target_column='target', multiple_target_columns=None, past_dynamic_columns=[], excluded_columns=[])]

## Evaluation on a Benchmark consisting of multiple tasks
A `fev.Benchmark` object is essentially a collection of `Task`s.

We can create a benchmark from a list of dictionaries. Each dictionary is interpreted as a `fev.TaskGenerator`.

In [20]:
# Each dict uses the same keys as the `fev.TaskGenerator` arguments
task_generators = [
    # Point forecasting task evaluated with MASE (no `variants`, so it yields a single task)
    {
        "dataset_path": "autogluon/chronos_datasets",
        "dataset_config": "monash_m3_monthly",
        "horizon": 18,
        "seasonality": 12,
        "eval_metric": "MASE",
    },
    # Probabilistic forecasting task evaluated with WQL; yields one task per cutoff in `variants`
    {
        "dataset_path": "autogluon/chronos_datasets",
        "dataset_config": "monash_electricity_weekly",
        "horizon": 8,
        "quantile_levels": [0.1, 0.5, 0.9],
        "eval_metric": "WQL",
        "variants": [
            {"cutoff": "2013-01-01"},
            {"cutoff": "2014-01-01"},
        ]
    },
]
benchmark = fev.Benchmark.from_list(task_generators)

Or from a YAML file:

In [21]:
# The YAML file is looked up two directory levels above the `fev` package directory.
# NOTE(review): presumably this only resolves when `fev` is installed from a source checkout — confirm.
benchmark_path = Path(fev.__file__).parents[2] / "benchmarks" / "example" / "tasks.yaml"
# Show contents of the benchmark YAML file (pure Python, so it also works without a
# POSIX shell and for paths that contain spaces)
print(benchmark_path.read_text(encoding="utf-8"))

tasks:
- dataset_path: autogluon/chronos_datasets
  dataset_config: monash_m1_yearly
  horizon: 8
- dataset_path: autogluon/chronos_datasets
  dataset_config: monash_electricity_weekly
  horizon: 8
  seasonality: 1
  variants:
  - cutoff: "2013-01-01"
  - cutoff: "2014-01-01"


In [22]:
benchmark = fev.Benchmark.from_yaml(benchmark_path)

In [23]:
benchmark.tasks

[Task(dataset_path='autogluon/chronos_datasets', dataset_config='monash_m1_yearly', horizon=8, cutoff=-8, lead_time=1, min_ts_length=9, max_context_length=None, seasonality=1, eval_metric='MASE', extra_metrics=[], quantile_levels=None, id_column='id', timestamp_column='timestamp', target_column='target', multiple_target_columns=None, past_dynamic_columns=[], excluded_columns=[]),
 Task(dataset_path='autogluon/chronos_datasets', dataset_config='monash_electricity_weekly', horizon=8, cutoff='2013-01-01T00:00:00', lead_time=1, min_ts_length=9, max_context_length=None, seasonality=1, eval_metric='MASE', extra_metrics=[], quantile_levels=None, id_column='id', timestamp_column='timestamp', target_column='target', multiple_target_columns=None, past_dynamic_columns=[], excluded_columns=[]),
 Task(dataset_path='autogluon/chronos_datasets', dataset_config='monash_electricity_weekly', horizon=8, cutoff='2014-01-01T00:00:00', lead_time=1, min_ts_length=9, max_context_length=None, seasonality=1, ev

Now let's evaluate some simple forecasting models on this toy benchmark.

In [None]:
!pip install statsforecast

In [None]:
from statsforecast.models import ARIMA, SeasonalNaive, Theta


def simple_forecast(task: fev.Task, model_name: str = "naive") -> list[dict]:
    """Produce point forecasts for every time series of `task` with a simple statistical model.

    Parameters
    ----------
    task
        Task that provides the input data, the target column, the horizon and the seasonality.
    model_name
        Name of the model to use: "naive", "seasonal_naive", "theta" or "arima".

    Returns
    -------
    list[dict]
        One dict per time series in the task's past data. Each dict has a single key
        "predictions" that holds the model's mean forecast for the next `task.horizon` steps.
        No quantile forecasts are produced, so the output only fits point forecasting tasks.

    Raises
    ------
    ValueError
        If `model_name` is not one of the supported names.
    """
    # Imported locally because `Naive` is not part of the notebook-level statsforecast import.
    from statsforecast.models import Naive

    # Factories (instead of instances) so that only the requested model is constructed.
    model_factories = {
        "naive": lambda: Naive(),
        "seasonal_naive": lambda: SeasonalNaive(season_length=task.seasonality),
        "theta": lambda: Theta(season_length=task.seasonality),
        "arima": lambda: ARIMA(season_length=task.seasonality),
    }
    # Validate the name before loading any data so that a typo fails fast.
    if model_name not in model_factories:
        raise ValueError(f"Unknown model_name: {model_name}")
    model = model_factories[model_name]()

    # Only the past data is needed: the models forecast from the target history alone.
    past_data, _ = task.get_input_data()
    return [
        {"predictions": model.forecast(y=ts[task.target_column], h=task.horizon)["mean"]}
        for ts in past_data
    ]

In [25]:
# Evaluate every model on every task of the benchmark and collect one summary per (task, model) pair.
summaries = []
for task in tqdm(benchmark.tasks, desc="Tasks completed"):
    for model_name in ["seasonal_naive", "arima", "theta"]:
        # perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed time;
        # unlike time.time() it is not affected by system clock adjustments.
        start_time = time.perf_counter()
        predictions = simple_forecast(task, model_name=model_name)
        infer_time_s = time.perf_counter() - start_time
        # No separate training step is timed here, so the training time is reported as 0.
        eval_summary = task.evaluation_summary(
            predictions,
            model_name=model_name,
            inference_time_s=infer_time_s,
            training_time_s=0.0,
        )

        summaries.append(eval_summary)

Tasks completed:   0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
fev.leaderboard(summaries, baseline_model="seasonal_naive")

Unnamed: 0_level_0,gmean_relative_error,avg_rank,avg_inference_time_s,median_inference_time_s,avg_training_time_s,median_training_time_s,training_corpus_overlap,num_failures
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
theta,0.914107,1.0,3.091147,1.082445,0.0,0.0,0.0,0
seasonal_naive,1.0,2.0,1.968541,2.152631,0.0,0.0,0.0,0
arima,1.870027,3.0,0.168469,0.167763,0.0,0.0,0.0,0


## Dataset adapters

In [27]:
task = fev.Task(
    dataset_path="autogluon/chronos_datasets",
    dataset_config="monash_rideshare",
    horizon=30,
    target_column="price_mean",
    past_dynamic_columns=["distance_mean", "surge_mean"],
    excluded_columns=["price_min", "price_max", "distance_min", "distance_max", "surge_min", "surge_max"],
)

By default, `task.get_input_data()` returns two `datasets.Dataset` objects:
- `past_data` contains all past data including target, timestamps, and covariates
- `future_data` contains future values of timestamps and known covariates

In [28]:
past_data, future_data = task.get_input_data()
print(past_data)
print(future_data)

Dataset({
    features: ['id', 'timestamp', 'source_location', 'provider_name', 'provider_service', 'price_mean', 'distance_mean', 'surge_mean', 'api_calls', 'temp', 'rain', 'humidity', 'clouds', 'wind'],
    num_rows: 156
})
Dataset({
    features: ['id', 'timestamp', 'source_location', 'provider_name', 'provider_service', 'api_calls', 'temp', 'rain', 'humidity', 'clouds', 'wind'],
    num_rows: 156
})


We also provide adapters, available via `fev.convert_input_data()`, that make it easy to convert the data into the formats expected by other frameworks.

In [29]:
from IPython.display import display

# The "nixtla" adapter returns three data frames: the past data, the future values of the
# known covariates, and the static covariates of each time series.
train_df, future_df, static_df = fev.convert_input_data(task, adapter="nixtla")
print("train_df")
display(train_df.head())
print("future_df")
display(future_df.head())
print("static_df")
display(static_df.head())

train_df


Unnamed: 0,unique_id,ds,y,distance_mean,surge_mean,api_calls,temp,rain,humidity,clouds,wind
0,T000000,2018-11-26 06:00:00,16.555555,1.726667,1.055556,9.0,40.627335,0.0,0.913333,0.990667,1.350667
1,T000000,2018-11-26 07:00:00,17.299999,1.69,1.1,10.0,41.137501,0.0,0.92,0.97,1.735
2,T000000,2018-11-26 08:00:00,13.5,1.38,1.0,1.0,40.919998,0.0,0.923333,0.98,1.33
3,T000000,2018-11-26 09:00:00,17.954546,1.920909,1.113636,11.0,40.9375,0.0,0.9275,1.0,1.365
4,T000000,2018-11-26 10:00:00,18.625,2.1225,1.083333,12.0,40.695,0.0,0.94,0.995,1.895


future_df


Unnamed: 0,unique_id,ds,api_calls,temp,rain,humidity,clouds,wind
0,T000000,2018-12-17 13:00:00,10.0,35.169998,0.0,0.9,0.97,7.22
1,T000000,2018-12-17 14:00:00,7.0,36.299999,0.0,0.9,0.92,6.87
2,T000000,2018-12-17 15:00:00,13.0,37.25,0.0,0.87,0.88,7.58
3,T000000,2018-12-17 16:00:00,12.0,39.0,0.0,0.84,1.0,6.28
4,T000000,2018-12-17 17:00:00,9.0,40.009998,0.0,0.81,0.95,6.46


static_df


Unnamed: 0,unique_id,source_location,provider_name,provider_service
0,T000000,Back Bay,Lyft,Lux
1,T000001,Back Bay,Lyft,Lux Black
2,T000002,Back Bay,Lyft,Lux Black XL
3,T000003,Back Bay,Lyft,Lyft
4,T000004,Back Bay,Lyft,Lyft XL


In [30]:
train_dataset, prediction_dataset = fev.convert_input_data(task, adapter="gluonts")
print("train_dataset")
print(train_dataset)
print("prediction_dataset")
print(prediction_dataset)

train_dataset
PandasDataset<size=156, freq=h, num_feat_dynamic_real=6, num_past_feat_dynamic_real=2, num_feat_static_real=0, num_feat_static_cat=3, static_cardinalities=[12.  2. 13.]>
prediction_dataset
PandasDataset<size=156, freq=h, num_feat_dynamic_real=6, num_past_feat_dynamic_real=2, num_feat_static_real=0, num_feat_static_cat=3, static_cardinalities=[12.  2. 13.]>


In [31]:
# The "autogluon" adapter returns the past data (with the static covariates attached as
# `train_df.static_features`) and the future values of the known covariates.
train_df, known_covariates = fev.convert_input_data(task, adapter="autogluon")
print("train_df")
display(train_df)
print("train_df.static_features")
display(train_df.static_features)
print("known_covariates")
display(known_covariates)

train_df


Unnamed: 0_level_0,Unnamed: 1_level_0,price_mean,distance_mean,surge_mean,api_calls,temp,rain,humidity,clouds,wind
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
T000000,2018-11-26 06:00:00,16.555555,1.726667,1.055556,9.0,40.627335,0.000,0.913333,0.990667,1.350667
T000000,2018-11-26 07:00:00,17.299999,1.690000,1.100000,10.0,41.137501,0.000,0.920000,0.970000,1.735000
T000000,2018-11-26 08:00:00,13.500000,1.380000,1.000000,1.0,40.919998,0.000,0.923333,0.980000,1.330000
T000000,2018-11-26 09:00:00,17.954546,1.920909,1.113636,11.0,40.937500,0.000,0.927500,1.000000,1.365000
T000000,2018-11-26 10:00:00,18.625000,2.122500,1.083333,12.0,40.695000,0.000,0.940000,0.995000,1.895000
...,...,...,...,...,...,...,...,...,...,...
T000155,2018-12-17 08:00:00,9.454545,2.230909,1.000000,11.0,37.279999,0.000,0.920000,1.000000,10.670000
T000155,2018-12-17 09:00:00,9.700000,2.447333,1.000000,15.0,36.189999,0.000,0.930000,1.000000,9.760000
T000155,2018-12-17 10:00:00,9.300000,2.203000,1.000000,10.0,34.750000,0.003,0.930000,1.000000,9.950000
T000155,2018-12-17 11:00:00,9.400000,2.139333,1.000000,15.0,34.180000,0.009,0.930000,1.000000,9.240000


train_df.static_features


Unnamed: 0_level_0,source_location,provider_name,provider_service
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T000000,Back Bay,Lyft,Lux
T000001,Back Bay,Lyft,Lux Black
T000002,Back Bay,Lyft,Lux Black XL
T000003,Back Bay,Lyft,Lyft
T000004,Back Bay,Lyft,Lyft XL
...,...,...,...
T000151,West End,Uber,Taxi
T000152,West End,Uber,UberPool
T000153,West End,Uber,UberX
T000154,West End,Uber,UberXL


known_covariates


Unnamed: 0_level_0,Unnamed: 1_level_0,api_calls,temp,rain,humidity,clouds,wind
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
T000000,2018-12-17 13:00:00,10.0,35.169998,0.0,0.90,0.97,7.22
T000000,2018-12-17 14:00:00,7.0,36.299999,0.0,0.90,0.92,6.87
T000000,2018-12-17 15:00:00,13.0,37.250000,0.0,0.87,0.88,7.58
T000000,2018-12-17 16:00:00,12.0,39.000000,0.0,0.84,1.00,6.28
T000000,2018-12-17 17:00:00,9.0,40.009998,0.0,0.81,0.95,6.46
...,...,...,...,...,...,...,...
T000155,2018-12-18 14:00:00,17.0,26.190001,0.0,0.47,0.48,13.89
T000155,2018-12-18 15:00:00,15.0,27.219999,0.0,0.46,0.34,15.03
T000155,2018-12-18 16:00:00,15.0,28.700001,0.0,0.47,0.31,14.60
T000155,2018-12-18 17:00:00,9.0,30.049999,0.0,0.46,0.15,13.55
