In [1]:
import numpy as np
import pandas as pd
import warnings
from pathlib import Path

from gluonts.dataset.common import load_datasets, TrainDatasets
from tqdm.auto import tqdm

In [2]:
# Name of the tsbench experiment whose downloaded results are analysed below.
experiment = "agts-medium-0-5-1"

# Root directory of the downloaded evaluations (one sub-directory per experiment).
evaluations_path = Path("/home/ubuntu/evaluations/")
# Root directory of the benchmark datasets (one sub-directory per dataset).
datasets_path = Path("/home/ubuntu/data/datasets/")

## Collect AutoGluon leaderboards for each dataset
Before running this section, download the evaluation results (leaderboards only, no forecasts) with:
```bash
tsbench evaluations download \
    --experiment agts-medium-0-5-1 \
    --include_forecasts=False \
    --include_leaderboard=True
```

In [3]:
def get_leaderboards(evaluations_path: Path, experiment: str) -> pd.DataFrame:
    """Collect the AutoGluon leaderboard of every dataset into a single frame.

    The expected layout on disk is
    ``<evaluations_path>/<experiment>/autogluon/<dataset>/<hyperparams>/leaderboard.csv``
    with exactly one hyperparameter directory per dataset.

    Parameters
    ----------
    evaluations_path
        Root directory that holds the downloaded evaluations.
    experiment
        Name of the experiment sub-directory to read.

    Returns
    -------
    pd.DataFrame
        All leaderboards stacked on top of each other. The outermost index
        level is named ``"dataset"`` and holds the dataset directory name; the
        inner level is the first column of each ``leaderboard.csv``.

    Raises
    ------
    AssertionError
        If a dataset directory does not contain exactly one hyperparameter
        directory.
    ValueError
        If no leaderboard was found at all.
    """
    source = Path(evaluations_path) / experiment / "autogluon"
    results = []
    for ds_dir in sorted(source.iterdir()):
        # Stray files next to the dataset directories (e.g. ".DS_Store") are
        # not datasets; calling iterdir() on them would raise.
        if not ds_dir.is_dir():
            continue
        hparam_dirs = [p for p in ds_dir.iterdir() if p.is_dir()]
        assert len(hparam_dirs) == 1, "There should only be 1 hyperparam setting per dataset for AutoGluon"
        lb = pd.read_csv(hparam_dirs[0] / "leaderboard.csv", index_col=0)
        # Wrapping the frame in a one-entry dict prepends the dataset name as
        # an extra (outermost) index level.
        results.append(pd.concat({ds_dir.name: lb}, names=["dataset"]))
    if not results:
        # pd.concat([]) would raise a ValueError as well, but without saying
        # which directory turned out to be empty.
        raise ValueError(f"No AutoGluon leaderboards found under {source}")
    return pd.concat(results)

# Stack the leaderboards of all datasets for the configured experiment.
lbs = get_leaderboards(evaluations_path=evaluations_path, experiment=experiment)
# Dataset names (outermost index level) for which a leaderboard exists.
dsets_with_results = lbs.index.get_level_values(0).unique()

## Dataset statistics

In [4]:
# Collected manually from https://forecastingdata.org/

# Datasets flagged as containing missing values.
HAS_MISSING = {
    "london_smart_meters",
    "wind_farms",
    "bitcoin",
    "vehicle_trips",
    "kdd_2018",
    "nn5",
    "car_parts",
    "rideshare",
    "temperature_rain",
}

# Datasets flagged as multivariate.
IS_MULTIVARIATE = {
    "nn5",
    "solar",
    "electricity",
    "car_parts",
    "fred_md",
    "san_francisco_traffic",
    "rideshare",
    "hospital",
    "covid_deaths",
    "temperature_rain",
}

In [5]:
def get_stats(dset: TrainDatasets, name: str) -> dict:
    """Summarise one dataset as a flat record suitable for a DataFrame row.

    Series counts come from ``dset.train`` and ``dset.test``; the length
    statistics are taken over the ``"target"`` field of the entries in
    ``dset.train``. The ``has_missing``, ``multivariate`` and
    ``tsbench_failed`` flags are plain membership tests of ``name`` against
    the module-level ``HAS_MISSING``, ``IS_MULTIVARIATE`` and
    ``dsets_with_results`` collections.

    Parameters
    ----------
    dset
        Dataset bundle exposing ``train``, ``test`` and ``metadata``.
    name
        Dataset name, stored under the ``"dataset"`` key and used for the
        membership flags.

    Returns
    -------
    dict
        One entry per statistic, in a fixed key order.
    """
    series_lengths = np.array([len(entry["target"]) for entry in dset.train])
    metadata = dset.metadata
    n_train, n_test = len(dset.train), len(dset.test)

    stats = {"dataset": name}
    stats["num_series_train"] = n_train
    stats["num_series_test"] = n_test
    # Differing series counts are taken as the sign of a separate test set.
    stats["separate_test"] = n_train != n_test
    stats["min_len"] = series_lengths.min()
    stats["max_len"] = series_lengths.max()
    stats["prediction_length"] = metadata.prediction_length
    stats["freq"] = metadata.freq
    stats["has_missing"] = name in HAS_MISSING
    stats["multivariate"] = name in IS_MULTIVARIATE
    stats["num_static_feat"] = len(metadata.feat_static_cat) + len(metadata.feat_static_real)
    # No leaderboard for this dataset means tsbench produced no result for it.
    stats["tsbench_failed"] = name not in dsets_with_results
    return stats

# Silence FutureWarnings for the rest of the session. The filter does not
# depend on the dataset, so it is installed once instead of on every iteration.
warnings.filterwarnings(action="ignore", category=FutureWarning)

dataset_stats = []
# Every dataset lives in its own directory; skip stray files so that a README
# or ".DS_Store" inside `datasets_path` does not abort the whole run.
dataset_dirs = sorted(p for p in datasets_path.iterdir() if p.is_dir())
for dset_path in tqdm(dataset_dirs):
    data_dir = dset_path / "gluonts"
    # Load validation set since the train set doesn't contain the validation period
    # (arguments are: metadata directory, "train" split, test split).
    dset = load_datasets(data_dir, data_dir / "val", data_dir / "test")
    stats = get_stats(dset, dset_path.name)
    dataset_stats.append(stats)
# One row per dataset, indexed by dataset name.
stats_df = pd.DataFrame(dataset_stats).set_index("dataset")

  0%|          | 0/44 [00:00<?, ?it/s]

In [14]:
# Rebuild the table so that datasets with tsbench results (`tsbench_failed == False`)
# come first, ordered alphabetically by name within each group.
stats_df = (
    pd.DataFrame(dataset_stats)
    .set_index("dataset")
    .sort_values(by=["tsbench_failed", "dataset"])
)

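Summary statistics computed for each dataset in the benchmark. Note that the "training set" below refers to the validation split (`val`), which is loaded in place of the train split because the latter does not contain the validation period.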
- `num_series_train`: Number of series in the training set.
- `num_series_test`: Number of series in the test set.
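- `separate_test`: If `True`, test series are different from the training series. If `False`, test set consists of continuations of train series. This is inferred from the series counts: it is `True` whenever `num_series_train != num_series_test`.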
- `min_len`: Minimum length of series in the training set.
- `max_len`: Maximum length of series in the training set.
- `prediction_length`: Number of time steps for which the predictions must be generated.
- `freq`: Frequency of measurements in the time series.
- `has_missing`: Contains missing values that were replaced by zeros.
- `multivariate`: Originally a multivariate time series broken down into individual components.
- `num_static_feat`: Number of static features.
- `tsbench_failed`: `tsbench` failed on this dataset or did not finish in 24 hours, i.e. no leaderboard was found for it. **TODO**: figure out why this happened.

In [15]:
# Display the per-dataset statistics table.
stats_df

Unnamed: 0_level_0,num_series_train,num_series_test,separate_test,min_len,max_len,prediction_length,freq,has_missing,multivariate,num_static_feat,tsbench_failed
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
australian_electricity_demand,5,5,False,230688,232224,48,0.5H,False,False,1,False
bitcoin,17,17,False,2629,4551,30,D,True,False,1,False
car_parts,2658,2658,False,39,39,12,M,True,True,1,False
cif_2016,72,72,False,16,108,12,M,False,False,1,False
covid_deaths,230,230,False,182,182,30,D,False,True,1,False
exchange_rate,8,40,True,6071,6071,30,B,False,False,1,False
fred_md,107,107,False,716,716,12,M,False,True,1,False
hospital,767,767,False,72,72,12,M,False,True,1,False
kdd_2018,270,270,False,9456,10872,48,H,True,False,1,False
m1_monthly,617,617,False,30,132,18,M,False,False,1,False


In [16]:
# Save the statistics table (including its `dataset` index) as CSV; the path is
# relative, so the file ends up in the current working directory.
stats_df.to_csv("datasets_stats.csv")

## Quantify overfitting on the validation set

In [9]:
from scipy.stats import kendalltau, spearmanr

In [10]:
# For every dataset with a leaderboard, measure how well the ranking of the
# leaderboard entries by validation score agrees with their ranking by test
# score, using two rank-correlation coefficients.
agreement_records = []
for name in dsets_with_results:
    scores = lbs.loc[name]
    test_scores, val_scores = scores["score_test"], scores["score_val"]
    agreement_records.append(
        {
            "dataset": name,
            "kendalltau": kendalltau(test_scores, val_scores).correlation,
            "spearmanr": spearmanr(test_scores, val_scores).correlation,
        }
    )
agreement = pd.DataFrame(agreement_records)

agreement



Unnamed: 0,dataset,kendalltau,spearmanr
0,australian_electricity_demand,0.866667,0.942857
1,bitcoin,0.066667,0.142857
2,car_parts,0.733333,0.885714
3,cif_2016,0.333333,0.6
4,covid_deaths,1.0,1.0
5,exchange_rate,-0.2,-0.542857
6,fred_md,0.066667,0.2
7,hospital,0.6,0.771429
8,kdd_2018,0.333333,0.485714
9,m1_monthly,0.466667,0.6
