In [3]:
import datasets
import pyarrow.compute as pc
import numpy as np

from uni2ts.common.env import env
from uni2ts.data.builder.lotsa_v1 import (
    Buildings900KDatasetBuilder,
    BuildingsBenchDatasetBuilder,
    CloudOpsTSFDatasetBuilder,
    CMIP6DatasetBuilder,
    ERA5DatasetBuilder,
    GluonTSDatasetBuilder,
    LargeSTDatasetBuilder,
    LibCityDatasetBuilder,
    OthersLOTSADatasetBuilder,
    ProEnFoDatasetBuilder,
    SubseasonalDatasetBuilder,
)

In this notebook, we will see how to calculate the dataset weighting in the [pre-training dataset config file](../cli/conf/pretrain/data/lotsa_v1_weighted.yaml). We will see how to automatically generate this file to avoid excessive manual labor.

In [4]:
dataset_list = (
    Buildings900KDatasetBuilder.dataset_list
    + BuildingsBenchDatasetBuilder.dataset_list
    + CloudOpsTSFDatasetBuilder.dataset_list
    + CMIP6DatasetBuilder.dataset_list
    + ERA5DatasetBuilder.dataset_list
    + GluonTSDatasetBuilder.dataset_list
    + LargeSTDatasetBuilder.dataset_list
    + LibCityDatasetBuilder.dataset_list
    + OthersLOTSADatasetBuilder.dataset_list
    + ProEnFoDatasetBuilder.dataset_list
    + SubseasonalDatasetBuilder.dataset_list
)

1. Obtain the lengths of all time series from all available datasets.

In [5]:
lengths = {}
for name in dataset_list:
    dataset = datasets.load_from_disk(str(env.LOTSA_V1_PATH / name)).with_format(
        "numpy"
    )
    if dataset[0]["target"].ndim > 1:
        lengths[name] = pc.list_value_length(
            pc.list_flatten(pc.list_slice(dataset.data.column("target"), 0, 1))
        ).to_numpy()
    else:
        lengths[name] = pc.list_value_length(dataset.data.column("target")).to_numpy()

TypeError: unsupported operand type(s) for /: 'NoneType' and 'str'

2. Some datasets have been split into smaller chunks for efficiency -- group them back together

In [6]:
def get_dataset_group(name):
    if name.startswith("era5"):
        return "era5"
    if name.startswith("cmip6"):
        return "cmip6"
    if name.startswith("largest"):
        return "largest"
    return name


group_lengths = {}
for k, v in lengths.items():
    group = get_dataset_group(k)
    if group in group_lengths:
        group_lengths[group] = np.concatenate([group_lengths[group], v])
    else:
        group_lengths[group] = v

3. Compute the weights for each dataset

From the [paper](https://arxiv.org/abs/2402.02592), the sampling distribution for dataset $D_k$ is given by:

\begin{align}
p(D_k) & = \frac{\min(\omega_k, \epsilon)}{\sum_{i=1}^K \min(\omega_i, \epsilon)} \\
\\
\omega_k & = \frac{|D_k|}{\sum_i^K |D_i|} \\
\\
|D_k| & = \sum_{i \in D_k} T_i
\end{align}
where $T_i$ for $i \in D_k$ is the length of time series $i$ belonging to dataset $D_k$.

The default PyTorch sampler samples each dataset $D_k$ based on the number of time series in the dataset, i.e. with probability:

\begin{align}
\frac{\sum_{i \in D_k} 1}{\sum_{j=1}^K \sum_{l \in D_j} 1}
\end{align}

`weight_map` aims to _reweight_ the probability to sample dataset $D_k$ with a multiplier. 

$\mathrm{reweight}_k$, which is calculated as follows:

\begin{align}
\mathrm{reweight}_k = \omega_k * \frac{\sum_{j=1}^K \sum_{l \in D_j} 1}{\sum_{i \in D_k} 1}
\end{align}

Thus, the probability to sample dataset $D_k$ is $\omega_k$.

In [7]:
# Compute \omega_k
total_lengths = np.asarray([v.sum() for v in group_lengths.values()])
omegas = total_lengths / total_lengths.sum()
omegas = np.clip(omegas, a_min=0, a_max=0.001)
weights = omegas / omegas.sum()

In [8]:
# Compute reweight_k
num_ts = np.asarray([v.shape[0] for v in group_lengths.values()])
group_reweights = weights * num_ts.sum() / num_ts
group_reweights = {k: v for k, v in zip(group_lengths.keys(), group_reweights)}
group_size = {
    group: len([k for k in lengths.keys() if get_dataset_group(k) == group])
    for group in group_lengths.keys()
}
reweights = {
    k: group_reweights[group] / group_size[group]
    for k in lengths.keys()
    if (group := get_dataset_group(k))
}

4. Finally, we can generate the YAML file required for the pre-training dataset with the appropriate `weight_map`.

In [9]:
for builder_cls in [
    Buildings900KDatasetBuilder,
    BuildingsBenchDatasetBuilder,
    CloudOpsTSFDatasetBuilder,
    CMIP6DatasetBuilder,
    ERA5DatasetBuilder,
    GluonTSDatasetBuilder,
    LargeSTDatasetBuilder,
    LibCityDatasetBuilder,
    OthersLOTSADatasetBuilder,
    ProEnFoDatasetBuilder,
    SubseasonalDatasetBuilder,
]:
    print(f"- _target_: uni2ts.data.builder.lotsa_v1.{builder_cls.__name__}")
    print("  datasets: ${cls_getattr:${._target_},dataset_list}")
    print("  weight_map:")
    for dataset in builder_cls.dataset_list:
        print(f"    {dataset}: {reweights[dataset]}")
    print("  sample_time_series:")
    print("    _target_: uni2ts.data.dataset.SampleTimeSeriesType")
    print('    _args_: ["proportional"]')

- _target_: uni2ts.data.builder.lotsa_v1.Buildings900KDatasetBuilder
  datasets: ${cls_getattr:${._target_},dataset_list}
  weight_map:


KeyError: 'buildings_900k'