Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUG] fix temporal_train_test_split for hierarchical and panel data in case where fh is not passed #5330

Merged
merged 46 commits into from Oct 7, 2023
Merged
Show file tree
Hide file tree
Changes from 44 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
097f5a6
Update expandinggreedy.py
fkiraly Sep 29, 2023
75f8c77
Update expandinggreedy.py
fkiraly Sep 29, 2023
b5c7912
reversed greedy
fkiraly Sep 29, 2023
69a93f5
temporal fix
fkiraly Sep 29, 2023
234a361
Update expandinggreedy.py
fkiraly Sep 29, 2023
5f47f51
own class
fkiraly Sep 29, 2023
504df4e
linting
fkiraly Sep 29, 2023
2c56c0f
treat hierarchical splitter
fkiraly Sep 29, 2023
02a642a
Update temporal_train_test_split.py
fkiraly Sep 29, 2023
0425e15
Update temporal_train_test_split.py
fkiraly Sep 29, 2023
153171a
typo
fkiraly Sep 29, 2023
d7285a3
Merge branch 'expandinggreedysplitter-float' into temporal_train_test…
fkiraly Sep 29, 2023
c9fe290
Update expandinggreedy.py
fkiraly Sep 29, 2023
4ec9a10
Update temporal_train_test_split.py
fkiraly Sep 29, 2023
0357421
Update temporal_train_test_split.py
fkiraly Sep 29, 2023
e536bff
Update expandinggreedy.py
fkiraly Sep 29, 2023
c1ec099
Merge branch 'expandinggreedysplitter-float' into temporal_train_test…
fkiraly Sep 29, 2023
24c2b89
Merge branch 'main' into temporal_train_test_split-hierarchical
fkiraly Oct 1, 2023
fa4d02a
fixes
fkiraly Oct 1, 2023
fb591ee
docstrig
fkiraly Oct 1, 2023
58c4f60
Update test_temporaltraintest.py
fkiraly Oct 1, 2023
6ffe1b4
Update test_temporaltraintest.py
fkiraly Oct 1, 2023
b8dda31
linting
fkiraly Oct 1, 2023
7cf6c54
Merge branch 'test-temporal_train_test' into temporal_train_test_spli…
fkiraly Oct 1, 2023
0d11ad7
Update temporal_train_test_split.py
fkiraly Oct 1, 2023
56be74b
Update temporal_train_test_split.py
fkiraly Oct 1, 2023
1fd85b5
Update temporal_train_test_split.py
fkiraly Oct 1, 2023
0986c71
fixes
fkiraly Oct 1, 2023
c61ce21
Update temporal_train_test_split.py
fkiraly Oct 1, 2023
641437c
linting
fkiraly Oct 1, 2023
fe47289
revert expanding greedy
fkiraly Oct 1, 2023
292f996
X split
fkiraly Oct 1, 2023
4ad2580
docstring
fkiraly Oct 1, 2023
f4d32a5
Update temporal_train_test_split.py
fkiraly Oct 1, 2023
5334209
Revert "Revert "export""
fkiraly Oct 1, 2023
6de3de0
Update __init__.py
fkiraly Oct 1, 2023
bbee8a8
Update temporal_train_test_split.py
fkiraly Oct 1, 2023
18cebc5
docstring
fkiraly Oct 2, 2023
ce2592c
Update temporal_train_test_split.py
fkiraly Oct 2, 2023
612e408
test for hierarchical
fkiraly Oct 2, 2023
2418aac
Update test_temporaltraintest.py
fkiraly Oct 2, 2023
f2b9571
Update temporal_train_test_split.py
fkiraly Oct 2, 2023
a7a2250
Update temporal_train_test_split.py
fkiraly Oct 2, 2023
4a8da32
Update temporal_train_test_split.py
fkiraly Oct 2, 2023
3a97912
Merge branch 'main' into temporal_train_test_split-hierarchical
fkiraly Oct 2, 2023
efa1dec
remove incorrect osuleaf example
fkiraly Oct 2, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 5 additions & 1 deletion sktime/split/__init__.py
Expand Up @@ -7,6 +7,7 @@
"SameLocSplitter",
"SingleWindowSplitter",
"SlidingWindowSplitter",
"TemporalTrainTestSplitter",
"TestPlusTrainSplitter",
"temporal_train_test_split",
]
Expand All @@ -17,5 +18,8 @@
from sktime.split.sameloc import SameLocSplitter
from sktime.split.singlewindow import SingleWindowSplitter
from sktime.split.slidingwindow import SlidingWindowSplitter
from sktime.split.temporal_train_test_split import temporal_train_test_split
from sktime.split.temporal_train_test_split import (
TemporalTrainTestSplitter,
temporal_train_test_split,
)
from sktime.split.testplustrain import TestPlusTrainSplitter
295 changes: 217 additions & 78 deletions sktime/split/temporal_train_test_split.py
Expand Up @@ -2,14 +2,18 @@
# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
"""Implement cutoff dataset splitting for model evaluation and selection."""

__all__ = ["temporal_train_test_split"]
__all__ = [
"temporal_train_test_split",
"TemporalTrainTestSplitter",
]

import math
from typing import Optional

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as _train_test_split

from sktime.datatypes._utilities import get_time_index
from sktime.split.base import BaseSplitter
from sktime.split.base._config import (
ACCEPTED_Y_TYPES,
FORECASTING_HORIZON_TYPES,
Expand All @@ -24,105 +28,240 @@ def temporal_train_test_split(
test_size: Optional[float] = None,
train_size: Optional[float] = None,
fh: Optional[FORECASTING_HORIZON_TYPES] = None,
anchor: str = "start",
) -> SPLIT_TYPE:
"""Split arrays or matrices into sequential train and test subsets.
"""Split time series data containers into a single train/test split.

Creates train/test splits over endogenous arrays an optional exogenous
arrays.
Splits time series ``y`` into a single temporally ordered train and test split.
The split is based on ``test_size`` and ``train_size`` parameters,
yarnabrina marked this conversation as resolved.
Show resolved Hide resolved
which can signify fractions of total number of indices,
or an absolute number of integers to cut.

This is a wrapper of scikit-learn's ``train_test_split`` that
does not shuffle the data.
If the data contains multiple time series (Panel or Hierarchical),
fractions and train-test sets will be computed per individual time series.

If ``X`` is provided, will also produce a single train/test split of ``X``,
at the same ``loc`` indices as ``y``. If non-``pandas`` based containers are used,
will use ``iloc`` index instead.

Parameters
----------
y : time series in sktime compatible data container format
X : time series in sktime compatible data container format, optional, default=None
y and X can be in one of the following formats:
Series scitype: pd.Series, pd.DataFrame, or np.ndarray (1D or 2D)
for vanilla forecasting, one time series
Panel scitype: pd.DataFrame with 2-level row MultiIndex,
3D np.ndarray, list of Series pd.DataFrame, or nested pd.DataFrame
for global or panel forecasting
Hierarchical scitype: pd.DataFrame with 3 or more level row MultiIndex
for hierarchical forecasting
Number of columns admissible depend on the "scitype:y" tag:
if self.get_tag("scitype:y")=="univariate":
y must have a single column/variable
if self.get_tag("scitype:y")=="multivariate":
y must have 2 or more columns
if self.get_tag("scitype:y")=="both": no restrictions on columns apply
For further details:
on usage, see forecasting tutorial examples/01_forecasting.ipynb
on specification of formats, examples/AA_datatypes_and_datasets.ipynb
test_size : float, int or None, optional (default=None)
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
relative number of test samples. If None, the value is set to the
complement of the train size. If ``train_size`` is also None, it will
be set to 0.25.
If float, must be between 0.0 and 1.0, and is interpreted as the proportion
of the dataset to include in the test split. Proportions are rounded to the
next higher integer count of samples (ceil).
If int, is interpreted as total number of test samples.
If None, the value is set to the complement of the train size.
If ``train_size`` is also None, it will be set to 0.25.
train_size : float, int, or None, (default=None)
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the train split. If
int, represents the relative number of train samples. If None,
the value is automatically set to the complement of the test size.
If float, must be between 0.0 and 1.0, and is interpreted as the proportion
of the dataset to include in the train split. Proportions are rounded to the
next lower integer count of samples (floor).
If int, is interpreted as total number of train samples.
If None, the value is set to the complement of the test size.
fh : ForecastingHorizon
A forecast horizon to use for splitting, alternative specification for test set.
If given, ``test_size`` and ``train_size`` cannot also be specified and must
be None. If ``fh`` is passed, the test set will be:
if ``fh.is_relative``: the last possible indices to match ``fh`` within ``y``
if ``not fh.is_relative``: the indices at the absolute index of ``fh``
anchor : str, "start" (default) or "end"
determines behaviour if train and test sizes do not sum up to all data
used only if ``fh=None`` and both ``test_size`` and ``train_size`` are not None
if "start", cuts train and test set from start of available series
if "end", cuts train and test set from end of available series

Returns
-------
splitting : tuple, length=2 * len(arrays)
List containing train-test split of `y` and `X` if given.
splitting : tuple, length = 2 * len(arrays)
Tuple containing train-test split of `y`, and `X` if given.
if ``X is None``, returns ``(y_train, y_test)``.
Else, returns ``(y_train, y_test, X_train, X_test)``.

References
----------
.. [1] adapted from https://github.com/alkaline-ml/pmdarima/
.. [1] originally adapted from https://github.com/alkaline-ml/pmdarima/

Examples
--------
>>> from sktime.datasets import load_airline
>>> from sktime.split import temporal_train_test_split
>>> y = load_airline()
>>> y_train, y_test = temporal_train_test_split(y, test_size=0.2)

The function can also be applied to panel or hierarchical data,
in this case the split will be applied per individual time series:
>>> from sktime.utils._testing.hierarchical import _make_hierarchical
>>> y = _make_hierarchical()
>>> y_train, y_test = temporal_train_test_split(y, test_size=0.2)
"""
# the code has two disjoint branches, one for fh and one for test_size/train_size

# branch 1: fh is not None, use fh to split
# this assumes (or enforces) that test_size and train_size are None
if fh is not None:
if test_size is not None or train_size is not None:
raise ValueError(
"If `fh` is given, `test_size` and `train_size` cannot "
"also be specified."
)
return _split_by_fh(y, fh, X=X)
else:
pd_format = isinstance(y, pd.Series) or isinstance(y, pd.DataFrame)
if pd_format is True and isinstance(y.index, pd.MultiIndex):
ys = get_time_index(y)
# Get index to group across (only indices other than timepoints index)
yi_name = y.index.names
yi_grp = yi_name[0:-1]

# Get split into test and train data for timeindex only
series = (ys,)
yret = _train_test_split(
*series,
shuffle=False,
stratify=None,
test_size=test_size,
train_size=train_size,
)

# Convert into list indices
ysl = ys.to_list()
yrl1 = yret[0].to_list()
yrl2 = yret[1].to_list()
p1 = [index for (index, item) in enumerate(ysl) if item in yrl1]
p2 = [index for (index, item) in enumerate(ysl) if item in yrl2]

# Subset by group based on identified indices
y_train = y.groupby(yi_grp, as_index=False).nth(p1)
y_test = y.groupby(yi_grp, as_index=False).nth(p2)
if X is not None:
X_train = X.groupby(yi_grp, as_index=False).nth(p1)
X_test = X.groupby(yi_grp, as_index=False).nth(p2)
return y_train, y_test, X_train, X_test
else:
return y_train, y_test
# branch 2: fh is None, use test_size and train_size to split
# from the above, we know that fh is None
temporal_splitter = TemporalTrainTestSplitter(
test_size=test_size, train_size=train_size, anchor=anchor
)

y_train, y_test = list(temporal_splitter.split_series(y))[0]

# if X is None, return y_train, y_test
if X is None:
return y_train, y_test

# if X is not None, split X as well
# the split of X uses the same indices as the split of y
from sktime.split import SameLocSplitter

X_splitter = SameLocSplitter(temporal_splitter, y)
X_train, X_test = list(X_splitter.split_series(X))[0]

return y_train, y_test, X_train, X_test


class TemporalTrainTestSplitter(BaseSplitter):
    r"""Temporal train-test splitter, based on sample sizes of train or test set.

    Cuts test and train sets from the start or end of available data,
    based on ``test_size`` and ``train_size`` parameters,
    which can signify fractions of the total number of indices,
    or an absolute number of indices to cut.

    If the data contains multiple time series (Panel or Hierarchical),
    fractions and train-test sets will be computed per individual time series.

    Parameters
    ----------
    test_size : float, int or None, optional (default=None)
        If float, must be between 0.0 and 1.0, and is interpreted as the proportion
        of the dataset to include in the test split. Proportions are rounded to the
        next higher integer count of samples (ceil).
        If int, is interpreted as total number of test samples.
        If None, the value is set to the complement of the train size.
        If ``train_size`` is also None, it will be set to 0.25.
    train_size : float, int, or None, (default=None)
        If float, must be between 0.0 and 1.0, and is interpreted as the proportion
        of the dataset to include in the train split. Proportions are rounded to the
        next lower integer count of samples (floor).
        If int, is interpreted as total number of train samples.
        If None, the value is set to the complement of the test size.
    anchor : str, "start" (default) or "end"
        determines behaviour if train and test sizes do not sum up to all data
        if "start", cuts train and test set from start of available series
        if "end", cuts train and test set from end of available series
        Only consulted if both ``test_size`` and ``train_size`` are given;
        if only ``test_size`` (or neither) is given, the split is anchored at the
        end, if only ``train_size`` is given, it is anchored at the start.

    Examples
    --------
    >>> import numpy as np
    >>> from sktime.split import TemporalTrainTestSplitter
    >>> ts = np.arange(10)
    >>> splitter = TemporalTrainTestSplitter(test_size=0.3)
    >>> list(splitter.split(ts)) # doctest: +SKIP
    """

    # NOTE(review): presumably signals to the base class that ``_split`` handles
    # a single series only, so that panel/hierarchical data is split per
    # instance by the framework - confirm against ``BaseSplitter``.
    _tags = {"split_hierarchical": False}

    def __init__(self, train_size=None, test_size=None, anchor="start"):
        self.train_size = train_size
        self.test_size = test_size
        self.anchor = anchor
        super().__init__()

    def _split(self, y: pd.Index):
        """Yield the single train/test split as integer position arrays.

        Parameters
        ----------
        y : pd.Index
            Index of the time series to split; only its length is used.

        Yields
        ------
        y_train_ix : np.ndarray of int
            Positions (iloc) of the training window.
        y_test_ix : np.ndarray of int
            Positions (iloc) of the test window, directly following the
            training window.
        """
        test_size = self.test_size
        train_size = self.train_size
        anchor = self.anchor

        len_y = len(y)

        if test_size is None and train_size is None:
            test_size = 0.25

        # if only one of the sizes is given, the anchor is implied:
        # the given set is cut first, the other set is its complement
        if train_size is None:
            anchor = "end"
        if test_size is None:
            anchor = "start"

        # fractions -> absolute counts; test rounds up, train rounds down
        if isinstance(test_size, float):
            test_size = math.ceil(test_size * len_y)
        if isinstance(train_size, float):
            train_size = math.floor(train_size * len_y)
        if test_size is None:
            test_size = len_y - train_size
        if train_size is None:
            train_size = len_y - test_size

        # Clamp both sizes to [0, available], then derive explicit window
        # positions. Explicit positions are used instead of negative slices
        # (``ix[-k:]`` / ``ix[:-k]``) because ``-0 == 0``: with ``k == 0`` those
        # slices select everything / nothing, i.e. exactly the wrong set.
        if anchor == "end":
            test_size = max(0, min(len_y, test_size))
            train_size = max(0, min(len_y - test_size, train_size))
            test_start = len_y - test_size
            train_start = test_start - train_size
        else:  # any other value is treated as anchor == "start"
            train_size = max(0, min(len_y, train_size))
            test_size = max(0, min(len_y - train_size, test_size))
            train_start = 0
            test_start = train_size

        all_ix = np.arange(len_y)
        y_train_ix = all_ix[train_start : train_start + train_size]
        y_test_ix = all_ix[test_start : test_start + test_size]

        yield y_train_ix, y_test_ix

    def get_n_splits(self, y: Optional[ACCEPTED_Y_TYPES] = None) -> int:
        """Return the number of splits.

        Since this splitter returns a single train/test split,
        this number is trivially 1.

        Parameters
        ----------
        y : pd.Series or pd.Index, optional (default=None)
            Time series to split; ignored, present for interface compatibility.

        Returns
        -------
        n_splits : int
            The number of splits, always 1.
        """
        return 1

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the splitter.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return `"default"` set.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict are parameters to construct an "interesting" test instance, i.e.,
            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
            `create_test_instance` uses the first (or only) dictionary in `params`
        """
        # both sizes given, not summing to 1: exercises both anchor modes
        params1 = {"test_size": 0.2, "train_size": 0.3, "anchor": "start"}
        params2 = {"test_size": 0.2, "train_size": 0.3, "anchor": "end"}
        # only one absolute size given: the other set is the complement
        params3 = {"test_size": 2}
        params4 = {"train_size": 3}
        # all defaults: test_size falls back to 0.25
        params5 = {}
        return [params1, params2, params3, params4, params5]