diff --git a/extension_templates/annotation.py b/extension_templates/annotation.py index e01c6678aa4..02b256596ee 100644 --- a/extension_templates/annotation.py +++ b/extension_templates/annotation.py @@ -42,16 +42,6 @@ class MySeriesAnnotator(BaseSeriesAnnotator): Parameters ---------- - fmt : str {"dense", "sparse"}, optional (default="dense") - Annotation output format: - * If "sparse", a sub-series of labels for only the outliers in X is returned, - * If "dense", a series of labels for all values in X is returned. - labels : str {"indicator", "score"}, optional (default="indicator") - Annotation output labels: - * If "indicator", returned values are boolean, indicating whether a value is an - outlier, - * If "score", returned values are floats, giving the outlier score. - parama : int descriptive explanation of parama paramb : string, optional (default='default') @@ -69,6 +59,12 @@ class MySeriesAnnotator(BaseSeriesAnnotator): and so on """ + # Change the `task` and `learning_type` as needed + _tags = { + "task": "segmentation", + "learning_type": "unsupervised", + } + # todo: add any hyper-parameters and components to constructor def __init__( self, @@ -77,8 +73,6 @@ def __init__( est2=None, paramb="default", paramc=None, - fmt="dense", - labels="indicator", ): # estimators should precede parameters # if estimators have default values, set None and initialize below @@ -89,8 +83,7 @@ def __init__( self.paramb = paramb self.paramc = paramc - # leave this as is - super().__init__(fmt=fmt, labels=labels) + super().__init__() # todo: optional, parameter checking logic (if applicable) should happen here # if writes derived values to self, should *not* overwrite self.parama etc diff --git a/sktime/annotation/adapters/_pyod.py b/sktime/annotation/adapters/_pyod.py index 99504f63852..89ba16130ed 100644 --- a/sktime/annotation/adapters/_pyod.py +++ b/sktime/annotation/adapters/_pyod.py @@ -7,6 +7,7 @@ from sktime.annotation.base._base import BaseSeriesAnnotator from sktime.utils.validation._dependencies import _check_soft_dependencies +from sktime.utils.warnings import warn __author__ = ["mloning", "satya-pattnaik", "fkiraly"] @@ -21,22 +22,34 @@ class PyODAnnotator(BaseSeriesAnnotator): estimator : PyOD estimator See ``https://pyod.readthedocs.io/en/latest/`` documentation for a detailed description of all options. - fmt : str {"dense", "sparse"}, optional (default="dense") - Annotation output format: - * If "sparse", a sub-series of labels for only the outliers in X is returned, - * If "dense", a series of labels for all values in X is returned. - labels : str {"indicator", "score"}, optional (default="indicator") - Annotation output labels: - * If "indicator", returned values are boolean, indicating whether a value is an - outlier, - * If "score", returned values are floats, giving the outlier score. """ - _tags = {"python_dependencies": "pyod"} + _tags = { + "python_dependencies": "pyod", + "task": "anomaly_detection", + "learning_type": "unsupervised", + } - def __init__(self, estimator, fmt="dense", labels="indicator"): + # todo 0.31.0: remove fmt argument and warning + def __init__(self, estimator, fmt="deprecated", labels="indicator"): self.estimator = estimator # pyod estimator - super().__init__(fmt=fmt, labels=labels) + self.fmt = fmt + self.labels = labels + + super().__init__() + + if fmt == "deprecated": + self._fmt = "sparse" + warn( + f"Warning from {type(self).__name__}: fmt argument will be removed in" + " 0.31.0. 
For behaviour equivalent to fmt=dense, use transform instead "
+                "of predict. In 0.31.0 the behaviour of predict will be equivalent to the"
+                " current behaviour of predict when fmt=sparse.",
+                DeprecationWarning,
+                obj=self,
+            )
+        else:
+            self._fmt = fmt
 
     def _fit(self, X, Y=None):
         """Fit to training data.
@@ -77,9 +90,8 @@ def _predict(self, X):
         Returns
         -------
         Y : pd.Series
-            annotations for sequence X
-            exact format depends on annotation type
         """
-        fmt = self.fmt
+        fmt = self._fmt
         labels = self.labels
 
         X_np = X.to_numpy()
diff --git a/sktime/annotation/base/_base.py b/sktime/annotation/base/_base.py
index 4d14b50f654..d7ec54a5a27 100644
--- a/sktime/annotation/base/_base.py
+++ b/sktime/annotation/base/_base.py
@@ -22,27 +22,32 @@ class name: BaseSeriesAnnotator
 
 __author__ = ["satya-pattnaik ", "fkiraly"]
 __all__ = ["BaseSeriesAnnotator"]
 
+import numpy as np
+import pandas as pd
+
 from sktime.base import BaseEstimator
-from sktime.utils.validation.annotation import check_fmt, check_labels
 from sktime.utils.validation.series import check_series
 
 
 class BaseSeriesAnnotator(BaseEstimator):
     """Base series annotator.
 
-    Parameters
-    ----------
-    fmt : str {"dense", "sparse"}, optional (default="dense")
-        Annotation output format:
-        * If "sparse", a sub-series of labels for only the outliers in X is returned,
-        * If "dense", a series of labels for all values in X is returned.
-    labels : str {"indicator", "score", "int_label"}, optional (default="indicator")
-        Annotation output labels:
-        * If "indicator", returned values are boolean, indicating whether a value is an
-        outlier,
-        * If "score", returned values are floats, giving the outlier score.
-        * If "int_label", returned values are integers indicating which segment the
-        value belongs to.
+    Developers should set the task and learning_type tags in the derived class.
+
+    task : str {"segmentation", "change_point_detection", "anomaly_detection"}
+        The main annotation task:
+        * If ``segmentation``, the annotator divides timeseries into discrete chunks
+          based on certain criteria. The same label can be applied at multiple
+          disconnected regions of the timeseries.
+        * If ``change_point_detection``, the annotator finds points where the
+          statistical properties of the timeseries change significantly.
+        * If ``anomaly_detection``, the annotator finds points that differ significantly
+          from the normal statistical properties of the timeseries.
+
+    learning_type : str {"supervised", "unsupervised"}
+        Annotation learning type:
+        * If ``supervised``, the annotator learns from labelled data.
+        * If ``unsupervised``, the annotator learns from unlabelled data.
 
     Notes
     -----
@@ -58,12 +63,16 @@ class BaseSeriesAnnotator(BaseEstimator):
 
     _tags = {
         "object_type": "series-annotator",  # type of object
+        "learning_type": "None",  # Tag to determine test in test_all_annotators
+        "task": "None",  # Tag to determine test in test_all_annotators
+        #
+        # todo: distribution_type? we may have to refactor this, seems very specific
        "distribution_type": "None",  # Tag to determine test in test_all_annotators
     }  # for unit test cases
 
-    def __init__(self, fmt="dense", labels="indicator"):
-        self.fmt = fmt
-        self.labels = labels
+    def __init__(self):
+        self.task = self.get_class_tag("task")
+        self.learning_type = self.get_class_tag("learning_type")
 
         self._is_fitted = False
 
@@ -92,8 +101,6 @@ def fit(self, X, Y=None):
 
         Creates fitted model that updates attributes ending in "_".
         Sets _is_fitted flag to True.
""" - check_labels(self.labels) - check_fmt(self.fmt) X = check_series(X) if Y is not None: @@ -134,6 +141,27 @@ def predict(self, X): return Y + def transform(self, X): + """Create annotations on test/deployment data. + + Parameters + ---------- + X : pd.DataFrame + Data to annotate (time series). + + Returns + ------- + Y : pd.Series + Annotations for sequence X. The returned annotations will be in the dense + format. + """ + if self.task == "anomaly_detection" or self.task == "change_point_detection": + Y = self.predict_points(X) + elif self.task == "segmentation": + Y = self.predict_segments(X) + + return self.sparse_to_dense(Y, X.index) + def predict_scores(self, X): """Return scores for predicted annotations on test/deployment data. @@ -314,3 +342,341 @@ def _update(self, X, Y=None): self._fit(self._X, self._Y) return self + + def predict_segments(self, X): + """Predict segments on test/deployment data. + + Parameters + ---------- + X : pd.DataFrame + Data to annotate, time series. + + Returns + ------- + Y : pd.Series + A series with an index of intervals. Each interval is the range of a + segment and the corresponding value is the label of the segment. + """ + if self.task == "anomaly_detection": + raise RuntimeError( + "Anomaly detection annotators should not be used for segmentation." + ) + self.check_is_fitted() + X = check_series(X) + + if self.task == "change_point_detection": + return self.segments_to_change_points(self.predict_points(X)) + elif self.task == "segmentation": + return self._predict_segments(X) + + def predict_points(self, X): + """Predict changepoints/anomalies on test/deployment data. + + Parameters + ---------- + X : pd.DataFrame + Data to annotate, time series. + + Returns + ------- + Y : pd.Series + A series whose values are the changepoints/anomalies in X. + """ + self.check_is_fitted() + X = check_series(X) + + if self.task == "anomaly_detection" or self.task == "change_point_detection": + return self._predict_points(X) + elif self.task == "segmentation": + return self.segments_to_change_points(self.predict_segments(X)) + + def _predict_segments(self, X): + """Predict segments on test/deployment data. + + Parameters + ---------- + X : pd.DataFrame + Data to annotate, time series. + + Returns + ------- + Y : pd.Series + A series with an index of intervals. Each interval is the range of a + segment and the corresponding value is the label of the segment. + """ + raise NotImplementedError("abstract method") + + def _predict_points(self, X): + """Predict changepoints/anomalies on test/deployment data. + + Parameters + ---------- + X : pd.DataFrame + Data to annotate, time series. + + Returns + ------- + Y : pd.Series + A series whose values are the changepoints/anomalies in X. + """ + raise NotImplementedError("abstract method") + + @staticmethod + def sparse_to_dense(y_sparse, index): + """Convert the sparse output from an annotator to a dense format. + + Parameters + ---------- + y_sparse : pd.Series + * If ``y_sparse`` is a series with an index of intervals, it should + represent segments where each value of the series is label of a segment. + Unclassified intervals should be labelled -1. Segments must never have + the label 0. + * If the index of ``y_sparse`` is not a set of intervals, the values of the + series should represent the indexes of changepoints/anomalies. + index : array-like + Indices that are to be annotated according to ``y_sparse``. + + Returns + ------- + pd.Series + A series with an index of ``index`` is returned. 
+            * If ``y_sparse`` is a series of changepoints/anomalies then the returned
+              series is labelled 0 and 1 depending on whether the index is associated
+              with an anomaly/changepoint, where 1 means anomaly/changepoint.
+            * If ``y_sparse`` is a series of segments then the returned series is
+              labelled depending on the segment its indexes fall into. Indexes that
+              fall into no segments are labelled -1.
+
+        Examples
+        --------
+        >>> import pandas as pd
+        >>> from sktime.annotation.base._base import BaseSeriesAnnotator
+        >>> y_sparse = pd.Series([2, 5, 7])  # Indices of changepoints/anomalies
+        >>> index = range(0, 8)
+        >>> BaseSeriesAnnotator.sparse_to_dense(y_sparse, index=index)
+        0    0
+        1    0
+        2    1
+        3    0
+        4    0
+        5    1
+        6    0
+        7    1
+        dtype: int64
+        >>> y_sparse = pd.Series(
+        ...     [1, 2, 1],
+        ...     index=pd.IntervalIndex.from_arrays(
+        ...         [0, 4, 6], [4, 6, 10], closed="left"
+        ...     )
+        ... )
+        >>> index = range(10)
+        >>> BaseSeriesAnnotator.sparse_to_dense(y_sparse, index=index)
+        0    1
+        1    1
+        2    1
+        3    1
+        4    2
+        5    2
+        6    1
+        7    1
+        8    1
+        9    1
+        dtype: int64
+        """
+        if isinstance(y_sparse.index.dtype, pd.IntervalDtype):
+            # Segmentation case
+            y_dense = BaseSeriesAnnotator._sparse_segments_to_dense(y_sparse, index)
+            return y_dense
+        else:
+            # Anomaly/changepoint detection case
+            y_dense = BaseSeriesAnnotator._sparse_points_to_dense(y_sparse, index)
+            return y_dense
+
+    @staticmethod
+    def _sparse_points_to_dense(y_sparse, index):
+        """Label the indexes in ``index`` if they are in ``y_sparse``.
+
+        Parameters
+        ----------
+        y_sparse : pd.Series
+            The values of the series must be the indexes of changepoints/anomalies.
+        index : array-like
+            Array of indexes that are to be labelled according to ``y_sparse``.
+
+        Returns
+        -------
+        pd.Series
+            A series with an index of ``index``. Its values are 1 if the index is in
+            y_sparse and 0 otherwise.
+        """
+        y_dense = pd.Series(np.zeros(len(index)), index=index, dtype="int64")
+        y_dense[y_sparse.values] = 1
+        return y_dense
+
+    @staticmethod
+    def _sparse_segments_to_dense(y_sparse, index):
+        """Find the label for each index in ``index`` from sparse segments.
+
+        Parameters
+        ----------
+        y_sparse : pd.Series
+            A sparse representation of segments. The index must be the pandas interval
+            datatype and the values must be the integer labels of the segments.
+        index : array-like
+            List of indexes that are to be labelled according to ``y_sparse``.
+
+        Returns
+        -------
+        pd.Series
+            A series with the same index as ``index`` where each index is labelled
+            according to ``y_sparse``. Indexes that do not fall within any interval
+            are labelled -1.
+        """
+        if y_sparse.index.is_overlapping:
+            raise NotImplementedError(
+                "Cannot convert overlapping segments to a dense format yet."
+            )
+
+        interval_indexes = y_sparse.index.get_indexer(index)
+
+        # Negative indexes do not fall within any interval so they are ignored
+        interval_labels = y_sparse.iloc[
+            interval_indexes[interval_indexes >= 0]
+        ].to_numpy()
+
+        # -1 is used to represent points that do not fall within a segment
+        labels_dense = interval_indexes.copy()
+        labels_dense[labels_dense >= 0] = interval_labels
+
+        y_dense = pd.Series(labels_dense, index=index)
+        return y_dense
+
+    @staticmethod
+    def dense_to_sparse(y_dense):
+        """Convert the dense output from an annotator to a sparse format.
+
+        Parameters
+        ----------
+        y_dense : pd.Series
+            * If ``y_dense`` contains only 1's and 0's, the 1's represent change
+              points or anomalies.
+            * If ``y_dense`` contains only integers greater than 0, it is an array
+              of segments.
+
+        Returns
+        -------
+        pd.Series
+            * If ``y_dense`` is a series of changepoints/anomalies, a pandas series
+              will be returned containing the indexes of the changepoints/anomalies.
+            * If ``y_dense`` is a series of segments, a series with an interval
+              datatype index will be returned. The values of the series will be the
+              labels of segments.
+        """
+        if 0 in y_dense.values:
+            # y_dense is a series of change point / anomaly indicators (0/1)
+            change_points = np.where(y_dense.values != 0)[0]
+            return pd.Series(change_points)
+        else:
+            segment_start_indexes = np.where(y_dense.diff() != 0)[0]
+            segment_end_indexes = np.roll(segment_start_indexes, -1)
+
+            # The final index is always the end of a segment
+            segment_end_indexes[-1] = y_dense.index[-1]
+
+            segment_labels = y_dense.iloc[segment_start_indexes].to_numpy()
+            interval_index = pd.IntervalIndex.from_arrays(
+                segment_start_indexes, segment_end_indexes, closed="left"
+            )
+            y_sparse = pd.Series(segment_labels, index=interval_index)
+
+            # -1 represents unclassified regions so we remove them
+            y_sparse = y_sparse.loc[y_sparse != -1]
+            return y_sparse
+
+    @staticmethod
+    def change_points_to_segments(y_sparse, start=None, end=None):
+        """Convert a series of change point indexes to segments.
+
+        Parameters
+        ----------
+        y_sparse : pd.Series
+            A series containing the indexes of change points.
+        start : optional
+            Starting point of the first segment.
+        end : optional
+            Ending point of the last segment.
+
+        Returns
+        -------
+        pd.Series
+            A series with an interval index indicating the start and end points of the
+            segments. The values of the series are the labels of the segments.
+
+        Examples
+        --------
+        >>> import pandas as pd
+        >>> from sktime.annotation.base._base import BaseSeriesAnnotator
+        >>> change_points = pd.Series([1, 2, 5])
+        >>> BaseSeriesAnnotator.change_points_to_segments(change_points, 0, 7)
+        [0, 1)   -1
+        [1, 2)    1
+        [2, 5)    2
+        [5, 7)    3
+        dtype: int64
+        """
+        breaks = y_sparse.values
+
+        if start is not None and start > breaks.min():
+            raise ValueError(
+                "The starting index must be before the first change point."
+            )
+        first_change_point = breaks.min()
+
+        if start is not None:
+            breaks = np.insert(breaks, 0, start)
+        if end is not None:
+            breaks = np.append(breaks, end)
+
+        index = pd.IntervalIndex.from_breaks(breaks, copy=True, closed="left")
+        segments = pd.Series(0, index=index)
+
+        in_range = index.left >= first_change_point
+
+        number_of_segments = in_range.sum()
+        segments.loc[in_range] = range(1, number_of_segments + 1)
+        segments.loc[~in_range] = -1
+
+        return segments
+
+    @staticmethod
+    def segments_to_change_points(y_sparse):
+        """Convert segments to change points.
+
+        Parameters
+        ----------
+        y_sparse : pd.Series
+            A series of segments. The index must be the interval data type and the
+            values should be the integer labels of the segments.
+
+        Returns
+        -------
+        pd.Series
+            A series containing the indexes of the start of each segment.
+
+        Examples
+        --------
+        >>> import pandas as pd
+        >>> from sktime.annotation.base._base import BaseSeriesAnnotator
+        >>> segments = pd.Series(
+        ...     [3, -1, 2],
+        ...     index=pd.IntervalIndex.from_breaks([2, 5, 7, 9], closed="left")
+        ...
) + >>> BaseSeriesAnnotator.segments_to_change_points(segments) + 0 2 + 1 5 + 2 7 + dtype: int64 + """ + change_points = pd.Series(y_sparse.index.left) + return change_points diff --git a/sktime/annotation/clasp.py b/sktime/annotation/clasp.py index 63a32faf587..c0ca92b834d 100644 --- a/sktime/annotation/clasp.py +++ b/sktime/annotation/clasp.py @@ -23,6 +23,7 @@ from sktime.transformations.series.clasp import ClaSPTransformer from sktime.utils.validation.series import check_series +from sktime.utils.warnings import warn def find_dominant_window_sizes(X, offset=0.05): @@ -188,10 +189,6 @@ class ClaSPSegmentation(BaseSeriesAnnotator): size of window for sliding, based on the period length of the data n_cps : int, default = 1 the number of change points to search - fmt : str {"dense", "sparse"}, optional (default="sparse") - Annotation output format: - * If "sparse", a pd.Series of the found Change Points is returned - * If "dense", a pd.IndexSeries with the Segmentation of X is returned exclusion_radius : int Exclusion Radius for change points to be non-trivial matches @@ -219,16 +216,36 @@ class ClaSPSegmentation(BaseSeriesAnnotator): """ _tags = { + "task": "change_point_detection", + "learning_type": "unsupervised", "univariate-only": True, "fit_is_empty": True, "python_dependencies": "numba", } # for unit test cases - def __init__(self, period_length=10, n_cps=1, fmt="sparse", exclusion_radius=0.05): + # todo 0.31.0: remove fmt argument, remove _fmt attribute and warning + def __init__( + self, period_length=10, n_cps=1, fmt="deprecated", exclusion_radius=0.05 + ): self.period_length = int(period_length) self.n_cps = n_cps self.exclusion_radius = exclusion_radius - super().__init__(fmt) + self.fmt = fmt + + super().__init__() + + if fmt == "deprecated": + self._fmt = "sparse" + warn( + f"Warning from {type(self).__name__}: fmt argument will be removed in" + " 0.31.0. For behaviour equivalent to fmt=dense, use transform instead " + "of predict. In 0.31.0 the behaviour of predict will equivalent to the" + " current behaviour of predict when fmt=sparse.", + DeprecationWarning, + obj=self, + ) + else: + self._fmt = fmt def _fit(self, X, Y=None): """Do nothing, as there is no need to fit a model for ClaSP. @@ -257,19 +274,30 @@ def _predict(self, X): Returns ------- Y : pd.Series or an IntervalSeries - Annotations for sequence X exact format depends on annotation type. - fmt=sparse : only the found change point locations are returned - fnt=dense : an interval series is returned which contains the segmetation. + Change points in sequence X. """ - self.found_cps, self.profiles, self.scores = self._run_clasp(X) + change_points = self._predict_points(X) + if self._fmt == "dense": + return self.change_points_to_segments( + change_points, X.index.min(), X.index.max() + ) + return change_points - # Change Points - if self.fmt == "sparse": - return pd.Series(self.found_cps) + def _predict_points(self, X): + """Predict changepoints on test/deployment data. - # Segmentation - elif self.fmt == "dense": - return self._get_interval_series(X, self.found_cps) + Parameters + ---------- + X : pd.DataFrame + Data to annotate, time series. + + Returns + ------- + Y : pd.Series + Series containing the indexes of the changepoints in X. + """ + self.found_cps, self.profiles, self.scores = self._run_clasp(X) + return pd.Series(self.found_cps) def _predict_scores(self, X): """Return scores in ClaSP's profile for each annotation. 
@@ -286,15 +314,15 @@ def _predict_scores(self, X): """ self.found_cps, self.profiles, self.scores = self._run_clasp(X) - # Scores of the Change Points - if self.fmt == "sparse": - return pd.Series(self.scores) - - # Full Profile of Segmentation - # ClaSP creates multiple profiles. Hard to map. - # Thus, we return the main (first) one - elif self.fmt == "dense": - return pd.Series(self.profiles[0]) + if self._fmt == "sparse": + # Scores of the Change Points + scores = pd.Series(self.scores) + return scores + elif self._fmt == "dense": + # ClaSP creates multiple profiles. Hard to map. Thus, we return the main + # (first) one + profile = pd.Series(self.profiles[0]) + return profile def get_fitted_params(self): """Get fitted parameters. diff --git a/sktime/annotation/ggs.py b/sktime/annotation/ggs.py index 9861df3409d..db2c9a8a6f2 100644 --- a/sktime/annotation/ggs.py +++ b/sktime/annotation/ggs.py @@ -425,7 +425,11 @@ class GreedyGaussianSegmentation(BaseSeriesAnnotator): https://doi.org/10.1007/s11634-018-0335-0 """ - _tags = {"fit_is_empty": True} + _tags = { + "fit_is_empty": True, + "task": "segmentation", + "learning_type": "unsupervised", + } def __init__( self, @@ -443,7 +447,7 @@ def __init__( self.random_state = random_state _check_estimator_deps(self) - super().__init__(fmt="dense", labels="int_label") + super().__init__() self._adaptee = GGS( k_max=k_max, diff --git a/sktime/annotation/hmm.py b/sktime/annotation/hmm.py index bc65dcde4dd..42c5fc9d18a 100644 --- a/sktime/annotation/hmm.py +++ b/sktime/annotation/hmm.py @@ -128,7 +128,12 @@ class HMM(BaseSeriesAnnotator): """ # plan to update to make multivariate. - _tags = {"univariate-only": True, "fit_is_empty": True} + _tags = { + "univariate-only": True, + "fit_is_empty": True, + "task": "segmentation", + "learning_type": "unsupervised", + } def __init__( self, @@ -139,7 +144,7 @@ def __init__( self.initial_probs = initial_probs self.emission_funcs = emission_funcs self.transition_prob_mat = transition_prob_mat - super().__init__(fmt="dense", labels="int_label") + super().__init__() self._validate_init() def _validate_init(self): diff --git a/sktime/annotation/hmm_learn/base.py b/sktime/annotation/hmm_learn/base.py index d43a22bab84..15dae8e89fd 100644 --- a/sktime/annotation/hmm_learn/base.py +++ b/sktime/annotation/hmm_learn/base.py @@ -23,6 +23,8 @@ class BaseHMMLearn(BaseSeriesAnnotator): "univariate-only": True, "fit_is_empty": True, "python_dependencies": "hmmlearn", + "task": "segmentation", + "learning_type": "unsupervised", } # for unit test cases _hmm_estimator = None diff --git a/sktime/annotation/tests/test_all_annotators.py b/sktime/annotation/tests/test_all_annotators.py index bc819cf47cc..2054ec924b1 100644 --- a/sktime/annotation/tests/test_all_annotators.py +++ b/sktime/annotation/tests/test_all_annotators.py @@ -10,6 +10,7 @@ from sktime.registry import all_estimators from sktime.tests.test_switch import run_test_for_class from sktime.utils._testing.annotation import make_annotation_problem +from sktime.utils.validation.annotation import check_learning_type, check_task ALL_ANNOTATORS = all_estimators(estimator_types="series-annotator", return_names=False) @@ -31,3 +32,10 @@ def test_output_type(Estimator): ) y_pred = estimator.predict(arg) assert isinstance(y_pred, (pd.Series, np.ndarray)) + + +@pytest.mark.parametrize("Estimator", ALL_ANNOTATORS) +def test_annotator_tags(Estimator): + """Check the learning_type and task tags are valid.""" + check_task(Estimator.get_class_tag("task")) + 
check_learning_type(Estimator.get_class_tag("learning_type")) diff --git a/sktime/annotation/tests/test_base.py b/sktime/annotation/tests/test_base.py new file mode 100644 index 00000000000..53adb6ede8a --- /dev/null +++ b/sktime/annotation/tests/test_base.py @@ -0,0 +1,136 @@ +"""Tests for the BaseSeriesAnnotator class.""" + +__author__ = ["Alex-JG3"] +__all__ = [] + +import pandas as pd +import pytest +from pandas import testing + +from sktime.annotation.base._base import BaseSeriesAnnotator + + +@pytest.mark.parametrize( + "y_sparse, y_dense_expected, index", + [ + (pd.Series([1, 3]), pd.Series([0, 1, 0, 1]), pd.RangeIndex(0, 4, 1)), + (pd.Series([1, 3]), pd.Series([0, 1, 0, 1, 0, 0]), pd.RangeIndex(0, 6, 1)), + ( + pd.Series( + [1, 2], + index=pd.IntervalIndex.from_arrays([0, 3], [3, 5], closed="left"), + ), + pd.Series([1, 1, 1, 2, 2, -1]), + pd.RangeIndex(0, 6, 1), + ), + ( + pd.Series( + [1, 2], + index=pd.IntervalIndex.from_arrays([0, 3], [3, 5], closed="left"), + ), + pd.Series([1, 1, 1, 2, 2, -1, -1]), + pd.RangeIndex(0, 7, 1), + ), + ( + pd.Series( + [1, 2], + index=pd.IntervalIndex.from_arrays([2, 4], [3, 6], closed="left"), + ), + pd.Series([-1, -1, 1, -1, 2, 2, -1]), + pd.RangeIndex(0, 7, 1), + ), + ], +) +def test_sparse_to_dense(y_sparse, y_dense_expected, index): + """Test converting from sparse to dense.""" + y_dense_actual = BaseSeriesAnnotator.sparse_to_dense(y_sparse, index=index) + testing.assert_series_equal(y_dense_actual, y_dense_expected) + + +@pytest.mark.parametrize( + "y_dense, y_sparse_expected", + [ + (pd.Series([0, 1, 0, 1]), pd.Series([1, 3])), + (pd.Series([0, 1, 0, 1, 0, 0]), pd.Series([1, 3])), + ( + pd.Series([-1, -1, -1, 1, 1, -1, 2]), + pd.Series( + [1, 2], + index=pd.IntervalIndex.from_arrays([3, 6], [5, 6], closed="left"), + ), + ), + ], +) +def test_dense_to_sparse(y_dense, y_sparse_expected): + """Test converting from dense to sparse.""" + y_sparse_actual = BaseSeriesAnnotator.dense_to_sparse(y_dense) + testing.assert_series_equal(y_sparse_actual, y_sparse_expected) + + +@pytest.mark.parametrize( + "change_points, expected_segments, start, end", + [ + ( + pd.Series([1, 2, 5]), + pd.Series( + [-1, 1, 2, 3], + index=pd.IntervalIndex.from_breaks([0, 1, 2, 5, 7], closed="left"), + ), + 0, + 7, + ) + ], +) +def test_change_points_to_segments(change_points, expected_segments, start, end): + """Test converting change points to segments.""" + actual_segments = BaseSeriesAnnotator.change_points_to_segments( + change_points, start, end + ) + testing.assert_series_equal(actual_segments, expected_segments) + + +@pytest.mark.parametrize( + "segments, expected_change_points", + [ + ( + pd.Series( + [1, -1, 2], + index=pd.IntervalIndex.from_breaks([2, 5, 7, 9], closed="left"), + ), + pd.Series([2, 5, 7]), + ) + ], +) +def test_segments_to_change_points(segments, expected_change_points): + """Test converting change points to segments.""" + actual_change_points = BaseSeriesAnnotator.segments_to_change_points(segments) + testing.assert_series_equal( + actual_change_points, expected_change_points, check_dtype=False + ) + + +@pytest.mark.parametrize( + "y_sparse, index, y_dense_expected", + [ + ( + pd.Series( + [1, 2, 1], + index=pd.IntervalIndex.from_arrays([0, 2, 4], [1, 3, 5]), + ), + [0, 1, 2, 3, 4, 5, 6], + pd.Series([-1, 1, -1, 2, -1, 1, -1]), + ) + ], +) +def test_sparse_segments_to_dense(y_sparse, index, y_dense_expected): + y_dense_actual = BaseSeriesAnnotator._sparse_segments_to_dense(y_sparse, index) + testing.assert_series_equal(y_dense_expected, 
y_dense_actual) + + +@pytest.mark.parametrize( + "y_sparse, index, y_dense_expected", + [(pd.Series([2, 4]), [0, 1, 2, 3, 4, 5, 6], pd.Series([0, 0, 1, 0, 1, 0, 0]))], +) +def test_sparse_points_to_dense(y_sparse, index, y_dense_expected): + y_dense_actual = BaseSeriesAnnotator._sparse_points_to_dense(y_sparse, index) + testing.assert_series_equal(y_dense_actual, y_dense_expected) diff --git a/sktime/annotation/tests/test_clasp.py b/sktime/annotation/tests/test_clasp.py index 9244d225a57..d585eaf1ada 100644 --- a/sktime/annotation/tests/test_clasp.py +++ b/sktime/annotation/tests/test_clasp.py @@ -49,7 +49,8 @@ def test_clasp_dense(): clasp = ClaSPSegmentation(period_size, n_cps=1, fmt="dense") clasp.fit(ts) segmentation = clasp.predict(ts) - scores = clasp.predict_scores(ts) - assert len(segmentation) == 2 and segmentation[0].right == 893 - assert np.argmax(scores) == 893 + profile = clasp.predict_scores(ts) + + assert len(segmentation) == 2 and segmentation.index[0].right == 893 + assert np.argmax(profile) == 893 diff --git a/sktime/registry/_tags.py b/sktime/registry/_tags.py index d5edd23be2c..7f418f5de53 100644 --- a/sktime/registry/_tags.py +++ b/sktime/registry/_tags.py @@ -1651,6 +1651,24 @@ class transform_returns_same_time_index(_BaseTag): "bool", "whether estimator remembers all data seen as self._X, self._y, etc", ), + ( + "distribution_type", + "estimator", + "str", + "distribution type of data as str", + ), + ( + "task", + "series-annotator", + "str", + "subtype of series annotator, e.g., 'anomaly_detection', 'segmentation'", + ), + ( + "learning_type", + "series-annotator", + "str", + "type of learning, e.g., 'supervised', 'unsupervised'", + ), ( "reserved_params", "estimator", diff --git a/sktime/utils/validation/annotation.py b/sktime/utils/validation/annotation.py index 76b5e6ca157..bafad89c0de 100644 --- a/sktime/utils/validation/annotation.py +++ b/sktime/utils/validation/annotation.py @@ -39,3 +39,32 @@ def check_labels(labels): valid_labels = ["indicator", "score", "int_label"] if labels not in valid_labels: raise ValueError(f"`labels` must be in: {valid_labels}, but found: {labels}.") + + +def check_task(task): + """Check annotation task. + + Parameters + ---------- + task : str {"segmentation", "change_point_detection", "anomaly_detection"} + Annotation task. + """ + valid_tasks = ["segmentation", "change_point_detection", "anomaly_detection"] + if task not in valid_tasks: + raise ValueError(f"`task` must be in: {valid_tasks}, but found: {task}.") + + +def check_learning_type(learning_type): + """Check learning type. + + Parameters + ---------- + learning_type : str {"supervised", "unsupervised"} + Annotation Learning type. + """ + valid_learning_types = ["supervised", "unsupervised"] + if learning_type not in valid_learning_types: + raise ValueError( + f"`learning_type` must be in: {valid_learning_types}, " + f"but found: {learning_type}." + )
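
For reference, below is a minimal usage sketch of the sparse/dense conversion helpers that this patch adds to ``BaseSeriesAnnotator``. It is not part of the diff; the series values are made up for illustration, and it assumes an environment with the patched sktime importable.

import pandas as pd

from sktime.annotation.base._base import BaseSeriesAnnotator

# Sparse change point positions -> dense 0/1 labels over a given index.
# This is the conversion that `transform` applies for change point and anomaly
# detection annotators.
change_points = pd.Series([2, 5])
y_dense = BaseSeriesAnnotator.sparse_to_dense(change_points, index=range(8))

# Dense 0/1 labels -> sparse change point positions (the reverse direction).
y_sparse = BaseSeriesAnnotator.dense_to_sparse(y_dense)

# Change points -> interval-indexed segments covering [start, end);
# indexes before the first change point are labelled -1.
segments = BaseSeriesAnnotator.change_points_to_segments(change_points, start=0, end=8)

# Segments -> change points, i.e. the left ends of the segment intervals.
points = BaseSeriesAnnotator.segments_to_change_points(segments)

For estimators tagged with task="change_point_detection", such as ``ClaSPSegmentation`` (which also requires the soft dependency ``numba``), ``predict`` keeps returning the sparse change point positions under the default settings, while ``transform`` returns the dense per-index labels built via ``sparse_to_dense``.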