[ENH] Change GGS to inherit from BaseSeriesAnnotator (#5315)

Updates `GreedyGaussianSegmentation` to inherit from `BaseSerieAnnotator` following the `HMM` class as a guide. #### What should a reviewer concentrate their feedback on?  * Currently type check and conversion is done in the `_predict` method. It would be good to refactor this out into the `BaseSeriesAnnotator` class but I think that is a job for another PR. * This PR makes no attempt to convert the `GGS` class to inherit from `BaseSeriesAnnotator` class. It would be good for this to be done but again I think this is a job for another PR. I had to add the type conversions in the `_predict` method to get the `annotation` tests to pass locally.
sktime · Nov 17, 2023 · b031687 · b031687
1 parent e547c5e
commit b031687
Showing 1 changed file with 60 additions and 66 deletions.
diff --git a/sktime/annotation/ggs.py b/sktime/annotation/ggs.py
@@ -34,14 +34,15 @@
 
 import logging
 import math
-from dataclasses import asdict, dataclass, field
-from typing import Dict, List, Tuple
+from dataclasses import dataclass, field
+from typing import List, Tuple
 
 import numpy as np
 import numpy.typing as npt
+import pandas as pd
 from sklearn.utils.validation import check_random_state
 
-from sktime.base import BaseEstimator
+from sktime.annotation.base._base import BaseSeriesAnnotator
 from sktime.utils.validation._dependencies import _check_estimator_deps
 
 logger = logging.getLogger(__name__)
@@ -367,7 +368,7 @@ def find_change_points(self, data: npt.ArrayLike) -> List[int]:
         return change_points
 
 
-class GreedyGaussianSegmentation(BaseEstimator):
+class GreedyGaussianSegmentation(BaseSeriesAnnotator):
     """Greedy Gaussian Segmentation Estimator.
 
     The method approximates solutions for the problem of breaking a
@@ -408,10 +409,6 @@ class GreedyGaussianSegmentation(BaseEstimator):
     change_points_: array_like, default=[]
         Locations of change points as integer indexes. By convention change points
         include the identity segmentation, i.e. first and last index + 1 values.
-    _intermediate_change_points: List[List[int]], default=[]
-        Intermediate values of change points for each value of k = 1...k_max
-    _intermediate_ll: List[float], default=[]
-        Intermediate values for log-likelihood for each value of k = 1...k_max
 
     Notes
     -----
@@ -428,6 +425,8 @@ class GreedyGaussianSegmentation(BaseEstimator):
        https://doi.org/10.1007/s11634-018-0335-0
     """
 
+    _tags = {"fit_is_empty": True}
+
     def __init__(
         self,
         k_max: int = 10,
@@ -444,7 +443,7 @@ def __init__(
         self.random_state = random_state
 
         _check_estimator_deps(self)
-        super().__init__()
+        super().__init__(fmt="dense", labels="int_label")
 
         self._adaptee = GGS(
             k_max=k_max,
@@ -454,33 +453,47 @@ def __init__(
             random_state=random_state,
         )
 
-    def fit(self, X: npt.ArrayLike, y: npt.ArrayLike = None):
-        """Fit method for compatibility with sklearn-type estimator interface.
+    @property
+    def _intermediate_change_points(self) -> List[List[int]]:
+        """Intermediate values of change points for each value of k = 1...k_max.
+
+        Default value is an empty list.
+        """
+        return self._adaptee._intermediate_change_points
 
-        It sets the internal state of the estimator and returns the initialized
-        instance.
+    @property
+    def _intermediate_ll(self) -> List[float]:
+        """Intermediate values for log-likelihood for each value of k = 1...k_max.
+
+        Default value is an empty list.
+        """
+        return self._adaptee._intermediate_ll
+
+    def _fit(self, X, Y=None):
+        """Fit method for compatibility with sklearn-type estimator interface.
 
         Parameters
         ----------
-        X: array_like
-            2D `array_like` representing time series with sequence index along
-            the first dimension and value series as columns.
+        X: array_like (1D or 2D), pd.Series, or pd.DataFrame
+            1D array of timeseries values, or 2D array with index along the first
+            dimension and columns representing features of the timeseries. If pd.Series,
+            the values of the timeseries are the values of the series. If pd.DataFrame,
+            each column represents a feature of the timeseries.
         y: array_like
             Placeholder for compatibility with sklearn-api, not used, default=None.
         """
-        self._adaptee.initialize_intermediates()
         return self
 
-    def predict(self, X: npt.ArrayLike, y: npt.ArrayLike = None) -> npt.ArrayLike:
+    def _predict(self, X) -> npt.ArrayLike:
         """Perform segmentation.
 
         Parameters
         ----------
-        X: array_like
-            2D `array_like` representing time series with sequence index along
-            the first dimension and value series as columns.
-        y: array_like
-            Placeholder for compatibility with sklearn-api, not used, default=None.
+        X: array_like (1D or 2D), pd.Series, or pd.DataFrame
+            1D array of timeseries values, or 2D array with index along the first
+            dimension and columns representing features of the timeseries. If pd.Series,
+            the values of the timeseries are the values of the series. If pd.DataFrame,
+            each column represents a feature of the timeseries.
 
         Returns
         -------
@@ -489,6 +502,15 @@ def predict(self, X: npt.ArrayLike, y: npt.ArrayLike = None) -> npt.ArrayLike:
             dimension of X. The numerical values represent distinct segments
             labels for each of the data points.
         """
+        if isinstance(X, pd.Series):
+            X = X.values[:, np.newaxis]
+        elif isinstance(X, pd.DataFrame):
+            X = X.values
+        elif len(X.shape) == 1:
+            X = X[:, np.newaxis]
+        elif len(X.shape) > 2:
+            raise ValueError("X must not have more than two dimensions.")
+        self._adaptee.initialize_intermediates()
         self.change_points_ = self._adaptee.find_change_points(X)
 
         labels = np.zeros(X.shape[0], dtype=np.int32)
@@ -498,16 +520,16 @@ def predict(self, X: npt.ArrayLike, y: npt.ArrayLike = None) -> npt.ArrayLike:
             labels[start:stop] = i
         return labels
 
-    def fit_predict(self, X: npt.ArrayLike, y: npt.ArrayLike = None) -> npt.ArrayLike:
+    def fit_predict(self, X) -> npt.ArrayLike:
         """Perform segmentation.
 
         Parameters
         ----------
-        X: array_like
-            2D `array_like` representing time series with sequence index along
-            the first dimension and value series as columns.
-        y: array_like
-            Placeholder for compatibility with sklearn-api, not used, default=None.
+        X: array_like (1D or 2D), pd.Series, or pd.DataFrame
+            1D array of timeseries values, or 2D array with index along the first
+            dimension and columns representing features of the timeseries. If pd.Series,
+            the values of the timeseries are the values of the series. If pd.DataFrame,
+            each column represents a feature of the timeseries.
 
         Returns
         -------
@@ -516,49 +538,21 @@ def fit_predict(self, X: npt.ArrayLike, y: npt.ArrayLike = None) -> npt.ArrayLik
             dimension of X. The numerical values represent distinct segments
             labels for each of the data points.
         """
-        return self.fit(X, y).predict(X, y)
+        return self.fit(X, None).predict(X)
 
-    def get_params(self, deep: bool = True) -> Dict:
-        """Return initialization parameters.
+    @classmethod
+    def get_test_params(cls, parameter_set="default"):
+        """Return testing parameter settings for the estimator.
 
         Parameters
         ----------
-        deep: bool
-            Dummy argument for compatibility with sklearn-api, not used.
+        parameter_set : str, default="default"
+            Name of the set of test parameters to return, for use in tests. If no
+            special parameters are defined for a value, will return `"default"` set.
 
         Returns
         -------
-        params: dict
-            Dictionary with the estimator's initialization parameters, with
-            keys being argument names and values being argument values.
+        params : dict or list of dict
         """
-        attrs_to_ignore = [
-            "change_points_",
-            "_intermediate_change_points",
-            "_intermediate_ll",
-        ]
-        params = asdict(self._adaptee)
-        params = {
-            key: value for key, value in params.items() if key not in attrs_to_ignore
-        }
+        params = {"k_max": 10, "lamb": 1.0}
         return params
-
-    def set_params(self, **parameters):
-        """Set the parameters of this object.
-
-        Parameters
-        ----------
-        parameters : dict
-            Initialization parameters for th estimator.
-
-        Returns
-        -------
-        self : reference to self (after parameters have been set)
-        """
-        for key, value in parameters.items():
-            setattr(self._adaptee, key, value)
-        return self
-
-    def __repr__(self) -> str:
-        """Return a string representation of the estimator."""
-        return self._adaptee.__repr__()