Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Change GGS to inherit from BaseSeriesAnnotator #5315

Merged
merged 5 commits into from
Nov 17, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 60 additions & 66 deletions sktime/annotation/ggs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,15 @@

import logging
import math
from dataclasses import asdict, dataclass, field
from typing import Dict, List, Tuple
from dataclasses import dataclass, field
from typing import List, Tuple

import numpy as np
import numpy.typing as npt
import pandas as pd
from sklearn.utils.validation import check_random_state

from sktime.base import BaseEstimator
from sktime.annotation.base._base import BaseSeriesAnnotator
from sktime.utils.validation._dependencies import _check_estimator_deps

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -367,7 +368,7 @@ def find_change_points(self, data: npt.ArrayLike) -> List[int]:
return change_points


class GreedyGaussianSegmentation(BaseEstimator):
class GreedyGaussianSegmentation(BaseSeriesAnnotator):
"""Greedy Gaussian Segmentation Estimator.

The method approxmates solutions for the problem of breaking a
Expand Down Expand Up @@ -408,10 +409,6 @@ class GreedyGaussianSegmentation(BaseEstimator):
change_points_: array_like, default=[]
Locations of change points as integer indexes. By convention change points
include the identity segmentation, i.e. first and last index + 1 values.
_intermediate_change_points: List[List[int]], default=[]
Intermediate values of change points for each value of k = 1...k_max
_intermediate_ll: List[float], default=[]
Intermediate values for log-likelihood for each value of k = 1...k_max

Notes
-----
Expand All @@ -428,6 +425,8 @@ class GreedyGaussianSegmentation(BaseEstimator):
https://doi.org/10.1007/s11634-018-0335-0
"""

_tags = {"fit_is_empty": True}

def __init__(
self,
k_max: int = 10,
Expand All @@ -444,7 +443,7 @@ def __init__(
self.random_state = random_state

_check_estimator_deps(self)
super().__init__()
super().__init__(fmt="dense", labels="int_label")

self._adaptee = GGS(
k_max=k_max,
Expand All @@ -454,33 +453,47 @@ def __init__(
random_state=random_state,
)

def fit(self, X: npt.ArrayLike, y: npt.ArrayLike = None):
"""Fit method for compatibility with sklearn-type estimator interface.
@property
def _intermediate_change_points(self) -> List[List[int]]:
"""Intermediate values of change points for each value of k = 1...k_max.

Default value is an empty list.
"""
return self._adaptee._intermediate_change_points

It sets the internal state of the estimator and returns the initialized
instance.
@property
def _intermediate_ll(self) -> List[float]:
"""Intermediate values for log-likelihood for each value of k = 1...k_max.

Default value is an empty list.
"""
return self._adaptee._intermediate_ll

def _fit(self, X, Y=None):
"""Fit method for compatibility with sklearn-type estimator interface.

Parameters
----------
X: array_like
2D `array_like` representing time series with sequence index along
the first dimension and value series as columns.
X: array_like (1D or 2D), pd.Series, or pd.DataFrame
1D array of timeseries values, or 2D array with index along the first
dimension and columns representing features of the timeseries. If pd.Series,
the values of the timeseries are the values of the series. If pd.DataFrame,
each column represents a feature of the timeseries.
y: array_like
Placeholder for compatibility with sklearn-api, not used, default=None.
"""
self._adaptee.initialize_intermediates()
return self

def predict(self, X: npt.ArrayLike, y: npt.ArrayLike = None) -> npt.ArrayLike:
def _predict(self, X) -> npt.ArrayLike:
"""Perform segmentation.

Parameters
----------
X: array_like
2D `array_like` representing time series with sequence index along
the first dimension and value series as columns.
y: array_like
Placeholder for compatibility with sklearn-api, not used, default=None.
X: array_like (1D or 2D), pd.Series, or pd.DataFrame
1D array of timeseries values, or 2D array with index along the first
dimension and columns representing features of the timeseries. If pd.Series,
the values of the timeseries are the values of the series. If pd.DataFrame,
each column represents a feature of the timeseries.

Returns
-------
Expand All @@ -489,6 +502,15 @@ def predict(self, X: npt.ArrayLike, y: npt.ArrayLike = None) -> npt.ArrayLike:
dimension of X. The numerical values represent distinct segments
labels for each of the data points.
"""
if isinstance(X, pd.Series):
X = X.values[:, np.newaxis]
elif isinstance(X, pd.DataFrame):
X = X.values
elif len(X.shape) == 1:
X = X[:, np.newaxis]
elif len(X.shape) > 2:
raise ValueError("X must not have more than two dimensions.")
self._adaptee.initialize_intermediates()
self.change_points_ = self._adaptee.find_change_points(X)

labels = np.zeros(X.shape[0], dtype=np.int32)
Expand All @@ -498,16 +520,16 @@ def predict(self, X: npt.ArrayLike, y: npt.ArrayLike = None) -> npt.ArrayLike:
labels[start:stop] = i
return labels

def fit_predict(self, X: npt.ArrayLike, y: npt.ArrayLike = None) -> npt.ArrayLike:
def fit_predict(self, X) -> npt.ArrayLike:
"""Perform segmentation.

Parameters
----------
X: array_like
2D `array_like` representing time series with sequence index along
the first dimension and value series as columns.
y: array_like
Placeholder for compatibility with sklearn-api, not used, default=None.
X: array_like (1D or 2D), pd.Series, or pd.DataFrame
1D array of timeseries values, or 2D array with index along the first
dimension and columns representing features of the timeseries. If pd.Series,
the values of the timeseries are the values of the series. If pd.DataFrame,
each column represents a feature of the timeseries.

Returns
-------
Expand All @@ -516,49 +538,21 @@ def fit_predict(self, X: npt.ArrayLike, y: npt.ArrayLike = None) -> npt.ArrayLik
dimension of X. The numerical values represent distinct segments
labels for each of the data points.
"""
return self.fit(X, y).predict(X, y)
return self.fit(X, None).predict(X)

def get_params(self, deep: bool = True) -> Dict:
"""Return initialization parameters.
@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.

Parameters
----------
deep: bool
Dummy argument for compatibility with sklearn-api, not used.
parameter_set : str, default="default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return `"default"` set.

Returns
-------
params: dict
Dictionary with the estimator's initialization parameters, with
keys being argument names and values being argument values.
params : dict or list of dict
"""
attrs_to_ignore = [
"change_points_",
"_intermediate_change_points",
"_intermediate_ll",
]
params = asdict(self._adaptee)
params = {
key: value for key, value in params.items() if key not in attrs_to_ignore
}
params = {"k_max": 10, "lamb": 1.0}
return params

def set_params(self, **parameters):
"""Set the parameters of this object.

Parameters
----------
parameters : dict
Initialization parameters for th estimator.

Returns
-------
self : reference to self (after parameters have been set)
"""
for key, value in parameters.items():
setattr(self._adaptee, key, value)
return self

def __repr__(self) -> str:
"""Return a string representation of the estimator."""
return self._adaptee.__repr__()
Loading