diff --git a/.gitignore b/.gitignore index 569c3277c5f..73d7ddd9b33 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,4 @@ sktime/contrib/nothing_to_see_here.py sktime/distances/elastic_cython.html *.html sktime/contrib/distance_based/old_time_series_neighbors.py +sktime/contrib/debug.py diff --git a/sktime/base/_base.py b/sktime/base/_base.py index b1b75665d0d..ea32fc4351d 100644 --- a/sktime/base/_base.py +++ b/sktime/base/_base.py @@ -49,7 +49,6 @@ class name: BaseEstimator __all__ = ["BaseEstimator", "BaseObject"] import inspect - from copy import deepcopy from sklearn import clone @@ -147,13 +146,13 @@ def get_tag(self, tag_name, tag_value_default=None, raise_error=True): Returns ------- tag_value : - Value of the `tag_name` tag in self. If not found, returns - `tag_value_default`. + Value of the `tag_name` tag in self. If not found, raises an error if + raise_error is True, otherwise it returns `tag_value_default`. Raises ------ - ValueError if raise_error is True and tag_name does not exist - i.e., if tag_name is not in self.get_tags().keys() + ValueError if raise_error is True and tag_name does not exist, i.e. if + tag_name is not in self.get_tags().keys() """ collected_tags = self.get_tags() diff --git a/sktime/classification/base.py b/sktime/classification/base.py index 0d48d814d7b..8767fdea9ba 100644 --- a/sktime/classification/base.py +++ b/sktime/classification/base.py @@ -1,18 +1,14 @@ # -*- coding: utf-8 -*- # copyright: sktime developers, BSD-3-Clause License (see LICENSE file) """ -Base class template for time series classifier scitype. +Abstract base class for time series classifiers. 
class name: BaseClassifier -Scitype defining methods: +Defining methods: fitting - fit(self, X, y) predicting - predict(self, X) - -State: - fitted model/strategy - by convention, any attributes ending in "_" - fitted state flag - is_fitted (property) - fitted state inspection - check_is_fitted() + - predict_proba(self, X) Inspection methods: hyper-parameter inspection - get_params() @@ -26,60 +22,39 @@ class name: BaseClassifier __all__ = [ "BaseClassifier", - "classifier_list", ] -__author__ = ["mloning", "fkiraly"] +__author__ = ["mloning", "fkiraly", "TonyBagnall", "MatthewMiddlehurst"] import numpy as np from sktime.base import BaseEstimator +from sktime.utils.validation import check_n_jobs from sktime.utils.validation.panel import check_X, check_X_y -""" -Main list of classifiers extending this class. For clarity, some utility classifiers, -such as Proximity Stump, are not listed. -""" -classifier_list = [ - # in classification/distance_based - "ProximityForest", - # "KNeighborsTimeSeriesClassifier", - # "ElasticEnsemble", - # "ShapeDTW", - # in classification/dictionary_based - "BOSS", - "ContractableBOSS", - "TemporalDictionaryEnsemble", - "WEASEL", - "MUSE", - # in classification/interval_based - "RandomIntervalSpectralForest", - "TimeSeriesForest", - "CanonicalIntervalForest", - # in classification/shapelet_based - "ShapeletTransformClassifier", - "ROCKET", - "MrSEQLClassifier", -] - class BaseClassifier(BaseEstimator): - """Base time series classifier template class. - - The base classifier specifies the methods and method - signatures that all forecasters have to implement. + """Abstract base class for time series classifiers. - Specific implementations of these methods is deferred to concrete - forecasters. + The base classifier specifies the methods and method signatures that all + classifiers have to implement. 
""" _tags = { "coerce-X-to-numpy": True, "coerce-X-to-pandas": False, + "capability:multivariate": False, + "capability:unequal_length": False, + "capability:missing_values": False, + "capability:train_estimate": False, + "capability:contractable": False, + "capability:multithreading": False, } def __init__(self): - self._is_fitted = False - + self.classes_ = [] + self.n_classes_ = 0 + self._class_dictionary = {} + self._threads_to_use = 1 super(BaseClassifier, self).__init__() def fit(self, X, y): @@ -87,11 +62,13 @@ def fit(self, X, y): Parameters ---------- - X : 3D np.array, array-like or sparse matrix - of shape = [n_instances,n_dimensions,series_length] - or shape = [n_instances,series_length] - or single-column pd.DataFrame with pd.Series entries - y : array-like, shape = [n_instances] - the class labels. + X : 2D np.array (univariate, equal length series) of shape = [n_instances, + series_length] + or 3D np.array (any number of dimensions, equal length series) of shape = + [n_instances,n_dimensions,series_length] + or pd.DataFrame with each column a dimension, each cell a pd.Series (any + number of dimensions, equal or unequal length series) + y : 1D np.array of shape = [n_instances] - the class labels. Returns ------- @@ -103,12 +80,31 @@ def fit(self, X, y): Changes state by creating a fitted model that updates attributes ending in "_" and sets is_fitted flag to True. 
""" - coerce_to_numpy = self.get_tag("coerce-X-to-numpy", False) - coerce_to_pandas = self.get_tag("coerce-X-to-pandas", False) + coerce_to_numpy = self.get_tag("coerce-X-to-numpy") + coerce_to_pandas = self.get_tag("coerce-X-to-pandas") + allow_multivariate = self.get_tag("capability:multivariate") X, y = check_X_y( - X, y, coerce_to_numpy=coerce_to_numpy, coerce_to_pandas=coerce_to_pandas + X, + y, + coerce_to_numpy=coerce_to_numpy, + coerce_to_pandas=coerce_to_pandas, + enforce_univariate=not allow_multivariate, ) + multithread = self.get_tag("capability:multithreading") + if multithread: + try: + self._threads_to_use = check_n_jobs(self.n_jobs) + except AttributeError: + raise AttributeError( + "self.n_jobs must be set if capability:multithreading is True" + ) + + self.classes_ = np.unique(y) + self.n_classes_ = self.classes_.shape[0] + for index, classVal in enumerate(self.classes_): + self._class_dictionary[classVal] = index + self._fit(X, y) # this should happen last @@ -121,58 +117,74 @@ def predict(self, X): Parameters ---------- - X : 3D np.array, array-like or sparse matrix - of shape = [n_instances,n_dimensions,series_length] - or shape = [n_instances,series_length] - or single-column pd.DataFrame with pd.Series entries + X : 2D np.array (univariate, equal length series) of shape = [n_instances, + series_length] + or 3D np.array (any number of dimensions, equal length series) of shape = + [n_instances,n_dimensions,series_length] + or pd.DataFrame with each column a dimension, each cell a pd.Series (any + number of dimensions, equal or unequal length series) Returns ------- - y : array-like, shape = [n_instances] - predicted class labels + y : 1D np.array of shape = [n_instances] - predicted class labels """ - coerce_to_numpy = self.get_tag("coerce-X-to-numpy", False) - coerce_to_pandas = self.get_tag("coerce-X-to-pandas", False) - X = check_X( - X, coerce_to_numpy=coerce_to_numpy, coerce_to_pandas=coerce_to_pandas - ) self.check_is_fitted() - y = 
self._predict(X) + coerce_to_numpy = self.get_tag("coerce-X-to-numpy") + coerce_to_pandas = self.get_tag("coerce-X-to-pandas") + allow_multivariate = self.get_tag("capability:multivariate") + X = check_X( + X, + coerce_to_numpy=coerce_to_numpy, + coerce_to_pandas=coerce_to_pandas, + enforce_univariate=not allow_multivariate, + ) - return y + return self._predict(X) def predict_proba(self, X): """Predicts labels probabilities for sequences in X. Parameters ---------- - X : 3D np.array, array-like or sparse matrix - of shape = [n_instances,n_dimensions,series_length] - or shape = [n_instances,series_length] - or single-column pd.DataFrame with pd.Series entries + X : 2D np.array (univariate, equal length series) of shape = [n_instances, + series_length] + or 3D np.array (any number of dimensions, equal length series) of shape = + [n_instances,n_dimensions,series_length] + or pd.DataFrame with each column a dimension, each cell a pd.Series (any + number of dimensions, equal or unequal length series) Returns ------- - y : array-like, shape = [n_instances, n_classes] - predictive pmf + y : 2D array of shape = [n_instances, n_classes] - estimated class + probabilities """ - coerce_to_numpy = self.get_tag("coerce-X-to-numpy", False) - coerce_to_pandas = self.get_tag("coerce-X-to-pandas", False) + self.check_is_fitted() + + coerce_to_numpy = self.get_tag("coerce-X-to-numpy") + coerce_to_pandas = self.get_tag("coerce-X-to-pandas") + allow_multivariate = self.get_tag("capability:multivariate") X = check_X( - X, coerce_to_numpy=coerce_to_numpy, coerce_to_pandas=coerce_to_pandas + X, + coerce_to_numpy=coerce_to_numpy, + coerce_to_pandas=coerce_to_pandas, + enforce_univariate=not allow_multivariate, ) - self.check_is_fitted() + return self._predict_proba(X) - def score(self, X, y): + def score(self, X, y) -> float: """Scores predicted labels against ground truth labels on X. 
Parameters ---------- - X : 3D np.array, array-like or sparse matrix - of shape = [n_instances,n_dimensions,series_length] - or shape = [n_instances,series_length] - or single-column pd.DataFrame with pd.Series entries - y : array-like, shape = [n_instances] - predicted class labels + X : 2D np.array (univariate, equal length series) of shape = [n_instances, + series_length] + or 3D np.array (any number of dimensions, equal length series) of shape = + [n_instances,n_dimensions,series_length] + or pd.DataFrame with each column a dimension, each cell a pd.Series (any + number of dimensions, equal or unequal length series) + y : array-like, shape = [n_instances] - actual class labels Returns ------- @@ -185,14 +197,14 @@ def score(self, X, y): def _fit(self, X, y): """Fit time series classifier to training data. - core logic + Abstract method, must be implemented. Parameters ---------- X : 3D np.array, array-like or sparse matrix of shape = [n_instances,n_dimensions,series_length] or shape = [n_instances,series_length] - or single-column pd.DataFrame with pd.Series entries + or pd.DataFrame with each column a dimension, each cell a pd.Series y : array-like, shape = [n_instances] - the class labels Returns @@ -205,46 +217,52 @@ def _fit(self, X, y): Changes state by creating a fitted model that updates attributes ending in "_" and sets is_fitted flag to True. """ - raise NotImplementedError("abstract method") + raise NotImplementedError( + "_fit is a protected abstract method, it must be implemented." + ) def _predict(self, X): """Predicts labels for sequences in X. - core logic + Abstract method, must be implemented. 
Parameters ---------- X : 3D np.array, array-like or sparse matrix of shape = [n_instances,n_dimensions,series_length] or shape = [n_instances,series_length] - or single-column pd.DataFrame with pd.Series entries + or pd.DataFrame with each column a dimension, each cell a pd.Series Returns ------- y : array-like, shape = [n_instances] - predicted class labels """ - distributions = self.predict_proba(X) - predictions = [] - for instance_index in range(0, X.shape[0]): - distribution = distributions[instance_index] - prediction = np.argmax(distribution) - predictions.append(prediction) - y = self.label_encoder.inverse_transform(predictions) - - return y + raise NotImplementedError( + "_predict is a protected abstract method, it must be implemented." + ) def _predict_proba(self, X): """Predicts labels probabilities for sequences in X. + Default behaviour is to call _predict and set the predicted class probability + to 1, other class probabilities to 0. Override if better estimates are + obtainable. + Parameters ---------- X : 3D np.array, array-like or sparse matrix of shape = [n_instances,n_dimensions,series_length] or shape = [n_instances,series_length] - or single-column pd.DataFrame with pd.Series entries + or pd.DataFrame with each column a dimension, each cell a pd.Series Returns ------- - y : array-like, shape = [n_instances, n_classes] - predictive pmf + y : array-like, shape = [n_instances, n_classes] - estimated probabilities + of class membership. 
""" - raise NotImplementedError("abstract method") + dists = np.zeros((X.shape[0], self.n_classes_)) + preds = self._predict(X) + for i in range(0, X.shape[0]): + dists[i, self._class_dictionary[preds[i]]] = 1 + + return dists diff --git a/sktime/forecasting/base/_base.py b/sktime/forecasting/base/_base.py index f2479325a5f..be356537254 100644 --- a/sktime/forecasting/base/_base.py +++ b/sktime/forecasting/base/_base.py @@ -26,27 +26,26 @@ class name: BaseForecaster """ -__author__ = ["Markus Löning", "@big-o", "fkiraly"] +__author__ = ["mloning", "@big-o", "fkiraly"] __all__ = ["BaseForecaster"] -from sktime.base import BaseEstimator - from contextlib import contextmanager from warnings import warn import numpy as np import pandas as pd -from sktime.utils.datetime import _shift -from sktime.utils.validation.forecasting import check_X -from sktime.utils.validation.forecasting import check_alpha -from sktime.utils.validation.forecasting import check_cv -from sktime.utils.validation.forecasting import check_fh -from sktime.utils.validation.forecasting import check_y_X -from sktime.utils.validation.series import check_series, check_equal_time_index - +from sktime.base import BaseEstimator from sktime.datatypes import convert_to, mtype - +from sktime.utils.datetime import _shift +from sktime.utils.validation.forecasting import ( + check_alpha, + check_cv, + check_fh, + check_X, + check_y_X, +) +from sktime.utils.validation.series import check_equal_time_index, check_series DEFAULT_ALPHA = 0.05 diff --git a/sktime/registry/_tags.py b/sktime/registry/_tags.py index 608a7b8182e..4867ec0932c 100644 --- a/sktime/registry/_tags.py +++ b/sktime/registry/_tags.py @@ -43,7 +43,6 @@ import pandas as pd - ESTIMATOR_TAG_REGISTER = [ ( "ignores-exogeneous-X", @@ -89,13 +88,13 @@ ), ( "X-y-must-have-same-index", - ["forecaster", "classifier", "regressor"], + ["forecaster", "regressor"], "bool", "do X/y in fit/update and X/fh in predict have to be same indices?", ), ( 
"enforce_index_type", - ["forecaster", "classifier", "regressor"], + ["forecaster", "regressor"], "type", "passed to input checks, input conversion index type to enforce", ), @@ -139,13 +138,13 @@ "capability:multivariate", "classifier", "bool", - "can classifier classify time series with 2 or more variables?", + "can the classifier classify time series with 2 or more variables?", ), ( "capability:unequal_length", "classifier", "bool", - "can classifier handle unequal length time series?", + "can the classifier handle unequal length time series?", ), # "capability:missing_values" is same as "handles-missing-data" tag. # They are kept distinct intentionally for easier TSC refactoring. @@ -168,6 +167,12 @@ "bool", "contract time setting, i.e. does the estimator support limiting max fit time?", ), + ( + "capability:multithreading", + "classifier", + "bool", + "can the classifier set n_jobs to use multiple threads?", + ), ( "coerce-X-to-pandas", ["classifier", "transformer"],