Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Classifier base class #1517

Merged
merged 37 commits into from Oct 19, 2021
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
f01b032
base
TonyBagnall Oct 13, 2021
3c14b91
enforce univariate in base class
TonyBagnall Oct 13, 2021
498b8dd
remove unnecessary classifier tags
TonyBagnall Oct 13, 2021
8b72f4d
predict and predict_proba
TonyBagnall Oct 13, 2021
b79f65b
tweaks to classifier base class
TonyBagnall Oct 13, 2021
9d36d91
formatting 1
TonyBagnall Oct 13, 2021
4ccd2df
formatting 3
TonyBagnall Oct 13, 2021
5503c54
formatting 4
TonyBagnall Oct 13, 2021
50ca231
formatting 6?
TonyBagnall Oct 13, 2021
f7dc729
blank lines or no blank lines?
TonyBagnall Oct 13, 2021
5919fbb
remove unnecessary argument to get_tag
TonyBagnall Oct 14, 2021
4bdb457
negate tag correctly, remove unnecessary get_tag argument
TonyBagnall Oct 14, 2021
721052f
Merge branch 'classifier_base_class' of https://github.com/alan-turin…
TonyBagnall Oct 14, 2021
6accbc6
correct tag negation
TonyBagnall Oct 14, 2021
aa1ca8e
Merge branch 'main' into classifier_base_class
TonyBagnall Oct 14, 2021
5734947
Merge branch 'main' into classifier_base_class
TonyBagnall Oct 15, 2021
cee784b
Merge branch 'main' into classifier_base_class
TonyBagnall Oct 15, 2021
d408408
_predict _predict_proba
TonyBagnall Oct 16, 2021
b17f9a7
formatting 1
TonyBagnall Oct 16, 2021
57347f7
formatting 2
TonyBagnall Oct 16, 2021
5af98a0
Merge branch 'main' into classifier_base_class
TonyBagnall Oct 16, 2021
8b29a99
HC comments an experiments fixes
MatthewMiddlehurst Oct 17, 2021
b31afcd
Update base.py
TonyBagnall Oct 19, 2021
26069f7
Update base.py
TonyBagnall Oct 19, 2021
2bb1d48
Merge branch 'main' of https://github.com/alan-turing-institute/sktim…
MatthewMiddlehurst Oct 19, 2021
27949f6
fix comments in get_tags
TonyBagnall Oct 19, 2021
a226422
Merge branch 'classifier_base_class' of https://github.com/alan-turin…
MatthewMiddlehurst Oct 19, 2021
031a18b
remove debug
TonyBagnall Oct 19, 2021
dee524b
get_tag comment revert
MatthewMiddlehurst Oct 19, 2021
feade0e
format 1
TonyBagnall Oct 19, 2021
8cdddff
Merge branch 'classifier_base_class' of https://github.com/alan-turin…
MatthewMiddlehurst Oct 19, 2021
87dfa72
doc consistency
MatthewMiddlehurst Oct 19, 2021
d85c4c6
change contributors on both base
TonyBagnall Oct 19, 2021
f0b64bc
class dictionary useage in predict_proba default
MatthewMiddlehurst Oct 19, 2021
77779dc
Merge branch 'classifier_base_class' of https://github.com/alan-turin…
MatthewMiddlehurst Oct 19, 2021
aa7f500
code quality
MatthewMiddlehurst Oct 19, 2021
a4b5b1a
Update base.py
TonyBagnall Oct 19, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -160,3 +160,4 @@ sktime/contrib/nothing_to_see_here.py
sktime/distances/elastic_cython.html
*.html
sktime/contrib/distance_based/old_time_series_neighbors.py
sktime/contrib/debug.py
9 changes: 5 additions & 4 deletions sktime/base/_base.py
Expand Up @@ -49,7 +49,6 @@ class name: BaseEstimator
__all__ = ["BaseEstimator", "BaseObject"]

import inspect

from copy import deepcopy

from sklearn import clone
Expand Down Expand Up @@ -142,17 +141,19 @@ def get_tag(self, tag_name, tag_value_default=None, raise_error=True):
tag_value_default : any type, optional; default=None
Default/fallback value if tag is not found
raise_error : bool
whether a ValueError is raised when the tag is not found
whether a ValueError is raised when the tag is not found and
tag_value_default is None
MatthewMiddlehurst marked this conversation as resolved.
Show resolved Hide resolved

Returns
-------
tag_value :
Value of the `tag_name` tag in self. If not found, returns
`tag_value_default`.
`tag_value_default`, or raises a ValueError if `tag_value_default` is None.

Raises
------
ValueError if raise_error is True and tag_name does not exist
ValueError if raise_error is True, tag_name does not exist and
`tag_value_default` is None.
i.e., if tag_name is not in self.get_tags().keys()
"""
collected_tags = self.get_tags()
Expand Down
158 changes: 86 additions & 72 deletions sktime/classification/base.py
Expand Up @@ -8,6 +8,7 @@ class name: BaseClassifier
Scitype defining methods:
fitting - fit(self, X, y)
predicting - predict(self, X)
- predict_proba(self, X)

State:
fitted model/strategy - by convention, any attributes ending in "_"
Expand All @@ -26,60 +27,39 @@ class name: BaseClassifier

__all__ = [
"BaseClassifier",
"classifier_list",
]
__author__ = ["mloning", "fkiraly"]
__author__ = ["mloning", "fkiraly", "TonyBagnall"]

import numpy as np

from sktime.base import BaseEstimator
from sktime.utils.validation import check_n_jobs
from sktime.utils.validation.panel import check_X, check_X_y

"""
Main list of classifiers extending this class. For clarity, some utility classifiers,
such as Proximity Stump, are not listed.
"""
classifier_list = [
# in classification/distance_based
"ProximityForest",
# "KNeighborsTimeSeriesClassifier",
# "ElasticEnsemble",
# "ShapeDTW",
# in classification/dictionary_based
"BOSS",
"ContractableBOSS",
"TemporalDictionaryEnsemble",
"WEASEL",
"MUSE",
# in classification/interval_based
"RandomIntervalSpectralForest",
"TimeSeriesForest",
"CanonicalIntervalForest",
# in classification/shapelet_based
"ShapeletTransformClassifier",
"ROCKET",
"MrSEQLClassifier",
]


class BaseClassifier(BaseEstimator):
"""Base time series classifier template class.

The base classifier specifies the methods and method
signatures that all forecasters have to implement.

Specific implementations of these methods is deferred to concrete
forecasters.
The base classifier specifies the methods and method signatures that all
classifiers have to implement.
"""

_tags = {
"coerce-X-to-numpy": True,
"coerce-X-to-pandas": False,
"capability:multivariate": False,
"capability:unequal_length": False,
"capability:missing_values": False,
"capability:train_estimate": False,
"capability:contractable": False,
"capability:multithreading": False,
}

def __init__(self):
self._is_fitted = False

self.classes_ = []
self.n_classes_ = 0
self._class_dictionary = {}
self._threads_to_use = 1
super(BaseClassifier, self).__init__()

def fit(self, X, y):
Expand All @@ -89,8 +69,7 @@ def fit(self, X, y):
----------
X : 3D np.array, array-like or sparse matrix
of shape = [n_instances,n_dimensions,series_length]
or shape = [n_instances,series_length]
or single-column pd.DataFrame with pd.Series entries
or pd.DataFrame with each column a dimension, each cell a pd.Series
y : array-like, shape = [n_instances] - the class labels.

Returns
Expand All @@ -103,12 +82,31 @@ def fit(self, X, y):
Changes state by creating a fitted model that updates attributes
ending in "_" and sets is_fitted flag to True.
"""
coerce_to_numpy = self.get_tag("coerce-X-to-numpy", False)
coerce_to_pandas = self.get_tag("coerce-X-to-pandas", False)
coerce_to_numpy = self.get_tag("coerce-X-to-numpy")
coerce_to_pandas = self.get_tag("coerce-X-to-pandas")
allow_multivariate = self.get_tag("capability:multivariate")
X, y = check_X_y(
X, y, coerce_to_numpy=coerce_to_numpy, coerce_to_pandas=coerce_to_pandas
X,
y,
coerce_to_numpy=coerce_to_numpy,
coerce_to_pandas=coerce_to_pandas,
enforce_univariate=not allow_multivariate,
)

multithread = self.get_tag("capability:multithreading")
if multithread:
try:
self._threads_to_use = check_n_jobs(self.n_jobs)
except NameError:
raise AttributeError(
"self.n_jobs must be set if capability:multithreading is True"
)

self.classes_ = np.unique(y)
self.n_classes_ = self.classes_.shape[0]
for index, classVal in enumerate(self.classes_):
self._class_dictionary[classVal] = index

self._fit(X, y)

# this should happen last
Expand All @@ -124,22 +122,25 @@ def predict(self, X):
X : 3D np.array, array-like or sparse matrix
of shape = [n_instances,n_dimensions,series_length]
or shape = [n_instances,series_length]
or single-column pd.DataFrame with pd.Series entries
or pd.DataFrame with each column a dimension, each cell a pd.Series

Returns
-------
y : array-like, shape = [n_instances] - predicted class labels
"""
coerce_to_numpy = self.get_tag("coerce-X-to-numpy", False)
coerce_to_pandas = self.get_tag("coerce-X-to-pandas", False)
X = check_X(
X, coerce_to_numpy=coerce_to_numpy, coerce_to_pandas=coerce_to_pandas
)
self.check_is_fitted()

y = self._predict(X)
coerce_to_numpy = self.get_tag("coerce-X-to-numpy")
coerce_to_pandas = self.get_tag("coerce-X-to-pandas")
allow_multivariate = self.get_tag("capability:multivariate")
X = check_X(
X,
coerce_to_numpy=coerce_to_numpy,
coerce_to_pandas=coerce_to_pandas,
enforce_univariate=not allow_multivariate,
)

return y
return self._predict(X)

def predict_proba(self, X):
"""Predicts labels probabilities for sequences in X.
Expand All @@ -149,18 +150,25 @@ def predict_proba(self, X):
X : 3D np.array, array-like or sparse matrix
of shape = [n_instances,n_dimensions,series_length]
or shape = [n_instances,series_length]
or single-column pd.DataFrame with pd.Series entries
or pd.DataFrame with each column a dimension, each cell a pd.Series

Returns
-------
y : array-like, shape = [n_instances, n_classes] - predictive pmf
y : array-like, shape = [n_instances, n_classes] - estimated class
probabilities
"""
coerce_to_numpy = self.get_tag("coerce-X-to-numpy", False)
coerce_to_pandas = self.get_tag("coerce-X-to-pandas", False)
self.check_is_fitted()

coerce_to_numpy = self.get_tag("coerce-X-to-numpy")
coerce_to_pandas = self.get_tag("coerce-X-to-pandas")
allow_multivariate = self.get_tag("capability:multivariate")
X = check_X(
X, coerce_to_numpy=coerce_to_numpy, coerce_to_pandas=coerce_to_pandas
X,
coerce_to_numpy=coerce_to_numpy,
coerce_to_pandas=coerce_to_pandas,
enforce_univariate=not allow_multivariate,
)
self.check_is_fitted()

return self._predict_proba(X)

def score(self, X, y):
Expand All @@ -171,7 +179,7 @@ def score(self, X, y):
X : 3D np.array, array-like or sparse matrix
of shape = [n_instances,n_dimensions,series_length]
or shape = [n_instances,series_length]
or single-column pd.DataFrame with pd.Series entries
or pd.DataFrame with each column a dimension, each cell a pd.Series
y : array-like, shape = [n_instances] - the class labels

Returns
Expand All @@ -185,14 +193,14 @@ def score(self, X, y):
def _fit(self, X, y):
"""Fit time series classifier to training data.

core logic
Abstract method

Parameters
----------
X : 3D np.array, array-like or sparse matrix
of shape = [n_instances,n_dimensions,series_length]
or shape = [n_instances,series_length]
or single-column pd.DataFrame with pd.Series entries
or pd.DataFrame with each column a dimension, each cell a pd.Series
y : array-like, shape = [n_instances] - the class labels

Returns
Expand All @@ -205,46 +213,52 @@ def _fit(self, X, y):
Changes state by creating a fitted model that updates attributes
ending in "_" and sets is_fitted flag to True.
"""
raise NotImplementedError("abstract method")
raise NotImplementedError(
"_fit is a protected abstract method, it must be implemented."
)

def _predict(self, X):
"""Predicts labels for sequences in X.

core logic
Abstract method, must be implemented.
MatthewMiddlehurst marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
X : 3D np.array, array-like or sparse matrix
of shape = [n_instances,n_dimensions,series_length]
or shape = [n_instances,series_length]
or single-column pd.DataFrame with pd.Series entries
or pd.DataFrame with each column a dimension, each cell a pd.Series

Returns
-------
y : array-like, shape = [n_instances] - predicted class labels
"""
distributions = self.predict_proba(X)
predictions = []
for instance_index in range(0, X.shape[0]):
distribution = distributions[instance_index]
prediction = np.argmax(distribution)
predictions.append(prediction)
y = self.label_encoder.inverse_transform(predictions)

return y
raise NotImplementedError(
"_predict is a protected abstract method, it must be implemented."
)

def _predict_proba(self, X):
"""Predicts labels probabilities for sequences in X.

Default behaviour is to call _predict and set the predicted class probability
to 1, other class probabilities to 0. Override if better estimates are
obtainable.

Parameters
----------
X : 3D np.array, array-like or sparse matrix
of shape = [n_instances,n_dimensions,series_length]
or shape = [n_instances,series_length]
or single-column pd.DataFrame with pd.Series entries
or pd.DataFrame with each column a dimension, each cell a pd.Series

Returns
-------
y : array-like, shape = [n_instances, n_classes] - predictive pmf
y : array-like, shape = [n_instances, n_classes] - estimated probabilities
of class membership.
"""
raise NotImplementedError("abstract method")
dists = np.zeros((X.shape[0], self.n_classes_))
preds = self._predict(X)
for i in range(0, X.shape[0]):
dists[i, np.where(self.classes_ == preds[i])] = 1

return dists
15 changes: 10 additions & 5 deletions sktime/registry/_tags.py
Expand Up @@ -43,7 +43,6 @@

import pandas as pd


ESTIMATOR_TAG_REGISTER = [
(
"ignores-exogeneous-X",
Expand Down Expand Up @@ -89,13 +88,13 @@
),
(
"X-y-must-have-same-index",
["forecaster", "classifier", "regressor"],
["forecaster", "regressor"],
"bool",
"do X/y in fit/update and X/fh in predict have to be same indices?",
),
(
"enforce_index_type",
["forecaster", "classifier", "regressor"],
["forecaster", "regressor"],
"type",
"passed to input checks, input conversion index type to enforce",
),
Expand Down Expand Up @@ -139,13 +138,13 @@
"capability:multivariate",
"classifier",
"bool",
"can classifier classify time series with 2 or more variables?",
"can the classifier classify time series with 2 or more variables?",
),
(
"capability:unequal_length",
"classifier",
"bool",
"can classifier handle unequal length time series?",
"can the classifier handle unequal length time series?",
),
# "capability:missing_values" is same as "handles-missing-data" tag.
# They are kept distinct intentionally for easier TSC refactoring.
Expand All @@ -168,6 +167,12 @@
"bool",
"contract time setting, i.e. does the estimator support limiting max fit time?",
),
(
"capability:multithreading",
"classifier",
"bool",
"can the classifier set n_jobs to use multiple threads?",
),
(
"coerce-X-to-pandas",
["classifier", "transformer"],
Expand Down