-
-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
_shapelet_transform_pyts.py
197 lines (165 loc) · 7.94 KB
/
_shapelet_transform_pyts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
"""Shapelet transformer, from pyts."""
# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
__author__ = ["Abhay-Lejith"]
__all__ = ["ShapeletTransformPyts"]
from sktime.base.adapters._pyts import _PytsAdapter
from sktime.transformations.base import BaseTransformer
class ShapeletTransformPyts(_PytsAdapter, BaseTransformer):
"""Shapelet Transform, from ``pyts``.
Direct interface to ``pyts.transformation.ShapeletTransform``.
The Shapelet Transform algorithm extracts shapelets from a data set of time series
and returns the distances between the shapelets and the time series. A shapelet is
defined as a subset of a time series, that is a set of values from consecutive time
points. The distance between a shapelet and a time series is defined as the minimum
of the distances between this shapelet and all the shapelets of same length
extracted from this time series. The most discriminative shapelets are selected.
Two criteria are made available: mutual information and F-scores.
Parameters
----------
n_shapelets : int or 'auto' (default = 'auto')
The number of shapelets to keep. If 'auto', `n_timestamps // 2`
shapelets are considered, where `n_timestamps` is the number of
time points in the dataset. Note that there might be a smaller
number of shapelets if fewer than ``n_shapelets`` shapelets have been
extracted during the search.
criterion : 'mutual_info' or 'anova' (default = 'mutual_info')
Criterion to perform the selection of the shapelets.
'mutual_info' uses the mutual information, while 'anova' use
the ANOVA F-value.
window_sizes : array-like or 'auto '(default = 'auto')
Size of the sliding windows. If 'auto', the range for the
window sizes is determined automatically.
Otherwise, all the elements must be either integers or floats.
In the latter case, each element represents the percentage
of the size of each time series and must be between 0 and 1; the size
of the sliding windows will be computed as
``np.ceil(window_sizes * n_timestamps)``.
window_steps : None or array-like (default = None)
Step of the sliding windows. If None, each ``window_step`` is equal
to 1. Otherwise, all the elements must be either integers or floats.
In the latter case, each element represents the percentage of the size
of each time series and must be between 0 and 1; the step of the
sliding windows will be computed as
``np.ceil(window_steps * n_timestamps)``.
Must be None if ``window_sizes='auto'``.
remove_similar : bool (default = True)
If True, self-similar shapelets are removed, keeping only
the non-self-similar shapelets with the highest scores. Two
shapelets are considered to be self-similar if they are
taken from the the same time series and have at least one
overlapping index.
sort : bool (default = False)
If True, shapelets are sorted in descending order according to
their associated scores. If False, the order is undefined.
verbose : int (default = 0)
Verbosity level when fitting: if non zero, progress messages are
printed. Above 50, the output is sent to stdout.
The frequency of the messages increases with the verbosity level.
random_state : int, RandomState instance or None (default = None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by ``np.random``. Only used if ``window_sizes='auto'`` in order to
subsample the dataset to find the best range or if
``criterion=='mutual_info'`` to add small noise to the data.
n_jobs : None or int (default = None)
The number of jobs to run in parallel for ``fit``.
If -1, then the number of jobs is set to the number of cores.
Attributes
----------
shapelets_ : array, shape = (n_shapelets,)
The array with the selected shapelets.
indices_ : array, shape = (n_shapelets, 3)
The indices for the corresponding shapelets in the training set.
The first column consists of the indices of the samples.
The second column consists of the starting indices (included)
of the shapelets. The third column consists of the ending indices
(excluded) of the shapelets.
scores_ : array, shape = (n_shapelets,)
The scores associated to the shapelets. The higher, the more
discriminant.
If ``criterion='mutual_info'``, mutual information scores are reported.
If ``criterion='anova'``, F-scores are reported.
window_range_ : None or tuple
Range of the window sizes if ``window_sizes='auto'``. None otherwise.
References
----------
.. [1] J. Lines, L. M. Davis, J. Hills and A. Bagnall, "A Shapelet
Transform for Time Series Classification". Data Mining and Knowledge
Discovery, 289-297 (2012).
Examples
--------
>>> from sktime.transformations.panel.shapelet_transform import (
... ShapeletTransformPyts
... )
>>> from sktime.datasets import load_unit_test
>>> X_train, y_train = load_unit_test(split="train") # doctest: +SKIP
>>> stp = ShapeletTransformPyts() # doctest: +SKIP
>>> stp.fit(X_train,y_train) # doctest: +SKIP
>>> stp.transform(X_train) # doctest: +SKIP
"""
_tags = {
# packaging info
# --------------
"authors": ["johannfaouzi", "Abhay-Lejith"],
# johannfaouzi is author of upstream pyts code
"python_dependencies": "pyts",
# univariate-only controls whether internal X can be univariate/multivariate
# if True (only univariate), always applies vectorization over variables
"univariate-only": True,
"scitype:transform-input": "Series",
"scitype:transform-output": "Primitives",
"scitype:instancewise": False,
"fit_is_empty": False,
"y_inner_mtype": "numpy1D",
"requires_y": True,
}
_estimator_attr = "_pyts_shapelet_transform"
def _get_pyts_class(self):
"""Get pyts class.
should import and return pyts class
"""
from pyts.transformation.shapelet_transform import ShapeletTransform
return ShapeletTransform
def __init__(
self,
n_shapelets="auto",
criterion="mutual_info",
window_sizes="auto",
window_steps=None,
remove_similar=True,
sort=False,
verbose=0,
random_state=None,
n_jobs=None,
):
self.n_shapelets = n_shapelets
self.criterion = criterion
self.window_sizes = window_sizes
self.window_steps = window_steps
self.remove_similar = remove_similar
self.sort = sort
self.verbose = verbose
self.random_state = random_state
self.n_jobs = n_jobs
super().__init__()
@classmethod
def get_test_params(cls, parameter_set="default"):
"""
Return testing parameter settings for the estimator.
Parameters
----------
parameter_set : str, default="default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return `"default"` set.
Returns
-------
params : dict or list of dict, default = {}
Parameters to create testing instances of the class
Each dict are parameters to construct an "interesting" test instance, i.e.,
`MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
`create_test_instance` uses the first (or only) dictionary in `params`
"""
params1 = {"criterion": "anova", "n_shapelets": 10}
params2 = {"window_sizes": [3]}
return [params1, params2]