-
-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
_minirocket_multivariate_variable.py
335 lines (284 loc) · 11.9 KB
/
_minirocket_multivariate_variable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
"""Multivariate MiniRocket transformer for panels of unequal-length series."""
# Contributors credited for this module.
__author__ = ["angus924", "michaelfeil"]
# Public API of the module: only the transformer class is exported.
__all__ = ["MiniRocketMultivariateVariable"]
import multiprocessing
import warnings
from typing import List, Union
import numpy as np
import pandas as pd
from sktime.transformations.base import BaseTransformer
class MiniRocketMultivariateVariable(BaseTransformer):
    """MINIROCKET (Multivariate, unequal length).

    MINImally RandOm Convolutional KErnel Transform. [1]_

    **Multivariate** and **unequal length**

    A provisional and naive extension of MINIROCKET to multivariate input
    with unequal length provided by the authors [2]_ . For better
    performance, use the sktime class MiniRocket for univariate input,
    and MiniRocketMultivariate for equal length multivariate input.

    This transformer fits one set of parameters per individual series,
    and applies the transform with fitted parameter i to the i-th series
    in transform.
    Vanilla use requires same number of series in fit and transform.

    To fit and transform series at the same time,
    without an identification of fit/transform instances,
    wrap this transformer in ``FitInTransform``,
    from ``sktime.transformations.compose``.

    Parameters
    ----------
    num_kernels : int, default=10,000
        number of random convolutional kernels. Forwarded to the fitting
        routine as the requested number of output features; the number of
        columns actually produced may be lower (9996 for the default,
        see Examples).
    max_dilations_per_kernel : int, default=32
        maximum number of dilations per kernel.
    reference_length : int or str, default = ``'max'``
        series-length of reference, str defines how to infer from X during 'fit'.
        options are ``'max'``, ``'mean'``, ``'median'``, ``'min'``.
        An int must be >= 9; an inferred value below 9 is raised to 9.
    pad_value_short_series : float or None, default=None
        value used to pad series with fewer than 9 time points up to length 9.
        if None, no padding is performed and such series raise a ValueError.
    n_jobs : int, default=1
        The number of jobs to run in parallel for ``transform``. ``-1`` means using all
        processors. Any value below 1 or above the CPU count uses all processors.
    random_state : None or int, default = None
        seed passed to the fitting routine. An int is converted to ``np.int32``;
        any other non-None type raises a ValueError at construction.

    Examples
    --------
    >>> from sktime.transformations.panel.rocket import MiniRocketMultivariateVariable
    >>> from sktime.datasets import load_japanese_vowels
    >>> # load multivariate and unequal length dataset
    >>> X_train, _ = load_japanese_vowels(split="train", return_X_y=True)
    >>> X_test, _ = load_japanese_vowels(split="test", return_X_y=True)
    >>> pre_clf = MiniRocketMultivariateVariable(
    ...     pad_value_short_series=0.0
    ... ) # doctest: +SKIP
    >>> pre_clf.fit(X_train, y=None) # doctest: +SKIP
    MiniRocketMultivariateVariable(...)
    >>> X_transformed = pre_clf.transform(X_test) # doctest: +SKIP
    >>> X_transformed.shape # doctest: +SKIP
    (370, 9996)

    Raises
    ------
    ValueError
        If any multivariate series_length in X is < 9 and
        pad_value_short_series is set to None

    See Also
    --------
    MultiRocket, MiniRocket, MiniRocketMultivariate, Rocket

    References
    ----------
    .. [1] Angus Dempster, Daniel F Schmidt, Geoffrey I Webb
        MINIROCKET: A Very Fast (Almost) Deterministic Transform for
        Time Series Classification, 2020, arXiv:2012.08791

    .. [2] Angus Dempster, Daniel F Schmidt, Geoffrey I Webb
        https://github.com/angus924/minirocket
    """

    _tags = {
        "authors": ["angus924", "michaelfeil"],
        "maintainers": ["angus924", "michaelfeil"],
        "univariate-only": False,
        "fit_is_empty": False,
        "scitype:transform-input": "Series",
        "scitype:transform-output": "Primitives",
        "capability:unequal_length": True,
        "scitype:transform-labels": "None",
        "scitype:instancewise": False,  # is this an instance-wise transform?
        "X_inner_mtype": "df-list",  # which mtypes do _fit/_predict support for X?
        "y_inner_mtype": "None",  # which mtypes do _fit/_predict support for y?
        "requires_y": False,
        "python_dependencies": "numba",
    }

    def __init__(
        self,
        num_kernels=10000,
        max_dilations_per_kernel=32,
        reference_length="max",
        pad_value_short_series=None,
        n_jobs=1,
        random_state=None,
    ):
        self.num_kernels = num_kernels
        self.max_dilations_per_kernel = max_dilations_per_kernel
        self.reference_length = reference_length
        # resolved in _fit from reference_length and the observed series lengths
        self._fitted_reference_length = None
        self.pad_value_short_series = pad_value_short_series
        self.n_jobs = n_jobs

        self.random_state = random_state
        # random_state_ is the value actually handed to the fitting routine as seed
        if random_state is None:
            self.random_state_ = random_state
        elif isinstance(random_state, int):
            self.random_state_ = np.int32(random_state)
        else:
            raise ValueError(
                f"random_state in MiniRocketMultivariateVariable must be int or None, "
                f"but found <{type(random_state)} {random_state}>"
            )

        self._reference_modes = ["max", "mean", "median", "min"]
        is_valid_int = isinstance(reference_length, int) and reference_length >= 9
        is_valid_mode = (
            isinstance(reference_length, str)
            and reference_length in self._reference_modes
        )
        if not (is_valid_int or is_valid_mode):
            raise ValueError(self._invalid_reference_length_message(reference_length))

        super().__init__()

    @staticmethod
    def _invalid_reference_length_message(reference_length):
        """Build the error text for an unsupported ``reference_length`` value."""
        return (
            "reference_length in MiniRocketMultivariateVariable must be int>=9 or "
            "'max', 'mean', 'median', 'min', but found reference_length="
            f"{reference_length}"
        )

    def _fit(self, X: List[pd.DataFrame], y=None):
        """Fits dilations and biases to input time series.

        Parameters
        ----------
        X : list of pd.DataFrame
            List of length n_instances; each DataFrame has series_length rows
            and n_dimensions columns. n_dimensions is equal across all
            instances in ``X``, series_length may differ between instances.
        y : ignored argument for interface compatibility

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If any multivariate series_length in X is < 9 and
            pad_value_short_series is set to None, or if
            ``reference_length`` is neither an int nor a supported mode.
        """
        from sktime.transformations.panel.rocket._minirocket_multi_var_numba import (
            _fit_multi_var,
        )

        X_2d_t, lengths_1darray = _nested_dataframe_to_transposed2D_array_and_len_list(
            X, pad=self.pad_value_short_series
        )

        if isinstance(self.reference_length, int):
            _reference_length = self.reference_length
        elif self.reference_length in self._reference_modes:
            # np.mean, np.max, np.median, np.min ..
            _reference_length = getattr(np, self.reference_length)(lengths_1darray)
        else:
            raise ValueError(
                self._invalid_reference_length_message(self.reference_length)
            )
        # never go below the minimum supported series length of 9
        self._fitted_reference_length = int(max(9, _reference_length))

        # defensive re-check: the conversion above is expected to have padded or
        # rejected short series already
        if lengths_1darray.min() < 9:
            failed_index = np.where(lengths_1darray < 9)[0]
            raise ValueError(
                f"X must be >= 9 for all samples, but found minimum to be "
                f"{lengths_1darray.min()}; at index {failed_index}, pad shorter "
                "series so that n_timepoints >= 9 for all samples."
            )

        if lengths_1darray.min() == lengths_1darray.max():
            warnings.warn(
                "X is of equal length, consider using MiniRocketMultivariate for "
                "speedup and stability instead.",
                stacklevel=2,
            )
        # first axis of the transposed array is the number of dimensions
        if X_2d_t.shape[0] == 1:
            warnings.warn(
                "X is univariate, consider using MiniRocket as Univariante for "
                "speedup and stability instead.",
                stacklevel=2,
            )

        self.parameters = _fit_multi_var(
            X_2d_t,
            L=lengths_1darray,
            reference_length=self._fitted_reference_length,
            num_features=self.num_kernels,
            max_dilations_per_kernel=self.max_dilations_per_kernel,
            seed=self.random_state_,
        )
        return self

    def _transform(self, X, y=None):
        """Transform input time series.

        Parameters
        ----------
        X : list of pd.DataFrame
            List of length n_instances; each DataFrame has series_length rows
            and n_dimensions columns.
        y : ignored argument for interface compatibility

        Returns
        -------
        pandas.DataFrame, size (n_instances, num_kernels)

        Raises
        ------
        ValueError
            If any multivariate series_length in X is < 9 and
            pad_value_short_series is set to None
        """
        from numba import get_num_threads, set_num_threads

        from sktime.transformations.panel.rocket._minirocket_multi_var_numba import (
            _transform_multi_var,
        )

        X_2d_t, L = _nested_dataframe_to_transposed2D_array_and_len_list(
            X, pad=self.pad_value_short_series
        )

        # change n_jobs depended on value and existing cores
        prev_threads = get_num_threads()
        cpu_count = multiprocessing.cpu_count()
        if self.n_jobs < 1 or self.n_jobs > cpu_count:
            n_jobs = cpu_count
        else:
            n_jobs = self.n_jobs

        set_num_threads(n_jobs)
        try:
            X_ = _transform_multi_var(X_2d_t, L, self.parameters)
        finally:
            # the numba thread count is global state: restore it even when the
            # transform raises, so later numba calls are not affected
            set_num_threads(prev_threads)
        return pd.DataFrame(X_)
def _nested_dataframe_to_transposed2D_array_and_len_list(
    X: List[pd.DataFrame], pad: Union[int, float, None] = 0
):
    """Convert a list of per-instance tables to a 2D array and an array of lengths.

    All instances are stacked along the time axis and the result is transposed,
    so that each row of the output holds one dimension across all instances.

    Parameters
    ----------
    X : list or tuple of pd.DataFrame (or 2D np.ndarray)
        Sequence of length n_instances, with
        tables of series_length-rows and n_dimensions-columns
    pad : float, int or None, default=0
        if float/int, series shorter than 9 rows are extended with rows
        filled with ``pad`` so that each series has at least length 9.
        if None, no padding is applied and short series raise an error.

    Returns
    -------
    np.array: 2D array of shape =
        [n_dimensions, sum(length_series(i) for i in n_instances)],
        np.float32
    np.array: 1D array of shape = [n_instances]
        with length of each series (after padding), np.int32

    Raises
    ------
    ValueError
        If X is empty, is not a list/tuple of 2D tables, has instances with
        differing n_dimensions, or if any series_length in X is < 9 and
        ``pad`` is set to None
    """
    if not len(X):
        raise ValueError("X is empty")

    # np.ndarray is the array *type*; np.array is a factory function and is
    # not valid as an isinstance target
    if not (
        isinstance(X, (tuple, list))
        and isinstance(X[0], (pd.DataFrame, np.ndarray))
    ):
        raise ValueError("X must be List of pd.DataFrame")

    first = X[0]
    same_layout = first.ndim == 2 and all(
        getattr(_x, "ndim", None) == 2 and _x.shape[1] == first.shape[1] for _x in X
    )
    if not same_layout:
        raise ValueError(
            "X must be nested pd.DataFrame or List of pd.DataFrame with n_dimensions"
        )

    min_length = 9
    vec = []
    lengths = []
    for _x in X:
        n_timepoints, n_dimensions = _x.shape
        # plain arrays have no ``.values`` attribute
        values = _x.values if isinstance(_x, pd.DataFrame) else np.asarray(_x)

        if n_timepoints >= min_length:
            lengths.append(n_timepoints)
            vec.append(values)
            continue

        if pad is None:
            raise ValueError(
                "X series_length must be >= 9 for all samples "
                f"but sample with series_length {n_timepoints} found. Consider"
                " padding, discard, or setting a pad_value_short_series value"
            )

        # emergency: append rows filled with the pad value up to length 9.
        filler = np.full([min_length - n_timepoints, n_dimensions], float(pad))
        lengths.append(min_length)
        vec.append(np.vstack([values, filler]))

    X_2d_t = np.vstack(vec).T.astype(dtype=np.float32)
    lengths = np.array(lengths, dtype=np.int32)

    # sanity check: the stacked time axis must account for every series
    if not lengths.sum() == X_2d_t.shape[1]:
        raise ValueError("X_new and lengths do not match. check input dimension")

    return X_2d_t, lengths