/
k_medoids.py
executable file
·196 lines (175 loc) · 7.21 KB
/
k_medoids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
"""Time series kmedoids."""
__author__ = ["chrisholder", "TonyBagnall"]
from typing import Callable, Union
import numpy as np
from numpy.random import RandomState
from sktime.clustering.metrics.medoids import medoids
from sktime.clustering.partitioning import TimeSeriesLloyds
from sktime.distances import pairwise_distance
class TimeSeriesKMedoids(TimeSeriesLloyds):
"""Time series K-medoids implementation.
Parameters
----------
n_clusters: int, defaults = 8
The number of clusters to form as well as the number of
centroids to generate.
init_algorithm: str, np.ndarray (3d array of shape (n_clusters, n_dimensions,
series_length)), defaults = 'forgy'
Method for initializing cluster centers or an array of initial cluster centers.
If string, any of the following strings are valid:
['kmeans++', 'random', 'forgy'].
If 3D np.ndarray, initializes cluster centers with the provided array. The array
must have shape (n_clusters, n_dimensions, series_length) and the number of
clusters in the array must be the same as what is provided to the n_clusters
argument.
metric: str or Callable, defaults = 'dtw'
Distance metric to compute similarity between time series. Any of the following
are valid: ['dtw', 'euclidean', 'erp', 'edr', 'lcss', 'squared', 'ddtw', 'wdtw',
'wddtw']
n_init: int, defaults = 10
Number of times the k-means algorithm will be run with different
centroid seeds. The final result will be the best output of n_init
consecutive runs in terms of inertia.
max_iter: int, defaults = 30
Maximum number of iterations of the k-means algorithm for a single
run.
tol: float, defaults = 1e-6
Relative tolerance with regards to Frobenius norm of the difference
in the cluster centers of two consecutive iterations to declare
convergence.
verbose: bool, defaults = False
Verbosity mode.
random_state: int or np.random.RandomState instance or None, defaults = None
Determines random number generation for centroid initialization.
distance_params: dict, defaults = None
Dictionary containing kwargs for the distance metric being used.
Attributes
----------
cluster_centers_: np.ndarray (3d array of shape (n_clusters, n_dimensions,
series_length))
Time series that represent each of the cluster centers. If the algorithm stops
before fully converging these will not be consistent with labels_.
labels_: np.ndarray (1d array of shape (n_instance,))
Labels that is the index each time series belongs to.
inertia_: float
Sum of squared distances of samples to their closest cluster center, weighted by
the sample weights if provided.
n_iter_: int
Number of iterations run.
Examples
--------
>>> from sktime.datasets import load_arrow_head
>>> from sktime.clustering.k_medoids import TimeSeriesKMedoids
>>> X_train, y_train = load_arrow_head(split="train")
>>> X_test, y_test = load_arrow_head(split="test")
>>> clusterer = TimeSeriesKMedoids(n_clusters=3) # doctest: +SKIP
>>> clusterer.fit(X_train) # doctest: +SKIP
TimeSeriesKMedoids(n_clusters=3)
>>> y_pred = clusterer.predict(X_test) # doctest: +SKIP
"""
_tags = {
# packaging info
# --------------
"authors": ["chrisholder", "TonyBagnall"],
"python_dependencies": "numba",
}
def __init__(
self,
n_clusters: int = 8,
init_algorithm: Union[str, Callable] = "random",
metric: Union[str, Callable] = "dtw",
n_init: int = 10,
max_iter: int = 300,
tol: float = 1e-6,
verbose: bool = False,
random_state: Union[int, RandomState] = None,
distance_params: dict = None,
):
self._precomputed_pairwise = None
super().__init__(
n_clusters,
init_algorithm,
metric,
n_init,
max_iter,
tol,
verbose,
random_state,
distance_params,
)
def _fit(self, X: np.ndarray, y=None) -> np.ndarray:
"""Fit time series clusterer to training data.
Parameters
----------
X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
(n_instances, n_dimensions, series_length))
Training time series instances to cluster.
y: ignored, exists for API consistency reasons.
Returns
-------
self:
Fitted estimator.
"""
self._check_params(X)
self._precomputed_pairwise = pairwise_distance(
X, metric=self.metric, **self._distance_params
)
return super()._fit(X, y)
def _compute_new_cluster_centers(
self, X: np.ndarray, assignment_indexes: np.ndarray
) -> np.ndarray:
"""Compute new centers.
Parameters
----------
X : np.ndarray (3d array of shape (n_instances, n_dimensions, series_length))
Time series instances to predict their cluster indexes.
assignment_indexes: np.ndarray
Indexes that each time series in X belongs to.
Returns
-------
np.ndarray (3d of shape (n_clusters, n_dimensions, series_length)
New cluster center values.
"""
new_centers = np.zeros((self.n_clusters, X.shape[1], X.shape[2]))
for i in range(self.n_clusters):
curr_indexes = np.where(assignment_indexes == i)[0]
distance_matrix = np.zeros((len(curr_indexes), len(curr_indexes)))
for j in range(len(curr_indexes)):
curr_j = curr_indexes[j]
for k in range(len(curr_indexes)):
distance_matrix[j, k] = self._precomputed_pairwise[
curr_j, curr_indexes[k]
]
result = medoids(
X[curr_indexes], precomputed_pairwise_distance=distance_matrix
)
if result.shape[0] > 0:
new_centers[i, :] = result
return new_centers
@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.
Parameters
----------
parameter_set : str, default="default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return ``"default"`` set.
Returns
-------
params : dict or list of dict, default = {}
Parameters to create testing instances of the class
Each dict are parameters to construct an "interesting" test instance, i.e.,
``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test
instance.
``create_test_instance`` uses the first (or only) dictionary in ``params``
"""
return {
"n_clusters": 2,
"init_algorithm": "random",
"metric": "euclidean",
"n_init": 1,
"max_iter": 1,
"tol": 0.0001,
"verbose": False,
"random_state": 1,
}