Skip to content

Commit

Permalink
More clustering algos implemented
Browse files Browse the repository at this point in the history
  • Loading branch information
rtavenar committed Jun 12, 2017
1 parent ce793fb commit a92401c
Show file tree
Hide file tree
Showing 6 changed files with 286 additions and 49 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,15 @@ Also, for the whole package to run properly, its base directory should be append
* A `neighbors` module includes nearest neighbor algorithms to be used with time series
* A `clustering` module includes the following time series clustering algorithms:
* Standard Euclidean k-means (based on `sklearn.cluster.KMeans` with adequate array reshaping done for you)
* DBA k-means from Petitjean _et al._
* Global Alignment kernel k-means

# TODO list

* Add soft-DTW to the proposed metrics
* Implement Learning Shapelets from Grabocka et al. (Conv+L2, + unsupervised)
* Add local feature extractors (`TransformerMixin`)
* Add k-means DBA by Petitjean _et al._ (barycenter already coded) and soft-DTW k-means by Cuturi and Blondel
* Add soft-DTW k-means by Cuturi and Blondel
* Add metric learning for time series (Garreau _et al._)
* Add automatic retrieval of UCR/UEA datasets and 1M remote sensing time series
* Add LB_Keogh for nearest neighbor search
Expand Down
8 changes: 4 additions & 4 deletions tslearn/barycenters.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ class DTWBarycenterAveraging:
barycenter_size : int or None, default None
Size of the barycenter to generate. If None, the size of the barycenter is that of the data provided at fit
time.
tol : float, default 1e-5
tol : float (default 1e-5)
Tolerance to use for early stopping: if the decrease in cost is lower than this value, the EM procedure stops.
verbose : boolean, default False
verbose : boolean (default False)
Whether to print information about the cost at each iteration or not.
References
Expand Down Expand Up @@ -72,7 +72,7 @@ def fit(self, X):
cost_prev, cost = numpy.inf, numpy.inf
for it in range(self.n_iter):
assign = self._petitjean_assignment(X_, barycenter)
barycenter = self._petitjean_update_b(X_, assign)
barycenter = self._petitjean_update_barycenter(X_, assign)
cost = self._petitjean_cost(X_, barycenter, assign)
if self.verbose:
print("[DBA] epoch %d, cost: %.3f" % (it + 1, cost))
Expand Down Expand Up @@ -101,7 +101,7 @@ def _petitjean_assignment(self, X, barycenter):
assign[1][pair[1]].append(pair[0])
return assign

def _petitjean_update_b(self, X, assign):
def _petitjean_update_barycenter(self, X, assign):
barycenter = numpy.zeros((self.barycenter_size, X.shape[-1]))
for t in range(self.barycenter_size):
barycenter[t] = X[assign[0][t], assign[1][t]].mean(axis=0)
Expand Down
174 changes: 170 additions & 4 deletions tslearn/clustering.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,26 @@
import numpy
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.cluster import KMeans
from sklearn.cluster.k_means_ import _k_init
from sklearn.utils import check_random_state
from sklearn.utils.extmath import row_norms
from sklearn.utils.validation import check_is_fitted
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

from tslearn.metrics import cdist_gak
from tslearn.metrics import cdist_gak, cdist_dtw
from tslearn.barycenters import EuclideanBarycenter, DTWBarycenterAveraging
from tslearn.utils import npy3d_time_series_dataset


__author__ = 'Romain Tavenard romain.tavenard[at]univ-rennes2.fr'
# Derived from https://gist.github.com/mblondel/6230787 by Mathieu Blondel
# Kernel k-means is derived from https://gist.github.com/mblondel/6230787 by Mathieu Blondel
# License: BSD 3 clause

class EmptyClusterError(Exception):
    """Error signalling that a clustering step left a cluster without any sample.

    Parameters
    ----------
    message : str (default: "")
        Human-readable description forwarded to the base `Exception`.
    """

    def __init__(self, message=""):
        Exception.__init__(self, message)


class GlobalAlignmentKernelKMeans(BaseEstimator, ClusterMixin):
"""Global Alignment Kernel K-means.
Expand Down Expand Up @@ -84,6 +92,9 @@ def _fit_one_init(self, K, rs):
dist.fill(0)
self._compute_dist(K, dist)
self.labels_ = dist.argmin(axis=1)
for k in range(self.n_clusters):
if numpy.sum(self.labels_ == k) == 0:
raise EmptyClusterError
self.inertia_ = self._compute_inertia(dist)
if self.verbose:
print("%.3f" % self.inertia_, end=" --> ")
Expand Down Expand Up @@ -125,7 +136,7 @@ def fit(self, X, y=None, sample_weight=None):
last_correct_labels = self.labels_
min_inertia = self.inertia_
n_successful += 1
except ValueError:
except EmptyClusterError:
if self.verbose:
print("Resumed because of empty cluster")
if n_successful > 0:
Expand Down Expand Up @@ -173,7 +184,7 @@ def predict(self, X):
return dist.argmin(axis=1)


class TimeSeriesKMeans(KMeans):
class TimeSeriesKMeansOld(KMeans):
"""Standard Euclidean K-Means clustering for time series data.
Parameters
Expand Down Expand Up @@ -348,3 +359,158 @@ def predict(self, X):
n_ts, sz, d = X.shape
dists = euclidean_distances(X.reshape((n_ts, -1)), self.cluster_centers_.reshape((self.n_clusters, -1)))
return numpy.argmin(dists, axis=1)


class TimeSeriesKMeans(BaseEstimator, ClusterMixin):
    """K-means clustering for time-series data.

    Parameters
    ----------
    n_clusters : int (default: 3)
        Number of clusters to form.
    max_iter : int (default: 50)
        Maximum number of iterations of the k-means algorithm for a single run.
    tol : float (default: 1e-6)
        Inertia variation threshold. If at some point, inertia varies less than this threshold between two consecutive
        iterations, the model is considered to have converged and the algorithm stops.
    n_init : int (default: 1)
        Number of times the k-means algorithm will be run with different centroid seeds. The final results will be the
        best output of n_init consecutive runs in terms of inertia.
    metric : {"euclidean", "dtw"} (default: "euclidean")
        Metric to be used for both cluster assignment and barycenter computation. If "dtw", DBA is used for barycenter
        computation.
    n_iter_dba : int (default: 100)
        Number of iterations for the DBA barycenter computation process. Only used if `metric="dtw"`.
    verbose : bool (default: True)
        Whether or not to print information about the inertia while learning the model.
    random_state : integer or numpy.RandomState, optional
        Generator used to initialize the centers. If an integer is given, it fixes the seed. Defaults to the global
        numpy random number generator.

    Attributes
    ----------
    labels_ : numpy.ndarray
        Labels of each point.
    cluster_centers_ : numpy.ndarray
        Cluster centers.
    inertia_ : float
        Mean of squared distances of samples to their closest cluster center.
    """

    def __init__(self, n_clusters=3, max_iter=50, tol=1e-6, n_init=1, metric="euclidean", n_iter_dba=100, verbose=True,
                 random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.metric = metric
        self.n_init = n_init
        # Every constructor argument is kept as an attribute of the same name so that the estimator's
        # parameters can be introspected (the original dropped `n_iter_dba` after building `dba_`).
        self.n_iter_dba = n_iter_dba
        self.verbose = verbose

        self.labels_ = None
        self.inertia_ = numpy.inf
        self.cluster_centers_ = None
        # `X_fit_` doubles as the "successfully fitted" marker: it stays None until one init succeeds.
        self.X_fit_ = None

        if self.metric == "dtw":
            self.dba_ = DTWBarycenterAveraging(n_iter=n_iter_dba, barycenter_size=None, verbose=False)

    def _fit_one_init(self, X, x_squared_norms, rs):
        """Run one k-means trial from a fresh seeding of the centroids.

        Raises `EmptyClusterError` (through `_assign`) if an iteration leaves a cluster empty.
        """
        n_samples, sz, d = X.shape
        # Seeding is done on flattened series, then centroids are reshaped back to (n_clusters, sz, d).
        self.cluster_centers_ = _k_init(X.reshape((n_samples, -1)),
                                        self.n_clusters, x_squared_norms, rs).reshape((-1, sz, d))
        old_inertia = numpy.inf

        for it in range(self.max_iter):
            self._assign(X)
            self._update_centroids(X)
            if self.verbose:
                print("%.3f" % self.inertia_, end=" --> ")

            # Stop as soon as inertia no longer moves by more than `tol` between two iterations.
            if numpy.abs(old_inertia - self.inertia_) < self.tol:
                break
            old_inertia = self.inertia_
        if self.verbose:
            print("")

        return self

    def _distances(self, X):
        """Return the (n_ts, n_clusters) matrix of distances from each series in X to each current centroid."""
        if self.metric == "euclidean":
            return cdist(X.reshape((X.shape[0], -1)), self.cluster_centers_.reshape((self.n_clusters, -1)),
                         metric="euclidean")
        elif self.metric == "dtw":
            return cdist_dtw(X, self.cluster_centers_)
        raise ValueError("Incorrect metric: %s (should be one of 'dtw', 'euclidean')" % self.metric)

    def _assign(self, X):
        """Assign each series to its closest centroid and update `labels_` and `inertia_`.

        Raises `EmptyClusterError` if at least one cluster receives no sample.
        """
        dists = self._distances(X)
        self.labels_ = dists.argmin(axis=1)
        for k in range(self.n_clusters):
            if numpy.sum(self.labels_ == k) == 0:
                raise EmptyClusterError
        # Inertia is the mean, over samples, of the squared distance to the assigned centroid.
        self.inertia_ = numpy.sum(dists[numpy.arange(X.shape[0]), self.labels_] ** 2) / X.shape[0]

    def _update_centroids(self, X):
        """Recompute each centroid as the barycenter (Euclidean or DBA) of the series currently assigned to it."""
        for k in range(self.n_clusters):
            if self.metric == "euclidean":
                self.cluster_centers_[k] = EuclideanBarycenter().fit(X[self.labels_ == k])
            elif self.metric == "dtw":
                self.cluster_centers_[k] = self.dba_.fit(X[self.labels_ == k])
            else:
                raise ValueError("Incorrect metric: %s (should be one of 'dtw', 'euclidean')" % self.metric)

    def fit(self, X, y=None, sample_weight=None):
        """Compute k-means clustering.

        Parameters
        ----------
        X : array-like, shape=(n_ts, sz, d)
            Time series dataset.

        Returns
        -------
        self
            If every init failed because of an empty cluster, `X_fit_` is left to None.
        """
        n_successful = 0

        X_ = npy3d_time_series_dataset(X)
        rs = check_random_state(self.random_state)
        # Squared norm of each flattened series, obtained as its squared distance to the origin.
        x_squared_norms = cdist(X_.reshape((X_.shape[0], -1)), numpy.zeros((1, X_.shape[1] * X_.shape[2])),
                                metric="sqeuclidean").reshape((1, -1))

        last_correct_centroids = None
        min_inertia = numpy.inf
        for trial in range(self.n_init):
            try:
                if self.verbose:
                    print("Init %d" % (trial + 1))
                self._fit_one_init(X_, x_squared_norms, rs)
                if self.inertia_ < min_inertia:
                    last_correct_centroids = self.cluster_centers_.copy()
                    min_inertia = self.inertia_
                n_successful += 1
            except EmptyClusterError:
                if self.verbose:
                    print("Resumed because of empty cluster")
        if n_successful > 0:
            self.X_fit_ = X_
            # Restore the best run and recompute its labels, since later inits overwrote them.
            self.cluster_centers_ = last_correct_centroids
            self._assign(X_)
            self.inertia_ = min_inertia
        else:
            self.X_fit_ = None
        return self

    def predict(self, X):
        """Predict the closest cluster each time series in X belongs to.

        Parameters
        ----------
        X : array-like, shape=(n_ts, sz, d)
            Time series dataset to predict.

        Returns
        -------
        labels : array, shape=(n_ts, )
            Index of the cluster each sample belongs to.

        Raises
        ------
        ValueError
            If the model has not been successfully fitted, or if `metric` is not one of 'dtw', 'euclidean'.
        """
        if self.X_fit_ is None:
            raise ValueError("This TimeSeriesKMeans instance is not fitted yet. Call 'fit' before using 'predict'.")
        X_ = npy3d_time_series_dataset(X)
        # Distances are computed to the learned centroids with the fitting metric; fitted attributes
        # (`labels_`, `inertia_`) are deliberately left untouched.
        return self._distances(X_).argmin(axis=1)
39 changes: 20 additions & 19 deletions tslearn/docs/examples/ex_barycenter.ipynb

Large diffs are not rendered by default.

97 changes: 83 additions & 14 deletions tslearn/docs/examples/ex_clustering.ipynb

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions tslearn/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,10 @@ def cdist_dtw(dataset1, dataset2=None):
Parameters
----------
dataset1
dataset1 : array-like
A dataset of time series
dataset2
Another time series
dataset2 : array-like, default: None
Another dataset of time series. If `None`, self-similarity of `dataset1` is returned.
Returns
-------
Expand Down Expand Up @@ -139,7 +139,7 @@ def lr_dtw(s1, s2, gamma=0.):
A time series
s2
Another time series
gamma : float
gamma : float (default: 0.)
Regularization parameter
Returns
Expand Down Expand Up @@ -170,7 +170,7 @@ def lr_dtw_path(s1, s2, gamma=0.):
A time series
s2
Another time series
gamma : float
gamma : float (default: 0.)
Regularization parameter
Returns
Expand Down Expand Up @@ -204,7 +204,7 @@ def gak(s1, s2, sigma=1.):
A time series
s2
Another time series
sigma : float
sigma : float (default 1.)
Bandwidth of the internal gaussian kernel used for GAK
Returns
Expand Down Expand Up @@ -241,7 +241,7 @@ def cdist_gak(dataset1, dataset2=None, sigma=1.):
A dataset of time series
dataset2
Another time series
sigma : float
sigma : float (default 1.)
Bandwidth of the internal gaussian kernel used for GAK
Returns
Expand Down

0 comments on commit a92401c

Please sign in to comment.