More clustering algos implemented

tslearn-team · Jun 12, 2017 · a92401c · a92401c
1 parent ce793fb
commit a92401c
Show file tree

Hide file tree

Showing 6 changed files with 286 additions and 49 deletions.
diff --git a/README.md b/README.md
@@ -37,14 +37,15 @@ Also, for the whole package to run properly, its base directory should be append
 * A `neighbors` module includes nearest neighbor algorithms to be used with time series
 * A `clustering` module includes the following time series clustering algorithms:
   * Standard Euclidean k-means (based on `sklearn.cluster.KMeans` with adequate array reshaping done for you)
+  * DBA k-means from Petitjean _et al._
   * Global Alignment kernel k-means
 
 # TODO list
 
 * Add soft-DTW to the proposed metrics
 * Implement Learning Shapelets from Grabocka et al. (Conv+L2, + unsupervised)
 * Add local feature extractors (`TransformerMixin`)
-* Add k-means DBA by Petitjean _et al._ (barycenter already coded) and soft-DTW k-means by Cuturi and Blondel
+* Add soft-DTW k-means by Cuturi and Blondel
 * Add metric learning for time series (Garreau _et al._)
 * Add automatic retrieval of UCR/UEA datasets and 1M remote sensing time series
 * Add LB_Keogh for nearest neighbor search

diff --git a/tslearn/barycenters.py b/tslearn/barycenters.py
@@ -36,9 +36,9 @@ class DTWBarycenterAveraging:
     barycenter_size : int or None, default None
         Size of the barycenter to generate. If None, the size of the barycenter is that of the data provided at fit
         time.
-    tol : float, default 1e-5
+    tol : float (default: 1e-5)
         Tolerance to use for early stopping: if the decrease in cost is lower than this value, the EM procedure stops.
-    verbose : boolean, default False
+    verbose : boolean (default: False)
         Whether to print information about the cost at each iteration or not.
     
     References
@@ -72,7 +72,7 @@ def fit(self, X):
         cost_prev, cost = numpy.inf, numpy.inf
         for it in range(self.n_iter):
             assign = self._petitjean_assignment(X_, barycenter)
-            barycenter = self._petitjean_update_b(X_, assign)
+            barycenter = self._petitjean_update_barycenter(X_, assign)
             cost = self._petitjean_cost(X_, barycenter, assign)
             if self.verbose:
                 print("[DBA] epoch %d, cost: %.3f" % (it + 1, cost))
@@ -101,7 +101,7 @@ def _petitjean_assignment(self, X, barycenter):
                 assign[1][pair[1]].append(pair[0])
         return assign
 
-    def _petitjean_update_b(self, X, assign):
+    def _petitjean_update_barycenter(self, X, assign):
         barycenter = numpy.zeros((self.barycenter_size, X.shape[-1]))
         for t in range(self.barycenter_size):
             barycenter[t] = X[assign[0][t], assign[1][t]].mean(axis=0)

diff --git a/tslearn/clustering.py b/tslearn/clustering.py
@@ -1,18 +1,26 @@
 import numpy
 from sklearn.base import BaseEstimator, ClusterMixin
 from sklearn.cluster import KMeans
+from sklearn.cluster.k_means_ import _k_init
 from sklearn.utils import check_random_state
+from sklearn.utils.extmath import row_norms
 from sklearn.utils.validation import check_is_fitted
 from sklearn.metrics import euclidean_distances
+from scipy.spatial.distance import cdist
 
-from tslearn.metrics import cdist_gak
+from tslearn.metrics import cdist_gak, cdist_dtw
+from tslearn.barycenters import EuclideanBarycenter, DTWBarycenterAveraging
 from tslearn.utils import npy3d_time_series_dataset
 
 
 __author__ = 'Romain Tavenard romain.tavenard[at]univ-rennes2.fr'
-# Derived from https://gist.github.com/mblondel/6230787 by Mathieu Blondel
+# Kernel k-means is derived from https://gist.github.com/mblondel/6230787 by Mathieu Blondel
 # License: BSD 3 clause
 
+class EmptyClusterError(Exception):
+    def __init__(self, message=""):
+        super(EmptyClusterError, self).__init__(message)
+
 
 class GlobalAlignmentKernelKMeans(BaseEstimator, ClusterMixin):
     """Global Alignment Kernel K-means.
@@ -84,6 +92,9 @@ def _fit_one_init(self, K, rs):
             dist.fill(0)
             self._compute_dist(K, dist)
             self.labels_ = dist.argmin(axis=1)
+            for k in range(self.n_clusters):
+                if numpy.sum(self.labels_ == k) == 0:
+                    raise EmptyClusterError
             self.inertia_ = self._compute_inertia(dist)
             if self.verbose:
                 print("%.3f" % self.inertia_, end=" --> ")
@@ -125,7 +136,7 @@ def fit(self, X, y=None, sample_weight=None):
                     last_correct_labels = self.labels_
                     min_inertia = self.inertia_
                 n_successful += 1
-            except ValueError:
+            except EmptyClusterError:
                 if self.verbose:
                     print("Resumed because of empty cluster")
         if n_successful > 0:
@@ -173,7 +184,7 @@ def predict(self, X):
         return dist.argmin(axis=1)
 
 
-class TimeSeriesKMeans(KMeans):
+class TimeSeriesKMeansOld(KMeans):
     """Standard Euclidean K-Means clustering for time series data.
 
     Parameters
@@ -348,3 +359,158 @@ def predict(self, X):
         n_ts, sz, d = X.shape
         dists = euclidean_distances(X.reshape((n_ts, -1)), self.cluster_centers_.reshape((self.n_clusters, -1)))
         return numpy.argmin(dists, axis=1)
+
+
+class TimeSeriesKMeans(BaseEstimator, ClusterMixin):
+    """K-means clustering for time-series data.
+
+    Parameters
+    ----------
+    n_clusters : int (default: 3)
+        Number of clusters to form.
+    max_iter : int (default: 50)
+        Maximum number of iterations of the k-means algorithm for a single run.
+    tol : float (default: 1e-6)
+        Inertia variation threshold. If at some point, inertia varies less than this threshold between two consecutive
+        iterations, the model is considered to have converged and the algorithm stops.
+    n_init : int (default: 1)
+        Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the
+        best output of n_init consecutive runs in terms of inertia.
+    metric : {"euclidean", "dtw"}, default: "euclidean"
+        Metric to be used for both cluster assignment and barycenter computation. If "dtw", DBA is used for barycenter
+        computation.
+    n_iter_dba : int (default: 100)
+        Number of iterations for the DBA barycenter computation process. Only used if `metric="dtw"`.
+    verbose : bool (default: True)
+        Whether or not to print information about the inertia while learning the model.
+    random_state : integer or numpy.RandomState, optional
+        Generator used to initialize the centers. If an integer is given, it fixes the seed. Defaults to the global
+        numpy random number generator.
+
+    Attributes
+    ----------
+    labels_ : numpy.ndarray
+        Labels of each point.
+    cluster_centers_ : numpy.ndarray
+        Cluster centers.
+    inertia_ : float
+        Sum of distances of samples to their closest cluster center.
+    """
+
+    def __init__(self, n_clusters=3, max_iter=50, tol=1e-6, n_init=1, metric="euclidean", n_iter_dba=100, verbose=True,
+                 random_state=None):
+        self.n_clusters = n_clusters
+        self.max_iter = max_iter
+        self.tol = tol
+        self.random_state = random_state
+        self.metric = metric
+        self.n_init = n_init
+        self.verbose = verbose
+
+        self.labels_ = None
+        self.inertia_ = numpy.inf
+        self.cluster_centers_ = None
+        self.X_fit_ = None
+
+        if self.metric == "dtw":
+            self.dba_ = DTWBarycenterAveraging(n_iter=n_iter_dba, barycenter_size=None, verbose=False)
+
+    def _fit_one_init(self, X, x_squared_norms, rs):
+        n_samples, sz, d = X.shape
+        self.cluster_centers_ = _k_init(X.reshape((n_samples, -1)),
+                                        self.n_clusters, x_squared_norms, rs).reshape((-1, sz, d))
+        old_inertia = numpy.inf
+
+        for it in range(self.max_iter):
+            self._assign(X)
+            self._update_centroids(X)
+            if self.verbose:
+                print("%.3f" % self.inertia_, end=" --> ")
+
+            if numpy.abs(old_inertia - self.inertia_) < self.tol:
+                break
+            old_inertia = self.inertia_
+        if self.verbose:
+            print("")
+
+        return self
+
+    def _assign(self, X):
+        if self.metric == "euclidean":
+            dists = cdist(X.reshape((X.shape[0], -1)), self.cluster_centers_.reshape((self.n_clusters, -1)),
+                          metric="euclidean")
+        elif self.metric == "dtw":
+            dists = cdist_dtw(X, self.cluster_centers_)
+        else:
+            raise ValueError("Incorrect metric: %s (should be one of 'dtw', 'euclidean')" % self.metric)
+        self.labels_ = dists.argmin(axis=1)
+        for k in range(self.n_clusters):
+            if numpy.sum(self.labels_ == k) == 0:
+                raise EmptyClusterError
+        self.inertia_ = numpy.sum(dists[numpy.arange(X.shape[0]), self.labels_] ** 2) / X.shape[0]
+
+    def _update_centroids(self, X):
+        for k in range(self.n_clusters):
+            if self.metric == "euclidean":
+                self.cluster_centers_[k] = EuclideanBarycenter().fit(X[self.labels_ == k])
+            elif self.metric == "dtw":
+                self.cluster_centers_[k] = self.dba_.fit(X[self.labels_ == k])
+            else:
+                raise ValueError("Incorrect metric: %s (should be one of 'dtw', 'euclidean')" % self.metric)
+
+    def fit(self, X, y=None, sample_weight=None):
+        """Compute k-means clustering.
+
+        Parameters
+        ----------
+        X : array-like, shape=(n_ts, sz, d)
+            Time series dataset.
+        """
+        n_successful = 0
+
+        X_ = npy3d_time_series_dataset(X)
+        rs = check_random_state(self.random_state)
+        x_squared_norms = cdist(X_.reshape((X_.shape[0], -1)), numpy.zeros((1, X_.shape[1] * X_.shape[2])),
+                                metric="sqeuclidean").reshape((1, -1))
+
+        last_correct_centroids = None
+        min_inertia = numpy.inf
+        for trial in range(self.n_init):
+            try:
+                if self.verbose:
+                    print("Init %d" % (trial + 1))
+                self._fit_one_init(X_, x_squared_norms, rs)
+                if self.inertia_ < min_inertia:
+                    last_correct_centroids = self.cluster_centers_.copy()
+                    min_inertia = self.inertia_
+                n_successful += 1
+            except EmptyClusterError:
+                if self.verbose:
+                    print("Resumed because of empty cluster")
+        if n_successful > 0:
+            self.X_fit_ = X_
+            self.cluster_centers_ = last_correct_centroids
+            self._assign(X_)
+            self.inertia_ = min_inertia
+        else:
+            self.X_fit_ = None
+        return self
+
+    def predict(self, X):
+        """Predict the closest cluster each time series in X belongs to.
+
+        Parameters
+        ----------
+        X : array-like, shape=(n_ts, sz, d)
+            Time series dataset to predict.
+
+        Returns
+        -------
+        labels : array, shape=(n_ts, )
+            Index of the cluster each sample belongs to.
+        """
+        K = self._get_kernel(X, self.X_fit_)
+        n_samples = X.shape[0]
+        dist = numpy.zeros((n_samples, self.n_clusters))
+        self._compute_dist(K, dist)
+        return dist.argmin(axis=1)
diff --git a/tslearn/docs/examples/ex_barycenter.ipynb b/tslearn/docs/examples/ex_barycenter.ipynb
diff --git a/tslearn/docs/examples/ex_clustering.ipynb b/tslearn/docs/examples/ex_clustering.ipynb
diff --git a/tslearn/metrics.py b/tslearn/metrics.py
@@ -94,10 +94,10 @@ def cdist_dtw(dataset1, dataset2=None):
 
     Parameters
     ----------
-    dataset1
+    dataset1 : array-like
         A dataset of time series
-    dataset2
-        Another time series
+    dataset2 : array-like, default: None
+        Another dataset of time series. If `None`, self-similarity of `dataset1` is returned.
 
     Returns
     -------
@@ -139,7 +139,7 @@ def lr_dtw(s1, s2, gamma=0.):
         A time series
     s2
         Another time series
-    gamma : float
+    gamma : float (default: 0.)
         Regularization parameter
 
     Returns
@@ -170,7 +170,7 @@ def lr_dtw_path(s1, s2, gamma=0.):
         A time series
     s2
         Another time series
-    gamma : float
+    gamma : float (default: 0.)
         Regularization parameter
 
     Returns
@@ -204,7 +204,7 @@ def gak(s1, s2, sigma=1.):
         A time series
     s2
         Another time series
-    sigma : float
+    sigma : float (default: 1.)
         Bandwidth of the internal gaussian kernel used for GAK
 
     Returns
@@ -241,7 +241,7 @@ def cdist_gak(dataset1, dataset2=None, sigma=1.):
         A dataset of time series
     dataset2
         Another time series
-    sigma : float
+    sigma : float (default: 1.)
         Bandwidth of the internal gaussian kernel used for GAK
 
     Returns