[MRG] Fast, low memory, single linkage implementation (#11514)

* First cut at basic single linkage internals * Refer to correct dist_metrics package * Add csgraph sparse implementation for single linkage * Add fast labelling/conversion from MST to single linkage tree; remove uneeded single_linkage.pyx file. * Ensure existing tests cover single linkage * Name cingle linkage labelling correctly. * Iterating toward correct solution. Still have to get n_clusters, compute_full_tree=False working * Get n_components correct. * Update docstrings. * Fix the parents array when we don't get the "full tree" * Add single linkage to agglomerative clustering example. * Add single linkage to digits agglomerative clustering example. * Update documentation to reflect the addition of single linkage. * Update documentation to reflect the addition of single linkage. * Pep8 fix for class declaration in cython * Fix heading in clustering docs * Update the digits clustering text to reflect the new reality. * Provide a more complete comparison of the different linkage methods, highlighting the relative strengths and weaknesses. * We don't need connectivity here, and we can ignore issues with warnings for spectral clustering. * Add an explicit test that single linkage successfully works on examples it should perform well on. * Update docs with a more complete comparison on linkage methods (scale to be determined?) * List formatting in example linkage comparison. * Flake8 fixes. * Flake8 fixes. * More Flake8 fixes. * Fix agglomerative plot example with correct subplot spec * Explicitly test linkages (including single) produce results identical to scipy.cluster.hierarchical * Fix comment on why we sort (consistency) * Make dense single linkage faster * Add docstring to new mst-linkage-core computations. * Add a test that new single linkage code matches scipy * Ensure we only attemtp this for metrics Jake implemented. * Per amueller; it's a long paper, ref the figure. * Clean up a few things. * Too many blank lines for flake8 * Bad scipy slink input * Flake8 fixes * Clean up cython a little; fix typo/carryover * Convert memoryview to numpy array on return * Just convert to the correct dtype * Update sklearn/cluster/_hierarchical.pyx Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com> * Update sklearn/cluster/_hierarchical.pyx Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com> * Update sklearn/cluster/_hierarchical.pyx Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com> * Update sklearn/cluster/tests/test_hierarchical.py Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com> * Fixes as per @NicolasHug suggestions. * Update renaming of params in test_hierarchical * Relative import? * Ah, it got renamed in master... * A bad merge on my part. * In principle this is in sklearn.neighbors now... * No; not that way... * Declare dim before use. * Update sklearn/cluster/tests/test_hierarchical.py Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com> * Remaining fixes per Nicolas Hug. * Update sklearn/cluster/tests/test_hierarchical.py Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com> * Fix flake8 issues. * Switch from stable to mergesort per jnotham * Update sklearn/cluster/_hierarchical.py Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com> * Skip checks that are already validated. * Update docstring per Gael's suggestion * Add a benchmark script for agglomerative clustering * Fix some flake8 issues * No flake8 on the one line * Update parameters and output for benchmark hierarchical * Switch to 2D plotting for hierarchical benchmark * Wrong colormap name * Formatting fpr bench hierarchical * Add an item to WhatsNew
scikit-learn · Nov 28, 2019 · 0f26e10 · 0f26e10
1 parent 46a6b8b
commit 0f26e10
Show file tree

Hide file tree

Showing 6 changed files with 229 additions and 3 deletions.
diff --git a/benchmarks/bench_plot_hierarchical.py b/benchmarks/bench_plot_hierarchical.py
@@ -0,0 +1,85 @@
+from collections import defaultdict
+from time import time
+
+import numpy as np
+from numpy import random as nr
+
+from sklearn.cluster import AgglomerativeClustering
+
+
+def compute_bench(samples_range, features_range):
+
+    it = 0
+    results = defaultdict(lambda: [])
+
+    max_it = len(samples_range) * len(features_range)
+    for n_samples in samples_range:
+        for n_features in features_range:
+            it += 1
+            print('==============================')
+            print('Iteration %03d of %03d' % (it, max_it))
+            print('n_samples %05d; n_features %02d' % (n_samples, n_features))
+            print('==============================')
+            print()
+            data = nr.randint(-50, 51, (n_samples, n_features))
+
+            for linkage in ("single", "average", "complete", "ward"):
+                print(linkage.capitalize())
+                tstart = time()
+                AgglomerativeClustering(
+                    linkage=linkage,
+                    n_clusters=10
+                ).fit(data)
+
+                delta = time() - tstart
+                print("Speed: %0.3fs" % delta)
+                print()
+
+                results[linkage].append(delta)
+
+    return results
+
+
+if __name__ == '__main__':
+    import matplotlib.pyplot as plt
+
+    samples_range = np.linspace(1000, 15000, 8).astype(np.int)
+    features_range = np.array([2, 10, 20, 50])
+
+    results = compute_bench(samples_range, features_range)
+
+    max_time = max([max(i) for i in [t for (label, t) in results.items()]])
+
+    colors = plt.get_cmap('tab10')(np.linspace(0, 1, 10))[:4]
+    lines = {linkage: None for linkage in results.keys()}
+    fig, axs = plt.subplots(2, 2, sharex=True, sharey=True)
+    fig.suptitle(
+        'Scikit-learn agglomerative clustering benchmark results',
+        fontsize=16
+    )
+    for c, (label, timings) in zip(colors,
+                                   sorted(results.items())):
+        timing_by_samples = np.asarray(timings).reshape(
+            samples_range.shape[0],
+            features_range.shape[0]
+        )
+
+        for n in range(timing_by_samples.shape[1]):
+            ax = axs.flatten()[n]
+            lines[label], = ax.plot(
+                samples_range,
+                timing_by_samples[:, n],
+                color=c,
+                label=label
+            )
+            ax.set_title('n_features = %d' % features_range[n])
+            if n >= 2:
+                ax.set_xlabel('n_samples')
+            if n % 2 == 0:
+                ax.set_ylabel('time (s)')
+
+    fig.subplots_adjust(right=0.8)
+    fig.legend([lines[link] for link in sorted(results.keys())],
+               sorted(results.keys()), loc="center right", fontsize=8)
+
+    plt.show()
diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
@@ -1687,6 +1687,7 @@ Drawbacks
 Calinski-Harabasz Index
 -----------------------
 
+
 If the ground truth labels are not known, the Calinski-Harabasz index
 (:func:`sklearn.metrics.calinski_harabasz_score`) - also known as the Variance 
 Ratio Criterion - can be used to evaluate the model, where a higher 

diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
@@ -168,6 +168,10 @@ Changelog
   `affinity='cosine'` and `X` has samples that are all-zeros. :pr:`7943` by
   :user:`mthorrell`.
 
+- |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and more
+  more memory efficient implementation of single linkage clustering.
+  :pr:`11514` by :user:`Leland McInnes <lmcinnes>`.
+
 :mod:`sklearn.compose`
 ......................
 

diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py
@@ -18,6 +18,8 @@
 from ..metrics.pairwise import paired_distances, pairwise_distances
 from ..utils import check_array
 from ..utils.validation import check_memory
+from ..neighbors import DistanceMetric
+from ..neighbors._dist_metrics import METRIC_MAPPING
 
 from . import _hierarchical_fast as _hierarchical
 from ._feature_agglomeration import AgglomerationTransform
@@ -107,7 +109,7 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters,
     mst_array = np.vstack([mst.row, mst.col, mst.data]).T
 
     # Sort edges of the min_spanning_tree by weight
-    mst_array = mst_array[np.argsort(mst_array.T[2]), :]
+    mst_array = mst_array[np.argsort(mst_array.T[2], kind='mergesort'), :]
 
     # Convert edge list into standard hierarchical clustering format
     single_linkage_tree = _hierarchical._single_linkage_label(mst_array)
@@ -464,7 +466,25 @@ def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete',
             X = affinity(X)
             i, j = np.triu_indices(X.shape[0], k=1)
             X = X[i, j]
-        out = hierarchy.linkage(X, method=linkage, metric=affinity)
+        if (linkage == 'single'
+                and affinity != 'precomputed'
+                and not callable(affinity)
+                and affinity in METRIC_MAPPING):
+
+            # We need the fast cythonized metric from neighbors
+            dist_metric = DistanceMetric.get_metric(affinity)
+
+            # The Cython routines used require contiguous arrays
+            X = np.ascontiguousarray(X, dtype=np.double)
+
+            mst = _hierarchical.mst_linkage_core(X, dist_metric)
+            # Sort edges of the min_spanning_tree by weight
+            mst = mst[np.argsort(mst.T[2], kind='mergesort'), :]
+
+            # Convert edge list into standard hierarchical clustering format
+            out = _hierarchical.single_linkage_label(mst)
+        else:
+            out = hierarchy.linkage(X, method=linkage, metric=affinity)
         children_ = out[:, :2].astype(np.int, copy=False)
 
         if return_distance:

diff --git a/sklearn/cluster/_hierarchical_fast.pyx b/sklearn/cluster/_hierarchical_fast.pyx
@@ -13,6 +13,7 @@ ctypedef np.int8_t INT8
 
 np.import_array()
 
+from ..neighbors._dist_metrics cimport DistanceMetric
 from ..utils._fast_dict cimport IntFloatDict
 
 # C++
@@ -26,6 +27,8 @@ ctypedef np.float64_t DTYPE_t
 ITYPE = np.intp
 ctypedef np.intp_t ITYPE_t
 
+from numpy.math cimport INFINITY
+
 ###############################################################################
 # Utilities for computing the ward momentum
 
@@ -446,3 +449,89 @@ def single_linkage_label(L):
         raise ValueError("Input MST array must be sorted by weight")
 
     return _single_linkage_label(L)
+
+
+# Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378
+@cython.boundscheck(False)
+@cython.nonecheck(False)
+def mst_linkage_core(
+        DTYPE_t [:, ::1] raw_data,
+        DistanceMetric dist_metric):
+    """
+    Compute the necessary elements of a minimum spanning
+    tree for computation of single linkage clustering. This
+    represents the MST-LINKAGE-CORE algorithm (Figure 6) from
+    *Modern hierarchical, agglomerative clustering algorithms*
+    by Daniel Mullner (https://arxiv.org/abs/1109.2378).
+
+    In contrast to the scipy implementation is never computes
+    a full distance matrix, generating distances only as they
+    are needed and releasing them when no longer needed.
+
+    Parameters
+    ----------
+    raw_data: array of shape (n_samples, n_features)
+        The array of feature data to be clustered. Must be C-aligned
+
+    dist_metric: DistanceMetric
+        A DistanceMetric object conforming to the API from
+        ``sklearn.neighbors._dist_metrics.pxd`` that will be
+        used to compute distances.
+
+    Returns
+    -------
+    mst_core_data: array of shape (n_samples, 3)
+        An array providing information from which one
+        can either compute an MST, or the linkage hierarchy
+        very efficiently. See https://arxiv.org/abs/1109.2378
+        algorithm MST-LINKAGE-CORE for more details.
+    """
+    cdef:
+        ITYPE_t n_samples = raw_data.shape[0]
+        np.int8_t[:] in_tree = np.zeros(n_samples, dtype=np.int8)
+        DTYPE_t[:, ::1] result = np.zeros((n_samples - 1, 3))
+
+        np.ndarray label_filter
+
+        ITYPE_t current_node = 0
+        ITYPE_t new_node
+        ITYPE_t i
+        ITYPE_t j
+        ITYPE_t num_features = raw_data.shape[1]
+
+        DTYPE_t right_value
+        DTYPE_t left_value
+        DTYPE_t new_distance
+
+        DTYPE_t[:] current_distances = np.full(n_samples, INFINITY)
+
+    for i in range(n_samples - 1):
+
+        in_tree[current_node] = 1
+
+        new_distance = INFINITY
+        new_node = 0
+
+        for j in range(n_samples):
+            if in_tree[j]:
+                continue
+
+            right_value = current_distances[j]
+            left_value = dist_metric.dist(&raw_data[current_node, 0],
+                                          &raw_data[j, 0],
+                                          num_features)
+
+            if left_value < right_value:
+                current_distances[j] = left_value
+
+            if current_distances[j] < new_distance:
+                new_distance = current_distances[j]
+                new_node = j
+
+        result[i, 0] = current_node
+        result[i, 1] = new_node
+        result[i, 2] = new_distance
+        current_node = new_node
+
+    return np.array(result)
+
diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py
@@ -280,7 +280,7 @@ def assess_same_labelling(cut1, cut2):
     assert (co_clust[0] == co_clust[1]).all()
 
 
-def test_scikit_vs_scipy():
+def test_sparse_scikit_vs_scipy():
     # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
     n, p, k = 10, 5, 3
     rng = np.random.RandomState(0)
@@ -314,6 +314,33 @@ def test_scikit_vs_scipy():
         _hc_cut(n_leaves + 1, children, n_leaves)
 
 
+# Make sure our custom mst_linkage_core gives
+# the same results as scipy's builtin
+@pytest.mark.parametrize('seed', range(5))
+def test_vector_scikit_single_vs_scipy_single(seed):
+    n_samples, n_features, n_clusters = 10, 5, 3
+    rng = np.random.RandomState(seed)
+    X = .1 * rng.normal(size=(n_samples, n_features))
+    X -= 4. * np.arange(n_samples)[:, np.newaxis]
+    X -= X.mean(axis=1)[:, np.newaxis]
+
+    out = hierarchy.linkage(X, method='single')
+    children_scipy = out[:, :2].astype(np.int)
+
+    children, _, n_leaves, _ = _TREE_BUILDERS['single'](X)
+
+    # Sort the order of child nodes per row for consistency
+    children.sort(axis=1)
+    assert_array_equal(children, children_scipy,
+                       'linkage tree differs'
+                       ' from scipy impl for'
+                       ' single linkage.')
+
+    cut = _hc_cut(n_clusters, children, n_leaves)
+    cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves)
+    assess_same_labelling(cut, cut_scipy)
+
+
 def test_identical_points():
     # Ensure identical points are handled correctly when using mst with
     # a sparse connectivity matrix