Skip to content

Commit

Permalink
[MRG] Fast, low memory, single linkage implementation (#11514)
Browse files Browse the repository at this point in the history
* First cut at basic single linkage internals

* Refer to correct dist_metrics package

* Add csgraph sparse implementation for single linkage

* Add fast labelling/conversion from MST to single linkage tree; remove uneeded single_linkage.pyx file.

* Ensure existing tests cover single linkage

* Name cingle linkage labelling correctly.

* Iterating toward correct solution. Still have to get n_clusters, compute_full_tree=False working

* Get n_components correct.

* Update docstrings.

* Fix the parents array when we don't get the "full tree"

* Add single linkage to agglomerative clustering example.

* Add single linkage to digits agglomerative clustering example.

* Update documentation to reflect the addition of single linkage.

* Update documentation to reflect the addition of single linkage.

* Pep8 fix for class declaration in cython

* Fix heading in clustering docs

* Update the digits clustering text to reflect the new reality.

* Provide a more complete comparison of the different linkage methods, highlighting the relative strengths and weaknesses.

* We don't need connectivity here, and we can ignore issues with warnings for spectral clustering.

* Add an explicit test that single linkage successfully works on examples it should perform well on.

* Update docs with a more complete comparison on linkage methods (scale to be determined?)

* List formatting in example linkage comparison.

* Flake8 fixes.

* Flake8 fixes.

* More Flake8 fixes.

* Fix agglomerative plot example with correct subplot spec

* Explicitly test linkages (including single) produce results identical to scipy.cluster.hierarchical

* Fix comment on why we sort (consistency)

* Make dense single linkage faster

* Add docstring to new mst-linkage-core computations.

* Add a test that new single linkage code matches scipy

* Ensure we only attemtp this for metrics Jake implemented.

* Per amueller; it's a long paper, ref the figure.

* Clean up a few things.

* Too many blank lines for flake8

* Bad scipy slink input

* Flake8 fixes

* Clean up cython a little; fix typo/carryover

* Convert memoryview to numpy array on return

* Just convert to the correct dtype

* Update sklearn/cluster/_hierarchical.pyx

Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com>

* Update sklearn/cluster/_hierarchical.pyx

Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com>

* Update sklearn/cluster/_hierarchical.pyx

Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com>

* Update sklearn/cluster/tests/test_hierarchical.py

Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com>

* Fixes as per @NicolasHug suggestions.

* Update renaming of params in test_hierarchical

* Relative import?

* Ah, it got renamed in master...

* A bad merge on my part.

* In principle this is in sklearn.neighbors now...

* No; not that way...

* Declare dim before use.

* Update sklearn/cluster/tests/test_hierarchical.py

Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com>

* Remaining fixes per Nicolas Hug.

* Update sklearn/cluster/tests/test_hierarchical.py

Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com>

* Fix flake8 issues.

* Switch from stable to mergesort per jnotham

* Update sklearn/cluster/_hierarchical.py

Co-Authored-By: Nicolas Hug <contact@nicolas-hug.com>

* Skip checks that are already validated.

* Update docstring per Gael's suggestion

* Add a benchmark script for agglomerative clustering

* Fix some flake8 issues

* No flake8 on the one line

* Update parameters and output for benchmark hierarchical

* Switch to 2D plotting for hierarchical benchmark

* Wrong colormap name

* Formatting fpr bench hierarchical

* Add an item to WhatsNew
  • Loading branch information
lmcinnes authored and jnothman committed Nov 28, 2019
1 parent 46a6b8b commit 0f26e10
Show file tree
Hide file tree
Showing 6 changed files with 229 additions and 3 deletions.
85 changes: 85 additions & 0 deletions benchmarks/bench_plot_hierarchical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from collections import defaultdict
from time import time

import numpy as np
from numpy import random as nr

from sklearn.cluster import AgglomerativeClustering


def compute_bench(samples_range, features_range):

it = 0
results = defaultdict(lambda: [])

max_it = len(samples_range) * len(features_range)
for n_samples in samples_range:
for n_features in features_range:
it += 1
print('==============================')
print('Iteration %03d of %03d' % (it, max_it))
print('n_samples %05d; n_features %02d' % (n_samples, n_features))
print('==============================')
print()
data = nr.randint(-50, 51, (n_samples, n_features))

for linkage in ("single", "average", "complete", "ward"):
print(linkage.capitalize())
tstart = time()
AgglomerativeClustering(
linkage=linkage,
n_clusters=10
).fit(data)

delta = time() - tstart
print("Speed: %0.3fs" % delta)
print()

results[linkage].append(delta)

return results


if __name__ == '__main__':
import matplotlib.pyplot as plt

samples_range = np.linspace(1000, 15000, 8).astype(np.int)
features_range = np.array([2, 10, 20, 50])

results = compute_bench(samples_range, features_range)

max_time = max([max(i) for i in [t for (label, t) in results.items()]])

colors = plt.get_cmap('tab10')(np.linspace(0, 1, 10))[:4]
lines = {linkage: None for linkage in results.keys()}
fig, axs = plt.subplots(2, 2, sharex=True, sharey=True)
fig.suptitle(
'Scikit-learn agglomerative clustering benchmark results',
fontsize=16
)
for c, (label, timings) in zip(colors,
sorted(results.items())):
timing_by_samples = np.asarray(timings).reshape(
samples_range.shape[0],
features_range.shape[0]
)

for n in range(timing_by_samples.shape[1]):
ax = axs.flatten()[n]
lines[label], = ax.plot(
samples_range,
timing_by_samples[:, n],
color=c,
label=label
)
ax.set_title('n_features = %d' % features_range[n])
if n >= 2:
ax.set_xlabel('n_samples')
if n % 2 == 0:
ax.set_ylabel('time (s)')

fig.subplots_adjust(right=0.8)
fig.legend([lines[link] for link in sorted(results.keys())],
sorted(results.keys()), loc="center right", fontsize=8)

plt.show()
1 change: 1 addition & 0 deletions doc/modules/clustering.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1687,6 +1687,7 @@ Drawbacks
Calinski-Harabasz Index
-----------------------


If the ground truth labels are not known, the Calinski-Harabasz index
(:func:`sklearn.metrics.calinski_harabasz_score`) - also known as the Variance
Ratio Criterion - can be used to evaluate the model, where a higher
Expand Down
4 changes: 4 additions & 0 deletions doc/whats_new/v0.22.rst
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,10 @@ Changelog
`affinity='cosine'` and `X` has samples that are all-zeros. :pr:`7943` by
:user:`mthorrell`.

- |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and more
more memory efficient implementation of single linkage clustering.
:pr:`11514` by :user:`Leland McInnes <lmcinnes>`.

:mod:`sklearn.compose`
......................

Expand Down
24 changes: 22 additions & 2 deletions sklearn/cluster/_hierarchical.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from ..metrics.pairwise import paired_distances, pairwise_distances
from ..utils import check_array
from ..utils.validation import check_memory
from ..neighbors import DistanceMetric
from ..neighbors._dist_metrics import METRIC_MAPPING

from . import _hierarchical_fast as _hierarchical
from ._feature_agglomeration import AgglomerationTransform
Expand Down Expand Up @@ -107,7 +109,7 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters,
mst_array = np.vstack([mst.row, mst.col, mst.data]).T

# Sort edges of the min_spanning_tree by weight
mst_array = mst_array[np.argsort(mst_array.T[2]), :]
mst_array = mst_array[np.argsort(mst_array.T[2], kind='mergesort'), :]

# Convert edge list into standard hierarchical clustering format
single_linkage_tree = _hierarchical._single_linkage_label(mst_array)
Expand Down Expand Up @@ -464,7 +466,25 @@ def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete',
X = affinity(X)
i, j = np.triu_indices(X.shape[0], k=1)
X = X[i, j]
out = hierarchy.linkage(X, method=linkage, metric=affinity)
if (linkage == 'single'
and affinity != 'precomputed'
and not callable(affinity)
and affinity in METRIC_MAPPING):

# We need the fast cythonized metric from neighbors
dist_metric = DistanceMetric.get_metric(affinity)

# The Cython routines used require contiguous arrays
X = np.ascontiguousarray(X, dtype=np.double)

mst = _hierarchical.mst_linkage_core(X, dist_metric)
# Sort edges of the min_spanning_tree by weight
mst = mst[np.argsort(mst.T[2], kind='mergesort'), :]

# Convert edge list into standard hierarchical clustering format
out = _hierarchical.single_linkage_label(mst)
else:
out = hierarchy.linkage(X, method=linkage, metric=affinity)
children_ = out[:, :2].astype(np.int, copy=False)

if return_distance:
Expand Down
89 changes: 89 additions & 0 deletions sklearn/cluster/_hierarchical_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ ctypedef np.int8_t INT8

np.import_array()

from ..neighbors._dist_metrics cimport DistanceMetric
from ..utils._fast_dict cimport IntFloatDict

# C++
Expand All @@ -26,6 +27,8 @@ ctypedef np.float64_t DTYPE_t
ITYPE = np.intp
ctypedef np.intp_t ITYPE_t

from numpy.math cimport INFINITY

###############################################################################
# Utilities for computing the ward momentum

Expand Down Expand Up @@ -446,3 +449,89 @@ def single_linkage_label(L):
raise ValueError("Input MST array must be sorted by weight")

return _single_linkage_label(L)


# Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378
@cython.boundscheck(False)
@cython.nonecheck(False)
def mst_linkage_core(
DTYPE_t [:, ::1] raw_data,
DistanceMetric dist_metric):
"""
Compute the necessary elements of a minimum spanning
tree for computation of single linkage clustering. This
represents the MST-LINKAGE-CORE algorithm (Figure 6) from
*Modern hierarchical, agglomerative clustering algorithms*
by Daniel Mullner (https://arxiv.org/abs/1109.2378).
In contrast to the scipy implementation is never computes
a full distance matrix, generating distances only as they
are needed and releasing them when no longer needed.
Parameters
----------
raw_data: array of shape (n_samples, n_features)
The array of feature data to be clustered. Must be C-aligned
dist_metric: DistanceMetric
A DistanceMetric object conforming to the API from
``sklearn.neighbors._dist_metrics.pxd`` that will be
used to compute distances.
Returns
-------
mst_core_data: array of shape (n_samples, 3)
An array providing information from which one
can either compute an MST, or the linkage hierarchy
very efficiently. See https://arxiv.org/abs/1109.2378
algorithm MST-LINKAGE-CORE for more details.
"""
cdef:
ITYPE_t n_samples = raw_data.shape[0]
np.int8_t[:] in_tree = np.zeros(n_samples, dtype=np.int8)
DTYPE_t[:, ::1] result = np.zeros((n_samples - 1, 3))

np.ndarray label_filter

ITYPE_t current_node = 0
ITYPE_t new_node
ITYPE_t i
ITYPE_t j
ITYPE_t num_features = raw_data.shape[1]

DTYPE_t right_value
DTYPE_t left_value
DTYPE_t new_distance

DTYPE_t[:] current_distances = np.full(n_samples, INFINITY)

for i in range(n_samples - 1):

in_tree[current_node] = 1

new_distance = INFINITY
new_node = 0

for j in range(n_samples):
if in_tree[j]:
continue

right_value = current_distances[j]
left_value = dist_metric.dist(&raw_data[current_node, 0],
&raw_data[j, 0],
num_features)

if left_value < right_value:
current_distances[j] = left_value

if current_distances[j] < new_distance:
new_distance = current_distances[j]
new_node = j

result[i, 0] = current_node
result[i, 1] = new_node
result[i, 2] = new_distance
current_node = new_node

return np.array(result)

29 changes: 28 additions & 1 deletion sklearn/cluster/tests/test_hierarchical.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ def assess_same_labelling(cut1, cut2):
assert (co_clust[0] == co_clust[1]).all()


def test_scikit_vs_scipy():
def test_sparse_scikit_vs_scipy():
# Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
n, p, k = 10, 5, 3
rng = np.random.RandomState(0)
Expand Down Expand Up @@ -314,6 +314,33 @@ def test_scikit_vs_scipy():
_hc_cut(n_leaves + 1, children, n_leaves)


# Make sure our custom mst_linkage_core gives
# the same results as scipy's builtin
@pytest.mark.parametrize('seed', range(5))
def test_vector_scikit_single_vs_scipy_single(seed):
n_samples, n_features, n_clusters = 10, 5, 3
rng = np.random.RandomState(seed)
X = .1 * rng.normal(size=(n_samples, n_features))
X -= 4. * np.arange(n_samples)[:, np.newaxis]
X -= X.mean(axis=1)[:, np.newaxis]

out = hierarchy.linkage(X, method='single')
children_scipy = out[:, :2].astype(np.int)

children, _, n_leaves, _ = _TREE_BUILDERS['single'](X)

# Sort the order of child nodes per row for consistency
children.sort(axis=1)
assert_array_equal(children, children_scipy,
'linkage tree differs'
' from scipy impl for'
' single linkage.')

cut = _hc_cut(n_clusters, children, n_leaves)
cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves)
assess_same_labelling(cut, cut_scipy)


def test_identical_points():
# Ensure identical points are handled correctly when using mst with
# a sparse connectivity matrix
Expand Down

0 comments on commit 0f26e10

Please sign in to comment.