[MRG] Fix diagonal in DBSCAN with precomputed sparse neighbors graph (#…
TomDLT authored and jnothman committed Oct 15, 2018
1 parent 16dba4e commit 4bd468a
Showing 3 changed files with 35 additions and 12 deletions.
12 changes: 8 additions & 4 deletions doc/whats_new/v0.20.rst
@@ -23,6 +23,10 @@ Changelog
   ``n_jobs > 1``.
   :issue:`12159` by :user:`Olivier Grisel <ogrisel>`.
 
+- |Fix| Fixed a bug in :class:`cluster.DBSCAN` with a precomputed sparse
+  neighbors graph, which would add explicit zeros on the diagonal even when
+  they were already present. :issue:`12105` by `Tom Dupre la Tour`_.
+
 :mod:`sklearn.ensemble`
 .......................

@@ -45,7 +49,7 @@ Changelog
 - |Fix| force the parallelism backend to :code:`threading` for
   :class:`neighbors.KDTree` and :class:`neighbors.BallTree` in Python 2.7 to
   avoid pickling errors caused by the serialization of their methods.
-  :issue:`12171` by :user:`Thomas Moreau <tomMoral>`
+  :issue:`12171` by :user:`Thomas Moreau <tomMoral>`.
 
 .. _changes_0_20:

@@ -663,7 +667,7 @@ Support for Python 3.3 has been officially dropped.
 
 - |Feature| :func:`metrics.classification_report` now reports all applicable averages on
   the given data, including micro, macro and weighted average as well as samples
-  average for multilabel data. :issue:`11679` by :user:`Alexander Pacha <apacha>`.
+  average for multilabel data. :issue:`11679` by :user:`Alexander Pacha <apacha>`.
 
 - |Feature| :func:`metrics.average_precision_score` now supports binary
   ``y_true`` other than ``{0, 1}`` or ``{-1, 1}`` through ``pos_label``
@@ -917,7 +921,7 @@ Support for Python 3.3 has been officially dropped.
   keyword arguments on to the pipeline's last estimator, enabling the use of
   parameters such as ``return_std`` in a pipeline with caution.
   :issue:`9304` by :user:`Breno Freitas <brenolf>`.
-
+
 - |API| :class:`pipeline.FeatureUnion` now supports ``'drop'`` as a transformer
   to drop features. :issue:`11144` by :user:`thomasjpfan`.

@@ -1039,7 +1043,7 @@ Support for Python 3.3 has been officially dropped.
 - |API| The NaN marker for the missing values has been changed
   between the :class:`preprocessing.Imputer` and the
   :class:`impute.SimpleImputer`.
-  ``missing_values='NaN'`` should now be
+  ``missing_values='NaN'`` should now be
   ``missing_values=np.nan``. :issue:`11211` by
   :user:`Jeremie du Boisberranger <jeremiedbb>`.

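To illustrate the behavior the changelog entry describes, here is a standalone scipy-only sketch (the 3×3 distance values are made up for illustration). The key point of the fix is that `setdiag(D.diagonal())` is idempotent: it makes the diagonal explicit whether or not zeros are already stored, instead of unconditionally inserting duplicates:

```python
import warnings

import numpy as np
from scipy import sparse

# Two versions of the same 3x3 distance graph: one where the zero
# self-distances are already stored explicitly, one where they are absent.
row = np.array([0, 0, 1, 1, 2])
col = np.array([0, 1, 0, 1, 2])
data = np.array([0.0, 0.5, 0.5, 0.0, 0.0])
with_diag = sparse.csr_matrix((data, (row, col)), shape=(3, 3))

off_row = np.array([0, 1])
off_col = np.array([1, 0])
off_data = np.array([0.5, 0.5])
without_diag = sparse.csr_matrix((off_data, (off_row, off_col)), shape=(3, 3))

for D in (with_diag, without_diag):
    with warnings.catch_warnings():
        # setdiag on CSR emits SparseEfficiencyWarning when it must insert
        # new entries; the patched dbscan() wraps it in ignore_warnings().
        warnings.simplefilter('ignore')
        D.setdiag(D.diagonal())  # idempotent: no duplicate entries added
    D.sum_duplicates()

# Both graphs now store exactly one explicit zero per diagonal entry.
print(with_diag.nnz, without_diag.nnz)  # 5 5
```

Before the fix, the diagonal was inserted with `np.insert` regardless of what was stored, so a graph that already carried explicit diagonal zeros (e.g. one built with `radius_neighbors_graph(X, ...)`) ended up with each point listed twice in its own neighborhood.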
14 changes: 8 additions & 6 deletions sklearn/cluster/dbscan_.py
@@ -14,6 +14,7 @@
 
 from ..base import BaseEstimator, ClusterMixin
 from ..utils import check_array, check_consistent_length
+from ..utils.testing import ignore_warnings
 from ..neighbors import NearestNeighbors
 
 from ._dbscan_inner import dbscan_inner
@@ -136,15 +137,16 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
     if metric == 'precomputed' and sparse.issparse(X):
         neighborhoods = np.empty(X.shape[0], dtype=object)
         X.sum_duplicates()  # XXX: modifies X's internals in-place
+
+        # set the diagonal to explicit values, as a point is its own neighbor
+        with ignore_warnings():
+            X.setdiag(X.diagonal())  # XXX: modifies X's internals in-place
+
         X_mask = X.data <= eps
         masked_indices = X.indices.astype(np.intp, copy=False)[X_mask]
-        masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))[X.indptr[1:]]
+        masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))
+        masked_indptr = masked_indptr[X.indptr[1:-1]]
 
-        # insert the diagonal: a point is its own neighbor, but 0 distance
-        # means absence from sparse matrix data
-        masked_indices = np.insert(masked_indices, masked_indptr,
-                                   np.arange(X.shape[0]))
-        masked_indptr = masked_indptr[:-1] + np.arange(1, X.shape[0])
         # split into rows
         neighborhoods[:] = np.split(masked_indices, masked_indptr)
     else:
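The patched masking logic above can be exercised in isolation. This sketch mirrors those lines on a toy 3×3 precomputed graph (values made up) whose diagonal zeros are stored explicitly, showing how the mask/cumsum/split steps recover each point's eps-neighborhood, including the point itself:

```python
import numpy as np
from scipy import sparse

eps = 0.6

# Toy precomputed 3x3 distance graph with the diagonal stored explicitly,
# so every point appears as its own neighbor at distance 0.
row = np.repeat(np.arange(3), 3)
col = np.tile(np.arange(3), 3)
data = np.array([0.0, 0.5, 2.0,
                 0.5, 0.0, 0.9,
                 2.0, 0.9, 0.0])
D = sparse.csr_matrix((data, (row, col)), shape=(3, 3))
D.sum_duplicates()  # canonical CSR format, as in the patched dbscan()

# Keep only stored entries within eps; the explicit diagonal zeros survive
# the <= comparison, which is why they must be stored in the first place.
mask = D.data <= eps
masked_indices = D.indices.astype(np.intp, copy=False)[mask]

# For each interior row boundary, count how many kept entries precede it.
masked_indptr = np.concatenate(([0], np.cumsum(mask)))[D.indptr[1:-1]]

# Split the flat index array into one neighborhood per point.
neighborhoods = np.split(masked_indices, masked_indptr)
print([n.tolist() for n in neighborhoods])  # [[0, 1], [0, 1], [2]]
```

Point 2 is within eps only of itself (its other distances are 2.0 and 0.9 > 0.6), so its neighborhood is just `[2]`; without an explicit zero at `D[2, 2]`, it would wrongly come out empty.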
21 changes: 19 additions & 2 deletions sklearn/cluster/tests/test_dbscan.py
@@ -81,10 +81,12 @@ def test_dbscan_sparse():
     assert_array_equal(labels_dense, labels_sparse)
 
 
-def test_dbscan_sparse_precomputed():
+@pytest.mark.parametrize('include_self', [False, True])
+def test_dbscan_sparse_precomputed(include_self):
     D = pairwise_distances(X)
     nn = NearestNeighbors(radius=.9).fit(X)
-    D_sparse = nn.radius_neighbors_graph(mode='distance')
+    X_ = X if include_self else None
+    D_sparse = nn.radius_neighbors_graph(X=X_, mode='distance')
     # Ensure it is sparse not merely on diagonals:
     assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
     core_sparse, labels_sparse = dbscan(D_sparse,
@@ -97,6 +99,21 @@ def test_dbscan_sparse_precomputed():
     assert_array_equal(labels_dense, labels_sparse)
 
 
+@pytest.mark.parametrize('use_sparse', [True, False])
+@pytest.mark.parametrize('metric', ['precomputed', 'minkowski'])
+def test_dbscan_input_not_modified(use_sparse, metric):
+    # test that the input is not modified by dbscan
+    X = np.random.RandomState(0).rand(10, 10)
+    X = sparse.csr_matrix(X) if use_sparse else X
+    X_copy = X.copy()
+    dbscan(X, metric=metric)
+
+    if use_sparse:
+        assert_array_equal(X.toarray(), X_copy.toarray())
+    else:
+        assert_array_equal(X, X_copy)
+
+
 def test_dbscan_no_core_samples():
     rng = np.random.RandomState(0)
     X = rng.rand(40, 10)
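Note that `test_dbscan_input_not_modified` compares values, not storage: `sum_duplicates()` and `setdiag()` do mutate the CSR internals in place (the `XXX` comments in `dbscan_`), but the matrix they represent is unchanged, so the dense comparison passes. A small sketch of that distinction, with made-up data:

```python
import warnings

import numpy as np
from scipy import sparse

rng = np.random.RandomState(0)
dense = rng.rand(5, 5)
np.fill_diagonal(dense, 0.0)  # zero diagonal, so CSR omits those entries
X = sparse.csr_matrix(dense)
X_copy = X.copy()

with warnings.catch_warnings():
    warnings.simplefilter('ignore')  # inserting entries warns on CSR
    X.setdiag(X.diagonal())          # stores 5 explicit zeros, in place

# The stored structure changed, but the represented matrix did not:
print(X_copy.nnz, X.nnz)                            # 20 25
print(np.array_equal(X.toarray(), X_copy.toarray()))  # True
```

This is why the sparse branch of the test asserts on `toarray()` rather than on `nnz` or the raw `data`/`indices`/`indptr` arrays.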
