MAINT Mark PairwiseDistancesReductions as unusable for some config.

scikit-learn · Sep 15, 2022 · 1d7bcc7 · 1d7bcc7
1 parent 63fda8c
commit 1d7bcc7
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 2 deletions.
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -110,7 +110,7 @@ def is_valid_sparse_matrix(X):
                 X.indices.dtype == X.indptr.dtype == np.int32
             )
 
-        return (
+        is_usable = (
             get_config().get("enable_cython_pairwise_dist", True)
             and (is_numpy_c_ordered(X) or is_valid_sparse_matrix(X))
             and (is_numpy_c_ordered(Y) or is_valid_sparse_matrix(Y))
@@ -119,6 +119,24 @@ def is_valid_sparse_matrix(X):
             and metric in cls.valid_metrics()
         )
 
+        # The other joblib-based back-end might be more efficient on fused sparse-dense
+        # datasets' pairs on metric="(sq)euclidean" for some configurations because it
+        # uses the Squared Euclidean matrix decomposition, i.e.:
+        #
+        #       ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+        #
+        # calling efficient sparse-dense routines for matrix and vectors multiplication
+        # implemented in SciPy we do not use yet here.
+        # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669  # noqa
+        # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
+        # using sparse-dense routines for matrix-vector multiplications.
+        fused_sparse_dense_euclidean_case_guard = not (
+            (is_valid_sparse_matrix(X) or is_valid_sparse_matrix(Y))
+            and "euclidean" in metric
+        )
+
+        return is_usable and fused_sparse_dense_euclidean_case_guard
+
     @classmethod
     @abstractmethod
     def compute(

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -518,7 +518,7 @@ def test_pairwise_distances_reduction_is_usable_for():
     Y = rng.rand(100, 10)
     X_csr = csr_matrix(X)
     Y_csr = csr_matrix(Y)
-    metric = "euclidean"
+    metric = "manhattan"
 
     # Must be usable for all possible pair of {dense, sparse} datasets
     assert BaseDistanceReductionDispatcher.is_usable_for(X, Y, metric)
@@ -551,6 +551,19 @@ def test_pairwise_distances_reduction_is_usable_for():
         np.asfortranarray(X), Y, metric
     )
 
+    # We prefer not to use those implementations for fused sparse-dense when
+    # metric="(sq)euclidean" because it's not yet the most efficient one on
+    # all configurations of datasets.
+    # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669  # noqa
+    # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
+    # using sparse-dense routines for matrix-vector multiplications.
+    assert not BaseDistanceReductionDispatcher.is_usable_for(
+        X_csr, Y, metric="euclidean"
+    )
+    assert not BaseDistanceReductionDispatcher.is_usable_for(
+        X_csr, Y_csr, metric="sqeuclidean"
+    )
+
     # CSR matrices without non-zeros elements aren't currently supported
     # TODO: support CSR matrices without non-zeros elements
     X_csr_0_nnz = csr_matrix(X * 0)