Skip to content

Commit

Permalink
MAINT Mark PairwiseDistancesReductions as unusable for some config.
Browse files Browse the repository at this point in the history
  • Loading branch information
jjerphan committed Sep 15, 2022
1 parent 63fda8c commit 1d7bcc7
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 2 deletions.
20 changes: 19 additions & 1 deletion sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def is_valid_sparse_matrix(X):
X.indices.dtype == X.indptr.dtype == np.int32
)

return (
is_usable = (
get_config().get("enable_cython_pairwise_dist", True)
and (is_numpy_c_ordered(X) or is_valid_sparse_matrix(X))
and (is_numpy_c_ordered(Y) or is_valid_sparse_matrix(Y))
Expand All @@ -119,6 +119,24 @@ def is_valid_sparse_matrix(X):
and metric in cls.valid_metrics()
)

# The other joblib-based back-end might be more efficient on fused sparse-dense
# datasets' pairs on metric="(sq)euclidean" for some configurations because it
# uses the Squared Euclidean matrix decomposition, i.e.:
#
# ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
#
# calling efficient sparse-dense routines for matrix and vectors multiplication
# implemented in SciPy we do not use yet here.
# See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669 # noqa
# TODO: implement specialisation for (sq)euclidean on fused sparse-dense
# using sparse-dense routines for matrix-vector multiplications.
fused_sparse_dense_euclidean_case_guard = not (
(is_valid_sparse_matrix(X) or is_valid_sparse_matrix(Y))
and "euclidean" in metric
)

return is_usable and fused_sparse_dense_euclidean_case_guard

@classmethod
@abstractmethod
def compute(
Expand Down
15 changes: 14 additions & 1 deletion sklearn/metrics/tests/test_pairwise_distances_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@ def test_pairwise_distances_reduction_is_usable_for():
Y = rng.rand(100, 10)
X_csr = csr_matrix(X)
Y_csr = csr_matrix(Y)
metric = "euclidean"
metric = "manhattan"

# Must be usable for all possible pair of {dense, sparse} datasets
assert BaseDistanceReductionDispatcher.is_usable_for(X, Y, metric)
Expand Down Expand Up @@ -551,6 +551,19 @@ def test_pairwise_distances_reduction_is_usable_for():
np.asfortranarray(X), Y, metric
)

# We prefer not to use those implementations for fused sparse-dense when
# metric="(sq)euclidean" because it's not yet the most efficient one on
# all configurations of datasets.
# See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669 # noqa
# TODO: implement specialisation for (sq)euclidean on fused sparse-dense
# using sparse-dense routines for matrix-vector multiplications.
assert not BaseDistanceReductionDispatcher.is_usable_for(
X_csr, Y, metric="euclidean"
)
assert not BaseDistanceReductionDispatcher.is_usable_for(
X_csr, Y_csr, metric="sqeuclidean"
)

# CSR matrices without non-zeros elements aren't currently supported
# TODO: support CSR matrices without non-zeros elements
X_csr_0_nnz = csr_matrix(X * 0)
Expand Down

0 comments on commit 1d7bcc7

Please sign in to comment.