Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for AttributeError thrown when calling metrics.pairwise_distances with binary metrics and Y is None #13864

Merged
merged 6 commits into from May 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
23 changes: 23 additions & 0 deletions doc/whats_new/v0.21.rst
Expand Up @@ -2,6 +2,29 @@

.. currentmodule:: sklearn

.. _changes_0_21_1:

Version 0.21.1
==============

**May 2019**


This is a bug-fix release with some minor documentation improvements and
enhancements to features released in 0.21.0.

Changelog
---------

:mod:`sklearn.metrics`
......................

- |Fix| Fixed a bug in :func:`metrics.pairwise_distances` where it would raise
``AttributeError`` for boolean metrics when ``X`` had a boolean dtype and
``Y is None``.
:issue:`13864` by :user:`Paresh Mathur <rick2047>`.


.. _changes_0_21:

Version 0.21.0
Expand Down
14 changes: 7 additions & 7 deletions sklearn/metrics/pairwise.py
Expand Up @@ -306,7 +306,7 @@ def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None):
maxmem = max(
((x_density * n_samples_X + y_density * n_samples_Y) * n_features
+ (x_density * n_samples_X * y_density * n_samples_Y)) / 10,
10 * 2**17)
10 * 2 ** 17)

# The increase amount of memory in 8-byte blocks is:
# - x_density * batch_size * n_features (copy of chunk of X)
Expand All @@ -315,7 +315,7 @@ def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None):
# Hence x² + (xd+yd)kx = M, where x=batch_size, k=n_features, M=maxmem
# xd=x_density and yd=y_density
tmp = (x_density + y_density) * n_features
batch_size = (-tmp + np.sqrt(tmp**2 + 4 * maxmem)) / 2
batch_size = (-tmp + np.sqrt(tmp ** 2 + 4 * maxmem)) / 2
batch_size = max(int(batch_size), 1)

x_batches = gen_batches(X.shape[0], batch_size)
Expand Down Expand Up @@ -900,7 +900,7 @@ def sigmoid_kernel(X, Y=None, gamma=None, coef0=1):
K = safe_sparse_dot(X, Y.T, dense_output=True)
K *= gamma
K += coef0
np.tanh(K, K) # compute tanh in-place
np.tanh(K, K) # compute tanh in-place
return K


Expand Down Expand Up @@ -933,7 +933,7 @@ def rbf_kernel(X, Y=None, gamma=None):

K = euclidean_distances(X, Y, squared=True)
K *= -gamma
np.exp(K, K) # exponentiate K in-place
np.exp(K, K) # exponentiate K in-place
return K


Expand Down Expand Up @@ -967,7 +967,7 @@ def laplacian_kernel(X, Y=None, gamma=None):
gamma = 1.0 / X.shape[1]

K = -gamma * manhattan_distances(X, Y)
np.exp(K, K) # exponentiate K in-place
np.exp(K, K) # exponentiate K in-place
return K


Expand Down Expand Up @@ -1545,7 +1545,8 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, **kwds):

dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

if dtype == bool and (X.dtype != bool or Y.dtype != bool):
if (dtype == bool and
(X.dtype != bool or (Y is not None and Y.dtype != bool))):
msg = "Data was converted to boolean for metric %s" % metric
warnings.warn(msg, DataConversionWarning)

Expand Down Expand Up @@ -1576,7 +1577,6 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, **kwds):
'yule',
]


# Helper functions - distance
PAIRWISE_KERNEL_FUNCTIONS = {
# If updating this dictionary, update the doc in both distance_metrics()
Expand Down
9 changes: 9 additions & 0 deletions sklearn/metrics/tests/test_pairwise.py
Expand Up @@ -173,6 +173,15 @@ def test_pairwise_boolean_distance(metric):
with pytest.warns(DataConversionWarning, match=msg):
pairwise_distances(X, metric=metric)

# Check that the warning is raised if X is boolean but Y is not boolean:
with pytest.warns(DataConversionWarning, match=msg):
pairwise_distances(X.astype(bool), Y=Y, metric=metric)

# Check that no warning is raised if X is already boolean and Y is None:
with pytest.warns(None) as records:
pairwise_distances(X.astype(bool), metric=metric)
assert len(records) == 0


def test_no_data_conversion_warning():
# No warnings issued if metric is not a boolean distance function
Expand Down