[MRG] Deprecate WMinkowskiDistance & make MinkowskiDistance accept weights #21873

Merged
merged 14 commits on Jan 20, 2022
5 changes: 5 additions & 0 deletions doc/whats_new/v1.1.rst
@@ -369,6 +369,11 @@ Changelog
A deprecation cycle was introduced.
:pr:`21576` by :user:`Paul-Emile Dugnat <pedugnat>`.

- |API| The `"wminkowski"` metric of :class:`sklearn.metrics.DistanceMetric` is deprecated
and will be removed in version 1.3. Instead, the existing `"minkowski"` metric now accepts
an optional `w` parameter for weights (see the migration sketch after this diff). This
deprecation keeps the parametrization consistent with the SciPy 1.8 convention.
:pr:`21873` by :user:`Yar Khine Phyo <yarkhinephyo>`

- |Fix| :func:`metrics.silhouette_score` now supports integer input for precomputed
distances. :pr:`22108` by `Thomas Fan`_.

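To make the changelog entry above concrete, here is a minimal migration sketch (an editorial illustration, not code from the PR; the array values are arbitrary). The rescaling `w_old ** p` follows from the fact that the new `w` multiplies `|x - y| ** p`, whereas the old weights multiplied `|x - y|` before exponentiation:

import numpy as np
from sklearn.metrics import DistanceMetric

X = np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]])
w_old = np.array([0.5, 1.0, 2.0])  # weights as passed to the deprecated "wminkowski"
p = 3

# Deprecated spelling (emits a FutureWarning, removed in 1.3):
#     DistanceMetric.get_metric("wminkowski", p=p, w=w_old)
# New spelling: weights now multiply the absolute differences raised to the p power,
# so the old weights must be raised to the p-th power to obtain the same distances.
dm = DistanceMetric.get_metric("minkowski", p=p, w=w_old ** p)
D = dm.pairwise(X)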
2 changes: 2 additions & 0 deletions sklearn/cluster/tests/test_hierarchical.py
@@ -410,6 +410,8 @@ def test_vector_scikit_single_vs_scipy_single(seed):
assess_same_labelling(cut, cut_scipy)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
def test_mst_linkage_core_memory_mapped(metric_param_grid):
"""The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset.
85 changes: 63 additions & 22 deletions sklearn/metrics/_dist_metrics.pyx
@@ -30,6 +30,7 @@ cdef DTYPE_t INF = np.inf
from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE
from ..utils._typedefs import DTYPE, ITYPE
from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
from ..utils import check_array

######################################################################
# newObj function
@@ -119,9 +120,12 @@ cdef class DistanceMetric:
"mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))``
============== ==================== ======== ===============================

Note that "minkowski" with a non-None `w` parameter actually calls
`WMinkowskiDistance` with `w=w ** (1/p)` in order to be consistent with the
parametrization of scipy 1.8 and later.
.. deprecated:: 1.1
`WMinkowskiDistance` is deprecated in version 1.1 and will be removed in version 1.3.
Use `MinkowskiDistance` instead. Note that in `MinkowskiDistance`, the weights are
applied to the absolute differences already raised to the p power. This is different from
`WMinkowskiDistance` where weights are applied to the absolute differences before raising
to the p power. The deprecation aims to remain consistent with the SciPy 1.8 convention.
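For reference (an editorial restatement of the note above; the weighted formula for `WMinkowskiDistance` is taken from its docstring further down in this file), the two parametrizations are

.. math::
   D_{\mathrm{wminkowski}}(x, y) = \left[\sum_i |w_i (x_i - y_i)|^p\right]^{1/p}, \qquad
   D_{\mathrm{minkowski}}(x, y) = \left[\sum_i w_i |x_i - y_i|^p\right]^{1/p}

so, for non-negative weights, `WMinkowskiDistance(p, w)` matches `MinkowskiDistance(p, w**p)`.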

**Metrics intended for two-dimensional vector spaces:** Note that the haversine
distance metric requires data in the form of [latitude, longitude] and both
@@ -257,25 +261,14 @@ cdef class DistanceMetric:
if metric is MinkowskiDistance:
p = kwargs.pop('p', 2)
w = kwargs.pop('w', None)
if w is not None:
# Be consistent with scipy 1.8 conventions: in scipy 1.8,
# 'wminkowski' was removed in favor of passing a
# weight vector directly to 'minkowski', however
# the new weights apply to the absolute differences raised to
# the p power instead of the absolute difference as in
# previous versions of scipy.
# WMinkowskiDistance in sklearn implements the weighting
# scheme of the old 'wminkowski' in scipy < 1.8, hence the
# following adaptation:
return WMinkowskiDistance(p, w ** (1/p), **kwargs)
if p == 1:
if p == 1 and w is None:
return ManhattanDistance(**kwargs)
elif p == 2:
elif p == 2 and w is None:
return EuclideanDistance(**kwargs)
elif np.isinf(p):
elif np.isinf(p) and w is None:
return ChebyshevDistance(**kwargs)
else:
return MinkowskiDistance(p, **kwargs)
return MinkowskiDistance(p, w, **kwargs)
else:
return metric(**kwargs)
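A quick usage sketch of the dispatch above (an editorial illustration; it assumes a SciPy version recent enough that `cdist` accepts a `w` keyword for `"minkowski"`, i.e. the convention this PR aligns with):

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.metrics import DistanceMetric

rng = np.random.RandomState(0)
X, Y = rng.random_sample((5, 4)), rng.random_sample((6, 4))
w = rng.random_sample(4)

# Without w, the common p values still dispatch to the specialized classes above.
dm = DistanceMetric.get_metric("minkowski", p=2)   # EuclideanDistance under the hood
# With w, the general MinkowskiDistance implementation handles any finite p >= 1.
dm_w = DistanceMetric.get_metric("minkowski", p=3, w=w)

np.testing.assert_allclose(dm_w.pairwise(X, Y), cdist(X, Y, "minkowski", p=3, w=w))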

@@ -554,27 +547,64 @@ cdef class MinkowskiDistance(DistanceMetric):
r"""Minkowski Distance

.. math::
D(x, y) = \left[\sum_i |x_i - y_i|^p\right]^{1/p}
D(x, y) = {\|x - y\|}_p

when w is None.

Here is the more general expanded expression for the weighted case:

.. math::
D(x, y) = \left[\sum_i w_i |x_i - y_i|^p\right]^{1/p}

Parameters
----------
p : int
The order of the p-norm of the difference (see above).
w : (N,) array-like (optional)
The weight vector.

Minkowski Distance requires p >= 1 and finite. For p = infinity,
use ChebyshevDistance.
Note that for p=1, ManhattanDistance is more efficient, and for
p=2, EuclideanDistance is more efficient.
"""
def __init__(self, p):
def __init__(self, p, w=None):
if p < 1:
raise ValueError("p must be greater than 1")
elif np.isinf(p):
raise ValueError("MinkowskiDistance requires finite p. "
"For p=inf, use ChebyshevDistance.")

self.p = p
if w is not None:
w_array = check_array(
w, ensure_2d=False, dtype=DTYPE, input_name="w"
)
if (w_array < 0).any():
raise ValueError("w cannot contain negative weights")
self.vec = ReadonlyArrayWrapper(w_array)
self.size = self.vec.shape[0]
else:
self.vec = ReadonlyArrayWrapper(np.asarray([], dtype=DTYPE))
self.size = 0

def _validate_data(self, X):
if self.size > 0 and X.shape[1] != self.size:
raise ValueError("MinkowskiDistance: the size of w must match "
f"the number of features ({X.shape[1]}). "
f"Currently len(w)={self.size}.")

cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
ITYPE_t size) nogil except -1:
cdef DTYPE_t d=0
cdef np.intp_t j
for j in range(size):
d += pow(fabs(x1[j] - x2[j]), self.p)
cdef bint has_w = self.size > 0
if has_w:
for j in range(size):
d += self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p)
else:
for j in range(size):
d += pow(fabs(x1[j] - x2[j]), self.p)
return d

cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
@@ -595,6 +625,7 @@ cdef class MinkowskiDistance(DistanceMetric):


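For readers skimming the Cython, a rough NumPy restatement of the `rdist`/`dist` pair above (an illustrative sketch only): `rdist` accumulates the optionally weighted p-th powers of the absolute differences, and the true distance applies the final `1/p` root.

import numpy as np

def minkowski_rdist(x1, x2, p, w=None):
    # Reduced distance: sum of (optionally weighted) |x1 - x2| ** p, no final root.
    diffs = np.abs(np.asarray(x1) - np.asarray(x2)) ** p
    return np.sum(diffs if w is None else np.asarray(w) * diffs)

def minkowski_dist(x1, x2, p, w=None):
    # True Minkowski distance: the reduced distance raised to 1 / p.
    return minkowski_rdist(x1, x2, p, w) ** (1.0 / p)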
#------------------------------------------------------------
# TODO: Remove in 1.3 - WMinkowskiDistance class
# W-Minkowski Distance
cdef class WMinkowskiDistance(DistanceMetric):
r"""Weighted Minkowski Distance
@@ -613,6 +644,16 @@ cdef class WMinkowskiDistance(DistanceMetric):

"""
def __init__(self, p, w):
from warnings import warn
warn("WMinkowskiDistance is deprecated in version 1.1 and will be "
"removed in version 1.3. Use MinkowskiDistance instead. Note "
"that in MinkowskiDistance, the weights are applied to the "
"absolute differences raised to the p power. This is different "
"from WMinkowskiDistance where weights are applied to the "
"absolute differences before raising to the p power. "
"The deprecation aims to remain consistent with SciPy 1.8 "
"convention.", FutureWarning)

if p < 1:
raise ValueError("p must be greater than 1")
elif np.isinf(p):
63 changes: 63 additions & 0 deletions sklearn/metrics/tests/test_dist_metrics.py
@@ -7,6 +7,7 @@

import pytest

import scipy.sparse as sp
from scipy.spatial.distance import cdist
from sklearn.metrics import DistanceMetric
from sklearn.utils import check_random_state
@@ -89,6 +90,8 @@ def check_cdist(metric, kwargs, X1, X2):
assert_array_almost_equal(D_sklearn, D_scipy_cdist)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
@pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)])
def test_cdist(metric_param_grid, X1, X2):
@@ -120,6 +123,8 @@ def check_cdist_bool(metric, D_true):
assert_array_almost_equal(D12, D_true)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
@pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)])
def test_pdist(metric_param_grid, X1, X2):
@@ -170,6 +175,8 @@ def check_pdist_bool(metric, D_true):
assert_array_almost_equal(D12, D_true)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("writable_kwargs", [True, False])
@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
def test_pickle(writable_kwargs, metric_param_grid):
@@ -185,6 +192,8 @@ def test_pickle(writable_kwargs, metric_param_grid):
check_pickle(metric, kwargs)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("metric", BOOL_METRICS)
@pytest.mark.parametrize("X1_bool", [X1_bool, X1_bool_mmap])
def test_pickle_bool_metrics(metric, X1_bool):
@@ -262,6 +271,8 @@ def custom_metric(x, y):
assert_array_almost_equal(pyfunc.pairwise(X), eucl.pairwise(X) ** 2)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
def test_readonly_kwargs():
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/21685
@@ -277,3 +288,55 @@ def test_readonly_kwargs():
DistanceMetric.get_metric("seuclidean", V=weights)
DistanceMetric.get_metric("wminkowski", p=1, w=weights)
DistanceMetric.get_metric("mahalanobis", VI=VI)


@pytest.mark.parametrize(
"w, err_type, err_msg",
[
(np.array([1, 1.5, -13]), ValueError, "w cannot contain negative weights"),
(np.array([1, 1.5, np.nan]), ValueError, "w contains NaN"),
(
sp.csr_matrix([1, 1.5, 1]),
TypeError,
"A sparse matrix was passed, but dense data is required",
),
(np.array(["a", "b", "c"]), ValueError, "could not convert string to float"),
(np.array([]), ValueError, "a minimum of 1 is required"),
],
)
def test_minkowski_metric_validate_weights_values(w, err_type, err_msg):
with pytest.raises(err_type, match=err_msg):
DistanceMetric.get_metric("minkowski", p=3, w=w)


def test_minkowski_metric_validate_weights_size():
w2 = rng.random_sample(d + 1)
dm = DistanceMetric.get_metric("minkowski", p=3, w=w2)
msg = (
"MinkowskiDistance: the size of w must match "
f"the number of features \\({X1.shape[1]}\\). "
f"Currently len\\(w\\)={w2.shape[0]}."
)
with pytest.raises(ValueError, match=msg):
dm.pairwise(X1, X2)


# TODO: Remove in 1.3 when wminkowski is removed
def test_wminkowski_deprecated():
w = rng.random_sample(d)
msg = "WMinkowskiDistance is deprecated in version 1.1"
with pytest.warns(FutureWarning, match=msg):
DistanceMetric.get_metric("wminkowski", p=3, w=w)


# TODO: Remove in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("p", [1, 1.5, 3])
def test_wminkowski_minkowski_equivalence(p):
w = rng.random_sample(d)
# Weights are rescaled for consistency w.r.t. the scipy 1.8 refactoring of 'minkowski'
dm_wmks = DistanceMetric.get_metric("wminkowski", p=p, w=(w) ** (1 / p))
dm_mks = DistanceMetric.get_metric("minkowski", p=p, w=w)
D_wmks = dm_wmks.pairwise(X1, X2)
D_mks = dm_mks.pairwise(X1, X2)
assert_array_almost_equal(D_wmks, D_mks)
2 changes: 2 additions & 0 deletions sklearn/metrics/tests/test_pairwise.py
@@ -252,6 +252,8 @@ def callable_rbf_kernel(x, y, **kwds):
return K


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize(
"func, metric, kwds",
[
4 changes: 4 additions & 0 deletions sklearn/neighbors/tests/test_neighbors.py
@@ -1275,6 +1275,8 @@ def test_neighbors_badargs():
nbrs.radius_neighbors_graph(X, mode="blah")


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5):
# Test computing the neighbors for various metrics
# create a symmetric matrix
@@ -1381,6 +1383,8 @@ def custom_metric(x1, x2):
assert_array_almost_equal(dist1, dist2)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
def test_valid_brute_metric_for_auto_algorithm():
X = rng.rand(12, 12)
Xcsr = csr_matrix(X)
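The neighbors tests above exercise the metric through the estimator API, so here is a hedged sketch of how a user passes the new-style weights via `metric_params` (parameter values are illustrative; `algorithm="ball_tree"` is chosen because the ball tree works with any `DistanceMetric`):

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = rng.random_sample((20, 3))
w = rng.random_sample(3)

# Weighted Minkowski through the new "minkowski" + w route; the deprecated
# metric="wminkowski" spelling would emit a FutureWarning instead.
nn = NearestNeighbors(
    n_neighbors=5, algorithm="ball_tree", metric="minkowski", p=3, metric_params={"w": w}
)
nn.fit(X)
dist, ind = nn.kneighbors(X[:2])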
2 changes: 2 additions & 0 deletions sklearn/neighbors/tests/test_neighbors_tree.py
@@ -233,6 +233,8 @@ def test_gaussian_kde(Cls, n_samples=1000):
assert_array_almost_equal(dens_tree, dens_gkde, decimal=3)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize(
"Cls, metric",
itertools.chain(