[MRG] Deprecate WMinkowskiDistance & make MinkowskiDistance accept weights #21873

Merged · 14 commits · Jan 20, 2022
Changes from 4 commits
5 changes: 5 additions & 0 deletions doc/whats_new/v1.1.rst
@@ -232,6 +232,11 @@ Changelog
A deprecation cycle was introduced.
:pr:`21576` by :user:`Paul-Emile Dugnat <pedugnat>`.

- |API| The `WMinkowskiDistance` metric is deprecated and will be removed in version 1.3.
The `MinkowskiDistance` metric now accepts an optional parameter `w` for weights.
This deprecation keeps scikit-learn consistent with the SciPy 1.8 convention.
:pr:`21873` by :user:`Yar Khine Phyo <yarkhinephyo>`.

:mod:`sklearn.manifold`
.......................

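The changelog entry above only names the API change; as an illustration (a minimal sketch, assuming scikit-learn 1.1 and synthetic data), the new optional `w` parameter can be used like this:

import numpy as np
from sklearn.metrics import DistanceMetric

rng = np.random.RandomState(0)
X = rng.random_sample((4, 3))
w = rng.random_sample(3)  # non-negative feature weights

# Weighted Minkowski distance: (sum_i w_i * |x_i - y_i| ** p) ** (1 / p)
dist = DistanceMetric.get_metric("minkowski", p=3, w=w)
print(dist.pairwise(X))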
2 changes: 2 additions & 0 deletions sklearn/cluster/tests/test_hierarchical.py
@@ -410,6 +410,8 @@ def test_vector_scikit_single_vs_scipy_single(seed):
assess_same_labelling(cut, cut_scipy)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
def test_mst_linkage_core_memory_mapped(metric_param_grid):
"""The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset.
65 changes: 43 additions & 22 deletions sklearn/metrics/_dist_metrics.pyx
@@ -119,9 +119,12 @@ cdef class DistanceMetric:
"mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))``
============== ==================== ======== ===============================

Note that "minkowski" with a non-None `w` parameter actually calls
`WMinkowskiDistance` with `w=w ** (1/p)` in order to be consistent with the
parametrization of scipy 1.8 and later.
.. deprecated:: 1.1
`WMinkowskiDistance` is deprecated in version 1.1 and will be removed in version 1.3.
Use `MinkowskiDistance` instead. Note that in `MinkowskiDistance`, the weights are
applied to the absolute differences already raised to the p power. This is different from
`WMinkowskiDistance` where weights are applied to the absolute differences before raising
to the p power. The deprecation keeps the parametrization consistent with the SciPy 1.8 convention.

**Metrics intended for two-dimensional vector spaces:** Note that the haversine
distance metric requires data in the form of [latitude, longitude] and both
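To make the deprecation note above concrete (a sketch, assuming scikit-learn 1.1, where both spellings still exist): the deprecated class weights `|x_i - y_i|` before the `p` power, so passing `w ** p` to the new parametrization reproduces the old results.

import warnings

import numpy as np
from sklearn.metrics import DistanceMetric

rng = np.random.RandomState(42)
X = rng.random_sample((5, 3))
p, w = 3, rng.random_sample(3)

# New convention: weights multiply |x_i - y_i| ** p.
new = DistanceMetric.get_metric("minkowski", p=p, w=w ** p)

# Old convention (deprecated): weights multiply |x_i - y_i| before the power.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)
    old = DistanceMetric.get_metric("wminkowski", p=p, w=w)

np.testing.assert_allclose(new.pairwise(X), old.pairwise(X))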
@@ -257,25 +260,14 @@ cdef class DistanceMetric:
if metric is MinkowskiDistance:
p = kwargs.pop('p', 2)
w = kwargs.pop('w', None)
if w is not None:
# Be consistent with scipy 1.8 conventions: in scipy 1.8,
# 'wminkowski' was removed in favor of passing a
# weight vector directly to 'minkowski', however
# the new weights apply to the absolute differences raised to
# the p power instead of the absolute difference as in
# previous versions of scipy.
# WMinkowskiDistance in sklearn implements the weighting
# scheme of the old 'wminkowski' in scipy < 1.8, hence the
# following adaptation:
return WMinkowskiDistance(p, w ** (1/p), **kwargs)
if p == 1:
if p == 1 and w is None:
return ManhattanDistance(**kwargs)
elif p == 2:
elif p == 2 and w is None:
return EuclideanDistance(**kwargs)
elif np.isinf(p):
elif np.isinf(p) and w is None:
return ChebyshevDistance(**kwargs)
else:
return MinkowskiDistance(p, **kwargs)
return MinkowskiDistance(p, w, **kwargs)
else:
return metric(**kwargs)
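For illustration (a sketch, assuming scikit-learn 1.1), the dispatch above keeps the specialized classes when no weight vector is passed and only falls back to the general `MinkowskiDistance` when `w` is given:

import numpy as np
from sklearn.metrics import DistanceMetric

w = np.array([0.5, 1.0, 2.0])

print(type(DistanceMetric.get_metric("minkowski", p=1)).__name__)       # ManhattanDistance
print(type(DistanceMetric.get_metric("minkowski", p=2)).__name__)       # EuclideanDistance
print(type(DistanceMetric.get_metric("minkowski", p=2, w=w)).__name__)  # MinkowskiDistance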

@@ -554,27 +546,49 @@ cdef class MinkowskiDistance(DistanceMetric):
r"""Minkowski Distance

.. math::
D(x, y) = [\sum_i |x_i - y_i|^p] ^ (1/p)
D(x, y) = [\sum_i w_i * |x_i - y_i|^p] ^ (1/p)

Parameters
----------
p : int
The order of the norm of the difference :math:`{||u-v||}_p`.
w : (N,) array-like (optional)
The weight vector.

Minkowski Distance requires p >= 1 and finite. For p = infinity,
use ChebyshevDistance.
Note that for p=1, ManhattanDistance is more efficient, and for
p=2, EuclideanDistance is more efficient.
"""
def __init__(self, p):
def __init__(self, p, w=None):
if p < 1:
raise ValueError("p must be greater than 1")
elif np.isinf(p):
raise ValueError("MinkowskiDistance requires finite p. "
"For p=inf, use ChebyshevDistance.")
elif w is not None and any(w_i < 0 for w_i in w):
raise ValueError("w cannot contain negative weights")

self.p = p
self.vec = ReadonlyArrayWrapper(np.asarray([], dtype=DTYPE))
self.size = 0
if w is not None:
self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype=DTYPE))
self.size = self.vec.shape[0]

def _validate_data(self, X):
if self.size > 0 and X.shape[1] != self.size:
raise ValueError('MinkowskiDistance dist: '
'size of w does not match')

cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
ITYPE_t size) nogil except -1:
cdef DTYPE_t d=0
cdef DTYPE_t vec_j, d=0
cdef np.intp_t j
cdef bint has_w = self.size > 0
for j in range(size):
d += pow(fabs(x1[j] - x2[j]), self.p)
vec_j = self.vec[j] if has_w else 1.
d += vec_j * pow(fabs(x1[j] - x2[j]), self.p)
return d

cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2,
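As a cross-check of the weighted `rdist` above (a sketch, assuming scikit-learn 1.1), the documented formula can be reproduced by hand with NumPy:

import numpy as np
from sklearn.metrics import DistanceMetric

rng = np.random.RandomState(0)
x, y = rng.random_sample(4), rng.random_sample(4)
p, w = 3, rng.random_sample(4)

dm = DistanceMetric.get_metric("minkowski", p=p, w=w)

# rdist is the distance without the final root; dist applies the 1/p root.
rdist_manual = np.sum(w * np.abs(x - y) ** p)
dist_manual = rdist_manual ** (1.0 / p)

np.testing.assert_allclose(dm.pairwise([x], [y])[0, 0], dist_manual)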
@@ -595,6 +609,7 @@ cdef class MinkowskiDistance(DistanceMetric):


#------------------------------------------------------------
# TODO: Remove in 1.3 - WMinkowskiDistance class
# W-Minkowski Distance
cdef class WMinkowskiDistance(DistanceMetric):
r"""Weighted Minkowski Distance
@@ -613,6 +628,12 @@ cdef class WMinkowskiDistance(DistanceMetric):

"""
def __init__(self, p, w):
from warnings import warn
warn("WMinkowskiDistance is deprecated in version 1.1 and will be "
"removed in version 1.3. Use MinkowskiDistance instead. Note "
"that in MinkowskiDistance, the weights are applied to the "
"absolute differences raised to the p power.", FutureWarning)

if p < 1:
raise ValueError("p must be greater than 1")
elif np.isinf(p):
32 changes: 32 additions & 0 deletions sklearn/metrics/tests/test_dist_metrics.py
@@ -89,6 +89,8 @@ def check_cdist(metric, kwargs, X1, X2):
assert_array_almost_equal(D_sklearn, D_scipy_cdist)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
@pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)])
def test_cdist(metric_param_grid, X1, X2):
@@ -120,6 +122,8 @@ def check_cdist_bool(metric, D_true):
assert_array_almost_equal(D12, D_true)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
@pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)])
def test_pdist(metric_param_grid, X1, X2):
@@ -170,6 +174,8 @@ def check_pdist_bool(metric, D_true):
assert_array_almost_equal(D12, D_true)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("writable_kwargs", [True, False])
@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
def test_pickle(writable_kwargs, metric_param_grid):
@@ -185,6 +191,8 @@ def test_pickle(writable_kwargs, metric_param_grid):
check_pickle(metric, kwargs)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("metric", BOOL_METRICS)
@pytest.mark.parametrize("X1_bool", [X1_bool, X1_bool_mmap])
def test_pickle_bool_metrics(metric, X1_bool):
@@ -262,6 +270,8 @@ def custom_metric(x, y):
assert_array_almost_equal(pyfunc.pairwise(X), eucl.pairwise(X) ** 2)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
def test_readonly_kwargs():
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/21685
@@ -277,3 +287,25 @@ def test_readonly_kwargs():
DistanceMetric.get_metric("seuclidean", V=weights)
DistanceMetric.get_metric("wminkowski", p=1, w=weights)
DistanceMetric.get_metric("mahalanobis", VI=VI)


def test_minkowski_metric_validate_weights():
w1 = rng.random_sample(d)
w1[0] = -1337
msg = "w cannot contain negative weights"
with pytest.raises(ValueError, match=msg):
DistanceMetric.get_metric("minkowski", p=3, w=w1)

w2 = rng.random_sample(d + 1)
dm = DistanceMetric.get_metric("minkowski", p=3, w=w2)
msg = "size of w does not match"
with pytest.raises(ValueError, match=msg):
dm.pairwise(X1, X2)


# TODO: Remove in 1.3 when wminkowski is removed
def test_wminkowski_deprecated():
w = rng.random_sample(d)
msg = "WMinkowskiDistance is deprecated in version 1.1"
with pytest.warns(FutureWarning, match=msg):
DistanceMetric.get_metric("wminkowski", p=3, w=w)
2 changes: 2 additions & 0 deletions sklearn/metrics/tests/test_pairwise.py
@@ -252,6 +252,8 @@ def callable_rbf_kernel(x, y, **kwds):
return K


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize(
"func, metric, kwds",
[
4 changes: 4 additions & 0 deletions sklearn/neighbors/tests/test_neighbors.py
@@ -1275,6 +1275,8 @@ def test_neighbors_badargs():
nbrs.radius_neighbors_graph(X, mode="blah")


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5):
# Test computing the neighbors for various metrics
# create a symmetric matrix
@@ -1381,6 +1383,8 @@ def custom_metric(x1, x2):
assert_array_almost_equal(dist1, dist2)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
def test_valid_brute_metric_for_auto_algorithm():
X = rng.rand(12, 12)
Xcsr = csr_matrix(X)
2 changes: 2 additions & 0 deletions sklearn/neighbors/tests/test_neighbors_tree.py
@@ -233,6 +233,8 @@ def test_gaussian_kde(Cls, n_samples=1000):
assert_array_almost_equal(dens_tree, dens_gkde, decimal=3)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize(
"Cls, metric",
itertools.chain(