MRG: Support for distance matrices in NearestNeighbor #2532

Closed · 15 commits
3 changes: 3 additions & 0 deletions doc/whats_new.rst
@@ -106,6 +106,9 @@ Changelog
 - Add multi-output support to :class:`gaussian_process.GaussianProcess`
   by John Novak.
 
+- Support for distance matrices (i.e. n_samples by n_samples) for
+  NearestNeighbors with algorithm='brute' by `Robert Layton`_.
+
 - Norm computations optimized for NumPy 1.6 and later versions by
   `Lars Buitinck`_. In particular, the k-means algorithm no longer
   needs a temporary data structure the size of its input.
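In practice, the feature this changelog entry describes is used as follows (a minimal sketch against the patched build; the data and parameter values are illustrative only):

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(5, 2)  # 5 samples, 2 features
D = pairwise_distances(X)                # 5 x 5 distance matrix

# Fit on the distance matrix itself rather than on the feature matrix.
nbrs = NearestNeighbors(n_neighbors=2, algorithm='brute',
                        metric='precomputed')
nbrs.fit(D)

# Each query row holds distances to the fitted samples.
dist, ind = nbrs.kneighbors(D)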
4 changes: 3 additions & 1 deletion sklearn/metrics/pairwise.py
@@ -929,7 +929,9 @@ def chi2_kernel(X, Y=None, gamma=1.):
     'euclidean': euclidean_distances,
     'l2': euclidean_distances,
     'l1': manhattan_distances,
-    'manhattan': manhattan_distances, }
+    'manhattan': manhattan_distances,
+    'precomputed': lambda x: x
+}
 
 
 def distance_metrics():
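The identity entry means a precomputed matrix is passed through untouched when distances are requested. A quick sketch, assuming pairwise_distances dispatches through this table as the surrounding code suggests:

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(4, 3)
D = pairwise_distances(X, metric='euclidean')     # computed distances
D2 = pairwise_distances(D, metric='precomputed')  # dispatched to lambda x: x

assert np.array_equal(D, D2)  # the input matrix comes back unchanged

Defining the identity inline as a lambda is cheap, but note the pickling test later in this PR: the estimator must not end up holding a reference to it, since lambdas cannot be pickled.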
18 changes: 15 additions & 3 deletions sklearn/neighbors/base.py
@@ -111,7 +111,10 @@ def _init_params(self, n_neighbors=None, radius=None,
raise ValueError("unrecognized algorithm: '%s'" % algorithm)

if algorithm == 'auto':
alg_check = 'ball_tree'
if metric == 'precomputed':
alg_check = 'brute'
else:
alg_check = 'ball_tree'
else:
alg_check = algorithm

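The point of resolving 'auto' to 'brute' here is that the metric validity check further down compares the metric against VALID_METRICS[alg_check], and a distance matrix can only be consumed by the brute-force path. Conversely, explicitly requesting a tree should fail fast. A sketch of the expected behaviour, assuming 'precomputed' is registered only under VALID_METRICS['brute']:

import numpy as np
from sklearn.neighbors import NearestNeighbors

# A tree cannot consume a precomputed distance matrix, so this is
# rejected during parameter validation (at construction in this code
# base, and in any case no later than fit).
try:
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='kd_tree',
                            metric='precomputed')
    nbrs.fit(np.zeros((3, 3)))
except ValueError as e:
    print(e)  # metric 'precomputed' not valid for algorithm 'kd_tree'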
@@ -197,8 +200,9 @@ def _fit(self, X):
         if self._fit_method == 'auto':
             # A tree approach is better for small number of neighbors,
             # and KDTree is generally faster when available
-            if (self.n_neighbors is None
-                    or self.n_neighbors < self._fit_X.shape[0] // 2):
+            if ((self.n_neighbors is None or
+                 self.n_neighbors < self._fit_X.shape[0] // 2) and
+                    self.metric != 'precomputed'):
                 if self.effective_metric_ in VALID_METRICS['kd_tree']:
                     self._fit_method = 'kd_tree'
                 else:
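With the extra self.metric != 'precomputed' clause, 'auto' can no longer select a tree for a distance-matrix input. A sketch of the resolved method; note that _fit_method is an internal attribute, inspected here purely for illustration:

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(10, 3)
D = pairwise_distances(X)

auto_pre = NearestNeighbors(n_neighbors=2, algorithm='auto',
                            metric='precomputed').fit(D)
print(auto_pre._fit_method)  # 'brute': trees cannot use a distance matrix

auto_euc = NearestNeighbors(n_neighbors=2, algorithm='auto',
                            metric='euclidean').fit(X)
print(auto_euc._fit_method)  # a tree ('kd_tree'), since n_neighbors is small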
@@ -280,6 +284,10 @@ class from an array representing our data set and ask who's

         X = atleast2d_or_csr(X)
 
+        if (self.effective_metric_ == 'precomputed'
+                and X.shape[0] != X.shape[1]):
+            raise ValueError("Precomputed metric requires a square matrix.")
+
         if n_neighbors is None:
             n_neighbors = self.n_neighbors
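This guard turns a shape mistake into an immediate, explicit error: with metric='precomputed', a query must be a matrix of distances to the fitted samples, so accidentally passing the raw feature matrix now fails loudly. The same check is added to radius_neighbors in the next hunk. A sketch:

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(4, 2)
D = pairwise_distances(X)  # 4 x 4, valid

nbrs = NearestNeighbors(n_neighbors=2, algorithm='brute',
                        metric='precomputed').fit(D)
nbrs.kneighbors(D)       # fine: square

try:
    nbrs.kneighbors(X)   # 4 x 2 feature matrix passed by mistake
except ValueError as e:
    print(e)  # "Precomputed metric requires a square matrix."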

@@ -442,6 +450,10 @@ class from an array representing our data set and ask who's

         X = atleast2d_or_csr(X)
 
+        if (self.effective_metric_ == 'precomputed'
+                and X.shape[0] != X.shape[1]):
+            raise ValueError("Precomputed metric requires a square matrix.")
+
         if radius is None:
             radius = self.radius

40 changes: 39 additions & 1 deletion sklearn/neighbors/tests/test_neighbors.py
@@ -1,9 +1,11 @@
 from itertools import product
+import pickle
 
 import numpy as np
 from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix,
                           dok_matrix, lil_matrix)
 
+from sklearn import metrics
 from sklearn.cross_validation import train_test_split
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_array_equal
@@ -94,6 +96,43 @@ def test_unsupervised_inputs():
     assert_array_almost_equal(ind1, ind2)
 
 
+def test_unsupervised_precomputed():
+    """Tests unsupervised NearestNeighbors with a distance matrix."""
+    X = rng.random_sample((3, 4))  # Must not be square for tests below.
+    D = metrics.pairwise_distances(X, metric='euclidean')
+    for method in ['kneighbors', 'radius_neighbors']:
+        # As a feature matrix (n_samples by n_features)
+        nbrs_X = neighbors.NearestNeighbors(n_neighbors=3)
+        nbrs_X.fit(X)
+        dist_X, ind_X = getattr(nbrs_X, method)(X)
+
+        # As a dense distance matrix (n_samples by n_samples)
+        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute',
+                                            metric='precomputed')
+        nbrs_D.fit(D)
+        dist_D, ind_D = getattr(nbrs_D, method)(D)
+        assert_array_almost_equal(dist_X, dist_D)
+        assert_array_almost_equal(ind_X, ind_D)
+
+        # Check that algorithm='auto' works too
+        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
+                                            metric='precomputed')
+        nbrs_D.fit(D)
+        dist_D, ind_D = getattr(nbrs_D, method)(D)
+        assert_array_almost_equal(dist_X, dist_D)
+        assert_array_almost_equal(ind_X, ind_D)
+
+        # Test pickling to ensure the unpicklable identity lambda
+        # didn't get stored on the estimator
+        pickled_nbrs = pickle.dumps(nbrs_D)
+        unpickled_nbrs = pickle.loads(pickled_nbrs)
+        unpickled_dist_D, unpickled_ind_D = getattr(unpickled_nbrs, method)(D)
+        assert_array_almost_equal(unpickled_dist_D, dist_D)
+        assert_array_almost_equal(unpickled_ind_D, ind_D)
+
+        # Must raise a ValueError if the query matrix is not square
+        assert_raises(ValueError, getattr(nbrs_D, method), X)
+
+
 def test_unsupervised_radius_neighbors(n_samples=20, n_features=5,
                                        n_query_pts=2, radius=0.5,
                                        random_state=0):
@@ -210,7 +249,6 @@ def test_kneighbors_classifier_predict_proba():
     assert_array_almost_equal(real_prob, y_prob)
 
 
-
 def test_radius_neighbors_classifier(n_samples=40,
                                      n_features=5,
                                      n_test_pts=10,