MRG: Support for distance matrices in NearestNeighbor #2532

Closed · 15 commits
3 changes: 3 additions & 0 deletions doc/whats_new.rst
@@ -106,6 +106,9 @@ Changelog
 - Add multi-output support to :class:`gaussian_process.GaussianProcess`
   by John Novak.
 
+- Support for distance matrices (i.e. n_samples by n_samples) for
+  NearestNeighbors with algorithm='brute' by `Robert Layton`_.
+
 - Norm computations optimized for NumPy 1.6 and later versions by
   `Lars Buitinck`_. In particular, the k-means algorithm no longer
   needs a temporary data structure the size of its input.
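In practice, the feature this changelog entry describes is used as follows (a minimal sketch against the patched build; the data and parameter values are illustrative only):

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(5, 2)  # 5 samples, 2 features
D = pairwise_distances(X)                # 5 x 5 distance matrix

# Fit on the distance matrix itself rather than on the feature matrix.
nbrs = NearestNeighbors(n_neighbors=2, algorithm='brute',
                        metric='precomputed')
nbrs.fit(D)

# Each query row holds distances to the fitted samples.
dist, ind = nbrs.kneighbors(D)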
4 changes: 3 additions & 1 deletion sklearn/metrics/pairwise.py
@@ -929,7 +929,9 @@ def chi2_kernel(X, Y=None, gamma=1.):
     'euclidean': euclidean_distances,
     'l2': euclidean_distances,
     'l1': manhattan_distances,
-    'manhattan': manhattan_distances, }
+    'manhattan': manhattan_distances,
+    'precomputed': lambda x: x
+}
 
 
 def distance_metrics():
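The identity entry means a precomputed matrix is passed through untouched when distances are requested. A quick sketch, assuming pairwise_distances dispatches through this table as the surrounding code suggests:

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(4, 3)
D = pairwise_distances(X, metric='euclidean')     # computed distances
D2 = pairwise_distances(D, metric='precomputed')  # dispatched to lambda x: x

assert np.array_equal(D, D2)  # the input matrix comes back unchanged

Defining the identity inline as a lambda is cheap, but note the pickling test later in this PR: the estimator must not end up holding a reference to it, since lambdas cannot be pickled.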
18 changes: 15 additions & 3 deletions sklearn/neighbors/base.py
@@ -111,7 +111,10 @@ def _init_params(self, n_neighbors=None, radius=None,
raise ValueError("unrecognized algorithm: '%s'" % algorithm)

if algorithm == 'auto':
alg_check = 'ball_tree'
if metric == 'precomputed':
alg_check = 'brute'
else:
alg_check = 'ball_tree'
else:
alg_check = algorithm

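The point of resolving 'auto' to 'brute' here is that the metric validity check further down compares the metric against VALID_METRICS[alg_check], and a distance matrix can only be consumed by the brute-force path. Conversely, explicitly requesting a tree should fail fast. A sketch of the expected behaviour, assuming 'precomputed' is registered only under VALID_METRICS['brute']:

import numpy as np
from sklearn.neighbors import NearestNeighbors

# A tree cannot consume a precomputed distance matrix, so this is
# rejected during parameter validation (at construction in this code
# base, and in any case no later than fit).
try:
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='kd_tree',
                            metric='precomputed')
    nbrs.fit(np.zeros((3, 3)))
except ValueError as e:
    print(e)  # metric 'precomputed' not valid for algorithm 'kd_tree'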
@@ -197,8 +200,9 @@ def _fit(self, X):
         if self._fit_method == 'auto':
             # A tree approach is better for small number of neighbors,
             # and KDTree is generally faster when available
-            if (self.n_neighbors is None
-                    or self.n_neighbors < self._fit_X.shape[0] // 2):
+            if ((self.n_neighbors is None or
+                 self.n_neighbors < self._fit_X.shape[0] // 2) and
+                    self.metric != 'precomputed'):
                 if self.effective_metric_ in VALID_METRICS['kd_tree']:
                     self._fit_method = 'kd_tree'
                 else:
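With the extra self.metric != 'precomputed' clause, 'auto' can no longer select a tree for a distance-matrix input. A sketch of the resolved method; note that _fit_method is an internal attribute, inspected here purely for illustration:

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(10, 3)
D = pairwise_distances(X)

auto_pre = NearestNeighbors(n_neighbors=2, algorithm='auto',
                            metric='precomputed').fit(D)
print(auto_pre._fit_method)  # 'brute': trees cannot use a distance matrix

auto_euc = NearestNeighbors(n_neighbors=2, algorithm='auto',
                            metric='euclidean').fit(X)
print(auto_euc._fit_method)  # a tree ('kd_tree'), since n_neighbors is small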
@@ -280,6 +284,10 @@ class from an array representing our data set and ask who's

         X = atleast2d_or_csr(X)
 
+        if (self.effective_metric_ == 'precomputed'
+                and X.shape[0] != X.shape[1]):
+            raise ValueError("Precomputed metric requires a square matrix.")
+
         if n_neighbors is None:
             n_neighbors = self.n_neighbors
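This guard turns a shape mistake into an immediate, explicit error: with metric='precomputed', a query must be a matrix of distances to the fitted samples, so accidentally passing the raw feature matrix now fails loudly. The same check is added to radius_neighbors in the next hunk. A sketch:

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(4, 2)
D = pairwise_distances(X)  # 4 x 4, valid

nbrs = NearestNeighbors(n_neighbors=2, algorithm='brute',
                        metric='precomputed').fit(D)
nbrs.kneighbors(D)       # fine: square

try:
    nbrs.kneighbors(X)   # 4 x 2 feature matrix passed by mistake
except ValueError as e:
    print(e)  # "Precomputed metric requires a square matrix."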

@@ -442,6 +450,10 @@ class from an array representing our data set and ask who's

         X = atleast2d_or_csr(X)
 
+        if (self.effective_metric_ == 'precomputed'
+                and X.shape[0] != X.shape[1]):
+            raise ValueError("Precomputed metric requires a square matrix.")
+
         if radius is None:
             radius = self.radius

40 changes: 39 additions & 1 deletion sklearn/neighbors/tests/test_neighbors.py
@@ -1,9 +1,11 @@
 from itertools import product
+import pickle
 
 import numpy as np
 from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix,
                           dok_matrix, lil_matrix)
 
+from sklearn import metrics
 from sklearn.cross_validation import train_test_split
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_array_equal
@@ -94,6 +96,43 @@ def test_unsupervised_inputs():
     assert_array_almost_equal(ind1, ind2)
 
 
+def test_unsupervised_precomputed():
+    """Tests unsupervised NearestNeighbors with a distance matrix."""
+    X = rng.random_sample((3, 4))  # Must not be square for tests below.
+    D = metrics.pairwise_distances(X, metric='euclidean')
+    for method in ['kneighbors', 'radius_neighbors']:
+        # As a feature matrix (n_samples by n_features)
+        nbrs_X = neighbors.NearestNeighbors(n_neighbors=3)
+        nbrs_X.fit(X)
+        dist_X, ind_X = getattr(nbrs_X, method)(X)
+
+        # As a dense distance matrix (n_samples by n_samples)
+        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute',
+                                            metric='precomputed')
+        nbrs_D.fit(D)
+        dist_D, ind_D = getattr(nbrs_D, method)(D)
+        assert_array_almost_equal(dist_X, dist_D)
+        assert_array_almost_equal(ind_X, ind_D)
+
+        # Check that algorithm='auto' works too
+        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
+                                            metric='precomputed')
+        nbrs_D.fit(D)
+        dist_D, ind_D = getattr(nbrs_D, method)(D)
+        assert_array_almost_equal(dist_X, dist_D)
+        assert_array_almost_equal(ind_X, ind_D)
+
+        # Test pickling to ensure the unpicklable identity lambda
+        # didn't get stored on the estimator
+        pickled_nbrs = pickle.dumps(nbrs_D)
+        unpickled_nbrs = pickle.loads(pickled_nbrs)
+        unpickled_dist_D, unpickled_ind_D = getattr(unpickled_nbrs, method)(D)
+        assert_array_almost_equal(unpickled_dist_D, dist_D)
+        assert_array_almost_equal(unpickled_ind_D, ind_D)
+
+        # Must raise a ValueError if the query matrix is not square
+        assert_raises(ValueError, getattr(nbrs_D, method), X)
+
+
 def test_unsupervised_radius_neighbors(n_samples=20, n_features=5,
                                        n_query_pts=2, radius=0.5,
                                        random_state=0):
@@ -210,7 +249,6 @@ def test_kneighbors_classifier_predict_proba():
     assert_array_almost_equal(real_prob, y_prob)
 
 
-
 def test_radius_neighbors_classifier(n_samples=40,
                                      n_features=5,
                                      n_test_pts=10,