
squashed 10206
Conflicts:
	sklearn/manifold/t_sne.py
	sklearn/manifold/tests/test_t_sne.py
thechargedneutron authored and TomDLT committed Jan 26, 2018
1 parent f610b91 commit 96c9d94
Showing 5 changed files with 198 additions and 37 deletions.
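
The squashed changes let TSNE(metric="precomputed") accept a sparse distance matrix (see test_sparse_precomputed_distance below). A minimal sketch of the resulting usage, assuming a scikit-learn build that includes this commit:

import numpy as np
from scipy import sparse
from sklearn.manifold import TSNE
from sklearn.neighbors import kneighbors_graph

X = np.random.RandomState(0).randn(100, 2)
# Distances to the 99 nearest neighbors of each sample, stored sparsely;
# entries that are not stored are treated as missing neighbors, not as zeros.
D_sparse = kneighbors_graph(X, n_neighbors=99, mode='distance')
assert sparse.issparse(D_sparse)

tsne = TSNE(metric="precomputed", random_state=0)
X_embedded = tsne.fit_transform(D_sparse)
print(X_embedded.shape)  # (100, 2)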
13 changes: 7 additions & 6 deletions sklearn/manifold/t_sne.py
@@ -26,6 +26,7 @@
from . import _barnes_hut_tsne
from ..externals.six import string_types
from ..utils import deprecated
from ..utils.fixes import getnnz


MACHINE_EPSILON = np.finfo(np.double).eps
@@ -640,6 +641,12 @@ def _fit(self, X, skip_num_points=0):
raise ValueError("'method' must be 'barnes_hut' or 'exact'")
if self.angle < 0.0 or self.angle > 1.0:
raise ValueError("'angle' must be between 0.0 - 1.0")
if self.method == 'barnes_hut':
X = check_array(X, accept_sparse=['csr'], ensure_min_samples=2,
dtype=[np.float32, np.float64])
else:
X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
dtype=[np.float32, np.float64])
if self.metric == "precomputed":
if isinstance(self.init, string_types) and self.init == 'pca':
raise ValueError("The parameter init=\"pca\" cannot be "
@@ -650,12 +657,6 @@ def _fit(self, X, skip_num_points=0):
raise ValueError("All distances should be positive, the "
"precomputed distances given as X is not "
"correct")
if self.method == 'barnes_hut':
X = check_array(X, accept_sparse=['csr'], ensure_min_samples=2,
dtype=[np.float32, np.float64])
else:
X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
dtype=[np.float32, np.float64])
if self.method == 'barnes_hut' and self.n_components > 3:
raise ValueError("'n_components' should be inferior to 4 for the "
"barnes_hut algorithm as it relies on "
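In the two hunks above, the sparse-aware check_array call is moved ahead of the metric == "precomputed" checks, so X is validated (and sparse input converted to an accepted format) before those checks inspect it. A small illustration of check_array's behaviour with accept_sparse, independent of this commit:

import numpy as np
from scipy import sparse
from sklearn.utils import check_array

X_coo = sparse.coo_matrix(np.eye(3))
# The barnes_hut path accepts only CSR, so COO input is converted.
X_bh = check_array(X_coo, accept_sparse=['csr'],
                   ensure_min_samples=2, dtype=[np.float32, np.float64])
print(X_bh.format)  # 'csr'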
75 changes: 56 additions & 19 deletions sklearn/manifold/tests/test_t_sne.py
@@ -2,8 +2,11 @@
from sklearn.externals.six.moves import cStringIO as StringIO
import numpy as np
import scipy.sparse as sp
import pytest

from sklearn.neighbors import NearestNeighbors, KNeighborsTransformer
from sklearn.neighbors import KNeighborsTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.testing import assert_less_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_almost_equal
@@ -294,18 +297,18 @@ def test_optimization_minimizes_kl_divergence():
assert_less_equal(kl_divergences[2], kl_divergences[1])


def test_fit_csr_matrix():
@pytest.mark.parametrize('method', ['exact', 'barnes_hut'])
def test_fit_csr_matrix(method):
# X can be a sparse matrix.
random_state = check_random_state(0)
X = random_state.randn(100, 2)
X[(np.random.randint(0, 100, 50), np.random.randint(0, 2, 50))] = 0.0
X_csr = sp.csr_matrix(X)
for method in ['exact', 'barnes_hut']:
tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
random_state=0, method=method)
X_embedded = tsne.fit_transform(X_csr)
assert_almost_equal(trustworthiness(X_csr, X_embedded, n_neighbors=1),
1.0, decimal=1)
tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
random_state=0, method=method)
X_embedded = tsne.fit_transform(X_csr)
assert_almost_equal(trustworthiness(X_csr, X_embedded, n_neighbors=1), 1.0,
decimal=1)


def test_preserve_trustworthiness_approximately_with_precomputed_distances():
@@ -337,20 +340,54 @@ def test_too_few_iterations():
np.array([[0.0], [0.0]]))


def test_non_square_precomputed_distances():
# Precomputed distance matrices must be square matrices.
@pytest.mark.parametrize('method, retype', [
('exact', np.asarray),
('barnes_hut', np.asarray),
('barnes_hut', sp.csr_matrix),
])
@pytest.mark.parametrize('D, message_regex', [
([[0.0], [1.0]], ".* square distance matrix"),
([[0., -1.], [1., 0.]], ".* positive.*"),
])
def test_bad_precomputed_distances(method, D, retype, message_regex):
tsne = TSNE(metric="precomputed", method=method)
assert_raises_regexp(ValueError, message_regex,
tsne.fit_transform, retype(D))


def test_exact_no_precomputed_sparse():
tsne = TSNE(metric='precomputed', method='exact')
assert_raises_regexp(TypeError, 'sparse',
tsne.fit_transform,
sp.csr_matrix([[0, 5], [5, 0]]))


def test_high_perplexity_precomputed_sparse_distances():
# The default perplexity requires more neighbors per sample than this
# sparse precomputed distance matrix stores.
dist = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 0.]])
bad_dist = sp.csr_matrix(dist)
tsne = TSNE(metric="precomputed")
assert_raises_regexp(ValueError, ".* square distance matrix",
tsne.fit_transform, np.array([[0.0], [1.0]]))
assert_raises_regexp(ValueError, "2 neighbors per sample are "
"required .*perplexity.*precomputed distance.*",
tsne.fit_transform, bad_dist)


def test_non_positive_precomputed_distances():
# Precomputed distance matrices must be positive.
bad_dist = np.array([[0., -1.], [1., 0.]])
for method in ['barnes_hut', 'exact']:
tsne = TSNE(metric="precomputed", method=method)
assert_raises_regexp(ValueError, "All distances .*precomputed.*",
tsne.fit_transform, bad_dist)
def test_sparse_precomputed_distance():
"""Make sure that TSNE works identically for sparse and dense matrix"""
random_state = check_random_state(0)
X = random_state.randn(100, 2)

D_sparse = kneighbors_graph(X, n_neighbors=99, mode='distance')
D = pairwise_distances(X)
assert sp.issparse(D_sparse)
assert_almost_equal(D_sparse.A, D)

tsne = TSNE(metric="precomputed", random_state=0)
Xt_dense = tsne.fit_transform(D)

for fmt in ['csr', 'lil']:
Xt_sparse = tsne.fit_transform(D_sparse.asformat(fmt))
assert_almost_equal(Xt_dense, Xt_sparse)


def test_non_positive_computed_distances():
47 changes: 39 additions & 8 deletions sklearn/neighbors/base.py
@@ -18,6 +18,7 @@
from ..metrics import pairwise_distances
from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices
from ..utils.fixes import getnnz
from ..utils.multiclass import check_classification_targets
from ..utils.validation import check_is_fitted
from ..externals import six
@@ -407,17 +408,48 @@ class from an array representing our data set and ask who's
X, self._fit_X, self.effective_metric_, n_jobs=n_jobs,
**self.effective_metric_params_)

neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1)
neigh_ind = neigh_ind[:, :n_neighbors]
# argpartition doesn't guarantee sorted order, so we sort again
neigh_ind = neigh_ind[
sample_range, np.argsort(dist[sample_range, neigh_ind])]
if issparse(dist):
print "Dist being printed \n"
print dist.toarray()
print dist.indices
if np.any(getnnz(dist, axis=1) < n_neighbors - query_is_train):
raise ValueError("Not enough neighbors in sparse "
"precomputed matrix to get {} "
"nearest neighbors"
.format(n_neighbors - query_is_train))
# Pre-allocate the index array; every row is assigned below.
neigh_ind = np.zeros((dist.shape[0], dist.shape[1]), dtype=np.intp)
for i in range(dist.shape[0]):
row = np.full(dist.shape[1], np.inf)
data_col = dist.indices[dist.indptr[i]:dist.indptr[i + 1]]
data_values = dist.data[dist.indptr[i]:dist.indptr[i + 1]]
row[data_col] = data_values
neigh_ind[i] = np.argsort(row)
neigh_ind = neigh_ind[:, :n_neighbors]
else:
neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1)
neigh_ind = neigh_ind[:, :n_neighbors]
# argpartition doesn't guarantee sorted order, so we sort again
neigh_ind = neigh_ind[
sample_range, np.argsort(dist[sample_range, neigh_ind])]

if return_distance:
if self.effective_metric_ == 'euclidean':
result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
if issparse(dist):
result = np.sqrt(dist[sample_range, neigh_ind]).toarray(), neigh_ind
else:
result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
else:
result = dist[sample_range, neigh_ind], neigh_ind
if issparse(dist):
result = dist[sample_range, neigh_ind].toarray(), neigh_ind
else:
result = dist[sample_range, neigh_ind], neigh_ind
else:
result = neigh_ind

@@ -458,7 +490,6 @@ class from an array representing our data set and ask who's
# In that case mask the first duplicate.
dup_gr_nbrs = np.all(sample_mask, axis=1)
sample_mask[:, 0][dup_gr_nbrs] = False

neigh_ind = np.reshape(
neigh_ind[sample_mask], (n_samples, n_neighbors - 1))

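The new issparse(dist) branch above is what the sparse precomputed-distance tests below exercise. A hedged end-to-end sketch, mirroring test_precomputed_sparse_invalid rather than documenting a public guarantee:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Symmetric distance matrix; the zero diagonal is not stored explicitly.
dist = csr_matrix(np.array([[0., 2., 1.],
                            [2., 0., 3.],
                            [1., 3., 0.]]))
nn = NearestNeighbors(n_neighbors=1, metric="precomputed").fit(dist)
neigh_dist, neigh_ind = nn.kneighbors(None, n_neighbors=1)
print(neigh_ind.ravel())  # index of the closest stored neighbor of each sample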
63 changes: 59 additions & 4 deletions sklearn/neighbors/tests/test_neighbors.py
@@ -1,5 +1,6 @@
from itertools import product

import pytest
import numpy as np
from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix,
dok_matrix, lil_matrix, issparse)
@@ -19,6 +20,7 @@
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_warns
from sklearn.utils.testing import assert_warns_message
@@ -109,14 +111,13 @@ def test_unsupervised_inputs():
assert_array_almost_equal(ind1, ind2)


def test_precomputed(random_state=42):
def check_precomputed(make_train_test):
"""Tests unsupervised NearestNeighbors with a distance matrix."""
# Note: smaller samples may result in spurious test success
rng = np.random.RandomState(random_state)
rng = np.random.RandomState(42)
X = rng.random_sample((10, 4))
Y = rng.random_sample((3, 4))
DXX = metrics.pairwise_distances(X, metric='euclidean')
DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
DXX, DYX = make_train_test(X, Y)
for method in ['kneighbors']:
# TODO: also test radius_neighbors, but requires different assertion

@@ -163,6 +164,60 @@ def test_precomputed(random_state=42):
assert_array_almost_equal(pred_X, pred_D)


def test_precomputed_dense():
def make_train_test(X_train, X_test):
return (metrics.pairwise_distances(X_train),
metrics.pairwise_distances(X_test, X_train))

check_precomputed(make_train_test)


@pytest.mark.parametrize('fmt', ['csr', 'lil'])
def test_precomputed_sparse_implicit_diagonal(fmt):
def make_train_test(X_train, X_test):
nn = neighbors.NearestNeighbors(n_neighbors=3).fit(X_train)
return (nn.kneighbors_graph(mode='distance').asformat(fmt),
nn.kneighbors_graph(X_test, mode='distance').asformat(fmt))

check_precomputed(make_train_test)


def test_precomputed_sparse_explicit_diagonal():
def make_train_test(X_train, X_test):
nn = neighbors.NearestNeighbors(n_neighbors=3, radius=10).fit(X_train)
return (nn.kneighbors_graph(X_train.copy(), mode='distance'),
nn.kneighbors_graph(X_test, mode='distance'))

check_precomputed(make_train_test)


def test_precomputed_sparse_invalid():
# The sparse distance matrix must store enough neighbors per sample;
# otherwise an informative error is raised.
dist = np.array([[0., 2., 1.], [2., 0., 3.], [1., 3., 0.]])
dist_csr = csr_matrix(dist)
neigh = neighbors.NearestNeighbors(n_neighbors=1, metric="precomputed")
neigh.fit(dist_csr)
neigh.kneighbors(None, n_neighbors=1)
neigh.kneighbors(np.array([[0., 0., 0.]]), n_neighbors=1)

dist = np.array([[0., 2., 0.], [2., 0., 3.], [0., 3., 0.]])
dist_csr = csr_matrix(dist)
neigh.fit(dist_csr)
assert_raises_regex(ValueError, "Not enough neighbors in"
" .* to get 1 nearest neighbors.*", neigh.kneighbors,
None, n_neighbors=1)

# Checks error with inconsistent distance matrix
dist = np.array([[5., 2., 1.], [2., 0., 3.], [1., 3., 0.]])
dist_csr = csr_matrix(dist)
neigh = neighbors.NearestNeighbors(n_neighbors=1, metric="precomputed")
neigh.fit(dist_csr)
assert_raises_regex(ValueError, "Not a valid distance"
" .*non-negative values.*", neigh.kneighbors,
None, n_neighbors=1)


def test_precomputed_cross_validation():
# Ensure array is split correctly
rng = np.random.RandomState(0)
37 changes: 37 additions & 0 deletions sklearn/utils/fixes.py
@@ -295,3 +295,40 @@ def __getstate__(self):
self._fill_value)
else:
from numpy.ma import MaskedArray # noqa


# Remove when minimum required SciPy >= 0.17.0
def downcast_intp_index(arr):
"""
Down-cast index array to np.intp dtype if it is of a larger dtype.
Raise an error if the array contains a value that is too large for
intp.
"""
if arr.dtype.itemsize > np.dtype(np.intp).itemsize:
if arr.size == 0:
return arr.astype(np.intp)
maxval = arr.max()
minval = arr.min()
if maxval > np.iinfo(np.intp).max or minval < np.iinfo(np.intp).min:
raise ValueError("Cannot deal with arrays with indices larger "
"than the machine maximum address size "
"(e.g. 64-bit indices on 32-bit machine).")
return arr.astype(np.intp)
return arr


# Backport of scipy.sparse.csr_matrix.getnnz(axis=...): number of stored
# values in a CSR matrix X, in total or per row/column.
def getnnz(X, axis=None):
if axis is None:
return int(X.indptr[-1])
else:
if axis < 0:
axis += 2
axis, _ = axis, 1 - axis
_, N = X.shape
if axis == 0:
return np.bincount(downcast_intp_index(X.indices),
minlength=N)
elif axis == 1:
return np.diff(X.indptr)
raise ValueError('axis out of bounds')
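
A quick sanity check of the getnnz backport above (getnnz is the internal helper added in this diff, not public scikit-learn API):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.utils.fixes import getnnz

X = csr_matrix(np.array([[0., 2., 1.],
                         [0., 0., 3.],
                         [0., 0., 0.]]))
print(getnnz(X))          # 3 stored values in total
print(getnnz(X, axis=1))  # [2 1 0]: stored values per row
print(getnnz(X, axis=0))  # [0 1 2]: stored values per column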
