
squashed 10206
Conflicts:
	sklearn/manifold/t_sne.py
	sklearn/manifold/tests/test_t_sne.py
thechargedneutron authored and TomDLT committed Jan 26, 2018
1 parent f610b91 commit 96c9d94
Showing 5 changed files with 198 additions and 37 deletions.
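
The squashed changes let TSNE(metric="precomputed") accept a sparse distance matrix (see test_sparse_precomputed_distance below). A minimal sketch of the resulting usage, assuming a scikit-learn build that includes this commit:

import numpy as np
from scipy import sparse
from sklearn.manifold import TSNE
from sklearn.neighbors import kneighbors_graph

X = np.random.RandomState(0).randn(100, 2)
# Distances to the 99 nearest neighbors of each sample, stored sparsely;
# entries that are not stored are treated as missing neighbors, not as zeros.
D_sparse = kneighbors_graph(X, n_neighbors=99, mode='distance')
assert sparse.issparse(D_sparse)

tsne = TSNE(metric="precomputed", random_state=0)
X_embedded = tsne.fit_transform(D_sparse)
print(X_embedded.shape)  # (100, 2)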
13 changes: 7 additions & 6 deletions sklearn/manifold/t_sne.py
@@ -26,6 +26,7 @@
from . import _barnes_hut_tsne
from ..externals.six import string_types
from ..utils import deprecated
from ..utils.fixes import getnnz


MACHINE_EPSILON = np.finfo(np.double).eps
@@ -640,6 +641,12 @@ def _fit(self, X, skip_num_points=0):
raise ValueError("'method' must be 'barnes_hut' or 'exact'")
if self.angle < 0.0 or self.angle > 1.0:
raise ValueError("'angle' must be between 0.0 - 1.0")
if self.method == 'barnes_hut':
X = check_array(X, accept_sparse=['csr'], ensure_min_samples=2,
dtype=[np.float32, np.float64])
else:
X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
dtype=[np.float32, np.float64])
if self.metric == "precomputed":
if isinstance(self.init, string_types) and self.init == 'pca':
raise ValueError("The parameter init=\"pca\" cannot be "
@@ -650,12 +657,6 @@ def _fit(self, X, skip_num_points=0):
raise ValueError("All distances should be positive, the "
"precomputed distances given as X is not "
"correct")
if self.method == 'barnes_hut':
X = check_array(X, accept_sparse=['csr'], ensure_min_samples=2,
dtype=[np.float32, np.float64])
else:
X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
dtype=[np.float32, np.float64])
if self.method == 'barnes_hut' and self.n_components > 3:
raise ValueError("'n_components' should be inferior to 4 for the "
"barnes_hut algorithm as it relies on "
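In the two hunks above, the sparse-aware check_array call is moved ahead of the metric == "precomputed" checks, so X is validated (and sparse input converted to an accepted format) before those checks inspect it. A small illustration of check_array's behaviour with accept_sparse, independent of this commit:

import numpy as np
from scipy import sparse
from sklearn.utils import check_array

X_coo = sparse.coo_matrix(np.eye(3))
# The barnes_hut path accepts only CSR, so COO input is converted.
X_bh = check_array(X_coo, accept_sparse=['csr'],
                   ensure_min_samples=2, dtype=[np.float32, np.float64])
print(X_bh.format)  # 'csr'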
75 changes: 56 additions & 19 deletions sklearn/manifold/tests/test_t_sne.py
@@ -2,8 +2,11 @@
from sklearn.externals.six.moves import cStringIO as StringIO
import numpy as np
import scipy.sparse as sp
import pytest

from sklearn.neighbors import NearestNeighbors, KNeighborsTransformer
from sklearn.neighbors import KNeighborsTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.testing import assert_less_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_almost_equal
@@ -294,18 +297,18 @@ def test_optimization_minimizes_kl_divergence():
assert_less_equal(kl_divergences[2], kl_divergences[1])


def test_fit_csr_matrix():
@pytest.mark.parametrize('method', ['exact', 'barnes_hut'])
def test_fit_csr_matrix(method):
# X can be a sparse matrix.
random_state = check_random_state(0)
X = random_state.randn(100, 2)
X[(np.random.randint(0, 100, 50), np.random.randint(0, 2, 50))] = 0.0
X_csr = sp.csr_matrix(X)
for method in ['exact', 'barnes_hut']:
tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
random_state=0, method=method)
X_embedded = tsne.fit_transform(X_csr)
assert_almost_equal(trustworthiness(X_csr, X_embedded, n_neighbors=1),
1.0, decimal=1)
tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
random_state=0, method=method)
X_embedded = tsne.fit_transform(X_csr)
assert_almost_equal(trustworthiness(X_csr, X_embedded, n_neighbors=1), 1.0,
decimal=1)


def test_preserve_trustworthiness_approximately_with_precomputed_distances():
@@ -337,20 +340,54 @@ def test_too_few_iterations():
np.array([[0.0], [0.0]]))


def test_non_square_precomputed_distances():
# Precomputed distance matrices must be square matrices.
@pytest.mark.parametrize('method, retype', [
('exact', np.asarray),
('barnes_hut', np.asarray),
('barnes_hut', sp.csr_matrix),
])
@pytest.mark.parametrize('D, message_regex', [
([[0.0], [1.0]], ".* square distance matrix"),
([[0., -1.], [1., 0.]], ".* positive.*"),
])
def test_bad_precomputed_distances(method, D, retype, message_regex):
tsne = TSNE(metric="precomputed", method=method)
assert_raises_regexp(ValueError, message_regex,
tsne.fit_transform, retype(D))


def test_exact_no_precomputed_sparse():
tsne = TSNE(metric='precomputed', method='exact')
assert_raises_regexp(TypeError, 'sparse',
tsne.fit_transform,
sp.csr_matrix([[0, 5], [5, 0]]))


def test_high_perplexity_precomputed_sparse_distances():
# The default perplexity requires more neighbors per sample than this
# sparse precomputed distance matrix stores.
dist = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 0.]])
bad_dist = sp.csr_matrix(dist)
tsne = TSNE(metric="precomputed")
assert_raises_regexp(ValueError, ".* square distance matrix",
tsne.fit_transform, np.array([[0.0], [1.0]]))
assert_raises_regexp(ValueError, "2 neighbors per sample are "
"required .*perplexity.*precomputed distance.*",
tsne.fit_transform, bad_dist)


def test_non_positive_precomputed_distances():
# Precomputed distance matrices must be positive.
bad_dist = np.array([[0., -1.], [1., 0.]])
for method in ['barnes_hut', 'exact']:
tsne = TSNE(metric="precomputed", method=method)
assert_raises_regexp(ValueError, "All distances .*precomputed.*",
tsne.fit_transform, bad_dist)
def test_sparse_precomputed_distance():
"""Make sure that TSNE works identically for sparse and dense matrix"""
random_state = check_random_state(0)
X = random_state.randn(100, 2)

D_sparse = kneighbors_graph(X, n_neighbors=99, mode='distance')
D = pairwise_distances(X)
assert sp.issparse(D_sparse)
assert_almost_equal(D_sparse.A, D)

tsne = TSNE(metric="precomputed", random_state=0)
Xt_dense = tsne.fit_transform(D)

for fmt in ['csr', 'lil']:
Xt_sparse = tsne.fit_transform(D_sparse.asformat(fmt))
assert_almost_equal(Xt_dense, Xt_sparse)


def test_non_positive_computed_distances():
47 changes: 39 additions & 8 deletions sklearn/neighbors/base.py
@@ -18,6 +18,7 @@
from ..metrics import pairwise_distances
from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices
from ..utils.fixes import getnnz
from ..utils.multiclass import check_classification_targets
from ..utils.validation import check_is_fitted
from ..externals import six
@@ -407,17 +408,48 @@ class from an array representing our data set and ask who's
X, self._fit_X, self.effective_metric_, n_jobs=n_jobs,
**self.effective_metric_params_)

neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1)
neigh_ind = neigh_ind[:, :n_neighbors]
# argpartition doesn't guarantee sorted order, so we sort again
neigh_ind = neigh_ind[
sample_range, np.argsort(dist[sample_range, neigh_ind])]
if issparse(dist):
print "Dist being printed \n"
print dist.toarray()
print dist.indices
if np.any(getnnz(dist, axis=1) < n_neighbors - query_is_train):
raise ValueError("Not enough neighbors in sparse "
"precomputed matrix to get {} "
"nearest neighbors"
.format(n_neighbors - query_is_train))
# Pre-allocate the index array; every row is assigned below.
neigh_ind = np.zeros((dist.shape[0], dist.shape[1]), dtype=np.intp)
for i in range(dist.shape[0]):
row = np.full(dist.shape[1], np.inf)
data_col = dist.indices[dist.indptr[i]:dist.indptr[i + 1]]
data_values = dist.data[dist.indptr[i]:dist.indptr[i + 1]]
row[data_col] = data_values
neigh_ind[i] = np.argsort(row)
neigh_ind = neigh_ind[:, :n_neighbors]
else:
neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1)
neigh_ind = neigh_ind[:, :n_neighbors]
# argpartition doesn't guarantee sorted order, so we sort again
neigh_ind = neigh_ind[
sample_range, np.argsort(dist[sample_range, neigh_ind])]

if return_distance:
if self.effective_metric_ == 'euclidean':
result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
if issparse(dist):
result = np.sqrt(dist[sample_range, neigh_ind]).toarray(), neigh_ind
else:
result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
else:
result = dist[sample_range, neigh_ind], neigh_ind
if issparse(dist):
result = dist[sample_range, neigh_ind].toarray(), neigh_ind
else:
result = dist[sample_range, neigh_ind], neigh_ind
else:
result = neigh_ind

@@ -458,7 +490,6 @@ class from an array representing our data set and ask who's
# In that case mask the first duplicate.
dup_gr_nbrs = np.all(sample_mask, axis=1)
sample_mask[:, 0][dup_gr_nbrs] = False

neigh_ind = np.reshape(
neigh_ind[sample_mask], (n_samples, n_neighbors - 1))

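The new issparse(dist) branch above is what the sparse precomputed-distance tests below exercise. A hedged end-to-end sketch, mirroring test_precomputed_sparse_invalid rather than documenting a public guarantee:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Symmetric distance matrix; the zero diagonal is not stored explicitly.
dist = csr_matrix(np.array([[0., 2., 1.],
                            [2., 0., 3.],
                            [1., 3., 0.]]))
nn = NearestNeighbors(n_neighbors=1, metric="precomputed").fit(dist)
neigh_dist, neigh_ind = nn.kneighbors(None, n_neighbors=1)
print(neigh_ind.ravel())  # index of the closest stored neighbor of each sample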
63 changes: 59 additions & 4 deletions sklearn/neighbors/tests/test_neighbors.py
@@ -1,5 +1,6 @@
from itertools import product

import pytest
import numpy as np
from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix,
dok_matrix, lil_matrix, issparse)
@@ -19,6 +20,7 @@
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_warns
from sklearn.utils.testing import assert_warns_message
@@ -109,14 +111,13 @@ def test_unsupervised_inputs():
assert_array_almost_equal(ind1, ind2)


def test_precomputed(random_state=42):
def check_precomputed(make_train_test):
"""Tests unsupervised NearestNeighbors with a distance matrix."""
# Note: smaller samples may result in spurious test success
rng = np.random.RandomState(random_state)
rng = np.random.RandomState(42)
X = rng.random_sample((10, 4))
Y = rng.random_sample((3, 4))
DXX = metrics.pairwise_distances(X, metric='euclidean')
DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
DXX, DYX = make_train_test(X, Y)
for method in ['kneighbors']:
# TODO: also test radius_neighbors, but requires different assertion

@@ -163,6 +164,60 @@ def test_precomputed(random_state=42):
assert_array_almost_equal(pred_X, pred_D)


def test_precomputed_dense():
def make_train_test(X_train, X_test):
return (metrics.pairwise_distances(X_train),
metrics.pairwise_distances(X_test, X_train))

check_precomputed(make_train_test)


@pytest.mark.parametrize('fmt', ['csr', 'lil'])
def test_precomputed_sparse_implicit_diagonal(fmt):
def make_train_test(X_train, X_test):
nn = neighbors.NearestNeighbors(n_neighbors=3).fit(X_train)
return (nn.kneighbors_graph(mode='distance').asformat(fmt),
nn.kneighbors_graph(X_test, mode='distance').asformat(fmt))

check_precomputed(make_train_test)


def test_precomputed_sparse_explicit_diagonal():
def make_train_test(X_train, X_test):
nn = neighbors.NearestNeighbors(n_neighbors=3, radius=10).fit(X_train)
return (nn.kneighbors_graph(X_train.copy(), mode='distance'),
nn.kneighbors_graph(X_test, mode='distance'))

check_precomputed(make_train_test)


def test_precomputed_sparse_invalid():
# The sparse distance matrix must store enough neighbors per sample;
# otherwise an informative error is raised.
dist = np.array([[0., 2., 1.], [2., 0., 3.], [1., 3., 0.]])
dist_csr = csr_matrix(dist)
neigh = neighbors.NearestNeighbors(n_neighbors=1, metric="precomputed")
neigh.fit(dist_csr)
neigh.kneighbors(None, n_neighbors=1)
neigh.kneighbors(np.array([[0., 0., 0.]]), n_neighbors=1)

dist = np.array([[0., 2., 0.], [2., 0., 3.], [0., 3., 0.]])
dist_csr = csr_matrix(dist)
neigh.fit(dist_csr)
assert_raises_regex(ValueError, "Not enough neighbors in"
" .* to get 1 nearest neighbors.*", neigh.kneighbors,
None, n_neighbors=1)

# Checks error with inconsistent distance matrix
dist = np.array([[5., 2., 1.], [2., 0., 3.], [1., 3., 0.]])
dist_csr = csr_matrix(dist)
neigh = neighbors.NearestNeighbors(n_neighbors=1, metric="precomputed")
neigh.fit(dist_csr)
assert_raises_regex(ValueError, "Not a valid distance"
" .*non-negative values.*", neigh.kneighbors,
None, n_neighbors=1)


def test_precomputed_cross_validation():
# Ensure array is split correctly
rng = np.random.RandomState(0)
37 changes: 37 additions & 0 deletions sklearn/utils/fixes.py
@@ -295,3 +295,40 @@ def __getstate__(self):
self._fill_value)
else:
from numpy.ma import MaskedArray # noqa


# Remove when minimum required SciPy >= 0.17.0
def downcast_intp_index(arr):
"""
Down-cast index array to np.intp dtype if it is of a larger dtype.
Raise an error if the array contains a value that is too large for
intp.
"""
if arr.dtype.itemsize > np.dtype(np.intp).itemsize:
if arr.size == 0:
return arr.astype(np.intp)
maxval = arr.max()
minval = arr.min()
if maxval > np.iinfo(np.intp).max or minval < np.iinfo(np.intp).min:
raise ValueError("Cannot deal with arrays with indices larger "
"than the machine maximum address size "
"(e.g. 64-bit indices on 32-bit machine).")
return arr.astype(np.intp)
return arr


# Backport of scipy.sparse.csr_matrix.getnnz(axis=...): number of stored
# values in a CSR matrix X, in total or per row/column.
def getnnz(X, axis=None):
if axis is None:
return int(X.indptr[-1])
else:
if axis < 0:
axis += 2
axis, _ = axis, 1 - axis
_, N = X.shape
if axis == 0:
return np.bincount(downcast_intp_index(X.indices),
minlength=N)
elif axis == 1:
return np.diff(X.indptr)
raise ValueError('axis out of bounds')
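
A quick sanity check of the getnnz backport above (getnnz is the internal helper added in this diff, not public scikit-learn API):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.utils.fixes import getnnz

X = csr_matrix(np.array([[0., 2., 1.],
                         [0., 0., 3.],
                         [0., 0., 0.]]))
print(getnnz(X))          # 3 stored values in total
print(getnnz(X, axis=1))  # [2 1 0]: stored values per row
print(getnnz(X, axis=0))  # [0 1 2]: stored values per column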
