This file was deleted.

@@ -0,0 +1,117 @@
"""
============================
Faces dataset decompositions
============================
This example applies to :doc:`/datasets/olivetti_faces` different
unsupervised matrix decomposition (dimension reduction) methods from the
module :py:mod:`scikits.learn.decomposition` (see the documentation
chapter :ref:`decompositions`) .
"""
print __doc__

# Authors: Vlad Niculae, Alexandre Gramfort
# License: BSD

import logging
from time import time

import pylab as pl

from scikits.learn.datasets import fetch_olivetti_faces
from scikits.learn.cluster import MiniBatchKMeans
from scikits.learn import decomposition

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)s %(message)s')
# Gallery layout: n_row x n_col panels, one panel per extracted component.
n_row, n_col = 2, 3
n_components = n_row * n_col
# Shape used to un-flatten each face/component before plotting
# (presumably the Olivetti images are 64x64 pixels — confirm against the
# dataset documentation).
image_shape = (64, 64)

###############################################################################
# Load faces data
dataset = fetch_olivetti_faces(shuffle=True)
# Flattened image matrix: one row per face sample.
faces = dataset.data

n_samples, n_features = faces.shape

# global centering
# (subtract the per-feature mean computed over all samples)
faces_centered = faces - faces.mean(axis=0)

# local centering
# (additionally subtract each sample's own mean so every row is zero-mean)
faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1)

print "Dataset consists of %d faces" % n_samples


###############################################################################
def plot_gallery(title, images):
    """Plot *images* (flattened arrays) on an ``n_row`` x ``n_col`` grid.

    Each entry is reshaped to the module-level ``image_shape`` and drawn
    with a symmetric gray colormap so that zero maps to mid-gray.
    """
    pl.figure(figsize=(2. * n_col, 2.26 * n_row))
    pl.suptitle(title, size=16)
    for idx, component in enumerate(images):
        pl.subplot(n_row, n_col, idx + 1)
        # symmetric color limits around zero
        limit = max(component.max(), -component.min())
        pl.imshow(component.reshape(image_shape), cmap=pl.cm.gray,
                  interpolation='nearest', vmin=-limit, vmax=limit)
        pl.xticks(())
        pl.yticks(())
    pl.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)

###############################################################################
# List of the different estimators and, for each, whether to center the data
# and whether to transpose the problem before fitting.
# Each entry is a 4-tuple: (display name, estimator, center, transpose),
# unpacked in that order by the estimation loop below.
estimators = [
('Eigenfaces - RandomizedPCA',
decomposition.RandomizedPCA(n_components=n_components, whiten=True),
True, False),

('Non-negative components - NMF',
# fitted on the raw (uncentered) faces: the center flag below is False
decomposition.NMF(n_components=n_components, init='nndsvda', beta=5.0,
tol=5e-3, sparseness='components'),
False, False),

('Independent components - FastICA',
decomposition.FastICA(n_components=n_components, whiten=True, max_iter=10),
True, True),

('Sparse comp. - MiniBatchSparsePCA',
decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=1e-3,
n_iter=100, chunk_size=3),
True, False),

('Cluster centers - MiniBatchKMeans',
MiniBatchKMeans(k=n_components, tol=1e-3, chunk_size=20, max_iter=50),
True, False)
]

###############################################################################
# Plot a sample of the input data

plot_gallery("First centered Olivetti faces", faces_centered[:n_components])

###############################################################################
# Do the estimation and plot it

for name, estimator, center, transpose in estimators:
print "Extracting the top %d %s..." % (n_components, name)
t0 = time()
data = faces
if center:
data = faces_centered
if transpose:
data = data.T
estimator.fit(data)
train_time = (time() - t0)
print "done in %0.3fs" % train_time
if hasattr(estimator, 'cluster_centers_'):
components_ = estimator.cluster_centers_
else:
components_ = estimator.components_
if transpose:
components_ = components_.T
plot_gallery('%s - Train time %.1fs' % (name, train_time), components_)

pl.show()
@@ -27,57 +27,57 @@

from time import time

import numpy
import pylab
import pylab as pl
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter

from scikits.learn import manifold, datasets

X, color = datasets.samples_generator.make_s_curve(1000)
n_neighbors = 8
n_neighbors = 10
out_dim = 2

methods = ['standard', 'ltsa', 'hessian', 'modified']
labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE']

fig = pylab.figure(figsize=(8, 12))
pylab.suptitle("Manifold Learning with %i points, %i neighbors"
fig = pl.figure(figsize=(12, 8))
pl.suptitle("Manifold Learning with %i points, %i neighbors"
% (1000, n_neighbors), fontsize=14)

try:
# compatibility matplotlib < 1.0
ax = fig.add_subplot(321, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=pylab.cm.Spectral)
ax = fig.add_subplot(231, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=pl.cm.Spectral)
ax.view_init(4, -72)
except:
ax = fig.add_subplot(321, projection='3d')
ax.scatter(X[:, 0], X[:, 2], c=color, cmap=pylab.cm.Spectral)
ax = fig.add_subplot(231, projection='3d')
pl.scatter(X[:, 0], X[:, 2], c=color, cmap=pl.cm.Spectral)

ax.set_title('Original Data')

for i, method in enumerate(methods):
t0 = time()
Y, err = manifold.locally_linear_embedding(
X, n_neighbors, out_dim, eigen_solver='arpack', method=method)
Y = manifold.LocallyLinearEmbedding(n_neighbors, out_dim,
eigen_solver='auto',
method=method).fit_transform(X)
t1 = time()
print "%s: %.2g sec" % (methods[i], t1 - t0)
print ' err = %.2e' % err

ax = fig.add_subplot(322 + i)
ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=pylab.cm.Spectral)
ax.set_title("%s (%.2g sec)" % (labels[i], t1 - t0))
ax = fig.add_subplot(232 + i)
pl.scatter(Y[:, 0], Y[:, 1], c=color, cmap=pl.cm.Spectral)
pl.title("%s (%.2g sec)" % (labels[i], t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
pl.axis('tight')

t0 = time()
Y = manifold.Isomap(n_neighbors, out_dim).fit_transform(X)
t1 = time()
print "Isomap: %.2g sec" % (t1 - t0)
ax = fig.add_subplot(326)
ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=pylab.cm.Spectral)
ax.set_title("Isomap (%.2g sec)" % (t1 - t0))
ax = fig.add_subplot(236)
pl.scatter(Y[:, 0], Y[:, 1], c=color, cmap=pl.cm.Spectral)
pl.title("Isomap (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())

pylab.show()
pl.show()
@@ -1,9 +1,9 @@
"""
===============================================
Handwritten digits and Locally Linear Embedding
===============================================
=============================================================================
Manifold learning on handwritten digits: Locally Linear Embedding, Isomap...
=============================================================================
An illustration of locally linear embedding on the digits dataset.
An illustration of various embeddings on the digits dataset.
"""

# Authors: Fabian Pedregosa <fabian.pedregosa@inria.fr>
@@ -12,6 +12,7 @@
# License: BSD, (C) INRIA 2011

print __doc__
from time import time

import numpy as np
import pylab as pl
@@ -25,86 +26,131 @@
n_samples, n_features = X.shape
n_neighbors = 30

#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
    """Rescale the 2D embedding ``X`` to the unit square and plot it.

    Each digit is drawn as a colored text label; on matplotlib >= 1.0
    (detected via ``offsetbox.AnnotationBbox``) image thumbnails are
    overlaid for points that are not too close to an already-shown one.
    """
    # normalize each coordinate to [0, 1]
    lo = np.min(X, 0)
    hi = np.max(X, 0)
    X = (X - lo) / (hi - lo)

    pl.figure()
    ax = pl.subplot(111)
    n_points = digits.data.shape[0]
    for idx in range(n_points):
        pl.text(X[idx, 0], X[idx, 1], str(digits.target[idx]),
                color=pl.cm.Set1(digits.target[idx] / 10.),
                fontdict={'weight': 'bold', 'size': 9})

    if hasattr(offsetbox, 'AnnotationBbox'):
        # only print thumbnails with matplotlib > 1.0
        shown_images = np.array([[1., 1.]])  # just something big
        for idx in range(n_points):
            dist = np.sum((X[idx] - shown_images) ** 2, 1)
            if np.min(dist) < 4e-3:
                # don't show points that are too close
                continue
            shown_images = np.r_[shown_images, [X[idx]]]
            thumbnail = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(digits.images[idx], cmap=pl.cm.gray_r),
                X[idx])
            ax.add_artist(thumbnail)
    pl.xticks([]), pl.yticks([])
    if title is not None:
        pl.title(title)


#----------------------------------------------------------------------
# Random 2D projection using a random unitary matrix
print "Computing random projection"
rng = np.random.RandomState(42)
Q, _ = qr_economic(rng.normal(size=(n_features, 2)))
X_projected = np.dot(Q.T, X.T).T
plot_embedding(X_projected, "Random Projection of the digits")


#----------------------------------------------------------------------
# Projection on to the first 2 principal components

print "Computing PCA projection"
t0 = time()
X_pca = decomposition.RandomizedPCA(n_components=2).fit_transform(X)
plot_embedding(X_pca,
"Principal Components projection of the digits (time %.2fs)" %
(time() - t0))

#----------------------------------------------------------------------
# Projection on to the first 2 linear discriminant components

print "Computing LDA projection"
X2 = X.copy()
X2.flat[::X.shape[1] + 1] += 0.01 # Make X invertible
t0 = time()
X_lda = lda.LDA(n_components=2).fit_transform(X2, y)
plot_embedding(X_lda,
"Linear Discriminant projection of the digits (time %.2fs)" %
(time() - t0))


#----------------------------------------------------------------------
# Locally linear embedding of the digits dataset
print "Computing LLE embedding"
X_lle, err = manifold.locally_linear_embedding(X, n_neighbors, 2)
print "Done. Reconstruction error: %g" % err
clf = manifold.LocallyLinearEmbedding(n_neighbors, out_dim=2,
method='standard')
t0 = time()
X_lle = clf.fit_transform(X)
print "Done. Reconstruction error: %g" % clf.reconstruction_error_
plot_embedding(X_lle,
"Locally Linear Embedding of the digits (time %.2fs)" %
(time() - t0))


#----------------------------------------------------------------------
# Modified Locally linear embedding of the digits dataset
print "Computing modified LLE embedding"
X_mlle, err = manifold.locally_linear_embedding(X, n_neighbors, 2,
method='modified')
print "Done. Reconstruction error: %g" % err
clf = manifold.LocallyLinearEmbedding(n_neighbors, out_dim=2,
method='modified')
t0 = time()
X_mlle = clf.fit_transform(X)
print "Done. Reconstruction error: %g" % clf.reconstruction_error_
plot_embedding(X_mlle,
"Modified Locally Linear Embedding of the digits (time %.2fs)" %
(time() - t0))


#----------------------------------------------------------------------
# Isomap projection of the digits dataset
print "Computing Isomap embedding"
X_iso = manifold.Isomap(n_neighbors, 2).fit_transform(X)
print "Done."
# LTSA embedding of the digits dataset
print "Computing LTSA embedding"
clf = manifold.LocallyLinearEmbedding(n_neighbors, out_dim=2,
method='ltsa')
t0 = time()
X_ltsa = clf.fit_transform(X)
print "Done. Reconstruction error: %g" % clf.reconstruction_error_
plot_embedding(X_ltsa,
"Local Tangent Space Alignment of the digits (time %.2fs)" %
(time() - t0))


#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
x_min, x_max = np.min(X, 0), np.max(X, 0)
X = (X - x_min) / (x_max - x_min)
# HLLE embedding of the digits dataset
print "Computing Hessian LLE embedding"
clf = manifold.LocallyLinearEmbedding(n_neighbors, out_dim=2,
method='hessian')
t0 = time()
X_hlle = clf.fit_transform(X)
print "Done. Reconstruction error: %g" % clf.reconstruction_error_
plot_embedding(X_hlle,
"Hessian Locally Linear Embedding of the digits (time %.2fs)" %
(time() - t0))

pl.figure()
ax = pl.subplot(111)
for i in range(digits.data.shape[0]):
pl.text(X[i, 0], X[i, 1], str(digits.target[i]),
color=pl.cm.Set1(digits.target[i] / 10.),
fontdict={'weight': 'bold', 'size': 9})

if hasattr(offsetbox, 'AnnotationBbox'):
# only print thumbnails with matplotlib > 1.0
shown_images = np.array([[1., 1.]]) # just something big
for i in range(digits.data.shape[0]):
dist = np.sum((X[i] - shown_images) ** 2, 1)
if np.min(dist) < 4e-3:
# don't show points that are too close
continue
shown_images = np.r_[shown_images, [X[i]]]
imagebox = offsetbox.AnnotationBbox(
offsetbox.OffsetImage(digits.images[i], cmap=pl.cm.gray_r),
X[i])
ax.add_artist(imagebox)
pl.xticks([]), pl.yticks([])
if title is not None:
pl.title(title)
#----------------------------------------------------------------------
# Isomap projection of the digits dataset
print "Computing Isomap embedding"
t0 = time()
X_iso = manifold.Isomap(n_neighbors, out_dim=2).fit_transform(X)
print "Done."
plot_embedding(X_iso,
"Isomap projection of the digits (time %.2fs)" %
(time() - t0))


plot_embedding(X_projected, "Random Projection of the digits")
plot_embedding(X_pca, "Principal Components projection of the digits")
plot_embedding(X_lda, "Linear Discriminant projection of the digits")
plot_embedding(X_lle, "Locally Linear Embedding of the digits")
plot_embedding(X_mlle, "Modified Locally Linear Embedding of the digits")
plot_embedding(X_iso, "Isomap projection of the digits")

pl.show()
@@ -142,7 +142,7 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None,
cache_path=cache_path)
else:
raise IOError('20Newsgroups dataset not found')

if subset in ('train', 'test'):
data = cache[subset]
elif subset == 'all':
@@ -195,4 +195,3 @@ def load_20newsgroups(download_if_missing=False, **kwargs):
See fetch_20newsgroups.__doc__ for documentation and parameter list.
"""
return fetch_20newsgroups(download_if_missing=download_if_missing, **kwargs)

@@ -5,5 +5,6 @@
from .nmf import NMF, ProjectedGradientNMF
from .pca import PCA, RandomizedPCA, ProbabilisticPCA
from .kernel_pca import KernelPCA
from .sparse_pca import SparsePCA, dict_learning
from .sparse_pca import SparsePCA, MiniBatchSparsePCA, dict_learning, \
dict_learning_online
from .fastica_ import FastICA, fastica

Large diffs are not rendered by default.

@@ -4,18 +4,19 @@
import sys

import numpy as np
from .. import SparsePCA
from ..sparse_pca import _update_code, _update_code_parallel

from numpy.testing import assert_array_almost_equal, assert_equal

from .. import SparsePCA, MiniBatchSparsePCA, dict_learning_online
from ..sparse_pca import _update_code, _update_code_parallel
from ...utils import check_random_state

def generate_toy_data(n_atoms, n_samples, image_size):

def generate_toy_data(n_atoms, n_samples, image_size, random_state=None):
n_features = image_size[0] * image_size[1]

np.random.seed(0)
U = np.random.randn(n_samples, n_atoms)
V = np.random.randn(n_atoms, n_features)
rng = check_random_state(random_state)
U = rng.randn(n_samples, n_atoms)
V = rng.randn(n_atoms, n_features)

centers = [(3, 3), (6, 7), (8, 1)]
sz = [1, 2, 1]
@@ -28,60 +29,68 @@ def generate_toy_data(n_atoms, n_samples, image_size):

# Y is defined by : Y = UV + noise
Y = np.dot(U, V)
Y += 0.1 * np.random.randn(Y.shape[0], Y.shape[1]) # Add noise
Y += 0.1 * rng.randn(Y.shape[0], Y.shape[1]) # Add noise
return Y, U, V

# SparsePCA can be a bit slow. To avoid having test times go up, we
# test different aspects of the code in the same test


def test_correct_shapes():
np.random.seed(0)
X = np.random.randn(12, 10)
pca = SparsePCA(n_components=8)
U = pca.fit_transform(X)
assert_equal(pca.components_.shape, (8, 10))
rng = np.random.RandomState(0)
X = rng.randn(12, 10)
spca = SparsePCA(n_components=8, random_state=rng)
U = spca.fit_transform(X)
assert_equal(spca.components_.shape, (8, 10))
assert_equal(U.shape, (12, 8))
# test overcomplete decomposition
pca = SparsePCA(n_components=13)
U = pca.fit_transform(X)
assert_equal(pca.components_.shape, (13, 10))
spca = SparsePCA(n_components=13, random_state=rng)
U = spca.fit_transform(X)
assert_equal(spca.components_.shape, (13, 10))
assert_equal(U.shape, (12, 13))


def test_fit_transform():
Y, _, _ = generate_toy_data(3, 10, (8, 8)) # wide array
spca_lars = SparsePCA(n_components=3, method='lars').fit(Y)
rng = np.random.RandomState(0)
Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array
spca_lars = SparsePCA(n_components=3, method='lars', random_state=rng)
spca_lars.fit(Y)
U1 = spca_lars.transform(Y)
# Test multiple CPUs
if sys.platform == 'win32': # fake parallelism for win32
import scikits.learn.externals.joblib.parallel as joblib_par
_mp = joblib_par.multiprocessing
joblib_par.multiprocessing = None
try:
U2 = SparsePCA(n_components=3, n_jobs=2).fit(Y).transform(Y)
spca = SparsePCA(n_components=3, n_jobs=2, random_state=rng).fit(Y)
U2 = spca.transform(Y)
finally:
joblib_par.multiprocessing = _mp
else: # we can efficiently use parallelism
U2 = SparsePCA(n_components=3, n_jobs=2).fit(Y).transform(Y)
spca = SparsePCA(n_components=3, n_jobs=2, random_state=rng).fit(Y)
U2 = spca.transform(Y)
assert_array_almost_equal(U1, U2)
# Test that CD gives similar results
spca_lasso = SparsePCA(n_components=3, method='cd').fit(Y)
spca_lasso = SparsePCA(n_components=3, method='cd', random_state=rng)
spca_lasso.fit(Y)
assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)


def test_fit_transform_tall():
Y, _, _ = generate_toy_data(3, 65, (8, 8)) # tall array
U1 = SparsePCA(n_components=3, method='lars').fit_transform(Y)
U2 = SparsePCA(n_components=3, method='cd').fit(Y).transform(Y)
rng = np.random.RandomState(0)
Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng) # tall array
spca_lars = SparsePCA(n_components=3, method='lars', random_state=rng)
U1 = spca_lars.fit_transform(Y)
spca_lasso = SparsePCA(n_components=3, method='cd', random_state=rng)
U2 = spca_lasso.fit(Y).transform(Y)
assert_array_almost_equal(U1, U2)


def test_sparse_code():
np.random.seed(0)
dictionary = np.random.randn(10, 3)
rng = np.random.RandomState(0)
dictionary = rng.randn(10, 3)
real_code = np.zeros((3, 5))
real_code.ravel()[np.random.randint(15, size=6)] = 1.0
real_code.ravel()[rng.randint(15, size=6)] = 1.0
Y = np.dot(dictionary, real_code)
est_code_1 = _update_code(dictionary, Y, alpha=1.0)
est_code_2 = _update_code_parallel(dictionary, Y, alpha=1.0)
@@ -91,8 +100,59 @@ def test_sparse_code():


def test_initialization():
U_init = np.random.randn(5, 3)
V_init = np.random.randn(3, 4)
model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init, max_iter=0)
model.fit(np.random.randn(5, 4))
rng = np.random.RandomState(0)
U_init = rng.randn(5, 3)
V_init = rng.randn(3, 4)
model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init, max_iter=0,
random_state=rng)
model.fit(rng.randn(5, 4))
assert_equal(model.components_, V_init)


def test_dict_learning_online_shapes():
    """Check output shapes of online dictionary learning on a transposed X."""
    rng = np.random.RandomState(0)
    data = rng.randn(12, 10)
    dictionaryT, codeT = dict_learning_online(data.T, n_atoms=8, alpha=1,
                                              random_state=rng)
    # results are returned for the transposed problem
    assert_equal(codeT.shape, (8, 12))
    assert_equal(dictionaryT.shape, (10, 8))
    # the reconstruction product must match the original data shape
    assert_equal(np.dot(codeT.T, dictionaryT.T).shape, data.shape)


def test_mini_batch_correct_shapes():
    """MiniBatchSparsePCA yields components and codes of the expected shapes."""
    rng = np.random.RandomState(0)
    X = rng.randn(12, 10)
    # under-complete decomposition: fewer atoms than features
    model = MiniBatchSparsePCA(n_components=8, random_state=rng)
    code = model.fit_transform(X)
    assert_equal(model.components_.shape, (8, 10))
    assert_equal(code.shape, (12, 8))
    # test overcomplete decomposition
    model = MiniBatchSparsePCA(n_components=13, random_state=rng)
    code = model.fit_transform(X)
    assert_equal(model.components_.shape, (13, 10))
    assert_equal(code.shape, (12, 13))


def test_mini_batch_fit_transform():
    """Single-job, multi-job and CD variants of MiniBatchSparsePCA agree."""
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = MiniBatchSparsePCA(n_components=3, random_state=rng).fit(Y)
    code_single = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import scikits.learn.externals.joblib.parallel as joblib_par
        _mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            code_multi = MiniBatchSparsePCA(
                n_components=3, n_jobs=2, random_state=rng).fit(Y).transform(Y)
        finally:
            # restore the real multiprocessing module even on failure
            joblib_par.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        code_multi = MiniBatchSparsePCA(
            n_components=3, n_jobs=2, random_state=rng).fit(Y).transform(Y)
    assert_array_almost_equal(code_single, code_multi)
    # Test that CD gives similar results
    spca_lasso = MiniBatchSparsePCA(n_components=3, method='cd',
                                    random_state=rng).fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
@@ -1,4 +1,4 @@
"""Package for modules that deal with feature extraction from raw data"""

from .image import img_to_graph
from .image import img_to_graph, grid_to_graph
from . import text
@@ -177,3 +177,7 @@ def test_patch_extractor_color():
extr = PatchExtractor(patch_size=(p_h, p_w), random_state=0)
patches = extr.transform(lenas)
assert patches.shape == (expected_n_patches, p_h, p_w, 3)

if __name__ == '__main__':
import nose
nose.runmodule()
@@ -6,16 +6,13 @@

import numpy as np
from scipy.linalg import eigh, svd, qr
from scipy.sparse import linalg, eye, csr_matrix
from scipy.sparse.linalg import LinearOperator
from scipy.sparse import eye, csr_matrix
from ..base import BaseEstimator
from ..utils import check_random_state
from ..utils.arpack import eigsh
from ..neighbors import kneighbors_graph, BallTree, barycenter_weights


def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,
random_state=None):
def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100):
"""
Find the null space of a matrix M.
@@ -30,55 +27,34 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,
k_skip : integer, optional
Number of low eigenvalues to skip.
eigen_solver : string, {'arpack', 'lobpcg', 'dense'}
eigen_solver : string, {'auto', 'arpack', 'dense'}
auto : algorithm will attempt to choose the best method for input data
arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
lobpcg : use locally optimized block-preconditioned conjugate gradient.
For this method, M may be a dense or sparse matrix.
A dense matrix M will be converted internally to a
csr sparse format.
dense : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array
or matrix type. This method should be avoided for
large problems.
tol : float, optional
Tolerance for 'arpack' or 'lobpcg' methods.
Tolerance for 'arpack' method.
Not used if eigen_solver=='dense'.
max_iter : maximum number of iterations for 'arpack' or 'lobpcg' methods
not used if eigen_solver=='dense'
max_iter : maximum number of iterations for 'arpack' method
not used if eigen_solver=='dense'
"""
random_state = check_random_state(random_state)
if eigen_solver == 'auto':
if M.shape[0] > 200 and k + k_skip < 10:
eigen_solver = 'arpack'
else:
eigen_solver = 'dense'

if eigen_solver == 'arpack':
eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
tol=tol, maxiter=max_iter)

return eigen_vectors[:, k_skip:], np.sum(eigen_values[k_skip:])
elif eigen_solver == 'lobpcg':
try:
import pyamg
except ImportError:
raise ImportError("PyAMG is not installed,"
" cannot use lobcpg solver")

# initial vectors for iteration
X = random_state.rand(M.shape[0], k + k_skip)
try:
ml = pyamg.smoothed_aggregation_solver(M, symmetry='symmetric')
except TypeError:
ml = pyamg.smoothed_aggregation_solver(M, mat_flag='symmetric')
prec = ml.aspreconditioner()

# compute eigenvalues and eigenvectors with LOBPCG
eigen_values, eigen_vectors = linalg.lobpcg(
M, X, M=prec, largest=False, tol=tol, maxiter=max_iter)

index = np.argsort(eigen_values)
return (eigen_vectors[:, index[k_skip:]],
np.sum(eigen_values[index[k_skip:]]))
elif eigen_solver == 'dense':
M = np.asarray(M)
eigen_values, eigen_vectors = eigh(
@@ -90,10 +66,9 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,


def locally_linear_embedding(
X, n_neighbors, out_dim, reg=1e-3, eigen_solver='arpack',
X, n_neighbors, out_dim, reg=1e-3, eigen_solver='auto',
tol=1e-6, max_iter=100, method='standard',
hessian_tol=1E-4, modified_tol=1E-12,
random_state=None):
hessian_tol=1E-4, modified_tol=1E-12):
"""Perform a Locally Linear Embedding analysis on the data.
Parameters
@@ -112,12 +87,23 @@ def locally_linear_embedding(
regularization constant, multiplies the trace of the local covariance
matrix of the distances.
eigen_solver : {'arpack', 'lobpcg', 'dense'}
arpack can handle both dense and sparse data efficiently
lobpcg solver is usually faster than dense but depends on PyAMG.
eigen_solver : string, {'auto', 'arpack', 'dense'}
auto : algorithm will attempt to choose the best method for input data
arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
dense : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array
or matrix type. This method should be avoided for
large problems.
tol : float, optional
Tolerance for 'arpack' method
Not used if eigen_solver=='dense'.
max_iter : integer
maximum number of iterations for the lobpcg solver.
maximum number of iterations for the arpack solver.
method : string ['standard' | 'hessian' | 'modified']
standard : use the standard locally linear embedding algorithm.
@@ -161,9 +147,7 @@ def locally_linear_embedding(
reduction via tangent space alignment. Journal of Shanghai Univ.
8:406 (2004)
"""
random_state = check_random_state(random_state)

if eigen_solver not in ('arpack', 'lobpcg', 'dense'):
if eigen_solver not in ('auto', 'arpack', 'dense'):
raise ValueError("unrecognized eigen_solver '%s'" % eigen_solver)

if method not in ('standard', 'hessian', 'modified', 'ltsa'):
@@ -216,12 +200,19 @@ def locally_linear_embedding(

M = np.zeros((N, N), dtype=np.float)

use_svd = (n_neighbors > d_in)

for i in range(N):
Gi = X[neighbors[i]]
Gi -= Gi.mean(0)

#build Hessian estimator
U, sig, VT = svd(Gi, full_matrices=0)
if use_svd:
U = svd(Gi, full_matrices=0)[0]
else:
Ci = np.dot(Gi, Gi.T)
U = eigh(Ci)[1][:, ::-1]

Yi[:, 1:1 + out_dim] = U[:, :out_dim]

j = 1 + out_dim
@@ -256,9 +247,23 @@ def locally_linear_embedding(
V = np.zeros((N, n_neighbors, n_neighbors))
nev = min(d_in, n_neighbors)
evals = np.zeros([N, nev])
for i in range(N):
V[i], evals[i], tmp = np.linalg.svd(X[neighbors[i]] - X[i])
evals **= 2

#choose the most efficient way to find the eigenvectors
use_svd = (n_neighbors > d_in)

if use_svd:
for i in range(N):
X_nbrs = X[neighbors[i]] - X[i]
V[i], evals[i], _ = svd(X_nbrs,
full_matrices=True)
evals **= 2
else:
for i in range(N):
X_nbrs = X[neighbors[i]] - X[i]
C_nbrs = np.dot(X_nbrs, X_nbrs.T)
evi, vi = eigh(C_nbrs)
evals[i] = evi[::-1]
V[i] = vi[:, ::-1]

#find regularized weights: this is like normal LLE.
# because we've already computed the SVD of each covariance matrix,
@@ -340,11 +345,18 @@ def locally_linear_embedding(

M = np.zeros((N, N))

use_svd = (n_neighbors > d_in)

for i in range(N):
Xi = X[neighbors[i]]
Xi -= Xi.mean(0)

# compute out_dim largest eigenvalues of Xi * Xi^T
v = np.linalg.svd(Xi - Xi.mean(0), full_matrices=True)[0]
if use_svd:
v = svd(Xi, full_matrices=True)[0]
else:
Ci = np.dot(Xi, Xi.T)
v = eigh(Ci)[1][:, ::-1]

Gi = np.zeros((n_neighbors, out_dim + 1))
Gi[:, 1:] = v[:, :out_dim]
@@ -357,8 +369,7 @@ def locally_linear_embedding(
M[neighbors[i], neighbors[i]] += 1

return null_space(M, out_dim, k_skip=1, eigen_solver=eigen_solver,
tol=tol, max_iter=max_iter,
random_state=random_state)
tol=tol, max_iter=max_iter)


class LocallyLinearEmbedding(BaseEstimator):
@@ -372,69 +383,121 @@ class LocallyLinearEmbedding(BaseEstimator):
out_dim : integer
number of coordinates for the manifold
reg : float
regularization constant, multiplies the trace of the local covariance
matrix of the distances.
eigen_solver : string, {'auto', 'arpack', 'dense'}
auto : algorithm will attempt to choose the best method for input data
arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
dense : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array
or matrix type. This method should be avoided for
large problems.
tol : float, optional
Tolerance for 'arpack' method
Not used if eigen_solver=='dense'.
max_iter : integer
maximum number of iterations for the arpack solver.
method : string ['standard' | 'hessian' | 'modified']
standard : use the standard locally linear embedding algorithm.
see reference [1]
hessian : use the Hessian eigenmap method. This method requires
n_neighbors > out_dim * (1 + (out_dim + 1) / 2.
see reference [2]
modified : use the modified locally linear embedding algorithm.
see reference [3]
ltsa : use local tangent space alignment algorithm
see reference [4]
hessian_tol : float, optional
Tolerance for Hessian eigenmapping method.
Only used if method == 'hessian'
modified_tol : float, optional
Tolerance for modified LLE method.
Only used if method == 'modified'
Attributes
----------
`embedding_vectors_` : array-like, shape [out_dim, n_samples]
Stores the embedding vectors
`reconstruction_error_` : float
Reconstruction error associated with `embedding_vectors_`
`ball_tree_` : BallTree object
Ball Tree used for nearest neighbors
"""

def __init__(self, n_neighbors=5, out_dim=2, random_state=None):
def __init__(self, n_neighbors=5, out_dim=2, reg=1E-3,
eigen_solver='arpack', tol=1E-6, max_iter=100,
method='standard', hessian_tol=1E-4, modified_tol=1E-12):
self.n_neighbors = n_neighbors
self.out_dim = out_dim
self.random_state = random_state
self.reg = reg
self.eigen_solver = eigen_solver
self.tol = tol
self.max_iter = max_iter
self.method = method
self.hessian_tol = hessian_tol
self.modified_tol = modified_tol

def _fit_transform(self, X):
    """Build the neighbor search tree and compute the LLE embedding of X.

    Stores ``ball_tree_``, ``embedding_`` and ``reconstruction_error_``
    on the estimator; all algorithm options are read from ``self``.
    """
    self.ball_tree_ = BallTree(X)
    embedding, reconstruction_error = locally_linear_embedding(
        self.ball_tree_, self.n_neighbors, self.out_dim,
        eigen_solver=self.eigen_solver, tol=self.tol,
        max_iter=self.max_iter, method=self.method,
        hessian_tol=self.hessian_tol, modified_tol=self.modified_tol)
    self.embedding_ = embedding
    self.reconstruction_error_ = reconstruction_error

def fit(self, X, Y=None, reg=1e-3, eigen_solver='arpack', tol=1e-6,
max_iter=100, **params):
def fit(self, X, y=None, **params):
"""Compute the embedding vectors for data X
Parameters
----------
X : array-like of shape [n_samples, n_features]
training set.
out_dim : integer
number of coordinates for the manifold
reg : float
regularization constant, multiplies the trace of the local
covariance matrix of the distances
Returns
-------
self : returns an instance of self.
"""
self._set_params(**params)
self._fit_transform(X)
return self

eigen_solver : {'lobpcg', 'dense'}
use the lobpcg eigensolver or a dense eigensolver based on LAPACK
routines. The lobpcg solver is usually faster but depends on PyAMG
def fit_transform(self, X, y=None, **params):
"""Compute the embedding vectors for data X and transform X.
max_iter : integer
maximum number of iterations for the lobpcg solver.
Parameters
----------
X : array-like of shape [n_samples, n_features]
training set.
Returns
-------
self : returns an instance of self.
X_new: array-like, shape (n_samples, out_dim)
"""
self.random_state = check_random_state(self.random_state)
self._set_params(**params)
self.ball_tree = BallTree(X)
self.embedding_, self.reconstruction_error_ = \
locally_linear_embedding(
self.ball_tree, self.n_neighbors, self.out_dim, reg=reg,
eigen_solver=eigen_solver, tol=tol, max_iter=max_iter,
random_state=self.random_state)
return self
self._fit_transform(X)
return self.embedding_

def transform(self, X, reg=1e-3, **params):
def transform(self, X, **params):
"""
Transform new points into embedding space.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
reg : float
regularization constant, multiplies the trace of the local
covariance matrix of the distances
Returns
-------
X_new : array, shape = [n_samples, out_dim]
@@ -446,11 +509,12 @@ def transform(self, X, reg=1e-3, **params):
"""
self._set_params(**params)
X = np.atleast_2d(X)
if not hasattr(self, 'ball_tree'):
if not hasattr(self, 'ball_tree_'):
raise ValueError('The model is not fitted')
ind = self.ball_tree.query(X, k=self.n_neighbors,
ind = self.ball_tree_.query(X, k=self.n_neighbors,
return_distance=False)
weights = barycenter_weights(X, self.ball_tree.data[ind], reg=reg)
weights = barycenter_weights(X, self.ball_tree_.data[ind],
reg=self.reg)
X_new = np.empty((X.shape[0], self.out_dim))
for i in range(X.shape[0]):
X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i])
@@ -6,12 +6,6 @@

eigen_solvers = ['dense', 'arpack']

try:
import pyamg
eigen_solvers.append('lobpcg')
except ImportError:
pass


def assert_lower(a, b, details=None):
message = "%r is not lower than %r" % (a, b)
@@ -24,13 +18,11 @@ def assert_lower(a, b, details=None):
# Test LLE by computing the reconstruction error on some manifolds.

def test_lle_simple_grid():
rng = np.random.RandomState(42)

rng = np.random.RandomState(0)
# grid of equidistant points in 2D, out_dim = n_dim
X = np.array(list(product(range(5), repeat=2)))
out_dim = 2
clf = manifold.LocallyLinearEmbedding(n_neighbors=5, out_dim=out_dim,
random_state=rng)
clf = manifold.LocallyLinearEmbedding(n_neighbors=5, out_dim=out_dim)
tol = .1

N = neighbors.kneighbors_graph(
@@ -60,8 +52,7 @@ def test_lle_manifold():
X = np.array(list(product(range(20), repeat=2)))
X = np.c_[X, X[:, 0] ** 2 / 20]
out_dim = 2
clf = manifold.LocallyLinearEmbedding(n_neighbors=5, out_dim=out_dim,
random_state=42)
clf = manifold.LocallyLinearEmbedding(n_neighbors=5, out_dim=out_dim)
tol = .5

N = neighbors.kneighbors_graph(X, clf.n_neighbors,
@@ -85,7 +76,7 @@ def test_pipeline():
from scikits.learn import pipeline, datasets
iris = datasets.load_iris()
clf = pipeline.Pipeline(
[('filter', manifold.LocallyLinearEmbedding(random_state=42)),
[('filter', manifold.LocallyLinearEmbedding()),
('clf', neighbors.NeighborsClassifier())])
clf.fit(iris.data, iris.target)
assert_lower(.7, clf.score(iris.data, iris.target))
@@ -52,8 +52,8 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
The data to center and scale.
axis : int (0 by default)
axis used to compute the means and standard deviations along. If 0,
independently standardize each feature, otherwise (if 1) standardize
axis used to compute the means and standard deviations along. If 0,
independently standardize each feature, otherwise (if 1) standardize
each sample.
with_mean : boolean, True by default
@@ -90,7 +90,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
return X


class Scaler(BaseEstimator):
class Scaler(BaseEstimator, TransformerMixin):
"""Standardize features by removing the mean and scaling to unit variance
Centering and scaling happen indepently on each feature by computing
@@ -197,11 +197,11 @@ def normalize(X, norm='l2', axis=1, copy=True):
un-necessary copy.
norm : 'l1' or 'l2', optional ('l2' by default)
The norm to use to normalize each non zero sample (or each non-zero
The norm to use to normalize each non zero sample (or each non-zero
feature if axis is 0).
axis : 0 or 1, optional (1 by default)
axis used to normalize the data along. If 1, independently normalize
axis used to normalize the data along. If 1, independently normalize
each sample, otherwise (if 0) normalize each feature.
copy : boolean, optional, default is True
@@ -249,7 +249,7 @@ def normalize(X, norm='l2', axis=1, copy=True):
return X


class Normalizer(BaseEstimator):
class Normalizer(BaseEstimator, TransformerMixin):
"""Normalize samples individually to unit norm
Each sample (i.e. each row of the data matrix) with at least one
@@ -352,7 +352,7 @@ def binarize(X, threshold=0.0, copy=True):
return X


class Binarizer(BaseEstimator):
class Binarizer(BaseEstimator, TransformerMixin):
"""Binarize data (set feature values to 0 or 1) according to a threshold
The default threshold is 0.0 so that any non-zero values are set to 1.0
@@ -331,3 +331,10 @@ def test_center_kernel():
K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
K_pred_centered2 = centerer.transform(K_pred)
assert_array_almost_equal(K_pred_centered, K_pred_centered2)

def test_fit_transform():
    """fit(X).transform(X) and fit_transform(X) agree for each transformer."""
    X = np.random.random((5, 4))
    for transformer in (Scaler(), Normalizer(), Binarizer()):
        via_fit = transformer.fit(X).transform(X)
        via_fit_transform = transformer.fit_transform(X)
        assert_array_equal(via_fit, via_fit_transform)