
Comparing changes

  • 2 commits
  • 6 files changed
  • 0 commit comments
  • 1 contributor
scikits/learn/cluster/k_means_.py
@@ -12,6 +12,7 @@
from ..base import BaseEstimator
from ..metrics.pairwise import euclidean_distances
+from ..utils import make_rng
###############################################################################
@@ -52,8 +53,7 @@ def k_init(X, k, n_local_trials=None, rng=None, x_squared_norms=None):
which is the implementation used in the aforementioned paper.
"""
n_samples, n_features = X.shape
- if rng is None:
- rng = np.random
+ rng = make_rng(rng)
centers = np.empty((k, n_features))
@@ -80,8 +80,8 @@ def k_init(X, k, n_local_trials=None, rng=None, x_squared_norms=None):
for c in xrange(1, k):
# Choose center candidates by sampling with probability proportional
# to the squared distance to the closest existing center
- rand_vals = rng.random(n_local_trials) * current_pot
- candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)
+ rand_vals = rng.random_sample(n_local_trials) * current_pot
+ candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)
# Compute distances to center candidates
distance_to_candidates = euclidean_distances(
@@ -181,8 +181,7 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,
The final value of the inertia criterion
"""
- if rng is None:
- rng = np.random
+ rng = make_rng(rng)
n_samples = X.shape[0]
vdata = np.mean(np.var(X, 0))
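The cumsum/searchsorted pair in the hunk above is the standard numpy idiom for D²-weighted sampling in k-means++ seeding: each sample owns a slice of the cumulative sum proportional to its squared distance, so uniform draws land on far-away points more often. A minimal standalone sketch (not part of the diff), assuming only numpy:

import numpy as np

rng = np.random.RandomState(0)
# Squared distance from each sample to its closest existing center.
closest_dist_sq = np.array([4.0, 1.0, 0.25, 9.0])
current_pot = closest_dist_sq.sum()

# Draw uniform values in [0, current_pot) and map each onto the
# cumulative distribution; index 3 (distance 9.0) owns the largest
# slice of the cumsum and is therefore picked most often.
n_local_trials = 3
rand_vals = rng.random_sample(n_local_trials) * current_pot
candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)
print(candidate_ids)  # candidate indices, biased toward far-away samples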
scikits/learn/cluster/spectral.py
@@ -8,11 +8,12 @@
from ..base import BaseEstimator
+from ..utils import make_rng
from ..utils.graph import graph_laplacian
from .k_means_ import k_means
-def spectral_embedding(adjacency, n_components=8, mode=None):
+def spectral_embedding(adjacency, n_components=8, mode=None, rng=None):
"""Project the sample on the first eigen vectors of the graph Laplacian
The adjacency matrix is used to compute a normalized graph Laplacian
@@ -42,6 +43,10 @@ def spectral_embedding(adjacency, n_components=8, mode=None):
MultiGrid) is much faster, but requires pyamg to be
installed.
+ rng: int seed, RandomState instance, or None (default)
+ A pseudo random number generator used for the initialization of the
+ lobpcg eigen vectors decomposition when mode == 'amg'.
+
Returns
--------
embedding: array, shape: (n_samples, n_components)
@@ -62,6 +67,8 @@ def spectral_embedding(adjacency, n_components=8, mode=None):
except ImportError:
amg_loaded = False
+ rng = make_rng(rng)
+
n_nodes = adjacency.shape[0]
# XXX: Should we check that the matrices given is symmetric
if not amg_loaded:
@@ -101,7 +108,7 @@ def spectral_embedding(adjacency, n_components=8, mode=None):
# problem.
laplacian = laplacian.astype(np.float) # lobpcg needs native floats
ml = smoothed_aggregation_solver(laplacian.tocsr())
- X = np.random.rand(laplacian.shape[0], n_components)
+ X = rng.rand(laplacian.shape[0], n_components)
X[:, 0] = 1. / dd.ravel()
M = ml.aspreconditioner()
lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-12,
@@ -115,7 +122,8 @@ def spectral_embedding(adjacency, n_components=8, mode=None):
return embedding
-def spectral_clustering(affinity, k=8, n_components=None, mode=None):
+def spectral_clustering(affinity, k=8, n_components=None, mode=None,
+ rng=None):
"""Apply k-means to a projection to the normalized laplacian
In practice Spectral Clustering is very useful when the structure of
@@ -149,8 +157,13 @@ def spectral_clustering(affinity, k=8, n_components=None, mode=None):
MultiGrid) is much faster, but requires pyamg to be
installed.
+ rng: int seed, RandomState instance, or None (default)
+ A pseudo random number generator used for the initialization
+ of the lobpcg eigen vectors decomposition when mode == 'amg'
+ and by the K-Means initialization.
+
Returns
- --------
+ -------
labels: array of integers, shape: n_samples
The labels of the clusters.
@@ -175,10 +188,12 @@ def spectral_clustering(affinity, k=8, n_components=None, mode=None):
This algorithm solves the normalized cut for k=2: it is a
normalized spectral clustering.
"""
+ rng = make_rng(rng)
n_components = k if n_components is None else n_components
- maps = spectral_embedding(affinity, n_components=n_components, mode=mode)
+ maps = spectral_embedding(affinity, n_components=n_components,
+ mode=mode, rng=rng)
maps = maps[1:]
- _, labels, _ = k_means(maps.T, k)
+ _, labels, _ = k_means(maps.T, k, rng=rng)
return labels
@@ -203,6 +218,11 @@ class SpectralClustering(BaseEstimator):
The eigenvalue decomposition strategy to use. AMG (Algebraic
MultiGrid) is much faster, but requires pyamg to be installed.
+ rng: int seed, RandomState instance, or None (default)
+ A pseudo random number generator used for the initialization
+ of the lobpcg eigen vectors decomposition when mode == 'amg'
+ and by the K-Means initialization.
+
Methods
-------
@@ -226,9 +246,10 @@ class SpectralClustering(BaseEstimator):
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323
"""
- def __init__(self, k=8, mode=None):
+ def __init__(self, k=8, mode=None, rng=None):
self.k = k
self.mode = mode
+ self.rng = make_rng(rng)
def fit(self, X, **params):
"""Compute the spectral clustering from the affinity matrix
@@ -259,5 +280,6 @@ def fit(self, X, **params):
speeds up computation.
"""
self._set_params(**params)
- self.labels_ = spectral_clustering(X, k=self.k, mode=self.mode)
+ self.labels_ = spectral_clustering(X, k=self.k, mode=self.mode,
+ rng=self.rng)
return self
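With rng threaded from the estimator through spectral_clustering, spectral_embedding, and k_means, a single seed now makes the whole pipeline repeatable. A minimal usage sketch (not part of the diff), assuming SpectralClustering is importable from scikits.learn.cluster as the tests below do:

import numpy as np
from scikits.learn.cluster import SpectralClustering

# Two-block affinity matrix; rng=0 seeds the k-means++ initialization,
# and also the lobpcg starting vectors when mode == 'amg'.
S = np.array([[1., 1., 0., 0.],
              [1., 1., 0., 0.],
              [0., 0., 1., 1.],
              [0., 0., 1., 1.]])
labels = SpectralClustering(rng=0).fit(S, k=2).labels_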
scikits/learn/cluster/tests/test_spectral.py
@@ -22,7 +22,7 @@ def test_spectral_clustering():
])
for mat in (S, sparse.csr_matrix(S)):
- labels = SpectralClustering().fit(mat, k=2).labels_
+ labels = SpectralClustering(rng=0).fit(mat, k=2).labels_
if labels[0] == 0:
labels = 1 - labels
@@ -47,7 +47,7 @@ def test_spectral_clustering_sparse():
S = sparse.coo_matrix(S)
- labels = SpectralClustering().fit(S, k=2).labels_
+ labels = SpectralClustering(rng=0).fit(S, k=2).labels_
if labels[0] == 0:
labels = 1 - labels
scikits/learn/utils/__init__.py
@@ -9,3 +9,19 @@ def safe_asanyarray(X, dtype=None, order=None):
else:
return np.asanyarray(X, dtype, order)
+def make_rng(seed):
+ """Turn seed into a np.random.RandomState instance
+
+ If seed is None, return the np.random singleton.
+ If seed is an int, return a new RandomState instance seeded with seed.
+ If seed is already a RandomState instance, return it.
+ Otherwise raise ValueError.
+ """
+ if seed is None or seed is np.random:
+ return np.random
+ if isinstance(seed, int):
+ return np.random.RandomState(seed)
+ if isinstance(seed, np.random.RandomState):
+ return seed
+ raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
+ ' instance' % seed)
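The contract of the new helper at a glance, as a quick interactive sketch (not part of the diff):

import numpy as np
from scikits.learn.utils import make_rng

make_rng(None) is np.random       # True: the global singleton
make_rng(42)                      # fresh RandomState seeded with 42
rs = np.random.RandomState(42)
make_rng(rs) is rs                # True: instances pass through untouched
make_rng('not a seed')            # raises ValueError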
scikits/learn/utils/extmath.py
@@ -7,6 +7,7 @@
import sys
import math
+from . import make_rng
import numpy as np
#XXX: We should have a function with numpy's slogdet API
@@ -154,11 +155,7 @@ def fast_svd(M, k, p=None, q=0, transpose='auto', rng=0):
if p == None:
p = k
- if rng is None:
- rng = np.random.RandomState()
- elif isinstance(rng, int):
- rng = np.random.RandomState(rng)
-
+ rng = make_rng(rng)
n_samples, n_features = M.shape
if transpose == 'auto' and n_samples > n_features:
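After this change fast_svd accepts the same three seed types as the rest of the package, with make_rng doing the normalization. A usage sketch (not part of the diff), assuming fast_svd returns a (U, s, V) triple like the deterministic SVD routines it approximates:

import numpy as np
from scikits.learn.utils.extmath import fast_svd

rng = np.random.RandomState(0)
M = rng.randn(100, 20)

# An int seed and a RandomState instance are both accepted
# (and the default rng=0 keeps results reproducible by default).
U, s, V = fast_svd(M, k=5, rng=42)
U2, s2, V2 = fast_svd(M, k=5, rng=rng)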
scikits/learn/utils/fixes.py
@@ -1,15 +1,14 @@
-"""
-Fixes for older version of numpy and scipy.
-"""
+"""Compatibility fixes for older version of numpy and scipy"""
# Authors: Emmanuelle Gouillart <emmanuelle.gouillart@normalesup.org>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Fabian Pedregosa <fpedregosa@acm.org>
# License: BSD
import numpy as np
+
def _unique(ar, return_index=False, return_inverse=False):
- """ A replacement for the np.unique that appeared in numpy 1.4.
+ """A replacement for the np.unique that appeared in numpy 1.4.
While np.unique existed long before, keyword return_inverse was
only added in 1.4.
@@ -57,10 +56,8 @@ def _unique(ar, return_index=False, return_inverse=False):
unique = np.unique
-def _copysign (x1, x2):
- """
- (slow) Replacement for np.copysign, which was introduced in numpy 1.4
- """
+def _copysign(x1, x2):
+ """Slow replacement for np.copysign, which was introduced in numpy 1.4"""
return np.abs(x1) * np.sign(x2)
if not hasattr(np, 'copysign'):
@@ -70,20 +67,19 @@ def _copysign (x1, x2):
def _in1d(ar1, ar2, assume_unique=False):
- """ Replacement for in1d that is provided for numpy >= 1.4
- """
+ """Replacement for in1d that is provided for numpy >= 1.4"""
if not assume_unique:
ar1, rev_idx = unique(ar1, return_inverse=True)
ar2 = np.unique(ar2)
- ar = np.concatenate( (ar1, ar2) )
+ ar = np.concatenate((ar1, ar2))
# We need this to be a stable sort, so always use 'mergesort'
# here. The values from the first array should always come before
# the values from the second array.
order = ar.argsort(kind='mergesort')
sar = ar[order]
equal_adj = (sar[1:] == sar[:-1])
- flag = np.concatenate( (equal_adj, [False] ) )
- indx = order.argsort(kind='mergesort')[:len( ar1 )]
+ flag = np.concatenate((equal_adj, [False]))
+ indx = order.argsort(kind='mergesort')[:len(ar1)]
if assume_unique:
return flag[indx]
@@ -97,7 +93,8 @@ def _in1d(ar1, ar2, assume_unique=False):
def qr_economic(A, **kwargs):
- """
+ """Compat function for the QR-decomposition in economic mode
+
Scipy 0.9 changed the keyword econ=True to mode='economic'
"""
import scipy.linalg
@@ -109,7 +106,8 @@ def qr_economic(A, **kwargs):
def arpack_eigsh(A, **kwargs):
- """
+ """Compat function for sparse symmetric eigen vectors decomposition
+
Scipy 0.9 renamed eigen_symmetric to eigsh in
scipy.sparse.linalg.eigen.arpack
"""
@@ -118,8 +116,3 @@ def arpack_eigsh(A, **kwargs):
return arpack.eigsh(A, **kwargs)
else:
return arpack.eigen_symmetric(A, **kwargs)
-
-
-
-
-
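The shims keep the call signatures of their numpy >= 1.4 counterparts. A quick sketch (not part of the diff) of the two private replacements, assuming they are imported directly from fixes:

import numpy as np
from scikits.learn.utils.fixes import _copysign, _in1d

# Membership mask of ar1's elements in ar2, as np.in1d gives on numpy >= 1.4.
print(_in1d([1, 2, 3, 4], [2, 4]))        # [False  True False  True]

# Sign transfer via abs/sign, standing in for np.copysign.
print(_copysign(np.array([1., 2.]), np.array([-1., 1.])))  # [-1.  2.]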
