# scikit-learn/scikit-learn

MDS: out_dim → n_components

• Loading branch information...
1 parent 096e792 commit 5b53e8dd77e7411ed0ffd41503411d63c22cc440 NelleV committed with GaelVaroquaux May 3, 2012
Showing with 53 additions and 26 deletions.
1. +3 −3 examples/manifold/plot_mds.py
2. +49 −22 sklearn/manifold/mds.py
3. +1 −1 sklearn/manifold/tests/test_mds.py
6 examples/manifold/plot_mds.py
 @@ -32,17 +32,17 @@ noise[np.arange(noise.shape[0]), np.arange(noise.shape[0])] = 0 similarities += noise -mds = manifold.MDS(out_dim=2, max_iter=3000, n_jobs=2, +mds = manifold.MDS(n_components=2, max_iter=3000, n_jobs=2, eps=1e-9) pos = mds.fit(similarities).positions_ -nmds = manifold.MDS(out_dim=2, metric=False, +nmds = manifold.MDS(n_components=2, metric=False, max_iter=3000, n_jobs=2, eps=1e-9) npos = mds.fit(similarities).positions_ # Rotate the data -clf = PCA(n_components=3) +clf = PCA(n_components=2) X_true = clf.fit_transform(X_true) pos = clf.fit_transform(pos)
71 sklearn/manifold/mds.py
 @@ -70,7 +70,7 @@ def pool_adjacent_violators(distances, similarities): return distances -def _smacof_single(similarities, metric=True, out_dim=2, init=None, +def _smacof_single(similarities, metric=True, n_components=2, init=None, max_iter=300, verbose=0, eps=1e-3, random_state=None): """ Computes multidimensional scaling using SMACOF algorithm @@ -83,9 +83,9 @@ def _smacof_single(similarities, metric=True, out_dim=2, init=None, metric: boolean, optional, default: True compute metric or nonmetric SMACOF algorithm - out_dim: int, optional, default: 2 + n_components: int, optional, default: 2 number of dimension in which to immerse the similarities - overridden if initial array is provided. + overwritten if initial array is provided. init: {None or ndarray} if None, randomly chooses the initial configuration @@ -107,8 +107,8 @@ def _smacof_single(similarities, metric=True, out_dim=2, init=None, Returns ------- - X: ndarray (n_samples, out_dim), float - coordinates of the n_samples points in a out_dim-space + X: ndarray (n_samples, n_components), float + coordinates of the n_samples points in a n_components-space stress_: float The final value of the stress (sum of squared distance of the @@ -129,14 +129,14 @@ def _smacof_single(similarities, metric=True, out_dim=2, init=None, sim_flat_w = sim_flat[sim_flat != 0] if init is None: # Randomly choose initial configuration - X = random_state.rand(n_samples * out_dim) - X = X.reshape((n_samples, out_dim)) + X = random_state.rand(n_samples * n_components) + X = X.reshape((n_samples, n_components)) else: # overrides the parameter p - out_dim = init.shape[1] + n_components = init.shape[1] if n_samples != init.shape[0]: raise ValueError("init matrix should be of shape (%d, %d)" % \ - (n_samples, out_dim)) + (n_samples, n_components)) X = init old_stress = None @@ -182,11 +182,27 @@ def _smacof_single(similarities, metric=True, out_dim=2, init=None, return X, stress -def smacof(similarities, metric=True, out_dim=2, init=None, 
n_init=8, n_jobs=1, - max_iter=300, verbose=0, eps=1e-3, random_state=None): +def smacof(similarities, metric=True, n_components=2, init=None, n_init=8, + n_jobs=1, max_iter=300, verbose=0, eps=1e-3, random_state=None): """ Computes multidimensional scaling using SMACOF algorithm + The SMACOF algorithm is a multidimensional scaling algorithm: it minimizes + an objective function, the *stress*, using a majorization technique. The + Stress Majorization, also known as the Guttman Transform, guarantees a + monotone convergence of Stress, and is more powerful than traditional + techniques such as gradient descent. + + The SMACOF algorithm for metric MDS can be summarized by the following steps: + + 1. Set an initial start configuration, randomly or not. + 2. Compute the stress + 3. Compute the Guttman Transform + 4. Iterate 2 and 3 until convergence. + + The nonmetric algorithm adds a monotonic regression step before computing + the stress. + Parameters ---------- similarities: symmetric ndarray, shape (n_samples, n_samples) @@ -195,11 +211,11 @@ def smacof(similarities, metric=True, out_dim=2, init=None, n_init=8, n_jobs=1, metric: boolean, optional, default: True compute metric or nonmetric SMACOF algorithm - out_dim: int, optional, default: 2 + n_components: int, optional, default: 2 number of dimension in which to immerse the similarities overridden if initial array is provided. - init: {None or ndarray of shape (n_samples, out_dim)} + init: {None or ndarray of shape (n_samples, n_components)} if None, randomly chooses the initial configuration if ndarray, initialize the SMACOF algorithm with this array @@ -208,6 +224,17 @@ def smacof(similarities, metric=True, out_dim=2, init=None, n_init=8, n_jobs=1, initialisation. The final results will be the best output of the n_init consecutive runs in terms of stress. + n_jobs: int, optional, default: 1 + + The number of jobs to use for the computation. 
This works by breaking + down the pairwise matrix into n_jobs even slices and computing them in + parallel. + + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + max_iter: int, optional, default: 300 Maximum number of iterations of the SMACOF algorithm for a single run @@ -224,8 +251,8 @@ def smacof(similarities, metric=True, out_dim=2, init=None, n_init=8, n_jobs=1, Returns ------- - X: ndarray (n_samples,out_dim) - Coordinates of the n_samples points in a out_dim-space + X: ndarray (n_samples,n_components) + Coordinates of the n_samples points in a n_components-space stress: float The final value of the stress (sum of squared distance of the @@ -248,7 +275,7 @@ def smacof(similarities, metric=True, out_dim=2, init=None, n_init=8, n_jobs=1, if n_jobs == 1: for it in range(n_init): pos, stress = _smacof_single(similarities, metric=metric, - out_dim=out_dim, + n_components=n_components, init=init, max_iter=max_iter, verbose=verbose, eps=eps, random_state=random_state) @@ -259,7 +286,7 @@ def smacof(similarities, metric=True, out_dim=2, init=None, n_init=8, n_jobs=1, seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))( delayed(_smacof_single)( - similarities, metric=metric, out_dim=out_dim, + similarities, metric=metric, n_components=n_components, init=init, max_iter=max_iter, verbose=verbose, eps=eps, random_state=seed) @@ -280,7 +307,7 @@ class MDS(BaseEstimator): metric: boolean, optional, default: True compute metric or nonmetric SMACOF algorithm - out_dim: int, optional, default: 2 + n_components: int, optional, default: 2 number of dimension in which to immerse the similarities overridden if initial array is provided. 
@@ -300,7 +327,7 @@ class MDS(BaseEstimator): Attributes ---------- - positions_: array-like, shape [out_dim, n_samples] + positions_: array-like, shape [n_components, n_samples] Stores the position of the dataset in the embedding space stress_: float @@ -320,9 +347,9 @@ class MDS(BaseEstimator): hypothesis" Kruskal, J. Psychometrika, 29, (1964) """ - def __init__(self, out_dim=2, metric=True, n_init=8, + def __init__(self, n_components=2, metric=True, n_init=8, max_iter=300, verbose=0, eps=1e-3, n_jobs=1): - self.out_dim = out_dim + self.n_components = n_components self.metric = metric self.n_init = n_init self.max_iter = max_iter @@ -344,7 +371,7 @@ def fit(self, X, init=None, y=None): if ndarray, initialize the SMACOF algorithm with this array """ self.positions_, self.stress_ = smacof(X, metric=self.metric, - out_dim=self.out_dim, + n_components=self.n_components, init=init, n_init=self.n_init, max_iter=self.max_iter,
2 sklearn/manifold/tests/test_mds.py
 @@ -26,7 +26,7 @@ def test_smacof(): [.451, .252], [.016, -.238], [-.200, .524]]) - X, _ = mds.smacof(sim, init=Z, out_dim=2, max_iter=1) + X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1) X_true = np.array([[-1.415, -2.471], [1.633, 1.107], [.249, -.067],

#### 0 comments on commit `5b53e8d`

Please sign in to comment.