# scikit-learn/scikit-learn

Added tests and documentation to the smacof algorithm

1 parent 4dbe1a0 commit 081cb871a4788e364980ed22f665f11f08b1cd5a NelleV committed with GaelVaroquaux Apr 19, 2012
Showing with 120 additions and 15 deletions.
1. +70 −13 sklearn/manifold/nmds.py
2. +50 −2 sklearn/manifold/tests/test_nmds.py
 @@ -3,6 +3,8 @@ """ import numpy as np + +from ..base import BaseEstimator from ..metrics import euclidean_distances @@ -20,7 +22,7 @@ def PAV(distances, similarities, copy=False, verbose=False): copy: boolean, optional """ # FIXME ties ? - indxs = similarities.argsort()[::-1] + indxs = similarities.argsort() dis = distances.copy() sort = True @@ -48,7 +50,6 @@ def guttman_image_ranking(distances, similarities, verbose=False): Guttman image ranking """ - # FIXME this isn't working sim_arg = similarities.argsort() dis_arg = distances.argsort() disparities = distances.copy() @@ -57,15 +58,46 @@ def guttman_image_ranking(distances, similarities, verbose=False): return disparities -def smacof(similarities, metric=True, p=3, init=None, - max_iter=300, verbose=False, eps=1e-3): +def smacof(similarities, metric=True, p=2, init=None, + max_iter=300, verbose=0, eps=1e-3): """ + Computes multidimensional scaling using SMACOF algorithm + + Parameters + ---------- + similarities: symmetric ndarray, shape [n * n] + similarities between the points + + metric: boolean, optional, default: True + compute metric or nonmetric SMACOF algorithm + + p: int, optional, default: 2 + number of dimension in which to immerse the similarities + overridden if initial array is provided. + + init: {None or ndarray} + if None, randomly chooses the initial configuration + if ndarray, initialize the SMACOF algorithm with this array + + max_iter: int, optional, default: 300 + Maximum number of iterations of the SMACOF algorithm for a single run + + verbose: int, optional, default: 0 + level of verbosity + + eps: float, optional, default: 1e-6 + relative tolerance w.r.t stress to declare converge + + Returns + ------- + X: ndarray (n, p) + coordinates of the n points in a p-space """ n = similarities.shape[0] if similarities.shape[0] != similarities.shape[1]: - raise ValueError("similarities must be a square array (shape=%r)" % \ - similarities.shape) + raise ValueError("similarities must be a square array (shape=%d)" % \ + n) if np.any(similarities != similarities.T): raise ValueError("similarities must be symmetric") @@ -76,7 +108,9 @@ def smacof(similarities, metric=True, p=3, init=None, # Randomly choose initial configuration X = np.random.random(size=(n, p)) else: - if n != init.shape[0] or p != init.shape[1]: + # overrides the parameter p + p = init.shape[1] + if n != init.shape[0]: raise ValueError("init matrix should be of shape (%d, %d)" % \ (n, p)) X = init @@ -85,13 +119,14 @@ def smacof(similarities, metric=True, p=3, init=None, for it in range(max_iter): # Compute distance and monotonic regression dis = euclidean_distances(X) - dis_flat = dis.flatten() - # similarities with 0 are considered as missing values - dis_flat_w = dis_flat[sim_flat != 0] if metric: disparities = similarities else: + dis_flat = dis.flatten() + # similarities with 0 are considered as missing values + dis_flat_w = dis_flat[sim_flat != 0] + # Compute the disparities using a monotonic regression disparities_flat = guttman_image_ranking(dis_flat_w, sim_flat_w) @@ -103,20 +138,42 @@ def smacof(similarities, metric=True, p=3, init=None, # Compute stress stress = ((dis.flatten() - \ - (disparities + disparities.T).flatten()) ** 2).sum() / 2 + disparities.flatten()) ** 2).sum() / 2 # Update X using the Guttman transform - ratio = (disparities + disparities.T) / dis + ratio = disparities / dis ratio[np.isinf(ratio) | np.isnan(ratio)] = 0 B = - ratio + np.diag(ratio.sum(axis=1)) X = 1. / n * np.dot(B, X) - if verbose: + if verbose == 2: print 'it: %d, stress %s' % (it, stress) if old_stress is not None: if(old_stress - stress) < eps: if verbose: print 'breaking at iteration %d with stress %s' % (it, stress) + break old_stress = stress return X + + +class MDS(BaseEstimator): + """ + Multidimensional scaling + + Parameters + ---------- + + Notes + ----- + """ + def __init__(self, p=2, init=None, max_iter=300, eps=1e-3): + self.p = p + self.init = init + self.max_iter = max_iter + self.eps = eps + + def fit(): + """ + """
 @@ -1,14 +1,62 @@ import numpy as np from numpy.testing import assert_array_almost_equal +from nose.tools import assert_raises from sklearn.manifold import nmds -def test_pva(): +def test_pav(): distances = np.array([10., 8, 11, 5, 13, 11, 9, 14, 6, 16]) similarities = np.arange(10) - distances_fit = nmds.PVA(distances, similarities) + distances_fit = nmds.PAV(distances, similarities) assert_array_almost_equal(distances_fit, np.array([8.5, 8.5, 8.5, 8.5, 10.6, 10.6, 10.6, 10.6, 10.6, 16])) + + +def test_smacof(): + # test metric smacof using the data of "Modern Multidimensional Scaling", + # Borg & Groenen, p 154 + sim = np.array([[0, 5, 3, 4], + [5, 0, 2, 2], + [3, 2, 0, 1], + [4, 2, 1, 0]]) + Z = np.array([[-.266, -.539], + [.451, .252], + [.016, -.238], + [-.200, .524]]) + X = nmds.smacof(sim, init=Z, p=2, max_iter=1) + X_true = np.array([[-1.415, -2.471], + [1.633, 1.107], + [.249, -.067], + [-.468, 1.431]]) + assert_array_almost_equal(X, X_true, decimal=3) + + +def test_smacof_error(): + # Not symmetric similarity matrix: + sim = np.array([[0, 5, 9, 4], + [5, 0, 2, 2], + [3, 2, 0, 1], + [4, 2, 1, 0]]) + + assert_raises(ValueError, nmds.smacof, sim) + + # Not squared similarity matrix: + sim = np.array([[0, 5, 9, 4], + [5, 0, 2, 2], + [4, 2, 1, 0]]) + + assert_raises(ValueError, nmds.smacof, sim) + + # init not None and not correct format: + sim = np.array([[0, 5, 3, 4], + [5, 0, 2, 2], + [3, 2, 0, 1], + [4, 2, 1, 0]]) + + Z = np.array([[-.266, -.539], + [.016, -.238], + [-.200, .524]]) + assert_raises(ValueError, nmds.smacof, sim, init=Z)