From 9d2dd548c70bff45d8ac5a1aa00556ca425ef365 Mon Sep 17 00:00:00 2001 From: Sergey Lisitsyn Date: Tue, 16 Aug 2011 00:16:25 +0400 Subject: [PATCH] Updated doc for dimensionality reduction algorithms --- .../HessianLocallyLinearEmbedding.h | 8 ++++++ src/shogun/preprocessor/Isomap.cpp | 18 ++++++------ src/shogun/preprocessor/Isomap.h | 25 ++++++++--------- src/shogun/preprocessor/LaplacianEigenmaps.h | 13 +++++++++ .../preprocessor/LocalTangentSpaceAlignment.h | 9 ++++++ .../preprocessor/LocallyLinearEmbedding.h | 15 ++++++++++ .../preprocessor/MultidimensionalScaling.cpp | 7 ++++- .../preprocessor/MultidimensionalScaling.h | 28 +++++++++++-------- 8 files changed, 89 insertions(+), 34 deletions(-) diff --git a/src/shogun/preprocessor/HessianLocallyLinearEmbedding.h b/src/shogun/preprocessor/HessianLocallyLinearEmbedding.h index 8aac87be488..75e7fb2e0ce 100755 --- a/src/shogun/preprocessor/HessianLocallyLinearEmbedding.h +++ b/src/shogun/preprocessor/HessianLocallyLinearEmbedding.h @@ -29,7 +29,15 @@ class CDistance; * Hessian eigenmaps: new tools for nonlinear dimensionality reduction. * Proceedings of National Academy of Science (Vol. 100, pp. 5591-5596). * + * Stated eigenproblem is solved in the same way as in + * CLocallyLinearEmbedding (LAPACK or ARPACK if available). * + * Hessian estimation step is parallel; neighborhood determination + * is not parallel, as in CLocallyLinearEmbedding. + * + * Be sure the k value is set to at least + * 1+[target dim]+1/2 [target dim]*[1 + target dim], e.g. + * at least 6 for target dimensionality of 2. 
*/ class CHessianLocallyLinearEmbedding: public CLocallyLinearEmbedding { diff --git a/src/shogun/preprocessor/Isomap.cpp b/src/shogun/preprocessor/Isomap.cpp index 6636e0a11d9..8696eadfd08 100644 --- a/src/shogun/preprocessor/Isomap.cpp +++ b/src/shogun/preprocessor/Isomap.cpp @@ -185,16 +185,16 @@ CCustomDistance* CIsomap::isomap_distance(CDistance* distance) pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); for (t=0; t* apply_to_distance(CDistance* distance); /** apply preprocessor to feature matrix using * Isomap of specified type - * @param features - * @return new feature matrix with distance similar to geodesic + * @param features + * @return new feature matrix with euclidean distance similar to geodesic */ virtual SGMatrix apply_to_feature_matrix(CFeatures* features); diff --git a/src/shogun/preprocessor/LaplacianEigenmaps.h b/src/shogun/preprocessor/LaplacianEigenmaps.h index 3d1567c7777..a4acda56693 100755 --- a/src/shogun/preprocessor/LaplacianEigenmaps.h +++ b/src/shogun/preprocessor/LaplacianEigenmaps.h @@ -30,6 +30,19 @@ class CDistance; * Science, 14, 585-591. MIT Press. * Retrieved from http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.19.9400&rep=rep1&type=pdf * + * Note that the algorithm is very sensitive to the heat distribution coefficient and number + * of neighbors for the nearest neighbor graph. Since no connectivity check is provided, + * the preprocessor can produce unreasonable embeddings if the K value leaves the graph + * disconnected. + * + * This implementation is not parallel due to performance issues. Generalized + * eigenproblem is the bottleneck of this algorithm. + * + * Solving of generalized eigenproblem involves LAPACK DSYGVX routine + * and requires extra memory for right-hand side matrix storage. + * If ARPACK is available DSAUPD/DSEUPD is used with no extra + * memory usage. 
+ * */ class CLaplacianEigenmaps: public CDimensionReductionPreprocessor { diff --git a/src/shogun/preprocessor/LocalTangentSpaceAlignment.h b/src/shogun/preprocessor/LocalTangentSpaceAlignment.h index 26e075d4c59..016a46d8347 100755 --- a/src/shogun/preprocessor/LocalTangentSpaceAlignment.h +++ b/src/shogun/preprocessor/LocalTangentSpaceAlignment.h @@ -29,6 +29,15 @@ class CDistance; * reduction via tangent space alignment, SIAM J. Sci. Comput. 26 (1) * (2004) 313–338. * + * Stated eigenproblem is solved in the same way as in + * CLocallyLinearEmbedding (LAPACK or ARPACK if available). + * + * Local tangent space alignment step is parallel. Neighborhood + * determination is not parallel, as in CLocallyLinearEmbedding. + * + * This algorithm is pretty stable under variations of the k parameter, but + * be sure it is set to a consistent value (at least 3-5) for reasonable + * results. */ class CLocalTangentSpaceAlignment: public CLocallyLinearEmbedding { diff --git a/src/shogun/preprocessor/LocallyLinearEmbedding.h b/src/shogun/preprocessor/LocallyLinearEmbedding.h index e12b8db4e6e..447e02ff38c 100755 --- a/src/shogun/preprocessor/LocallyLinearEmbedding.h +++ b/src/shogun/preprocessor/LocallyLinearEmbedding.h @@ -28,6 +28,21 @@ class CDistance; * Saul, L. K., Ave, P., Park, F., & Roweis, S. T. (2001). * An Introduction to Locally Linear Embedding. Available from, 290(5500), 2323-2326. * + * The process of finding nearest neighbors involves Fibonacci Heap + * and Euclidean distance. Note it is not parallel yet. + * + * Linear reconstruction step runs in parallel for objects and + * involves LAPACK routine DPOSV for solving system of linear equations. + * + * Eigenproblem stated in the algorithm is solved with LAPACK routine + * DSYEVR or with ARPACK DSAUPD/DSEUPD routines if available. + * + * For computation speed, when ARPACK is used a small + * regularization and Cholesky factorization are used internally + * for Lanczos iterations. 
In case results aren't reasonable, + * LUP factorization can be used with the posdef parameter set to + * false using set_posdef. + * */ class CLocallyLinearEmbedding: public CDimensionReductionPreprocessor { diff --git a/src/shogun/preprocessor/MultidimensionalScaling.cpp b/src/shogun/preprocessor/MultidimensionalScaling.cpp index 26ef3305626..738f4b1eaac 100644 --- a/src/shogun/preprocessor/MultidimensionalScaling.cpp +++ b/src/shogun/preprocessor/MultidimensionalScaling.cpp @@ -225,7 +225,7 @@ SGMatrix CMultidimensionalScaling::classic_embedding(CDistance* dista if (m_eigenvalues.vector[i]<=0.0) { SG_WARNING("Embedding is not consistent (got neg eigenvalues): features %d-%d are wrong", - i, m_eigenvalues.vlen); + i, m_eigenvalues.vlen-1); break; } } @@ -241,6 +241,11 @@ SGMatrix CMultidimensionalScaling::landmark_embedding(CDistance* dist int32_t i,j,t; int32_t lmk_N = m_landmark_number; int32_t total_N = distance->get_num_vec_lhs(); + if (lmk_N<3) + { + SG_ERROR("Number of landmarks (%d) should be greater than 3 for proper triangulation.\n", + lmk_N); + } if (lmk_N>total_N) { SG_ERROR("Number of landmarks (%d) should be less than total number of vectors (%d).\n", diff --git a/src/shogun/preprocessor/MultidimensionalScaling.h b/src/shogun/preprocessor/MultidimensionalScaling.h index 62200faad80..0c8866712ad 100644 --- a/src/shogun/preprocessor/MultidimensionalScaling.h +++ b/src/shogun/preprocessor/MultidimensionalScaling.h @@ -22,28 +22,34 @@ class CFeatures; class CDistance; -/** @brief the class ClassicMDS used to perform classic eigenvector - * multidimensional scaling. - * - * Description is given at p.261 (Section 12.1) of +/** @brief the class MultidimensionalScaling is used to perform + * multidimensional scaling (capable of landmark approximation + * if requested). * + * Description of classical embedding is given at p.261 (Section 12.1) of * Borg, I., & Groenen, P. J. F. (2005). * Modern multidimensional scaling: Theory and applications. Springer. 
* - * and in + * Description of landmark MDS approximation is given in * * Sparse multidimensional scaling using landmark points * V De Silva, J B Tenenbaum (2004) Technology, p. 1-4 * - * In this preprocessor LAPACK is used for solving eigenproblem. If - * ARPACK is available, it is used instead of LAPACK. + * In this preprocessor LAPACK routine DSYEVR is used for + * solving eigenproblem. If ARPACK library is available, + * its routines DSAUPD/DSEUPD are used instead. * - * Note that target dimension should be set with sensible value + * Note that target dimension should be set to a reasonable value * (using set_target_dim). In case it is higher than intrinsic * dimensionality of the dataset 'extra' features of the output - * may be inconsistent (actually features according to zero or - * negative eigenvalues). In this case a warning is throwed. - * + * might be inconsistent (essentially, those corresponding to zero or + * negative eigenvalues). In this case a warning is shown. + * + * The faster landmark approximation is parallelized using pthreads. + * As for the choice of landmark number, it should be at least 3 for + * proper triangulation. For reasonable embedding accuracy, greater + * values (30%-50% of the total number of examples) are pretty good for + * most tasks. */ class CMultidimensionalScaling: public CDimensionReductionPreprocessor {