Updated doc for dimensionality reduction algorithms
lisitsyn committed Aug 15, 2011
1 parent bcf3002 commit 9d2dd54
Showing 8 changed files with 89 additions and 34 deletions.
8 changes: 8 additions & 0 deletions src/shogun/preprocessor/HessianLocallyLinearEmbedding.h
@@ -29,7 +29,15 @@ class CDistance;
* Hessian eigenmaps: new tools for nonlinear dimensionality reduction.
* Proceedings of the National Academy of Sciences (Vol. 100, pp. 5591-5596).
*
* The stated eigenproblem is solved in the same way as in
* CLocallyLinearEmbedding (LAPACK, or ARPACK if available).
*
* The Hessian estimation step is parallel; neighborhood determination
* is not, just as in CLocallyLinearEmbedding.
*
* Be sure the k value is set to at least
* 1 + [target dim] + 1/2 * [target dim] * (1 + [target dim]),
* i.e. at least 6 for a target dimensionality of 2.
*/
class CHessianLocallyLinearEmbedding: public CLocallyLinearEmbedding
{
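For readers unfamiliar with the k constraint above, a minimal usage sketch (not part of this commit) may help; it assumes the usual Shogun setters set_target_dim and set_k and the apply_to_feature_matrix call declared by the parent classes:

#include <shogun/preprocessor/HessianLocallyLinearEmbedding.h>

using namespace shogun;

// Embed features with HLLE, choosing the smallest k allowed for the
// requested target dimensionality: k >= 1 + d + d*(d+1)/2 (6 for d = 2).
SGMatrix<float64_t> hlle_embed(CFeatures* features, int32_t target_dim)
{
	int32_t min_k = 1 + target_dim + target_dim*(target_dim+1)/2;

	CHessianLocallyLinearEmbedding* hlle = new CHessianLocallyLinearEmbedding();
	hlle->set_target_dim(target_dim);
	hlle->set_k(min_k);

	SGMatrix<float64_t> embedding = hlle->apply_to_feature_matrix(features);
	SG_UNREF(hlle);
	return embedding;
}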
18 changes: 9 additions & 9 deletions src/shogun/preprocessor/Isomap.cpp
@@ -185,16 +185,16 @@ CCustomDistance* CIsomap::isomap_distance(CDistance* distance)
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
for (t=0; t<num_threads; t++)
{
- parameters[t].idx_start = t;
- parameters[t].idx_stop = N;
- parameters[t].idx_step = num_threads;
- parameters[t].heap = heaps[t];
- parameters[t].edges_matrix = edges_matrix;
- parameters[t].edges_idx_matrix = edges_idx_matrix;
- parameters[t].s = s+t*N;
- parameters[t].f = f+t*N;
+ parameters[t].idx_start = t;
+ parameters[t].idx_stop = N;
+ parameters[t].idx_step = num_threads;
+ parameters[t].heap = heaps[t];
+ parameters[t].edges_matrix = edges_matrix;
+ parameters[t].edges_idx_matrix = edges_idx_matrix;
+ parameters[t].s = s+t*N;
+ parameters[t].f = f+t*N;
parameters[t].m_k = m_k;
- parameters[t].shortest_D = shortest_D;
+ parameters[t].shortest_D = shortest_D;
pthread_create(&threads[t], &attr, CIsomap::run_dijkstra_thread, (void*)&parameters[t]);
}
for (t=0; t<num_threads; t++)
25 changes: 12 additions & 13 deletions src/shogun/preprocessor/Isomap.h
@@ -27,17 +27,16 @@ class CFeatures;

class CDistance;

- /** @brief the base class Isomap used to preprocess data using Classic
- * or Landmark K-Isomap. The description is given in
+ /** @brief the class Isomap used to preprocess data using the K-Isomap algorithm.
+ * The description is given in
*
- * Global versus local methods in nonlinear dimensionality reduction
- * Vin De Silva, Joshua B Tenenbaum (2003)
- * Advances in Neural Information Processing Systems 15 15 (Figure 2) p.721-728
- *
- *
- *
- *
+ * Silva, V. D., & Tenenbaum, J. B. (2003).
+ * Global versus local methods in nonlinear dimensionality reduction.
+ * Advances in Neural Information Processing Systems 15, 15(Figure 2), 721-728. MIT Press.
+ * Retrieved from http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.3407&rep=rep1&type=pdf
*
+ * Shortest paths are computed in parallel with Dijkstra's algorithm; due to
+ * the sparsity of the kNN graph, a Fibonacci heap is used.
*/
class CIsomap: public CMultidimensionalScaling
{
@@ -59,15 +58,15 @@ class CIsomap: public CMultidimensionalScaling

/** apply preprocessor to CDistance using
* Isomap of specified type
- * @param distance
- * @return new features with distance similar to geodesic
+ * @param distance distance
+ * @return new features with Euclidean distance similar to the geodesic distance
*/
virtual CSimpleFeatures<float64_t>* apply_to_distance(CDistance* distance);

/** apply preprocessor to feature matrix using
* Isomap of specified type
- * @param features
- * @return new feature matrix with distance similar to geodesic
+ * @param features
+ * @return new feature matrix with Euclidean distance similar to the geodesic distance
*/
virtual SGMatrix<float64_t> apply_to_feature_matrix(CFeatures* features);

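A hedged sketch of the apply_to_distance path described in this header (illustrative only, not from this commit; a set_k setter for the kNN graph size is assumed, set_target_dim is mentioned elsewhere in these docs):

#include <shogun/preprocessor/Isomap.h>

using namespace shogun;

// Produce a 2D embedding whose Euclidean distances approximate the
// geodesic (kNN shortest-path) distances of the input distance.
CSimpleFeatures<float64_t>* isomap_embed(CDistance* distance)
{
	CIsomap* isomap = new CIsomap();
	isomap->set_target_dim(2);
	isomap->set_k(10); // neighbors per vertex in the kNN graph used by Dijkstra

	CSimpleFeatures<float64_t>* embedding = isomap->apply_to_distance(distance);
	SG_UNREF(isomap);
	return embedding;
}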
13 changes: 13 additions & 0 deletions src/shogun/preprocessor/LaplacianEigenmaps.h
@@ -30,6 +30,19 @@ class CDistance;
* Science, 14, 585-591. MIT Press.
* Retrieved from http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.19.9400&rep=rep1&type=pdf
*
* Note that the algorithm is very sensitive to the heat distribution coefficient
* and to the number of neighbors in the nearest neighbor graph. No connectivity
* check is performed, so the preprocessor can produce unreasonable embeddings
* if the chosen k value leaves the graph disconnected.
*
* This implementation is not parallel: the generalized
* eigenproblem is the bottleneck of this algorithm.
*
* Solving the generalized eigenproblem involves the LAPACK DSYGVX routine
* and requires extra memory for storing the right-hand side matrix.
* If ARPACK is available, DSAUPD/DSEUPD are used instead, with no extra
* memory usage.
*
*/
class CLaplacianEigenmaps: public CDimensionReductionPreprocessor
{
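Because the embedding is so sensitive to the heat coefficient and to k, a small sketch of how these would typically be set may help (the set_tau and set_k setters are assumed here; they are not shown in this diff):

#include <shogun/preprocessor/LaplacianEigenmaps.h>

using namespace shogun;

SGMatrix<float64_t> laplacian_embed(CFeatures* features)
{
	CLaplacianEigenmaps* le = new CLaplacianEigenmaps();
	le->set_target_dim(2);
	le->set_k(20);     // large enough to keep the kNN graph connected
	le->set_tau(10.0); // heat distribution coefficient, tune per dataset

	SGMatrix<float64_t> embedding = le->apply_to_feature_matrix(features);
	SG_UNREF(le);
	return embedding;
}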
9 changes: 9 additions & 0 deletions src/shogun/preprocessor/LocalTangentSpaceAlignment.h
@@ -29,6 +29,15 @@ class CDistance;
* reduction via tangent space alignment, SIAM J. Sci. Comput. 26 (1)
* (2004) 313–338.
*
* The stated eigenproblem is solved in the same way as in
* CLocallyLinearEmbedding (LAPACK, or ARPACK if available).
*
* The local tangent space alignment step is parallel; neighborhood
* determination is not, just as in CLocallyLinearEmbedding.
*
* The algorithm is fairly stable with respect to the k parameter, but
* be sure it is set to a sensible value (at least 3-5) for reasonable
* results.
*/
class CLocalTangentSpaceAlignment: public CLocallyLinearEmbedding
{
15 changes: 15 additions & 0 deletions src/shogun/preprocessor/LocallyLinearEmbedding.h
@@ -28,6 +28,21 @@ class CDistance;
* Saul, L. K., & Roweis, S. T. (2001).
* An Introduction to Locally Linear Embedding.
*
* The nearest neighbor search uses a Fibonacci heap and
* Euclidean distance. Note that this step is still not parallel.
*
* The linear reconstruction step runs in parallel over objects and
* uses the LAPACK routine DPOSV to solve the systems of linear equations.
*
* The eigenproblem stated in the algorithm is solved with the LAPACK routine
* DSYEVR, or with the ARPACK DSAUPD/DSEUPD routines if available.
*
* For computation speed, when ARPACK is used, a small
* regularization and a Cholesky factorization are applied internally
* for the Lanczos iterations. If the results are not reasonable,
* an LUP factorization can be used instead by setting the posdef
* parameter to false via set_posdef.
*
*/
class CLocallyLinearEmbedding: public CDimensionReductionPreprocessor
{
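The posdef fallback mentioned above is the least obvious switch, so a sketch follows (not part of this commit; it assumes the set_posdef, set_k and set_target_dim setters referenced in the comment):

#include <shogun/preprocessor/LocallyLinearEmbedding.h>

using namespace shogun;

SGMatrix<float64_t> lle_embed(CFeatures* features)
{
	CLocallyLinearEmbedding* lle = new CLocallyLinearEmbedding();
	lle->set_target_dim(2);
	lle->set_k(12);

	// If the ARPACK path with Cholesky factorization gives poor results,
	// fall back to LUP factorization as described in the class comment.
	lle->set_posdef(false);

	SGMatrix<float64_t> embedding = lle->apply_to_feature_matrix(features);
	SG_UNREF(lle);
	return embedding;
}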
7 changes: 6 additions & 1 deletion src/shogun/preprocessor/MultidimensionalScaling.cpp
@@ -225,7 +225,7 @@ SGMatrix<float64_t> CMultidimensionalScaling::classic_embedding(CDistance* dista
if (m_eigenvalues.vector[i]<=0.0)
{
SG_WARNING("Embedding is not consistent (got neg eigenvalues): features %d-%d are wrong",
- i, m_eigenvalues.vlen);
+ i, m_eigenvalues.vlen-1);
break;
}
}
@@ -241,6 +241,11 @@ SGMatrix<float64_t> CMultidimensionalScaling::landmark_embedding(CDistance* dist
int32_t i,j,t;
int32_t lmk_N = m_landmark_number;
int32_t total_N = distance->get_num_vec_lhs();
if (lmk_N<3)
{
SG_ERROR("Number of landmarks (%d) should be greater than 3 for proper triangulation.\n",
lmk_N);
}
if (lmk_N>total_N)
{
SG_ERROR("Number of landmarks (%d) should be less than total number of vectors (%d).\n",
28 changes: 17 additions & 11 deletions src/shogun/preprocessor/MultidimensionalScaling.h
@@ -22,28 +22,34 @@ class CFeatures;

class CDistance;

- /** @brief the class ClassicMDS used to perform classic eigenvector
- * multidimensional scaling.
- *
- * Description is given at p.261 (Section 12.1) of
+ /** @brief the class MultidimensionalScaling used to perform
+ * multidimensional scaling (capable of landmark approximation
+ * if requested).
+ *
+ * Description of the classical embedding is given at p.261 (Section 12.1) of
* Borg, I., & Groenen, P. J. F. (2005).
* Modern multidimensional scaling: Theory and applications. Springer.
*
- * and in
+ * Description of the landmark MDS approximation is given in
*
* Sparse multidimensional scaling using landmark points
* V De Silva, J B Tenenbaum (2004) Technology, p. 1-4
*
- * In this preprocessor LAPACK is used for solving eigenproblem. If
- * ARPACK is available, it is used instead of LAPACK.
+ * In this preprocessor the LAPACK routine DSYEVR is used for
+ * solving the eigenproblem. If the ARPACK library is available,
+ * its routines DSAUPD/DSEUPD are used instead.
*
- * Note that target dimension should be set with sensible value
+ * Note that the target dimension should be set to a reasonable value
* (using set_target_dim). In case it is higher than intrinsic
* dimensionality of the dataset 'extra' features of the output
- * may be inconsistent (actually features according to zero or
- * negative eigenvalues). In this case a warning is throwed.
- *
+ * might be inconsistent (essentially, features corresponding to zero or
+ * negative eigenvalues). In this case a warning is shown.
*
+ * The faster landmark approximation is parallel using pthreads.
+ * The number of landmarks should be at least 3 for proper
+ * triangulation. For reasonable embedding accuracy, larger values
+ * (30%-50% of the total number of examples) work well for
+ * most tasks.
*/
class CMultidimensionalScaling: public CDimensionReductionPreprocessor
{
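To make the landmark guidance above concrete, one more illustrative sketch; the set_landmark and set_landmark_number setters are assumed from the class interface (only m_landmark_number is visible in this diff), so treat this as a sketch rather than a definitive usage:

#include <shogun/preprocessor/MultidimensionalScaling.h>

using namespace shogun;

CSimpleFeatures<float64_t>* landmark_mds_embed(CDistance* distance, int32_t num_vectors)
{
	CMultidimensionalScaling* mds = new CMultidimensionalScaling();
	mds->set_target_dim(2);

	// Landmark approximation: roughly a third of the examples,
	// but never fewer than 3 landmarks (needed for triangulation).
	int32_t num_landmarks = num_vectors/3;
	if (num_landmarks < 3)
		num_landmarks = 3;

	mds->set_landmark(true);
	mds->set_landmark_number(num_landmarks);

	CSimpleFeatures<float64_t>* embedding = mds->apply_to_distance(distance);
	SG_UNREF(mds);
	return embedding;
}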
