Skip to content

Commit

Permalink
PCA code cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
mazumdarparijat committed Mar 3, 2014
1 parent 2e5573d commit 6644ff3
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 95 deletions.
164 changes: 88 additions & 76 deletions src/shogun/preprocessor/PCA.cpp
Expand Up @@ -12,7 +12,7 @@
*/
#include <shogun/lib/config.h>

#if defined(HAVE_LAPACK) && defined(HAVE_EIGEN3)
#if defined HAVE_EIGEN3
#include <shogun/preprocessor/PCA.h>
#include <shogun/mathematics/lapack.h>
#include <shogun/mathematics/Math.h>
Expand All @@ -27,23 +27,23 @@
using namespace shogun;
using namespace Eigen;

CPCA::CPCA(bool do_whitening, EPCAMode mode, float64_t thresh, EPCAMethod method, EPCAMemoryMode mem)
CPCA::CPCA(bool do_whitening, EPCAMode mode, float64_t thresh, EPCAMethod method, EPCAMemoryMode mem_mode)
: CDimensionReductionPreprocessor()
{
init();
m_whitening = do_whitening;
m_mode = mode;
m_thresh = thresh;
mem_mode = mem;
m_mem_mode = mem_mode;
m_method = method;
}

CPCA::CPCA(EPCAMethod method, bool do_whitening, EPCAMemoryMode mem)
CPCA::CPCA(EPCAMethod method, bool do_whitening, EPCAMemoryMode mem_mode)
: CDimensionReductionPreprocessor()
{
init();
m_whitening = do_whitening;
mem_mode = mem;
m_mem_mode = mem_mode;
m_method = method;
}

Expand All @@ -57,7 +57,7 @@ void CPCA::init()
m_whitening = false;
m_mode = FIXED_NUMBER;
m_thresh = 1e-6;
mem_mode = MEM_REALLOCATE;
m_mem_mode = MEM_REALLOCATE;
m_method = AUTO;

SG_ADD(&m_transformation_matrix, "transformation_matrix",
Expand All @@ -72,8 +72,10 @@ void CPCA::init()
MS_AVAILABLE);
SG_ADD((machine_int_t*) &m_mode, "mode", "PCA Mode.", MS_AVAILABLE);
SG_ADD(&m_thresh, "m_thresh", "Cutoff threshold.", MS_AVAILABLE);
SG_ADD((machine_int_t*) &mem_mode, "mem_mode", "Memory mode (in-place or reallocation).", MS_NOT_AVAILABLE);
SG_ADD((machine_int_t*) &m_method, "m_method", "Method used for PCA calculation", MS_NOT_AVAILABLE);
SG_ADD((machine_int_t*) &m_mem_mode, "m_mem_mode",
"Memory mode (in-place or reallocation).", MS_NOT_AVAILABLE);
SG_ADD((machine_int_t*) &m_method, "m_method",
"Method used for PCA calculation", MS_NOT_AVAILABLE);
}

CPCA::~CPCA()
Expand All @@ -84,13 +86,11 @@ bool CPCA::init(CFeatures* features)
{
if (!m_initialized)
{
// loop variable
int32_t i;

REQUIRE(features->get_feature_class()==C_DENSE, "PCA only works with dense features")
REQUIRE(features->get_feature_type()==F_DREAL, "PCA only works with real features")

SGMatrix<float64_t> feature_matrix = ((CDenseFeatures<float64_t>*)features)->get_feature_matrix();
SGMatrix<float64_t> feature_matrix = ((CDenseFeatures<float64_t>*)features)
->get_feature_matrix();
int32_t num_vectors = feature_matrix.num_cols;
int32_t num_features = feature_matrix.num_rows;
SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features)
Expand Down Expand Up @@ -124,48 +124,56 @@ bool CPCA::init(CFeatures* features)

SG_INFO("Computing Eigenvalues ... ")
// eigen value computed
SelfAdjointEigenSolver<MatrixXd> eigenSolve = SelfAdjointEigenSolver<MatrixXd>(cov_mat);
SelfAdjointEigenSolver<MatrixXd> eigenSolve =
SelfAdjointEigenSolver<MatrixXd>(cov_mat);
eigenValues = eigenSolve.eigenvalues().tail(max_dim_allowed);

if (m_mode == FIXED_NUMBER)
{
num_dim = m_target_dim;
}
if (m_mode == VARIANCE_EXPLAINED)
{
float64_t eig_sum = eigenValues.sum();
float64_t com_sum = 0;
for (i=num_features-1; i>-1; i--)
{
num_dim++;
com_sum += m_eigenvalues_vector.vector[i];
if (com_sum/eig_sum>=m_thresh)
break;
}
}
if (m_mode == THRESHOLD)
// target dimension
switch (m_mode)
{
for (i=num_features-1; i>-1; i--)
{
if (m_eigenvalues_vector.vector[i]>m_thresh)
num_dim++;
else
break;
}
}

case FIXED_NUMBER :
num_dim = m_target_dim;
break;

case VARIANCE_EXPLAINED :
{
float64_t eig_sum = eigenValues.sum();
float64_t com_sum = 0;
for (int32_t i=num_features-1; i>-1; i--)
{
num_dim++;
com_sum += m_eigenvalues_vector.vector[i];
if (com_sum/eig_sum>=m_thresh)
break;
}
}
break;

case THRESHOLD :
for (int32_t i=num_features-1; i>-1; i--)
{
if (m_eigenvalues_vector.vector[i]>m_thresh)
num_dim++;
else
break;
}
break;
};
SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim)

m_transformation_matrix = SGMatrix<float64_t>(num_features,num_dim);
Map<MatrixXd> transformMatrix(m_transformation_matrix.matrix, num_features, num_dim);
Map<MatrixXd> transformMatrix(m_transformation_matrix.matrix,
num_features, num_dim);
num_old_dim = num_features;

// eigenvector matrix
transformMatrix = eigenSolve.eigenvectors().block(0, num_features-num_dim, num_features,num_dim);
transformMatrix = eigenSolve.eigenvectors().block(0,
num_features-num_dim, num_features,num_dim);
if (m_whitening)
{
for (i=0; i<num_dim; i++)
transformMatrix.col(i) /= sqrt(eigenValues[i+max_dim_allowed-num_dim]);
for (int32_t i=0; i<num_dim; i++)
transformMatrix.col(i) /=
sqrt(eigenValues[i+max_dim_allowed-num_dim]);
}
}

Expand All @@ -179,33 +187,36 @@ bool CPCA::init(CFeatures* features)
eigenValues = eigenValues.cwiseProduct(eigenValues)/(num_vectors-1);

// target dimension
if (m_mode == FIXED_NUMBER)
{
num_dim = m_target_dim;
}
if (m_mode == VARIANCE_EXPLAINED)
switch (m_mode)
{
float64_t eig_sum = eigenValues.sum();
float64_t com_sum = 0;
for (i=0; i<num_features; i++)
{
num_dim++;
com_sum += m_eigenvalues_vector.vector[i];
if (com_sum/eig_sum>=m_thresh)
break;
}
}
if (m_mode == THRESHOLD)
{
for (i=0; i<num_features; i++)
{
if (m_eigenvalues_vector.vector[i]>m_thresh)
num_dim++;
else
break;
}
}

case FIXED_NUMBER :
num_dim = m_target_dim;
break;

case VARIANCE_EXPLAINED :
{
float64_t eig_sum = eigenValues.sum();
float64_t com_sum = 0;
for (int32_t i=0; i<num_features; i++)
{
num_dim++;
com_sum += m_eigenvalues_vector.vector[i];
if (com_sum/eig_sum>=m_thresh)
break;
}
}
break;

case THRESHOLD :
for (int32_t i=0; i<num_features; i++)
{
if (m_eigenvalues_vector.vector[i]>m_thresh)
num_dim++;
else
break;
}
break;
};
SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim)

// right singular vectors form eigenvectors
Expand All @@ -215,7 +226,7 @@ bool CPCA::init(CFeatures* features)
transformMatrix = svd.matrixV().block(0, 0, num_features, num_dim);
if (m_whitening)
{
for (i=0; i<num_dim; i++)
for (int32_t i=0; i<num_dim; i++)
transformMatrix.col(i) /= sqrt(eigenValues[i]);
}
}
Expand Down Expand Up @@ -248,7 +259,7 @@ SGMatrix<float64_t> CPCA::apply_to_feature_matrix(CFeatures* features)
Map<MatrixXd> transform_matrix(m_transformation_matrix.matrix,
m_transformation_matrix.num_rows, m_transformation_matrix.num_cols);

if (mem_mode == MEM_IN_PLACE)
if (m_mem_mode == MEM_IN_PLACE)
{
if (m.matrix)
{
Expand All @@ -257,7 +268,8 @@ SGMatrix<float64_t> CPCA::apply_to_feature_matrix(CFeatures* features)
VectorXd data_mean = feature_matrix.rowwise().sum()/(float64_t) num_vectors;
feature_matrix = feature_matrix.colwise()-data_mean;

feature_matrix.block(0,0,num_dim,num_vectors) = transform_matrix.transpose()*feature_matrix;
feature_matrix.block(0,0,num_dim,num_vectors) =
transform_matrix.transpose()*feature_matrix;

SG_INFO("Form matrix of target dimension")
for (int32_t col=0; col<num_vectors; col++)
Expand Down Expand Up @@ -324,12 +336,12 @@ SGVector<float64_t> CPCA::get_mean()

EPCAMemoryMode CPCA::get_memory_mode() const
{
return mem_mode;
return m_mem_mode;
}

void CPCA::set_memory_mode(EPCAMemoryMode e)
{
mem_mode = e;
m_mem_mode = e;
}

#endif // HAVE_LAPACK && HAVE_EIGEN3
#endif // HAVE_EIGEN3
34 changes: 22 additions & 12 deletions src/shogun/preprocessor/PCA.h
Expand Up @@ -16,7 +16,7 @@

#include <shogun/lib/config.h>

#if defined(HAVE_LAPACK) && defined(HAVE_EIGEN3)
#if defined HAVE_EIGEN3
#include <shogun/mathematics/lapack.h>
#include <stdio.h>
#include <shogun/preprocessor/DimensionReductionPreprocessor.h>
Expand All @@ -28,11 +28,15 @@ namespace shogun
/** Matrix decomposition method for PCA */
enum EPCAMethod
{
/** if N>D then EVD is chosen automatically else SVD is chosen (D-dimensions N-number of vectors) */
/** if N>D then EVD is chosen automatically else SVD is chosen
* (D-dimensions N-number of vectors)
*/
AUTO,
/** SVD based PCA. Time complexity ~14dn^2 (d-dimensions n-number of vectors) */
SVD,
/** Eigenvalue decomposition of covariance matrix. Time complexity ~10d^3 (d-dimensions n-number of vectors) */
/** Eigenvalue decomposition of covariance matrix.
* Time complexity ~10d^3 (d-dimensions n-number of vectors)
*/
EVD
};

Expand All @@ -50,7 +54,10 @@ enum EPCAMode
/** memory usage by PCA : In-place or through reallocation */
enum EPCAMemoryMode
{
/** The feature matrix replaced by new matrix with target dims */
/** The feature matrix replaced by new matrix with target dims.
* This requires memory for old matrix as well as new matrix
* at once for a short amount of time initially.
*/
MEM_REALLOCATE,
/** The feature matrix is modified in-place to generate the new matrix.
* Output matrix dimensions are changed to target dims, but actual matrix
Expand Down Expand Up @@ -94,10 +101,14 @@ enum EPCAMemoryMode
* <em>VARIANCE_EXPLAINED</em> : The user supplies the fractional variance that he
* wants preserved in the target dimension T. From this supplied fractional variance (thresh),
* T is calculated as the smallest k such that the ratio of sum of largest k eigenvalues
* over total sum of all eigenvalues is greater than thresh.
* over total sum of all eigenvalues is greater than thresh.
*
* <em>THRESH</em> : The user supplies a threshold. All eigenvectors with corresponding eigenvalue
* greater than the supplied threshold are chosen.
*
* An option for whitening the transformation matrix is also given - do_whitening. Setting this
* option normalizes the eigenvectors (ie. the columns of transformation matrix) by dividing them
* with the square root of corresponding eigenvalues.
*
 * Note that vectors/matrices don't have to have zero mean as it is subtracted within the class.
*/
Expand All @@ -107,20 +118,19 @@ class CPCA: public CDimensionReductionPreprocessor

/** standard constructor
*
* @param do_whitening do whitening
* @param do_whitening normalize columns(eigenvectors) in transformation matrix
* @param mode mode of pca : FIXED_NUMBER/VARIANCE_EXPLAINED/THRESHOLD
* @param thresh threshold value for VARIANCE_EXPLAINED or THRESHOLD mode
* @param method Matrix decomposition method used : SVD/EVD/AUTO[default]
* @param mem memory usage mode of PCA : MEM_REALLOCATE/MEM_IN_PLACE
* @param mem_mode memory usage mode of PCA : MEM_REALLOCATE/MEM_IN_PLACE
*/
CPCA(bool do_whitening=false, EPCAMode mode=FIXED_NUMBER, float64_t thresh=1e-6,
EPCAMethod method=AUTO, EPCAMemoryMode mem=MEM_REALLOCATE);
EPCAMethod method=AUTO, EPCAMemoryMode mem_mode=MEM_REALLOCATE);

/** special constructor for FIXED_NUMBER mode
*
* @param do_whitening do whitening
* @param mode mode of pca : FIXED_NUMBER/VARIANCE_EXPLAINED/THRESHOLD
* @param method Matrix decomposition method used : SVD/EVD/AUTO[default]
* @param do_whitening normalize columns(eigenvectors) in transformation matrix
* @param mem memory usage mode of PCA : MEM_REALLOCATE/MEM_IN_PLACE
*/
CPCA(EPCAMethod method, bool do_whitening=false, EPCAMemoryMode mem=MEM_REALLOCATE);
Expand Down Expand Up @@ -200,10 +210,10 @@ class CPCA: public CDimensionReductionPreprocessor
/** thresh */
float64_t m_thresh;
/** PCA memory mode */
EPCAMemoryMode mem_mode;
EPCAMemoryMode m_mem_mode;
/** PCA method */
EPCAMethod m_method;
};
}
#endif // HAVE_LAPACK && HAVE_EIGEN3
#endif // HAVE_EIGEN3
#endif // PCA_H_

0 comments on commit 6644ff3

Please sign in to comment.