Cleanup pca preprocessors and rename init to fit
vinx13 authored and vigsterkr committed Jun 8, 2018
1 parent cb0f95c commit 1ea60e1
Showing 9 changed files with 94 additions and 115 deletions.
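In caller terms, the rename replaces a bool-returning init() with a void fit() that validates its inputs via REQUIRE and cleans up any previous state before refitting. Below is a minimal caller-side sketch, assuming the Shogun 6.x-era API visible in these diffs; the toy data and setup boilerplate are illustrative, not from the commit:

#include <shogun/base/init.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/preprocessor/PCA.h>

using namespace shogun;

int main()
{
	init_shogun_with_defaults();

	// Toy data: 3 features x 4 vectors, column-major as SGMatrix stores it.
	SGMatrix<float64_t> data(3, 4);
	for (index_t i = 0; i < 12; i++)
		data.matrix[i] = static_cast<float64_t>(i * i);

	CDenseFeatures<float64_t>* features = new CDenseFeatures<float64_t>(data);
	SG_REF(features);

	CPCA* pca = new CPCA();
	SG_REF(pca);
	pca->set_target_dim(2);

	// Before this commit: bool ok = pca->init(features);
	// After it: fit() returns void; bad preconditions fail loudly through
	// REQUIRE instead of a return code, and refitting cleans up first.
	pca->fit(features);
	SGMatrix<float64_t> projected = pca->apply_to_feature_matrix(features);

	SG_UNREF(pca);
	SG_UNREF(features);
	exit_shogun();
	return 0;
}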
2 changes: 1 addition & 1 deletion src/shogun/metric/LMNNImpl.cpp
@@ -371,7 +371,7 @@ SGMatrix<float64_t> CLMNNImpl::compute_pca_transform(CDenseFeatures<float64_t>*
     // Obtain the linear transform applying PCA
     CPCA* pca = new CPCA();
     pca->set_target_dim(cloned_features->get_num_features());
-    pca->init(cloned_features);
+    pca->fit(cloned_features);
     SGMatrix<float64_t> pca_transform = pca->get_transformation_matrix();
 
     SG_UNREF(pca);
5 changes: 0 additions & 5 deletions src/shogun/preprocessor/DimensionReductionPreprocessor.cpp
@@ -55,11 +55,6 @@ SGMatrix<float64_t> CDimensionReductionPreprocessor::apply_to_feature_matrix(CFe
     }
 }
 
-bool CDimensionReductionPreprocessor::init(CFeatures* data)
-{
-    return true;
-}
-
 void CDimensionReductionPreprocessor::cleanup()
 {
 
6 changes: 0 additions & 6 deletions src/shogun/preprocessor/DimensionReductionPreprocessor.h
@@ -43,12 +43,6 @@ class CDimensionReductionPreprocessor: public CDensePreprocessor<float64_t>
     /** destructor */
     virtual ~CDimensionReductionPreprocessor();
 
-    /** init
-     * set true by default, should be defined if dimension reduction
-     * preprocessor is using some initialization
-     */
-    virtual bool init(CFeatures* data);
-
     /** cleanup
      * set empty by default, should be defined if dimension reduction
      * preprocessor should free some resources
97 changes: 48 additions & 49 deletions src/shogun/preprocessor/KernelPCA.cpp
@@ -63,63 +63,62 @@ CKernelPCA::~CKernelPCA()
     SG_UNREF(m_init_features);
 }
 
-bool CKernelPCA::init(CFeatures* features)
+void CKernelPCA::fit(CFeatures* features)
 {
-    if (!m_initialized && m_kernel)
-    {
-        SG_REF(features);
-        m_init_features = features;
-
-        m_kernel->init(features,features);
-        SGMatrix<float64_t> kernel_matrix = m_kernel->get_kernel_matrix();
-        m_kernel->cleanup();
-        int32_t n = kernel_matrix.num_cols;
-        int32_t m = kernel_matrix.num_rows;
-        ASSERT(n==m)
-        if (m_target_dim > n)
-        {
-            SG_SWARNING(
-                "Target dimension (%d) is not a valid value, it must be"
-                "less or equal than the number of vectors."
-                "Setting it to maximum allowed size (%d).",
-                m_target_dim, n);
-            m_target_dim = n;
-        }
+    REQUIRE(m_kernel, "Kernel not set\n");
+
-        SGVector<float64_t> bias_tmp = linalg::rowwise_sum(kernel_matrix);
-        linalg::scale(bias_tmp, bias_tmp, -1.0 / n);
-        float64_t s = linalg::sum(bias_tmp) / n;
-        linalg::add_scalar(bias_tmp, -s);
+    if (m_initialized)
+        cleanup();
+
-        linalg::center_matrix(kernel_matrix);
+    SG_REF(features);
+    m_init_features = features;
 
+    m_kernel->init(features, features);
+    SGMatrix<float64_t> kernel_matrix = m_kernel->get_kernel_matrix();
+    m_kernel->cleanup();
+    int32_t n = kernel_matrix.num_cols;
+    int32_t m = kernel_matrix.num_rows;
+    ASSERT(n == m)
+    if (m_target_dim > n)
+    {
+        SG_SWARNING(
+            "Target dimension (%d) is not a valid value, it must be"
+            "less or equal than the number of vectors."
+            "Setting it to maximum allowed size (%d).",
+            m_target_dim, n);
+        m_target_dim = n;
+    }
 
-        SGVector<float64_t> eigenvalues(m_target_dim);
-        SGMatrix<float64_t> eigenvectors(kernel_matrix.num_rows, m_target_dim);
-        linalg::eigen_solver_symmetric(
-            kernel_matrix, eigenvalues, eigenvectors, m_target_dim);
+    SGVector<float64_t> bias_tmp = linalg::rowwise_sum(kernel_matrix);
+    linalg::scale(bias_tmp, bias_tmp, -1.0 / n);
+    float64_t s = linalg::sum(bias_tmp) / n;
+    linalg::add_scalar(bias_tmp, -s);
 
-        m_transformation_matrix =
-            SGMatrix<float64_t>(kernel_matrix.num_rows, m_target_dim);
-        // eigenvalues are in increasing order
-        for (int32_t i = 0; i < m_target_dim; i++)
-        {
-            //normalize and trap divide by zero and negative eigenvalues
-            auto idx = m_target_dim - i - 1;
-            auto vec = eigenvectors.get_column(idx);
-            linalg::scale(
-                vec, vec, 1.0 / std::sqrt(std::max(std::numeric_limits<float64_t>::epsilon(), eigenvalues[idx])));
-            m_transformation_matrix.set_column(i, vec);
-        }
+    linalg::center_matrix(kernel_matrix);
 
-        m_bias_vector = SGVector<float64_t>(m_target_dim);
-        linalg::matrix_prod(
-            m_transformation_matrix, bias_tmp, m_bias_vector, true);
+    SGVector<float64_t> eigenvalues(m_target_dim);
+    SGMatrix<float64_t> eigenvectors(kernel_matrix.num_rows, m_target_dim);
+    linalg::eigen_solver_symmetric(
+        kernel_matrix, eigenvalues, eigenvectors, m_target_dim);
 
-        m_initialized=true;
-        SG_INFO("Done\n")
-        return true;
+    m_transformation_matrix =
+        SGMatrix<float64_t>(kernel_matrix.num_rows, m_target_dim);
+    // eigenvalues are in increasing order
+    for (int32_t i = 0; i < m_target_dim; i++)
+    {
+        // normalize and trap divide by zero and negative eigenvalues
+        auto idx = m_target_dim - i - 1;
+        auto vec = eigenvectors.get_column(idx);
+        linalg::scale(
+            vec, vec, 1.0 / std::sqrt(std::max(std::numeric_limits<float64_t>::epsilon(), eigenvalues[idx])));
+        m_transformation_matrix.set_column(i, vec);
    }
-    return false;
 
+    m_bias_vector = SGVector<float64_t>(m_target_dim);
+    linalg::matrix_prod(m_transformation_matrix, bias_tmp, m_bias_vector, true);
+
+    m_initialized = true;
+    SG_INFO("Done\n")
 }
 
 SGMatrix<float64_t> CKernelPCA::apply_to_feature_matrix(CFeatures* features)
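A note on the linear algebra in fit() above, for readers tracing the code: bias_tmp precomputes the training-side terms of the standard kernel-matrix double-centering, so that the apply_* methods can center kernel evaluations against new points later. With K the n x n training kernel matrix and 1 the all-ones n-vector, this is the usual kernel PCA construction (my reading of the code; the commit itself does not document it):

% Double-centering performed by linalg::center_matrix:
\tilde{K} = K
  - \tfrac{1}{n}\,\mathbf{1}\mathbf{1}^{\top} K
  - \tfrac{1}{n}\,K\mathbf{1}\mathbf{1}^{\top}
  + \tfrac{1}{n^{2}}\,(\mathbf{1}^{\top} K \mathbf{1})\,\mathbf{1}\mathbf{1}^{\top}

% bias_tmp, built from the row sums of K before centering, collects the
% two terms above that do not depend on the new point:
b = -\tfrac{1}{n}\,K\mathbf{1} + \tfrac{1}{n^{2}}(\mathbf{1}^{\top} K \mathbf{1})\,\mathbf{1},
\qquad \text{m\_bias\_vector} = W^{\top} b

Here W is m_transformation_matrix (the scaled eigenvectors of the centered kernel matrix), matching the final matrix_prod call in the function.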
4 changes: 2 additions & 2 deletions src/shogun/preprocessor/KernelPCA.h
@@ -42,8 +42,8 @@ class CKernelPCA: public CDimensionReductionPreprocessor
 
     virtual ~CKernelPCA();
 
-    /// initialize preprocessor from features
-    virtual bool init(CFeatures* features);
+    virtual void fit(CFeatures* features);
 
     /// cleanup
     virtual void cleanup();
 
66 changes: 30 additions & 36 deletions src/shogun/preprocessor/PCA.cpp
@@ -74,52 +74,46 @@ CPCA::~CPCA()
 {
 }
 
-bool CPCA::init(CFeatures* features)
+void CPCA::fit(CFeatures* features)
 {
-    if (!m_initialized)
-    {
-        REQUIRE(features->get_feature_class()==C_DENSE, "PCA only works with dense features")
-        REQUIRE(features->get_feature_type()==F_DREAL, "PCA only works with real features")
-
-        SGMatrix<float64_t> feature_matrix = ((CDenseFeatures<float64_t>*)features)
-            ->get_feature_matrix();
-        int32_t num_vectors = feature_matrix.num_cols;
-        int32_t num_features = feature_matrix.num_rows;
-        SG_INFO(
-            "num_examples: %d num_features: %d\n", num_vectors, num_features)
+    if (m_initialized)
+        cleanup();
 
-        // max target dim allowed
-        int32_t max_dim_allowed = CMath::min(num_vectors, num_features);
-        num_dim=0;
+    auto feature_matrix =
+        features->as<CDenseFeatures<float64_t>>()->get_feature_matrix();
+    int32_t num_vectors = feature_matrix.num_cols;
+    int32_t num_features = feature_matrix.num_rows;
+    SG_INFO("num_examples: %d num_features: %d\n", num_vectors, num_features)
 
-        REQUIRE(m_target_dim<=max_dim_allowed,
-            "target dimension should be less or equal to than minimum of N and D")
+    // max target dim allowed
+    int32_t max_dim_allowed = CMath::min(num_vectors, num_features);
+    num_dim = 0;
 
-        // center data
-        Map<MatrixXd> fmatrix(feature_matrix.matrix, num_features, num_vectors);
+    REQUIRE(
+        m_target_dim <= max_dim_allowed,
+        "target dimension should be less or equal to than minimum of N and D")
 
-        m_mean_vector = SGVector<float64_t>(num_features);
-        Map<VectorXd> data_mean(m_mean_vector.vector, num_features);
-        data_mean = fmatrix.rowwise().sum()/(float64_t) num_vectors;
-        fmatrix = fmatrix.colwise()-data_mean;
+    // center data
+    Map<MatrixXd> fmatrix(feature_matrix.matrix, num_features, num_vectors);
 
-        m_eigenvalues_vector = SGVector<float64_t>(max_dim_allowed);
+    m_mean_vector = SGVector<float64_t>(num_features);
+    Map<VectorXd> data_mean(m_mean_vector.vector, num_features);
+    data_mean = fmatrix.rowwise().sum() / (float64_t)num_vectors;
+    fmatrix = fmatrix.colwise() - data_mean;
 
-        if (m_method == AUTO)
-            m_method = (num_vectors>num_features) ? EVD : SVD;
+    m_eigenvalues_vector = SGVector<float64_t>(max_dim_allowed);
 
-        if (m_method == EVD)
-            init_with_evd(feature_matrix, max_dim_allowed);
-        else
-            init_with_svd(feature_matrix, max_dim_allowed);
+    if (m_method == AUTO)
+        m_method = (num_vectors > num_features) ? EVD : SVD;
 
-        // restore feature matrix
-        fmatrix = fmatrix.colwise()+data_mean;
-        m_initialized = true;
-        return true;
-    }
+    if (m_method == EVD)
+        init_with_evd(feature_matrix, max_dim_allowed);
+    else
+        init_with_svd(feature_matrix, max_dim_allowed);
 
-    return false;
+    // restore feature matrix
+    fmatrix = fmatrix.colwise() + data_mean;
+    m_initialized = true;
 }
 
 void CPCA::init_with_evd(const SGMatrix<float64_t>& feature_matrix, int32_t max_dim_allowed)
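One remark on the AUTO branch retained above: picking EVD when num_vectors > num_features follows the usual cost argument (not stated in the diff) that the covariance matrix is then the smaller object to decompose, while SVD of the centered data matrix is preferable when features outnumber samples. Up to the normalization convention CPCA uses, both routes yield the same principal directions:

% EVD route: decompose the D x D covariance of the centered data X_c (D x N);
% cheap when N > D because C is only D x D.
C = \tfrac{1}{N-1}\, X_c X_c^{\top} = V \Lambda V^{\top}

% SVD route: decompose X_c directly; the left singular vectors equal V, and
X_c = U \Sigma W^{\top}, \qquad \Lambda = \tfrac{1}{N-1}\,\Sigma^{2}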
5 changes: 1 addition & 4 deletions src/shogun/preprocessor/PCA.h
Expand Up @@ -130,10 +130,7 @@ class CPCA: public CDimensionReductionPreprocessor
/** destructor */
virtual ~CPCA();

/** initialize preprocessor from features
* @param features
*/
virtual bool init(CFeatures* features);
virtual void fit(CFeatures* features);

/** cleanup */
virtual void cleanup();
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/preprocessor/KernelPCA_unittest.cc
@@ -51,7 +51,7 @@ TEST(KernelPCA, apply_to_feature_matrix)
     CKernelPCA* kpca = new CKernelPCA(kernel);
     SG_REF(kpca)
     kpca->set_target_dim(target_dim);
-    kpca->init(train_feats);
+    kpca->fit(train_feats);
 
     SGMatrix<float64_t> embedding = kpca->apply_to_feature_matrix(test_feats);
 
@@ -82,7 +82,7 @@ TEST(KernelPCA, apply_to_feature_vector)
     CKernelPCA* kpca = new CKernelPCA(kernel);
     SG_REF(kpca)
     kpca->set_target_dim(target_dim);
-    kpca->init(train_feats);
+    kpca->fit(train_feats);
 
     SGVector<float64_t> embedding = kpca->apply_to_feature_vector(test_vector);
 
20 changes: 10 additions & 10 deletions tests/unit/preprocessor/PCA_unittest.cc
@@ -54,7 +54,7 @@ TEST(PCA, PCA_N_greater_D_EVD)
     CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
     CPCA* pca=new CPCA(EVD);
     pca->set_target_dim(3);
-    pca->init(features);
+    pca->fit(features);
 
     SGMatrix<float64_t> transmat=pca->get_transformation_matrix();
     SGMatrix<float64_t> finalmat=pca->apply_to_feature_matrix(features);
@@ -118,7 +118,7 @@ TEST(PCA, PCA_N_equals_D_EVD)
     CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
     CPCA* pca=new CPCA(EVD);
     pca->set_target_dim(3);
-    pca->init(features);
+    pca->fit(features);
 
     SGMatrix<float64_t> transmat=pca->get_transformation_matrix();
     SGMatrix<float64_t> finalmat=pca->apply_to_feature_matrix(features);
@@ -182,7 +182,7 @@ TEST(PCA, PCA_N_less_D_EVD)
     CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
     CPCA* pca=new CPCA(EVD);
     pca->set_target_dim(2);
-    pca->init(features);
+    pca->fit(features);
 
     SGMatrix<float64_t> transmat=pca->get_transformation_matrix();
     SGMatrix<float64_t> finalmat=pca->apply_to_feature_matrix(features);
@@ -243,7 +243,7 @@ TEST(PCA, PCA_N_greater_D_SVD)
     CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
     CPCA* pca=new CPCA(SVD);
     pca->set_target_dim(3);
-    pca->init(features);
+    pca->fit(features);
 
     SGMatrix<float64_t> transmat=pca->get_transformation_matrix();
     SGMatrix<float64_t> finalmat=pca->apply_to_feature_matrix(features);
@@ -303,7 +303,7 @@ TEST(PCA, PCA_N_equals_D_SVD)
     CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
     CPCA* pca=new CPCA(SVD);
     pca->set_target_dim(3);
-    pca->init(features);
+    pca->fit(features);
 
     SGMatrix<float64_t> transmat=pca->get_transformation_matrix();
     SGMatrix<float64_t> finalmat=pca->apply_to_feature_matrix(features);
@@ -362,7 +362,7 @@ TEST(PCA, PCA_N_less_D_SVD)
     CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
     CPCA* pca=new CPCA(SVD);
     pca->set_target_dim(2);
-    pca->init(features);
+    pca->fit(features);
 
     SGMatrix<float64_t> transmat=pca->get_transformation_matrix();
     SGMatrix<float64_t> finalmat=pca->apply_to_feature_matrix(features);
@@ -414,7 +414,7 @@ TEST(PCA, PCA_MEM_IN_PLACE)
     CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
     CPCA* pca=new CPCA(AUTO,false,MEM_IN_PLACE);
     pca->set_target_dim(3);
-    pca->init(features);
+    pca->fit(features);
 
     SGMatrix<float64_t> transmat=pca->get_transformation_matrix();
     SGMatrix<float64_t> finalmat=pca->apply_to_feature_matrix(features);
@@ -480,7 +480,7 @@ TEST(PCA, PCA_apply_to_feature_vector_methodTest)
     CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
     CPCA* pca=new CPCA(SVD);
     pca->set_target_dim(2);
-    pca->init(features);
+    pca->fit(features);
 
     SGVector<float64_t> finalVector=pca->apply_to_feature_vector(inputVector);
 
@@ -515,7 +515,7 @@ TEST(PCA, PCA_WHITEN_SVD)
     CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
     CPCA* pca=new CPCA(SVD,true);
     pca->set_target_dim(3);
-    pca->init(features);
+    pca->fit(features);
 
     SGMatrix<float64_t> transmat=pca->get_transformation_matrix();
     SGMatrix<float64_t> finalmat=pca->apply_to_feature_matrix(features);
@@ -594,7 +594,7 @@ TEST(PCA, PCA_WHITEN_EVD)
     CDenseFeatures<float64_t>* features=new CDenseFeatures<float64_t>(data);
     CPCA* pca=new CPCA(EVD,true);
     pca->set_target_dim(3);
-    pca->init(features);
+    pca->fit(features);
 
     SGMatrix<float64_t> finalmat=pca->apply_to_feature_matrix(features);
 
