Add option to compute DotFeatures covariance matrix by matrix product…

… and set it as default. Add unit tests for DotFeatures::get_cov/compute_cov.
shogun-toolbox · Jul 10, 2017 · a26f9f1 · a26f9f1
1 parent 5e5ebb0
commit a26f9f1
Show file tree

Hide file tree

Showing 3 changed files with 192 additions and 34 deletions.
diff --git a/src/shogun/features/DotFeatures.cpp b/src/shogun/features/DotFeatures.cpp
@@ -305,42 +305,52 @@ CDotFeatures::compute_mean(CDotFeatures* lhs, CDotFeatures* rhs)
 	return mean;
 }
 
-SGMatrix<float64_t> CDotFeatures::get_cov()
+SGMatrix<float64_t> CDotFeatures::get_cov(bool copy_data_for_speed)
 {
 	int32_t num=get_num_vectors();
 	int32_t dim=get_dim_feature_space();
 	ASSERT(num>0)
 	ASSERT(dim>0)
 
 	SGMatrix<float64_t> cov(dim, dim);
-	linalg::zero(cov);
-
 	SGVector<float64_t> mean = get_mean();
-	for (int i = 0; i < num; i++)
+
+	if (copy_data_for_speed)
 	{
-		SGVector<float64_t> v = get_computed_dot_feature_vector(i);
-		linalg::add(v, mean, v, 1.0, -1.0);
-		for (int m = 0; m < v.vlen; m++)
+		SGMatrix<float64_t> centered_data(dim, num);
+		for (int i = 0; i < num; i++)
 		{
-			for (int n = 0; n <= m ; n++)
-			{
-				(cov.matrix)[m*v.vlen+n] += v.vector[m]*v.vector[n];
-			}
+			SGVector<float64_t> v = get_computed_dot_feature_vector(i);
+			centered_data.set_column(i, linalg::add(v, mean, 1.0, -1.0));
 		}
+
+		cov = linalg::matrix_prod(centered_data, centered_data, false, true);
 	}
-	for (int m = 0; m < dim-1; m++)
+	else
 	{
-		for (int n = m+1; n < dim; n++)
+		linalg::zero(cov);
+		for (int i = 0; i < num; i++)
+		{
+			SGVector<float64_t> v = get_computed_dot_feature_vector(i);
+			linalg::add(v, mean, v, 1.0, -1.0);
+			for (int m = 0; m < v.vlen; m++)
+				linalg::add_col_vec(cov, m, v, cov, 1.0, v.vector[m]);
+		}
+		for (int m = 0; m < dim - 1; m++)
 		{
-			(cov.matrix)[m*dim+n] = (cov.matrix)[n*dim+m];
+			for (int n = m + 1; n < dim; n++)
+			{
+				(cov.matrix)[m * dim + n] = (cov.matrix)[n * dim + m];
+			}
 		}
 	}
 	linalg::scale(cov, cov, 1.0 / num);
 
 	return cov;
 }
 
-SGMatrix<float64_t> CDotFeatures::compute_cov(CDotFeatures* lhs, CDotFeatures* rhs)
+SGMatrix<float64_t> CDotFeatures::compute_cov(
+    CDotFeatures* lhs, CDotFeatures* rhs, bool copy_data_for_speed)
 {
 	CDotFeatures* feats[2];
 	feats[0]=lhs;
@@ -361,34 +371,46 @@ SGMatrix<float64_t> CDotFeatures::compute_cov(CDotFeatures* lhs, CDotFeatures* r
 	int32_t dim = dims[0];
 
 	SGMatrix<float64_t> cov(dim, dim);
-	linalg::zero(cov);
-
 	SGVector<float64_t> mean = compute_mean(lhs, rhs);
 
-	for (int i = 0; i < 2; i++)
+	if (copy_data_for_speed)
 	{
-		for (int j = 0; j < nums[i]; j++)
+		SGMatrix<float64_t> centered_data(dim, num);
+		for (int i = 0; i < num; i++)
 		{
-			SGVector<float64_t> v = feats[i]->get_computed_dot_feature_vector(j);
-			linalg::add(v, mean, v, 1.0, -1.0);
-			for (int m = 0; m < v.vlen; m++)
+			SGVector<float64_t> v =
+			    i < nums[0] ? lhs->get_computed_dot_feature_vector(i)
+			                : rhs->get_computed_dot_feature_vector(i - nums[0]);
+
+			centered_data.set_column(i, linalg::add(v, mean, 1.0, -1.0));
+		}
+
+		cov = linalg::matrix_prod(centered_data, centered_data, false, true);
+	}
+	else
+	{
+		linalg::zero(cov);
+		for (int i = 0; i < 2; i++)
+		{
+			for (int j = 0; j < nums[i]; j++)
 			{
-				for (int n = 0; n <= m; n++)
-				{
-					(cov.matrix)[m*v.vlen+n] += v.vector[m]*v.vector[n];
-				}
+				SGVector<float64_t> v =
+				    feats[i]->get_computed_dot_feature_vector(j);
+				linalg::add(v, mean, v, 1.0, -1.0);
+				for (int m = 0; m < v.vlen; m++)
+					linalg::add_col_vec(cov, m, v, cov, 1.0, v.vector[m]);
 			}
 		}
-	}
-	linalg::scale(cov, cov, 1.0 / num);
 
-	for (int m = 0; m < dim-1; m++)
-	{
-		for (int n = m+1; n < dim; n++)
+		for (int m = 0; m < dim - 1; m++)
 		{
-			(cov.matrix[m*dim+n]) = (cov.matrix)[n*dim+m];
+			for (int n = m + 1; n < dim; n++)
+			{
+				(cov.matrix[m * dim + n]) = (cov.matrix)[n * dim + m];
+			}
 		}
 	}
+	linalg::scale(cov, cov, 1.0 / num);
 
 	return cov;
 }

diff --git a/src/shogun/features/DotFeatures.h b/src/shogun/features/DotFeatures.h
@@ -218,15 +218,26 @@ class CDotFeatures : public CFeatures
 
 		/** get covariance
 		 *
+		 * @param copy_data_for_speed if true, the method stores explicitly
+		 * the centered data matrix and the covariance is calculated by matrix
+		 * product of the centered data with its transpose, this make it
+		 * possible to take advantage of multithreaded matrix product,
+		 * this may not be possible if the data doesn't fit into memory,
+		 * in such case set this parameter to false to compute iteratively
+		 * the covariance matrix without storing the centered data.
+		 * [default = true]
 		 * @return covariance
 		 */
-		virtual SGMatrix<float64_t> get_cov();
+		virtual SGMatrix<float64_t> get_cov(bool copy_data_for_speed = true);
 
 		/** compute the covariance of two CDotFeatures together
 		 *
+		 * @param copy_data_for_speed @see CDotFeatures::get_cov
 		 * @return covariance
 		 */
-		static SGMatrix<float64_t> compute_cov(CDotFeatures* lhs, CDotFeatures* rhs);
+		static SGMatrix<float64_t> compute_cov(
+		    CDotFeatures* lhs, CDotFeatures* rhs,
+		    bool copy_data_for_speed = true);
 
 	private:
 		void init();

diff --git a/tests/unit/features/DotFeatures_unittest.cc b/tests/unit/features/DotFeatures_unittest.cc
@@ -0,0 +1,125 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <gtest/gtest.h>
+#include <shogun/features/DenseFeatures.h>
+#include <shogun/features/DotFeatures.h>
+
+using namespace shogun;
+
+class DotFeaturesTest : public ::testing::Test
+{
+protected:
+	virtual void SetUp()
+	{
+		SGMatrix<float64_t> data_a(dims, num_a);
+		data_a(0, 0) = 1.01611997;
+		data_a(1, 0) = 0.88935567;
+		data_a(2, 0) = -0.53592717;
+		data_a(0, 1) = 0.24132379;
+		data_a(1, 1) = 0.50475675;
+		data_a(2, 1) = 0.66029218;
+		data_a(0, 2) = 0.776238;
+		data_a(1, 2) = 0.19904003;
+		data_a(2, 2) = -0.60085628;
+		data_a(0, 3) = 0.86905328;
+		data_a(1, 3) = -1.22505732;
+		data_a(2, 3) = -1.12045593;
+		data_a(0, 4) = -0.60848342;
+		data_a(1, 4) = -1.45115708;
+		data_a(2, 4) = 1.15711328;
+		feats_a = new CDenseFeatures<float64_t>(data_a);
+		SG_REF(feats_a);
+
+		SGMatrix<float64_t> data_b(dims, num_b);
+		data_b(0, 0) = 0.14210129;
+		data_b(1, 0) = -0.36770534;
+		data_b(2, 0) = 0.80232687;
+		data_b(0, 1) = -0.10386986;
+		data_b(1, 1) = 0.3970658;
+		data_b(2, 1) = 1.15765292;
+		data_b(0, 2) = 1.22478326;
+		data_b(1, 2) = 0.61167198;
+		data_b(2, 2) = 0.49287339;
+		data_b(0, 3) = 0.04932024;
+		data_b(1, 3) = -1.0330936;
+		data_b(2, 3) = -0.87217125;
+		feats_b = new CDenseFeatures<float64_t>(data_b);
+		SG_REF(feats_b);
+
+		ref_cov_a = SGMatrix<float64_t>(dims, dims);
+		ref_cov_a(0, 0) = 0.353214;
+		ref_cov_a(1, 0) = 0.29906652;
+		ref_cov_a(2, 0) = -0.46552636;
+		ref_cov_a(0, 1) = 0.29906652;
+		ref_cov_a(1, 1) = 0.8914735;
+		ref_cov_a(2, 1) = -0.13294825;
+		ref_cov_a(0, 2) = -0.46552636;
+		ref_cov_a(1, 2) = -0.13294825;
+		ref_cov_a(2, 2) = 0.72797476;
+
+		ref_cov_ab = SGMatrix<float64_t>(dims, dims);
+		ref_cov_ab(0, 0) = 0.32300248;
+		ref_cov_ab(1, 0) = 0.24380185;
+		ref_cov_ab(2, 0) = -0.27024556;
+		ref_cov_ab(0, 1) = 0.24380185;
+		ref_cov_ab(1, 1) = 0.68716546;
+		ref_cov_ab(2, 1) = 0.10940845;
+		ref_cov_ab(0, 2) = -0.27024556;
+		ref_cov_ab(1, 2) = 0.10940845;
+		ref_cov_ab(2, 2) = 0.72460503;
+	}
+
+	virtual void TearDown()
+	{
+		SG_UNREF(feats_a);
+		SG_UNREF(feats_b);
+	}
+
+	const index_t num_a = 5;
+	const index_t num_b = 4;
+	const index_t dims = 3;
+	const float64_t eps = 1e-8;
+
+	CDenseFeatures<float64_t>* feats_a;
+	CDenseFeatures<float64_t>* feats_b;
+	SGMatrix<float64_t> ref_cov_a;
+	SGMatrix<float64_t> ref_cov_ab;
+};
+
+TEST_F(DotFeaturesTest, get_cov)
+{
+	auto cov = feats_a->CDotFeatures::get_cov();
+
+	for (index_t i = 0; i < (index_t)cov.size(); ++i)
+		EXPECT_NEAR(cov[i], ref_cov_a[i], eps);
+}
+
+TEST_F(DotFeaturesTest, get_cov_nocopy)
+{
+	auto cov = feats_a->CDotFeatures::get_cov(false);
+
+	for (index_t i = 0; i < (index_t)cov.size(); ++i)
+		EXPECT_NEAR(cov[i], ref_cov_a[i], eps);
+}
+
+TEST_F(DotFeaturesTest, compute_cov)
+{
+	auto cov = CDotFeatures::compute_cov(feats_a, feats_b);
+
+	for (index_t i = 0; i < (index_t)cov.size(); ++i)
+		EXPECT_NEAR(cov[i], ref_cov_ab[i], eps);
+}
+
+TEST_F(DotFeaturesTest, compute_cov_nocopy)
+{
+	auto cov = CDotFeatures::compute_cov(feats_a, feats_b, false);
+
+	for (index_t i = 0; i < (index_t)cov.size(); ++i)
+		EXPECT_NEAR(cov[i], ref_cov_ab[i], eps);
+}