diff --git a/src/shogun/features/DotFeatures.cpp b/src/shogun/features/DotFeatures.cpp index bfb4a001b90..13dd2c2001c 100644 --- a/src/shogun/features/DotFeatures.cpp +++ b/src/shogun/features/DotFeatures.cpp @@ -305,7 +305,7 @@ CDotFeatures::compute_mean(CDotFeatures* lhs, CDotFeatures* rhs) return mean; } -SGMatrix CDotFeatures::get_cov() +SGMatrix CDotFeatures::get_cov(bool copy_data_for_speed) { int32_t num=get_num_vectors(); int32_t dim=get_dim_feature_space(); @@ -313,26 +313,35 @@ SGMatrix CDotFeatures::get_cov() ASSERT(dim>0) SGMatrix cov(dim, dim); - linalg::zero(cov); - SGVector mean = get_mean(); - for (int i = 0; i < num; i++) + + if (copy_data_for_speed) { - SGVector v = get_computed_dot_feature_vector(i); - linalg::add(v, mean, v, 1.0, -1.0); - for (int m = 0; m < v.vlen; m++) + SGMatrix centered_data(dim, num); + for (int i = 0; i < num; i++) { - for (int n = 0; n <= m ; n++) - { - (cov.matrix)[m*v.vlen+n] += v.vector[m]*v.vector[n]; - } + SGVector v = get_computed_dot_feature_vector(i); + centered_data.set_column(i, linalg::add(v, mean, 1.0, -1.0)); } + + cov = linalg::matrix_prod(centered_data, centered_data, false, true); } - for (int m = 0; m < dim-1; m++) + else { - for (int n = m+1; n < dim; n++) + linalg::zero(cov); + for (int i = 0; i < num; i++) + { + SGVector v = get_computed_dot_feature_vector(i); + linalg::add(v, mean, v, 1.0, -1.0); + for (int m = 0; m < v.vlen; m++) + linalg::add_col_vec(cov, m, v, cov, 1.0, v.vector[m]); + } + for (int m = 0; m < dim - 1; m++) { - (cov.matrix)[m*dim+n] = (cov.matrix)[n*dim+m]; + for (int n = m + 1; n < dim; n++) + { + (cov.matrix)[m * dim + n] = (cov.matrix)[n * dim + m]; + } } } linalg::scale(cov, cov, 1.0 / num); @@ -340,7 +349,8 @@ SGMatrix CDotFeatures::get_cov() return cov; } -SGMatrix CDotFeatures::compute_cov(CDotFeatures* lhs, CDotFeatures* rhs) +SGMatrix CDotFeatures::compute_cov( + CDotFeatures* lhs, CDotFeatures* rhs, bool copy_data_for_speed) { CDotFeatures* feats[2]; feats[0]=lhs; @@ -361,34 +371,46 @@ SGMatrix CDotFeatures::compute_cov(CDotFeatures* lhs, CDotFeatures* r int32_t dim = dims[0]; SGMatrix cov(dim, dim); - linalg::zero(cov); - SGVector mean = compute_mean(lhs, rhs); - for (int i = 0; i < 2; i++) + if (copy_data_for_speed) { - for (int j = 0; j < nums[i]; j++) + SGMatrix centered_data(dim, num); + for (int i = 0; i < num; i++) { - SGVector v = feats[i]->get_computed_dot_feature_vector(j); - linalg::add(v, mean, v, 1.0, -1.0); - for (int m = 0; m < v.vlen; m++) + SGVector v = + i < nums[0] ? lhs->get_computed_dot_feature_vector(i) + : rhs->get_computed_dot_feature_vector(i - nums[0]); + + centered_data.set_column(i, linalg::add(v, mean, 1.0, -1.0)); + } + + cov = linalg::matrix_prod(centered_data, centered_data, false, true); + } + else + { + linalg::zero(cov); + for (int i = 0; i < 2; i++) + { + for (int j = 0; j < nums[i]; j++) { - for (int n = 0; n <= m; n++) - { - (cov.matrix)[m*v.vlen+n] += v.vector[m]*v.vector[n]; - } + SGVector v = + feats[i]->get_computed_dot_feature_vector(j); + linalg::add(v, mean, v, 1.0, -1.0); + for (int m = 0; m < v.vlen; m++) + linalg::add_col_vec(cov, m, v, cov, 1.0, v.vector[m]); } } - } - linalg::scale(cov, cov, 1.0 / num); - for (int m = 0; m < dim-1; m++) - { - for (int n = m+1; n < dim; n++) + for (int m = 0; m < dim - 1; m++) { - (cov.matrix[m*dim+n]) = (cov.matrix)[n*dim+m]; + for (int n = m + 1; n < dim; n++) + { + (cov.matrix[m * dim + n]) = (cov.matrix)[n * dim + m]; + } } } + linalg::scale(cov, cov, 1.0 / num); return cov; } diff --git a/src/shogun/features/DotFeatures.h b/src/shogun/features/DotFeatures.h index 4da1287f146..c2068cc5392 100644 --- a/src/shogun/features/DotFeatures.h +++ b/src/shogun/features/DotFeatures.h @@ -218,15 +218,26 @@ class CDotFeatures : public CFeatures /** get covariance * + * @param copy_data_for_speed if true, the method stores explicitly + * the centered data matrix and the covariance is calculated by matrix + * product of the centered data with its transpose, this make it + * possible to take advantage of multithreaded matrix product, + * this may not be possible if the data doesn't fit into memory, + * in such case set this parameter to false to compute iteratively + * the covariance matrix without storing the centered data. + * [default = true] * @return covariance */ - virtual SGMatrix get_cov(); + virtual SGMatrix get_cov(bool copy_data_for_speed = true); /** compute the covariance of two CDotFeatures together * + * @param copy_data_for_speed @see CDotFeatures::get_cov * @return covariance */ - static SGMatrix compute_cov(CDotFeatures* lhs, CDotFeatures* rhs); + static SGMatrix compute_cov( + CDotFeatures* lhs, CDotFeatures* rhs, + bool copy_data_for_speed = true); private: void init(); diff --git a/tests/unit/features/DotFeatures_unittest.cc b/tests/unit/features/DotFeatures_unittest.cc new file mode 100644 index 00000000000..edef15f2c77 --- /dev/null +++ b/tests/unit/features/DotFeatures_unittest.cc @@ -0,0 +1,125 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + */ + +#include +#include +#include + +using namespace shogun; + +class DotFeaturesTest : public ::testing::Test +{ +protected: + virtual void SetUp() + { + SGMatrix data_a(dims, num_a); + data_a(0, 0) = 1.01611997; + data_a(1, 0) = 0.88935567; + data_a(2, 0) = -0.53592717; + data_a(0, 1) = 0.24132379; + data_a(1, 1) = 0.50475675; + data_a(2, 1) = 0.66029218; + data_a(0, 2) = 0.776238; + data_a(1, 2) = 0.19904003; + data_a(2, 2) = -0.60085628; + data_a(0, 3) = 0.86905328; + data_a(1, 3) = -1.22505732; + data_a(2, 3) = -1.12045593; + data_a(0, 4) = -0.60848342; + data_a(1, 4) = -1.45115708; + data_a(2, 4) = 1.15711328; + feats_a = new CDenseFeatures(data_a); + SG_REF(feats_a); + + SGMatrix data_b(dims, num_b); + data_b(0, 0) = 0.14210129; + data_b(1, 0) = -0.36770534; + data_b(2, 0) = 0.80232687; + data_b(0, 1) = -0.10386986; + data_b(1, 1) = 0.3970658; + data_b(2, 1) = 1.15765292; + data_b(0, 2) = 1.22478326; + data_b(1, 2) = 0.61167198; + data_b(2, 2) = 0.49287339; + data_b(0, 3) = 0.04932024; + data_b(1, 3) = -1.0330936; + data_b(2, 3) = -0.87217125; + feats_b = new CDenseFeatures(data_b); + SG_REF(feats_b); + + ref_cov_a = SGMatrix(dims, dims); + ref_cov_a(0, 0) = 0.353214; + ref_cov_a(1, 0) = 0.29906652; + ref_cov_a(2, 0) = -0.46552636; + ref_cov_a(0, 1) = 0.29906652; + ref_cov_a(1, 1) = 0.8914735; + ref_cov_a(2, 1) = -0.13294825; + ref_cov_a(0, 2) = -0.46552636; + ref_cov_a(1, 2) = -0.13294825; + ref_cov_a(2, 2) = 0.72797476; + + ref_cov_ab = SGMatrix(dims, dims); + ref_cov_ab(0, 0) = 0.32300248; + ref_cov_ab(1, 0) = 0.24380185; + ref_cov_ab(2, 0) = -0.27024556; + ref_cov_ab(0, 1) = 0.24380185; + ref_cov_ab(1, 1) = 0.68716546; + ref_cov_ab(2, 1) = 0.10940845; + ref_cov_ab(0, 2) = -0.27024556; + ref_cov_ab(1, 2) = 0.10940845; + ref_cov_ab(2, 2) = 0.72460503; + } + + virtual void TearDown() + { + SG_UNREF(feats_a); + SG_UNREF(feats_b); + } + + const index_t num_a = 5; + const index_t num_b = 4; + const index_t dims = 3; + const float64_t eps = 1e-8; + + CDenseFeatures* feats_a; + CDenseFeatures* feats_b; + SGMatrix ref_cov_a; + SGMatrix ref_cov_ab; +}; + +TEST_F(DotFeaturesTest, get_cov) +{ + auto cov = feats_a->CDotFeatures::get_cov(); + + for (index_t i = 0; i < (index_t)cov.size(); ++i) + EXPECT_NEAR(cov[i], ref_cov_a[i], eps); +} + +TEST_F(DotFeaturesTest, get_cov_nocopy) +{ + auto cov = feats_a->CDotFeatures::get_cov(false); + + for (index_t i = 0; i < (index_t)cov.size(); ++i) + EXPECT_NEAR(cov[i], ref_cov_a[i], eps); +} + +TEST_F(DotFeaturesTest, compute_cov) +{ + auto cov = CDotFeatures::compute_cov(feats_a, feats_b); + + for (index_t i = 0; i < (index_t)cov.size(); ++i) + EXPECT_NEAR(cov[i], ref_cov_ab[i], eps); +} + +TEST_F(DotFeaturesTest, compute_cov_nocopy) +{ + auto cov = CDotFeatures::compute_cov(feats_a, feats_b, false); + + for (index_t i = 0; i < (index_t)cov.size(); ++i) + EXPECT_NEAR(cov[i], ref_cov_ab[i], eps); +}