From c4d66b5dd9129679442e1b9edc7d592eabc94d84 Mon Sep 17 00:00:00 2001 From: lambday Date: Thu, 19 May 2016 19:47:24 +0100 Subject: [PATCH] added permutation without storing the kernel matrix --- .../statistical_testing/QuadraticTimeMMD.cpp | 120 +++++++++++++----- .../statistical_testing/QuadraticTimeMMD.h | 2 + .../mmd/WithinBlockPermutationBatch.cpp | 7 +- .../mmd/WithinBlockPermutationBatch.h | 3 +- .../QuadraticTimeMMD_unittest.cc | 37 ++++++ .../WithinBlockPermutationBatch_unittest.cc | 52 +++++++- 6 files changed, 182 insertions(+), 39 deletions(-) diff --git a/src/shogun/statistical_testing/QuadraticTimeMMD.cpp b/src/shogun/statistical_testing/QuadraticTimeMMD.cpp index a742b1a6053..acde91a22e2 100644 --- a/src/shogun/statistical_testing/QuadraticTimeMMD.cpp +++ b/src/shogun/statistical_testing/QuadraticTimeMMD.cpp @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,7 @@ struct CQuadraticTimeMMD::Self SGMatrix get_kernel_matrix(); std::pair compute_statistic_variance(); SGVector sample_null(); + void init_kernel(); void create_statistic_job(); void create_variance_job(); @@ -68,12 +70,15 @@ struct CQuadraticTimeMMD::Self CQuadraticTimeMMD& owner; index_t num_eigenvalues; + bool precompute; + bool is_kernel_initialized; std::function&)> statistic_job; std::function&)> variance_job; }; CQuadraticTimeMMD::Self::Self(CQuadraticTimeMMD& mmd) : owner(mmd), num_eigenvalues(10), + precompute(true), is_kernel_initialized(false), statistic_job(nullptr), variance_job(nullptr) { } @@ -133,54 +138,28 @@ void CQuadraticTimeMMD::Self::compute_jobs(ComputationManager& cm) const SG_SDEBUG("Leaving\n"); } -SGMatrix CQuadraticTimeMMD::Self::get_kernel_matrix() +void CQuadraticTimeMMD::Self::init_kernel() { SG_SDEBUG("Entering\n"); const KernelManager& km=owner.get_kernel_manager(); auto kernel=km.kernel_at(0); REQUIRE(kernel!=nullptr, "Kernel is not set!\n"); - SGMatrix kernel_matrix; - - // check if precomputed kernel is given, no need to do anything in that case - // otherwise, init kernel with data and precompute kernel matrix - if (kernel->get_kernel_type()==K_CUSTOM) - { - auto precomputed_kernel=dynamic_cast(kernel); - ASSERT(precomputed_kernel!=nullptr); - kernel_matrix=precomputed_kernel->get_float32_kernel_matrix(); - } - else + if (!is_kernel_initialized && !(kernel->get_kernel_type()==K_CUSTOM)) { DataManager& dm=owner.get_data_manager(); - - // using data manager next() API in order to make it work with - // streaming samples as well. dm.start(); auto samples=dm.next(); if (!samples.empty()) { dm.end(); - - // use 0th block from each distribution (since there is only one block - // for quadratic time MMD CFeatures *samples_p=samples[0][0].get(); CFeatures *samples_q=samples[1][0].get(); auto samples_p_and_q=FeaturesUtil::create_merged_copy(samples_p, samples_q); samples.clear(); kernel->init(samples_p_and_q, samples_p_and_q); - try - { - owner.get_kernel_manager().precompute_kernel_at(0); - } - catch (ShogunException e) - { - SG_SERROR("%s, Data is too large! Computing kernel matrix was not possible!\n", e.get_exception_string()); - } - kernel->remove_lhs_and_rhs(); - auto precomputed_kernel=dynamic_cast(km.kernel_at(0)); - ASSERT(precomputed_kernel!=nullptr); - kernel_matrix=precomputed_kernel->get_float32_kernel_matrix(); + is_kernel_initialized=true; + SG_SDEBUG("Kernel is initialized with joint features of %d total samples!\n", samples_p_and_q->get_num_vectors()); } else { @@ -188,6 +167,39 @@ SGMatrix CQuadraticTimeMMD::Self::get_kernel_matrix() SG_SERROR("Could not fetch samples!\n"); } } + SG_SDEBUG("Leaving\n"); +} + +SGMatrix CQuadraticTimeMMD::Self::get_kernel_matrix() +{ + SG_SDEBUG("Entering\n"); + const KernelManager& km=owner.get_kernel_manager(); + auto kernel=km.kernel_at(0); + REQUIRE(kernel!=nullptr, "Kernel is not set!\n"); + + SGMatrix kernel_matrix; + if (kernel->get_kernel_type()==K_CUSTOM) + { + auto precomputed_kernel=dynamic_cast(kernel); + ASSERT(precomputed_kernel!=nullptr); + kernel_matrix=precomputed_kernel->get_float32_kernel_matrix(); + } + else + { + init_kernel(); + try + { + owner.get_kernel_manager().precompute_kernel_at(0); + } + catch (ShogunException e) + { + SG_SERROR("%s, Data is too large! Computing kernel matrix was not possible!\n", e.get_exception_string()); + } + kernel->remove_lhs_and_rhs(); + auto precomputed_kernel=dynamic_cast(km.kernel_at(0)); + ASSERT(precomputed_kernel!=nullptr); + kernel_matrix=precomputed_kernel->get_float32_kernel_matrix(); + } SG_SDEBUG("Leaving\n"); return kernel_matrix; @@ -225,17 +237,35 @@ std::pair CQuadraticTimeMMD::Self::compute_statistic_varia SGVector CQuadraticTimeMMD::Self::sample_null() { SG_SDEBUG("Entering\n"); - SGMatrix kernel_matrix=get_kernel_matrix(); const DataManager& dm=owner.get_data_manager(); auto Nx=dm.num_samples_at(0); auto Ny=dm.num_samples_at(1); - WithinBlockPermutationBatch compute(Nx, Ny, owner.get_num_null_samples(), owner.get_statistic_type()); - SGVector result=compute(kernel_matrix); + + SGVector result; + if (precompute) + { + SGMatrix kernel_matrix=get_kernel_matrix(); + result=compute(kernel_matrix); + } + else + { + const KernelManager& km=owner.get_kernel_manager(); + auto kernel=km.kernel_at(0); + REQUIRE(kernel!=nullptr, "Kernel is not set!\n"); + if (kernel->get_kernel_type()==K_CUSTOM) + { + SG_SERROR("Precomputed kernel matrix exists!\n"); + } + init_kernel(); + result=compute(internal::Kernel(kernel)); + } + SGVector null_samples(result.vlen); for (auto i=0; i CQuadraticTimeMMD::spectrum_sample_null() return null_samples; } +void CQuadraticTimeMMD::precompute_kernel_matrix(bool precompute) +{ + if (self->precompute && !precompute) + { + const KernelManager& km=get_kernel_manager(); + auto kernel=km.kernel_at(0); + REQUIRE(kernel!=nullptr, "Kernel is not set!\n"); + + if (kernel->get_kernel_type()==K_CUSTOM) + { + SG_SINFO("Precomputed kernel matrix exists! Removing the existing matrix!\n"); + get_kernel_manager().restore_kernel_at(0); + kernel=km.kernel_at(0); + REQUIRE(kernel!=nullptr, "Kernel is not set!\n"); + if (kernel->get_kernel_type()==K_CUSTOM) + { + SG_SERROR("The underlying kernel itself is a precomputed kernel!\n"); + } + } + self->is_kernel_initialized=false; + } + self->precompute=precompute; +} + const char* CQuadraticTimeMMD::get_name() const { return "QuadraticTimeMMD"; diff --git a/src/shogun/statistical_testing/QuadraticTimeMMD.h b/src/shogun/statistical_testing/QuadraticTimeMMD.h index 18ab8e0890b..a8cc5784427 100644 --- a/src/shogun/statistical_testing/QuadraticTimeMMD.h +++ b/src/shogun/statistical_testing/QuadraticTimeMMD.h @@ -59,6 +59,8 @@ class CQuadraticTimeMMD : public CMMD virtual float64_t compute_p_value(float64_t statistic) override; virtual float64_t compute_threshold(float64_t alpha) override; + void precompute_kernel_matrix(bool precompute); + virtual const char* get_name() const; private: struct Self; diff --git a/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutationBatch.cpp b/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutationBatch.cpp index ae19bd0870e..c59cf8e1bd1 100644 --- a/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutationBatch.cpp +++ b/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutationBatch.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -73,7 +74,8 @@ void WithinBlockPermutationBatch::add_term(terms_t& t, float32_t val, index_t i, } } -SGVector WithinBlockPermutationBatch::operator()(const SGMatrix& km) +template +SGVector WithinBlockPermutationBatch::operator()(const Kernel& km) { SG_SDEBUG("Entering!\n"); @@ -134,3 +136,6 @@ SGVector WithinBlockPermutationBatch::operator()(const SGMatrix WithinBlockPermutationBatch::operator()(const SGMatrix&); +template SGVector WithinBlockPermutationBatch::operator()(const Kernel&); diff --git a/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutationBatch.h b/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutationBatch.h index fe1bb680546..046ea9b459a 100644 --- a/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutationBatch.h +++ b/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutationBatch.h @@ -42,8 +42,7 @@ class WithinBlockPermutationBatch using return_type=SGVector; public: WithinBlockPermutationBatch(index_t, index_t, index_t, EStatisticType); - return_type operator()(const SGMatrix& kernel_matrix); -// return_type operator()(const CGPUMatrix& kernel_matrix); + template return_type operator()(const Kernel& kernel); private: struct terms_t; void add_term(terms_t&, float32_t, index_t, index_t); diff --git a/tests/unit/statistical_testing/QuadraticTimeMMD_unittest.cc b/tests/unit/statistical_testing/QuadraticTimeMMD_unittest.cc index 0f19454b06b..01cc149ea89 100644 --- a/tests/unit/statistical_testing/QuadraticTimeMMD_unittest.cc +++ b/tests/unit/statistical_testing/QuadraticTimeMMD_unittest.cc @@ -456,3 +456,40 @@ TEST(QuadraticTimeMMD, perform_test_spectrum) p_value_spectrum=mmd->compute_p_value(mmd->compute_statistic()); EXPECT_NEAR(p_value_spectrum, 0.0, 1E-10); } + +TEST(QuadraticTimeMMD, precomputed_vs_nonprecomputed) +{ + const index_t m=20; + const index_t n=20; + const index_t dim=3; + + float64_t difference=0.5; + + auto gen_p=some(0, dim, 0); + auto gen_q=some(difference, dim, 0); + + CFeatures* feat_p=gen_p->get_streamed_features(m); + CFeatures* feat_q=gen_q->get_streamed_features(n); + + float64_t sigma=2; + float64_t sq_sigma_twice=sigma*sigma*2; + CGaussianKernel* kernel=new CGaussianKernel(10, sq_sigma_twice); + + auto mmd=some(feat_p, feat_q); + mmd->set_kernel(kernel); + + index_t num_null_samples=10; + mmd->set_num_null_samples(num_null_samples); + mmd->set_null_approximation_method(NAM_PERMUTATION); + + sg_rand->set_seed(12345); + SGVector result_1=mmd->sample_null(); + + mmd->precompute_kernel_matrix(false); + sg_rand->set_seed(12345); + SGVector result_2=mmd->sample_null(); + + ASSERT_EQ(result_1.size(), result_2.size()); + for (auto i=0; i #include #include +#include #include #include #include @@ -50,9 +51,9 @@ using namespace Eigen; TEST(WithinBlockPermutationBatch, biased_full) { - const index_t dim=2; - const index_t n=13; - const index_t m=7; + const index_t dim=3000; + const index_t n=1500; + const index_t m=1500; const index_t num_null_samples=5; using operation=std::function)>; @@ -280,3 +281,48 @@ TEST(WithinBlockPermutationBatch, unbiased_incomplete) SG_UNREF(feats); } + +TEST(WithinBlockPermutationBatch, kernel_functor) +{ + const index_t dim=2; + const index_t n=8; + const index_t m=8; + const index_t num_null_samples=5; + + SGMatrix data_p(dim, n); + std::iota(data_p.matrix, data_p.matrix+dim*n, 1); + std::for_each(data_p.matrix, data_p.matrix+dim*n, [&n](float64_t& val) { val/=n; }); + + SGMatrix data_q(dim, m); + std::iota(data_q.matrix, data_q.matrix+dim*m, n+1); + std::for_each(data_q.matrix, data_q.matrix+dim*m, [&m](float64_t& val) { val/=2*m; }); + + auto feats_p=new CDenseFeatures(data_p); + auto feats_q=new CDenseFeatures(data_q); + auto feats=feats_p->create_merged_copy(feats_q); + SG_REF(feats); + SG_UNREF(feats_p); + SG_UNREF(feats_q); + + auto kernel=some(); + kernel->set_width(2.0); + + kernel->init(feats, feats); + auto mat=kernel->get_kernel_matrix(); + + // compute using within-block-permutation functor + shogun::internal::mmd::WithinBlockPermutationBatch batch(n, m, num_null_samples, ST_BIASED_FULL); + + sg_rand->set_seed(12345); + SGVector result_1=batch(mat); + + sg_rand->set_seed(12345); + SGVector result_2=batch(shogun::internal::Kernel(kernel)); + + EXPECT_TRUE(result_1.size()==result_2.size()); + for (auto i=0; i