diff --git a/src/shogun/statistical_testing/MMD.cpp b/src/shogun/statistical_testing/MMD.cpp index 28552587e1e..9c61d0434ae 100644 --- a/src/shogun/statistical_testing/MMD.cpp +++ b/src/shogun/statistical_testing/MMD.cpp @@ -93,18 +93,16 @@ void CMMD::Self::create_statistic_job() { case EStatisticType::UNBIASED_FULL: statistic_job = mmd::UnbiasedFull(Bx); - permutation_job = mmd::WithinBlockPermutation(Bx); break; case EStatisticType::UNBIASED_INCOMPLETE: statistic_job = mmd::UnbiasedIncomplete(Bx); - permutation_job = mmd::WithinBlockPermutation(Bx); break; case EStatisticType::BIASED_FULL: statistic_job = mmd::BiasedFull(Bx); - permutation_job = mmd::WithinBlockPermutation(Bx); break; default : break; }; + permutation_job = mmd::WithinBlockPermutation(Bx, statistic_type); } void CMMD::Self::create_variance_job() diff --git a/src/shogun/statistical_testing/QuadraticTimeMMD.cpp b/src/shogun/statistical_testing/QuadraticTimeMMD.cpp index 00fecdfa487..0d1ae1780d0 100644 --- a/src/shogun/statistical_testing/QuadraticTimeMMD.cpp +++ b/src/shogun/statistical_testing/QuadraticTimeMMD.cpp @@ -93,18 +93,16 @@ void CQuadraticTimeMMD::Self::create_statistic_job() { case EStatisticType::UNBIASED_FULL: statistic_job=mmd::UnbiasedFull(Nx); - permutation_job=mmd::WithinBlockPermutation(Nx); break; case EStatisticType::UNBIASED_INCOMPLETE: statistic_job=mmd::UnbiasedIncomplete(Nx); - permutation_job=mmd::WithinBlockPermutation(Nx); break; case EStatisticType::BIASED_FULL: statistic_job=mmd::BiasedFull(Nx); - permutation_job=mmd::WithinBlockPermutation(Nx); break; default : break; }; + permutation_job = mmd::WithinBlockPermutation(Nx, owner.get_statistic_type()); SG_SDEBUG("Leaving\n"); } diff --git a/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutation.cpp b/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutation.cpp index 3999670e6b8..1b0d16fb3bb 100644 --- a/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutation.cpp +++ b/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutation.cpp @@ -16,11 +16,13 @@ * along with this program. If not, see . */ +#include #include #include #include #include #include +#include #include #include #include @@ -30,30 +32,84 @@ using namespace shogun; using namespace internal; using namespace mmd; -template -WithinBlockPermutation::WithinBlockPermutation(index_t n) : n_x(n) +WithinBlockPermutation::WithinBlockPermutation(index_t n, EStatisticType type) +: n_x(n), stype(type) { } -template -typename T::return_type WithinBlockPermutation::operator()(SGMatrix km) +float64_t WithinBlockPermutation::operator()(SGMatrix km) { - // http:/stackoverflow.com/questions/15858569/randomly-permute-rows-columns-of-a-matrix-with-eigen + SG_SDEBUG("Entering!\n"); + inds.resize(km.num_rows); + std::iota(inds.data(), inds.data()+inds.size(), 0); + SGVector permuted_inds(inds.data(), inds.size(), false); + CMath::permute(permuted_inds); - Eigen::Map map(km.matrix, km.num_rows, km.num_cols); + const index_t n_y=km.num_rows-n_x; + SG_SDEBUG("number of samples are %d and %d!\n", n_x, n_y); - Eigen::PermutationMatrix perm(km.num_rows); - perm.setIdentity(); - SGVector inds(perm.indices().data(), perm.indices().size(), false); - CMath::permute(inds); + auto term_1=0.0; + for (auto i=0; ij) + term_1+=km(permuted_inds[i], permuted_inds[j]); + } + } + term_1*=2; + SG_SDEBUG("term_1 sum (without diagonal) = %f!\n", term_1); + if (stype==EStatisticType::BIASED_FULL) + { + for (auto i=0; i permuted_km(permuted.data(), permuted.rows(), permuted.cols(), false); + auto term_2=0.0; + for (auto i=n_x; ij) + term_2+=km(permuted_inds[i], permuted_inds[j]); + } + } + term_2*=2.0; + SG_SDEBUG("term_2 sum (without diagonal) = %f!\n", term_2); + if (stype==EStatisticType::BIASED_FULL) + { + for (auto i=n_x; i; -template class WithinBlockPermutation; -template class WithinBlockPermutation; + SG_SDEBUG("Leaving!\n"); + return term_1+term_2-2*term_3; +} diff --git a/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutation.h b/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutation.h index affc1489c49..9cc0ee09e99 100644 --- a/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutation.h +++ b/src/shogun/statistical_testing/internals/mmd/WithinBlockPermutation.h @@ -26,6 +26,7 @@ namespace shogun template class SGMatrix; template class CGPUMatrix; +enum class EStatisticType; namespace internal { @@ -33,14 +34,15 @@ namespace internal namespace mmd { -template struct WithinBlockPermutation { - using return_type = typename Statistic::return_type; - WithinBlockPermutation(index_t n); + using return_type = float64_t; + WithinBlockPermutation(index_t, EStatisticType); return_type operator()(SGMatrix kernel_matrix); // return_type operator()(CGPUMatrix kernel_matrix); - index_t n_x; + const index_t n_x; + const EStatisticType stype; + std::vector inds; }; } diff --git a/tests/unit/statistical_testing/QuadraticTimeMMD_unittest.cc b/tests/unit/statistical_testing/QuadraticTimeMMD_unittest.cc index 7b0393bd57e..c2897366a62 100644 --- a/tests/unit/statistical_testing/QuadraticTimeMMD_unittest.cc +++ b/tests/unit/statistical_testing/QuadraticTimeMMD_unittest.cc @@ -238,7 +238,7 @@ TEST(QuadraticTimeMMD, biased_different_num_samples) EXPECT_NEAR(statistic, 0.54418915736201567, 1E-8); } -TEST(QuadraticTimeMMD,compute_variance_null) +TEST(QuadraticTimeMMD, compute_variance_null) { index_t m=8; index_t d=3; @@ -289,7 +289,7 @@ TEST(QuadraticTimeMMD,compute_variance_null) EXPECT_NEAR(var, 0.0064888052500342575, 1E-10); } -TEST(QuadraticTimeMMD, perform_test_permutation) +TEST(QuadraticTimeMMD, perform_test_permutation_biased_full) { const index_t m=20; const index_t n=30; @@ -317,24 +317,92 @@ TEST(QuadraticTimeMMD, perform_test_permutation) auto mmd=some(feat_p, feat_q); mmd->set_kernel(kernel); - index_t num_null_samples=250; + index_t num_null_samples=10; mmd->set_num_null_samples(num_null_samples); mmd->set_null_approximation_method(ENullApproximationMethod::PERMUTATION); - // biased case - // compute p-value using permutation for null distribution and // assert against local machine computed result mmd->set_statistic_type(EStatisticType::BIASED_FULL); float64_t p_value=mmd->compute_p_value(mmd->compute_statistic()); EXPECT_NEAR(p_value, 0.0, 1E-10); +} - // unbiased case +TEST(QuadraticTimeMMD, perform_test_permutation_unbiased_full) +{ + const index_t m=20; + const index_t n=30; + const index_t dim=3; + + // use fixed seed + sg_rand->set_seed(12345); + + float64_t difference=0.5; + + // streaming data generator for mean shift distributions + auto gen_p=some(0, dim, 0); + auto gen_q=some(difference, dim, 0); + + // stream some data from generator + CFeatures* feat_p=gen_p->get_streamed_features(m); + CFeatures* feat_q=gen_q->get_streamed_features(n); + + // shoguns kernel width is different + float64_t sigma=2; + float64_t sq_sigma_twice=sigma*sigma*2; + CGaussianKernel* kernel=new CGaussianKernel(10, sq_sigma_twice); + + // create MMD instance, convienience constructor + auto mmd=some(feat_p, feat_q); + mmd->set_kernel(kernel); + + index_t num_null_samples=10; + mmd->set_num_null_samples(num_null_samples); + mmd->set_null_approximation_method(ENullApproximationMethod::PERMUTATION); // compute p-value using permutation for null distribution and // assert against local machine computed result mmd->set_statistic_type(EStatisticType::UNBIASED_FULL); - p_value=mmd->compute_p_value(mmd->compute_statistic()); + float64_t p_value=mmd->compute_p_value(mmd->compute_statistic()); + EXPECT_NEAR(p_value, 0.0, 1E-10); +} + +TEST(QuadraticTimeMMD, perform_test_permutation_unbiased_incomplete) +{ + const index_t m=20; + const index_t n=20; + const index_t dim=3; + + // use fixed seed + sg_rand->set_seed(12345); + + float64_t difference=0.5; + + // streaming data generator for mean shift distributions + auto gen_p=some(0, dim, 0); + auto gen_q=some(difference, dim, 0); + + // stream some data from generator + CFeatures* feat_p=gen_p->get_streamed_features(m); + CFeatures* feat_q=gen_q->get_streamed_features(n); + + // shoguns kernel width is different + float64_t sigma=2; + float64_t sq_sigma_twice=sigma*sigma*2; + CGaussianKernel* kernel=new CGaussianKernel(10, sq_sigma_twice); + + // create MMD instance, convienience constructor + auto mmd=some(feat_p, feat_q); + mmd->set_kernel(kernel); + + index_t num_null_samples=10; + mmd->set_num_null_samples(num_null_samples); + mmd->set_null_approximation_method(ENullApproximationMethod::PERMUTATION); + + // compute p-value using permutation for null distribution and + // assert against local machine computed result + mmd->set_statistic_type(EStatisticType::UNBIASED_INCOMPLETE); + float64_t p_value=mmd->compute_p_value(mmd->compute_statistic()); EXPECT_NEAR(p_value, 0.0, 1E-10); } @@ -366,7 +434,7 @@ TEST(QuadraticTimeMMD, perform_test_spectrum) auto mmd=some(feat_p, feat_q); mmd->set_kernel(kernel); - index_t num_null_samples=250; + index_t num_null_samples=10; index_t num_eigenvalues=10; mmd->set_num_null_samples(num_null_samples); mmd->set_null_approximation_method(ENullApproximationMethod::MMD2_SPECTRUM); @@ -386,5 +454,5 @@ TEST(QuadraticTimeMMD, perform_test_spectrum) // assert against local machine computed result mmd->set_statistic_type(EStatisticType::UNBIASED_FULL); p_value_spectrum=mmd->compute_p_value(mmd->compute_statistic()); - EXPECT_NEAR(p_value_spectrum, 0.004, 1E-10); + EXPECT_NEAR(p_value_spectrum, 0.0, 1E-10); } diff --git a/tests/unit/statistical_testing/internals/WithinBlockPermutation_unittest.cc b/tests/unit/statistical_testing/internals/WithinBlockPermutation_unittest.cc new file mode 100644 index 00000000000..6b206ac6269 --- /dev/null +++ b/tests/unit/statistical_testing/internals/WithinBlockPermutation_unittest.cc @@ -0,0 +1,217 @@ +/* + * Copyright (c) The Shogun Machine Learning Toolbox + * Written (w) 2016 Soumyajit De + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those + * of the authors and should not be interpreted as representing official policies, + * either expressed or implied, of the Shogun Development Team. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace shogun; +using namespace Eigen; + +TEST(WithinBlockPermutation, biased_full) +{ + const index_t dim=2; + const index_t n=6; + const index_t m=4; + + using operation=std::function)>; + + SGMatrix data(dim, n+m); + std::iota(data.matrix, data.matrix+dim*(n+m), 1); + + auto feats=new CDenseFeatures(data); + SG_REF(feats); + + auto kernel=some(); + kernel->set_width(2.0); + + kernel->init(feats, feats); + auto mat=kernel->get_kernel_matrix(); + + // compute using within-block-permutation functor + operation compute=shogun::internal::mmd::WithinBlockPermutation(n, EStatisticType::BIASED_FULL); + sg_rand->set_seed(12345); + auto result_1=compute(mat); + + compute=shogun::internal::mmd::BiasedFull(n); + + // compute a row-column permuted temporary matrix first + // then compute a biased-full statistic on this matrix + Map map(mat.matrix, mat.num_rows, mat.num_cols); + PermutationMatrix perm(mat.num_rows); + perm.setIdentity(); + SGVector perminds(perm.indices().data(), perm.indices().size(), false); + sg_rand->set_seed(12345); + CMath::permute(perminds); + MatrixXd permuted = perm.transpose()*map*perm; + SGMatrix permuted_km(permuted.data(), permuted.rows(), permuted.cols(), false); + auto result_2=compute(permuted_km); + + // shuffle the features first, recompute the kernel matrix using + // shuffled samples, then compute a biased-full statistic on this matrix + SGVector inds(mat.num_rows); + std::iota(inds.vector, inds.vector+inds.vlen, 0); + sg_rand->set_seed(12345); + CMath::permute(inds); + feats->add_subset(inds); + kernel->init(feats, feats); + mat=kernel->get_kernel_matrix(); + auto result_3=compute(mat); + + EXPECT_NEAR(result_1, result_2, 1E-15); + EXPECT_NEAR(result_1, result_3, 1E-15); + + kernel->remove_lhs_and_rhs(); +} + +TEST(WithinBlockPermutation, unbiased_full) +{ + const index_t dim=2; + const index_t n=3; + const index_t m=4; + + using operation=std::function)>; + + SGMatrix data(dim, n+m); + std::iota(data.matrix, data.matrix+dim*(n+m), 1); + + auto feats=new CDenseFeatures(data); + SG_REF(feats); + + auto kernel=some(); + kernel->set_width(2.0); + + kernel->init(feats, feats); + auto mat=kernel->get_kernel_matrix(); + + // compute using within-block-permutation functor + operation compute=shogun::internal::mmd::WithinBlockPermutation(n, EStatisticType::UNBIASED_FULL); + sg_rand->set_seed(12345); + auto result_1=compute(mat); + + compute=shogun::internal::mmd::UnbiasedFull(n); + + // compute a row-column permuted temporary matrix first + // then compute unbiased-full statistic on this matrix + Map map(mat.matrix, mat.num_rows, mat.num_cols); + PermutationMatrix perm(mat.num_rows); + perm.setIdentity(); + SGVector perminds(perm.indices().data(), perm.indices().size(), false); + sg_rand->set_seed(12345); + CMath::permute(perminds); + MatrixXd permuted = perm.transpose()*map*perm; + SGMatrix permuted_km(permuted.data(), permuted.rows(), permuted.cols(), false); + auto result_2=compute(permuted_km); + + // shuffle the features first, recompute the kernel matrix using + // shuffled samples, then compute unbiased-full statistic on this matrix + SGVector inds(mat.num_rows); + std::iota(inds.vector, inds.vector+inds.vlen, 0); + sg_rand->set_seed(12345); + CMath::permute(inds); + feats->add_subset(inds); + kernel->init(feats, feats); + mat=kernel->get_kernel_matrix(); + auto result_3=compute(mat); + + EXPECT_NEAR(result_1, result_2, 1E-15); + EXPECT_NEAR(result_1, result_3, 1E-15); + + kernel->remove_lhs_and_rhs(); +} + +TEST(WithinBlockPermutation, unbiased_incomplete) +{ + const index_t dim=2; + const index_t n=5; + const index_t m=5; + + using operation=std::function)>; + + SGMatrix data(dim, n+m); + std::iota(data.matrix, data.matrix+dim*(n+m), 1); + + auto feats=new CDenseFeatures(data); + SG_REF(feats); + + auto kernel=some(); + kernel->set_width(2.0); + + kernel->init(feats, feats); + auto mat=kernel->get_kernel_matrix(); + + // compute using within-block-permutation functor + operation compute=shogun::internal::mmd::WithinBlockPermutation(n, EStatisticType::UNBIASED_INCOMPLETE); + sg_rand->set_seed(12345); + auto result_1=compute(mat); + + compute=shogun::internal::mmd::UnbiasedIncomplete(n); + + // compute a row-column permuted temporary matrix first + // then compute unbiased-incomplete statistic on this matrix + Map map(mat.matrix, mat.num_rows, mat.num_cols); + PermutationMatrix perm(mat.num_rows); + perm.setIdentity(); + SGVector perminds(perm.indices().data(), perm.indices().size(), false); + sg_rand->set_seed(12345); + CMath::permute(perminds); + MatrixXd permuted = perm.transpose()*map*perm; + SGMatrix permuted_km(permuted.data(), permuted.rows(), permuted.cols(), false); + auto result_2=compute(permuted_km); + + // shuffle the features first, recompute the kernel matrix using + // shuffled samples, then compute uniased-incomplete statistic on this matrix + SGVector inds(mat.num_rows); + std::iota(inds.vector, inds.vector+inds.vlen, 0); + sg_rand->set_seed(12345); + CMath::permute(inds); + feats->add_subset(inds); + kernel->init(feats, feats); + mat=kernel->get_kernel_matrix(); + auto result_3=compute(mat); + + EXPECT_NEAR(result_1, result_2, 1E-15); + EXPECT_NEAR(result_1, result_3, 1E-15); + + kernel->remove_lhs_and_rhs(); +}