diff --git a/src/shogun/statistical_testing/MMD.cpp b/src/shogun/statistical_testing/MMD.cpp index 944a94db1c3..6c12a9662be 100644 --- a/src/shogun/statistical_testing/MMD.cpp +++ b/src/shogun/statistical_testing/MMD.cpp @@ -35,8 +35,6 @@ #include #include #include -#include -#include #include #include #include @@ -71,7 +69,6 @@ struct CMMD::Self std::pair compute_statistic_variance(); std::pair, SGMatrix> compute_statistic_and_Q(const KernelManager&); - CCustomDistance* compute_distance(); SGVector sample_null(); CMMD& owner; @@ -391,54 +388,6 @@ SGVector CMMD::Self::sample_null() return statistic; } -CCustomDistance* CMMD::Self::compute_distance() -{ - auto distance=new CCustomDistance(); - DataManager& data_mgr=owner.get_data_mgr(); - - bool blockwise=data_mgr.is_blockwise(); - data_mgr.set_blockwise(false); - - // using data manager next() API in order to make it work with - // streaming samples as well. - data_mgr.start(); - auto samples=data_mgr.next(); - if (!samples.empty()) - { - // use 0th block from each distribution (since there is only one block - // for quadratic time MMD - CFeatures *samples_p=samples[0][0].get(); - CFeatures *samples_q=samples[1][0].get(); - - try - { - auto p_and_q=FeaturesUtil::create_merged_copy(samples_p, samples_q); - samples.clear(); - auto euclidean_distance=std::unique_ptr(new CEuclideanDistance()); - if (euclidean_distance->init(p_and_q, p_and_q)) - { - auto dist_mat=euclidean_distance->get_distance_matrix(); - distance->set_triangle_distance_matrix_from_full(dist_mat.data(), dist_mat.num_rows, dist_mat.num_cols); - } - else - { - SG_SERROR("Computing distance matrix was not possible! Please contact Shogun developers.\n"); - } - } - catch (ShogunException e) - { - SG_SERROR("%s, Data is too large! 
Computing distance matrix was not possible!\n", e.get_exception_string()); - } - } - else - SG_SERROR("Could not fetch samples!\n"); - - data_mgr.end(); - data_mgr.set_blockwise(blockwise); - - return distance; -} - CMMD::CMMD() : CTwoSampleTest() { #if EIGEN_VERSION_AT_LEAST(3,1,0) @@ -502,11 +451,6 @@ void CMMD::select_kernel() SG_DEBUG("Leaving!\n"); } -CCustomDistance* CMMD::compute_distance() -{ - return self->compute_distance(); -} - float64_t CMMD::compute_statistic() { return self->compute_statistic_variance().first; diff --git a/src/shogun/statistical_testing/MMD.h b/src/shogun/statistical_testing/MMD.h index 2072f18101f..383f4fae416 100644 --- a/src/shogun/statistical_testing/MMD.h +++ b/src/shogun/statistical_testing/MMD.h @@ -40,7 +40,6 @@ namespace shogun { class CKernel; -class CCustomDistance; template class SGVector; template class SGMatrix; class CKernelSelectionStrategy; @@ -96,7 +95,6 @@ class CMMD : public CTwoSampleTest void add_kernel(CKernel *kernel); void select_kernel(); - CCustomDistance* compute_distance(); void set_train_test_ratio(float64_t ratio); virtual float64_t compute_statistic(); virtual float64_t compute_variance(); diff --git a/src/shogun/statistical_testing/TwoDistributionTest.cpp b/src/shogun/statistical_testing/TwoDistributionTest.cpp index 021fb6e02f5..ac012d1b7fd 100644 --- a/src/shogun/statistical_testing/TwoDistributionTest.cpp +++ b/src/shogun/statistical_testing/TwoDistributionTest.cpp @@ -16,9 +16,13 @@ * along with this program. If not, see . 
*/
 
+#include
+#include
 #include
 #include
 #include
+#include
+#include
 
 using namespace shogun;
 using namespace internal;
@@ -79,6 +83,83 @@ const index_t CTwoDistributionTest::get_num_samples_q() const
 	return dm.num_samples_at(1);
 }
 
+/**
+ * Computes the pairwise Euclidean distances between all samples of both
+ * distributions, treating the samples from p and q as one merged set.
+ *
+ * Blockwise fetching is switched off on the data manager while the samples
+ * are read; the previous setting is put back on every way out of the call.
+ *
+ * @return custom distance created here with new and handed to the caller as
+ * a raw pointer; it holds the distance matrix when the computation succeeded
+ */
+CCustomDistance* CTwoDistributionTest::compute_distance()
+{
+	// Switches the data manager to non-blockwise fetching and starts it; the
+	// destructor ends it and restores the earlier blockwise flag, so the
+	// manager is not left modified if an error is raised further down.
+	struct FullFetchScope
+	{
+		DataManager& mgr;
+		const bool was_blockwise;
+
+		explicit FullFetchScope(DataManager& m)
+			: mgr(m), was_blockwise(m.is_blockwise())
+		{
+			mgr.set_blockwise(false);
+			mgr.start();
+		}
+
+		~FullFetchScope()
+		{
+			mgr.end();
+			mgr.set_blockwise(was_blockwise);
+		}
+
+		FullFetchScope(const FullFetchScope&)=delete;
+		FullFetchScope& operator=(const FullFetchScope&)=delete;
+	};
+
+	// held by a smart pointer until the very end so that it is freed if an
+	// error is raised before the result is handed out
+	std::unique_ptr<CCustomDistance> distance(new CCustomDistance());
+
+	auto& data_mgr=get_data_mgr();
+	FullFetchScope scope(data_mgr);
+
+	auto samples=data_mgr.next();
+	if (!samples.empty())
+	{
+		// [0][0] and [1][0]: first block fetched for p and for q respectively
+		CFeatures *samples_p=samples[0][0].get();
+		CFeatures *samples_q=samples[1][0].get();
+		try
+		{
+			auto p_and_q=FeaturesUtil::create_merged_copy(samples_p, samples_q);
+			samples.clear();
+			auto euclidean_distance=std::unique_ptr<CEuclideanDistance>(new CEuclideanDistance());
+			if (euclidean_distance->init(p_and_q, p_and_q))
+			{
+				auto dist_mat=euclidean_distance->get_distance_matrix();
+				distance->set_triangle_distance_matrix_from_full(dist_mat.data(), dist_mat.num_rows, dist_mat.num_cols);
+			}
+			else
+			{
+				SG_SERROR("Computing distance matrix was not possible! Please contact Shogun developers.\n");
+			}
+		}
+		catch (ShogunException& e)
+		{
+			// caught by reference: no copy of the exception object is made
+			SG_SERROR("%s, Data is too large! Computing distance matrix was not possible!\n", e.get_exception_string());
+		}
+	}
+	else
+		SG_SERROR("Could not fetch samples!\n");
+
+	return distance.release();
+}
+
 const char* CTwoDistributionTest::get_name() const
 {
 	return "TwoDistributionTest";
diff --git a/src/shogun/statistical_testing/TwoDistributionTest.h b/src/shogun/statistical_testing/TwoDistributionTest.h
index 120410d2acf..7636cd7b93c 100644
--- a/src/shogun/statistical_testing/TwoDistributionTest.h
+++ b/src/shogun/statistical_testing/TwoDistributionTest.h
@@ -26,6 +26,8 @@
 namespace shogun
 {
 
+class CCustomDistance;
+
 class CTwoDistributionTest : public CHypothesisTest
 {
 public:
@@ -44,6 +46,8 @@ class CTwoDistributionTest : public CHypothesisTest
 	void set_num_samples_q(index_t num_samples_from_q);
 	const index_t get_num_samples_q() const;
 
+	CCustomDistance* compute_distance();
+
 	virtual float64_t compute_statistic()=0;
 
 	virtual SGVector sample_null()=0;
diff --git a/tests/unit/statistical_testing/TwoDistributionTest_unittest.cc b/tests/unit/statistical_testing/TwoDistributionTest_unittest.cc
new file mode 100644
index 00000000000..91bc70a6ba7
--- /dev/null
+++ b/tests/unit/statistical_testing/TwoDistributionTest_unittest.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) The Shogun Machine Learning Toolbox
+ * Written (w) 2016 Soumyajit De
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those + * of the authors and should not be interpreted as representing official policies, + * either expressed or implied, of the Shogun Development Team. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace shogun +{ + +class CTwoDistributionTestMock : public CTwoDistributionTest +{ +public: + MOCK_METHOD0(compute_statistic, float64_t()); + MOCK_METHOD0(sample_null, SGVector()); +}; + +} + +using namespace shogun; + +TEST(TwoDistributionTest, compute_distance_dense) +{ + const index_t m=5; + const index_t n=10; + const index_t dim=1; + const float64_t difference=0.5; + + auto gen_p=some(0, dim, 0); + auto gen_q=some(difference, dim, 0); + + auto feats_p=static_cast*>(gen_p->get_streamed_features(m)); + auto feats_q=static_cast*>(gen_q->get_streamed_features(n)); + + auto test=some(); + test->set_p(feats_p); + test->set_q(feats_q); + + auto distance=test->compute_distance(); + auto distance_mat1=distance->get_distance_matrix(); + + SGMatrix data_p_and_q(1, m+n); + auto data_p=feats_p->get_feature_matrix(); + auto data_q=feats_q->get_feature_matrix(); + std::copy(data_p.data(), data_p.data()+data_p.size(), data_p_and_q.data()); + std::copy(data_q.data(), data_q.data()+data_q.size(), data_p_and_q.data()+data_p.size()); + auto feats_p_and_q=new CDenseFeatures(data_p_and_q); + + auto euclidean_distance=some(); + euclidean_distance->init(feats_p_and_q, feats_p_and_q); + auto distance_mat2=euclidean_distance->get_distance_matrix(); + + EXPECT_TRUE(distance_mat1.num_rows==distance_mat2.num_rows); + EXPECT_TRUE(distance_mat1.num_cols==distance_mat2.num_cols); + for (size_t i=0; i(); + test->set_p(gen_p); + test->set_q(gen_q); + test->set_num_samples_p(m); + test->set_num_samples_q(n); + + sg_rand->set_seed(12345); + auto distance=test->compute_distance(); + auto distance_mat1=distance->get_distance_matrix(); + + sg_rand->set_seed(12345); + auto feats_p=static_cast*>(gen_p->get_streamed_features(m)); + auto feats_q=static_cast*>(gen_q->get_streamed_features(n)); + + SGMatrix data_p_and_q(1, m+n); + auto data_p=feats_p->get_feature_matrix(); + auto 
data_q=feats_q->get_feature_matrix(); + std::copy(data_p.data(), data_p.data()+data_p.size(), data_p_and_q.data()); + std::copy(data_q.data(), data_q.data()+data_q.size(), data_p_and_q.data()+data_p.size()); + auto feats_p_and_q=new CDenseFeatures(data_p_and_q); + SG_UNREF(feats_p); + SG_UNREF(feats_q); + + auto euclidean_distance=some(); + euclidean_distance->init(feats_p_and_q, feats_p_and_q); + auto distance_mat2=euclidean_distance->get_distance_matrix(); + + EXPECT_TRUE(distance_mat1.num_rows==distance_mat2.num_rows); + EXPECT_TRUE(distance_mat1.num_cols==distance_mat2.num_cols); + for (size_t i=0; i