Skip to content

Commit

Permalink
Moved compute_distance to CTwoDistributionTest, added unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
lambday committed Jun 7, 2016
1 parent 053a959 commit d52850f
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 58 deletions.
56 changes: 0 additions & 56 deletions src/shogun/statistical_testing/MMD.cpp
Expand Up @@ -35,8 +35,6 @@
#include <shogun/kernel/CustomKernel.h>
#include <shogun/kernel/CombinedKernel.h>
#include <shogun/features/Features.h>
#include <shogun/distance/EuclideanDistance.h>
#include <shogun/distance/CustomDistance.h>
#include <shogun/statistical_testing/MMD.h>
#include <shogun/statistical_testing/QuadraticTimeMMD.h>
#include <shogun/statistical_testing/BTestMMD.h>
Expand Down Expand Up @@ -71,7 +69,6 @@ struct CMMD::Self

std::pair<float64_t, float64_t> compute_statistic_variance();
std::pair<SGVector<float64_t>, SGMatrix<float64_t>> compute_statistic_and_Q(const KernelManager&);
CCustomDistance* compute_distance();
SGVector<float64_t> sample_null();

CMMD& owner;
Expand Down Expand Up @@ -391,54 +388,6 @@ SGVector<float64_t> CMMD::Self::sample_null()
return statistic;
}

/**
 * Builds a custom distance holding the pairwise Euclidean distances of the
 * merged samples from both distributions.
 *
 * Blockwise fetching is switched off on the data manager while the samples
 * are fetched and the previous setting is put back before the method is
 * left, including when an error is raised on the way.
 *
 * @return newly allocated custom distance; the caller is responsible for
 * releasing it
 */
CCustomDistance* CMMD::Self::compute_distance()
{
	auto distance=new CCustomDistance();
	DataManager& data_mgr=owner.get_data_mgr();

	const bool blockwise=data_mgr.is_blockwise();
	data_mgr.set_blockwise(false);

	// remembers whether start() succeeded so that end() is only issued for
	// a fetch session that was actually opened
	bool started=false;
	try
	{
		// using data manager next() API in order to make it work with
		// streaming samples as well.
		data_mgr.start();
		started=true;

		auto samples=data_mgr.next();
		if (!samples.empty())
		{
			// use 0th block from each distribution (since there is only one
			// block for quadratic time MMD)
			CFeatures *samples_p=samples[0][0].get();
			CFeatures *samples_q=samples[1][0].get();

			try
			{
				auto p_and_q=FeaturesUtil::create_merged_copy(samples_p, samples_q);
				// the merged copy is all that is needed from here on
				samples.clear();
				auto euclidean_distance=std::unique_ptr<CEuclideanDistance>(new CEuclideanDistance());
				if (euclidean_distance->init(p_and_q, p_and_q))
				{
					auto dist_mat=euclidean_distance->get_distance_matrix<float32_t>();
					distance->set_triangle_distance_matrix_from_full(dist_mat.data(), dist_mat.num_rows, dist_mat.num_cols);
				}
				else
				{
					SG_SERROR("Computing distance matrix was not possible! Please contact Shogun developers.\n");
				}
			}
			// caught by reference: avoids copying the exception object
			catch (ShogunException& e)
			{
				SG_SERROR("%s, Data is too large! Computing distance matrix was not possible!\n", e.get_exception_string());
			}
		}
		else
			SG_SERROR("Could not fetch samples!\n");
	}
	catch (...)
	{
		// NOTE(review): SG_SERROR is assumed to raise an exception (the inner
		// handler above already catches ShogunException). Without this
		// handler the data manager would stay in non-blockwise mode and the
		// freshly allocated distance would leak.
		if (started)
			data_mgr.end();
		data_mgr.set_blockwise(blockwise);
		SG_UNREF(distance);
		throw;
	}

	data_mgr.end();
	data_mgr.set_blockwise(blockwise);

	return distance;
}

CMMD::CMMD() : CTwoSampleTest()
{
#if EIGEN_VERSION_AT_LEAST(3,1,0)
Expand Down Expand Up @@ -502,11 +451,6 @@ void CMMD::select_kernel()
SG_DEBUG("Leaving!\n");
}

/**
 * Public entry point that hands the request over to the private
 * implementation object.
 *
 * @return whatever the implementation's compute_distance() returns
 */
CCustomDistance* CMMD::compute_distance()
{
	CCustomDistance* result=self->compute_distance();
	return result;
}

float64_t CMMD::compute_statistic()
{
return self->compute_statistic_variance().first;
Expand Down
2 changes: 0 additions & 2 deletions src/shogun/statistical_testing/MMD.h
Expand Up @@ -40,7 +40,6 @@ namespace shogun
{

class CKernel;
class CCustomDistance;
template <typename> class SGVector;
template <typename> class SGMatrix;
class CKernelSelectionStrategy;
Expand Down Expand Up @@ -96,7 +95,6 @@ class CMMD : public CTwoSampleTest
void add_kernel(CKernel *kernel);
void select_kernel();

CCustomDistance* compute_distance();
void set_train_test_ratio(float64_t ratio);
virtual float64_t compute_statistic();
virtual float64_t compute_variance();
Expand Down
47 changes: 47 additions & 0 deletions src/shogun/statistical_testing/TwoDistributionTest.cpp
Expand Up @@ -16,9 +16,13 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <shogun/distance/CustomDistance.h>
#include <shogun/distance/EuclideanDistance.h>
#include <shogun/statistical_testing/TwoDistributionTest.h>
#include <shogun/statistical_testing/internals/DataManager.h>
#include <shogun/statistical_testing/internals/TestTypes.h>
#include <shogun/statistical_testing/internals/NextSamples.h>
#include <shogun/statistical_testing/internals/FeaturesUtil.h>

using namespace shogun;
using namespace internal;
Expand Down Expand Up @@ -79,6 +83,49 @@ const index_t CTwoDistributionTest::get_num_samples_q() const
return dm.num_samples_at(1);
}

/**
 * Builds a custom distance holding the pairwise Euclidean distances of the
 * merged samples from both distributions.
 *
 * Samples are obtained through the data manager's start()/next()/end()
 * sequence. Blockwise fetching is switched off for the duration of the call
 * and the previous setting is put back before the method is left, including
 * when an error is raised on the way.
 *
 * @return newly allocated custom distance; the caller is responsible for
 * releasing it
 */
CCustomDistance* CTwoDistributionTest::compute_distance()
{
	auto distance=new CCustomDistance();
	auto& data_mgr=get_data_mgr();

	const bool is_blockwise=data_mgr.is_blockwise();
	data_mgr.set_blockwise(false);

	// remembers whether start() succeeded so that end() is only issued for
	// a fetch session that was actually opened
	bool started=false;
	try
	{
		data_mgr.start();
		started=true;

		auto samples=data_mgr.next();
		if (!samples.empty())
		{
			// only the 0th block of each distribution is used
			CFeatures *samples_p=samples[0][0].get();
			CFeatures *samples_q=samples[1][0].get();
			try
			{
				auto p_and_q=FeaturesUtil::create_merged_copy(samples_p, samples_q);
				// the merged copy is all that is needed from here on
				samples.clear();
				auto euclidean_distance=std::unique_ptr<CEuclideanDistance>(new CEuclideanDistance());
				if (euclidean_distance->init(p_and_q, p_and_q))
				{
					auto dist_mat=euclidean_distance->get_distance_matrix<float32_t>();
					distance->set_triangle_distance_matrix_from_full(dist_mat.data(), dist_mat.num_rows, dist_mat.num_cols);
				}
				else
				{
					SG_SERROR("Computing distance matrix was not possible! Please contact Shogun developers.\n");
				}
			}
			// caught by reference: avoids copying the exception object
			catch (ShogunException& e)
			{
				SG_SERROR("%s, Data is too large! Computing distance matrix was not possible!\n", e.get_exception_string());
			}
		}
		else
			SG_SERROR("Could not fetch samples!\n");
	}
	catch (...)
	{
		// NOTE(review): SG_SERROR is assumed to raise an exception (the inner
		// handler above already catches ShogunException). Without this
		// handler the data manager would stay in non-blockwise mode and the
		// freshly allocated distance would leak.
		if (started)
			data_mgr.end();
		data_mgr.set_blockwise(is_blockwise);
		SG_UNREF(distance);
		throw;
	}

	data_mgr.end();
	data_mgr.set_blockwise(is_blockwise);

	return distance;
}

const char* CTwoDistributionTest::get_name() const
{
return "TwoDistributionTest";
Expand Down
4 changes: 4 additions & 0 deletions src/shogun/statistical_testing/TwoDistributionTest.h
Expand Up @@ -26,6 +26,8 @@
namespace shogun
{

class CCustomDistance;

class CTwoDistributionTest : public CHypothesisTest
{
public:
Expand All @@ -44,6 +46,8 @@ class CTwoDistributionTest : public CHypothesisTest
void set_num_samples_q(index_t num_samples_from_q);
const index_t get_num_samples_q() const;

CCustomDistance* compute_distance();

virtual float64_t compute_statistic()=0;
virtual SGVector<float64_t> sample_null()=0;

Expand Down
139 changes: 139 additions & 0 deletions tests/unit/statistical_testing/TwoDistributionTest_unittest.cc
@@ -0,0 +1,139 @@
/*
* Copyright (c) The Shogun Machine Learning Toolbox
* Written (w) 2016 Soumyajit De
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the Shogun Development Team.
*/

#include <shogun/base/some.h>
#include <shogun/lib/SGVector.h>
#include <shogun/lib/SGMatrix.h>
#include <shogun/features/Features.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/features/streaming/generators/MeanShiftDataGenerator.h>
#include <shogun/distance/CustomDistance.h>
#include <shogun/distance/EuclideanDistance.h>
#include <shogun/statistical_testing/TwoDistributionTest.h>
#include <gtest/gtest.h>
#include <gmock/gmock.h>

namespace shogun
{

/**
 * Instantiable stand-in for CTwoDistributionTest used by the tests below.
 * It supplies gmock-generated implementations for compute_statistic() and
 * sample_null() so that the inherited compute_distance() can be exercised
 * on a concrete object.
 */
class CTwoDistributionTestMock : public CTwoDistributionTest
{
public:
// mocked compute_statistic(): takes no arguments, returns float64_t
MOCK_METHOD0(compute_statistic, float64_t());
// mocked sample_null(): takes no arguments, returns SGVector<float64_t>
MOCK_METHOD0(sample_null, SGVector<float64_t>());
};

}

using namespace shogun;

/**
 * Checks compute_distance() on in-memory dense features: the matrix it
 * yields must agree with a Euclidean distance matrix computed directly on
 * the concatenation of the samples from p and q.
 */
TEST(TwoDistributionTest, compute_distance_dense)
{
	const index_t m=5;
	const index_t n=10;
	const index_t dim=1;
	const float64_t difference=0.5;

	auto gen_p=some<CMeanShiftDataGenerator>(0, dim, 0);
	auto gen_q=some<CMeanShiftDataGenerator>(difference, dim, 0);

	auto feats_p=static_cast<CDenseFeatures<float64_t>*>(gen_p->get_streamed_features(m));
	auto feats_q=static_cast<CDenseFeatures<float64_t>*>(gen_q->get_streamed_features(n));

	auto test=some<CTwoDistributionTestMock>();
	test->set_p(feats_p);
	test->set_q(feats_q);

	// matrix under test
	auto distance=test->compute_distance();
	auto distance_mat1=distance->get_distance_matrix();

	// reference: concatenate the samples of p and q column-wise; the row
	// count follows dim instead of a literal so the test stays valid if dim
	// is changed
	SGMatrix<float64_t> data_p_and_q(dim, m+n);
	auto data_p=feats_p->get_feature_matrix();
	auto data_q=feats_q->get_feature_matrix();
	std::copy(data_p.data(), data_p.data()+data_p.size(), data_p_and_q.data());
	std::copy(data_q.data(), data_q.data()+data_q.size(), data_p_and_q.data()+data_p.size());
	auto feats_p_and_q=new CDenseFeatures<float64_t>(data_p_and_q);

	auto euclidean_distance=some<CEuclideanDistance>();
	euclidean_distance->init(feats_p_and_q, feats_p_and_q);
	auto distance_mat2=euclidean_distance->get_distance_matrix();

	EXPECT_EQ(distance_mat1.num_rows, distance_mat2.num_rows);
	EXPECT_EQ(distance_mat1.num_cols, distance_mat2.num_cols);

	// compare entries only when the shapes agree; otherwise the loop would
	// read past the end of the smaller matrix
	const bool same_shape=distance_mat1.num_rows==distance_mat2.num_rows
		&& distance_mat1.num_cols==distance_mat2.num_cols;
	if (same_shape)
	{
		for (size_t i=0; i<distance_mat1.size(); ++i)
			EXPECT_NEAR(distance_mat1.data()[i], distance_mat2.data()[i], 1E-6);
	}

	SG_UNREF(distance);
}

/**
 * Checks compute_distance() when p and q are streaming generators: after
 * re-seeding the random generator with the same seed, the samples drawn by
 * hand must give the same Euclidean distance matrix as the one produced by
 * compute_distance().
 */
TEST(TwoDistributionTest, compute_distance_streaming)
{
	const index_t m=5;
	const index_t n=10;
	const index_t dim=1;
	const float64_t difference=0.5;

	auto gen_p=new CMeanShiftDataGenerator(0, dim, 0);
	auto gen_q=new CMeanShiftDataGenerator(difference, dim, 0);

	auto test=some<CTwoDistributionTestMock>();
	test->set_p(gen_p);
	test->set_q(gen_q);
	test->set_num_samples_p(m);
	test->set_num_samples_q(n);

	// matrix under test, drawn with a fixed seed
	sg_rand->set_seed(12345);
	auto distance=test->compute_distance();
	auto distance_mat1=distance->get_distance_matrix();

	// same seed again so the manual draw below repeats the random sequence
	sg_rand->set_seed(12345);
	auto feats_p=static_cast<CDenseFeatures<float64_t>*>(gen_p->get_streamed_features(m));
	auto feats_q=static_cast<CDenseFeatures<float64_t>*>(gen_q->get_streamed_features(n));

	// reference: concatenate the samples of p and q column-wise; the row
	// count follows dim instead of a literal so the test stays valid if dim
	// is changed
	SGMatrix<float64_t> data_p_and_q(dim, m+n);
	auto data_p=feats_p->get_feature_matrix();
	auto data_q=feats_q->get_feature_matrix();
	std::copy(data_p.data(), data_p.data()+data_p.size(), data_p_and_q.data());
	std::copy(data_q.data(), data_q.data()+data_q.size(), data_p_and_q.data()+data_p.size());
	auto feats_p_and_q=new CDenseFeatures<float64_t>(data_p_and_q);
	SG_UNREF(feats_p);
	SG_UNREF(feats_q);

	auto euclidean_distance=some<CEuclideanDistance>();
	euclidean_distance->init(feats_p_and_q, feats_p_and_q);
	auto distance_mat2=euclidean_distance->get_distance_matrix();

	EXPECT_EQ(distance_mat1.num_rows, distance_mat2.num_rows);
	EXPECT_EQ(distance_mat1.num_cols, distance_mat2.num_cols);

	// compare entries only when the shapes agree; otherwise the loop would
	// read past the end of the smaller matrix
	const bool same_shape=distance_mat1.num_rows==distance_mat2.num_rows
		&& distance_mat1.num_cols==distance_mat2.num_cols;
	if (same_shape)
	{
		for (size_t i=0; i<distance_mat1.size(); ++i)
			EXPECT_NEAR(distance_mat1.data()[i], distance_mat2.data()[i], 1E-6);
	}

	SG_UNREF(distance);
}

0 comments on commit d52850f

Please sign in to comment.