added first draft of multi kernel MMD
lambday committed Jun 13, 2016
1 parent d52850f commit 2b156a3
Showing 13 changed files with 556 additions and 18 deletions.
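
Note (added for context; not part of the commit page): the quantity estimated by the new code is the squared maximum mean discrepancy between distributions p and q for a kernel k with RKHS mean embeddings \mu_p, \mu_q,

    \mathrm{MMD}^2(p, q) = \lVert \mu_p - \mu_q \rVert_{\mathcal{H}_k}^2
                         = \mathbb{E}_{x,x'}[k(x,x')] + \mathbb{E}_{y,y'}[k(y,y')] - 2\,\mathbb{E}_{x,y}[k(x,y)],

with x, x' drawn from p and y, y' from q. The commit adds a helper that evaluates sample estimates of this quantity for a whole family of Gaussian kernels in one pass.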
3 changes: 3 additions & 0 deletions src/interfaces/modular/Statistics.i
@@ -7,6 +7,9 @@
* Written (W) 2012-2013 Heiko Strathmann
*/

/* These functions return new Objects */
%newobject shogun::CTwoDistributionTest::compute_distance();

/* Remove C Prefix */
%rename(HypothesisTest) CHypothesisTest;
%rename(OneDistributionTest) COneDistributionTest;
17 changes: 17 additions & 0 deletions src/shogun/statistical_testing/QuadraticTimeMMD.cpp
@@ -48,6 +48,8 @@
#include <shogun/statistical_testing/internals/mmd/UnbiasedIncomplete.h>
#include <shogun/statistical_testing/internals/mmd/FullDirect.h>
#include <shogun/statistical_testing/internals/mmd/WithinBlockPermutationBatch.h>
#include <shogun/statistical_testing/internals/mmd/MultiKernelMMD.h>
#include <shogun/statistical_testing/kernelselection/KernelSelectionStrategy.h>

using namespace shogun;
using namespace internal;
@@ -542,6 +544,21 @@ void CQuadraticTimeMMD::precompute_kernel_matrix(bool precompute)
self->precompute=precompute;
}

SGVector<float64_t> CQuadraticTimeMMD::compute_statistic(const internal::KernelManager& kernel_mgr)
{
SG_DEBUG("Entering");
const auto& data_mgr=get_data_mgr();
const index_t nx=data_mgr.num_samples_at(0);
const index_t ny=data_mgr.num_samples_at(1);
MultiKernelMMD compute(nx, ny, get_statistic_type());
compute.set_distance(compute_distance());
SGVector<float64_t> result=compute(kernel_mgr);
for (auto i=0; i<result.vlen; ++i)
result[i]=normalize_statistic(result[i]);
SG_DEBUG("Leaving");
return result;
}

const char* CQuadraticTimeMMD::get_name() const
{
return "QuadraticTimeMMD";
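
The compute_statistic(kernel_mgr) overload added above relies on every kernel in the manager being a Gaussian (RBF) kernel, and hence translation invariant: the pairwise distances returned by compute_distance() are computed once, and each kernel value is recovered by a single exponentiation. For reference, the identity that MultiKernelMMD::operator() below implements (the formula itself is not in the commit), with d the precomputed distance value and w the kernel width, is

    k_w(x_i, x_j) = \exp\!\big(-d(x_i, x_j)^2 / w\big),

so evaluating m kernel widths costs one O((n_x + n_y)^2) distance computation plus m cheap passes over the distance matrix, instead of m full kernel-matrix computations.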
9 changes: 9 additions & 0 deletions src/shogun/statistical_testing/QuadraticTimeMMD.h
@@ -41,8 +41,15 @@ namespace shogun

template <typename> class SGVector;

namespace internal
{
class KernelManager;
class MaxMeasure;
}

class CQuadraticTimeMMD : public CMMD
{
friend class internal::MaxMeasure;
public:
typedef std::function<float32_t(SGMatrix<float32_t>)> operation;
CQuadraticTimeMMD();
@@ -72,6 +79,8 @@ class CQuadraticTimeMMD : public CMMD
virtual const float64_t normalize_variance(float64_t variance) const;
SGVector<float64_t> gamma_fit_null();
SGVector<float64_t> spectrum_sample_null();

SGVector<float64_t> compute_statistic(const internal::KernelManager& kernel_mgr);
};

}
151 changes: 151 additions & 0 deletions src/shogun/statistical_testing/internals/mmd/MultiKernelMMD.cpp
@@ -0,0 +1,151 @@
/*
* Copyright (c) The Shogun Machine Learning Toolbox
* Written (w) 2014 - 2016 Soumyajit De
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the Shogun Development Team.
*/

#include <shogun/io/SGIO.h>
#include <shogun/lib/SGMatrix.h>
#include <shogun/mathematics/Math.h>
#include <shogun/kernel/Kernel.h>
#include <shogun/kernel/GaussianKernel.h>
#include <shogun/distance/CustomDistance.h>
#include <shogun/statistical_testing/MMD.h>
#include <shogun/statistical_testing/internals/KernelManager.h>
#include <shogun/statistical_testing/internals/mmd/MultiKernelMMD.h>

using namespace shogun;
using namespace internal;
using namespace mmd;

struct MultiKernelMMD::terms_t
{
std::array<float64_t, 3> term{};
std::array<float64_t, 3> diag{};
};

MultiKernelMMD::MultiKernelMMD(index_t nx, index_t ny, EStatisticType stype) : n_x(nx), n_y(ny), s_type(stype)
{
SG_SDEBUG("number of samples are %d and %d!\n", n_x, n_y);
}

void MultiKernelMMD::set_distance(CCustomDistance* distance)
{
m_distance=std::shared_ptr<CCustomDistance>(distance);
}

void MultiKernelMMD::add_term(terms_t& t, float32_t val, index_t i, index_t j) const
{
if (i<n_x && j<n_x && i<=j)
{
SG_SDEBUG("Adding Kernel(%d,%d)=%f to term_0!\n", i, j, val);
t.term[0]+=val;
if (i==j)
t.diag[0]+=val;
}
else if (i>=n_x && j>=n_x && i<=j)
{
SG_SDEBUG("Adding Kernel(%d,%d)=%f to term_1!\n", i, j, val);
t.term[1]+=val;
if (i==j)
t.diag[1]+=val;
}
else if (i>=n_x && j<n_x)
{
SG_SDEBUG("Adding Kernel(%d,%d)=%f to term_2!\n", i, j, val);
t.term[2]+=val;
if (i-n_x==j)
t.diag[2]+=val;
}
}

SGVector<float64_t> MultiKernelMMD::operator()(const KernelManager& kernel_mgr) const
{
SG_SDEBUG("Entering!\n");
SGVector<float64_t> widths(kernel_mgr.num_kernels());
for (size_t i=0; i<kernel_mgr.num_kernels(); ++i)
{
CGaussianKernel* rbf_kernel=dynamic_cast<CGaussianKernel*>(kernel_mgr.kernel_at(i));
REQUIRE(rbf_kernel, "Kernel is not an instance of Gaussian Kernel!\n"); // TODO remove this
widths[i]=rbf_kernel->get_width();
}

SGVector<float64_t> result(kernel_mgr.num_kernels());
#pragma omp parallel for
for (size_t k=0; k<kernel_mgr.num_kernels(); ++k)
{
float64_t const width=widths[k];
terms_t t;
for (auto j=0; j<n_x+n_y; ++j)
{
for (auto i=0; i<n_x+n_y; ++i)
{
auto distance=m_distance->distance(i, j);
auto kernel=CMath::exp(-distance*distance/width);
add_term(t, kernel, i, j);
}
}

t.term[0]=2*(t.term[0]-t.diag[0]);
t.term[1]=2*(t.term[1]-t.diag[1]);
SG_SDEBUG("term_0 sum (without diagonal) = %f!\n", t.term[0]);
SG_SDEBUG("term_1 sum (without diagonal) = %f!\n", t.term[1]);
if (s_type!=ST_BIASED_FULL)
{
t.term[0]/=n_x*(n_x-1);
t.term[1]/=n_y*(n_y-1);
}
else
{
t.term[0]+=t.diag[0];
t.term[1]+=t.diag[1];
SG_SDEBUG("term_0 sum (with diagonal) = %f!\n", t.term[0]);
SG_SDEBUG("term_1 sum (with diagonal) = %f!\n", t.term[1]);
t.term[0]/=n_x*n_x;
t.term[1]/=n_y*n_y;
}
SG_SDEBUG("term_0 (normalized) = %f!\n", t.term[0]);
SG_SDEBUG("term_1 (normalized) = %f!\n", t.term[1]);

SG_SDEBUG("term_2 sum (with diagonal) = %f!\n", t.term[2]);
if (s_type==ST_UNBIASED_INCOMPLETE)
{
t.term[2]-=t.diag[2];
SG_SDEBUG("term_2 sum (without diagonal) = %f!\n", t.term[2]);
t.term[2]/=n_x*(n_x-1);
}
else
t.term[2]/=n_x*n_y;
SG_SDEBUG("term_2 (normalized) = %f!\n", t.term[2]);

result[k]=t.term[0]+t.term[1]-2*t.term[2];
SG_SDEBUG("result[%d] = %f!\n", k, result[k]);
}

SG_SDEBUG("Leaving!\n");
return result;
}
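
In operator() above, term[0] and term[1] accumulate the upper triangles (diagonals tracked separately in diag) of the within-sample kernel blocks K_xx and K_yy, while term[2] accumulates the full cross block K_yx; add_term routes each entry by comparing the joint-sample indices against n_x. The three statistic types assembled from these sums correspond to the standard MMD^2 estimators, written out here for reference (the formulas are not part of the commit):

    unbiased full:
        \widehat{\mathrm{MMD}}_u^2 = \frac{1}{n_x(n_x-1)} \sum_{i \neq j} k(x_i, x_j)
                                   + \frac{1}{n_y(n_y-1)} \sum_{i \neq j} k(y_i, y_j)
                                   - \frac{2}{n_x n_y} \sum_{i, j} k(x_i, y_j)

    biased full (ST_BIASED_FULL):
        \widehat{\mathrm{MMD}}_b^2 = \frac{1}{n_x^2} \sum_{i, j} k(x_i, x_j)
                                   + \frac{1}{n_y^2} \sum_{i, j} k(y_i, y_j)
                                   - \frac{2}{n_x n_y} \sum_{i, j} k(x_i, y_j)

    unbiased incomplete (ST_UNBIASED_INCOMPLETE, assuming n_x = n_y = n):
        \widehat{\mathrm{MMD}}_{u,\mathrm{inc}}^2 = \frac{1}{n(n-1)} \Big( \sum_{i \neq j} k(x_i, x_j)
                                                  + \sum_{i \neq j} k(y_i, y_j) - 2 \sum_{i \neq j} k(x_i, y_j) \Big)

Each entry of the returned vector is one such estimate, one per kernel width.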
70 changes: 70 additions & 0 deletions src/shogun/statistical_testing/internals/mmd/MultiKernelMMD.h
@@ -0,0 +1,70 @@
/*
* Copyright (c) The Shogun Machine Learning Toolbox
* Written (w) 2014 - 2016 Soumyajit De
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the Shogun Development Team.
*/

#ifndef MULTI_KERNEL_MMD_H_
#define MULTI_KERNEL_MMD_H_

#include <memory>
#include <shogun/statistical_testing/MMD.h>

namespace shogun
{

template <typename T> class SGVector;
class CCustomDistance;

namespace internal
{

namespace mmd
{

class MultiKernelMMD
{
public:
MultiKernelMMD(index_t nx, index_t ny, EStatisticType stype);
SGVector<float64_t> operator()(const KernelManager& kernel_mgr) const;
void set_distance(CCustomDistance* distance);
private:
struct terms_t;
const index_t n_x;
const index_t n_y;
const EStatisticType s_type;
std::shared_ptr<CCustomDistance> m_distance;
void add_term(terms_t&, float32_t, index_t, index_t) const;
};

}

}

}

#endif // MULTI_KERNEL_MMD_H_
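
A minimal sketch of how the helper declared above is driven; it mirrors CQuadraticTimeMMD::compute_statistic from earlier in this commit. The free function, the extra includes, and the ST_UNBIASED_FULL enum value are assumptions for illustration, not code from the commit:

#include <shogun/lib/SGVector.h>
#include <shogun/distance/CustomDistance.h>
#include <shogun/statistical_testing/MMD.h>
#include <shogun/statistical_testing/internals/KernelManager.h>
#include <shogun/statistical_testing/internals/mmd/MultiKernelMMD.h>

using namespace shogun;
using namespace internal;
using namespace mmd;

// One un-normalized MMD^2 estimate per kernel in kernel_mgr, all evaluated
// from a single precomputed pairwise-distance matrix.
SGVector<float64_t> multi_kernel_statistics(CCustomDistance* distance,
        const KernelManager& kernel_mgr, index_t nx, index_t ny)
{
    MultiKernelMMD compute(nx, ny, ST_UNBIASED_FULL);
    compute.set_distance(distance);   // wrapped in a std::shared_ptr internally
    return compute(kernel_mgr);       // expects Gaussian kernels only
}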
@@ -220,6 +220,7 @@ CKernel* CKernelSelectionStrategy::select_kernel(CMMD* estimator)
return self->policy->select_kernel();
}

// TODO call this method when test train mode is turned off
void CKernelSelectionStrategy::erase_intermediate_results()
{
self->policy=nullptr;
@@ -40,6 +40,7 @@ namespace shogun

class CKernel;
class CMMD;
class CQuadraticTimeMMD;
template <class> class SGVector;
template <class> class SGMatrix;

@@ -60,6 +61,7 @@ enum EKernelSelectionMethod : uint32_t
class CKernelSelectionStrategy : public CSGObject
{
friend class CMMD;
friend class CQuadraticTimeMMD;
public:
CKernelSelectionStrategy();
explicit CKernelSelectionStrategy(EKernelSelectionMethod method);
@@ -34,6 +34,7 @@
#include <shogun/lib/SGMatrix.h>
#include <shogun/kernel/Kernel.h>
#include <shogun/statistical_testing/MMD.h>
#include <shogun/statistical_testing/QuadraticTimeMMD.h>
#include <shogun/statistical_testing/internals/KernelManager.h>
#include <shogun/statistical_testing/kernelselection/internals/MaxMeasure.h>

@@ -61,7 +62,7 @@ SGMatrix<float64_t> MaxMeasure::get_measure_matrix()

void MaxMeasure::init_measures()
{
const size_t num_kernels=kernel_mgr.num_kernels();
const index_t num_kernels=kernel_mgr.num_kernels();
REQUIRE(num_kernels>0, "Number of kernels is %d!\n", kernel_mgr.num_kernels());
if (measures.size()!=num_kernels)
measures=SGVector<float64_t>(num_kernels);
@@ -70,23 +71,34 @@ void MaxMeasure::init_measures()

void MaxMeasure::compute_measures()
{
init_measures();
REQUIRE(estimator!=nullptr, "Estimator is not set!\n");
auto existing_kernel=estimator->get_kernel();
const size_t num_kernels=kernel_mgr.num_kernels();
for (size_t i=0; i<num_kernels; ++i)
CQuadraticTimeMMD* mmd=dynamic_cast<CQuadraticTimeMMD*>(estimator);
if (mmd!=nullptr) // TODO put check for translational invariant kernels
{
auto kernel=kernel_mgr.kernel_at(i);
estimator->set_kernel(kernel);
measures[i]=estimator->compute_statistic();
estimator->cleanup();
// TODO only call this for translational invariant kernels
// for the rest, call the below one
measures=mmd->compute_statistic(kernel_mgr);
}
else
{
init_measures();
auto existing_kernel=estimator->get_kernel();
const size_t num_kernels=kernel_mgr.num_kernels();
for (size_t i=0; i<num_kernels; ++i)
{
auto kernel=kernel_mgr.kernel_at(i);
estimator->set_kernel(kernel);
measures[i]=estimator->compute_statistic();
estimator->cleanup();
}
estimator->set_kernel(existing_kernel);
}
estimator->set_kernel(existing_kernel);
}

CKernel* MaxMeasure::select_kernel()
{
compute_measures();
ASSERT(measures.size()==kernel_mgr.num_kernels());
auto max_element=std::max_element(measures.vector, measures.vector+measures.vlen);
auto max_idx=std::distance(measures.vector, max_element);
SG_SDEBUG("Selected kernel at %d position!\n", max_idx);
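
With the change above, compute_measures has two paths: a multi-kernel fast path when the estimator is a CQuadraticTimeMMD (one pass over a shared distance matrix via compute_statistic(kernel_mgr)), and the original per-kernel loop for any other estimator. In both cases select_kernel then returns the kernel with the largest estimated statistic, i.e. (formula added for reference, not in the commit)

    k^* = \arg\max_{k \in \mathcal{K}} \widehat{\mathrm{MMD}}_k^2(X, Y),

where \mathcal{K} is the set of kernels held by kernel_mgr.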
@@ -54,7 +54,6 @@ MedianHeuristic::MedianHeuristic(KernelManager& km, CMMD* est) : KernelSelection

MedianHeuristic::~MedianHeuristic()
{
SG_UNREF(distance);
}

void MedianHeuristic::init_measures()
@@ -64,9 +63,7 @@ void MedianHeuristic::init_measures()

void MedianHeuristic::compute_measures()
{
SG_UNREF(distance);
distance=estimator->compute_distance();
SG_REF(distance);
distance=std::shared_ptr<CCustomDistance>(estimator->compute_distance());
n=distance->get_num_vec_lhs();
REQUIRE(distance->get_num_vec_lhs()==distance->get_num_vec_rhs(),
"Distance matrix is supposed to be a square matrix (was of dimension %dX%d)!\n",
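
The MedianHeuristic hunk above swaps the manual SG_REF/SG_UNREF bookkeeping for a std::shared_ptr that owns the CCustomDistance returned by compute_distance() (a new object, per the %newobject rule added to Statistics.i). A hypothetical variant, relevant only if the returned object were also reference counted elsewhere in Shogun, would release it through SG_UNREF instead of the default delete; a sketch under that assumption (not what the commit does, and the SG_UNREF header location is assumed):

#include <memory>
#include <shogun/base/SGObject.h>              // SG_UNREF (assumed location)
#include <shogun/distance/CustomDistance.h>

// Wrap a ref-counted distance object so that dropping the last shared_ptr
// decrements Shogun's reference count rather than deleting it directly.
inline std::shared_ptr<shogun::CCustomDistance>
wrap_distance(shogun::CCustomDistance* d)
{
    return std::shared_ptr<shogun::CCustomDistance>(
            d, [](shogun::CCustomDistance* p) { SG_UNREF(p); });
}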
