Skip to content

Commit

Permalink
Added different number of samples for quadratic time MMD
Browse files Browse the repository at this point in the history
  • Loading branch information
lambday committed Mar 18, 2014
1 parent 6134bc2 commit 3bb6578
Show file tree
Hide file tree
Showing 4 changed files with 315 additions and 227 deletions.
179 changes: 115 additions & 64 deletions src/shogun/statistics/QuadraticTimeMMD.cpp
Expand Up @@ -31,11 +31,13 @@
#include <shogun/statistics/QuadraticTimeMMD.h>
#include <shogun/features/Features.h>
#include <shogun/mathematics/Statistics.h>
#include <shogun/mathematics/eigen3.h>
#include <shogun/kernel/Kernel.h>
#include <shogun/kernel/CombinedKernel.h>
#include <shogun/kernel/CustomKernel.h>

using namespace shogun;
using namespace Eigen;

CQuadraticTimeMMD::CQuadraticTimeMMD() : CKernelTwoSampleTest()
{
Expand All @@ -47,24 +49,12 @@ CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p_and_q,
CKernelTwoSampleTest(kernel, p_and_q, m)
{
init();

if (p_and_q && m!=p_and_q->get_num_vectors()/2)
{
SG_ERROR("Only features with equal number of vectors "
"are currently possible\n");
}
}

CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p,
CFeatures* q) : CKernelTwoSampleTest(kernel, p, q)
{
init();

if (p && q && p->get_num_vectors()!=q->get_num_vectors())
{
SG_ERROR("Only features with equal number of vectors "
"are currently possible\n");
}
}

CQuadraticTimeMMD::CQuadraticTimeMMD(CCustomKernel* custom_kernel, index_t m) :
Expand All @@ -75,7 +65,6 @@ CQuadraticTimeMMD::CQuadraticTimeMMD(CCustomKernel* custom_kernel, index_t m) :

CQuadraticTimeMMD::~CQuadraticTimeMMD()
{

}

void CQuadraticTimeMMD::init()
Expand All @@ -98,6 +87,19 @@ float64_t CQuadraticTimeMMD::compute_unbiased_statistic()
{
/* split computations into three terms from JLMR paper (see documentation )*/
index_t m=m_m;
index_t n=0;

/* check if kernel is precomputed (custom kernel) */
if (m_kernel && m_kernel->get_kernel_type()==K_CUSTOM)
n=m_kernel->get_num_vec_lhs()-m;
else
{
REQUIRE(m_p_and_q, "The samples are not initialized!\n");
n=m_p_and_q->get_num_vectors()-m;
}

SG_DEBUG("Computing MMD_u with %d samples from p and %d samples from q\n",
m, n);

/* init kernel with features */
m_kernel->init(m_p_and_q, m_p_and_q);
Expand All @@ -110,34 +112,55 @@ float64_t CQuadraticTimeMMD::compute_unbiased_statistic()
for (index_t j=0; j<m; ++j)
first+=i==j ? 0 : m_kernel->kernel(i,j);
}
first/=(m-1);
first/=m*(m-1);

/* second term */
float64_t second=0;
for (index_t i=m_m; i<m_m+m; ++i)
for (index_t i=m; i<m+n; ++i)
{
/* ensure i!=j while adding up */
for (index_t j=m_m; j<m_m+m; ++j)
for (index_t j=m; j<m+n; ++j)
second+=i==j ? 0 : m_kernel->kernel(i,j);
}
second/=(m-1);
second/=n*(n-1);

/* third term */
float64_t third=0;
for (index_t i=0; i<m; ++i)
{
for (index_t j=m_m; j<m_m+m; ++j)
for (index_t j=m; j<m+n; ++j)
third+=m_kernel->kernel(i,j);
}
third*=2.0/m;
third*=2.0/m/n;

SG_DEBUG("first=%f, second=%f, third=%f\n", first, second, third);

return first+second-third;
float64_t mmd_u=first+second-third;

/* return m*MMD_u when m=n, (m+n)*MMD_u in general case */
if (m==n)
return m*mmd_u;

return (m+n)*mmd_u;
}

float64_t CQuadraticTimeMMD::compute_biased_statistic()
{
/* split computations into three terms from JLMR paper (see documentation )*/
index_t m=m_m;
index_t n=0;

/* check if kernel is precomputed (custom kernel) */
if (m_kernel && m_kernel->get_kernel_type()==K_CUSTOM)
n=m_kernel->get_num_vec_lhs()-m;
else
{
REQUIRE(m_p_and_q, "The samples are not initialized!\n");
n=m_p_and_q->get_num_vectors()-m;
}

SG_DEBUG("Computing MMD_b with %d samples from p and %d samples from q\n",
m, n);

/* init kernel with features */
m_kernel->init(m_p_and_q, m_p_and_q);
Expand All @@ -149,27 +172,35 @@ float64_t CQuadraticTimeMMD::compute_biased_statistic()
for (index_t j=0; j<m; ++j)
first+=m_kernel->kernel(i,j);
}
first/=m;
first/=m*m;

/* second term */
float64_t second=0;
for (index_t i=m_m; i<m_m+m; ++i)
for (index_t i=m; i<m+n; ++i)
{
for (index_t j=m_m; j<m_m+m; ++j)
for (index_t j=m; j<m+n; ++j)
second+=m_kernel->kernel(i,j);
}
second/=m;
second/=n*n;

/* third term */
float64_t third=0;
for (index_t i=0; i<m; ++i)
{
for (index_t j=m_m; j<m_m+m; ++j)
for (index_t j=m; j<m+n; ++j)
third+=m_kernel->kernel(i,j);
}
third*=2.0/m;
third*=2.0/m/n;

return first+second-third;
SG_DEBUG("first=%f, second=%f, third=%f\n", first, second, third);

float64_t mmd_b=first+second-third;

/* return m*MMD_b when m=n, (m+n)*MMD_b in general case */
if (m==n)
return m*mmd_b;

return (m+n)*mmd_b;
}

float64_t CQuadraticTimeMMD::compute_statistic()
Expand Down Expand Up @@ -202,16 +233,16 @@ float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic)
{
case MMD2_SPECTRUM:
{
#ifdef HAVE_LAPACK
#ifdef HAVE_EIGEN3
/* get samples from null-distribution and compute p-value of statistic */
SGVector<float64_t> null_samples=sample_null_spectrum(
m_num_samples_spectrum, m_num_eigenvalues_spectrum);
null_samples.qsort();
index_t pos=null_samples.find_position_to_insert(statistic);
result=1.0-((float64_t)pos)/null_samples.vlen;
#else // HAVE_LAPACK
SG_ERROR("Only possible if shogun is compiled with LAPACK enabled\n");
#endif // HAVE_LAPACK
#else // HAVE_EIGEN3
SG_ERROR("Only possible if shogun is compiled with EIGEN3 enabled\n");
#endif // HAVE_EIGEN3
break;
}

Expand Down Expand Up @@ -280,15 +311,15 @@ float64_t CQuadraticTimeMMD::compute_threshold(float64_t alpha)
{
case MMD2_SPECTRUM:
{
#ifdef HAVE_LAPACK
#ifdef HAVE_EIGEN3
/* get samples from null-distribution and compute threshold */
SGVector<float64_t> null_samples=sample_null_spectrum(
m_num_samples_spectrum, m_num_eigenvalues_spectrum);
null_samples.qsort();
result=null_samples[index_t(CMath::floor(null_samples.vlen*(1-alpha)))];
#else // HAVE_LAPACK
SG_ERROR("Only possible if shogun is compiled with LAPACK enabled\n");
#endif // HAVE_LAPACK
#else // HAVE_EIGEN3
SG_ERROR("Only possible if shogun is compiled with EIGEN3 enabled\n");
#endif // HAVE_EIGEN3
break;
}

Expand All @@ -310,7 +341,7 @@ float64_t CQuadraticTimeMMD::compute_threshold(float64_t alpha)
}


#ifdef HAVE_LAPACK
#ifdef HAVE_EIGEN3
SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples,
index_t num_eigenvalues)
{
Expand All @@ -320,35 +351,30 @@ SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples,
"(%d, %d): No features set and no custom kernel in use!\n",
num_samples, num_eigenvalues);

index_t num_data;
if (m_kernel->get_kernel_type()==K_CUSTOM)
num_data=m_kernel->get_num_vec_rhs();
else
num_data=m_p_and_q->get_num_vectors();
index_t m=m_m;
index_t n=0;

if (m_m!=num_data/2)
SG_ERROR("Currently, only equal sample sizes are supported\n");
/* check if kernel is precomputed (custom kernel) */
if (m_kernel && m_kernel->get_kernel_type()==K_CUSTOM)
n=m_kernel->get_num_vec_lhs()-m;
else
{
REQUIRE(m_p_and_q, "The samples are not initialized!\n");
n=m_p_and_q->get_num_vectors()-m;
}

if (num_samples<=2)
{
SG_ERROR("Number of samples has to be at least 2, "
"better in the hundreds");
}

if (num_eigenvalues>2*m_m-1)
if (num_eigenvalues>m+n-1)
SG_ERROR("Number of Eigenvalues too large\n");

if (num_eigenvalues<1)
SG_ERROR("Number of Eigenvalues too small\n");

/* evtl. warn user not to use wrong statistic type */
if (m_statistic_type!=BIASED)
{
SG_WARNING("Note: provided statistic has "
"to be BIASED. Please ensure that! To get rid of warning,"
"call %s::set_statistic_type(BIASED)\n", get_name());
}

/* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
* K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
* works since X and Y are concatenated here */
Expand All @@ -359,30 +385,54 @@ SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples,
K.center();

/* compute eigenvalues and select num_eigenvalues largest ones */
SGVector<float64_t> eigenvalues=
SGMatrix<float64_t>::compute_eigenvectors(K);
SGVector<float64_t> largest_ev(num_eigenvalues);

/* take largest EV, scale by 1/2/m on the fly and take abs value*/
for (index_t i=0; i<num_eigenvalues; ++i)
largest_ev[i]=CMath::abs(
1.0/2/m_m*eigenvalues[eigenvalues.vlen-1-i]);
Map<MatrixXd> c_kernel_matrix(K.matrix, K.num_rows, K.num_cols);
SelfAdjointEigenSolver<MatrixXd> eigen_solver(c_kernel_matrix);
REQUIRE(eigen_solver.info()==Eigen::Success,
"Eigendecomposition failed!\n");
index_t max_num_eigenvalues=eigen_solver.eigenvalues().rows();

/* precomputing terms with rho_x and rho_y of equation 10 in [1]
* (see documentation) */
float64_t rho_x=float64_t(m)/(m+n);
float64_t rho_y=1.0-rho_x;
float64_t sqrt_rho_x=CMath::sqrt(rho_x);
float64_t sqrt_rho_y=CMath::sqrt(rho_y);
float64_t inv_rho_x_y=1.0/(rho_x*rho_y);

SG_DEBUG("sqrt_rho_x=%f, sqrt_rho_y=%f\n", sqrt_rho_x, sqrt_rho_y);

/* finally, sample from null distribution */
SGVector<float64_t> null_samples(num_samples);
for (index_t i=0; i<num_samples; ++i)
{
/* 2*sum(kEigs.*(randn(length(kEigs),1)).^2); */
null_samples[i]=0;
for (index_t j=0; j<largest_ev.vlen; ++j)
null_samples[i]+=largest_ev[j]*CMath::pow(CMath::randn_double(), 2);
for (index_t j=0; j<num_eigenvalues; ++j)
{
/* compute the right hand multiple of eq. 10 in [1] */
float64_t a_j=CMath::randn_double();
float64_t b_j=CMath::randn_double();

SG_DEBUG("a_j=%f, b_j=%f\n", a_j, b_j);

float64_t multiple=CMath::pow(a_j/sqrt_rho_x-b_j/sqrt_rho_y, 2);

/* take largest EV, scale by 1/(m+n) on the fly and take abs value*/
float64_t eigenvalue_estimate=CMath::abs(1.0/(m+n)
*eigen_solver.eigenvalues()[max_num_eigenvalues-1-j]);

if (m_statistic_type==UNBIASED)
multiple-=inv_rho_x_y;

SG_DEBUG("multiple=%f, eigenvalue=%f\n", multiple, eigenvalue_estimate);

null_samples[i]+=eigenvalue_estimate*multiple;
}
null_samples[i]*=2;
}

return null_samples;
}
#endif // HAVE_LAPACK
#endif // HAVE_EIGEN3

SGVector<float64_t> CQuadraticTimeMMD::fit_null_gamma()
{
Expand Down Expand Up @@ -474,3 +524,4 @@ void CQuadraticTimeMMD::set_statistic_type(EQuadraticMMDType
{
m_statistic_type=statistic_type;
}

0 comments on commit 3bb6578

Please sign in to comment.