Added different number of samples for quadratic time MMD

shogun-toolbox · Mar 18, 2014 · 3bb6578 · 3bb6578
1 parent 6134bc2
commit 3bb6578
Show file tree

Hide file tree

Showing 4 changed files with 315 additions and 227 deletions.
diff --git a/src/shogun/statistics/QuadraticTimeMMD.cpp b/src/shogun/statistics/QuadraticTimeMMD.cpp
@@ -31,11 +31,13 @@
 #include <shogun/statistics/QuadraticTimeMMD.h>
 #include <shogun/features/Features.h>
 #include <shogun/mathematics/Statistics.h>
+#include <shogun/mathematics/eigen3.h>
 #include <shogun/kernel/Kernel.h>
 #include <shogun/kernel/CombinedKernel.h>
 #include <shogun/kernel/CustomKernel.h>
 
 using namespace shogun;
+using namespace Eigen;
 
 CQuadraticTimeMMD::CQuadraticTimeMMD() : CKernelTwoSampleTest()
 {
@@ -47,24 +49,12 @@ CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p_and_q,
 		CKernelTwoSampleTest(kernel, p_and_q, m)
 {
 	init();
-
-	if (p_and_q && m!=p_and_q->get_num_vectors()/2)
-	{
-		SG_ERROR("Only features with equal number of vectors "
-				"are currently possible\n");
-	}
 }
 
 CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p,
 		CFeatures* q) : CKernelTwoSampleTest(kernel, p, q)
 {
 	init();
-
-	if (p && q && p->get_num_vectors()!=q->get_num_vectors())
-	{
-		SG_ERROR("Only features with equal number of vectors "
-				"are currently possible\n");
-	}
 }
 
 CQuadraticTimeMMD::CQuadraticTimeMMD(CCustomKernel* custom_kernel, index_t m) :
@@ -75,7 +65,6 @@ CQuadraticTimeMMD::CQuadraticTimeMMD(CCustomKernel* custom_kernel, index_t m) :
 
 CQuadraticTimeMMD::~CQuadraticTimeMMD()
 {
-
 }
 
 void CQuadraticTimeMMD::init()
@@ -98,6 +87,19 @@ float64_t CQuadraticTimeMMD::compute_unbiased_statistic()
 {
 	/* split computations into three terms from JLMR paper (see documentation )*/
 	index_t m=m_m;
+	index_t n=0;
+
+	/* check if kernel is precomputed (custom kernel) */
+	if (m_kernel && m_kernel->get_kernel_type()==K_CUSTOM)
+		n=m_kernel->get_num_vec_lhs()-m;
+	else
+	{
+		REQUIRE(m_p_and_q, "The samples are not initialized!\n");
+		n=m_p_and_q->get_num_vectors()-m;
+	}
+
+	SG_DEBUG("Computing MMD_u with %d samples from p and %d samples from q\n",
+			m, n);
 
 	/* init kernel with features */
 	m_kernel->init(m_p_and_q, m_p_and_q);
@@ -110,34 +112,55 @@ float64_t CQuadraticTimeMMD::compute_unbiased_statistic()
 		for (index_t j=0; j<m; ++j)
 			first+=i==j ? 0 : m_kernel->kernel(i,j);
 	}
-	first/=(m-1);
+	first/=m*(m-1);
 
 	/* second term */
 	float64_t second=0;
-	for (index_t i=m_m; i<m_m+m; ++i)
+	for (index_t i=m; i<m+n; ++i)
 	{
 		/* ensure i!=j while adding up */
-		for (index_t j=m_m; j<m_m+m; ++j)
+		for (index_t j=m; j<m+n; ++j)
 			second+=i==j ? 0 : m_kernel->kernel(i,j);
 	}
-	second/=(m-1);
+	second/=n*(n-1);
 
 	/* third term */
 	float64_t third=0;
 	for (index_t i=0; i<m; ++i)
 	{
-		for (index_t j=m_m; j<m_m+m; ++j)
+		for (index_t j=m; j<m+n; ++j)
 			third+=m_kernel->kernel(i,j);
 	}
-	third*=2.0/m;
+	third*=2.0/m/n;
+
+	SG_DEBUG("first=%f, second=%f, third=%f\n", first, second, third);
 
-	return first+second-third;
+	float64_t mmd_u=first+second-third;
+
+	/* return m*MMD_u when m=n, (m+n)*MMD_u in general case */
+	if (m==n)
+		return m*mmd_u;
+
+	return (m+n)*mmd_u;
 }
 
 float64_t CQuadraticTimeMMD::compute_biased_statistic()
 {
 	/* split computations into three terms from JLMR paper (see documentation )*/
 	index_t m=m_m;
+	index_t n=0;
+
+	/* check if kernel is precomputed (custom kernel) */
+	if (m_kernel && m_kernel->get_kernel_type()==K_CUSTOM)
+		n=m_kernel->get_num_vec_lhs()-m;
+	else
+	{
+		REQUIRE(m_p_and_q, "The samples are not initialized!\n");
+		n=m_p_and_q->get_num_vectors()-m;
+	}
+
+	SG_DEBUG("Computing MMD_b with %d samples from p and %d samples from q\n",
+			m, n);
 
 	/* init kernel with features */
 	m_kernel->init(m_p_and_q, m_p_and_q);
@@ -149,27 +172,35 @@ float64_t CQuadraticTimeMMD::compute_biased_statistic()
 		for (index_t j=0; j<m; ++j)
 			first+=m_kernel->kernel(i,j);
 	}
-	first/=m;
+	first/=m*m;
 
 	/* second term */
 	float64_t second=0;
-	for (index_t i=m_m; i<m_m+m; ++i)
+	for (index_t i=m; i<m+n; ++i)
 	{
-		for (index_t j=m_m; j<m_m+m; ++j)
+		for (index_t j=m; j<m+n; ++j)
 			second+=m_kernel->kernel(i,j);
 	}
-	second/=m;
+	second/=n*n;
 
 	/* third term */
 	float64_t third=0;
 	for (index_t i=0; i<m; ++i)
 	{
-		for (index_t j=m_m; j<m_m+m; ++j)
+		for (index_t j=m; j<m+n; ++j)
 			third+=m_kernel->kernel(i,j);
 	}
-	third*=2.0/m;
+	third*=2.0/m/n;
 
-	return first+second-third;
+	SG_DEBUG("first=%f, second=%f, third=%f\n", first, second, third);
+
+	float64_t mmd_b=first+second-third;
+
+	/* return m*MMD_b when m=n, (m+n)*MMD_b in general case */
+	if (m==n)
+		return m*mmd_b;
+
+	return (m+n)*mmd_b;
 }
 
 float64_t CQuadraticTimeMMD::compute_statistic()
@@ -202,16 +233,16 @@ float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic)
 	{
 	case MMD2_SPECTRUM:
 	{
-#ifdef HAVE_LAPACK
+#ifdef HAVE_EIGEN3
 		/* get samples from null-distribution and compute p-value of statistic */
 		SGVector<float64_t> null_samples=sample_null_spectrum(
 				m_num_samples_spectrum, m_num_eigenvalues_spectrum);
 		null_samples.qsort();
 		index_t pos=null_samples.find_position_to_insert(statistic);
 		result=1.0-((float64_t)pos)/null_samples.vlen;
-#else // HAVE_LAPACK
-		SG_ERROR("Only possible if shogun is compiled with LAPACK enabled\n");
-#endif // HAVE_LAPACK
+#else // HAVE_EIGEN3
+		SG_ERROR("Only possible if shogun is compiled with EIGEN3 enabled\n");
+#endif // HAVE_EIGEN3
 		break;
 	}
 
@@ -280,15 +311,15 @@ float64_t CQuadraticTimeMMD::compute_threshold(float64_t alpha)
 	{
 	case MMD2_SPECTRUM:
 	{
-#ifdef HAVE_LAPACK
+#ifdef HAVE_EIGEN3
 		/* get samples from null-distribution and compute threshold */
 		SGVector<float64_t> null_samples=sample_null_spectrum(
 				m_num_samples_spectrum, m_num_eigenvalues_spectrum);
 		null_samples.qsort();
 		result=null_samples[index_t(CMath::floor(null_samples.vlen*(1-alpha)))];
-#else // HAVE_LAPACK
-		SG_ERROR("Only possible if shogun is compiled with LAPACK enabled\n");
-#endif // HAVE_LAPACK
+#else // HAVE_EIGEN3
+		SG_ERROR("Only possible if shogun is compiled with EIGEN3 enabled\n");
+#endif // HAVE_EIGEN3
 		break;
 	}
 
@@ -310,7 +341,7 @@ float64_t CQuadraticTimeMMD::compute_threshold(float64_t alpha)
 }
 
 
-#ifdef HAVE_LAPACK
+#ifdef HAVE_EIGEN3
 SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples,
 		index_t num_eigenvalues)
 {
@@ -320,35 +351,30 @@ SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples,
 			"(%d, %d): No features set and no custom kernel in use!\n",
 			num_samples, num_eigenvalues);
 
-	index_t num_data;
-	if (m_kernel->get_kernel_type()==K_CUSTOM)
-		num_data=m_kernel->get_num_vec_rhs();
-	else
-		num_data=m_p_and_q->get_num_vectors();
+	index_t m=m_m;
+	index_t n=0;
 
-	if (m_m!=num_data/2)
-		SG_ERROR("Currently, only equal sample sizes are supported\n");
+	/* check if kernel is precomputed (custom kernel) */
+	if (m_kernel && m_kernel->get_kernel_type()==K_CUSTOM)
+		n=m_kernel->get_num_vec_lhs()-m;
+	else
+	{
+		REQUIRE(m_p_and_q, "The samples are not initialized!\n");
+		n=m_p_and_q->get_num_vectors()-m;
+	}
 
 	if (num_samples<=2)
 	{
 		SG_ERROR("Number of samples has to be at least 2, "
 				"better in the hundreds");
 	}
 
-	if (num_eigenvalues>2*m_m-1)
+	if (num_eigenvalues>m+n-1)
 		SG_ERROR("Number of Eigenvalues too large\n");
 
 	if (num_eigenvalues<1)
 		SG_ERROR("Number of Eigenvalues too small\n");
 
-	/* evtl. warn user not to use wrong statistic type */
-	if (m_statistic_type!=BIASED)
-	{
-		SG_WARNING("Note: provided statistic has "
-				"to be BIASED. Please ensure that! To get rid of warning,"
-				"call %s::set_statistic_type(BIASED)\n", get_name());
-	}
-
 	/* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
 	 * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
 	 * works since X and Y are concatenated here */
@@ -359,30 +385,54 @@ SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples,
 	K.center();
 
 	/* compute eigenvalues and select num_eigenvalues largest ones */
-	SGVector<float64_t> eigenvalues=
-			SGMatrix<float64_t>::compute_eigenvectors(K);
-	SGVector<float64_t> largest_ev(num_eigenvalues);
-
-	/* take largest EV, scale by 1/2/m on the fly and take abs value*/
-	for (index_t i=0; i<num_eigenvalues; ++i)
-		largest_ev[i]=CMath::abs(
-				1.0/2/m_m*eigenvalues[eigenvalues.vlen-1-i]);
+	Map<MatrixXd> c_kernel_matrix(K.matrix, K.num_rows, K.num_cols);
+	SelfAdjointEigenSolver<MatrixXd> eigen_solver(c_kernel_matrix);
+	REQUIRE(eigen_solver.info()==Eigen::Success,
+			"Eigendecomposition failed!\n");
+	index_t max_num_eigenvalues=eigen_solver.eigenvalues().rows();
+
+	/* precomputing terms with rho_x and rho_y of equation 10 in [1]
+	 * (see documentation) */
+	float64_t rho_x=float64_t(m)/(m+n);
+	float64_t rho_y=1.0-rho_x;
+	float64_t sqrt_rho_x=CMath::sqrt(rho_x);
+	float64_t sqrt_rho_y=CMath::sqrt(rho_y);
+	float64_t inv_rho_x_y=1.0/(rho_x*rho_y);
+
+	SG_DEBUG("sqrt_rho_x=%f, sqrt_rho_y=%f\n", sqrt_rho_x, sqrt_rho_y);
 
 	/* finally, sample from null distribution */
 	SGVector<float64_t> null_samples(num_samples);
 	for (index_t i=0; i<num_samples; ++i)
 	{
-		/* 2*sum(kEigs.*(randn(length(kEigs),1)).^2); */
 		null_samples[i]=0;
-		for (index_t j=0; j<largest_ev.vlen; ++j)
-			null_samples[i]+=largest_ev[j]*CMath::pow(CMath::randn_double(), 2);
+		for (index_t j=0; j<num_eigenvalues; ++j)
+		{
+			/* compute the right hand multiple of eq. 10 in [1] */
+			float64_t a_j=CMath::randn_double();
+			float64_t b_j=CMath::randn_double();
+
+			SG_DEBUG("a_j=%f, b_j=%f\n", a_j, b_j);
 
+			float64_t multiple=CMath::pow(a_j/sqrt_rho_x-b_j/sqrt_rho_y, 2);
+
+			/* take largest EV, scale by 1/(m+n) on the fly and take abs value*/
+			float64_t eigenvalue_estimate=CMath::abs(1.0/(m+n)
+				*eigen_solver.eigenvalues()[max_num_eigenvalues-1-j]);
+
+			if (m_statistic_type==UNBIASED)
+				multiple-=inv_rho_x_y;
+
+			SG_DEBUG("multiple=%f, eigenvalue=%f\n", multiple, eigenvalue_estimate);
+
+			null_samples[i]+=eigenvalue_estimate*multiple;
+		}
 		null_samples[i]*=2;
 	}
 
 	return null_samples;
 }
-#endif // HAVE_LAPACK
+#endif // HAVE_EIGEN3
 
 SGVector<float64_t> CQuadraticTimeMMD::fit_null_gamma()
 {
@@ -474,3 +524,4 @@ void CQuadraticTimeMMD::set_statistic_type(EQuadraticMMDType
 {
 	m_statistic_type=statistic_type;
 }
+