diff --git a/doc/ipython-notebooks/statistics/mmd_two_sample_testing.ipynb b/doc/ipython-notebooks/statistics/mmd_two_sample_testing.ipynb index 953c6b27171..024416256c9 100644 --- a/doc/ipython-notebooks/statistics/mmd_two_sample_testing.ipynb +++ b/doc/ipython-notebooks/statistics/mmd_two_sample_testing.ipynb @@ -545,7 +545,7 @@ "#tell Shogun to use spectrum approximation\n", "mmd.set_null_approximation_method(MMD2_SPECTRUM)\n", "mmd.set_num_eigenvalues_spectrum(num_eigen)\n", - "mmd.set_num_samples_sepctrum(num_samples)\n", + "mmd.set_num_samples_spectrum(num_samples)\n", "\n", "# the usual test interface\n", "p_value_spectrum=mmd.perform_test()\n", @@ -649,7 +649,7 @@ " mmd=QuadraticTimeMMD(kernel, feat_p, feat_q)\n", " mmd.set_null_approximation_method(MMD2_SPECTRUM)\n", " mmd.set_num_eigenvalues_spectrum(num_eigen)\n", - " mmd.set_num_samples_sepctrum(num_samples)\n", + " mmd.set_num_samples_spectrum(num_samples)\n", " mmd.set_statistic_type(BIASED)\n", " rejections_spectrum[i]=mmd.perform_test(alpha)\n", " \n", diff --git a/examples/undocumented/libshogun/statistics_quadratic_time_mmd.cpp b/examples/undocumented/libshogun/statistics_quadratic_time_mmd.cpp index 13d1d62da60..4a8032ca6b2 100644 --- a/examples/undocumented/libshogun/statistics_quadratic_time_mmd.cpp +++ b/examples/undocumented/libshogun/statistics_quadratic_time_mmd.cpp @@ -61,7 +61,7 @@ void quadratic_time_mmd() mmd->set_statistic_type(BIASED); mmd->set_null_approximation_method(MMD2_SPECTRUM); mmd->set_num_eigenvalues_spectrum(3); - mmd->set_num_samples_sepctrum(250); + mmd->set_num_samples_spectrum(250); p_value=mmd->perform_test(); /* reject if p-value is smaller than test level */ SG_SPRINT("spectrum: p!=q: %d\n", p_value CQuadraticTimeMMD::fit_null_gamma() return result; } -void CQuadraticTimeMMD::set_num_samples_sepctrum(index_t +void CQuadraticTimeMMD::set_num_samples_spectrum(index_t num_samples_spectrum) { m_num_samples_spectrum=num_samples_spectrum; diff --git a/src/shogun/statistics/QuadraticTimeMMD.h b/src/shogun/statistics/QuadraticTimeMMD.h index a1cc2f6c01e..2f24b954365 100644 --- a/src/shogun/statistics/QuadraticTimeMMD.h +++ b/src/shogun/statistics/QuadraticTimeMMD.h @@ -260,7 +260,7 @@ class CQuadraticTimeMMD : public CKernelTwoSampleTest * @param num_samples_spectrum number of samples to draw from * approximate null-distributrion */ - void set_num_samples_sepctrum(index_t num_samples_spectrum); + void set_num_samples_spectrum(index_t num_samples_spectrum); /** setter for number of eigenvalues to use in spectrum based p-value * computation. Maximum is m_m+m_n-1 diff --git a/tests/unit/statistics/QuadraticTimeMMD_unittest.cc b/tests/unit/statistics/QuadraticTimeMMD_unittest.cc index 3e92af1802d..55d91b197cf 100644 --- a/tests/unit/statistics/QuadraticTimeMMD_unittest.cc +++ b/tests/unit/statistics/QuadraticTimeMMD_unittest.cc @@ -13,9 +13,11 @@ #include #include #include +#include #include using namespace shogun; +using namespace Eigen; TEST(QuadraticTimeMMD,test_quadratic_mmd_biased) { @@ -188,6 +190,65 @@ TEST(QuadraticTimeMMD, test_quadratic_mmd_biased_different_num_samples) SG_UNREF(features_q); } +#ifdef HAVE_EIGEN3 +TEST(QuadraticTimeMMD, null_approximation_spectrum_different_num_samples) +{ + const index_t m=20; + const index_t n=30; + const index_t dim=3; + + /* use fixed seed */ + sg_rand->set_seed(12345); + + float64_t difference=0.5; + + /* streaming data generator for mean shift distributions */ + CMeanShiftDataGenerator* gen_p=new CMeanShiftDataGenerator(0, dim, 0); + CMeanShiftDataGenerator* gen_q=new CMeanShiftDataGenerator(difference, dim, 0); + + /* stream some data from generator */ + CFeatures* feat_p=gen_p->get_streamed_features(m); + CFeatures* feat_q=gen_q->get_streamed_features(n); + + /* shoguns kernel width is different */ + float64_t sigma=2; + float64_t sq_sigma_twice=sigma*sigma*2; + CGaussianKernel* kernel=new CGaussianKernel(10, sq_sigma_twice); + + /* create MMD instance, convienience constructor */ + CQuadraticTimeMMD* mmd=new CQuadraticTimeMMD(kernel, feat_p, feat_q); + + index_t num_null_samples=250; + index_t num_eigenvalues=10; + mmd->set_num_samples_spectrum(num_null_samples); + mmd->set_null_approximation_method(MMD2_SPECTRUM); + mmd->set_num_eigenvalues_spectrum(num_eigenvalues); + + /* biased case */ + + /* compute p-value using spectrum approximation for null distribution and + * assert against local machine computed result */ + mmd->set_statistic_type(BIASED); + float64_t p_value_spectrum=mmd->perform_test(); + EXPECT_NEAR(p_value_spectrum, 0.0, 1E-10); + + /* unbiased case */ + + /* compute p-value using spectrum approximation for null distribution and + * assert against local machine computed result */ + mmd->set_statistic_type(UNBIASED); + p_value_spectrum=mmd->perform_test(); + EXPECT_NEAR(p_value_spectrum, 0.004, 1E-10); + + /* clean up */ + SG_UNREF(mmd); + SG_UNREF(feat_p); + SG_UNREF(feat_q); + SG_UNREF(gen_p); + SG_UNREF(gen_q); +} +#endif // HAVE_EIGEN3 + TEST(QuadraticTimeMMD,test_quadratic_mmd_precomputed_kernel) { index_t m=8; @@ -263,6 +324,7 @@ TEST(QuadraticTimeMMD,test_quadratic_mmd_precomputed_kernel) SG_UNREF(p_and_q); } +#ifdef HAVE_EIGEN3 TEST(QuadraticTimeMMD,custom_kernel_vs_normal_kernel) { /* number of examples kept low in order to make things fast */ @@ -287,6 +349,15 @@ TEST(QuadraticTimeMMD,custom_kernel_vs_normal_kernel) * copies p and q and does not reference them */ CQuadraticTimeMMD* mmd=new CQuadraticTimeMMD(kernel, feat_p, feat_q); + /* set up for a precomputed custom kernel using merged features p_and_q */ + CGaussianKernel* kernel2=new CGaussianKernel(10, width); + CFeatures* p_and_q=mmd->get_p_and_q(); + kernel2->init(p_and_q, p_and_q); + CCustomKernel* precomputed=new CCustomKernel(kernel2); + CQuadraticTimeMMD* mmd2=new CQuadraticTimeMMD(precomputed, m); + SG_UNREF(p_and_q); + SG_UNREF(kernel2); + /* perform test: compute p-value and test if null-hypothesis is rejected for * a test level of 0.05 */ float64_t alpha=0.05; @@ -295,22 +366,19 @@ TEST(QuadraticTimeMMD,custom_kernel_vs_normal_kernel) mmd->set_statistic_type(BIASED); mmd->set_num_null_samples(3); mmd->set_num_eigenvalues_spectrum(3); - mmd->set_num_samples_sepctrum(250); + mmd->set_num_samples_spectrum(250); + + mmd2->set_null_approximation_method(PERMUTATION); + mmd2->set_statistic_type(BIASED); + mmd2->set_num_null_samples(3); + mmd2->set_num_eigenvalues_spectrum(3); + mmd2->set_num_samples_spectrum(250); /* compute tpye I and II error using normal and precomputed kernel */ index_t num_trials=3; - SGVector type_I_mmds(num_trials); - SGVector type_I_threshs_boot(num_trials); - SGVector type_I_threshs_spectrum(num_trials); - SGVector type_I_threshs_gamma(num_trials); - SGVector type_II_mmds(num_trials); - SGVector type_II_threshs_boot(num_trials); - SGVector type_II_threshs_spectrum(num_trials); - SGVector type_II_threshs_gamma(num_trials); SGVector inds(2*m); inds.range_fill(); - CFeatures* p_and_q=mmd->get_p_and_q(); /* use fixed seed */ CMath::init_random(1); @@ -318,91 +386,63 @@ TEST(QuadraticTimeMMD,custom_kernel_vs_normal_kernel) { /* this effectively means that p=q - rejecting is tpye I error */ inds.permute(); + + /* setting seed for Gaussian samples used in spectrum approximation method */ + sg_rand->set_seed(1); + + /* first, we compute using normal kernel */ p_and_q->add_subset(inds); - type_I_mmds[i]=mmd->compute_statistic(); + float64_t type_I_mmds=mmd->compute_statistic(); mmd->set_null_approximation_method(PERMUTATION); - type_I_threshs_boot[i]=mmd->compute_threshold(alpha); + float64_t type_I_threshs_boot=mmd->compute_threshold(alpha); mmd->set_null_approximation_method(MMD2_SPECTRUM); - type_I_threshs_spectrum[i]=mmd->compute_threshold(alpha); + float64_t type_I_threshs_spectrum=mmd->compute_threshold(alpha); mmd->set_null_approximation_method(MMD2_GAMMA); - type_I_threshs_gamma[i]=mmd->compute_threshold(alpha); + float64_t type_I_threshs_gamma=mmd->compute_threshold(alpha); p_and_q->remove_subset(); - type_II_mmds[i]=mmd->compute_statistic(); + float64_t type_II_mmds=mmd->compute_statistic(); mmd->set_null_approximation_method(PERMUTATION); - type_II_threshs_boot[i]=mmd->compute_threshold(alpha); + float64_t type_II_threshs_boot=mmd->compute_threshold(alpha); mmd->set_null_approximation_method(MMD2_SPECTRUM); - type_II_threshs_spectrum[i]=mmd->compute_threshold(alpha); + float64_t type_II_threshs_spectrum=mmd->compute_threshold(alpha); mmd->set_null_approximation_method(MMD2_GAMMA); - type_II_threshs_gamma[i]=mmd->compute_threshold(alpha); + float64_t type_II_threshs_gamma=mmd->compute_threshold(alpha); - } - SG_UNREF(p_and_q); + /* now compute using precomputed custom kernel */ + + /* setting seed for Gaussian samples used in spectrum approximation method */ + sg_rand->set_seed(1); - //SG_SPRINT("normal kernel\n"); - //type_I_mmds.display_vector("type_I_mmds"); - //type_I_threshs_boot.display_vector("type_I_threshs_boot"); - //type_II_mmds.display_vector("type_II_mmds"); - //type_II_threshs_boot.display_vector("type_II_threshs_boot"); - - /* same thing with precomputed kernel */ - SGVector type_I_mmds_pre(num_trials); - SGVector type_I_threshs_boot_pre(num_trials); - SGVector type_I_threshs_spectrum_pre(num_trials); - SGVector type_I_threshs_gamma_pre(num_trials); - SGVector type_II_mmds_pre(num_trials); - SGVector type_II_threshs_boot_pre(num_trials); - SGVector type_II_threshs_spectrum_pre(num_trials); - SGVector type_II_threshs_gamma_pre(num_trials); - kernel->init(p_and_q, p_and_q); - CCustomKernel* precomputed=new CCustomKernel(kernel); - CQuadraticTimeMMD* mmd2=new CQuadraticTimeMMD(precomputed, m); - mmd2->set_null_approximation_method(PERMUTATION); - mmd2->set_num_null_samples(3); - inds.range_fill(); - CMath::init_random(1); - for (index_t i=0; iadd_row_subset(inds); precomputed->add_col_subset(inds); - type_I_mmds_pre[i]=mmd2->compute_statistic(); - mmd->set_null_approximation_method(PERMUTATION); - type_I_threshs_boot_pre[i]=mmd2->compute_threshold(alpha); - mmd->set_null_approximation_method(MMD2_SPECTRUM); - type_I_threshs_spectrum_pre[i]=mmd2->compute_threshold(alpha); - mmd->set_null_approximation_method(MMD2_GAMMA); - type_I_threshs_gamma_pre[i]=mmd2->compute_threshold(alpha); + float64_t type_I_mmds_pre=mmd2->compute_statistic(); + mmd2->set_null_approximation_method(PERMUTATION); + float64_t type_I_threshs_boot_pre=mmd2->compute_threshold(alpha); + mmd2->set_null_approximation_method(MMD2_SPECTRUM); + float64_t type_I_threshs_spectrum_pre=mmd2->compute_threshold(alpha); + mmd2->set_null_approximation_method(MMD2_GAMMA); + float64_t type_I_threshs_gamma_pre=mmd2->compute_threshold(alpha); precomputed->remove_row_subset(); precomputed->remove_col_subset(); - type_II_mmds_pre[i]=mmd2->compute_statistic(); - mmd->set_null_approximation_method(PERMUTATION); - type_II_threshs_boot_pre[i]=mmd2->compute_threshold(alpha); - mmd->set_null_approximation_method(MMD2_SPECTRUM); - type_II_threshs_spectrum_pre[i]=mmd2->compute_threshold(alpha); - mmd->set_null_approximation_method(MMD2_GAMMA); - type_II_threshs_gamma_pre[i]=mmd2->compute_threshold(alpha); - - } - - //SG_SPRINT("precomputed kernel\n"); - //type_I_mmds_pre.display_vector("type_I_mmds"); - //type_I_threshs_boot_pre.display_vector("type_I_threshs_boot"); - //type_II_mmds_pre.display_vector("type_II_mmds"); - //type_II_threshs_boot_pre.display_vector("type_II_threshs_boot"); - - for (index_t i=0; icompute_statistic(); + mmd2->set_null_approximation_method(PERMUTATION); + float64_t type_II_threshs_boot_pre=mmd2->compute_threshold(alpha); + mmd2->set_null_approximation_method(MMD2_SPECTRUM); + float64_t type_II_threshs_spectrum_pre=mmd2->compute_threshold(alpha); + mmd2->set_null_approximation_method(MMD2_GAMMA); + float64_t type_II_threshs_gamma_pre=mmd2->compute_threshold(alpha); + + /* assert results from both */ + EXPECT_NEAR(type_I_mmds, type_I_mmds_pre, 1E-6); + EXPECT_NEAR(type_I_threshs_boot, type_I_threshs_boot_pre, 1E-6); + EXPECT_NEAR(type_I_threshs_spectrum, type_I_threshs_spectrum_pre, 1E-6); + EXPECT_NEAR(type_I_threshs_gamma, type_I_threshs_gamma_pre, 1E-6); + EXPECT_NEAR(type_II_mmds, type_II_mmds_pre, 1E-6); + EXPECT_NEAR(type_II_threshs_boot, type_II_threshs_boot_pre, 1E-6); + EXPECT_NEAR(type_II_threshs_spectrum, type_II_threshs_spectrum_pre, 1E-6); + EXPECT_NEAR(type_II_threshs_gamma, type_II_threshs_gamma_pre, 1E-6); } /* clean up */ @@ -415,3 +455,4 @@ TEST(QuadraticTimeMMD,custom_kernel_vs_normal_kernel) SG_UNREF(feat_p); SG_UNREF(feat_q); } +#endif // HAVE_EIGEN3