From 391f6448d17f22fc5788c98cad1d09ea5fe1fb65 Mon Sep 17 00:00:00 2001 From: lambday Date: Tue, 22 Mar 2016 08:43:08 +0530 Subject: [PATCH] removed old files --- .../HypothesisTest.cpp} | 36 +- .../HypothesisTest.h} | 12 +- .../internals/BlockwiseDetails.cpp | 2 +- .../internals/BlockwiseDetails.h | 0 .../internals/ComputationManager.cpp | 8 +- .../internals/ComputationManager.h | 2 +- .../internals/DataFetcher.cpp | 4 +- .../internals/DataFetcher.h | 2 +- .../internals/DataFetcherFactory.cpp | 8 +- .../internals/DataFetcherFactory.h | 2 +- .../internals/DataManager.cpp | 10 +- .../internals/DataManager.h | 4 +- .../internals/InitPerFeature.cpp | 8 +- .../internals/InitPerFeature.h | 3 +- .../internals/InitPerKernel.cpp | 4 +- .../internals/InitPerKernel.h | 2 +- .../internals/KernelManager.cpp | 4 +- .../internals/KernelManager.h | 4 +- .../internals/NextSamples.cpp | 4 +- .../internals/NextSamples.h | 4 +- .../internals/StreamingDataFetcher.cpp | 4 +- .../internals/StreamingDataFetcher.h | 6 +- .../internals/TestTypes.h | 0 .../internals/mmd/BiasedFull.cpp | 4 +- .../internals/mmd/BiasedFull.h | 2 +- .../internals/mmd/FullDirect.cpp | 4 +- .../internals/mmd/FullDirect.h | 2 +- .../internals/mmd/UnbiasedFull.cpp | 4 +- .../internals/mmd/UnbiasedFull.h | 2 +- .../internals/mmd/UnbiasedIncomplete.cpp | 4 +- .../internals/mmd/UnbiasedIncomplete.h | 2 +- .../internals/mmd/WithinBlockDirect.cpp | 4 +- .../internals/mmd/WithinBlockDirect.h | 2 +- .../internals/mmd/WithinBlockPermutation.cpp | 12 +- .../internals/mmd/WithinBlockPermutation.h | 2 +- src/shogun/statistics/HSIC.cpp | 293 ----- src/shogun/statistics/HSIC.h | 207 --- src/shogun/statistics/HypothesisTest.cpp | 125 -- src/shogun/statistics/HypothesisTest.h | 182 --- src/shogun/statistics/IndependenceTest.cpp | 132 -- src/shogun/statistics/IndependenceTest.h | 124 -- .../statistics/KernelIndependenceTest.cpp | 208 --- .../statistics/KernelIndependenceTest.h | 145 --- src/shogun/statistics/KernelMeanMatching.cpp | 109 -- src/shogun/statistics/KernelMeanMatching.h | 63 - src/shogun/statistics/KernelSelection.cpp | 86 -- src/shogun/statistics/KernelSelection.h | 104 -- src/shogun/statistics/KernelTwoSampleTest.cpp | 117 -- src/shogun/statistics/KernelTwoSampleTest.h | 126 -- src/shogun/statistics/LinearTimeMMD.cpp | 458 ------- src/shogun/statistics/LinearTimeMMD.h | 158 --- src/shogun/statistics/MMDKernelSelection.cpp | 89 -- src/shogun/statistics/MMDKernelSelection.h | 100 -- .../statistics/MMDKernelSelectionComb.cpp | 171 --- .../statistics/MMDKernelSelectionComb.h | 98 -- .../MMDKernelSelectionCombMaxL2.cpp | 74 -- .../statistics/MMDKernelSelectionCombMaxL2.h | 81 -- .../statistics/MMDKernelSelectionCombOpt.cpp | 99 -- .../statistics/MMDKernelSelectionCombOpt.h | 95 -- .../statistics/MMDKernelSelectionMax.cpp | 32 - src/shogun/statistics/MMDKernelSelectionMax.h | 60 - .../statistics/MMDKernelSelectionMedian.cpp | 237 ---- .../statistics/MMDKernelSelectionMedian.h | 83 -- .../statistics/MMDKernelSelectionOpt.cpp | 62 - src/shogun/statistics/MMDKernelSelectionOpt.h | 82 -- src/shogun/statistics/NOCCO.cpp | 268 ---- src/shogun/statistics/NOCCO.h | 224 ---- src/shogun/statistics/QuadraticTimeMMD.cpp | 1115 ----------------- src/shogun/statistics/QuadraticTimeMMD.h | 487 ------- src/shogun/statistics/StreamingMMD.cpp | 325 ----- src/shogun/statistics/StreamingMMD.h | 310 ----- src/shogun/statistics/TwoSampleTest.cpp | 176 --- src/shogun/statistics/TwoSampleTest.h | 144 --- 73 files changed, 88 insertions(+), 7138 deletions(-) rename src/shogun/{statistics/experimental/HypothesisTestExp.cpp => hypothesistest/HypothesisTest.cpp} (61%) rename src/shogun/{statistics/experimental/HypothesisTestExp.h => hypothesistest/HypothesisTest.h} (86%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/BlockwiseDetails.cpp (96%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/BlockwiseDetails.h (100%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/ComputationManager.cpp (86%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/ComputationManager.h (95%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/DataFetcher.cpp (94%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/DataFetcher.h (97%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/DataFetcherFactory.cpp (80%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/DataFetcherFactory.h (94%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/DataManager.cpp (94%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/DataManager.h (92%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/InitPerFeature.cpp (84%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/InitPerFeature.h (94%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/InitPerKernel.cpp (89%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/InitPerKernel.h (94%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/KernelManager.cpp (94%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/KernelManager.h (90%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/NextSamples.cpp (93%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/NextSamples.h (95%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/StreamingDataFetcher.cpp (94%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/StreamingDataFetcher.h (87%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/TestTypes.h (100%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/mmd/BiasedFull.cpp (90%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/mmd/BiasedFull.h (94%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/mmd/FullDirect.cpp (90%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/mmd/FullDirect.h (93%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/mmd/UnbiasedFull.cpp (91%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/mmd/UnbiasedFull.h (94%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/mmd/UnbiasedIncomplete.cpp (90%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/mmd/UnbiasedIncomplete.h (94%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/mmd/WithinBlockDirect.cpp (90%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/mmd/WithinBlockDirect.h (94%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/mmd/WithinBlockPermutation.cpp (76%) rename src/shogun/{statistics/experimental => hypothesistest}/internals/mmd/WithinBlockPermutation.h (94%) delete mode 100644 src/shogun/statistics/HSIC.cpp delete mode 100644 src/shogun/statistics/HSIC.h delete mode 100644 src/shogun/statistics/HypothesisTest.cpp delete mode 100644 src/shogun/statistics/HypothesisTest.h delete mode 100644 src/shogun/statistics/IndependenceTest.cpp delete mode 100644 src/shogun/statistics/IndependenceTest.h delete mode 100644 src/shogun/statistics/KernelIndependenceTest.cpp delete mode 100644 src/shogun/statistics/KernelIndependenceTest.h delete mode 100644 src/shogun/statistics/KernelMeanMatching.cpp delete mode 100644 src/shogun/statistics/KernelMeanMatching.h delete mode 100644 src/shogun/statistics/KernelSelection.cpp delete mode 100644 src/shogun/statistics/KernelSelection.h delete mode 100644 src/shogun/statistics/KernelTwoSampleTest.cpp delete mode 100644 src/shogun/statistics/KernelTwoSampleTest.h delete mode 100644 src/shogun/statistics/LinearTimeMMD.cpp delete mode 100644 src/shogun/statistics/LinearTimeMMD.h delete mode 100644 src/shogun/statistics/MMDKernelSelection.cpp delete mode 100644 src/shogun/statistics/MMDKernelSelection.h delete mode 100644 src/shogun/statistics/MMDKernelSelectionComb.cpp delete mode 100644 src/shogun/statistics/MMDKernelSelectionComb.h delete mode 100644 src/shogun/statistics/MMDKernelSelectionCombMaxL2.cpp delete mode 100644 src/shogun/statistics/MMDKernelSelectionCombMaxL2.h delete mode 100644 src/shogun/statistics/MMDKernelSelectionCombOpt.cpp delete mode 100644 src/shogun/statistics/MMDKernelSelectionCombOpt.h delete mode 100644 src/shogun/statistics/MMDKernelSelectionMax.cpp delete mode 100644 src/shogun/statistics/MMDKernelSelectionMax.h delete mode 100644 src/shogun/statistics/MMDKernelSelectionMedian.cpp delete mode 100644 src/shogun/statistics/MMDKernelSelectionMedian.h delete mode 100644 src/shogun/statistics/MMDKernelSelectionOpt.cpp delete mode 100644 src/shogun/statistics/MMDKernelSelectionOpt.h delete mode 100644 src/shogun/statistics/NOCCO.cpp delete mode 100644 src/shogun/statistics/NOCCO.h delete mode 100644 src/shogun/statistics/QuadraticTimeMMD.cpp delete mode 100644 src/shogun/statistics/QuadraticTimeMMD.h delete mode 100644 src/shogun/statistics/StreamingMMD.cpp delete mode 100644 src/shogun/statistics/StreamingMMD.h delete mode 100644 src/shogun/statistics/TwoSampleTest.cpp delete mode 100644 src/shogun/statistics/TwoSampleTest.h diff --git a/src/shogun/statistics/experimental/HypothesisTestExp.cpp b/src/shogun/hypothesistest/HypothesisTest.cpp similarity index 61% rename from src/shogun/statistics/experimental/HypothesisTestExp.cpp rename to src/shogun/hypothesistest/HypothesisTest.cpp index 6a78bdea4f2..242344dd12e 100644 --- a/src/shogun/statistics/experimental/HypothesisTestExp.cpp +++ b/src/shogun/hypothesistest/HypothesisTest.cpp @@ -19,15 +19,15 @@ #include #include #include -#include -#include -#include -#include +#include +#include +#include +#include using namespace shogun; using namespace internal; -struct CHypothesisTestExp::Self +struct CHypothesisTest::Self { Self(index_t num_distributions, index_t num_kernels) : data_manager(num_distributions), kernel_manager(num_kernels) @@ -37,36 +37,36 @@ struct CHypothesisTestExp::Self KernelManager kernel_manager; }; -CHypothesisTestExp::CHypothesisTestExp(index_t num_distributions, index_t num_kernels) : CSGObject() +CHypothesisTest::CHypothesisTest(index_t num_distributions, index_t num_kernels) : CSGObject() { - self = std::unique_ptr(new CHypothesisTestExp::Self(num_distributions, num_kernels)); + self = std::unique_ptr(new CHypothesisTest::Self(num_distributions, num_kernels)); } -CHypothesisTestExp::~CHypothesisTestExp() +CHypothesisTest::~CHypothesisTest() { } -DataManager& CHypothesisTestExp::get_data_manager() +DataManager& CHypothesisTest::get_data_manager() { return self->data_manager; } -const DataManager& CHypothesisTestExp::get_data_manager() const +const DataManager& CHypothesisTest::get_data_manager() const { return self->data_manager; } -KernelManager& CHypothesisTestExp::get_kernel_manager() +KernelManager& CHypothesisTest::get_kernel_manager() { return self->kernel_manager; } -const KernelManager& CHypothesisTestExp::get_kernel_manager() const +const KernelManager& CHypothesisTest::get_kernel_manager() const { return self->kernel_manager; } -float64_t CHypothesisTestExp::compute_p_value(float64_t statistic) +float64_t CHypothesisTest::compute_p_value(float64_t statistic) { SGVector values = sample_null(); @@ -76,7 +76,7 @@ float64_t CHypothesisTestExp::compute_p_value(float64_t statistic) return 1.0 - i / values.vlen; } -float64_t CHypothesisTestExp::compute_threshold(float64_t alpha) +float64_t CHypothesisTest::compute_threshold(float64_t alpha) { float64_t result = 0; SGVector values = sample_null(); @@ -85,18 +85,18 @@ float64_t CHypothesisTestExp::compute_threshold(float64_t alpha) return values[index_t(CMath::floor(values.vlen * (1 - alpha)))]; } -float64_t CHypothesisTestExp::perform_test() +float64_t CHypothesisTest::perform_test() { return compute_p_value(compute_statistic()); } -bool CHypothesisTestExp::perform_test(float64_t alpha) +bool CHypothesisTest::perform_test(float64_t alpha) { float64_t p_value = perform_test(); return p_value < alpha; } -const char* CHypothesisTestExp::get_name() const +const char* CHypothesisTest::get_name() const { - return "HypothesisTestExp"; + return "HypothesisTest"; } diff --git a/src/shogun/statistics/experimental/HypothesisTestExp.h b/src/shogun/hypothesistest/HypothesisTest.h similarity index 86% rename from src/shogun/statistics/experimental/HypothesisTestExp.h rename to src/shogun/hypothesistest/HypothesisTest.h index d1311a0e181..f291650ed84 100644 --- a/src/shogun/statistics/experimental/HypothesisTestExp.h +++ b/src/shogun/hypothesistest/HypothesisTest.h @@ -16,8 +16,8 @@ * along with this program. If not, see . */ -#ifndef HYPOTHESIS_TEST_EXP_H_ -#define HYPOTHESIS_TEST_EXP_H_ +#ifndef HYPOTHESIS_TEST_H_ +#define HYPOTHESIS_TEST_H_ #include #include @@ -36,11 +36,11 @@ class KernelManager; } -class CHypothesisTestExp : public CSGObject +class CHypothesisTest : public CSGObject { public: - CHypothesisTestExp(index_t num_distributions, index_t num_kernels); - virtual ~CHypothesisTestExp(); + CHypothesisTest(index_t num_distributions, index_t num_kernels); + virtual ~CHypothesisTest(); virtual float64_t compute_statistic() = 0; @@ -66,4 +66,4 @@ class CHypothesisTestExp : public CSGObject } -#endif // HYPOTHESIS_TEST_EXP_H_ +#endif // HYPOTHESIS_TEST_H_ diff --git a/src/shogun/statistics/experimental/internals/BlockwiseDetails.cpp b/src/shogun/hypothesistest/internals/BlockwiseDetails.cpp similarity index 96% rename from src/shogun/statistics/experimental/internals/BlockwiseDetails.cpp rename to src/shogun/hypothesistest/internals/BlockwiseDetails.cpp index 7efe828385f..1ad6eb829ae 100644 --- a/src/shogun/statistics/experimental/internals/BlockwiseDetails.cpp +++ b/src/shogun/hypothesistest/internals/BlockwiseDetails.cpp @@ -28,7 +28,7 @@ * either expressed or implied, of the Shogun Development Team. */ -#include +#include using namespace shogun; using namespace internal; diff --git a/src/shogun/statistics/experimental/internals/BlockwiseDetails.h b/src/shogun/hypothesistest/internals/BlockwiseDetails.h similarity index 100% rename from src/shogun/statistics/experimental/internals/BlockwiseDetails.h rename to src/shogun/hypothesistest/internals/BlockwiseDetails.h diff --git a/src/shogun/statistics/experimental/internals/ComputationManager.cpp b/src/shogun/hypothesistest/internals/ComputationManager.cpp similarity index 86% rename from src/shogun/statistics/experimental/internals/ComputationManager.cpp rename to src/shogun/hypothesistest/internals/ComputationManager.cpp index 099cb2905f9..263a4acfb09 100644 --- a/src/shogun/statistics/experimental/internals/ComputationManager.cpp +++ b/src/shogun/hypothesistest/internals/ComputationManager.cpp @@ -13,12 +13,12 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include -#include -#include +#include +#include using namespace shogun; using namespace internal; @@ -53,7 +53,7 @@ void ComputationManager::compute() std::vector results; if (gpu) { - // TODO results = operation.compute_using_gpu(kernel_matrices); + / TODO results = operation.compute_using_gpu(kernel_matrices); } else { diff --git a/src/shogun/statistics/experimental/internals/ComputationManager.h b/src/shogun/hypothesistest/internals/ComputationManager.h similarity index 95% rename from src/shogun/statistics/experimental/internals/ComputationManager.h rename to src/shogun/hypothesistest/internals/ComputationManager.h index 263641b01a2..d152c9db3bd 100644 --- a/src/shogun/statistics/experimental/internals/ComputationManager.h +++ b/src/shogun/hypothesistest/internals/ComputationManager.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #ifndef COMPUTATION_MANAGER_H__ diff --git a/src/shogun/statistics/experimental/internals/DataFetcher.cpp b/src/shogun/hypothesistest/internals/DataFetcher.cpp similarity index 94% rename from src/shogun/statistics/experimental/internals/DataFetcher.cpp rename to src/shogun/hypothesistest/internals/DataFetcher.cpp index ef7d0bb8930..28dac7f0741 100644 --- a/src/shogun/statistics/experimental/internals/DataFetcher.cpp +++ b/src/shogun/hypothesistest/internals/DataFetcher.cpp @@ -13,12 +13,12 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include #include -#include +#include using namespace shogun; diff --git a/src/shogun/statistics/experimental/internals/DataFetcher.h b/src/shogun/hypothesistest/internals/DataFetcher.h similarity index 97% rename from src/shogun/statistics/experimental/internals/DataFetcher.h rename to src/shogun/hypothesistest/internals/DataFetcher.h index 9b2996fbde5..7fdaa6a8c42 100644 --- a/src/shogun/statistics/experimental/internals/DataFetcher.h +++ b/src/shogun/hypothesistest/internals/DataFetcher.h @@ -30,7 +30,7 @@ #include #include -#include +#include #ifndef DATA_FETCHER_H__ #define DATA_FETCHER_H__ diff --git a/src/shogun/statistics/experimental/internals/DataFetcherFactory.cpp b/src/shogun/hypothesistest/internals/DataFetcherFactory.cpp similarity index 80% rename from src/shogun/statistics/experimental/internals/DataFetcherFactory.cpp rename to src/shogun/hypothesistest/internals/DataFetcherFactory.cpp index 9b9532fa1be..a52cb8abded 100644 --- a/src/shogun/statistics/experimental/internals/DataFetcherFactory.cpp +++ b/src/shogun/hypothesistest/internals/DataFetcherFactory.cpp @@ -13,14 +13,14 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include #include -#include -#include -#include +#include +#include +#include using namespace shogun; using namespace internal; diff --git a/src/shogun/statistics/experimental/internals/DataFetcherFactory.h b/src/shogun/hypothesistest/internals/DataFetcherFactory.h similarity index 94% rename from src/shogun/statistics/experimental/internals/DataFetcherFactory.h rename to src/shogun/hypothesistest/internals/DataFetcherFactory.h index 091e6468cee..037e72aa716 100644 --- a/src/shogun/statistics/experimental/internals/DataFetcherFactory.h +++ b/src/shogun/hypothesistest/internals/DataFetcherFactory.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include diff --git a/src/shogun/statistics/experimental/internals/DataManager.cpp b/src/shogun/hypothesistest/internals/DataManager.cpp similarity index 94% rename from src/shogun/statistics/experimental/internals/DataManager.cpp rename to src/shogun/hypothesistest/internals/DataManager.cpp index 70cfb587247..a247e4283c2 100644 --- a/src/shogun/statistics/experimental/internals/DataManager.cpp +++ b/src/shogun/hypothesistest/internals/DataManager.cpp @@ -13,14 +13,14 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include // TODO remove -#include -#include -#include -#include +#include +#include +#include +#include #include using namespace shogun; diff --git a/src/shogun/statistics/experimental/internals/DataManager.h b/src/shogun/hypothesistest/internals/DataManager.h similarity index 92% rename from src/shogun/statistics/experimental/internals/DataManager.h rename to src/shogun/hypothesistest/internals/DataManager.h index fe22887230b..c3b1d20b88e 100644 --- a/src/shogun/statistics/experimental/internals/DataManager.h +++ b/src/shogun/hypothesistest/internals/DataManager.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #ifndef DATA_MANAGER_H__ @@ -21,7 +21,7 @@ #include #include -#include +#include #include namespace shogun diff --git a/src/shogun/statistics/experimental/internals/InitPerFeature.cpp b/src/shogun/hypothesistest/internals/InitPerFeature.cpp similarity index 84% rename from src/shogun/statistics/experimental/internals/InitPerFeature.cpp rename to src/shogun/hypothesistest/internals/InitPerFeature.cpp index eec65684dab..01e24048720 100644 --- a/src/shogun/statistics/experimental/internals/InitPerFeature.cpp +++ b/src/shogun/hypothesistest/internals/InitPerFeature.cpp @@ -13,13 +13,13 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include // TODO remove -#include -#include -#include +#include +#include +#include #include using namespace shogun; diff --git a/src/shogun/statistics/experimental/internals/InitPerFeature.h b/src/shogun/hypothesistest/internals/InitPerFeature.h similarity index 94% rename from src/shogun/statistics/experimental/internals/InitPerFeature.h rename to src/shogun/hypothesistest/internals/InitPerFeature.h index ee3e45d7471..5727e261405 100644 --- a/src/shogun/statistics/experimental/internals/InitPerFeature.h +++ b/src/shogun/hypothesistest/internals/InitPerFeature.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #ifndef INIT_PER_FEATURE_H__ @@ -51,4 +51,3 @@ class InitPerFeature } #endif // INIT_PER_FEATURE_H__ - diff --git a/src/shogun/statistics/experimental/internals/InitPerKernel.cpp b/src/shogun/hypothesistest/internals/InitPerKernel.cpp similarity index 89% rename from src/shogun/statistics/experimental/internals/InitPerKernel.cpp rename to src/shogun/hypothesistest/internals/InitPerKernel.cpp index 3c3ffca2e09..2002f616cc4 100644 --- a/src/shogun/statistics/experimental/internals/InitPerKernel.cpp +++ b/src/shogun/hypothesistest/internals/InitPerKernel.cpp @@ -13,11 +13,11 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include -#include +#include using namespace shogun; using namespace internal; diff --git a/src/shogun/statistics/experimental/internals/InitPerKernel.h b/src/shogun/hypothesistest/internals/InitPerKernel.h similarity index 94% rename from src/shogun/statistics/experimental/internals/InitPerKernel.h rename to src/shogun/hypothesistest/internals/InitPerKernel.h index cbedd68ac95..4f06c7224f2 100644 --- a/src/shogun/statistics/experimental/internals/InitPerKernel.h +++ b/src/shogun/hypothesistest/internals/InitPerKernel.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #ifndef INIT_PER_KERNEL_H__ diff --git a/src/shogun/statistics/experimental/internals/KernelManager.cpp b/src/shogun/hypothesistest/internals/KernelManager.cpp similarity index 94% rename from src/shogun/statistics/experimental/internals/KernelManager.cpp rename to src/shogun/hypothesistest/internals/KernelManager.cpp index d843c8fc254..4938d4f5df8 100644 --- a/src/shogun/statistics/experimental/internals/KernelManager.cpp +++ b/src/shogun/hypothesistest/internals/KernelManager.cpp @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include @@ -21,7 +21,7 @@ #include #include #include -#include +#include using namespace shogun; using namespace internal; diff --git a/src/shogun/statistics/experimental/internals/KernelManager.h b/src/shogun/hypothesistest/internals/KernelManager.h similarity index 90% rename from src/shogun/statistics/experimental/internals/KernelManager.h rename to src/shogun/hypothesistest/internals/KernelManager.h index f445cc1f0da..af03739e01b 100644 --- a/src/shogun/statistics/experimental/internals/KernelManager.h +++ b/src/shogun/hypothesistest/internals/KernelManager.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #ifndef KERNEL_MANAGER_H__ @@ -22,7 +22,7 @@ #include #include #include -#include +#include namespace shogun { diff --git a/src/shogun/statistics/experimental/internals/NextSamples.cpp b/src/shogun/hypothesistest/internals/NextSamples.cpp similarity index 93% rename from src/shogun/statistics/experimental/internals/NextSamples.cpp rename to src/shogun/hypothesistest/internals/NextSamples.cpp index 15caec3db8c..c98faba81d5 100644 --- a/src/shogun/statistics/experimental/internals/NextSamples.cpp +++ b/src/shogun/hypothesistest/internals/NextSamples.cpp @@ -13,12 +13,12 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include #include -#include +#include #include using namespace shogun; diff --git a/src/shogun/statistics/experimental/internals/NextSamples.h b/src/shogun/hypothesistest/internals/NextSamples.h similarity index 95% rename from src/shogun/statistics/experimental/internals/NextSamples.h rename to src/shogun/hypothesistest/internals/NextSamples.h index f710e2872bc..bce175b970b 100644 --- a/src/shogun/statistics/experimental/internals/NextSamples.h +++ b/src/shogun/hypothesistest/internals/NextSamples.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #ifndef NEXT_SAMPLES_H__ @@ -49,7 +49,7 @@ namespace internal * { * auto first = next_samples[0]; * auto second = next_samples[1]; - * auto third = next_samples[2]; // Runtime Error + * auto third = next_samples[2]; / Runtime Error * } * @endcode */ diff --git a/src/shogun/statistics/experimental/internals/StreamingDataFetcher.cpp b/src/shogun/hypothesistest/internals/StreamingDataFetcher.cpp similarity index 94% rename from src/shogun/statistics/experimental/internals/StreamingDataFetcher.cpp rename to src/shogun/hypothesistest/internals/StreamingDataFetcher.cpp index 5f2cb693ac4..cc7fd790450 100644 --- a/src/shogun/statistics/experimental/internals/StreamingDataFetcher.cpp +++ b/src/shogun/hypothesistest/internals/StreamingDataFetcher.cpp @@ -13,13 +13,13 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include #include #include -#include +#include using namespace shogun; diff --git a/src/shogun/statistics/experimental/internals/StreamingDataFetcher.h b/src/shogun/hypothesistest/internals/StreamingDataFetcher.h similarity index 87% rename from src/shogun/statistics/experimental/internals/StreamingDataFetcher.h rename to src/shogun/hypothesistest/internals/StreamingDataFetcher.h index 4e02cfc6396..e41666ed1d0 100644 --- a/src/shogun/statistics/experimental/internals/StreamingDataFetcher.h +++ b/src/shogun/hypothesistest/internals/StreamingDataFetcher.h @@ -13,13 +13,13 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include #include -#include -#include +#include +#include #ifndef STREMING_DATA_FETCHER_H__ #define STREMING_DATA_FETCHER_H__ diff --git a/src/shogun/statistics/experimental/internals/TestTypes.h b/src/shogun/hypothesistest/internals/TestTypes.h similarity index 100% rename from src/shogun/statistics/experimental/internals/TestTypes.h rename to src/shogun/hypothesistest/internals/TestTypes.h diff --git a/src/shogun/statistics/experimental/internals/mmd/BiasedFull.cpp b/src/shogun/hypothesistest/internals/mmd/BiasedFull.cpp similarity index 90% rename from src/shogun/statistics/experimental/internals/mmd/BiasedFull.cpp rename to src/shogun/hypothesistest/internals/mmd/BiasedFull.cpp index 265dad2510a..e8e4e247668 100644 --- a/src/shogun/statistics/experimental/internals/mmd/BiasedFull.cpp +++ b/src/shogun/hypothesistest/internals/mmd/BiasedFull.cpp @@ -13,13 +13,13 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include #include #include -#include +#include using namespace shogun; using namespace internal; diff --git a/src/shogun/statistics/experimental/internals/mmd/BiasedFull.h b/src/shogun/hypothesistest/internals/mmd/BiasedFull.h similarity index 94% rename from src/shogun/statistics/experimental/internals/mmd/BiasedFull.h rename to src/shogun/hypothesistest/internals/mmd/BiasedFull.h index 558f6aa61f9..770d3b95eec 100644 --- a/src/shogun/statistics/experimental/internals/mmd/BiasedFull.h +++ b/src/shogun/hypothesistest/internals/mmd/BiasedFull.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #ifndef BIASED_FULL_H_ diff --git a/src/shogun/statistics/experimental/internals/mmd/FullDirect.cpp b/src/shogun/hypothesistest/internals/mmd/FullDirect.cpp similarity index 90% rename from src/shogun/statistics/experimental/internals/mmd/FullDirect.cpp rename to src/shogun/hypothesistest/internals/mmd/FullDirect.cpp index 9c82de65f5a..ba7db1b1edf 100644 --- a/src/shogun/statistics/experimental/internals/mmd/FullDirect.cpp +++ b/src/shogun/hypothesistest/internals/mmd/FullDirect.cpp @@ -13,14 +13,14 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include #include #include #include -#include +#include using namespace shogun; using namespace internal; diff --git a/src/shogun/statistics/experimental/internals/mmd/FullDirect.h b/src/shogun/hypothesistest/internals/mmd/FullDirect.h similarity index 93% rename from src/shogun/statistics/experimental/internals/mmd/FullDirect.h rename to src/shogun/hypothesistest/internals/mmd/FullDirect.h index b21129c91d7..9c5c27d0136 100644 --- a/src/shogun/statistics/experimental/internals/mmd/FullDirect.h +++ b/src/shogun/hypothesistest/internals/mmd/FullDirect.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #ifndef FULL_DIRECT_H_ diff --git a/src/shogun/statistics/experimental/internals/mmd/UnbiasedFull.cpp b/src/shogun/hypothesistest/internals/mmd/UnbiasedFull.cpp similarity index 91% rename from src/shogun/statistics/experimental/internals/mmd/UnbiasedFull.cpp rename to src/shogun/hypothesistest/internals/mmd/UnbiasedFull.cpp index 4241452cb9f..24359be86cc 100644 --- a/src/shogun/statistics/experimental/internals/mmd/UnbiasedFull.cpp +++ b/src/shogun/hypothesistest/internals/mmd/UnbiasedFull.cpp @@ -13,13 +13,13 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include #include #include -#include +#include using namespace shogun; using namespace internal; diff --git a/src/shogun/statistics/experimental/internals/mmd/UnbiasedFull.h b/src/shogun/hypothesistest/internals/mmd/UnbiasedFull.h similarity index 94% rename from src/shogun/statistics/experimental/internals/mmd/UnbiasedFull.h rename to src/shogun/hypothesistest/internals/mmd/UnbiasedFull.h index ead5151906a..3b7701c290a 100644 --- a/src/shogun/statistics/experimental/internals/mmd/UnbiasedFull.h +++ b/src/shogun/hypothesistest/internals/mmd/UnbiasedFull.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #ifndef UNBIASED_FULL_H_ diff --git a/src/shogun/statistics/experimental/internals/mmd/UnbiasedIncomplete.cpp b/src/shogun/hypothesistest/internals/mmd/UnbiasedIncomplete.cpp similarity index 90% rename from src/shogun/statistics/experimental/internals/mmd/UnbiasedIncomplete.cpp rename to src/shogun/hypothesistest/internals/mmd/UnbiasedIncomplete.cpp index 94767ed566b..fe339968cba 100644 --- a/src/shogun/statistics/experimental/internals/mmd/UnbiasedIncomplete.cpp +++ b/src/shogun/hypothesistest/internals/mmd/UnbiasedIncomplete.cpp @@ -13,13 +13,13 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include #include #include -#include +#include using namespace shogun; using namespace internal; diff --git a/src/shogun/statistics/experimental/internals/mmd/UnbiasedIncomplete.h b/src/shogun/hypothesistest/internals/mmd/UnbiasedIncomplete.h similarity index 94% rename from src/shogun/statistics/experimental/internals/mmd/UnbiasedIncomplete.h rename to src/shogun/hypothesistest/internals/mmd/UnbiasedIncomplete.h index 96d356f30b4..3db9a907b5d 100644 --- a/src/shogun/statistics/experimental/internals/mmd/UnbiasedIncomplete.h +++ b/src/shogun/hypothesistest/internals/mmd/UnbiasedIncomplete.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #ifndef UNBIASED_INCOMPLETE_H_ diff --git a/src/shogun/statistics/experimental/internals/mmd/WithinBlockDirect.cpp b/src/shogun/hypothesistest/internals/mmd/WithinBlockDirect.cpp similarity index 90% rename from src/shogun/statistics/experimental/internals/mmd/WithinBlockDirect.cpp rename to src/shogun/hypothesistest/internals/mmd/WithinBlockDirect.cpp index ba2f27180e2..94d8befbff4 100644 --- a/src/shogun/statistics/experimental/internals/mmd/WithinBlockDirect.cpp +++ b/src/shogun/hypothesistest/internals/mmd/WithinBlockDirect.cpp @@ -13,14 +13,14 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include #include #include #include -#include +#include using namespace shogun; using namespace internal; diff --git a/src/shogun/statistics/experimental/internals/mmd/WithinBlockDirect.h b/src/shogun/hypothesistest/internals/mmd/WithinBlockDirect.h similarity index 94% rename from src/shogun/statistics/experimental/internals/mmd/WithinBlockDirect.h rename to src/shogun/hypothesistest/internals/mmd/WithinBlockDirect.h index 8064c8da3e6..f78df5a5307 100644 --- a/src/shogun/statistics/experimental/internals/mmd/WithinBlockDirect.h +++ b/src/shogun/hypothesistest/internals/mmd/WithinBlockDirect.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #ifndef WITHIN_BLOCK_DIRECT_H_ diff --git a/src/shogun/statistics/experimental/internals/mmd/WithinBlockPermutation.cpp b/src/shogun/hypothesistest/internals/mmd/WithinBlockPermutation.cpp similarity index 76% rename from src/shogun/statistics/experimental/internals/mmd/WithinBlockPermutation.cpp rename to src/shogun/hypothesistest/internals/mmd/WithinBlockPermutation.cpp index 25881a46f97..dd007a8e19c 100644 --- a/src/shogun/statistics/experimental/internals/mmd/WithinBlockPermutation.cpp +++ b/src/shogun/hypothesistest/internals/mmd/WithinBlockPermutation.cpp @@ -13,16 +13,16 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #include #include #include -#include -#include -#include -#include +#include +#include +#include +#include using namespace shogun; using namespace internal; @@ -36,7 +36,7 @@ WithinBlockPermutation::WithinBlockPermutation(index_t n) : n_x(n) template typename T::return_type WithinBlockPermutation::operator()(SGMatrix km) { - // http://stackoverflow.com/questions/15858569/randomly-permute-rows-columns-of-a-matrix-with-eigen + // http:/stackoverflow.com/questions/15858569/randomly-permute-rows-columns-of-a-matrix-with-eigen Eigen::Map map(km.matrix, km.num_rows, km.num_cols); diff --git a/src/shogun/statistics/experimental/internals/mmd/WithinBlockPermutation.h b/src/shogun/hypothesistest/internals/mmd/WithinBlockPermutation.h similarity index 94% rename from src/shogun/statistics/experimental/internals/mmd/WithinBlockPermutation.h rename to src/shogun/hypothesistest/internals/mmd/WithinBlockPermutation.h index c27d7f0ca62..affc1489c49 100644 --- a/src/shogun/statistics/experimental/internals/mmd/WithinBlockPermutation.h +++ b/src/shogun/hypothesistest/internals/mmd/WithinBlockPermutation.h @@ -13,7 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program. If not, see . + * along with this program. If not, see . */ #ifndef WITHIN_BLOCK_PERMUTATION_H_ diff --git a/src/shogun/statistics/HSIC.cpp b/src/shogun/statistics/HSIC.cpp deleted file mode 100644 index 7c2eb90a37d..00000000000 --- a/src/shogun/statistics/HSIC.cpp +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#include -#include -#include -#include -#include - -using namespace shogun; - -CHSIC::CHSIC() : CKernelIndependenceTest() -{ - init(); -} - -CHSIC::CHSIC(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p, - CFeatures* q) : - CKernelIndependenceTest(kernel_p, kernel_q, p, q) -{ - init(); - - if (p && q && p->get_num_vectors()!=q->get_num_vectors()) - { - SG_ERROR("Only features with equal number of vectors are currently " - "possible\n"); - } - else - m_num_features=p->get_num_vectors(); -} - -CHSIC::~CHSIC() -{ -} - -void CHSIC::init() -{ - SG_ADD(&m_num_features, "num_features", - "Number of features from each of the distributions", - MS_NOT_AVAILABLE); - - m_num_features=0; -} - -float64_t CHSIC::compute_statistic() -{ - SG_DEBUG("entering!\n"); - - REQUIRE(m_kernel_p && m_kernel_q, "No or only one kernel specified!\n"); - - REQUIRE(m_p && m_q, "features needed!\n") - - /* compute kernel matrices */ - SGMatrix K=get_kernel_matrix_K(); - SGMatrix L=get_kernel_matrix_L(); - - /* center matrices (MATLAB: Kc=H*K*H) */ - K.center(); - - /* compute MATLAB: sum(sum(Kc' .* (L))), which is biased HSIC */ - index_t m=m_num_features; - SG_DEBUG("Number of samples %d!\n", m); - - float64_t result=0; - for (index_t i=0; i params=fit_null_gamma(); - result=CStatistics::gamma_cdf(statistic, params[0], params[1]); - break; - } - - default: - /* sampling null is handled there */ - result=CIndependenceTest::compute_p_value(statistic); - break; - } - - return result; -} - -float64_t CHSIC::compute_threshold(float64_t alpha) -{ - float64_t result=0; - switch (m_null_approximation_method) - { - case HSIC_GAMMA: - { - /* fit gamma and return inverse cdf at statistic */ - SGVector params=fit_null_gamma(); - - // alpha, beta are shape and rate parameter - result=CStatistics::gamma_inverse_cdf(alpha, params[0], params[1]); - break; - } - - default: - /* sampling null is handled there */ - result=CIndependenceTest::compute_threshold(alpha); - break; - } - - return result; -} - -SGVector CHSIC::fit_null_gamma() -{ - REQUIRE(m_kernel_p && m_kernel_q, "No or only one kernel specified!\n"); - - REQUIRE(m_p && m_q, "features needed!\n") - - index_t m=m_num_features; - - /* compute kernel matrices */ - SGMatrix K=get_kernel_matrix_K(); - SGMatrix L=get_kernel_matrix_L(); - - /* compute sum and trace of uncentered kernel matrices, needed later */ - float64_t trace_K=0; - float64_t trace_L=0; - float64_t sum_K=0; - float64_t sum_L=0; - for (index_t i=0; i result(2); - result[0]=a; - result[1]=b; - - SG_DEBUG("leaving!\n") - return result; -} - -SGVector CHSIC::sample_null() -{ - SG_DEBUG("entering!\n") - - /* replace current kernel via precomputed custom kernel and call superclass - * method */ - - /* backup references to old kernels */ - CKernel* kernel_p=m_kernel_p; - CKernel* kernel_q=m_kernel_q; - - /* init kernels before to be sure that everything is fine - * kernel function between two samples from different distributions - * is never computed - in fact, they may as well have different features */ - m_kernel_p->init(m_p, m_p); - m_kernel_q->init(m_q, m_q); - - /* precompute kernel matrices */ - CCustomKernel* precomputed_p=new CCustomKernel(m_kernel_p); - CCustomKernel* precomputed_q=new CCustomKernel(m_kernel_q); - SG_REF(precomputed_p); - SG_REF(precomputed_q); - - /* temporarily replace own kernels */ - m_kernel_p=precomputed_p; - m_kernel_q=precomputed_q; - - /* use superclass sample_null which shuffles the entries for one - * distribution using index permutation on rows and columns of - * kernel matrix from one distribution, while accessing the other - * in its original order and then compute statistic */ - SGVector null_samples=CKernelIndependenceTest::sample_null(); - - /* restore kernels */ - m_kernel_p=kernel_p; - m_kernel_q=kernel_q; - - SG_UNREF(precomputed_p); - SG_UNREF(precomputed_q); - - SG_DEBUG("leaving!\n") - return null_samples; -} - -void CHSIC::set_p(CFeatures* p) -{ - CIndependenceTest::set_p(p); - m_num_features=p->get_num_vectors(); -} - -void CHSIC::set_q(CFeatures* q) -{ - CIndependenceTest::set_q(q); - m_num_features=q->get_num_vectors(); -} - diff --git a/src/shogun/statistics/HSIC.h b/src/shogun/statistics/HSIC.h deleted file mode 100644 index 5122222fde2..00000000000 --- a/src/shogun/statistics/HSIC.h +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#ifndef __HSIC_H_ -#define __HSIC_H_ - -#include - -#include - -namespace shogun -{ - -template class SGMatrix; - - -/** @brief This class implements the Hilbert Schmidtd Independence Criterion - * based independence test as described in [1]. - * - * Given samples \f$Z=\{(x_i,y_i)\}_{i=1}^m\f$ from the joint - * distribution \f$\textbf{P}_{xy}\f$, does the joint distribution - * factorize as \f$\textbf{P}_{xy}=\textbf{P}_x\textbf{P}_y\f$? - * - * The HSIC is a kernel based independence criterion, which is based on the - * largest singular value of a Cross-Covariance Operator in a reproducing - * kernel Hilbert space (RKHS). Its population expression is zero if and only - * if the two underlying distributions are independent. - * - * This class can compute empirical biased estimates: - * \f[ - * m\text{HSIC}(Z)[,p,q]^2)=\frac{1}{m^2}\text{trace}\textbf{KHLH} - * \f] - * where \f$\textbf{H}=\textbf{I}-\frac{1}{m}\textbf{11}^T\f$ is a centering - * matrix and \f$\textbf{K}, \textbf{L}\f$ are kernel matrices of both sets - * of samples. - * - * Note that computing the statistic returns m*MMD; same holds for the null - * distribution samples. - * - * Along with the statistic comes a method to compute a p-value based on - * different methods. Sampling from null is also possible. If unsure which one to - * use, sampling with 250 iterations always is correct (but slow). - * - * To choose, use set_null_approximation_method() and choose from - * - * HSIC_GAMMA: for a very fast, but not consistent test based on moment matching - * of a Gamma distribution, as described in [1]. - * - * PERMUTATION: For permuting available samples to sample null-distribution. - * This is done on precomputed kernel matrices, since they have to - * be stored anyway when the statistic is computed. - * - * A very basic method for kernel selection when using CGaussianKernel is to - * use the median distance of the underlying data. See examples how to do that. - * More advanced methods will follow in the near future. However, the median - * heuristic works in quite some cases. See [1]. - * - * [1]: Gretton, A., Fukumizu, K., Teo, C., & Song, L. (2008). - * A kernel statistical test of independence. - * Advances in Neural Information Processing Systems, 1-8. - * - */ -class CHSIC : public CKernelIndependenceTest -{ -public: - /** Constructor */ - CHSIC(); - - /** Constructor. - * - * Initializes the kernels and features from the two distributions and - * SG_REFs them - * - * @param kernel_p kernel to use on samples from p - * @param kernel_q kernel to use on samples from q - * @param p samples from distribution p - * @param q samples from distribution q - */ - CHSIC(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p, CFeatures* q); - - /** destructor */ - virtual ~CHSIC(); - - /** Computes the HSIC statistic (see class description) for underlying - * kernels and data. Note that it is multiplied by the number of used - * samples. It is a biased estimator. Note that it is m*HSIC_b. - * - * Note that since kernel matrices have to be stored, it has quadratic - * space costs. - * - * @return m*HSIC (unbiased estimate) - */ - virtual float64_t compute_statistic(); - - /** computes a p-value based on current method for approximating the - * null-distribution. The p-value is the 1-p quantile of the null- - * distribution where the given statistic lies in. - * - * @param statistic statistic value to compute the p-value for - * @return p-value parameter statistic is the (1-p) percentile of the - * null distribution - */ - virtual float64_t compute_p_value(float64_t statistic); - - /** computes a threshold based on current method for approximating the - * null-distribution. The threshold is the value that a statistic has - * to have in ordner to reject the null-hypothesis. - * - * @param alpha test level to reject null-hypothesis - * @return threshold for statistics to reject null-hypothesis - */ - virtual float64_t compute_threshold(float64_t alpha); - - /** @return the class name */ - virtual const char* get_name() const - { - return "HSIC"; - } - - /** returns the statistic type of this test statistic */ - virtual EStatisticType get_statistic_type() const - { - return S_HSIC; - } - - /** Setter for features from distribution p, SG_REFs it - * - * @param p features from p - */ - virtual void set_p(CFeatures* p); - - /** Setter for features from distribution q, SG_REFs it - * - * @param q features from q - */ - virtual void set_q(CFeatures* q); - - /** Approximates the null-distribution by a two parameter gamma - * distribution. Returns parameters. - * - * NOTE: the gamma distribution is fitted to m*HSIC_b. But since - * compute_statistic() returnes the biased estimate, you can safely call - * this with values from compute_statistic(). - * However, the attached features have to be the SAME size, as these, the - * statistic was computed on. If compute_threshold() or compute_p_value() - * are used, this is ensured automatically. Note that m*Null-distribution is - * fitted, which is fine since the statistic is also m*HSIC. - * - * Has quadratic computational costs in terms of samples. - * - * Called by compute_p_value() if null approximation method is set to - * MMD2_GAMMA. - * - * @return vector with two parameters for gamma distribution. To use: - * call gamma_cdf(statistic, a, b). - */ - SGVector fit_null_gamma(); - - /** merges both sets of samples and computes the test statistic - * m_num_null_sample times. This version precomputes the kenrel matrix - * once by hand, then samples using this one. The matrix has - * to be stored anyway when statistic is computed. - * - * @return vector of all statistics - */ - virtual SGVector sample_null(); - -private: - /** register parameters and initialize with defaults */ - void init(); - - /** number of features from the distributions (should be equal for both) */ - index_t m_num_features; - -}; - -} - -#endif /* __HSIC_H_ */ diff --git a/src/shogun/statistics/HypothesisTest.cpp b/src/shogun/statistics/HypothesisTest.cpp deleted file mode 100644 index d8167fd9e24..00000000000 --- a/src/shogun/statistics/HypothesisTest.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#include -#include -#include -#include - -using namespace shogun; - -CHypothesisTest::CHypothesisTest() : CSGObject() -{ - init(); -} - -CHypothesisTest::~CHypothesisTest() -{ -} - -void CHypothesisTest::init() -{ - SG_ADD(&m_num_null_samples, "num_null_samples", - "Number of permutation iterations for sampling null", - MS_NOT_AVAILABLE); - SG_ADD((machine_int_t*)&m_null_approximation_method, - "null_approximation_method", - "Method for approximating null distribution", - MS_NOT_AVAILABLE); - - m_num_null_samples=250; - m_null_approximation_method=PERMUTATION; -} - -void CHypothesisTest::set_null_approximation_method( - ENullApproximationMethod null_approximation_method) -{ - m_null_approximation_method=null_approximation_method; -} - -void CHypothesisTest::set_num_null_samples(index_t num_null_samples) -{ - m_num_null_samples=num_null_samples; -} - -float64_t CHypothesisTest::compute_p_value(float64_t statistic) -{ - float64_t result=0; - - if (m_null_approximation_method==PERMUTATION) - { - /* sample a bunch of MMD values from null distribution */ - SGVector values=sample_null(); - - /* find out percentile of parameter "statistic" in null distribution */ - CMath::qsort(values); - float64_t i=values.find_position_to_insert(statistic); - - /* return corresponding p-value */ - result=1.0-i/values.vlen; - } - else - SG_ERROR("Unknown method to approximate null distribution!\n"); - - return result; -} - -float64_t CHypothesisTest::compute_threshold(float64_t alpha) -{ - float64_t result=0; - - if (m_null_approximation_method==PERMUTATION) - { - /* sample a bunch of MMD values from null distribution */ - SGVector values=sample_null(); - - /* return value of (1-alpha) quantile */ - CMath::qsort(values); - result=values[index_t(CMath::floor(values.vlen*(1-alpha)))]; - } - else - SG_ERROR("Unknown method to approximate null distribution!\n"); - - return result; -} - -float64_t CHypothesisTest::perform_test() -{ - /* baseline method here is simply to compute statistic and p-value - * separately */ - float64_t statistic=compute_statistic(); - return compute_p_value(statistic); -} - -bool CHypothesisTest::perform_test(float64_t alpha) -{ - float64_t p_value=perform_test(); - return p_value - -#include - -namespace shogun -{ - -/** enum for different statistic types */ -enum EStatisticType -{ - S_LINEAR_TIME_MMD, - S_QUADRATIC_TIME_MMD, - S_HSIC, - S_NOCCO -}; - -/** enum for different method to approximate null-distibution */ -enum ENullApproximationMethod -{ - PERMUTATION, - MMD2_SPECTRUM_DEPRECATED, - MMD2_SPECTRUM, - MMD2_GAMMA, - MMD1_GAUSSIAN, - HSIC_GAMMA -}; - -/** @brief Hypothesis test base class. Provides an interface for statistical - * hypothesis testing via three methods: compute_statistic(), compute_p_value() - * and compute_threshold(). The second computes a p-value for the statistic - * computed by the first method. - * The p-value represents the position of the statistic in the null-distribution, - * i.e. the distribution of the statistic population given the null-hypothesis - * is true. (1-position = p-value). - * The third method, compute_threshold(), computes a threshold for a given - * test level which is needed to reject the null-hypothesis. - * - * Also provides an interface for sampling from the null-distribution. - * The actual sampling has to be implemented in sub-classes. - * - * All statistical tests should inherit from this class. - * - * Abstract base class. - */ -class CHypothesisTest : public CSGObject -{ -public: - /** default constructor */ - CHypothesisTest(); - - /** destructor */ - virtual ~CHypothesisTest(); - - /** @return test statistic for the given data/parameters/methods */ - virtual float64_t compute_statistic()=0; - - /** computes a p-value based on current method for approximating the - * null-distribution. The p-value is the 1-p quantile of the null- - * distribution where the given statistic lies in. - * This method depends on the implementation of sample_null method - * which should be implemented in its sub-classes - * - * @param statistic statistic value to compute the p-value for - * @return p-value parameter statistic is the (1-p) percentile of the - * null distribution - */ - virtual float64_t compute_p_value(float64_t statistic); - - /** computes a threshold based on current method for approximating the - * null-distribution. The threshold is the value that a statistic has - * to have in ordner to reject the null-hypothesis. - * This method depends on the implementation of sample_null method - * which should be implemented in its sub-classes - * - * @param alpha test level to reject null-hypothesis - * @return threshold for statistics to reject null-hypothesis - */ - virtual float64_t compute_threshold(float64_t alpha); - - /** Performs the complete two-sample test on current data and returns a - * p-value. - * - * This is a wrapper that calls compute_statistic first and then - * calls compute_p_value using the obtained statistic. In some statistic - * classes, it might be possible to compute statistic and p-value in - * one single run which is more efficient. Therefore, this method might - * be overwritten in subclasses. - * - * The method for computing the p-value can be set via - * set_null_approximation_method(). - * - * @return p-value such that computed statistic is the (1-p) quantile - * of the estimated null distribution - */ - virtual float64_t perform_test(); - - /** Performs the complete two-sample test on current data and returns - * a binary answer wheter null hypothesis is rejected or not. - * - * This is just a wrapper for the above perform_test() method that - * returns a p-value. If this p-value lies below the test level alpha, - * the null hypothesis is rejected. - * - * Should not be overwritten in subclasses. (Therefore not virtual) - * - * @param alpha test level alpha. - * @return true if null hypothesis is rejected and false otherwise - */ - bool perform_test(float64_t alpha); - - /** computes the test statistic m_num_null_samples times, exact - * computation depends on the implementations. - * - * @return vector of all statistics - */ - virtual SGVector sample_null()=0; - - /** sets the number of permutation iterations for sample_null() - * - * @param num_null_samples how often permutation shall be done - */ - virtual void set_num_null_samples(index_t num_null_samples); - - /** sets the method how to approximate the null-distribution - * @param null_approximation_method method to use - */ - virtual void set_null_approximation_method( - ENullApproximationMethod null_approximation_method); - - /** returns the statistic type of this test statistic */ - virtual EStatisticType get_statistic_type() const=0; - - virtual const char* get_name() const=0; - -private: - /** register parameters and initialize with default values */ - void init(); - -protected: - /** number of iterations for sampling from null-distributions */ - index_t m_num_null_samples; - - /** Defines how the the null distribution is approximated */ - ENullApproximationMethod m_null_approximation_method; -}; - -} - -#endif /* HYPOTHESIS_TEST_H_ */ diff --git a/src/shogun/statistics/IndependenceTest.cpp b/src/shogun/statistics/IndependenceTest.cpp deleted file mode 100644 index 89c16cb3ad2..00000000000 --- a/src/shogun/statistics/IndependenceTest.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#include -#include - -using namespace shogun; - -CIndependenceTest::CIndependenceTest() : CHypothesisTest() -{ - init(); -} - -CIndependenceTest::CIndependenceTest(CFeatures* p, CFeatures* q) - : CHypothesisTest() -{ - init(); - - SG_REF(p); - SG_REF(q); - - m_p=p; - m_q=q; -} - -CIndependenceTest::~CIndependenceTest() -{ - SG_UNREF(m_p); - SG_UNREF(m_q); -} - -void CIndependenceTest::init() -{ - SG_ADD((CSGObject**)&m_p, "p", "Samples from p", MS_NOT_AVAILABLE); - SG_ADD((CSGObject**)&m_q, "q", "Samples from q", MS_NOT_AVAILABLE); - - m_p=NULL; - m_q=NULL; -} - -SGVector CIndependenceTest::sample_null() -{ - SG_DEBUG("entering!\n") - - REQUIRE(m_p, "No features p!\n"); - REQUIRE(m_q, "No features q!\n"); - - /* compute sample statistics for null distribution */ - SGVector results(m_num_null_samples); - - /* memory for index permutations. Adding of subset has to happen - * inside the loop since it may be copied if there already is one set. - * - * subset for selecting samples from p. In this case we want to - * shuffle only samples from p while keeping samples from q fixed */ - SGVector ind_permutation(m_p->get_num_vectors()); - ind_permutation.range_fill(); - - for (index_t i=0; iadd_subset(ind_permutation); - results[i]=compute_statistic(); - m_p->remove_subset(); - } - - SG_DEBUG("leaving!\n") - return results; -} - -void CIndependenceTest::set_p(CFeatures* p) -{ - /* ref before unref to avoid problems when instances are equal */ - SG_REF(p); - SG_UNREF(m_p); - m_p=p; -} - -void CIndependenceTest::set_q(CFeatures* q) -{ - /* ref before unref to avoid problems when instances are equal */ - SG_REF(q); - SG_UNREF(m_q); - m_q=q; -} - -CFeatures* CIndependenceTest::get_p() -{ - SG_REF(m_p); - return m_p; -} - -CFeatures* CIndependenceTest::get_q() -{ - SG_REF(m_q); - return m_q; -} - diff --git a/src/shogun/statistics/IndependenceTest.h b/src/shogun/statistics/IndependenceTest.h deleted file mode 100644 index faac0eee492..00000000000 --- a/src/shogun/statistics/IndependenceTest.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#ifndef INDEPENDENCE_TEST_H_ -#define INDEPENDENCE_TEST_H_ - -#include - -#include - -namespace shogun -{ - -class CFeatures; - -/** @brief Provides an interface for performing the independence test. - * Given samples \f$Z=\{(x_i,y_i)\}_{i=1}^m\f$ from the joint distribution - * \f$\textbf{P}_{xy}\f$, does the joint distribution factorize as - * \f$\textbf{P}_{xy}=\textbf{P}_x\textbf{P}_y\f$, i.e. product of the marginals? - * The null-hypothesis says yes, i.e. no dependence, the alternative hypothesis - * says no. - * - * Abstract base class. Provides all interfaces and implements approximating - * the null distribution via permutation, i.e. shuffling the samples from - * one distribution repeatedly using subsets while keeping the samples from - * the other distribution in its original order - * - */ -class CIndependenceTest : public CHypothesisTest -{ -public: - /** default constructor */ - CIndependenceTest(); - - /** Constructor. - * - * Initializes the features from the two distributions and SG_REFs them - * - * @param p samples from distribution p - * @param q samples from distribution q - */ - CIndependenceTest(CFeatures* p, CFeatures* q); - - /** destructor */ - virtual ~CIndependenceTest(); - - /** shuffles samples from one distribution keeping the samples from another - * distribution in the original order and computes the test statistic - * m_num_null_sample times - * - * @return vector of all statistics - */ - virtual SGVector sample_null(); - - /** Setter for features from distribution p, SG_REFs it - * - * @param p features from p - */ - virtual void set_p(CFeatures* p); - - /** Setter for features from distribution q, SG_REFs it - * - * @param q features from q - */ - virtual void set_q(CFeatures* q); - - /** Getter for features from p, SG_REF'ed - * - * @return feature object from p - */ - virtual CFeatures* get_p(); - - /** Getter for features from q, SG_REF'ed - * - * @return feature object from q - */ - virtual CFeatures* get_q(); - - /** @return class name */ - virtual const char* get_name() const=0; - -private: - /** register parameters and initialize with default values */ - void init(); - -protected: - /** samples of the distribution p */ - CFeatures* m_p; - - /** samples of the distribution q */ - CFeatures* m_q; -}; - -} - -#endif /* INDEPENDENCE_TEST_H_ */ diff --git a/src/shogun/statistics/KernelIndependenceTest.cpp b/src/shogun/statistics/KernelIndependenceTest.cpp deleted file mode 100644 index e667b2978ce..00000000000 --- a/src/shogun/statistics/KernelIndependenceTest.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#include -#include -#include -#include -#include - -using namespace shogun; - -CKernelIndependenceTest::CKernelIndependenceTest() : - CIndependenceTest() -{ - init(); -} - -CKernelIndependenceTest::CKernelIndependenceTest(CKernel* kernel_p, - CKernel* kernel_q, CFeatures* p, CFeatures* q) : - CIndependenceTest(p, q) -{ - init(); - - m_kernel_p=kernel_p; - SG_REF(kernel_p); - - m_kernel_q=kernel_q; - SG_REF(kernel_q); -} - -CKernelIndependenceTest::~CKernelIndependenceTest() -{ - SG_UNREF(m_kernel_p); - SG_UNREF(m_kernel_q); -} - -void CKernelIndependenceTest::init() -{ - SG_ADD((CSGObject**)&m_kernel_p, "kernel_p", "Kernel for samples from p", - MS_AVAILABLE); - SG_ADD((CSGObject**)&m_kernel_q, "kernel_q", "Kernel for samples from q", - MS_AVAILABLE); - - m_kernel_p=NULL; - m_kernel_q=NULL; -} - -SGVector CKernelIndependenceTest::sample_null() -{ - SG_DEBUG("entering!\n") - - /* compute sample statistics for null distribution */ - SGVector results; - - /* only do something if a custom kernel is used: use the power of pre- - * computed kernel matrices - */ - if (m_kernel_p->get_kernel_type()==K_CUSTOM && - m_kernel_q->get_kernel_type()==K_CUSTOM) - { - /* allocate memory */ - results=SGVector(m_num_null_samples); - - /* memory for index permutations */ - SGVector ind_permutation(m_p->get_num_vectors()); - ind_permutation.range_fill(); - - /* check if kernel is a custom kernel. In that case, changing features is - * not what we want but just subsetting the kernel itself */ - CCustomKernel* custom_kernel_p=(CCustomKernel*)m_kernel_p; - - for (index_t i=0; iadd_row_subset(ind_permutation); - custom_kernel_p->add_col_subset(ind_permutation); - - /* compute statistic for this permutation of mixed samples */ - results[i]=compute_statistic(); - - /* remove subsets */ - custom_kernel_p->remove_row_subset(); - custom_kernel_p->remove_col_subset(); - } - } - else - { - /* in this case, just use superclass method */ - results=CIndependenceTest::sample_null(); - } - - - SG_DEBUG("leaving!\n") - return results; -} - -void CKernelIndependenceTest::set_kernel_p(CKernel* kernel_p) -{ - /* ref before unref to avoid problems when instances are equal */ - SG_REF(kernel_p); - SG_UNREF(m_kernel_p); - m_kernel_p=kernel_p; -} - -void CKernelIndependenceTest::set_kernel_q(CKernel* kernel_q) -{ - /* ref before unref to avoid problems when instances are equal */ - SG_REF(kernel_q); - SG_UNREF(m_kernel_q); - m_kernel_q=kernel_q; -} - -CKernel* CKernelIndependenceTest::get_kernel_p() -{ - SG_REF(m_kernel_p); - return m_kernel_p; -} - -CKernel* CKernelIndependenceTest::get_kernel_q() -{ - SG_REF(m_kernel_q); - return m_kernel_q; -} - -SGMatrix CKernelIndependenceTest::get_kernel_matrix_K() -{ - SG_DEBUG("entering!\n"); - - SGMatrix K; - - /* distinguish between custom and normal kernels */ - if (m_kernel_p->get_kernel_type()==K_CUSTOM) - { - /* custom kernels need to to be initialised when a subset is added */ - CCustomKernel* custom_kernel_p=(CCustomKernel*)m_kernel_p; - K=custom_kernel_p->get_kernel_matrix(); - } - else - { - /* need to init the kernel if kernel is not precomputed - if subsets of - * features are in the stack (for permutation), this will handle it */ - m_kernel_p->init(m_p, m_p); - K=m_kernel_p->get_kernel_matrix(); - } - - SG_DEBUG("leaving!\n"); - - return K; -} - -SGMatrix CKernelIndependenceTest::get_kernel_matrix_L() -{ - SG_DEBUG("entering!\n"); - - SGMatrix L; - - /* now second half of data for L */ - if (m_kernel_q->get_kernel_type()==K_CUSTOM) - { - /* custom kernels need to to be initialised - no subsets here */ - CCustomKernel* custom_kernel_q=(CCustomKernel*)m_kernel_q; - L=custom_kernel_q->get_kernel_matrix(); - } - else - { - /* need to init the kernel if kernel is not precomputed */ - m_kernel_q->init(m_q, m_q); - L=m_kernel_q->get_kernel_matrix(); - } - - SG_DEBUG("leaving!\n"); - - return L; -} - diff --git a/src/shogun/statistics/KernelIndependenceTest.h b/src/shogun/statistics/KernelIndependenceTest.h deleted file mode 100644 index 077d56462d9..00000000000 --- a/src/shogun/statistics/KernelIndependenceTest.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#ifndef KERNEL_INDEPENDENCE_TEST_H_ -#define KERNEL_INDEPENDENCE_TEST_H_ - -#include - -#include - -namespace shogun -{ - -class CFeatures; -class CKernel; - -/** @brief Kernel independence test base class. Provides an interface for - * performing an independence test. Given samples \f$Z=\{(x_i,y_i)\}_{i=1}^m\f$ - * from the joint distribution \f$\textbf{P}_{xy}\f$, does the joint - * distribution factorize as \f$\textbf{P}_{xy}=\textbf{P}_x\textbf{P}_y\f$, - * i.e. product of the marginals? - * - * The null-hypothesis says yes, i.e. no dependence, the alternative hypothesis - * says no. - * - * In this class, this is done using a single kernel for each of the two sets - * of samples - * - * The class also re-implements the sample_null() method. If the underlying - * kernel is a custom one (precomputed), the rows and column of the kernel - * matrix for p is permuted using subsets. The computation falls back to - * CIndependenceTest::sample_null() otherwise, which requires to re-compute - * the kernel in each iteration via subsets on the features instead - * - * Abstract base class. - */ -class CKernelIndependenceTest: public CIndependenceTest -{ -public: - /** default constructor */ - CKernelIndependenceTest(); - - /** Constructor. - * - * Initializes the kernels and features from the two distributions and - * SG_REFs them - * - * @param kernel_p kernel to use on samples from p - * @param kernel_q kernel to use on samples from q - * @param p samples from distribution p - * @param q samples from distribution q - */ - CKernelIndependenceTest(CKernel* kernel_p, CKernel* kernel_q, - CFeatures* p, CFeatures* q); - - /** destructor */ - virtual ~CKernelIndependenceTest(); - - /** shuffles the indeices that corresponds to the kernel entries of - * samples from p while accessing samples from q in the original order and - * computes the test statistic m_num_null_samples times. This version - * checks if a precomputed custom kernel is used, and, if so, just permutes - * the indices of the kernel corresponding to p instead of re-computing it - * in every iteration. - * - * @return vector of all statistics - */ - virtual SGVector sample_null(); - - /** Setter for kernel for features from distribution p, SG_REFs it - * - * @param kernel_p kernel for features from p - */ - virtual void set_kernel_p(CKernel* kernel_p); - - /** Setter for kernel for features from distribution q, SG_REFs it - * - * @param kernel_q kernel for features from q - */ - virtual void set_kernel_q(CKernel* kernel_q); - - /** Getter for kernel for features from p, SG_REF'ed - * - * @return kernel for features from p - */ - virtual CKernel* get_kernel_p(); - - /** Getter for kernel for features from q, SG_REF'ed - * - * @return kernel for features from q - */ - virtual CKernel* get_kernel_q(); - - /** @return the class name */ - virtual const char* get_name() const=0; - -private: - /** register parameters and intiailize with default values */ - void init(); - -protected: - /** @return kernel matrix on samples from p. Distinguishes CustomKernels */ - SGMatrix get_kernel_matrix_K(); - - /** @return kernel matrix on samples from q. Distinguishes CustomKernels */ - SGMatrix get_kernel_matrix_L(); - - /** underlying kernel for p */ - CKernel* m_kernel_p; - - /** underlying kernel for q */ - CKernel* m_kernel_q; -}; - -} - -#endif /* KERNEL_INDEPENDENCE_TEST_H_ */ diff --git a/src/shogun/statistics/KernelMeanMatching.cpp b/src/shogun/statistics/KernelMeanMatching.cpp deleted file mode 100644 index 71942054b6d..00000000000 --- a/src/shogun/statistics/KernelMeanMatching.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Copyright (W) 2012 Sergey Lisitsyn - */ - -#include -#ifdef USE_GPL_SHOGUN - -#include -#include - - -static float64_t* kmm_K = NULL; -static int32_t kmm_K_ld = 0; - -static const float64_t* kmm_get_col(uint32_t i) -{ - return kmm_K + kmm_K_ld*i; -} - -namespace shogun -{ -CKernelMeanMatching::CKernelMeanMatching() : - CSGObject(), m_kernel(NULL) -{ -} - -CKernelMeanMatching::CKernelMeanMatching(CKernel* kernel, SGVector training_indices, - SGVector test_indices) : - CSGObject(), m_kernel(NULL) -{ - set_kernel(kernel); - set_training_indices(training_indices); - set_test_indices(test_indices); -} - -SGVector CKernelMeanMatching::compute_weights() -{ - int32_t i,j; - ASSERT(m_kernel) - ASSERT(m_training_indices.vlen) - ASSERT(m_test_indices.vlen) - - int32_t n_tr = m_training_indices.vlen; - int32_t n_te = m_test_indices.vlen; - - SGVector weights(n_tr); - weights.zero(); - - kmm_K = SG_MALLOC(float64_t, n_tr*n_tr); - kmm_K_ld = n_tr; - float64_t* diag_K = SG_MALLOC(float64_t, n_tr); - for (i=0; ikernel(m_training_indices[i], m_training_indices[i]); - diag_K[i] = d; - kmm_K[i*n_tr+i] = d; - for (j=i+1; jkernel(m_training_indices[i],m_training_indices[j]); - kmm_K[i*n_tr+j] = d; - kmm_K[j*n_tr+i] = d; - } - } - float64_t* kappa = SG_MALLOC(float64_t, n_tr); - for (i=0; ikernel(m_training_indices[i],m_test_indices[j]); - - avg *= float64_t(n_tr)/n_te; - kappa[i] = -avg; - } - float64_t* a = SG_MALLOC(float64_t, n_tr); - for (i=0; i -#ifdef USE_GPL_SHOGUN - -#include -#include - -namespace shogun -{ - -/** @brief Kernel Mean Matching */ -class CKernelMeanMatching: public CSGObject -{ -public: - - /** constructor */ - CKernelMeanMatching(); - - /** constructor */ - CKernelMeanMatching(CKernel* kernel, SGVector training_indices, SGVector test_indices); - - /** get kernel */ - CKernel* get_kernel() const { SG_REF(m_kernel); return m_kernel; } - /** set kernel */ - void set_kernel(CKernel* kernel) { SG_REF(kernel); SG_UNREF(m_kernel); m_kernel = kernel; } - /** get training indices */ - SGVector get_training_indices() const { return m_training_indices; } - /** set training indices */ - void set_training_indices(SGVector training_indices) { m_training_indices = training_indices; } - /** get test indices */ - SGVector get_test_indices() const { return m_test_indices; } - /** set test indices */ - void set_test_indices(SGVector test_indices) { m_test_indices = test_indices; } - - /** compute weights */ - SGVector compute_weights(); - - virtual const char* get_name() const { return "KernelMeanMatching"; } - -protected: - - /** kernel */ - CKernel* m_kernel; - /** training indices */ - SGVector m_training_indices; - /** test indices */ - SGVector m_test_indices; -}; - -} -#endif //USE_GPL_SHOGUN -#endif diff --git a/src/shogun/statistics/KernelSelection.cpp b/src/shogun/statistics/KernelSelection.cpp deleted file mode 100644 index 84a45fa74c7..00000000000 --- a/src/shogun/statistics/KernelSelection.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#include -#include -#include -#include -#include - -using namespace shogun; - -CKernelSelection::CKernelSelection() -{ - init(); -} - -CKernelSelection::CKernelSelection(CKernelTwoSampleTest* estimator) -{ - init(); - set_estimator(estimator); -} - -CKernelSelection::~CKernelSelection() -{ - SG_UNREF(m_estimator); -} - -void CKernelSelection::init() -{ - SG_ADD((CSGObject**)&m_estimator, "estimator", - "Underlying CKernelTwoSampleTest instance", MS_NOT_AVAILABLE); - - m_estimator=NULL; -} - -void CKernelSelection::set_estimator(CKernelTwoSampleTest* estimator) -{ - REQUIRE(estimator, "No CKernelTwoSampleTest instance provided!\n"); - - /* ensure that there is a combined kernel */ - CKernel* kernel=estimator->get_kernel(); - REQUIRE(kernel, "Underlying \"%s\" has no kernel set!\n", - estimator->get_name()); - REQUIRE(kernel->get_kernel_type()==K_COMBINED, "Kernel of underlying \"%s\" " - "is of type \"%s\" but is has to be CCombinedKernel\n", - estimator->get_name(), kernel->get_name()); - SG_UNREF(kernel); - - SG_REF(estimator); - SG_UNREF(m_estimator); - m_estimator=estimator; -} - -CKernelTwoSampleTest* CKernelSelection::get_estimator() const -{ - SG_REF(m_estimator); - return m_estimator; -} diff --git a/src/shogun/statistics/KernelSelection.h b/src/shogun/statistics/KernelSelection.h deleted file mode 100644 index 8d2cdf29abe..00000000000 --- a/src/shogun/statistics/KernelSelection.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#ifndef KERNEL_SELECTION_H_ -#define KERNEL_SELECTION_H_ - -#include -#include - -namespace shogun -{ - -class CKernelTwoSampleTest; -class CKernel; - -/** @brief Base class for kernel selection for kernel two-sample test - * statistic implementations (e.g. MMD). - * Provides abstract methods for selecting kernels and computing criteria or - * kernel weights for the implemented method. In order to implement new methods - * for kernel selection, simply write a new implementation of this class. - */ -class CKernelSelection: public CSGObject -{ -public: - /** Default constructor */ - CKernelSelection(); - - /** Constructor that initialises the underlying CKernelTwoSampleTest instance - * - * @param estimator CKernelTwoSampleTest instance to use. - */ - CKernelSelection(CKernelTwoSampleTest* estimator); - - /** Destructor */ - virtual ~CKernelSelection(); - - /** If the the implemented method selects a single kernel, this computes - * criteria for all underlying kernels. If the method selects combined - * kernels, this method returns weights for the baseline kernels - * - * @return vector with criteria or kernel weights - */ - virtual SGVector compute_measures()=0; - - /** Abstract method that performs kernel selection on the base of the - * compute_measures() method and returns the selected kernel which is - * either a single or a combined one (with weights set) - * - * @return selected kernel (SG_REF'ed) - */ - virtual CKernel* select_kernel()=0; - - /** @param estimator the underlying CKernelTwoSampleTest instance */ - void set_estimator(CKernelTwoSampleTest* estimator); - - /** @return the underlying CKernelTwoSampleTest instance */ - CKernelTwoSampleTest* get_estimator() const; - - /** @return name of the SGSerializable */ - virtual const char* get_name() const - { - return "KernelSelection"; - } - -private: - /** Register parameters and initialize with default */ - void init(); - -protected: - /** Underlying kernel two-sample test instance */ - CKernelTwoSampleTest* m_estimator; -}; - -} - -#endif /* KERNEL_SELECTION_H_ */ diff --git a/src/shogun/statistics/KernelTwoSampleTest.cpp b/src/shogun/statistics/KernelTwoSampleTest.cpp deleted file mode 100644 index 0035f1bb9d5..00000000000 --- a/src/shogun/statistics/KernelTwoSampleTest.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2012-2013 Heiko Strathmann - */ - -#include -#include -#include -#include -#include - -using namespace shogun; - -CKernelTwoSampleTest::CKernelTwoSampleTest() : - CTwoSampleTest() -{ - init(); -} - -CKernelTwoSampleTest::CKernelTwoSampleTest(CKernel* kernel, - CFeatures* p_and_q, index_t q_start) : - CTwoSampleTest(p_and_q, q_start) -{ - init(); - - m_kernel=kernel; - SG_REF(kernel); -} - -CKernelTwoSampleTest::CKernelTwoSampleTest(CKernel* kernel, - CFeatures* p, CFeatures* q) : CTwoSampleTest(p, q) -{ - init(); - - m_kernel=kernel; - SG_REF(kernel); -} - -CKernelTwoSampleTest::~CKernelTwoSampleTest() -{ - SG_UNREF(m_kernel); -} - -void CKernelTwoSampleTest::init() -{ - SG_ADD((CSGObject**)&m_kernel, "kernel", "Kernel for two sample test", - MS_AVAILABLE); - m_kernel=NULL; -} - -SGVector CKernelTwoSampleTest::sample_null() -{ - SG_DEBUG("entering!\n"); - - REQUIRE(m_kernel, "No kernel set!\n"); - REQUIRE(m_kernel->get_kernel_type()==K_CUSTOM || m_p_and_q, - "No features and no custom kernel set!\n"); - - /* compute sample statistics for null distribution */ - SGVector results; - - /* only do something if a custom kernel is used: use the power of pre- - * computed kernel matrices - */ - if (m_kernel->get_kernel_type()==K_CUSTOM) - { - /* allocate memory */ - results=SGVector(m_num_null_samples); - - /* in case of custom kernel, there are no features */ - index_t num_data; - if (m_kernel->get_kernel_type()==K_CUSTOM) - num_data=m_kernel->get_num_vec_lhs(); - else - num_data=m_p_and_q->get_num_vectors(); - - /* memory for index permutations, (would slow down loop) */ - SGVector ind_permutation(num_data); - ind_permutation.range_fill(); - - /* check if kernel is a custom kernel. In that case, changing features is - * not what we want but just subsetting the kernel itself */ - CCustomKernel* custom_kernel=(CCustomKernel*)m_kernel; - - for (index_t i=0; iadd_row_subset(ind_permutation); - custom_kernel->add_col_subset(ind_permutation); - - /* compute statistic for this permutation of mixed samples */ - results[i]=compute_statistic(); - - /* remove subsets */ - custom_kernel->remove_row_subset(); - custom_kernel->remove_col_subset(); - } - } - else - { - /* in this case, just use superclass method */ - results=CTwoSampleTest::sample_null(); - } - - SG_DEBUG("leaving!\n"); - - return results; -} diff --git a/src/shogun/statistics/KernelTwoSampleTest.h b/src/shogun/statistics/KernelTwoSampleTest.h deleted file mode 100644 index ba7b9d26160..00000000000 --- a/src/shogun/statistics/KernelTwoSampleTest.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2012-2013 Heiko Strathmann - */ - -#ifndef KERNEL_TWO_SAMPLE_TEST_H_ -#define KERNEL_TWO_SAMPLE_TEST_H_ - -#include - -#include -#include - -namespace shogun -{ - -class CFeatures; -class CKernel; - -/** @brief Kernel two sample test base class. Provides an interface for - * performing a two-sample test using a kernel, i.e. Given samples from two - * distributions \f$p\f$ and \f$q\f$, the null-hypothesis is: \f$H_0: p=q\f$, - * the alternative hypothesis: \f$H_1: p\neq q\f$. - * - * In this class, this is done using a single kernel for the data. - * - * The class also re-implements the sample_null() method. If the underlying - * kernel is a custom one (precomputed), the rows and column of the kernel - * matrix is permuted using subsets. The computation falls back to - * CTwoSampleTest::sample_null() otherwise. - * - * Abstract base class. - */ -class CKernelTwoSampleTest : public CTwoSampleTest -{ - public: - /** default constructor */ - CKernelTwoSampleTest(); - - /** Constructor - * - * @param p_and_q feature data. Is assumed to contain samples from both - * p and q. First all samples from p, then from index q_start all - * samples from q - * - * @param kernel kernel to use - * @param p_and_q samples from p and q, appended - * @param q_start index of first sample of q - */ - CKernelTwoSampleTest(CKernel* kernel, CFeatures* p_and_q, - index_t q_start); - - /** Constructor. - * This is a convienience constructor which copies both features to one - * element and then calls the other constructor. Needs twice the memory - * for a short time - * - * @param kernel kernel for MMD - * @param p samples from distribution p, will be copied and NOT - * SG_REF'ed - * @param q samples from distribution q, will be copied and NOT - * SG_REF'ed - */ - CKernelTwoSampleTest(CKernel* kernel, CFeatures* p, - CFeatures* q); - - /** destructor */ - virtual ~CKernelTwoSampleTest(); - - /** Setter for the underlying kernel - * @param kernel new kernel to use - */ - inline virtual void set_kernel(CKernel* kernel) - { - /* ref before unref to prevent deleting in case objects are the same */ - SG_REF(kernel); - SG_UNREF(m_kernel); - m_kernel=kernel; - } - - /** @return underlying kernel, is SG_REF'ed */ - inline virtual CKernel* get_kernel() - { - SG_REF(m_kernel); - return m_kernel; - } - - /** merges both sets of samples and computes the test statistic - * m_num_null_samples times. This version checks if a precomputed - * custom kernel is used, and, if so, just permutes it instead of re- - * computing it in every iteration. - * - * @return vector of all statistics - */ - virtual SGVector sample_null(); - - /** Same as compute_statistic(), but with the possibility to perform on - * multiple kernels at once - * - * @param multiple_kernels if true, and underlying kernel is K_COMBINED, - * method will be executed on all subkernels on the same data - * @return vector of results for subkernels - */ - virtual SGVector compute_statistic( - bool multiple_kernels)=0; - - /** Wrapper for compute_statistic(false) */ - virtual float64_t compute_statistic()=0; - - virtual const char* get_name() const=0; - - private: - void init(); - - protected: - /** underlying kernel */ - CKernel* m_kernel; -}; - -} - -#endif /* KERNEL_TWO_SAMPLE_TEST_H_ */ diff --git a/src/shogun/statistics/LinearTimeMMD.cpp b/src/shogun/statistics/LinearTimeMMD.cpp deleted file mode 100644 index 735d7dc502f..00000000000 --- a/src/shogun/statistics/LinearTimeMMD.cpp +++ /dev/null @@ -1,458 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -using namespace shogun; - -CLinearTimeMMD::CLinearTimeMMD() : CStreamingMMD() -{ -} - -CLinearTimeMMD::CLinearTimeMMD(CKernel* kernel, CStreamingFeatures* p, - CStreamingFeatures* q, index_t m, index_t blocksize) - : CStreamingMMD(kernel, p, q, m, blocksize) -{ -} - -CLinearTimeMMD::~CLinearTimeMMD() -{ -} - -void CLinearTimeMMD::compute_squared_mmd(CKernel* kernel, CList* data, - SGVector& current, SGVector& pp, - SGVector& qq, SGVector& pq, - SGVector& qp, index_t num_this_run) -{ - SG_DEBUG("entering!\n"); - - REQUIRE(data->get_num_elements()==4, "Wrong number of blocks!\n"); - - /* cast is safe the list is passed inside the class - * features will be SG_REF'ed once again by these get methods */ - CFeatures* p1=(CFeatures*)data->get_first_element(); - CFeatures* p2=(CFeatures*)data->get_next_element(); - CFeatures* q1=(CFeatures*)data->get_next_element(); - CFeatures* q2=(CFeatures*)data->get_next_element(); - - SG_DEBUG("computing MMD values for current kernel!\n"); - - /* compute kernel matrix diagonals */ - kernel->init(p1, p2); - kernel->get_kernel_diagonal(pp); - - kernel->init(q1, q2); - kernel->get_kernel_diagonal(qq); - - kernel->init(p1, q2); - kernel->get_kernel_diagonal(pq); - - kernel->init(q1, p2); - kernel->get_kernel_diagonal(qp); - - /* cleanup */ - SG_UNREF(p1); - SG_UNREF(p2); - SG_UNREF(q1); - SG_UNREF(q2); - - /* compute sum of current h terms for current kernel */ - - for (index_t i=0; i CLinearTimeMMD::compute_squared_mmd(CKernel* kernel, - CList* data, index_t num_this_run) -{ - /* wrapper method used for convenience for using preallocated memory */ - SGVector current(num_this_run); - SGVector pp(num_this_run); - SGVector qq(num_this_run); - SGVector pq(num_this_run); - SGVector qp(num_this_run); - compute_squared_mmd(kernel, data, current, pp, qq, pq, qp, num_this_run); - return current; -} - -void CLinearTimeMMD::compute_statistic_and_variance( - SGVector& statistic, SGVector& variance, - bool multiple_kernels) -{ - SG_DEBUG("entering!\n") - - REQUIRE(m_streaming_p, "streaming features p required!\n"); - REQUIRE(m_streaming_q, "streaming features q required!\n"); - - REQUIRE(m_kernel, "kernel needed!\n"); - - /* make sure multiple_kernels flag is used only with a combined kernel */ - REQUIRE(!multiple_kernels || m_kernel->get_kernel_type()==K_COMBINED, - "multiple kernels specified, but underlying kernel is not of type " - "K_COMBINED\n"); - - /* m is number of samples from each distribution, m_2 is half of it - * using names from JLMR paper (see class documentation) */ - index_t m_2=m_m/2; - - SG_DEBUG("m_m=%d\n", m_m) - - /* find out whether single or multiple kernels (cast is safe, check above) */ - index_t num_kernels=1; - if (multiple_kernels) - { - num_kernels=((CCombinedKernel*)m_kernel)->get_num_subkernels(); - SG_DEBUG("computing MMD and variance for %d sub-kernels\n", - num_kernels); - } - - /* allocate memory for results if vectors are empty */ - if (!statistic.vector) - statistic=SGVector(num_kernels); - - if (!variance.vector) - variance=SGVector(num_kernels); - - /* ensure right dimensions */ - REQUIRE(statistic.vlen==num_kernels, - "statistic vector size (%d) does not match number of kernels (%d)\n", - statistic.vlen, num_kernels); - - REQUIRE(variance.vlen==num_kernels, - "variance vector size (%d) does not match number of kernels (%d)\n", - variance.vlen, num_kernels); - - /* temp variable in the algorithm */ - float64_t delta; - - /* initialise statistic and variance since they are cumulative */ - statistic.zero(); - variance.zero(); - - /* needed for online mean and variance */ - SGVector term_counters(num_kernels); - term_counters.set_const(1); - - /* term counter to compute online mean and variance */ - index_t num_examples_processed=0; - while (num_examples_processedget_kernel(i); - - /* compute linear time MMD values */ - SGVector current=compute_squared_mmd(kernel, data, - num_this_run); - - /* single variances for all kernels. Update mean and variance - * using Knuth's online variance algorithm. - * C.f. for example Wikipedia */ - for (index_t j=0; jget_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG) - statistic.display_vector("statistics"); - - /* variance of terms can be computed using mean (statistic). - * Note that the variance needs to be divided by m_2 in order to get - * variance of null-distribution */ - for (index_t i=0; iget_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG) - variance.display_vector("variances"); - - SG_DEBUG("leaving!\n") -} - -void CLinearTimeMMD::compute_statistic_and_Q( - SGVector& statistic, SGMatrix& Q) -{ - SG_DEBUG("entering!\n") - - REQUIRE(m_streaming_p, "streaming features p required!\n"); - REQUIRE(m_streaming_q, "streaming features q required!\n"); - - REQUIRE(m_kernel, "kernel needed!\n"); - - /* make sure multiple_kernels flag is used only with a combined kernel */ - REQUIRE(m_kernel->get_kernel_type()==K_COMBINED, - "underlying kernel is not of type K_COMBINED\n"); - - /* cast combined kernel */ - CCombinedKernel* combined=(CCombinedKernel*)m_kernel; - - /* m is number of samples from each distribution, m_4 is quarter of it */ - REQUIRE(m_m>=4, "Need at least m>=4\n"); - index_t m_4=m_m/4; - - SG_DEBUG("m_m=%d\n", m_m) - - /* find out whether single or multiple kernels (cast is safe, check above) */ - index_t num_kernels=combined->get_num_subkernels(); - REQUIRE(num_kernels>0, "At least one kernel is needed\n"); - - /* allocate memory for results if vectors are empty */ - if (!statistic.vector) - statistic=SGVector(num_kernels); - - if (!Q.matrix) - Q=SGMatrix(num_kernels, num_kernels); - - /* ensure right dimensions */ - REQUIRE(statistic.vlen==num_kernels, - "statistic vector size (%d) does not match number of kernels (%d)\n", - statistic.vlen, num_kernels); - - REQUIRE(Q.num_rows==num_kernels, - "Q number of rows does (%d) not match number of kernels (%d)\n", - Q.num_rows, num_kernels); - - REQUIRE(Q.num_cols==num_kernels, - "Q number of columns (%d) does not match number of kernels (%d)\n", - Q.num_cols, num_kernels); - - /* initialise statistic and variance since they are cumulative */ - statistic.zero(); - Q.zero(); - - /* produce two kernel lists to iterate doubly nested */ - CList* list_i=new CList(); - CList* list_j=new CList(); - - for (index_t k_idx=0; k_idxget_num_kernels(); k_idx++) - { - CKernel* kernel = combined->get_kernel(k_idx); - list_i->append_element(kernel); - list_j->append_element(kernel); - SG_UNREF(kernel); - } - - /* needed for online mean and variance */ - SGVector term_counters_statistic(num_kernels); - SGMatrix term_counters_Q(num_kernels, num_kernels); - term_counters_statistic.set_const(1); - term_counters_Q.set_const(1); - - index_t num_examples_processed=0; - while (num_examples_processedget_num_elements(); - CFeatures* current=(CFeatures*)data->get_first_element(); - data_a->append_element(current); - SG_UNREF(current); - current=(CFeatures*)data->get_next_element(); - data_b->append_element(current); - SG_UNREF(current); - num_elements-=2; - /* loop counter is safe since num_elements can only be even */ - while (num_elements) - { - current=(CFeatures*)data->get_next_element(); - data_a->append_element(current); - SG_UNREF(current); - current=(CFeatures*)data->get_next_element(); - data_b->append_element(current); - SG_UNREF(current); - num_elements-=2; - } - /* safely unref previous list of data, decreases refcounts of features - * but doesn't delete them */ - SG_UNREF(data); - - /* now for each of these streamed data instances, iterate through all - * kernels and update Q matrix while also computing MMD statistic */ - - /* preallocate some memory for faster processing */ - SGVector pp(num_this_run); - SGVector qq(num_this_run); - SGVector pq(num_this_run); - SGVector qp(num_this_run); - SGVector h_i_a(num_this_run); - SGVector h_i_b(num_this_run); - SGVector h_j_a(num_this_run); - SGVector h_j_b(num_this_run); - - /* iterate through Q matrix and update values, compute mmd */ - CKernel* kernel_i=(CKernel*)list_i->get_first_element(); - for (index_t i=0; iget_first_element(); - for (index_t j=0; j<=i; ++j) - { - /* compute all necessary 8 h-vectors for this burst. - * h_delta-terms for each kernel, expression 7 of NIPS paper */ - - /* second kernel, a-part */ - compute_squared_mmd(kernel_j, data_a, h_j_a, pp, qq, pq, qp, - num_this_run); - - /* second kernel, b-part */ - compute_squared_mmd(kernel_j, data_b, h_j_b, pp, qq, pq, qp, - num_this_run); - - float64_t term; - for (index_t it=0; itget_next_element(); - } - - /* update MMD statistic online computation for kernel i, using - * vectors that were computed above */ - SGVector h(num_this_run*2); - for (index_t it=0; itget_next_element(); - } - - /* clean up streamed data */ - SG_UNREF(data_a); - SG_UNREF(data_b); - - /* add number of processed examples for this run */ - num_examples_processed+=num_this_run; - } - - /* clean up */ - SG_UNREF(list_i); - SG_UNREF(list_j); - - SG_DEBUG("Done compouting statistic, processed 4*%d examples.\n", - num_examples_processed); - - SG_DEBUG("leaving!\n") -} - diff --git a/src/shogun/statistics/LinearTimeMMD.h b/src/shogun/statistics/LinearTimeMMD.h deleted file mode 100644 index 84049792803..00000000000 --- a/src/shogun/statistics/LinearTimeMMD.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#ifndef LINEAR_TIME_MMD_H_ -#define LINEAR_TIME_MMD_H_ - -#include - -#include - -namespace shogun -{ - -class CStreamingFeatures; -class CFeatures; - -/** @brief This class implements the linear time Maximum Mean Statistic as - * described in [1] for streaming data (see CStreamingMMD for description). - * - * Given two sets of samples \f$\{x_i\}_{i=1}^m\sim p\f$ and - * \f$\{y_i\}_{i=1}^m\sim q\f$ - * the (unbiased) statistic is computed as - * \f[ - * \text{MMD}_l^2[\mathcal{F},X,Y]=\frac{1}{m_2}\sum_{i=1}^{m_2} - * h(z_{2i},z_{2i+1}) - * \f] - * where - * \f[ - * h(z_{2i},z_{2i+1})=k(x_{2i},x_{2i+1})+k(y_{2i},y_{2i+1})-k(x_{2i},y_{2i+1})- - * k(x_{2i+1},y_{2i}) - * \f] - * and \f$ m_2=\lfloor\frac{m}{2} \rfloor\f$. - * - * [1]: Gretton, A., Borgwardt, K. M., Rasch, M. J., Schoelkopf, B., - * & Smola, A. (2012). A Kernel Two-Sample Test. Journal of Machine Learning - * Research, 13, 671-721. - */ -class CLinearTimeMMD: public CStreamingMMD -{ -public: - /** default constructor */ - CLinearTimeMMD(); - - /** Constructor. - * @param kernel kernel to use - * @param p streaming features p to use - * @param q streaming features q to use - * @param m number of samples from each distribution - * @param blocksize size of examples that are processed at once when - * computing statistic/threshold. If larger than m/2, all examples will be - * processed at once. Memory consumption increased linearly in the - * blocksize. Choose as large as possible regarding available memory. - */ - CLinearTimeMMD(CKernel* kernel, CStreamingFeatures* p, - CStreamingFeatures* q, index_t m, index_t blocksize=10000); - - /** destructor */ - virtual ~CLinearTimeMMD(); - - /** Computes squared MMD and a variance estimate, in linear time. - * If multiple_kernels is set to true, each subkernel is evaluated on the - * same data. - * - * @param statistic return parameter for statistic, vector with entry for - * each kernel. May be allocated before but doesn not have to be - * - * @param variance return parameter for statistic, vector with entry for - * each kernel. May be allocated before but doesn not have to be - * - * @param multiple_kernels optional flag, if set to true, it is assumed that - * the underlying kernel is of type K_COMBINED. Then, the MMD is computed on - * all subkernel separately rather than computing it on the combination. - * This is used by kernel selection strategies that need to evaluate - * multiple kernels on the same data. Since the linear time MMD works on - * streaming data, one cannot simply compute MMD, change kernel since data - * would be different for every kernel. - */ - virtual void compute_statistic_and_variance( - SGVector& statistic, SGVector& variance, - bool multiple_kernels=false); - - /** Same as compute_statistic_and_variance, but computes a linear time - * estimate of the covariance of the multiple-kernel-MMD. - * See [1] for details. - */ - virtual void compute_statistic_and_Q( - SGVector& statistic, SGMatrix& Q); - - /** returns the statistic type of this test statistic */ - virtual EStatisticType get_statistic_type() const - { - return S_LINEAR_TIME_MMD; - } - - /** @return the class name */ - virtual const char* get_name() const - { - return "LinearTimeMMD"; - } - -protected: - /** method that computes the squared MMD in linear time (see class - * description for the equation) - * - * @param kernel the kernel to be used for computing MMD. This will be - * useful when multiple kernels are used - * @param data the list of data on which kernels are computed. The order - * of data in the list is \f$x,x',\cdots\sim p\f$ followed by - * \f$y,y',\cdots\sim q\f$. It is assumed that detele_data flag is set - * inside the list - * @param num_this_run number of data points in current blocks - * @return the MMD values (the h-vectors) - */ - virtual SGVector compute_squared_mmd(CKernel* kernel, - CList* data, index_t num_this_run); - -private: - /** helper method, same as compute_squared_mmd with an option to use - * preallocated memory for faster processing */ - void compute_squared_mmd(CKernel* kernel, CList* data, - SGVector& current, SGVector& pp, - SGVector& qq, SGVector& pq, - SGVector& qp, index_t num_this_run); - -}; - -} - -#endif /* LINEAR_TIME_MMD_H_ */ - diff --git a/src/shogun/statistics/MMDKernelSelection.cpp b/src/shogun/statistics/MMDKernelSelection.cpp deleted file mode 100644 index 4882423ad57..00000000000 --- a/src/shogun/statistics/MMDKernelSelection.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#include -#include -#include -#include -#include - -using namespace shogun; - -CMMDKernelSelection::CMMDKernelSelection() -{ -} - -CMMDKernelSelection::CMMDKernelSelection(CKernelTwoSampleTest* mmd) - : CKernelSelection(mmd) -{ - /* ensure that mmd contains an instance of a MMD related class - TODO - Add S_BTEST_MMD when feature/mmd is merged with develop */ - REQUIRE(mmd->get_statistic_type()==S_LINEAR_TIME_MMD || - mmd->get_statistic_type()==S_QUADRATIC_TIME_MMD, - "Provided instance for kernel two sample testing has to be a MMD-" - "based class! The provided is of class \"%s\"\n", mmd->get_name()); -} - -CMMDKernelSelection::~CMMDKernelSelection() -{ -} - -CKernel* CMMDKernelSelection::select_kernel() -{ - SG_DEBUG("entering\n") - - /* compute measures and return single kernel with maximum measure */ - SGVector measures=compute_measures(); - - /* find maximum and return corresponding kernel */ - float64_t max=measures[0]; - index_t max_idx=0; - for (index_t i=1; imax) - { - max=measures[i]; - max_idx=i; - } - } - - /* find kernel with corresponding index */ - CCombinedKernel* combined=(CCombinedKernel*)m_estimator->get_kernel(); - CKernel* current=combined->get_kernel(max_idx); - - SG_UNREF(combined); - SG_DEBUG("leaving\n"); - - /* current is not SG_UNREF'ed nor SG_REF'ed since the counter needs to be - * incremented exactly by one */ - return current; -} - diff --git a/src/shogun/statistics/MMDKernelSelection.h b/src/shogun/statistics/MMDKernelSelection.h deleted file mode 100644 index 5616b2bbdd5..00000000000 --- a/src/shogun/statistics/MMDKernelSelection.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#ifndef __MMDKERNELSELECTION_H_ -#define __MMDKERNELSELECTION_H_ - -#include -#include - -namespace shogun -{ - -class CKernelTwoSampleTest; -class CKernel; - -/** @brief Base class for kernel selection for MMD-based two-sample test - * statistic implementations. - * Provides abstract methods for selecting kernels and computing criteria or - * kernel weights for the implemented method. In order to implement new methods - * for kernel selection, simply write a new implementation of this class. - * - * Kernel selection works this way: One passes an instance of CCombinedKernel - * to the MMD statistic and appends all kernels that should be considered. - * Depending on the type of kernel selection implementation, a single one or - * a combination of those baseline kernels is selected and returned to the user. - * This kernel can then be passed to the MMD instance to perform a test. - * - */ -class CMMDKernelSelection: public CKernelSelection -{ -public: - - /** Default constructor */ - CMMDKernelSelection(); - - /** Constructor that initialises the underlying MMD instance - * - * @param mmd MMD instance to use. Has to be an MMD based kernel two-sample - * test. Currently: linear or quadratic time MMD. - */ - CMMDKernelSelection(CKernelTwoSampleTest* mmd); - - /** Destructor */ - virtual ~CMMDKernelSelection(); - - /** If the the implemented method selects a single kernel, this computes - * criteria for all underlying kernels. If the method selects combined - * kernels, this method returns weights for the baseline kernels - * - * @return vector with criteria or kernel weights - */ - virtual SGVector compute_measures()=0; - - /** Performs kernel selection on the base of the compute_measures() method - * and returns the selected kernel which is either a single or a combined - * one (with weights set) - * - * @return selected kernel (SG_REF'ed) - */ - virtual CKernel* select_kernel(); - - /** @return name of the SGSerializable */ - virtual const char* get_name() const - { - return "MMDKernelSelection"; - } - -}; - -} - -#endif /* __MMDKERNELSELECTION_H_ */ diff --git a/src/shogun/statistics/MMDKernelSelectionComb.cpp b/src/shogun/statistics/MMDKernelSelectionComb.cpp deleted file mode 100644 index aa1da177181..00000000000 --- a/src/shogun/statistics/MMDKernelSelectionComb.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2012-2013 Heiko Strathmann - */ - -#include -#ifdef USE_GPL_SHOGUN - -#include -#include -#include - -using namespace shogun; - -CMMDKernelSelectionComb::CMMDKernelSelectionComb() : - CMMDKernelSelection() -{ - init(); -} - -CMMDKernelSelectionComb::CMMDKernelSelectionComb( - CKernelTwoSampleTest* mmd) : CMMDKernelSelection(mmd) -{ - init(); -} - -CMMDKernelSelectionComb::~CMMDKernelSelectionComb() -{ -} - -void CMMDKernelSelectionComb::init() -{ - SG_ADD(&m_opt_max_iterations, "opt_max_iterations", "Maximum number of " - "iterations for qp solver", MS_NOT_AVAILABLE); - SG_ADD(&m_opt_epsilon, "opt_epsilon", "Stopping criterion for qp solver", - MS_NOT_AVAILABLE); - SG_ADD(&m_opt_low_cut, "opt_low_cut", "Low cut value for optimization " - "kernel weights", MS_NOT_AVAILABLE); - - /* sensible values for optimization */ - m_opt_max_iterations=10000; - m_opt_epsilon=10E-15; - m_opt_low_cut=10E-7; -} - -CKernel* CMMDKernelSelectionComb::select_kernel() -{ - /* cast is safe due to assertion in constructor */ - CCombinedKernel* combined=(CCombinedKernel*)m_estimator->get_kernel(); - - /* optimise for kernel weights and set them */ - SGVector weights=compute_measures(); - combined->set_subkernel_weights(weights); - - /* note that kernel is SG_REF'ed from getter above */ - return combined; -} - -/* no reference counting, use the static context constructor of SGMatrix */ -SGMatrix CMMDKernelSelectionComb::m_Q=SGMatrix(false); - -const float64_t* CMMDKernelSelectionComb::get_Q_col(uint32_t i) -{ - return &m_Q[m_Q.num_rows*i]; -} - -/** helper function that prints current state */ -void CMMDKernelSelectionComb::print_state(libqp_state_T state) -{ - SG_SDEBUG("libqp state: primal=%f\n", state.QP); -} - -SGVector CMMDKernelSelectionComb::solve_optimization( - SGVector mmds) -{ - /* readability */ - index_t num_kernels=mmds.vlen; - - /* compute sum of mmds to generate feasible point for convex program */ - float64_t sum_mmds=0; - for (index_t i=0; i Q_diag(num_kernels); - SGVector f(num_kernels); - SGVector lb(num_kernels); - SGVector ub(num_kernels); - SGVector weights(num_kernels); - - /* init everything, there are two cases possible: i) at least one mmd is - * is positive, ii) all mmds are negative */ - bool one_pos=false; - for (index_t i=0; i0) - { - SG_DEBUG("found at least one positive MMD\n") - one_pos=true; - break; - } - } - - if (!one_pos) - { - SG_WARNING("All mmd estimates are negative. This is techically possible," - "although extremely rare. Consider using different kernels. " - "This combination will lead to a bad two-sample test. Since any" - "combination is bad, will now just return equally distributed " - "kernel weights\n"); - - /* if no element is positive, we can choose arbritary weights since - * the results will be bad anyway */ - weights.set_const(1.0/num_kernels); - } - else - { - SG_DEBUG("one MMD entry is positive, performing optimisation\n") - /* do optimisation, init vectors */ - for (index_t i=0; i - -#ifdef USE_GPL_SHOGUN - -#include -#include -#include - -namespace shogun -{ - -class CLinearTimeMMD; - -/** @brief Base class for kernel selection of combined kernels. Given an MMD - * instance whose underlying kernel is a combined one, this class provides an - * interface to select weights of this combined kernel. - */ -class CMMDKernelSelectionComb: public CMMDKernelSelection -{ -public: - - /** Default constructor */ - CMMDKernelSelectionComb(); - - /** Constructor that initialises the underlying MMD instance. Currently, - * only the linear time MMD is supported - * - * @param mmd MMD instance to use - */ - CMMDKernelSelectionComb(CKernelTwoSampleTest* mmd); - - /** Destructor */ - virtual ~CMMDKernelSelectionComb(); - - /** @return computes weights for the underlying kernel, sets them to it, and - * returns it (SG_REF'ed) - * - * @return underlying kernel with weights set - */ - virtual CKernel* select_kernel(); - - /** @return name of the SGSerializable */ - virtual const char* get_name() const=0; - -protected: - /** Solves the quadratic program - * \f[ - * \min_\beta \{\beta^T Q \beta \quad \text{s.t.}\quad \beta^T \eta=1, \beta\succeq 0\}, - * \f] - * where \f$\eta\f$ is a given parameter and \f$Q\f$ is the m_Q member. - * - * Note that at least one element is assumed \f$\eta\f$ has to be positive. - * - * @param mmds values that will be put into \f$\eta\f$. At least one element - * is assumed to be positive - * @return result of optimization \f$\beta\f$ - */ - virtual SGVector solve_optimization(SGVector mmds); - - /** return pointer to i-th column of m_Q. Helper for libqp */ - static const float64_t* get_Q_col(uint32_t i); - - /** helper function that prints current state */ - static void print_state(libqp_state_T state); - - /** maximum number of iterations of qp solver */ - index_t m_opt_max_iterations; - - /** stopping accuracy of qp solver */ - float64_t m_opt_epsilon; - - /** low cut for weights, if weights are under this value, are set to zero */ - float64_t m_opt_low_cut; - - /** matrix for selection of kernel weights (static because of libqp) */ - static SGMatrix m_Q; - -private: - /** initializer */ - void init(); -}; - -} - -#endif //USE_GPL_SHOGUN -#endif /* __MMDKERNELSELECTIONCOMB_H_ */ diff --git a/src/shogun/statistics/MMDKernelSelectionCombMaxL2.cpp b/src/shogun/statistics/MMDKernelSelectionCombMaxL2.cpp deleted file mode 100644 index e150e7ce8c2..00000000000 --- a/src/shogun/statistics/MMDKernelSelectionCombMaxL2.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2013 Heiko Strathmann - */ - -#include -#ifdef USE_GPL_SHOGUN - -#include -#include -#include -#include - - -using namespace shogun; - -CMMDKernelSelectionCombMaxL2::CMMDKernelSelectionCombMaxL2() : - CMMDKernelSelectionComb() -{ -} - -CMMDKernelSelectionCombMaxL2::CMMDKernelSelectionCombMaxL2( - CKernelTwoSampleTest* mmd) : CMMDKernelSelectionComb(mmd) -{ - /* currently, this method is only developed for the linear time MMD */ - REQUIRE(mmd->get_statistic_type()==S_QUADRATIC_TIME_MMD || - mmd->get_statistic_type()==S_LINEAR_TIME_MMD, "%s::%s(): Only " - "CLinearTimeMMD is currently supported! Provided instance is " - "\"%s\"\n", get_name(), get_name(), mmd->get_name()); -} - -CMMDKernelSelectionCombMaxL2::~CMMDKernelSelectionCombMaxL2() -{ -} - -SGVector CMMDKernelSelectionCombMaxL2::compute_measures() -{ - /* cast is safe due to assertion in constructor */ - CCombinedKernel* kernel=(CCombinedKernel*)m_estimator->get_kernel(); - index_t num_kernels=kernel->get_num_subkernels(); - SG_UNREF(kernel); - - /* compute mmds for all underlying kernels and create identity matrix Q - * (see NIPS paper) */ - SGVector mmds=m_estimator->compute_statistic(true); - - /* free matrix by hand since it is static */ - SG_FREE(m_Q.matrix); - m_Q.matrix=NULL; - m_Q.num_rows=0; - m_Q.num_cols=0; - m_Q=SGMatrix(num_kernels, num_kernels, false); - for (index_t i=0; i result=CMMDKernelSelectionComb::solve_optimization(mmds); - - /* free matrix by hand since it is static (again) */ - SG_FREE(m_Q.matrix); - m_Q.matrix=NULL; - m_Q.num_rows=0; - m_Q.num_cols=0; - - return result; -} -#endif //USE_GPL_SHOGUN diff --git a/src/shogun/statistics/MMDKernelSelectionCombMaxL2.h b/src/shogun/statistics/MMDKernelSelectionCombMaxL2.h deleted file mode 100644 index 430300f97cc..00000000000 --- a/src/shogun/statistics/MMDKernelSelectionCombMaxL2.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2013 Heiko Strathmann - */ - -#ifndef __MMDKERNELSELECTIONCOMBMAXL2_H_ -#define __MMDKERNELSELECTIONCOMBMAXL2_H_ - -#include -#ifdef USE_GPL_SHOGUN -#include -#include - -namespace shogun -{ - -/** @brief Implementation of maximum MMD kernel selection for combined kernel. - * This class selects a combination of baseline kernels that maximises the - * the MMD for a combined kernel based on a L2-regularization approach. This - * boils down to solve the convex program - * \f[ - * \min_\beta \{\beta^T \beta \quad \text{s.t.}\quad \beta^T \eta=1, \beta\succeq 0\}, - * \f] - * where \f$\eta\f$ is a vector whose elements are the MMDs of the baseline - * kernels. - * - * This is meant to work for the CQuadraticTimeMMD statistic. - * Optimal weight selecton for CLinearTimeMMD can be found in - * CMMDKernelSelectionCombOpt. - * - * The method is described in - * Gretton, A., Sriperumbudur, B., Sejdinovic, D., Strathmann, H., - * Balakrishnan, S., Pontil, M., & Fukumizu, K. (2012). - * Optimal kernel choice for large-scale two-sample tests. - * Advances in Neural Information Processing Systems. - */ -class CMMDKernelSelectionCombMaxL2: public CMMDKernelSelectionComb -{ -public: - - /** Default constructor */ - CMMDKernelSelectionCombMaxL2(); - - /** Constructor that initialises the underlying MMD instance - * - * @param mmd MMD instance to use. Has to be an MMD based kernel two-sample - * test. Currently: linear or quadratic time MMD. - */ - CMMDKernelSelectionCombMaxL2(CKernelTwoSampleTest* mmd); - - /** Destructor */ - virtual ~CMMDKernelSelectionCombMaxL2(); - - /** Computes kernel weights which maximise the MMD of the underlying - * combined kernel using L2-regularization. - * - * This boils down to solving a convex program which is quadratic in the - * number of kernels. See class description. - * - * SHOGUN has to be compiled with LAPACK to make this available. See - * set_opt* methods for optimization parameters. - * - * IMPORTANT: Kernel weights have to be learned on different data than is - * used for testing/evaluation! - */ - virtual SGVector compute_measures(); - - /** @return name of the SGSerializable */ - virtual const char* get_name() const - { - return "MMDKernelSelectionCombMaxL2"; - } -}; - -} -#endif //USE_GPL_SHOGUN -#endif /* __MMDKERNELSELECTIONCOMBMAXL2_H_ */ diff --git a/src/shogun/statistics/MMDKernelSelectionCombOpt.cpp b/src/shogun/statistics/MMDKernelSelectionCombOpt.cpp deleted file mode 100644 index ceecf63b500..00000000000 --- a/src/shogun/statistics/MMDKernelSelectionCombOpt.cpp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2012-2013 Heiko Strathmann - */ - -#include -#ifdef USE_GPL_SHOGUN - -#include -#include -#include - - -using namespace shogun; - -CMMDKernelSelectionCombOpt::CMMDKernelSelectionCombOpt() : - CMMDKernelSelectionComb() -{ - init(); -} - -CMMDKernelSelectionCombOpt::CMMDKernelSelectionCombOpt( - CKernelTwoSampleTest* mmd, float64_t lambda) : - CMMDKernelSelectionComb(mmd) -{ - /* currently, this method is only developed for the linear time MMD */ - REQUIRE(dynamic_cast(mmd), "%s::%s(): Only " - "CLinearTimeMMD is currently supported! Provided instance is " - "\"%s\"\n", get_name(), get_name(), mmd->get_name()); - - init(); - - m_lambda=lambda; -} - -CMMDKernelSelectionCombOpt::~CMMDKernelSelectionCombOpt() -{ -} - -void CMMDKernelSelectionCombOpt::init() -{ - /* set to a sensible standard value that proved to be useful in - * experiments, see NIPS paper */ - m_lambda=1E-5; - - SG_ADD(&m_lambda, "lambda", "Regularization parameter lambda", - MS_NOT_AVAILABLE); -} - -SGVector CMMDKernelSelectionCombOpt::compute_measures() -{ - /* cast is safe due to assertion in constructor */ - CCombinedKernel* kernel=(CCombinedKernel*)m_estimator->get_kernel(); - index_t num_kernels=kernel->get_num_subkernels(); - SG_UNREF(kernel); - - /* allocate space for MMDs and Q matrix */ - SGVector mmds(num_kernels); - - /* free matrix by hand since it is static */ - SG_FREE(m_Q.matrix); - m_Q.matrix=NULL; - m_Q.num_rows=0; - m_Q.num_cols=0; - m_Q=SGMatrix(num_kernels, num_kernels, false); - - /* online compute mmds and covariance matrix Q of kernels */ - ((CLinearTimeMMD*)m_estimator)->compute_statistic_and_Q(mmds, m_Q); - - /* evtl regularize to avoid numerical problems (see NIPS paper) */ - if (m_lambda) - { - SG_DEBUG("regularizing matrix Q by adding %f to diagonal\n", m_lambda) - for (index_t i=0; iget_loglevel()==MSG_DEBUG) - { - m_Q.display_matrix("(regularized) Q"); - mmds.display_vector("mmds"); - } - - /* solve the generated problem */ - SGVector result=CMMDKernelSelectionComb::solve_optimization(mmds); - - /* free matrix by hand since it is static (again) */ - SG_FREE(m_Q.matrix); - m_Q.matrix=NULL; - m_Q.num_rows=0; - m_Q.num_cols=0; - - return result; -} -#endif //USE_GPL_SHOGUN diff --git a/src/shogun/statistics/MMDKernelSelectionCombOpt.h b/src/shogun/statistics/MMDKernelSelectionCombOpt.h deleted file mode 100644 index 9e7223ea6ee..00000000000 --- a/src/shogun/statistics/MMDKernelSelectionCombOpt.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2012-2013 Heiko Strathmann - */ - -#ifndef __MMDKERNELSELECTIONCOMBOPT_H_ -#define __MMDKERNELSELECTIONCOMBOPT_H_ - -#include -#ifdef USE_GPL_SHOGUN - -#include - -namespace shogun -{ - -class CLinearTimeMMD; - -/** @brief Implementation of optimal kernel selection for combined kernel. - * This class selects a combination of baseline kernels that maximises the - * ratio of the MMD and its standard deviation for a combined kernel. This - * boils down to solve the convex program - * \f[ - * \min_\beta \{\beta^T (Q+\lambda_m) \beta \quad \text{s.t.}\quad \beta^T \eta=1, \beta\succeq 0\}, - * \f] - * where \f$\eta\f$ is a vector whose elements are the MMDs of the baseline - * kernels and \f$Q\f$ is a linear time estimate of the covariance of \f$\eta\f$. - * - * This only works for the CLinearTimeMMD statistic. * - * IMPORTANT: The kernel has to be selected on different data than the two-sample - * test is performed on. - * - * The method is described in - * Gretton, A., Sriperumbudur, B., Sejdinovic, D., Strathmann, H., - * Balakrishnan, S., Pontil, M., & Fukumizu, K. (2012). - * Optimal kernel choice for large-scale two-sample tests. - * Advances in Neural Information Processing Systems. - */ -class CMMDKernelSelectionCombOpt: public CMMDKernelSelectionComb -{ -public: - - /** Default constructor */ - CMMDKernelSelectionCombOpt(); - - /** Constructor that initialises the underlying MMD instance - * - * @param mmd linear time mmd MMD instance to use. - * @param lambda ridge that is added to standard deviation, a sensible value - * is 10E-5 which is the default - */ - CMMDKernelSelectionCombOpt(CKernelTwoSampleTest* mmd, - float64_t lambda=10E-5); - - /** Destructor */ - virtual ~CMMDKernelSelectionCombOpt(); - - /** Computes optimal kernel weights using the ratio of the squared MMD by its - * standard deviation as a criterion, where both expressions are estimated - * in linear time. - * - * This boils down to solving a convex program which is quadratic in the - * number of kernels. See class description. - * - * SHOGUN has to be compiled with LAPACK to make this available. See - * set_opt* methods for optimization parameters. - * - * IMPORTANT: Kernel weights have to be learned on different data than is - * used for testing/evaluation! - */ - virtual SGVector compute_measures(); - - /** @return name of the SGSerializable */ - virtual const char* get_name() const - { - return "MMDKernelSelectionCombOpt"; - } - -private: - /** Initializer */ - void init(); - -protected: - /** Ridge that is added to the diagonal of the Q matrix in the optimization - * problem */ - float64_t m_lambda; -}; - -} -#endif //USE_GPL_SHOGUN -#endif /* __MMDKERNELSELECTIONCOMBOPT_H_ */ diff --git a/src/shogun/statistics/MMDKernelSelectionMax.cpp b/src/shogun/statistics/MMDKernelSelectionMax.cpp deleted file mode 100644 index fe2c2c7ca6e..00000000000 --- a/src/shogun/statistics/MMDKernelSelectionMax.cpp +++ /dev/null @@ -1,32 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2012-2013 Heiko Strathmann - */ - -#include -#include - -using namespace shogun; - -CMMDKernelSelectionMax::CMMDKernelSelectionMax() : CMMDKernelSelection() -{ -} - -CMMDKernelSelectionMax::CMMDKernelSelectionMax( - CKernelTwoSampleTest* mmd) : CMMDKernelSelection(mmd) -{ -} - -CMMDKernelSelectionMax::~CMMDKernelSelectionMax() -{ -} - -SGVector CMMDKernelSelectionMax::compute_measures() -{ - /* simply return vector with MMDs */ - return m_estimator->compute_statistic(true); -} diff --git a/src/shogun/statistics/MMDKernelSelectionMax.h b/src/shogun/statistics/MMDKernelSelectionMax.h deleted file mode 100644 index 34bb9fcadc0..00000000000 --- a/src/shogun/statistics/MMDKernelSelectionMax.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2012-2013 Heiko Strathmann - */ - -#ifndef __MMDKERNELSELECTIONMAX_H_ -#define __MMDKERNELSELECTIONMAX_H_ - -#include - -#include - -namespace shogun -{ - -/** @brief Kernel selection class that selects the single kernel that maximises - * the MMD statistic. Works for CQuadraticTimeMMD and CLinearTimeMMD. This leads - * to a heuristic that is better than the standard median heuristic for - * Gaussian kernels. However, it comes with no guarantees. - * - * Optimal selection of single kernels can be found in the class - * CMMDKernelSelectionOpt - * - * This method was first described in - * Sriperumbudur, B., Fukumizu, K., Gretton, A., Lanckriet, G. R. G., - * & Schoelkopf, B. - * Kernel choice and classifiability for RKHS embeddings of probability - * distributions. Advances in Neural Information Processing Systems (2009). - */ -class CMMDKernelSelectionMax: public CMMDKernelSelection -{ -public: - - /** Default constructor */ - CMMDKernelSelectionMax(); - - /** Constructor that initialises the underlying MMD instance - * - * @param mmd MMD instance to use. Has to be an MMD based kernel two-sample - * test. Currently: linear or quadratic time MMD. - */ - CMMDKernelSelectionMax(CKernelTwoSampleTest* mmd); - - /** Destructor */ - virtual ~CMMDKernelSelectionMax(); - - /** @return vector the MMD of all single baseline kernels */ - virtual SGVector compute_measures(); - - /** @return name of the SGSerializable */ - virtual const char* get_name() const { return "MMDKernelSelectionMax"; } -}; - -} - -#endif /* __MMDKERNELSELECTIONMAX_H_ */ diff --git a/src/shogun/statistics/MMDKernelSelectionMedian.cpp b/src/shogun/statistics/MMDKernelSelectionMedian.cpp deleted file mode 100644 index fad79dcb3d0..00000000000 --- a/src/shogun/statistics/MMDKernelSelectionMedian.cpp +++ /dev/null @@ -1,237 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2013 Heiko Strathmann - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -using namespace shogun; - -CMMDKernelSelectionMedian::CMMDKernelSelectionMedian() : - CMMDKernelSelection() -{ - init(); -} - -CMMDKernelSelectionMedian::CMMDKernelSelectionMedian( - CKernelTwoSampleTest* mmd, index_t num_data_distance) : - CMMDKernelSelection(mmd) -{ - /* assert that a combined kernel is used */ - CKernel* kernel=mmd->get_kernel(); - CFeatures* lhs=kernel->get_lhs(); - CFeatures* rhs=kernel->get_rhs(); - REQUIRE(kernel, "%s::%s(): No kernel set!\n", get_name(), get_name()); - REQUIRE(kernel->get_kernel_type()==K_COMBINED, "%s::%s(): Requires " - "CombinedKernel as kernel. Yours is %s", get_name(), get_name(), - kernel->get_name()); - - /* assert that all subkernels are Gaussian kernels */ - CCombinedKernel* combined=(CCombinedKernel*)kernel; - - for (index_t k_idx=0; k_idxget_num_kernels(); k_idx++) - { - CKernel* subkernel=combined->get_kernel(k_idx); - REQUIRE(kernel, "%s::%s(): Subkernel (%d) of current kernel is not" - " of type GaussianKernel\n", get_name(), get_name(), k_idx); - SG_UNREF(subkernel); - } - - /* assert 64 bit dense features since EuclideanDistance can only handle - * those */ - if (m_estimator->get_statistic_type()==S_QUADRATIC_TIME_MMD) - { - CFeatures* features=((CQuadraticTimeMMD*)m_estimator)->get_p_and_q(); - REQUIRE(features->get_feature_class()==C_DENSE && - features->get_feature_type()==F_DREAL, "%s::select_kernel(): " - "Only 64 bit float dense features allowed, these are \"%s\"" - " and of type %d\n", - get_name(), features->get_name(), features->get_feature_type()); - SG_UNREF(features); - } - else if (m_estimator->get_statistic_type()==S_LINEAR_TIME_MMD) - { - CStreamingFeatures* p=((CLinearTimeMMD*)m_estimator)->get_streaming_p(); - CStreamingFeatures* q=((CLinearTimeMMD*)m_estimator)->get_streaming_q(); - REQUIRE(p->get_feature_class()==C_STREAMING_DENSE && - p->get_feature_type()==F_DREAL, "%s::select_kernel(): " - "Only 64 bit float streaming dense features allowed, these (p) " - "are \"%s\" and of type %d\n", - get_name(), p->get_name(), p->get_feature_type()); - - REQUIRE(p->get_feature_class()==C_STREAMING_DENSE && - p->get_feature_type()==F_DREAL, "%s::select_kernel(): " - "Only 64 bit float streaming dense features allowed, these (q) " - "are \"%s\" and of type %d\n", - get_name(), q->get_name(), q->get_feature_type()); - SG_UNREF(p); - SG_UNREF(q); - } - - SG_UNREF(kernel); - SG_UNREF(lhs); - SG_UNREF(rhs); - - init(); - - m_num_data_distance=num_data_distance; -} - -CMMDKernelSelectionMedian::~CMMDKernelSelectionMedian() -{ -} - -void CMMDKernelSelectionMedian::init() -{ - SG_ADD(&m_num_data_distance, "m_num_data_distance", "Number of elements to " - "to compute median distance on", MS_NOT_AVAILABLE); - - /* this is a sensible value */ - m_num_data_distance=1000; -} - -SGVector CMMDKernelSelectionMedian::compute_measures() -{ - SG_ERROR("%s::compute_measures(): Not implemented. Use select_kernel() " - "method!\n", get_name()); - return SGVector(); -} - -CKernel* CMMDKernelSelectionMedian::select_kernel() -{ - /* number of data for distace */ - index_t num_data=CMath::min(m_num_data_distance, m_estimator->get_m()); - - SGMatrix dists; - - /* compute all pairwise distances, depends which mmd statistic is used */ - if (m_estimator->get_statistic_type()==S_QUADRATIC_TIME_MMD) - { - /* fixed data, create merged copy of a random subset */ - - /* create vector with that correspond to the num_data first points of - * each distribution, remember data is stored jointly */ - SGVector subset(num_data*2); - index_t m=m_estimator->get_m(); - for (index_t i=0; iget_p_and_q(); - features->add_subset(subset); - - /* cast is safe, see constructor */ - CDenseFeatures* dense_features= - (CDenseFeatures*) features; - - CEuclideanDistance* distance=new CEuclideanDistance(dense_features, - dense_features); - dists=distance->get_distance_matrix(); - features->remove_subset(); - SG_UNREF(distance); - SG_UNREF(features); - } - else if (m_estimator->get_statistic_type()==S_LINEAR_TIME_MMD) - { - /* just stream the desired number of points */ - CLinearTimeMMD* linear_mmd=(CLinearTimeMMD*)m_estimator; - - CStreamingFeatures* p=linear_mmd->get_streaming_p(); - CStreamingFeatures* q=linear_mmd->get_streaming_q(); - - /* cast is safe, see constructor */ - CDenseFeatures* p_streamed=(CDenseFeatures*) - p->get_streamed_features(num_data); - CDenseFeatures* q_streamed=(CDenseFeatures*) - q->get_streamed_features(num_data); - - /* for safety */ - SG_REF(p_streamed); - SG_REF(q_streamed); - - /* create merged feature object */ - CDenseFeatures* merged=(CDenseFeatures*) - p_streamed->create_merged_copy(q_streamed); - - /* compute pairwise distances */ - CEuclideanDistance* distance=new CEuclideanDistance(merged, merged); - dists=distance->get_distance_matrix(); - - /* clean up */ - SG_UNREF(distance); - SG_UNREF(p_streamed); - SG_UNREF(q_streamed); - SG_UNREF(p); - SG_UNREF(q); - } - - /* create a vector where the zeros have been removed, use upper triangle - * only since distances are symmetric */ - SGVector dist_vec(dists.num_rows*(dists.num_rows-1)/2); - index_t write_idx=0; - for (index_t i=0; i(dist_vec.vector, dist_vec.vlen); - float64_t median_distance=dist_vec[dist_vec.vlen/2]; - SG_DEBUG("median_distance: %f\n", median_distance); - - /* shogun has no square and factor two in its kernel width, MATLAB does - * median_width = sqrt(0.5*median_distance), we do this */ - float64_t shogun_sigma=median_distance; - SG_DEBUG("kernel width (shogun): %f\n", shogun_sigma); - - /* now of all kernels, find the one which has its width closest - * Cast is safe due to constructor of MMDKernelSelection class */ - CCombinedKernel* combined=(CCombinedKernel*)m_estimator->get_kernel(); - float64_t min_distance=CMath::MAX_REAL_NUMBER; - CKernel* min_kernel=NULL; - float64_t distance; - for (index_t i=0; iget_num_subkernels(); ++i) - { - CKernel* current=combined->get_kernel(i); - REQUIRE(current->get_kernel_type()==K_GAUSSIAN, "%s::select_kernel(): " - "%d-th kernel is not a Gaussian but \"%s\"!\n", get_name(), i, - current->get_name()); - - /* check if width is closer to median width */ - distance=CMath::abs(((CGaussianKernel*)current)->get_width()- - shogun_sigma); - - if (distance - -#include - -namespace shogun -{ - -/** @brief Implements MMD kernel selection for a number of Gaussian baseline - * kernels via selecting the one with a bandwidth parameter that is closest to - * the median of all pairwise distances in the underlying data. Therefore, it - * only works for data to which a GaussianKernel can be applied, which are - * grouped under the class CDotFeatures in SHOGUN. - * - * This method works reasonable if distinguishing characteristics of data are not - * hidden at a different length-scale that the overall one. In addition it is - * fast to compute. In other cases, it is a bad choice. - * - * Optimal selection of single kernels can be found in the class - * CMMDKernelSelectionOpt - * - * Described among oher places in - * Gretton, A., Borgwardt, K. M., Rasch, M. J., Schoelkopf, B., & Smola, A. - * (2012). - * A Kernel Two-Sample Test. Journal of Machine Learning Research, 13, 671-721. - */ -class CMMDKernelSelectionMedian: public CMMDKernelSelection -{ -public: - - /** Default constructor */ - CMMDKernelSelectionMedian(); - - /** Constructor that initialises the underlying MMD instance - * - * @param mmd MMD instance to use. Has to be an MMD based kernel two-sample - * test. - * @param num_data_distance Number of points that is used to compute the - * median distance on. Since the median is stable, this do need need to be - * all data, but a small subset is sufficient. - */ - CMMDKernelSelectionMedian(CKernelTwoSampleTest* mmd, - index_t num_data_distance=1000); - - /** Destructor */ - virtual ~CMMDKernelSelectionMedian(); - - /** @return Throws an error and shoold not be used */ - virtual SGVector compute_measures(); - - /** Returns the baseline kernel whose bandwidth parameter is closest to the - * median of the pairwise distances of the underlyinf data - * - * @return selected kernel (SG_REF'ed) - */ - virtual CKernel* select_kernel(); - - /** @return name of the SGSerializable */ - virtual const char* get_name() const { return "MMDKernelSelectionMedian"; } - -private: - /* initialises and registers member variables */ - void init(); - -protected: - /** maximum number of data to be used for median distance computation */ - index_t m_num_data_distance; -}; - -} - -#endif /* __MMDKERNELSELECTIONMEDIAN_H_ */ diff --git a/src/shogun/statistics/MMDKernelSelectionOpt.cpp b/src/shogun/statistics/MMDKernelSelectionOpt.cpp deleted file mode 100644 index b03dfaee18f..00000000000 --- a/src/shogun/statistics/MMDKernelSelectionOpt.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * Written (W) 2012-2013 Heiko Strathmann - */ - -#include -#include -#include - -using namespace shogun; - -CMMDKernelSelectionOpt::CMMDKernelSelectionOpt() : - CMMDKernelSelection() -{ - init(); -} - -CMMDKernelSelectionOpt::CMMDKernelSelectionOpt( - CKernelTwoSampleTest* mmd, float64_t lambda) : - CMMDKernelSelection(mmd) -{ - init(); - - /* currently, this method is only developed for the linear time MMD */ - REQUIRE(dynamic_cast(mmd), "%s::%s(): Only " - "CLinearTimeMMD is currently supported! Provided instance is " - "\"%s\"\n", get_name(), get_name(), mmd->get_name()); - - m_lambda=lambda; -} - -CMMDKernelSelectionOpt::~CMMDKernelSelectionOpt() -{ -} - -SGVector CMMDKernelSelectionOpt::compute_measures() -{ - /* comnpute mmd on all subkernels using the same data. Note that underlying - * kernel was asserted to be a combined one */ - SGVector mmds; - SGVector vars; - ((CLinearTimeMMD*)m_estimator)->compute_statistic_and_variance(mmds, vars, true); - - /* we know that the underlying MMD is linear time version, cast is safe */ - SGVector measures(mmds.vlen); - - for (index_t i=0; i - -#include - -namespace shogun -{ - -class CLinearTimeMMD; - -/** @brief Implements optimal kernel selection for single kernels. - * Given a number of baseline kernels, this method selects the one that - * minimizes the type II error for a given type I error for a two-sample test. - * This only works for the CLinearTimeMMD statistic. - * - * The idea is to maximise the ratio of MMD and its standard deviation. - * - * IMPORTANT: The kernel has to be selected on different data than the two-sample - * test is performed on. - * - * Described in - * Gretton, A., Sriperumbudur, B., Sejdinovic, D., Strathmann, H., - * Balakrishnan, S., Pontil, M., & Fukumizu, K. (2012). - * Optimal kernel choice for large-scale two-sample tests. - * Advances in Neural Information Processing Systems. - */ -class CMMDKernelSelectionOpt: public CMMDKernelSelection -{ -public: - - /** Default constructor */ - CMMDKernelSelectionOpt(); - - /** Constructor that initialises the underlying MMD instance. Currently, - * only the linear time MMD is supported - * - * @param mmd MMD instance to use - * @param lambda ridge that is added to standard deviation in order to - * prevent division by zero. A sensivle value is for example 1E-5. - */ - CMMDKernelSelectionOpt(CKernelTwoSampleTest* mmd, - float64_t lambda=10E-5); - - /** Destructor */ - virtual ~CMMDKernelSelectionOpt(); - - /** Overwrites superclass method and ensures that all statistics are - * computed on the same data. Since linear time MMD is a streaming - * statistic, just computing all statistics one after another would use - * different data. This method makes sure that all kernels are used at once - * - * @return vector with kernel criterion values for all attached kernels - */ - virtual SGVector compute_measures(); - - /** @return name of the SGSerializable */ - virtual const char* get_name() const { return "MMDKernelSelectionOpt"; } - -private: - /** Initializer */ - void init(); - -protected: - /** Ridge that is added to the denumerator of the ratio of MMD and its - * standard deviation */ - float64_t m_lambda; -}; - -} - -#endif /* __MMDKERNELSELECTIONOPTSINGLE_H_ */ diff --git a/src/shogun/statistics/NOCCO.cpp b/src/shogun/statistics/NOCCO.cpp deleted file mode 100644 index e2dc9ad4a4c..00000000000 --- a/src/shogun/statistics/NOCCO.cpp +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2014 Soumyajit De - * Written (w) 2012-2013 Heiko Strathmann - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#include - -#include -#include -#include -#include -#include - -using namespace shogun; -using namespace Eigen; - -CNOCCO::CNOCCO() : CKernelIndependenceTest() -{ - init(); -} - -CNOCCO::CNOCCO(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p, CFeatures* q) - : CKernelIndependenceTest(kernel_p, kernel_q, p, q) -{ - init(); - - // only equal number of samples are allowed - if (p && q) - { - REQUIRE(p->get_num_vectors()==q->get_num_vectors(), - "Only equal number of samples from both the distributions are " - "possible. Provided %d samples from p and %d samples from q!\n", - p->get_num_vectors(), q->get_num_vectors()); - - m_num_features=p->get_num_vectors(); - } -} - -CNOCCO::~CNOCCO() -{ -} - -void CNOCCO::init() -{ - SG_ADD(&m_num_features, "num_features", - "Number of features from each of the distributions", - MS_NOT_AVAILABLE); - SG_ADD(&m_epsilon, "epsilon", "The regularization constant", - MS_NOT_AVAILABLE); - - m_num_features=0; - m_epsilon=0.0; - - // we need PERMUTATION as the null approximation method here - m_null_approximation_method=PERMUTATION; -} - -void CNOCCO::set_p(CFeatures* p) -{ - CIndependenceTest::set_p(p); - REQUIRE(m_p, "Provided feature for p cannot be null!\n"); - m_num_features=m_p->get_num_vectors(); -} - -void CNOCCO::set_q(CFeatures* q) -{ - CIndependenceTest::set_q(q); - REQUIRE(m_q, "Provided feature for q cannot be null!\n"); - m_num_features=m_q->get_num_vectors(); -} - -void CNOCCO::set_epsilon(float64_t epsilon) -{ - m_epsilon=epsilon; -} - -float64_t CNOCCO::get_epsilon() const -{ - return m_epsilon; -} - -SGMatrix CNOCCO::compute_helper(SGMatrix m) -{ - SG_DEBUG("Entering!\n"); - - const index_t n=m_num_features; - Map mat(m.matrix, n, n); - - // the result matrix res = m * inv(m + n*epsilon*eye(n,n)) - SGMatrix res(n, n); - MatrixXd to_inv=mat+n*m_epsilon*MatrixXd::Identity(n,n); - - // since the matrix is SPD, instead of directly computing the inverse, - // we compute the Cholesky decomposition and solve systems (see class - // documentation for details) - LLT chol(to_inv); - - // compute the matrix times inverse by solving systems - VectorXd e=VectorXd::Zero(n); - for (index_t i=0; i Gx=get_kernel_matrix_K(); - SGMatrix Gy=get_kernel_matrix_L(); - - // center the kernel matrices - Gx.center(); - Gy.center(); - - SGMatrix Rx=compute_helper(Gx); - SGMatrix Ry=compute_helper(Gy); - - Map Rx_map(Rx.matrix, Rx.num_rows, Rx.num_cols); - Map Ry_map(Ry.matrix, Ry.num_rows, Ry.num_cols); - - // compute the trace of the matrix multiplication without computing the - // off-diagonal entries of the final matrix and just the diagonal entries - float64_t result=0.0; - for (index_t i=0; i CNOCCO::sample_null() -{ - SG_DEBUG("Entering!\n") - - /* replace current kernel via precomputed custom kernel and call superclass - * method */ - - /* backup references to old kernels */ - CKernel* kernel_p=m_kernel_p; - CKernel* kernel_q=m_kernel_q; - - /* init kernels before to be sure that everything is fine - * kernel function between two samples from different distributions - * is never computed - in fact, they may as well have different features */ - m_kernel_p->init(m_p, m_p); - m_kernel_q->init(m_q, m_q); - - /* precompute kernel matrices */ - CCustomKernel* precomputed_p=new CCustomKernel(m_kernel_p); - CCustomKernel* precomputed_q=new CCustomKernel(m_kernel_q); - SG_REF(precomputed_p); - SG_REF(precomputed_q); - - /* temporarily replace own kernels */ - m_kernel_p=precomputed_p; - m_kernel_q=precomputed_q; - - /* use superclass sample_null which shuffles the entries for one - * distribution using index permutation on rows and columns of - * kernel matrix from one distribution, while accessing the other - * in its original order and then compute statistic */ - SGVector null_samples=CKernelIndependenceTest::sample_null(); - - /* restore kernels */ - m_kernel_p=kernel_p; - m_kernel_q=kernel_q; - - SG_UNREF(precomputed_p); - SG_UNREF(precomputed_q); - - SG_DEBUG("Leaving!\n") - return null_samples; -} diff --git a/src/shogun/statistics/NOCCO.h b/src/shogun/statistics/NOCCO.h deleted file mode 100644 index 7989a95ea55..00000000000 --- a/src/shogun/statistics/NOCCO.h +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2014 Soumyajit De - * Written (w) 2012-2013 Heiko Strathmann - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#ifndef NOCCO_H_ -#define NOCCO_H_ - -#include - -#include - -namespace shogun -{ - -template class SGMatrix; - -/** @brief This class implements the NOrmalized Cross Covariance Operator - * (NOCCO) based independence test as described in [1]. - * - * The test of independence is performed as follows: Given samples \f$Z=\{(x_i, - * y_i)\}_{i=1}^n\f$ from the joint distribution \f$\textbf{P}_{XY}\f$, - * does the joint distribution factorize as \f$\textbf{P}_{XY}=\textbf{P}_X - * \textbf{P}_Y\f$? The null hypothesis says yes and the alternative hypothesis - * says no. - * - * The dependence of the random variables \f$\mathbf X=\{x_i\}\f$ and \f$ - * \mathbf Y=\{y_i\}\f$ can be measured via the cross-covariance operator - * \f$\boldsymbol\Sigma_{YX}\f$ which becomes \f$\mathbf{0}\f$ if and only if - * \f$\mathbf X\f$ and \f$\mathbf Y\f$ are independent. This term factorizes as - * \f$\boldsymbol\Sigma_{YX}=\boldsymbol\Sigma_{YY}^{\frac{1}{2}}\mathbf{V}_ - * {YX}\boldsymbol\Sigma_{XX}^{\frac{1}{2}}\f$, where \f$\boldsymbol\Sigma_ - * {XX}\f$ and \f$\boldsymbol\Sigma_{YY}\f$ are known as covariance operator and - * \f$\mathbf{V}_{YX}\f$ is known as normalized cross-covariance operator. The - * paper uses the Hilbert-Schmidt norm of \f$\mathbf V_{YX}\f$ as a dependence - * measure of the independence test (see paper for theroretical details). - * - * This class overrides the compute_statistic() method of the superclass which - * computes an unbiased estimate of the normalized cross covariance operator - * norm. Given the kernels \f$K\f$ (for \f$\mathbf X\f$) and \f$L\f$ (for - * \f$\mathbf Y\f$), if we denote the doubly centered Gram matrices as - * \f$\mathbf{G}_X=\mathbf{HKH}\f$ and \f$\mathbf{G}_Y=\mathbf{HLH}\f$ - * (where \f$\mathbf H=\mathbf I-\frac{1}{n}\mathbf{1}\f$), then the operator - * norm is estimated as - * \f[ - * \hat{I}^{\text{NOCCO}}=\text{Trace}\left[\mathbf{R_X R_Y}\right] - * \f] - * where \f$\mathbf{R}_X=\mathbf{G}_X(\mathbf{G}_X+n\varepsilon_n\mathbf{I}) - * ^{-1}\f$ and \f$\mathbf{R}_Y=\mathbf{G}_Y(\mathbf{G}_Y+n\varepsilon_n - * \mathbf{I})^{-1}\f$ and \f$\varepsilon_n\gt 0\f$ is a regularization - * constant. - * - * In order to avoid computing direct inverse in the above terms for avoiding - * numerical issues, this class uses Cholesky decomposition of matrices - * \f$\mathbf{GG}_*=\mathbf{LL}^\top\f$ (where \f$\mathbf{GG}_*=(\mathbf{G}_*+ - * n\varepsilon_n\mathbf{I})^{-1}\f$) and solve systems \f$\mathbf{GG}_* - * \mathbf x_i=\mathbf{LL}^\top\mathbf x_i=\mathbf e_i\f$ (\f$\mathbf e_i\f$ - * being the \f$i^{\text{th}}\f$ column of \f$\mathbf I_n\f$) one by one. On - * the fly it then uses the solution vectors \f$\mathbf x_i\f$ to compute the - * matrix-matrix product \f$\mathbf C_*=\mathbf G_*\mathbf{GG}_*^{-1}\f$ - * using \f$\mathbf C_{*,(j,i)}=\mathbf G_{*,j}\cdot \mathbf x_i\f$, where - * \f$\mathbf G_{*,j}\f$ is the \f$j^{\text{th}}\f$ row of \f$\mathbf G_*\f$ (or - * column, since it is symmetric) and then discarding the vector. - * - * The final trace computation is also simplified using the symmetry of the - * matrices \f$\mathbf R_X\f$ and \f$\mathbf R_Y\f$. Computation of the off- - * diagonal elements are avoided using - * \f[ - * \text{Trace}\left[\mathbf R_X \mathbf R_Y\right ]=\sum_{i=1}^n - * \mathbf R_X^i\cdot \mathbf R_Y^i - * \f] - * - * For performing the independence test, PERMUTATION test is used by first - * randomly shuffling the samples from one of the distributions while keeping - * the samples from the other distribution in the original order. This way we - * sample the null distribution and compute p-value and threshold for a given - * test power. - * - * [1]: Kenji Fukumizu, Arthur Gretton, Xiaohai Sun, Bernhard Scholkopf: - * Kernel Measures of Conditional Dependence. NIPS 2007 - */ -class CNOCCO : public CKernelIndependenceTest -{ -public: - /** Constructor */ - CNOCCO(); - - /** Constructor. - * - * Initializes the kernels and features from the two distributions and - * SG_REFs them - * - * @param kernel_p kernel to use on samples from p - * @param kernel_q kernel to use on samples from q - * @param p samples from distribution p - * @param q samples from distribution q - */ - CNOCCO(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p, CFeatures* q); - - /** Destructor */ - virtual ~CNOCCO(); - - /** Computes the NOCCO statistic (see class description) for underlying - * kernels and data. - * - * Note that since kernel matrices have to be stored, it has quadratic - * space costs. - * - * @return unbiased estimate of NOCCO - */ - virtual float64_t compute_statistic(); - - /** Computes a p-value based on current method for approximating the - * null-distribution. The p-value is the 1-p quantile of the null- - * distribution where the given statistic lies in. - * - * @param statistic statistic value to compute the p-value for - * @return p-value parameter statistic is the (1-p) percentile of the - * null distribution - */ - virtual float64_t compute_p_value(float64_t statistic); - - /** Computes a threshold based on current method for approximating the - * null-distribution. The threshold is the value that a statistic has - * to have in ordner to reject the null-hypothesis. - * - * @param alpha test level to reject null-hypothesis - * @return threshold for statistics to reject null-hypothesis - */ - virtual float64_t compute_threshold(float64_t alpha); - - /** @return the class name */ - virtual const char* get_name() const - { - return "NOCCO"; - } - - /** @return the statistic type of this test statistic */ - virtual EStatisticType get_statistic_type() const - { - return S_NOCCO; - } - - /** Setter for features from distribution p, SG_REFs it - * - * @param p features from p - */ - virtual void set_p(CFeatures* p); - - /** Setter for features from distribution q, SG_REFs it - * - * @param q features from q - */ - virtual void set_q(CFeatures* q); - - /** - * Setter for regularization parameter epsilon - * @param epsilon the regularization parameter - */ - void set_epsilon(float64_t epsilon); - - /** @return epsilon the regularization parameter */ - float64_t get_epsilon() const; - - /** Merges both sets of samples and computes the test statistic - * m_num_null_sample times. This version precomputes the kenrel matrix - * once by hand, then samples using this one. The matrix has - * to be stored anyway when statistic is computed. - * - * @return vector of all statistics - */ - virtual SGVector sample_null(); - -protected: - /** - * Helper method which computes the matrix times matrix inverse using LLT - * solve (Cholesky) withoout storing the inverse (see class documentation). - * - * @param m the centered Gram matrix - * @return the result matrix of the multiplication - */ - SGMatrix compute_helper(SGMatrix m); - -private: - /** Register parameters and initialize with defaults */ - void init(); - - /** Number of features from the distributions (should be equal for both) */ - index_t m_num_features; - - /** The regularization constant */ - float64_t m_epsilon; - -}; - -} - -#endif // NOCCO_H_ diff --git a/src/shogun/statistics/QuadraticTimeMMD.cpp b/src/shogun/statistics/QuadraticTimeMMD.cpp deleted file mode 100644 index fff724e5e54..00000000000 --- a/src/shogun/statistics/QuadraticTimeMMD.cpp +++ /dev/null @@ -1,1115 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#include -#include -#include -#include -#include -#include -#include - -using namespace shogun; - -#include - -using namespace Eigen; - -CQuadraticTimeMMD::CQuadraticTimeMMD() : CKernelTwoSampleTest() -{ - init(); -} - -CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p_and_q, - index_t m) : - CKernelTwoSampleTest(kernel, p_and_q, m) -{ - init(); -} - -CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p, - CFeatures* q) : CKernelTwoSampleTest(kernel, p, q) -{ - init(); -} - -CQuadraticTimeMMD::CQuadraticTimeMMD(CCustomKernel* custom_kernel, index_t m) : - CKernelTwoSampleTest(custom_kernel, NULL, m) -{ - init(); -} - -CQuadraticTimeMMD::~CQuadraticTimeMMD() -{ -} - -void CQuadraticTimeMMD::init() -{ - SG_ADD(&m_num_samples_spectrum, "num_samples_spectrum", "Number of samples" - " for spectrum method null-distribution approximation", - MS_NOT_AVAILABLE); - SG_ADD(&m_num_eigenvalues_spectrum, "num_eigenvalues_spectrum", "Number of " - " Eigenvalues for spectrum method null-distribution approximation", - MS_NOT_AVAILABLE); - SG_ADD((machine_int_t*)&m_statistic_type, "statistic_type", - "Biased or unbiased MMD", MS_NOT_AVAILABLE); - - m_num_samples_spectrum=0; - m_num_eigenvalues_spectrum=0; - m_statistic_type=UNBIASED; -} - -SGVector CQuadraticTimeMMD::compute_unbiased_statistic_variance( - int m, int n) -{ - SG_DEBUG("Entering!\n"); - - /* init kernel with features. NULL check is handled in compute_statistic */ - m_kernel->init(m_p_and_q, m_p_and_q); - - /* computing kernel values and their sums on the fly that are used both in - computing statistic and variance */ - - /* the following matrix stores row-wise sum of kernel values k(X,X') in - the first column and row-wise squared sum of kernel values k^2(X,X') - in the second column. m entries in both column */ - SGMatrix xx_sum_sq_sum_rowwise=m_kernel-> - row_wise_sum_squared_sum_symmetric_block(0, m); - - /* row-wise sum of kernel values k(Y,Y'), n entries */ - SGVector yy_sum_rowwise=m_kernel-> - row_wise_sum_symmetric_block(m, n); - - /* row-wise and col-wise sum of kernel values k(X,Y), m+n entries - first m entries are row-wise sum, rest n entries are col-wise sum */ - SGVector xy_sum_rowcolwise=m_kernel-> - row_col_wise_sum_block(0, m, m, n); - - /* computing overall sum and squared sum from above for convenience */ - - SGVector xx_sum_rowwise(m); - std::copy(xx_sum_sq_sum_rowwise.matrix, xx_sum_sq_sum_rowwise.matrix+m, - xx_sum_rowwise.vector); - - SGVector xy_sum_rowwise(m); - std::copy(xy_sum_rowcolwise.vector, xy_sum_rowcolwise.vector+m, - xy_sum_rowwise.vector); - - SGVector xy_sum_colwise(n); - std::copy(xy_sum_rowcolwise.vector+m, xy_sum_rowcolwise.vector+m+n, - xy_sum_colwise.vector); - - float64_t xx_sq_sum=0.0; - for (index_t i=0; i results(3); - results[0]=statistic; - results[1]=var_null; - results[2]=var_alt; - - SG_DEBUG("Leaving!\n"); - - return results; -} - -SGVector CQuadraticTimeMMD::compute_biased_statistic_variance(int m, int n) -{ - SG_DEBUG("Entering!\n"); - - /* init kernel with features. NULL check is handled in compute_statistic */ - m_kernel->init(m_p_and_q, m_p_and_q); - - /* computing kernel values and their sums on the fly that are used both in - computing statistic and variance */ - - /* the following matrix stores row-wise sum of kernel values k(X,X') in - the first column and row-wise squared sum of kernel values k^2(X,X') - in the second column. m entries in both column */ - SGMatrix xx_sum_sq_sum_rowwise=m_kernel-> - row_wise_sum_squared_sum_symmetric_block(0, m, false); - - /* row-wise sum of kernel values k(Y,Y'), n entries */ - SGVector yy_sum_rowwise=m_kernel-> - row_wise_sum_symmetric_block(m, n, false); - - /* row-wise and col-wise sum of kernel values k(X,Y), m+n entries - first m entries are row-wise sum, rest n entries are col-wise sum */ - SGVector xy_sum_rowcolwise=m_kernel-> - row_col_wise_sum_block(0, m, m, n); - - /* computing overall sum and squared sum from above for convenience */ - - SGVector xx_sum_rowwise(m); - std::copy(xx_sum_sq_sum_rowwise.matrix, xx_sum_sq_sum_rowwise.matrix+m, - xx_sum_rowwise.vector); - - SGVector xy_sum_rowwise(m); - std::copy(xy_sum_rowcolwise.vector, xy_sum_rowcolwise.vector+m, - xy_sum_rowwise.vector); - - SGVector xy_sum_colwise(n); - std::copy(xy_sum_rowcolwise.vector+m, xy_sum_rowcolwise.vector+m+n, - xy_sum_colwise.vector); - - float64_t xx_sq_sum=0.0; - for (index_t i=0; i results(3); - results[0]=statistic; - results[1]=var_null; - results[2]=var_alt; - - SG_DEBUG("Leaving!\n"); - - return results; -} - -SGVector CQuadraticTimeMMD::compute_incomplete_statistic_variance(int n) -{ - SG_DEBUG("Entering!\n"); - - /* init kernel with features. NULL check is handled in compute_statistic */ - m_kernel->init(m_p_and_q, m_p_and_q); - - /* computing kernel values and their sums on the fly that are used both in - computing statistic and variance */ - - /* the following matrix stores row-wise sum of kernel values k(X,X') in - the first column and row-wise squared sum of kernel values k^2(X,X') - in the second column. n entries in both column */ - SGMatrix xx_sum_sq_sum_rowwise=m_kernel-> - row_wise_sum_squared_sum_symmetric_block(0, n); - - /* row-wise sum of kernel values k(Y,Y'), n entries */ - SGVector yy_sum_rowwise=m_kernel-> - row_wise_sum_symmetric_block(n, n); - - /* row-wise and col-wise sum of kernel values k(X,Y), 2n entries - first n entries are row-wise sum, rest n entries are col-wise sum */ - SGVector xy_sum_rowcolwise=m_kernel-> - row_col_wise_sum_block(0, n, n, n, true); - - /* computing overall sum and squared sum from above for convenience */ - - SGVector xx_sum_rowwise(n); - std::copy(xx_sum_sq_sum_rowwise.matrix, xx_sum_sq_sum_rowwise.matrix+n, - xx_sum_rowwise.vector); - - SGVector xy_sum_rowwise(n); - std::copy(xy_sum_rowcolwise.vector, xy_sum_rowcolwise.vector+n, - xy_sum_rowwise.vector); - - SGVector xy_sum_colwise(n); - std::copy(xy_sum_rowcolwise.vector+n, xy_sum_rowcolwise.vector+2*n, - xy_sum_colwise.vector); - - float64_t xx_sq_sum=0.0; - for (index_t i=0; i results(3); - results[0]=statistic; - results[1]=var_null; - results[2]=var_alt; - - SG_DEBUG("Leaving!\n"); - - return results; -} - -float64_t CQuadraticTimeMMD::compute_unbiased_statistic(int m, int n) -{ - return compute_unbiased_statistic_variance(m, n)[0]; -} - -float64_t CQuadraticTimeMMD::compute_biased_statistic(int m, int n) -{ - return compute_biased_statistic_variance(m, n)[0]; -} - -float64_t CQuadraticTimeMMD::compute_incomplete_statistic(int n) -{ - return compute_incomplete_statistic_variance(n)[0]; -} - -float64_t CQuadraticTimeMMD::compute_statistic() -{ - REQUIRE(m_kernel, "No kernel specified!\n") - - index_t m=m_m; - index_t n=0; - - /* check if kernel is precomputed (custom kernel) */ - if (m_kernel->get_kernel_type()==K_CUSTOM) - n=m_kernel->get_num_vec_lhs()-m; - else - { - REQUIRE(m_p_and_q, "The samples are not initialized!\n"); - n=m_p_and_q->get_num_vectors()-m; - } - - SG_DEBUG("Computing MMD with %d samples from p and %d samples from q!\n", - m, n); - - float64_t result=0; - switch (m_statistic_type) - { - case UNBIASED: - result=compute_unbiased_statistic(m, n); - result*=m*n/float64_t(m+n); - break; - case UNBIASED_DEPRECATED: - result=compute_unbiased_statistic(m, n); - result*=m==n ? m : (m+n); - break; - case BIASED: - result=compute_biased_statistic(m, n); - result*=m*n/float64_t(m+n); - break; - case BIASED_DEPRECATED: - result=compute_biased_statistic(m, n); - result*=m==n? m : (m+n); - break; - case INCOMPLETE: - REQUIRE(m==n, "Only possible with equal number of samples from both" - "distribution!\n") - result=compute_incomplete_statistic(n); - result*=n/2; - break; - default: - SG_ERROR("Unknown statistic type!\n"); - break; - } - - return result; -} - -SGVector CQuadraticTimeMMD::compute_variance() -{ - REQUIRE(m_kernel, "No kernel specified!\n") - - index_t m=m_m; - index_t n=0; - - /* check if kernel is precomputed (custom kernel) */ - if (m_kernel->get_kernel_type()==K_CUSTOM) - n=m_kernel->get_num_vec_lhs()-m; - else - { - REQUIRE(m_p_and_q, "The samples are not initialized!\n"); - n=m_p_and_q->get_num_vectors()-m; - } - - SG_DEBUG("Computing MMD with %d samples from p and %d samples from q!\n", - m, n); - - SGVector result(2); - switch (m_statistic_type) - { - case UNBIASED: - case UNBIASED_DEPRECATED: - { - SGVector res=compute_unbiased_statistic_variance(m, n); - result[0]=res[1]; - result[1]=res[2]; - break; - } - case BIASED: - case BIASED_DEPRECATED: - { - SGVector res=compute_biased_statistic_variance(m, n); - result[0]=res[1]; - result[1]=res[2]; - break; - } - case INCOMPLETE: - { - REQUIRE(m==n, "Only possible with equal number of samples from both" - "distribution!\n") - SGVector res=compute_incomplete_statistic_variance(n); - result[0]=res[1]; - result[1]=res[2]; - break; - } - default: - SG_ERROR("Unknown statistic type!\n"); - break; - } - - return result; -} - -float64_t CQuadraticTimeMMD::compute_variance_under_null() -{ - return compute_variance()[0]; -} - -float64_t CQuadraticTimeMMD::compute_variance_under_alternative() -{ - return compute_variance()[1]; -} - -SGVector CQuadraticTimeMMD::compute_statistic(bool multiple_kernels) -{ - SGVector mmds; - if (!multiple_kernels) - { - /* just one mmd result */ - mmds=SGVector(1); - mmds[0]=compute_statistic(); - } - else - { - REQUIRE(m_kernel, "No kernel specified!\n") - REQUIRE(m_kernel->get_kernel_type()==K_COMBINED, - "multiple kernels specified, but underlying kernel is not of type " - "K_COMBINED\n"); - - /* cast and allocate memory for results */ - CCombinedKernel* combined=(CCombinedKernel*)m_kernel; - SG_REF(combined); - mmds=SGVector(combined->get_num_subkernels()); - - /* iterate through all kernels and compute statistic */ - /* TODO this might be done in parallel */ - for (index_t i=0; iget_kernel(i); - /* temporarily replace underlying kernel and compute statistic */ - m_kernel=current; - mmds[i]=compute_statistic(); - - SG_UNREF(current); - } - - /* restore combined kernel */ - m_kernel=combined; - SG_UNREF(combined); - } - - return mmds; -} - -SGMatrix CQuadraticTimeMMD::compute_variance(bool multiple_kernels) -{ - SGMatrix vars; - if (!multiple_kernels) - { - /* just one mmd result */ - vars=SGMatrix(1, 2); - SGVector result=compute_variance(); - vars(0, 0)=result[0]; - vars(0, 1)=result[1]; - } - else - { - REQUIRE(m_kernel, "No kernel specified!\n") - REQUIRE(m_kernel->get_kernel_type()==K_COMBINED, - "multiple kernels specified, but underlying kernel is not of type " - "K_COMBINED\n"); - - /* cast and allocate memory for results */ - CCombinedKernel* combined=(CCombinedKernel*)m_kernel; - SG_REF(combined); - vars=SGMatrix(combined->get_num_subkernels(), 2); - - /* iterate through all kernels and compute variance */ - /* TODO this might be done in parallel */ - for (index_t i=0; iget_kernel(i); - /* temporarily replace underlying kernel and compute variance */ - m_kernel=current; - SGVector result=compute_variance(); - vars(i, 0)=result[0]; - vars(i, 1)=result[1]; - - SG_UNREF(current); - } - - /* restore combined kernel */ - m_kernel=combined; - SG_UNREF(combined); - } - - return vars; -} - -float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic) -{ - float64_t result=0; - - switch (m_null_approximation_method) - { - case MMD2_SPECTRUM: - { - /* get samples from null-distribution and compute p-value of statistic */ - SGVector null_samples=sample_null_spectrum( - m_num_samples_spectrum, m_num_eigenvalues_spectrum); - CMath::qsort(null_samples); - index_t pos=null_samples.find_position_to_insert(statistic); - result=1.0-((float64_t)pos)/null_samples.vlen; - break; - } - - case MMD2_SPECTRUM_DEPRECATED: - { - /* get samples from null-distribution and compute p-value of statistic */ - SGVector null_samples=sample_null_spectrum_DEPRECATED( - m_num_samples_spectrum, m_num_eigenvalues_spectrum); - CMath::qsort(null_samples); - index_t pos=null_samples.find_position_to_insert(statistic); - result=1.0-((float64_t)pos)/null_samples.vlen; - break; - } - - case MMD2_GAMMA: - { - /* fit gamma and return cdf at statistic */ - SGVector params=fit_null_gamma(); - result=CStatistics::gamma_cdf(statistic, params[0], params[1]); - break; - } - - default: - result=CKernelTwoSampleTest::compute_p_value(statistic); - break; - } - - return result; -} - -float64_t CQuadraticTimeMMD::compute_threshold(float64_t alpha) -{ - float64_t result=0; - - switch (m_null_approximation_method) - { - case MMD2_SPECTRUM: - { - /* get samples from null-distribution and compute threshold */ - SGVector null_samples=sample_null_spectrum( - m_num_samples_spectrum, m_num_eigenvalues_spectrum); - CMath::qsort(null_samples); - result=null_samples[index_t(CMath::floor(null_samples.vlen*(1-alpha)))]; - break; - } - - case MMD2_SPECTRUM_DEPRECATED: - { - /* get samples from null-distribution and compute threshold */ - SGVector null_samples=sample_null_spectrum_DEPRECATED( - m_num_samples_spectrum, m_num_eigenvalues_spectrum); - CMath::qsort(null_samples); - result=null_samples[index_t(CMath::floor(null_samples.vlen*(1-alpha)))]; - break; - } - - case MMD2_GAMMA: - { - /* fit gamma and return inverse cdf at alpha */ - SGVector params=fit_null_gamma(); - result=CStatistics::gamma_inverse_cdf(alpha, params[0], params[1]); - break; - } - - default: - /* sampling null is handled here */ - result=CKernelTwoSampleTest::compute_threshold(alpha); - break; - } - - return result; -} - - -SGVector CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples, - index_t num_eigenvalues) -{ - REQUIRE(m_kernel, "(%d, %d): No kernel set!\n", num_samples, - num_eigenvalues); - REQUIRE(m_kernel->get_kernel_type()==K_CUSTOM || m_p_and_q, - "(%d, %d): No features set and no custom kernel in use!\n", - num_samples, num_eigenvalues); - - index_t m=m_m; - index_t n=0; - - /* check if kernel is precomputed (custom kernel) */ - if (m_kernel && m_kernel->get_kernel_type()==K_CUSTOM) - n=m_kernel->get_num_vec_lhs()-m; - else - { - REQUIRE(m_p_and_q, "The samples are not initialized!\n"); - n=m_p_and_q->get_num_vectors()-m; - } - - if (num_samples<=2) - { - SG_ERROR("Number of samples has to be at least 2, " - "better in the hundreds"); - } - - if (num_eigenvalues>m+n-1) - SG_ERROR("Number of Eigenvalues too large\n"); - - if (num_eigenvalues<1) - SG_ERROR("Number of Eigenvalues too small\n"); - - /* imaginary matrix K=[K KL; KL' L] (MATLAB notation) - * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX - * works since X and Y are concatenated here */ - m_kernel->init(m_p_and_q, m_p_and_q); - SGMatrix K=m_kernel->get_kernel_matrix(); - - /* center matrix K=H*K*H */ - K.center(); - - /* compute eigenvalues and select num_eigenvalues largest ones */ - Map c_kernel_matrix(K.matrix, K.num_rows, K.num_cols); - SelfAdjointEigenSolver eigen_solver(c_kernel_matrix); - REQUIRE(eigen_solver.info()==Eigen::Success, - "Eigendecomposition failed!\n"); - index_t max_num_eigenvalues=eigen_solver.eigenvalues().rows(); - - /* finally, sample from null distribution */ - SGVector null_samples(num_samples); - for (index_t i=0; i CQuadraticTimeMMD::sample_null_spectrum_DEPRECATED( - index_t num_samples, index_t num_eigenvalues) -{ - REQUIRE(m_kernel, "(%d, %d): No kernel set!\n", num_samples, - num_eigenvalues); - REQUIRE(m_kernel->get_kernel_type()==K_CUSTOM || m_p_and_q, - "(%d, %d): No features set and no custom kernel in use!\n", - num_samples, num_eigenvalues); - - index_t m=m_m; - index_t n=0; - - /* check if kernel is precomputed (custom kernel) */ - if (m_kernel && m_kernel->get_kernel_type()==K_CUSTOM) - n=m_kernel->get_num_vec_lhs()-m; - else - { - REQUIRE(m_p_and_q, "The samples are not initialized!\n"); - n=m_p_and_q->get_num_vectors()-m; - } - - if (num_samples<=2) - { - SG_ERROR("Number of samples has to be at least 2, " - "better in the hundreds"); - } - - if (num_eigenvalues>m+n-1) - SG_ERROR("Number of Eigenvalues too large\n"); - - if (num_eigenvalues<1) - SG_ERROR("Number of Eigenvalues too small\n"); - - /* imaginary matrix K=[K KL; KL' L] (MATLAB notation) - * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX - * works since X and Y are concatenated here */ - m_kernel->init(m_p_and_q, m_p_and_q); - SGMatrix K=m_kernel->get_kernel_matrix(); - - /* center matrix K=H*K*H */ - K.center(); - - /* compute eigenvalues and select num_eigenvalues largest ones */ - Map c_kernel_matrix(K.matrix, K.num_rows, K.num_cols); - SelfAdjointEigenSolver eigen_solver(c_kernel_matrix); - REQUIRE(eigen_solver.info()==Eigen::Success, - "Eigendecomposition failed!\n"); - index_t max_num_eigenvalues=eigen_solver.eigenvalues().rows(); - - /* precomputing terms with rho_x and rho_y of equation 10 in [1] - * (see documentation) */ - float64_t rho_x=float64_t(m)/(m+n); - float64_t rho_y=1-rho_x; - - /* instead of using two Gaussian rv's ~ N(0,1), we'll use just one rv - * ~ N(0, 1/rho_x+1/rho_y) (derived from eq 10 in [1]) */ - float64_t std_dev=CMath::sqrt(1/rho_x+1/rho_y); - float64_t inv_rho_x_y=1/(rho_x*rho_y); - - SG_DEBUG("Using Gaussian samples ~ N(0,%f)\n", std_dev*std_dev); - - /* finally, sample from null distribution */ - SGVector null_samples(num_samples); - for (index_t i=0; i CQuadraticTimeMMD::fit_null_gamma() -{ - REQUIRE(m_kernel, "No kernel set!\n"); - REQUIRE(m_kernel->get_kernel_type()==K_CUSTOM || m_p_and_q, - "No features set and no custom kernel in use!\n"); - - index_t n=0; - - /* check if kernel is precomputed (custom kernel) */ - if (m_kernel && m_kernel->get_kernel_type()==K_CUSTOM) - n=m_kernel->get_num_vec_lhs()-m_m; - else - { - REQUIRE(m_p_and_q, "The samples are not initialized!\n"); - n=m_p_and_q->get_num_vectors()-m_m; - } - REQUIRE(m_m==n, "Only possible with equal number of samples " - "from both distribution!\n") - - index_t num_data; - if (m_kernel->get_kernel_type()==K_CUSTOM) - num_data=m_kernel->get_num_vec_rhs(); - else - num_data=m_p_and_q->get_num_vectors(); - - if (m_m!=num_data/2) - SG_ERROR("Currently, only equal sample sizes are supported\n"); - - /* evtl. warn user not to use wrong statistic type */ - if (m_statistic_type!=BIASED_DEPRECATED) - { - SG_WARNING("Note: provided statistic has " - "to be BIASED. Please ensure that! To get rid of warning," - "call %s::set_statistic_type(BIASED_DEPRECATED)\n", get_name()); - } - - /* imaginary matrix K=[K KL; KL' L] (MATLAB notation) - * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX - * works since X and Y are concatenated here */ - m_kernel->init(m_p_and_q, m_p_and_q); - - /* compute mean under H0 of MMD, which is - * meanMMD = 2/m * ( 1 - 1/m*sum(diag(KL)) ); - * in MATLAB. - * Remove diagonals on the fly */ - float64_t mean_mmd=0; - for (index_t i=0; ikernel(i, m_m+i); - } - mean_mmd=2.0/m_m*(1.0-1.0/m_m*mean_mmd); - - /* compute variance under H0 of MMD, which is - * varMMD = 2/m/(m-1) * 1/m/(m-1) * sum(sum( (K + L - KL - KL').^2 )); - * in MATLAB, so sum up all elements */ - float64_t var_mmd=0; - for (index_t i=0; ikernel(i, j); - to_add+=m_kernel->kernel(m_m+i, m_m+j); - to_add-=m_kernel->kernel(i, m_m+j); - to_add-=m_kernel->kernel(m_m+i, j); - var_mmd+=CMath::pow(to_add, 2); - } - } - var_mmd*=2.0/m_m/(m_m-1)*1.0/m_m/(m_m-1); - - /* parameters for gamma distribution */ - float64_t a=CMath::pow(mean_mmd, 2)/var_mmd; - float64_t b=var_mmd*m_m / mean_mmd; - - SGVector result(2); - result[0]=a; - result[1]=b; - - return result; -} - -void CQuadraticTimeMMD::set_num_samples_spectrum(index_t - num_samples_spectrum) -{ - m_num_samples_spectrum=num_samples_spectrum; -} - -void CQuadraticTimeMMD::set_num_eigenvalues_spectrum( - index_t num_eigenvalues_spectrum) -{ - m_num_eigenvalues_spectrum=num_eigenvalues_spectrum; -} - -void CQuadraticTimeMMD::set_statistic_type(EQuadraticMMDType - statistic_type) -{ - m_statistic_type=statistic_type; -} - diff --git a/src/shogun/statistics/QuadraticTimeMMD.h b/src/shogun/statistics/QuadraticTimeMMD.h deleted file mode 100644 index f9981b0cec9..00000000000 --- a/src/shogun/statistics/QuadraticTimeMMD.h +++ /dev/null @@ -1,487 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#ifndef QUADRATIC_TIME_MMD_H_ -#define QUADRATIC_TIME_MMD_H_ - -#include - -#include - -namespace shogun -{ - -class CFeatures; -class CKernel; -class CCustomKernel; - -/** Enum to select which statistic type of quadratic time MMD should be computed */ -enum EQuadraticMMDType -{ - BIASED, - BIASED_DEPRECATED, - UNBIASED, - UNBIASED_DEPRECATED, - INCOMPLETE -}; - -/** @brief This class implements the quadratic time Maximum Mean Statistic as - * described in [1]. - * The MMD is the distance of two probability distributions \f$p\f$ and \f$q\f$ - * in a RKHS which we denote by - * \f[ - * \hat{\eta_k}=\text{MMD}[\mathcal{F},p,q]^2=\textbf{E}_{x,x'} - * \left[ k(x,x')\right]-2\textbf{E}_{x,y}\left[ k(x,y)\right] - * +\textbf{E}_{y,y'}\left[ k(y,y')\right]=||\mu_p - \mu_q||^2_\mathcal{F} - * \f] - * - * Given two sets of samples \f$\{x_i\}_{i=1}^{n_x}\sim p\f$ and - * \f$\{y_i\}_{i=1}^{n_y}\sim q\f$, \f$n_x+n_y=n\f$, - * the unbiased estimate of the above statistic is computed as - * \f[ - * \hat{\eta}_{k,U}=\frac{1}{n_x(n_x-1)}\sum_{i=1}^{n_x}\sum_{j\neq i} - * k(x_i,x_j)+\frac{1}{n_y(n_y-1)}\sum_{i=1}^{n_y}\sum_{j\neq i}k(y_i,y_j) - * -\frac{2}{n_xn_y}\sum_{i=1}^{n_x}\sum_{j=1}^{n_y}k(x_i,y_j) - * \f] - * - * A biased version is - * \f[ - * \hat{\eta}_{k,V}=\frac{1}{n_x^2}\sum_{i=1}^{n_x}\sum_{j=1}^{n_x} - * k(x_i,x_j)+\frac{1}{n_y^2}\sum_{i=1}^{n_y}\sum_{j=1}^{n_y}k(y_i,y_j) - * -\frac{2}{n_xn_y}\sum_{i=1}^{n_x}\sum_{j=1}^{n_y}k(x_i,y_j) - * \f] - * - * When \f$n_x=n_y=\frac{n}{2}\f$, an incomplete version can also be computed - * as the following - * \f[ - * \hat{\eta}_{k,U^-}=\frac{1}{\frac{n}{2}(\frac{n}{2}-1)}\sum_{i\neq j} - * h(z_i,z_j) - * \f] - * where for each pair \f$z=(x,y)\f$, \f$h(z,z')=k(x,x')+k(y,y')-k(x,y')- - * k(x',y)\f$. - * - * The type (biased/unbiased/incomplete) can be selected via set_statistic_type(). - * Note that there are presently two setups for computing statistic. While using - * BIASED, UNBIASED or INCOMPLETE, the estimate returned by compute_statistic() - * is \f$\frac{n_xn_y}{n_x+n_y}\hat{\eta}_k\f$. If DEPRECATED ones are used, then - * this returns \f$(n_x+n_y)\hat{\eta}_k\f$ in general and \f$(\frac{n}{2}) - * \hat{\eta}_k\f$ when \f$n_x=n_y=\frac{n}{2}\f$. This holds for the null - * distribution samples as well. - * - * Estimating variance of the asymptotic distribution of the statistic under - * null and alternative hypothesis can be done using compute_variance() method. - * This is internally done alongwise computing statistics to avoid recomputing - * the kernel. - * - * Variance under null is computed as - * \f$\sigma_{k,0}^2=2\hat{\kappa}_2=2(\kappa_2-2\kappa_1+\kappa_0)\f$ - * where - * \f$\kappa_0=\left(\mathbb{E}_{X,X'}k(X,X')\right )^2\f$, - * \f$\kappa_1=\mathbb{E}_X\left[(\mathbb{E}_{X'}k(X,X'))^2\right]\f$, and - * \f$\kappa_2=\mathbb{E}_{X,X'}k^2(X,X')\f$ - * and variance under alternative is computed as - * \f[ - * \sigma_{k,A}^2=4\rho_y\left\{\mathbb{E}_X\left[\left(\mathbb{E}_{X'} - * k(X,X')-\mathbb{E}_Yk(X,Y)\right)^2 \right ] -\left(\mathbb{E}_{X,X'} - * k(X,X')-\mathbb{E}_{X,Y}k(X,Y) \right)^2\right \}+4\rho_x\left\{ - * \mathbb{E}_Y\left[\left(\mathbb{E}_{Y'}k(Y,Y')-\mathbb{E}_Xk(X,Y) - * \right)^2\right ] -\left(\mathbb{E}_{Y,Y'}k(Y,Y')-\mathbb{E}_{X,Y} - * k(X,Y) \right)^2\right \} - * \f] - * where \f$\rho_x=\frac{n_x}{n}\f$ and \f$\rho_y=\frac{n_y}{n}\f$. - * - * Note that statistic and variance estimation can be done for multiple kernels - * at once as well. - * - * Along with the statistic comes a method to compute a p-value based on - * different methods. Permutation test is also possible. If unsure which one to - * use, sampling with 250 permutation iterations always is correct (but slow). - * - * To choose, use set_null_approximation_method() and choose from. - * - * MMD2_SPECTRUM_DEPRECATED: For a fast, consistent test based on the spectrum of - * the kernel matrix, as described in [2]. Only supported if Eigen3 is installed. - * - * MMD2_SPECTRUM: Similar to the deprecated version except it estimates the - * statistic under null as \f$\frac{n_xn_y}{n_x+n_y}\hat{\eta}_{k,U}\rightarrow - * \sum_r\lambda_r(Z_r^2-1)\f$ instead (see method description for more details). - * - * MMD2_GAMMA: for a very fast, but not consistent test based on moment matching - * of a Gamma distribution, as described in [2]. - * - * PERMUTATION: For permuting available samples to sample null-distribution - * - * If you do not know about your data, but want to use the MMD from a kernel - * matrix, just use the custom kernel constructor. Everything else will work as - * usual. - * - * For kernel selection see CMMDKernelSelection. - * - * NOTE: \f$n_x\f$ and \f$n_y\f$ are represented by \f$m\f$ and \f$n\f$, - * respectively in the implementation. - * - * [1]: Gretton, A., Borgwardt, K. M., Rasch, M. J., Schoelkopf, B., & Smola, A. (2012). - * A Kernel Two-Sample Test. Journal of Machine Learning Research, 13, 671-721. - * - * [2]: Gretton, A., Fukumizu, K., & Harchaoui, Z. (2011). - * A fast, consistent kernel two-sample test. - * - */ -class CQuadraticTimeMMD : public CKernelTwoSampleTest -{ -public: - /** default constructor */ - CQuadraticTimeMMD(); - - /** Constructor - * - * @param p_and_q feature data. Is assumed to contain samples from both - * p and q. First m samples from p, then from index m all samples from q - * - * @param kernel kernel to use - * @param p_and_q samples from p and q, appended - * @param m index of first sample of q - */ - CQuadraticTimeMMD(CKernel* kernel, CFeatures* p_and_q, index_t m); - - /** Constructor. - * This is a convienience constructor which copies both features to one - * element and then calls the other constructor. Needs twice the memory - * for a short time - * - * @param kernel kernel for MMD - * @param p samples from distribution p, will be copied and NOT SG_REF'ed - * @param q samples from distribution q, will be copied and NOT SG_REF'ed - */ - CQuadraticTimeMMD(CKernel* kernel, CFeatures* p, CFeatures* q); - - /** Constructor. - * This is a convienience constructor which allows to only specify - * a custom kernel. In this case, the features are completely ignored - * and all computations will be done on the custom kernel - * - * @param custom_kernel custom kernel for MMD, which is a kernel between - * the appended features p and q - * @param m index of first sample of q - */ - CQuadraticTimeMMD(CCustomKernel* custom_kernel, index_t m); - - /** destructor */ - virtual ~CQuadraticTimeMMD(); - - /** Computes the squared quadratic time MMD for the current data. Note - * that the type (biased/unbiased/incomplete) can be specified with - * set_statistic_type() method. - * - * @return (biased, unbiased or incomplete) \f$\frac{mn}{m+n}\hat{\eta}_k\f$. - * If DEPRECATED types are used, then it returns \f$(m+m)\hat{\eta}_k\f$ in - * general and \f$m\hat{\eta}_k\f$ when \f$m=n\f$. - */ - virtual float64_t compute_statistic(); - - /** Same as compute_statistic(), but with the possibility to perform on - * multiple kernels at once - * - * @param multiple_kernels if true, and underlying kernel is K_COMBINED, - * method will be executed on all subkernels on the same data - * @return vector of results for subkernels - */ - SGVector compute_statistic(bool multiple_kernels); - - /** - * Wrapper for computing variance estimate of the asymptotic distribution - * of the statistic (unbisaed/biased/incomplete) under null and alternative - * hypothesis (see class description for details) - * - * @return a vector of two values containing asymptotic variance estimate - * under null and alternative, respectively - */ - virtual SGVector compute_variance(); - - /** Same as compute_variance(), but with the possibility to perform on - * multiple kernels at once - * - * @param multiple_kernels if true, and underlying kernel is K_COMBINED, - * method will be executed on all subkernels on the same data - * @return matrix of results for subkernels, one row for each subkernel - */ - SGMatrix compute_variance(bool multiple_kernels); - - /** - * Wrapper method for compute_variance() - * - * @return variance estimation of asymptotic distribution of statistic - * under null hypothesis - */ - float64_t compute_variance_under_null(); - - /** - * Wrapper method for compute_variance() - * - * @return variance estimation of asymptotic distribution of statistic - * under alternative hypothesis - */ - float64_t compute_variance_under_alternative(); - - /** computes a p-value based on current method for approximating the - * null-distribution. The p-value is the 1-p quantile of the null- - * distribution where the given statistic lies in. - * - * Not all methods for computing the p-value are compatible with all - * methods of computing the statistic (biased/unbiased/incomplete). - * - * @param statistic statistic value to compute the p-value for - * @return p-value parameter statistic is the (1-p) percentile of the - * null distribution - */ - virtual float64_t compute_p_value(float64_t statistic); - - /** computes a threshold based on current method for approximating the - * null-distribution. The threshold is the value that a statistic has - * to have in ordner to reject the null-hypothesis. - * - * Not all methods for computing the p-value are compatible with all - * methods of computing the statistic (biased/unbiased/incomplete). - * - * @param alpha test level to reject null-hypothesis - * @return threshold for statistics to reject null-hypothesis - */ - virtual float64_t compute_threshold(float64_t alpha); - - /** @return the class name */ - virtual const char* get_name() const - { - return "QuadraticTimeMMD"; - }; - - /** returns the statistic type of this test statistic */ - virtual EStatisticType get_statistic_type() const - { - return S_QUADRATIC_TIME_MMD; - } - - /** Returns a set of samples of an estimate of the null distribution - * using the Eigen-spectrum of the centered kernel matrix of the merged - * samples of p and q. May be used to compute p-value (easy). - * - * The estimate is computed as - * \f[ - * \frac{n_xn_y}{n_x+n_y}\hat{\eta}_{k,U}\rightarrow\sum_{l=1}^\infty - * \lambda_l\left(Z^2_l-1 \right) - * \f] - * where \f${Z_l}\stackrel{i.i.d.}{\sim}\mathcal{N}(0,1)\f$ and - * \f$\lambda_l\f$ are the eigenvalues of centered kernel matrix HKH. - * - * kernel matrix needs to be stored in memory - * - * Note that m*n/(m+n)*Null-distribution is returned, - * which is fine since the statistic is also m*n/(m+n)*MMD^2 - * - * Works well if the kernel matrix is NOT diagonal dominant. - * See Gretton, A., Fukumizu, K., & Harchaoui, Z. (2011). - * A fast, consistent kernel two-sample test. - * - * @param num_samples number of samples to draw - * @param num_eigenvalues number of eigenvalues to use to draw samples - * Maximum number of m+n-1 where m and n are the sizes of samples from - * p and q respectively. - * @return samples from the estimated null distribution - */ - SGVector sample_null_spectrum(index_t num_samples, - index_t num_eigenvalues); - - /** Returns a set of samples of an estimate of the null distribution - * using the Eigen-spectrum of the centered kernel matrix of the merged - * samples of p and q. May be used to compute p-value (easy). - * - * The unbiased version uses - * \f[ - * t\text{MMD}_u^2[\mathcal{F},X,Y]\rightarrow\sum_{l=1}^\infty - * \lambda_l\left((a_l\rho_x^{-\frac{1}{{2}}} - * -b_l\rho_y^{-\frac{1}{{2}}})^2-(\rho_x\rho_y)^{-1} \right) - * \f] - * where \f$t=m+n\f$, \f$\lim_{m,n\rightarrow\infty}m/t\rightarrow - * \rho_x\f$ and \f$\rho_y\f$ likewise (equation 10 from [1]) and - * \f$\lambda_l\f$ are estimated as \f$\frac{\nu_l}{(m+n)}\f$, where - * \f$\nu_l\f$ are the eigenvalues of centered kernel matrix HKH. - * - * The biased version uses - * \f[ - * t\text{MMD}_b^2[\mathcal{F},X,Y]\rightarrow\sum_{l=1}^\infty - * \lambda_l\left((a_l\rho_x^{-\frac{1}{{2}}}- - * b_l\rho_y^{-\frac{1}{{2}}})^2\right) - * \f] - * - * kernel matrix needs to be stored in memory - * - * Note that (m+n)*Null-distribution is returned, - * which is fine since the statistic is also (m+n)*MMD: - * except when m and n are equal, then m*MMD^2 is returned - * - * Works well if the kernel matrix is NOT diagonal dominant. - * See Gretton, A., Fukumizu, K., & Harchaoui, Z. (2011). - * A fast, consistent kernel two-sample test. - * - * @param num_samples number of samples to draw - * @param num_eigenvalues number of eigenvalues to use to draw samples - * Maximum number of m+n-1 where m and n are the sizes of samples from - * p and q respectively. - * It is usually safe to use a smaller number since they decay very - * fast, however, a conservative approach would be to use all (-1 does - * this). See paper for details. - * @return samples from the estimated null distribution - */ - SGVector sample_null_spectrum_DEPRECATED(index_t num_samples, - index_t num_eigenvalues); - - /** setter for number of samples to use in spectrum based p-value - * computation. - * - * @param num_samples_spectrum number of samples to draw from - * approximate null-distributrion - */ - void set_num_samples_spectrum(index_t num_samples_spectrum); - - /** setter for number of eigenvalues to use in spectrum based p-value - * computation. Maximum is m_m+m_n-1 - * - * @param num_eigenvalues_spectrum number of eigenvalues to use to - * approximate null-distributrion - */ - void set_num_eigenvalues_spectrum(index_t num_eigenvalues_spectrum); - - /** @param statistic_type statistic type (biased/unbiased/incomplete) to use */ - void set_statistic_type(EQuadraticMMDType statistic_type); - - /** Approximates the null-distribution by the two parameter gamma - * distribution. It works in O(m^2) where m is the number of samples - * from each distribution. Its very fast, but may be inaccurate. - * However, there are cases where it performs very well. - * Returns parameters of gamma distribution that is fitted. - * - * Called by compute_p_value() if null approximation method is set to - * MMD2_GAMMA. - * - * Note that when being used for constructing a test, the provided - * statistic HAS to be the biased version (see paper for details). To use, - * set BIASED_DEPRECATED as statistic type. Note that m*Null-distribution - * is fitted, which is fine since the statistic is also m*MMD. - * - * See Gretton, A., Fukumizu, K., & Harchaoui, Z. (2011). - * A fast, consistent kernel two-sample test. - * - * @return vector with two parameter for gamma distribution. To use: - * call gamma_cdf(statistic, a, b). - */ - SGVector fit_null_gamma(); - -protected: - /** - * Helper method to compute unbiased estimate of squared quadratic time MMD - * and variance estimate under null and alternative hypothesis - * - * @param m number of samples from p - * @param n number of samples from q - * @return a vector of three values - * first - unbiased \f$\text{MMD}^2\f$ estimate \f$\hat{\eta}_{k,U}\f$ - * second - variance under null hypothesis (see class documentation) - * third - variance under alternative hypothesis (see class documentation) - */ - SGVector compute_unbiased_statistic_variance(int m, int n); - - /** - * Helper method to compute biased estimate of squared quadratic time MMD - * and variance estimate under null and alternative hypothesis - * - * @param m number of samples from p - * @param n number of samples from q - * @return a vector of three values - * first - biased \f$\text{MMD}^2\f$ estimate \f$\hat{\eta}_{k,V}\f$ - * second - variance under null hypothesis (see class documentation) - * third - variance under alternative hypothesis (see class documentation) - */ - SGVector compute_biased_statistic_variance(int m, int n); - - /** - * Helper method to compute incomplete estimate of squared quadratic time MMD - * and variance estimate under null and alternative hypothesis - * - * @param n number of samples from p and q - * @return a vector of three values - * first - incomplete \f$\text{MMD}^2\f$ estimate \f$\hat{\eta}_{k,U^-}\f$ - * second - variance under null hypothesis (see class documentation) - * third - variance under alternative hypothesis (see class documentation) - */ - SGVector compute_incomplete_statistic_variance(int n); - - /** Wrapper method for computing unbiased estimate of MMD^2 - * - * @param m number of samples from p - * @param n number of samples from q - * @return unbiased \f$\text{MMD}^2\f$ estimate \f$\hat{\eta}_{k,U}\f$ - */ - float64_t compute_unbiased_statistic(int m, int n); - - /** Wrapper method for computing biased estimate of MMD^2 - * - * @param m number of samples from p - * @param n number of samples from q - * @return biased \f$\text{MMD}^2\f$ estimate \f$\hat{\eta}_{k,V}\f$ - */ - float64_t compute_biased_statistic(int m, int n); - - /** Wrapper method for computing incomplete estimate of MMD^2 - * - * @param n number of samples from p and q - * @return incomplete \f$\text{MMD}^2\f$ estimate \f$\hat{\eta}_{k,U^-}\f$ - */ - float64_t compute_incomplete_statistic(int n); - -private: - /** register parameters and initialize with defaults */ - void init(); - -protected: - /** number of samples for spectrum null-dstribution-approximation */ - index_t m_num_samples_spectrum; - - /** number of Eigenvalues for spectrum null-dstribution-approximation */ - index_t m_num_eigenvalues_spectrum; - - /** type of statistic (biased/unbiased/incomplete as well as deprecated - * versions of biased/unbiased) - */ - EQuadraticMMDType m_statistic_type; -}; - -} - -#endif /* QUADRATIC_TIME_MMD_H_ */ diff --git a/src/shogun/statistics/StreamingMMD.cpp b/src/shogun/statistics/StreamingMMD.cpp deleted file mode 100644 index 5ba3d10a796..00000000000 --- a/src/shogun/statistics/StreamingMMD.cpp +++ /dev/null @@ -1,325 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#include -#include -#include -#include -#include - -using namespace shogun; - -CStreamingMMD::CStreamingMMD() : CKernelTwoSampleTest() -{ - init(); -} - -CStreamingMMD::CStreamingMMD(CKernel* kernel, CStreamingFeatures* p, - CStreamingFeatures* q, index_t m, index_t blocksize) : - CKernelTwoSampleTest(kernel, NULL, m) -{ - init(); - - m_streaming_p=p; - SG_REF(m_streaming_p); - - m_streaming_q=q; - SG_REF(m_streaming_q); - - m_blocksize=blocksize; -} - -CStreamingMMD::~CStreamingMMD() -{ - SG_UNREF(m_streaming_p); - SG_UNREF(m_streaming_q); - - /* m_kernel is SG_UNREFed in base desctructor */ -} - -void CStreamingMMD::init() -{ - SG_ADD((CSGObject**)&m_streaming_p, "streaming_p", "Streaming features p", - MS_NOT_AVAILABLE); - SG_ADD((CSGObject**)&m_streaming_q, "streaming_q", "Streaming features p", - MS_NOT_AVAILABLE); - SG_ADD(&m_blocksize, "blocksize", "Number of elements processed at once", - MS_NOT_AVAILABLE); - SG_ADD(&m_simulate_h0, "simulate_h0", "Whether p and q are mixed", - MS_NOT_AVAILABLE); - - m_streaming_p=NULL; - m_streaming_q=NULL; - m_blocksize=10000; - m_simulate_h0=false; -} - -float64_t CStreamingMMD::compute_statistic() -{ - /* use wrapper method and compute for single kernel */ - SGVector statistic; - SGVector variance; - compute_statistic_and_variance(statistic, variance, false); - - return statistic[0]; -} - -SGVector CStreamingMMD::compute_statistic(bool multiple_kernels) -{ - /* make sure multiple_kernels flag is used only with a combined kernel */ - REQUIRE(!multiple_kernels || m_kernel->get_kernel_type()==K_COMBINED, - "multiple kernels specified, but underlying kernel is not of type " - "K_COMBINED\n"); - - SGVector statistic; - SGVector variance; - compute_statistic_and_variance(statistic, variance, multiple_kernels); - - return statistic; -} - -float64_t CStreamingMMD::compute_variance_estimate() -{ - /* use wrapper method and compute for single kernel */ - SGVector statistic; - SGVector variance; - compute_statistic_and_variance(statistic, variance, false); - - return variance[0]; -} - -float64_t CStreamingMMD::compute_p_value(float64_t statistic) -{ - float64_t result=0; - - switch (m_null_approximation_method) - { - case MMD1_GAUSSIAN: - { - /* compute variance and use to estimate Gaussian distribution */ - float64_t std_dev=CMath::sqrt(compute_variance_estimate()); - result=1.0-CStatistics::normal_cdf(statistic, std_dev); - } - break; - - default: - /* sampling null is handled here */ - result=CKernelTwoSampleTest::compute_p_value(statistic); - break; - } - - return result; -} - -float64_t CStreamingMMD::compute_threshold(float64_t alpha) -{ - float64_t result=0; - - switch (m_null_approximation_method) - { - case MMD1_GAUSSIAN: - { - /* compute variance and use to estimate Gaussian distribution */ - float64_t std_dev=CMath::sqrt(compute_variance_estimate()); - result=1.0-CStatistics::inverse_normal_cdf(1-alpha, 0, std_dev); - } - break; - - default: - /* sampling null is handled here */ - result=CKernelTwoSampleTest::compute_threshold(alpha); - break; - } - - return result; -} - -float64_t CStreamingMMD::perform_test() -{ - float64_t result=0; - - switch (m_null_approximation_method) - { - case MMD1_GAUSSIAN: - { - /* compute variance and use to estimate Gaussian distribution, use - * wrapper method and compute for single kernel */ - SGVector statistic; - SGVector variance; - compute_statistic_and_variance(statistic, variance, false); - - /* estimate Gaussian distribution */ - result=1.0-CStatistics::normal_cdf(statistic[0], - CMath::sqrt(variance[0])); - } - break; - - default: - /* sampling null can be done separately in superclass */ - result=CHypothesisTest::perform_test(); - break; - } - - return result; -} - -SGVector CStreamingMMD::sample_null() -{ - SGVector samples(m_num_null_samples); - - /* instead of permutating samples, just samples new data all the time. */ - CStreamingFeatures* p=m_streaming_p; - CStreamingFeatures* q=m_streaming_q; - SG_REF(p); - SG_REF(q); - - bool old=m_simulate_h0; - set_simulate_h0(true); - for (index_t i=0; iget_streamed_features(num_this_run); - data->append_element(block); - } - - SG_DEBUG("streaming %d blocks from q of blocksize %d!\n", num_blocks, - num_this_run); - - /* stream data from q num_blocks of time*/ - for (index_t i=0; iget_streamed_features(num_this_run); - data->append_element(block); - } - - /* check whether h0 should be simulated and permute if so */ - if (m_simulate_h0) - { - /* create merged copy of all feature instances to permute */ - SG_DEBUG("merging and premuting features!\n"); - - /* use the first element to merge rest of the data into */ - CFeatures* first=(CFeatures*)data->get_first_element(); - - /* this delete element doesn't deallocate first element but just removes - * from the list and does a SG_UNREF. But its not deleted because - * get_first_element() does a SG_REF before returning so we need to later - * manually take care of its destruction via SG_UNREF here itself */ - data->delete_element(); - - CFeatures* merged=first->create_merged_copy(data); - - /* now we can get rid of unnecessary feature objects */ - SG_UNREF(first); - data->delete_all_elements(); - - /* permute */ - SGVector inds(merged->get_num_vectors()); - inds.range_fill(); - CMath::permute(inds); - merged->add_subset(inds); - - /* copy back */ - SGVector copy(num_this_run); - copy.range_fill(); - for (index_t i=0; i<2*num_blocks; ++i) - { - CFeatures* current=merged->copy_subset(copy); - data->append_element(current); - /* SG_UNREF'ing since copy_subset does a SG_REF, this is - * safe since the object is already SG_REF'ed inside the list */ - SG_UNREF(current); - - if (i<2*num_blocks-1) - copy.add(num_this_run); - } - - /* clean up */ - SG_UNREF(merged); - } - - SG_REF(data); - - SG_DEBUG("leaving!\n"); - return data; -} - -void CStreamingMMD::set_p_and_q(CFeatures* p_and_q) -{ - SG_ERROR("Method not implemented since linear time mmd is based on " - "streaming features\n"); -} - -CFeatures* CStreamingMMD::get_p_and_q() -{ - SG_ERROR("Method not implemented since linear time mmd is based on " - "streaming features\n"); - return NULL; -} - -CStreamingFeatures* CStreamingMMD::get_streaming_p() -{ - SG_REF(m_streaming_p); - return m_streaming_p; -} - -CStreamingFeatures* CStreamingMMD::get_streaming_q() -{ - SG_REF(m_streaming_q); - return m_streaming_q; -} - diff --git a/src/shogun/statistics/StreamingMMD.h b/src/shogun/statistics/StreamingMMD.h deleted file mode 100644 index d2e8d0a3e0d..00000000000 --- a/src/shogun/statistics/StreamingMMD.h +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#ifndef STREAMING_MMD_H_ -#define STREAMING_MMD_H_ - -#include - -#include - -namespace shogun -{ - -class CStreamingFeatures; -class CFeatures; - -/** @brief Abstract base class that provides an interface for performing kernel - * two-sample test on streaming data using Maximum Mean Discrepancy (MMD) as - * the test statistic. The MMD is the distance of two probability distributions - * \f$p\f$ and \f$q\f$ in a RKHS (see [1] for formal description). - * - * \f[ - * \text{MMD}[\mathcal{F},p,q]^2=\textbf{E}_{x,x'}\left[ k(x,x')\right]- - * 2\textbf{E}_{x,y}\left[ k(x,y)\right] - * +\textbf{E}_{y,y'}\left[ k(y,y')\right]=||\mu_p - \mu_q||^2_\mathcal{F} - * \f] - * - * where \f$x,x'\sim p\f$ and \f$y,y'\sim q\f$. The data has to be provided as - * streaming features, which are processed in blocks for a given blocksize. - * The blocksize determines how many examples are processed at once. A method - * for getting a specified number of blocks of data is provided which can - * optionally merge and permute the data within the current burst. The exact - * computation of kernel functions for MMD computation is abstract and has to - * be defined by its subclasses, which should return a vector of function - * values. Please note that for streaming MMD, the number of data points from - * both the distributions has to be equal. - * - * Along with the statistic comes a method to compute a p-value based on a - * Gaussian approximation of the null-distribution which is possible in - * linear time and constant space. Sampling from null is also possible (no - * permutations but new examples will be used here). - * If unsure which one to use, sampling with 250 iterations always is - * correct (but slow). When the sample size is large (>1000) at least, - * the Gaussian approximation is an accurate and much faster choice. - * - * To choose, use set_null_approximation_method() and choose from - * - * MMD1_GAUSSIAN: Approximates the null-distribution with a Gaussian. Only use - * from at least 1000 samples. If using, check if type I error equals the - * desired value. - * - * PERMUTATION: For permuting available samples to sample null-distribution. - * - * For kernel selection see CMMDKernelSelection. - * - * [1]: Gretton, A., Borgwardt, K. M., Rasch, M. J., Schoelkopf, B., & - * Smola, A. (2012). A Kernel Two-Sample Test. Journal of Machine Learning - * Research, 13, 671-721. - */ -class CStreamingMMD: public CKernelTwoSampleTest -{ -public: - /** default constructor */ - CStreamingMMD(); - - /** constructor. - * - * @param kernel kernel to use - * @param p streaming features p to use - * @param q streaming features q to use - * @param m number of samples from each distribution - * @param blocksize size of examples that are processed at once when - * computing statistic/threshold. - */ - CStreamingMMD(CKernel* kernel, CStreamingFeatures* p, - CStreamingFeatures* q, index_t m, index_t blocksize=10000); - - /** destructor */ - virtual ~CStreamingMMD(); - - /** Computes the squared MMD for the current data. This is an unbiased - * estimate. This method relies on compute_statistic_and_variance which - * has to be defined in the subclasses - * - * Note that the underlying streaming feature parser has to be started - * before this is called. Otherwise deadlock. - * - * @return squared MMD - */ - virtual float64_t compute_statistic(); - - /** Same as compute_statistic(), but with the possibility to perform on - * multiple kernels at once - * - * @param multiple_kernels if true, and underlying kernel is K_COMBINED, - * method will be executed on all subkernels on the same data - * @return vector of results for subkernels - */ - virtual SGVector compute_statistic(bool multiple_kernels); - - /** computes a p-value based on current method for approximating the - * null-distribution. The p-value is the 1-p quantile of the null- - * distribution where the given statistic lies in. - * - * The method for computing the p-value can be set via - * set_null_approximation_method(). - * Since the null- distribution is normal, a Gaussian approximation - * is available. - * - * @param statistic statistic value to compute the p-value for - * @return p-value parameter statistic is the (1-p) percentile of the - * null distribution - */ - virtual float64_t compute_p_value(float64_t statistic); - - /** Performs the complete two-sample test on current data and returns a - * p-value. - * - * In case null distribution should be estimated with MMD1_GAUSSIAN, - * statistic and p-value are computed in the same loop, which is more - * efficient than first computing statistic and then computung p-values. - * - * In case of sampling null, superclass method is called. - * - * The method for computing the p-value can be set via - * set_null_approximation_method(). - * - * @return p-value such that computed statistic is the (1-p) quantile - * of the estimated null distribution - */ - virtual float64_t perform_test(); - - /** computes a threshold based on current method for approximating the - * null-distribution. The threshold is the value that a statistic has - * to have in ordner to reject the null-hypothesis. - * - * The method for computing the p-value can be set via - * set_null_approximation_method(). - * Since the null- distribution is normal, a Gaussian approximation - * is available. - * - * @param alpha test level to reject null-hypothesis - * @return threshold for statistics to reject null-hypothesis - */ - virtual float64_t compute_threshold(float64_t alpha); - - /** computes a linear time estimate of the variance of the squared mmd, - * which may be used for an approximation of the null-distribution - * The value is the variance of the vector of which the MMD is the mean. - * - * @return variance estimate - */ - virtual float64_t compute_variance_estimate(); - - /** Abstract method that computes MMD and a linear time variance estimate. - * If multiple_kernels is set to true, each subkernel is evaluated on the - * same data. - * - * @param statistic return parameter for statistic, vector with entry for - * each kernel. May be allocated before but doesn not have to be - * - * @param variance return parameter for statistic, vector with entry for - * each kernel. May be allocated before but doesn not have to be - * - * @param multiple_kernels optional flag, if set to true, it is assumed that - * the underlying kernel is of type K_COMBINED. Then, the MMD is computed on - * all subkernel separately rather than computing it on the combination. - * This is used by kernel selection strategies that need to evaluate - * multiple kernels on the same data. Since the linear time MMD works on - * streaming data, one cannot simply compute MMD, change kernel since data - * would be different for every kernel. - */ - virtual void compute_statistic_and_variance( - SGVector& statistic, SGVector& variance, - bool multiple_kernels=false)=0; - - /** Same as compute_statistic_and_variance, but computes a linear time - * estimate of the covariance of the multiple-kernel-MMD. - * See [1] for details. - */ - virtual void compute_statistic_and_Q( - SGVector& statistic, SGMatrix& Q)=0; - - /** Mimics sampling null for MMD. However, samples are not permutated but - * constantly streamed and then merged. Usually, this is not necessary - * since there is the Gaussian approximation for the null distribution. - * However, in certain cases this may fail and sampling the null - * distribution might be numerically more stable. Ovewrite superclass - * method that merges samples. - * - * @return vector of all statistics - */ - virtual SGVector sample_null(); - - /** Setter for the blocksize of examples to be processed at once - * @param blocksize new blocksize to use - */ - void set_blocksize(index_t blocksize) - { - m_blocksize=blocksize; - } - - /** Streams num_blocks data from each distribution with blocks of size - * num_this_run. If m_simulate_h0 is set, it merges the blocks together, - * shuffles and redistributes between the blocks. - * - * @param num_blocks number of blocks to be streamed from each distribution - * @param num_this_run number of data points to be streamed for one block - * @return an ordered list of blocks of data. The order in the - * list is \f$x,x',\cdots\sim p\f$ followed by \f$y,y',\cdots\sim q\f$. - * The features inside the list are SG_REF'ed and delete_data is set in the - * list, which will destroy the at CList's destructor call - */ - CList* stream_data_blocks(index_t num_blocks, index_t num_this_run); - - /** Not implemented for streaming MMD since it uses streaming feautres */ - virtual void set_p_and_q(CFeatures* p_and_q); - - /** Not implemented for streaming MMD since it uses streaming feautres */ - virtual CFeatures* get_p_and_q(); - - /** Getter for streaming features of p distribution. - * @return streaming features object for p distribution, SG_REF'ed - */ - virtual CStreamingFeatures* get_streaming_p(); - - /** Getter for streaming features of q distribution. - * @return streaming features object for q distribution, SG_REF'ed - */ - virtual CStreamingFeatures* get_streaming_q(); - - /** @param simulate_h0 if true, samples from p and q will be mixed and - * permuted - */ - inline void set_simulate_h0(bool simulate_h0) - { - m_simulate_h0=simulate_h0; - } - - /** @return the class name */ - virtual const char* get_name() const - { - return "StreamingMMD"; - } - -protected: - /** abstract method that computes the squared MMD - * - * @param kernel the kernel to be used for computing MMD. This will be - * useful when multiple kernels are used - * @param data the list of data on which kernels are computed. The order - * of data in the list is \f$x,x',\cdots\sim p\f$ followed by - * \f$y,y',\cdots\sim q\f$. It is assumed that detele_data flag is set - * inside the list - * @param num_this_run number of data points in current blocks - * @return the MMD values - */ - virtual SGVector compute_squared_mmd(CKernel* kernel, - CList* data, index_t num_this_run)=0; - - /** Streaming feature objects that are used instead of merged samples */ - CStreamingFeatures* m_streaming_p; - - /** Streaming feature objects that are used instead of merged samples*/ - CStreamingFeatures* m_streaming_q; - - /** Number of examples processed at once, i.e. in one burst */ - index_t m_blocksize; - - /** If this is true, samples will be mixed between p and q in any method - * that computes the statistic */ - bool m_simulate_h0; - -private: - /** register parameters and initialize with defaults */ - void init(); -}; - -} - -#endif /* STREAMING_MMD_H_ */ - diff --git a/src/shogun/statistics/TwoSampleTest.cpp b/src/shogun/statistics/TwoSampleTest.cpp deleted file mode 100644 index 0510f3b5e48..00000000000 --- a/src/shogun/statistics/TwoSampleTest.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#include -#include -#include - -using namespace shogun; - -CTwoSampleTest::CTwoSampleTest() : CHypothesisTest() -{ - init(); -} - -CTwoSampleTest::CTwoSampleTest(CFeatures* p_and_q, index_t m) : - CHypothesisTest() -{ - init(); - - m_p_and_q=p_and_q; - SG_REF(m_p_and_q); - - m_m=m; -} - -CTwoSampleTest::CTwoSampleTest(CFeatures* p, CFeatures* q) : - CHypothesisTest() -{ - init(); - - m_p_and_q=p->create_merged_copy(q); - SG_REF(m_p_and_q); - - m_m=p->get_num_vectors(); -} - -CTwoSampleTest::~CTwoSampleTest() -{ - SG_UNREF(m_p_and_q); -} - -void CTwoSampleTest::init() -{ - SG_ADD((CSGObject**)&m_p_and_q, "p_and_q", "Concatenated samples p and q", - MS_NOT_AVAILABLE); - SG_ADD(&m_m, "m", "Index of first sample of q", - MS_NOT_AVAILABLE); - - m_p_and_q=NULL; - m_m=0; -} - -SGVector CTwoSampleTest::sample_null() -{ - SG_DEBUG("entering!\n") - - REQUIRE(m_p_and_q, "No appended features p and q!\n"); - - /* compute sample statistics for null distribution */ - SGVector results(m_num_null_samples); - - /* memory for index permutations. Adding of subset has to happen - * inside the loop since it may be copied if there already is one set */ - SGVector ind_permutation(m_p_and_q->get_num_vectors()); - ind_permutation.range_fill(); - - for (index_t i=0; iadd_subset(ind_permutation); - results[i]=compute_statistic(); - m_p_and_q->remove_subset(); - } - - SG_DEBUG("leaving!\n") - return results; -} - -float64_t CTwoSampleTest::compute_p_value(float64_t statistic) -{ - float64_t result=0; - - if (m_null_approximation_method==PERMUTATION) - { - /* sample a bunch of MMD values from null distribution */ - SGVector values=sample_null(); - - /* find out percentile of parameter "statistic" in null distribution */ - CMath::qsort(values); - float64_t i=values.find_position_to_insert(statistic); - - /* return corresponding p-value */ - result=1.0-i/values.vlen; - } - else - SG_ERROR("Unknown method to approximate null distribution!\n"); - - return result; -} - -float64_t CTwoSampleTest::compute_threshold(float64_t alpha) -{ - float64_t result=0; - - if (m_null_approximation_method==PERMUTATION) - { - /* sample a bunch of MMD values from null distribution */ - SGVector values=sample_null(); - - /* return value of (1-alpha) quantile */ - result=values[index_t(CMath::floor(values.vlen*(1-alpha)))]; - } - else - SG_ERROR("Unknown method to approximate null distribution!\n"); - - return result; -} - -void CTwoSampleTest::set_p_and_q(CFeatures* p_and_q) -{ - /* ref before unref to avoid problems when instances are equal */ - SG_REF(p_and_q); - SG_UNREF(m_p_and_q); - m_p_and_q=p_and_q; -} - -void CTwoSampleTest::set_m(index_t m) -{ - REQUIRE(m_p_and_q, "Samples are not specified!\n"); - REQUIRE(m_p_and_q->get_num_vectors()>m, "Provided sample size for p" - "(%d) is greater than total number of samples (%d)!\n", - m, m_p_and_q->get_num_vectors()); - m_m=m; -} - -CFeatures* CTwoSampleTest::get_p_and_q() -{ - SG_REF(m_p_and_q); - return m_p_and_q; -} - diff --git a/src/shogun/statistics/TwoSampleTest.h b/src/shogun/statistics/TwoSampleTest.h deleted file mode 100644 index aa57c5a1ccb..00000000000 --- a/src/shogun/statistics/TwoSampleTest.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) The Shogun Machine Learning Toolbox - * Written (w) 2012-2013 Heiko Strathmann - * Written (w) 2014 Soumyajit De - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the Shogun Development Team. - */ - -#ifndef TWO_SAMPLE_TEST_H_ -#define TWO_SAMPLE_TEST_H_ - -#include - -#include - -namespace shogun -{ - -class CFeatures; - -/** @brief Provides an interface for performing the classical two-sample test - * i.e. Given samples from two distributions \f$p\f$ and \f$q\f$, the - * null-hypothesis is: \f$H_0: p=q\f$, the alternative hypothesis: - * \f$H_1: p\neq q\f$. - * - * Abstract base class. Provides all interfaces and implements approximating - * the null distribution via permutation, i.e. repeatedly merging both samples - * and them compute the test statistic on them. - * - */ -class CTwoSampleTest : public CHypothesisTest -{ -public: - /** default constructor */ - CTwoSampleTest(); - - /** Constructor - * - * @param p_and_q feature data. Is assumed to contain samples from both - * p and q. First all samples from p, then from index m all - * samples from q - * - * @param p_and_q samples from p and q, appended - * @param m index of first sample of q - */ - CTwoSampleTest(CFeatures* p_and_q, index_t m); - - /** Constructor. - * This is a convienience constructor which copies both features to one - * element and then calls the other constructor. Needs twice the memory - * for a short time - * - * @param p samples from distribution p, will be copied and NOT - * SG_REF'ed - * @param q samples from distribution q, will be copied and NOT - * SG_REF'ed - */ - CTwoSampleTest(CFeatures* p, CFeatures* q); - - /** destructor */ - virtual ~CTwoSampleTest(); - - /** merges both sets of samples and computes the test statistic - * m_num_permutation_iteration times - * - * @return vector of all statistics - */ - virtual SGVector sample_null(); - - /** computes a p-value based on current method for approximating the - * null-distribution. The p-value is the 1-p quantile of the null- - * distribution where the given statistic lies in. - * - * @param statistic statistic value to compute the p-value for - * @return p-value parameter statistic is the (1-p) percentile of the - * null distribution - */ - virtual float64_t compute_p_value(float64_t statistic); - - /** computes a threshold based on current method for approximating the - * null-distribution. The threshold is the argument of the \f$1-\alpha\f$ - * quantile of the null. \f$\alpha\f$ is provided. - * - * @param alpha \f$\alpha\f$ quantile to get the threshold for - * @return threshold which is the \f$1-\alpha\f$ quantile of the null - * distribution - */ - virtual float64_t compute_threshold(float64_t alpha); - - /** Setter for joint features - * @param p_and_q joint features from p and q to set - */ - virtual void set_p_and_q(CFeatures* p_and_q); - - /** Getter for joint features, SG_REF'ed - * @return joint feature object - */ - virtual CFeatures* get_p_and_q(); - - /** @param m number of samples from first distribution p */ - void set_m(index_t m); - - /** @return number of to be used samples m */ - index_t get_m() { return m_m; } - - virtual const char* get_name() const=0; - -private: - void init(); - -protected: - /** concatenated samples of the two distributions (two blocks) */ - CFeatures* m_p_and_q; - - /** defines the first index of samples of q */ - index_t m_m; -}; - -} - -#endif /* TWO_SAMPLE_TEST_H_ */