From 193ae5613b80f58159cadbe1a1b18651bc32e57c Mon Sep 17 00:00:00 2001 From: lambday Date: Tue, 1 Jul 2014 20:08:51 +0530 Subject: [PATCH 1/2] Added class CKernelDependenceMaximization for feature selection --- .../KernelDependenceMaximization.cpp | 135 ++++++++++++++++++ .../KernelDependenceMaximization.h | 109 ++++++++++++++ 2 files changed, 244 insertions(+) create mode 100644 src/shogun/preprocessor/KernelDependenceMaximization.cpp create mode 100644 src/shogun/preprocessor/KernelDependenceMaximization.h diff --git a/src/shogun/preprocessor/KernelDependenceMaximization.cpp b/src/shogun/preprocessor/KernelDependenceMaximization.cpp new file mode 100644 index 00000000000..70c1fa5aa46 --- /dev/null +++ b/src/shogun/preprocessor/KernelDependenceMaximization.cpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) The Shogun Machine Learning Toolbox + * Written (w) 2014 Soumyajit De + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those + * of the authors and should not be interpreted as representing official policies, + * either expressed or implied, of the Shogun Development Team. + */ + +#include +#include +#include +#include +#include + +using namespace shogun; + +CKernelDependenceMaximization::CKernelDependenceMaximization() + : CDependenceMaximization() +{ + init(); +} + +void CKernelDependenceMaximization::init() +{ + SG_ADD((CSGObject**)&m_kernel_features, "kernel_features", + "the kernel to be used for features", MS_NOT_AVAILABLE); + SG_ADD((CSGObject**)&m_kernel_labels, "kernel_labels", + "the kernel to be used for labels", MS_NOT_AVAILABLE); + + m_kernel_features=NULL; + m_kernel_labels=NULL; +} + +CKernelDependenceMaximization::~CKernelDependenceMaximization() +{ + SG_UNREF(m_kernel_features); + SG_UNREF(m_kernel_labels); +} + +void CKernelDependenceMaximization::precompute() +{ + SG_DEBUG("Entering!\n"); + + REQUIRE(m_kernel_labels, "Kernel for labels is not initialized!\n"); + + // convert labels instance into a dense feature object and set it + // to the estimator + CDependenceMaximization::precompute(); + + // precompute the kernel for labels + m_kernel_labels->init(m_labels_feats, m_labels_feats); + CCustomKernel* precomputed + =new CCustomKernel(m_kernel_labels->get_kernel_matrix()); + + // replace the kernel for labels with precomputed kernel + 
SG_UNREF(m_kernel_labels); + m_kernel_labels=precomputed; + SG_REF(m_kernel_labels); + + // we can safely SG_UNREF the feature object for labels now + SG_UNREF(m_labels_feats); + m_labels_feats=NULL; + + // check proper cast set this kernel to the estimator + CKernelIndependenceTest* estimator + =dynamic_cast(m_estimator); + REQUIRE(estimator, "An instance of CKernelIndependenceTest expected!" + "Got an instance of %s instead!\n", estimator->get_name()); + + estimator->set_kernel_q(m_kernel_labels); + + SG_DEBUG("Leaving!\n"); +} + +void CKernelDependenceMaximization::set_kernel_features(CKernel* kernel) +{ + SG_REF(kernel); + SG_UNREF(m_kernel_features); + m_kernel_features=kernel; + + CKernelIndependenceTest* estimator + =dynamic_cast(m_estimator); + REQUIRE(estimator, "An instance of CKernelIndependenceTest expected!" + "Got an instance of %s instead!\n", estimator->get_name()); + + estimator->set_kernel_p(m_kernel_features); +} + +void CKernelDependenceMaximization::set_kernel_labels(CKernel* kernel) +{ + SG_REF(kernel); + SG_UNREF(m_kernel_labels); + m_kernel_labels=kernel; + + CKernelIndependenceTest* estimator=dynamic_cast + (m_estimator); + REQUIRE(estimator, "An instance of CKernelIndependenceTest expected!" + "Got an instance of %s instead!\n", estimator->get_name()); + + estimator->set_kernel_q(m_kernel_labels); +} + +CKernel* CKernelDependenceMaximization::get_kernel_features() const +{ + SG_REF(m_kernel_features); + return m_kernel_features; +} + +CKernel* CKernelDependenceMaximization::get_kernel_labels() const +{ + SG_REF(m_kernel_labels); + return m_kernel_labels; +} diff --git a/src/shogun/preprocessor/KernelDependenceMaximization.h b/src/shogun/preprocessor/KernelDependenceMaximization.h new file mode 100644 index 00000000000..717d8be6c86 --- /dev/null +++ b/src/shogun/preprocessor/KernelDependenceMaximization.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) The Shogun Machine Learning Toolbox + * Written (w) 2014 Soumyajit De + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those + * of the authors and should not be interpreted as representing official policies, + * either expressed or implied, of the Shogun Development Team. + */ + +#ifndef KERNEL_DEPENDENCE_MAXIMIZATION_H__ +#define KERNEL_DEPENDENCE_MAXIMIZATION_H__ + +#include +#include + +namespace shogun +{ + +class CFeatures; +class CKernelSelection; + +/** @brief Class CKernelDependenceMaximization, that uses an implementation + * of CKernelIndependenceTest to compute dependence measures for feature + * selection. Different kernels are used for labels and data. 
For the sake + * of computational convenience, the precompute() method is overridden to + * precompute the kernel for labels and save as an instance of CCustomKernel + * after converting the labels into a dense feature using + * CDependenceMaximization::precompute + */ +class CKernelDependenceMaximization : public CDependenceMaximization +{ +public: + /** Default constructor */ + CKernelDependenceMaximization(); + + /** Destructor */ + virtual ~CKernelDependenceMaximization(); + + /** @param kernel the kernel for features (data) */ + void set_kernel_features(CKernel* kernel); + + /** @return the kernel for features */ + CKernel* get_kernel_features() const; + + /** @param kernel the kernel for labels */ + void set_kernel_labels(CKernel* kernel); + + /** @return the kernel for labels */ + CKernel* get_kernel_labels() const; + + /** + * Abstract method which is overridden in the subclasses to set accepted + * feature selection algorithm + * + * @param algorithm the feature selection algorithm to use + */ + virtual void set_algorithm(EFeatureSelectionAlgorithm algorithm)=0; + + /** @return the class name */ + virtual const char* get_name() const + { + return "KernelDependenceMaximization"; + } + +protected: + /** + * Creates a dense feature object from the labels provided, m_labels and + * sets this feature in the independence test estimator via calling + * CDependenceMaximization::precompute. 
After this, it precomputes the + * kernel on labels and replaces the #m_kernel_labels with an instance of + * CCustomKernel + */ + virtual void precompute(); + + /** The kernel for data (features) to be used in CKernelIndependenceTest */ + CKernel* m_kernel_features; + + /** The kernel for labels to be used in CKernelIndependenceTest */ + CKernel* m_kernel_labels; + +private: + /** Register params and initialize with default values */ + void init(); + +}; + +} +#endif // KERNEL_DEPENDENCE_MAXIMIZATION_H__ From 97911c069fa6bc3aaef28098ffa9d30238db8fcd Mon Sep 17 00:00:00 2001 From: lambday Date: Thu, 3 Jul 2014 03:19:32 +0530 Subject: [PATCH 2/2] Added class CBAHSIC for feature selection --- src/interfaces/modular/Features.i | 1 + src/interfaces/modular/Preprocessor.i | 36 +++++ .../modular/Preprocessor_includes.i | 5 + src/interfaces/modular/Statistics.i | 2 + src/interfaces/modular/Statistics_includes.i | 1 + src/shogun/preprocessor/BAHSIC.cpp | 62 ++++++++ src/shogun/preprocessor/BAHSIC.h | 92 ++++++++++++ .../preprocessor/DependenceMaximization.cpp | 59 +++++--- .../preprocessor/DependenceMaximization.h | 19 ++- src/shogun/preprocessor/FeatureSelection.cpp | 3 + src/shogun/preprocessor/FeatureSelection.h | 37 +++-- .../KernelDependenceMaximization.cpp | 48 +++--- .../KernelDependenceMaximization.h | 10 +- src/shogun/preprocessor/Preprocessor.h | 3 +- tests/unit/preprocessor/BAHSIC_unittest.cc | 98 +++++++++++++ .../preprocessor/FeatureSelection_unittest.cc | 137 ++++++++++++++++++ 16 files changed, 538 insertions(+), 75 deletions(-) create mode 100644 src/shogun/preprocessor/BAHSIC.cpp create mode 100644 src/shogun/preprocessor/BAHSIC.h create mode 100644 tests/unit/preprocessor/BAHSIC_unittest.cc create mode 100644 tests/unit/preprocessor/FeatureSelection_unittest.cc diff --git a/src/interfaces/modular/Features.i b/src/interfaces/modular/Features.i index 93953caacd8..dfe3a1000d2 100644 --- a/src/interfaces/modular/Features.i +++ 
b/src/interfaces/modular/Features.i @@ -21,6 +21,7 @@ %newobject get_transposed(); %newobject create_merged_copy(CFeatures* other); %newobject copy_subset(SGVector indices); +%newobject copy_dimension_subset(SGVector indices); %newobject get_streamed_features(index_t num_elements); diff --git a/src/interfaces/modular/Preprocessor.i b/src/interfaces/modular/Preprocessor.i index 737011e551e..31ba0a9a493 100644 --- a/src/interfaces/modular/Preprocessor.i +++ b/src/interfaces/modular/Preprocessor.i @@ -27,6 +27,14 @@ %rename(SortUlongString) CSortUlongString; %rename(SortWordString) CSortWordString; +/* Feature selection framework */ +%rename(DependenceMaximization) CDependenceMaximization; +%rename(KernelDependenceMaximization) CKernelDependenceMaximization; +%rename(BAHSIC) CBAHSIC; + +%newobject shogun::CFeatureSelection::apply; +%newobject shogun::CFeatureSelection::remove_feats; + %newobject shogun::CKernelPCA::apply_to_string_features; /* Include Class Headers to make them visible from within the target language */ @@ -95,6 +103,31 @@ namespace shogun %template(DecompressCharString) CDecompressString; #endif } + +/* Templates Class FeatureSelection */ +%include +namespace shogun +{ +#ifdef USE_FLOAT64 + %template(RealFeatureSelection) CFeatureSelection; +#endif +#ifdef USE_UINT64 + %template(UlongFeatureSelection) CFeatureSelection; +#endif +#ifdef USE_UINT16 + %template(WordFeatureSelection) CFeatureSelection; +#endif +#ifdef USE_INT16 + %template(ShortFeatureSelection) CFeatureSelection; +#endif +#ifdef USE_UINT8 + %template(ByteFeatureSelection) CFeatureSelection; +#endif +#ifdef USE_CHAR + %template(CharFeatureSelection) CFeatureSelection; +#endif +} + %include %include %include @@ -111,3 +144,6 @@ namespace shogun %include %include +%include +%include +%include diff --git a/src/interfaces/modular/Preprocessor_includes.i b/src/interfaces/modular/Preprocessor_includes.i index 22fe9c65d53..81f1fd2e3b4 100644 --- a/src/interfaces/modular/Preprocessor_includes.i +++ 
b/src/interfaces/modular/Preprocessor_includes.i @@ -22,4 +22,9 @@ #include #include #include + +#include +#include +#include +#include %} diff --git a/src/interfaces/modular/Statistics.i b/src/interfaces/modular/Statistics.i index 382cece66c2..a2aacef7b15 100644 --- a/src/interfaces/modular/Statistics.i +++ b/src/interfaces/modular/Statistics.i @@ -18,6 +18,7 @@ %rename(KernelIndependenceTest) CKernelIndependenceTest; %rename(HSIC) CHSIC; %rename(KernelMeanMatching) CKernelMeanMatching; +%rename(KernelSelection) CKernelSelection; %rename(MMDKernelSelection) CMMDKernelSelection; %rename(MMDKernelSelectionComb) CMMDKernelSelectionComb; %rename(MMDKernelSelectionMedian) CMMDKernelSelectionMedian; @@ -38,6 +39,7 @@ %include %include %include +%include %include %include %include diff --git a/src/interfaces/modular/Statistics_includes.i b/src/interfaces/modular/Statistics_includes.i index adfaa958ef7..86dc2b91357 100644 --- a/src/interfaces/modular/Statistics_includes.i +++ b/src/interfaces/modular/Statistics_includes.i @@ -9,6 +9,7 @@ #include #include #include + #include #include #include #include diff --git a/src/shogun/preprocessor/BAHSIC.cpp b/src/shogun/preprocessor/BAHSIC.cpp new file mode 100644 index 00000000000..cc5615d92ca --- /dev/null +++ b/src/shogun/preprocessor/BAHSIC.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) The Shogun Machine Learning Toolbox + * Written (w) 2014 Soumyajit De + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those + * of the authors and should not be interpreted as representing official policies, + * either expressed or implied, of the Shogun Development Team. + */ + +#include +#include + +using namespace shogun; + +CBAHSIC::CBAHSIC() : CKernelDependenceMaximization() +{ + init(); +} + +void CBAHSIC::init() +{ + m_estimator=new CHSIC(); + SG_REF(m_estimator); + m_algorithm=BACKWARD_ELIMINATION; +} + +CBAHSIC::~CBAHSIC() +{ + // estimator is SG_UNREF'ed in base CDependenceMaximization destructor +} + +void CBAHSIC::set_algorithm(EFeatureSelectionAlgorithm algorithm) +{ + SG_INFO("Algorithm is set to BACKWARD_ELIMINATION for %s and therefore " + "cannot be set externally!\n", get_name()); +} + +EPreprocessorType CBAHSIC::get_type() const +{ + return P_BAHSIC; +} diff --git a/src/shogun/preprocessor/BAHSIC.h b/src/shogun/preprocessor/BAHSIC.h new file mode 100644 index 00000000000..cc8243fc726 --- /dev/null +++ b/src/shogun/preprocessor/BAHSIC.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) The Shogun Machine Learning Toolbox + * Written (w) 2014 Soumyajit De + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those + * of the authors and should not be interpreted as representing official policies, + * either expressed or implied, of the Shogun Development Team. + */ + +#ifndef BAHSIC_H__ +#define BAHSIC_H__ + +#include +#include + +namespace shogun +{ + +/** @brief Class CBAHSIC, that extends CKernelDependenceMaximization and uses + * HSIC [1] to compute dependence measures for feature selection using a + * backward elimination approach as described in [1]. 
This class serves as a + * convenience class that initializes the CDependenceMaximization#m_estimator + * with an instance of CHSIC and allows only ::BACKWARD_ELIMINATION algorithm + * to use which is set internally. Therefore, trying to use other algorithms + * by set_algorithm() will not work. Please see the class documentation of CHSIC + * and [2] for more details on mathematical description of HSIC. + * + * References: + * [1] Song, Le and Bedo, Justin and Borgwardt, Karsten M. and Gretton, Arthur + * and Smola, Alex. (2007). Gene Selection via the BAHSIC Family of Algorithms. + * Journal Bioinformatics. Volume 23 Issue 13 Pages i490-i498. Oxford University + * Press Oxford, UK + * [2] Gretton, A., Fukumizu, K., Teo, C., & Song, L. (2008). A kernel + * statistical test of independence. Advances in Neural Information Processing + * Systems, 1-8. + */ +class CBAHSIC : public CKernelDependenceMaximization +{ +public: + /** Default constructor */ + CBAHSIC(); + + /** Destructor */ + virtual ~CBAHSIC(); + + /** + * Since only ::BACKWARD_ELIMINATION algorithm is applicable for BAHSIC, + * and this is set internally, this method is overridden to prevent this + * to be set from public API. 
+ * + * @param algorithm the feature selection algorithm to use + */ + virtual void set_algorithm(EFeatureSelectionAlgorithm algorithm); + + /** @return the preprocessor type */ + virtual EPreprocessorType get_type() const; + + /** @return the class name */ + virtual const char* get_name() const + { + return "BAHSIC"; + } + +private: + /** Register params and initialize with default values */ + void init(); + +}; + +} +#endif // BAHSIC_H__ diff --git a/src/shogun/preprocessor/DependenceMaximization.cpp b/src/shogun/preprocessor/DependenceMaximization.cpp index caad555a9a2..b2bc24866f7 100644 --- a/src/shogun/preprocessor/DependenceMaximization.cpp +++ b/src/shogun/preprocessor/DependenceMaximization.cpp @@ -74,36 +74,18 @@ bool CDependenceMaximization::init(CFeatures* features) return true; } -void CDependenceMaximization::precompute() -{ - SG_DEBUG("Entering!\n"); - - REQUIRE(m_labels, "Labels are not initialized!\n"); - REQUIRE(m_estimator, "Estimator is not initialized!\n"); - - // convert the CLabels object to CDenseFeatures - SG_UNREF(m_labels_feats); - - SGMatrix labels_matrix(1, m_labels->get_num_labels()); - for (index_t i=0; iget_value(i); - - m_labels_feats=new CDenseFeatures(labels_matrix); - SG_REF(m_labels_feats); - - // and set this to the estimator - m_estimator->set_q(m_labels_feats); - - SG_DEBUG("Leaving!\n"); -} - CFeatures* CDependenceMaximization::create_transformed_copy(CFeatures* features, index_t idx) { SG_DEBUG("Entering!\n"); // remove the dimension specified by the index, i.e. 
get X\X_i + // NULL check is handled in CFeatureSelection::get_num_features call index_t num_features=get_num_features(features); + REQUIRE(num_features>idx, "Specified dimension to remove (%d) is greater " + "than the total number of current features (%d)!\n", + idx, num_features); + SGVector dims(num_features-1); index_t n_dims=0; for (index_t i=0; i inds(argsorted.vlen-threshold); @@ -186,3 +177,23 @@ void CDependenceMaximization::set_policy(EFeatureRemovalPolicy policy) "with %s!\n", get_name()); m_policy=policy; } + +void CDependenceMaximization::set_labels(CLabels* labels) +{ + // NULL check is handled in base class CFeatureSelection + CFeatureSelection::set_labels(labels); + + // convert the CLabels object to CDenseFeatures + SG_UNREF(m_labels_feats); + + SGMatrix labels_matrix(1, m_labels->get_num_labels()); + for (index_t i=0; iget_value(i); + + m_labels_feats=new CDenseFeatures(labels_matrix); + SG_REF(m_labels_feats); + + // we need to set this to the estimator which is set internally + ASSERT(m_estimator); + m_estimator->set_q(m_labels_feats); +} diff --git a/src/shogun/preprocessor/DependenceMaximization.h b/src/shogun/preprocessor/DependenceMaximization.h index 8ec8ee5b00e..eb54ca8e098 100644 --- a/src/shogun/preprocessor/DependenceMaximization.h +++ b/src/shogun/preprocessor/DependenceMaximization.h @@ -60,7 +60,7 @@ class CIndependenceTest; * The estimator cannot be set via user interface, rather its subclasses * initialize this estimator with appropriate instances internally. * - * This class also overrides precompute() method to create a feature object from + * This class also overrides set_labels() method to create a feature object from * the labels and sets this as features \f$\mathbf{Y}\sim q\f$ to the estimator * which is required to compute the measure. */ @@ -119,6 +119,16 @@ class CDependenceMaximization : public CFeatureSelection */ virtual bool init(CFeatures* features); + /** + * Setter for labels. 
This method is overridden to internally convert the + * labels to a dense feature object and set this feature in the + * independence test estimator. These labels serve as samples + * \f$\mathbf{Y}\sim q\f$ in the independence test + * + * @param labels the labels + */ + virtual void set_labels(CLabels* labels); + /** @return the class name */ virtual const char* get_name() const { @@ -137,13 +147,6 @@ class CDependenceMaximization : public CFeatureSelection */ virtual CFeatures* create_transformed_copy(CFeatures* features, index_t idx); - /** - * Creates a dense feature object from the labels provided, #m_labels and - * sets this feature in the independence test estimator. These labels serve - * as samples \f$\mathbf{Y}\sim q\f$ in the independence test - */ - virtual void precompute(); - /** * The estimator for performing statistical tests for independence which * is used for computing measures diff --git a/src/shogun/preprocessor/FeatureSelection.cpp b/src/shogun/preprocessor/FeatureSelection.cpp index 7e200daa280..30e0e25ce41 100644 --- a/src/shogun/preprocessor/FeatureSelection.cpp +++ b/src/shogun/preprocessor/FeatureSelection.cpp @@ -84,6 +84,7 @@ CFeatures* CFeatureSelection::apply_backward_elimination(CFeatures* features // precompute whenever appropriate for performing the rest of the tasks precompute(); + // NULL check for features is handled in get_num_features index_t num_features=get_num_features(features); SG_DEBUG("Initial number of features %d!\n", num_features); @@ -208,6 +209,8 @@ void CFeatureSelection::adapt_params(CFeatures* features) template index_t CFeatureSelection::get_num_features(CFeatures* features) const { + REQUIRE(features, "Features not initialized!\n"); + EFeatureClass f_class=features->get_feature_class(); switch (f_class) diff --git a/src/shogun/preprocessor/FeatureSelection.h b/src/shogun/preprocessor/FeatureSelection.h index 18ee8af72f8..677fb1cfee1 100644 --- a/src/shogun/preprocessor/FeatureSelection.h +++ 
b/src/shogun/preprocessor/FeatureSelection.h @@ -109,7 +109,7 @@ enum EFeatureRemovalPolicy * * Note that not all policies can be adapted for a specific feature seleciton * approaches. In general, in classes where feature selection is performed by - * removing the features which corresponds to lowest measure, the policy + * removing the features which correspond to lowest measure, the policies * ::N_SMALLEST and ::PERCENTILE_SMALLEST are appropriate. When features * corresponding to highest measures are removed (e.g. training error in a * cross-validation scenario), ::N_LARGEST and ::PERCENTILE_LARGEST are @@ -149,17 +149,6 @@ template class CFeatureSelection : public CPreprocessor */ virtual CFeatures* apply(CFeatures* features); - /** - * Applies backward elimination algorithm for performing feature selection. - * After performing necessary precomputing (defined by subclasses), it - * iteratively eliminates a number of features based on a measure until - * target dimension is reached. - * - * @param features the input features - * @return the result feature object after applying the preprocessor - */ - virtual CFeatures* apply_backward_elimination(CFeatures* features); - /** * Abstract method that is defined in the subclasses to compute the * measures for the provided features based on which feature selection @@ -234,8 +223,17 @@ template class CFeatureSelection : public CPreprocessor /** @return number or percentage of features removed in each iteration */ index_t get_num_remove() const; - /** @param the labels */ - void set_labels(CLabels* labels); + /** + * Setter for labels. This method may be overridden in subclasses if + * necessary to set some additional parameters associated with it. For + * example, in CDependenceMaximization, we need a feature instance of + * the labels which is used in the estimator. 
So this method is overridden + * there to internally convert the labels to a dense feature object in + * this set call only + * + * @param labels the labels + */ + virtual void set_labels(CLabels* labels); /** @return the labels */ CLabels* get_labels() const; @@ -250,6 +248,17 @@ template class CFeatureSelection : public CPreprocessor } protected: + /** + * Applies backward elimination algorithm for performing feature selection. + * After performing necessary precomputing (defined by subclasses), it + * iteratively eliminates a number of features based on a measure until + * target dimension is reached. + * + * @param features the input features + * @return the result feature object after applying the preprocessor + */ + virtual CFeatures* apply_backward_elimination(CFeatures* features); + /** * Performs the tasks which can be computed beforehand before the actual * algorithm begins. This method is overridden in the subclasses. Here diff --git a/src/shogun/preprocessor/KernelDependenceMaximization.cpp b/src/shogun/preprocessor/KernelDependenceMaximization.cpp index 70c1fa5aa46..bfd5f38c585 100644 --- a/src/shogun/preprocessor/KernelDependenceMaximization.cpp +++ b/src/shogun/preprocessor/KernelDependenceMaximization.cpp @@ -28,8 +28,6 @@ * either expressed or implied, of the Shogun Development Team. 
*/ -#include -#include #include #include #include @@ -63,11 +61,20 @@ void CKernelDependenceMaximization::precompute() { SG_DEBUG("Entering!\n"); + REQUIRE(m_labels_feats, "Features for labels is not initialized!\n"); REQUIRE(m_kernel_labels, "Kernel for labels is not initialized!\n"); - // convert labels instance into a dense feature object and set it - // to the estimator - CDependenceMaximization::precompute(); + // ASSERT here because the estimator is set internally and cannot + // be set via public API + ASSERT(m_estimator); + + CFeatureSelection::precompute(); + + // make sure that we have an instance of CKernelIndependenceTest via + // proper cast and set this kernel to the estimator + CKernelIndependenceTest* estimator + =dynamic_cast(m_estimator); + ASSERT(estimator); // precompute the kernel for labels m_kernel_labels->init(m_labels_feats, m_labels_feats); @@ -83,12 +90,7 @@ void CKernelDependenceMaximization::precompute() SG_UNREF(m_labels_feats); m_labels_feats=NULL; - // check proper cast set this kernel to the estimator - CKernelIndependenceTest* estimator - =dynamic_cast(m_estimator); - REQUIRE(estimator, "An instance of CKernelIndependenceTest expected!" - "Got an instance of %s instead!\n", estimator->get_name()); - + // finally set this as kernel for the labels estimator->set_kernel_q(m_kernel_labels); SG_DEBUG("Leaving!\n"); @@ -96,29 +98,33 @@ void CKernelDependenceMaximization::precompute() void CKernelDependenceMaximization::set_kernel_features(CKernel* kernel) { + // sanity check. using assert here because estimator instances are + // set internally and cannot be set via public API. + ASSERT(m_estimator); + CKernelIndependenceTest* estimator + =dynamic_cast(m_estimator); + ASSERT(estimator); + SG_REF(kernel); SG_UNREF(m_kernel_features); m_kernel_features=kernel; - CKernelIndependenceTest* estimator - =dynamic_cast(m_estimator); - REQUIRE(estimator, "An instance of CKernelIndependenceTest expected!" 
- "Got an instance of %s instead!\n", estimator->get_name()); - estimator->set_kernel_p(m_kernel_features); } void CKernelDependenceMaximization::set_kernel_labels(CKernel* kernel) { + // sanity check. using assert here because estimator instances are + // set internally and cannot be set via public API. + ASSERT(m_estimator); + CKernelIndependenceTest* estimator + =dynamic_cast(m_estimator); + ASSERT(estimator); + SG_REF(kernel); SG_UNREF(m_kernel_labels); m_kernel_labels=kernel; - CKernelIndependenceTest* estimator=dynamic_cast - (m_estimator); - REQUIRE(estimator, "An instance of CKernelIndependenceTest expected!" - "Got an instance of %s instead!\n", estimator->get_name()); - estimator->set_kernel_q(m_kernel_labels); } diff --git a/src/shogun/preprocessor/KernelDependenceMaximization.h b/src/shogun/preprocessor/KernelDependenceMaximization.h index 717d8be6c86..88f7340a564 100644 --- a/src/shogun/preprocessor/KernelDependenceMaximization.h +++ b/src/shogun/preprocessor/KernelDependenceMaximization.h @@ -45,8 +45,6 @@ class CKernelSelection; * selection. Different kernels are used for labels and data. For the sake * of computational convenience, the precompute() method is overridden to * precompute the kernel for labels and save as an instance of CCustomKernel - * after converting the labels into a dense feature using - * CDependenceMaximization::precompute */ class CKernelDependenceMaximization : public CDependenceMaximization { @@ -85,11 +83,9 @@ class CKernelDependenceMaximization : public CDependenceMaximization protected: /** - * Creates a dense feature object from the labels provided, m_labels and - * sets this feature in the independence test estimator via calling - * CDependenceMaximization::precompute. After this, it precomputes the - * kernel on labels and replaces the #m_kernel_labels with an instance of - * CCustomKernel + * Precomputes the kernel on labels and replaces the #m_kernel_labels + * with an instance of CCustomKernel. 
Labels features are set via + * CDependenceMaximization::set_labels call. */ virtual void precompute(); diff --git a/src/shogun/preprocessor/Preprocessor.h b/src/shogun/preprocessor/Preprocessor.h index 20eab72f8bd..a3fd64467a4 100644 --- a/src/shogun/preprocessor/Preprocessor.h +++ b/src/shogun/preprocessor/Preprocessor.h @@ -52,7 +52,8 @@ enum EPreprocessorType P_HOMOGENEOUSKERNELMAP = 180, P_PNORM = 190, P_RESCALEFEATURES = 200, - P_FISHERLDA = 210 + P_FISHERLDA = 210, + P_BAHSIC = 220 }; /** @brief Class Preprocessor defines a preprocessor interface. diff --git a/tests/unit/preprocessor/BAHSIC_unittest.cc b/tests/unit/preprocessor/BAHSIC_unittest.cc new file mode 100644 index 00000000000..ee1d37e3f66 --- /dev/null +++ b/tests/unit/preprocessor/BAHSIC_unittest.cc @@ -0,0 +1,98 @@ +/* + * Copyright (c) The Shogun Machine Learning Toolbox + * Written (w) 2014 Soumyajit De + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those + * of the authors and should not be interpreted as representing official policies, + * either expressed or implied, of the Shogun Development Team. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace shogun; + +TEST(BAHSIC, apply) +{ + const index_t dim=8; + const index_t num_data=5; + + // use fix seed for reproducibility + CMath::init_random(1); + + SGMatrix data(dim, num_data); + for (index_t i=0; i labels_vec(num_data); + for (index_t i=0; i* feats=new CDenseFeatures(data); + CBinaryLabels* labels=new CBinaryLabels(labels_vec); + float64_t sigma=1.0; + CGaussianKernel* kernel_p=new CGaussianKernel(10, 2*CMath::sq(sigma)); + CGaussianKernel* kernel_q=new CGaussianKernel(10, 2*CMath::sq(sigma)); + + CBAHSIC* fs=new CBAHSIC(); + index_t target_dim=dim/2; + fs->set_labels(labels); + fs->set_target_dim(target_dim); + fs->set_kernel_features(kernel_p); + fs->set_kernel_labels(kernel_q); + fs->set_policy(N_SMALLEST); + fs->set_num_remove(dim-target_dim); + CFeatures* selected=fs->apply(feats); + + SGMatrix selected_data + =((CDenseFeatures*)selected)->get_feature_matrix(); + + // ensure that the selected number of features is indeed equal to the + // target dimension + EXPECT_EQ(selected_data.num_rows, target_dim); + + // ensure that selected feats are the same as computed in local machine 
+ SGVector inds(target_dim); + inds[0]=2; + inds[1]=5; + inds[2]=6; + inds[3]=7; + + for (index_t i=0; i +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace shogun; + +TEST(FeatureSelection, remove_feats) +{ + const index_t dim=8; + const index_t num_data=5; + + // use fix seed for reproducibility + CMath::init_random(1); + + SGMatrix data(dim, num_data); + for (index_t i=0; i* feats=new CDenseFeatures(data); + + CBAHSIC* fs=new CBAHSIC(); + index_t target_dim=dim/2; + fs->set_num_remove(dim-target_dim); + + // create a dummy argsorted vector to remove first dim/2 features + SGVector argsorted(dim); + argsorted.range_fill(); + + CFeatures* reduced=fs->remove_feats(feats, argsorted); + SGMatrix reduced_data + =((CDenseFeatures*)reduced)->get_feature_matrix(); + + for (index_t i=0; i data(dim, num_data); + for (index_t i=0; i labels_vec(num_data); + for (index_t i=0; i* feats=new CDenseFeatures(data); + CBinaryLabels* labels=new CBinaryLabels(labels_vec); + float64_t sigma=1.0; + CGaussianKernel* kernel_p=new CGaussianKernel(10, 2*CMath::sq(sigma)); + CGaussianKernel* kernel_q=new CGaussianKernel(10, 2*CMath::sq(sigma)); + + // SG_REF'ing the kernel for q because it is SG_UNREF'ed in precompute + // call and to replace by a CCustomKernel + SG_REF(kernel_q); + + CBAHSIC* fs=new CBAHSIC(); + fs->set_labels(labels); + fs->set_kernel_features(kernel_p); + fs->set_kernel_labels(kernel_q); + + // compute the measure removing dimension 0 + float64_t measure=fs->compute_measures(feats, 0); + + // recreate this using HSIC + SGVector inds(dim-1); + for (index_t i=0; icopy_dimension_subset(inds); + + SGMatrix l_data(1, num_data); + memcpy(l_data.matrix, labels_vec.vector, sizeof(float64_t)*num_data); + CDenseFeatures* l_feats=new CDenseFeatures(l_data); + + CHSIC* hsic=new CHSIC(); + hsic->set_p(transformed); + hsic->set_q(l_feats); + hsic->set_kernel_p(kernel_p); + hsic->set_kernel_q(kernel_q); + + EXPECT_NEAR(measure, 
hsic->compute_statistic(), 1E-15); + + SG_UNREF(fs); + SG_UNREF(hsic); + SG_UNREF(kernel_q); + SG_UNREF(feats); + SG_UNREF(transformed); +}