Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge pull request #2363 from lambday/feature/selection
Added CKernelDependenceMaximization and CBAHSIC in feature selection framework
  • Loading branch information
karlnapf committed Jul 5, 2014
2 parents b95d9a7 + 97911c0 commit 80287f4
Show file tree
Hide file tree
Showing 16 changed files with 754 additions and 47 deletions.
1 change: 1 addition & 0 deletions src/interfaces/modular/Features.i
Expand Up @@ -21,6 +21,7 @@
%newobject get_transposed();
%newobject create_merged_copy(CFeatures* other);
%newobject copy_subset(SGVector<index_t> indices);
%newobject copy_dimension_subset(SGVector<index_t> indices);
%newobject get_streamed_features(index_t num_elements);


Expand Down
36 changes: 36 additions & 0 deletions src/interfaces/modular/Preprocessor.i
Expand Up @@ -27,6 +27,14 @@
%rename(SortUlongString) CSortUlongString;
%rename(SortWordString) CSortWordString;

/* Feature selection framework: expose each class in the target language
 * without its leading "C". Every directive must name its own C++ class;
 * KernelDependenceMaximization maps to CKernelDependenceMaximization, not to
 * its base class CDependenceMaximization. */
%rename(DependenceMaximization) CDependenceMaximization;
%rename(KernelDependenceMaximization) CKernelDependenceMaximization;
%rename(BAHSIC) CBAHSIC;

%newobject shogun::CFeatureSelection::apply;
%newobject shogun::CFeatureSelection::remove_feats;

%newobject shogun::CKernelPCA::apply_to_string_features;

/* Include Class Headers to make them visible from within the target language */
Expand Down Expand Up @@ -95,6 +103,31 @@ namespace shogun
%template(DecompressCharString) CDecompressString<char>;
#endif
}

/* Template instantiations of class CFeatureSelection.
 * The class header is included first, then one %template per element type.
 * Each instantiation is emitted only when the matching USE_* macro is
 * defined. */
%include <shogun/preprocessor/FeatureSelection.h>
namespace shogun
{
#ifdef USE_FLOAT64
%template(RealFeatureSelection) CFeatureSelection<float64_t>;
#endif
#ifdef USE_UINT64
%template(UlongFeatureSelection) CFeatureSelection<uint64_t>;
#endif
#ifdef USE_UINT16
%template(WordFeatureSelection) CFeatureSelection<uint16_t>;
#endif
#ifdef USE_INT16
%template(ShortFeatureSelection) CFeatureSelection<int16_t>;
#endif
#ifdef USE_UINT8
%template(ByteFeatureSelection) CFeatureSelection<uint8_t>;
#endif
#ifdef USE_CHAR
%template(CharFeatureSelection) CFeatureSelection<char>;
#endif
}

%include <shogun/preprocessor/SparsePreprocessor.h>
%include <shogun/preprocessor/NormOne.h>
%include <shogun/preprocessor/SumOne.h>
Expand All @@ -111,3 +144,6 @@ namespace shogun
%include <shogun/preprocessor/SortUlongString.h>
%include <shogun/preprocessor/SortWordString.h>

%include <shogun/preprocessor/DependenceMaximization.h>
%include <shogun/preprocessor/KernelDependenceMaximization.h>
%include <shogun/preprocessor/BAHSIC.h>
5 changes: 5 additions & 0 deletions src/interfaces/modular/Preprocessor_includes.i
Expand Up @@ -22,4 +22,9 @@
#include <shogun/preprocessor/DecompressString.h>
#include <shogun/preprocessor/SortUlongString.h>
#include <shogun/preprocessor/SortWordString.h>

#include <shogun/preprocessor/FeatureSelection.h>
#include <shogun/preprocessor/DependenceMaximization.h>
#include <shogun/preprocessor/KernelDependenceMaximization.h>
#include <shogun/preprocessor/BAHSIC.h>
%}
2 changes: 2 additions & 0 deletions src/interfaces/modular/Statistics.i
Expand Up @@ -18,6 +18,7 @@
%rename(KernelIndependenceTest) CKernelIndependenceTest;
%rename(HSIC) CHSIC;
%rename(KernelMeanMatching) CKernelMeanMatching;
%rename(KernelSelection) CKernelSelection;
%rename(MMDKernelSelection) CMMDKernelSelection;
%rename(MMDKernelSelectionComb) CMMDKernelSelectionComb;
%rename(MMDKernelSelectionMedian) CMMDKernelSelectionMedian;
Expand All @@ -38,6 +39,7 @@
%include <shogun/statistics/KernelIndependenceTest.h>
%include <shogun/statistics/HSIC.h>
%include <shogun/statistics/KernelMeanMatching.h>
%include <shogun/statistics/KernelSelection.h>
%include <shogun/statistics/MMDKernelSelection.h>
%include <shogun/statistics/MMDKernelSelectionComb.h>
%include <shogun/statistics/MMDKernelSelectionMedian.h>
Expand Down
1 change: 1 addition & 0 deletions src/interfaces/modular/Statistics_includes.i
Expand Up @@ -9,6 +9,7 @@
#include <shogun/statistics/KernelIndependenceTest.h>
#include <shogun/statistics/HSIC.h>
#include <shogun/statistics/KernelMeanMatching.h>
#include <shogun/statistics/KernelSelection.h>
#include <shogun/statistics/MMDKernelSelection.h>
#include <shogun/statistics/MMDKernelSelectionComb.h>
#include <shogun/statistics/MMDKernelSelectionMedian.h>
Expand Down
62 changes: 62 additions & 0 deletions src/shogun/preprocessor/BAHSIC.cpp
@@ -0,0 +1,62 @@
/*
* Copyright (c) The Shogun Machine Learning Toolbox
* Written (w) 2014 Soumyajit De
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the Shogun Development Team.
*/

#include <shogun/statistics/HSIC.h>
#include <shogun/preprocessor/BAHSIC.h>

using namespace shogun;

/** Default constructor; all member setup is delegated to init() */
CBAHSIC::CBAHSIC() : CKernelDependenceMaximization()
{
init();
}

void CBAHSIC::init()
{
m_estimator=new CHSIC();
SG_REF(m_estimator);
m_algorithm=BACKWARD_ELIMINATION;
}

/** Destructor; intentionally empty, nothing is released here */
CBAHSIC::~CBAHSIC()
{
// estimator is SG_UNREF'ed in base CDependenceMaximization destructor
}

/**
 * Deliberately a no-op with respect to state: the argument is never read and
 * m_algorithm is not modified here. Only an informational message is logged.
 *
 * @param algorithm the requested feature selection algorithm (ignored)
 */
void CBAHSIC::set_algorithm(EFeatureSelectionAlgorithm algorithm)
{
SG_INFO("Algorithm is set to BACKWARD_ELIMINATION for %s and therefore "
"cannot be set externally!\n", get_name());
}

/** @return the preprocessor type, which is always P_BAHSIC */
EPreprocessorType CBAHSIC::get_type() const
{
return P_BAHSIC;
}
92 changes: 92 additions & 0 deletions src/shogun/preprocessor/BAHSIC.h
@@ -0,0 +1,92 @@
/*
* Copyright (c) The Shogun Machine Learning Toolbox
* Written (w) 2014 Soumyajit De
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the Shogun Development Team.
*/

#ifndef BAHSIC_H__
#define BAHSIC_H__

#include <shogun/lib/config.h>
#include <shogun/preprocessor/KernelDependenceMaximization.h>

namespace shogun
{

/** @brief Class CBAHSIC, that extends CKernelDependenceMaximization and uses
* HSIC [2] to compute dependence measures for feature selection using a
* backward elimination approach as described in [1]. This class serves as a
* convenience class that initializes the CDependenceMaximization#m_estimator
* with an instance of CHSIC and only allows the ::BACKWARD_ELIMINATION
* algorithm, which is set internally. Therefore, trying to select another
* algorithm via set_algorithm() has no effect. Please see the class
* documentation of CHSIC and [2] for more details on the mathematical
* description of HSIC.
*
* References:
* [1] Song, Le and Bedo, Justin and Borgwardt, Karsten M. and Gretton, Arthur
* and Smola, Alex. (2007). Gene Selection via the BAHSIC Family of Algorithms.
* Bioinformatics, 23(13), i490-i498. Oxford University Press, Oxford, UK.
* [2] Gretton, A., Fukumizu, K., Teo, C., & Song, L. (2008). A kernel
* statistical test of independence. Advances in Neural Information Processing
* Systems, 1-8.
*/
class CBAHSIC : public CKernelDependenceMaximization
{
public:
/** Default constructor */
CBAHSIC();

/** Destructor */
virtual ~CBAHSIC();

/**
* Since only the ::BACKWARD_ELIMINATION algorithm is applicable for BAHSIC,
* and this is set internally, this method is overridden to prevent the
* algorithm from being changed through the public API. The argument is
* ignored and an informational message is logged instead.
*
* @param algorithm the feature selection algorithm to use (ignored)
*/
virtual void set_algorithm(EFeatureSelectionAlgorithm algorithm);

/** @return the preprocessor type */
virtual EPreprocessorType get_type() const;

/** @return the class name */
virtual const char* get_name() const
{
return "BAHSIC";
}

private:
/**
* Initialize the estimator with a new CHSIC instance and set the algorithm
* to ::BACKWARD_ELIMINATION
*/
void init();

};

}
#endif // BAHSIC_H__
59 changes: 35 additions & 24 deletions src/shogun/preprocessor/DependenceMaximization.cpp
Expand Up @@ -74,36 +74,18 @@ bool CDependenceMaximization::init(CFeatures* features)
return true;
}

void CDependenceMaximization::precompute()
{
SG_DEBUG("Entering!\n");

REQUIRE(m_labels, "Labels are not initialized!\n");
REQUIRE(m_estimator, "Estimator is not initialized!\n");

// convert the CLabels object to CDenseFeatures
SG_UNREF(m_labels_feats);

SGMatrix<float64_t> labels_matrix(1, m_labels->get_num_labels());
for (index_t i=0; i<labels_matrix.num_cols; ++i)
labels_matrix.matrix[i]=m_labels->get_value(i);

m_labels_feats=new CDenseFeatures<float64_t>(labels_matrix);
SG_REF(m_labels_feats);

// and set this to the estimator
m_estimator->set_q(m_labels_feats);

SG_DEBUG("Leaving!\n");
}

CFeatures* CDependenceMaximization::create_transformed_copy(CFeatures* features,
index_t idx)
{
SG_DEBUG("Entering!\n");

// remove the dimension specified by the index, i.e. get X\X_i
// NULL check is handled in CFeatureSelection::get_num_features call
index_t num_features=get_num_features(features);
REQUIRE(num_features>idx, "Specified dimension to remove (%d) is greater "
"than the total number of current features (%d)!\n",
idx, num_features);

SGVector<index_t> dims(num_features-1);
index_t n_dims=0;
for (index_t i=0; i<num_features; ++i)
Expand All @@ -127,6 +109,7 @@ float64_t CDependenceMaximization::compute_measures(CFeatures* features,

// remove the dimension (feat) specified by the index idx
CFeatures* reduced_feats=create_transformed_copy(features, idx);
ASSERT(reduced_feats);

// perform an independence test for X\X_i ~ p and Y ~ q with
// H_0: P(X\X_i, Y) = P(X\X_i) * P(Y)
Expand All @@ -153,14 +136,22 @@ CFeatures* CDependenceMaximization::remove_feats(CFeatures* features,
REQUIRE(m_policy==N_SMALLEST || m_policy==PERCENTILE_SMALLEST,
"Only N_SMALLEST and PERCENTILE_SMALLEST removal policy can work "
"with %s!\n", get_name());
REQUIRE(features, "Features is not intialized!\n");
REQUIRE(argsorted.vector, "The argsorted vector is not initialized!\n");
// The message has two %d specifiers, so it needs two arguments: first the
// expected length (the number of features), then the actual length of the
// argsorted vector. Passing fewer arguments than specifiers is undefined
// behaviour in printf-style formatting.
REQUIRE(get_num_features(features)==argsorted.vlen,
	"argsorted vector should be equal to the number of features (%d)! "
	"But it was %d!\n", get_num_features(features), argsorted.vlen);

// compute a threshold to remove for both the policies
index_t threshold=m_num_remove;
if (m_policy==PERCENTILE_SMALLEST)
threshold*=argsorted.vlen*0.01;

// make sure that the threshold is valid given the current number of feats
ASSERT(threshold<argsorted.vlen)
REQUIRE(threshold<argsorted.vlen, "The threshold of removal is too high "
"(asked to remove %d features out of %d)! Please use a smaller "
"number for removal using set_num_remove() call",
threshold, argsorted.vlen);

// remove the lowest threshold rank holders by storing indices
SGVector<index_t> inds(argsorted.vlen-threshold);
Expand All @@ -186,3 +177,23 @@ void CDependenceMaximization::set_policy(EFeatureRemovalPolicy policy)
"with %s!\n", get_name());
m_policy=policy;
}

/**
 * Stores the labels through the base class, then mirrors them into a dense
 * feature object (one row, one column per label) which is passed to the
 * estimator via set_q().
 *
 * @param labels the labels
 */
void CDependenceMaximization::set_labels(CLabels* labels)
{
	// NULL check is handled in base class CFeatureSelection
	CFeatureSelection::set_labels(labels);

	// release the feature object built from any earlier labels
	SG_UNREF(m_labels_feats);

	// copy every label value into a 1 x num_labels matrix
	const index_t num_labels=m_labels->get_num_labels();
	SGMatrix<float64_t> label_row(1, num_labels);
	for (index_t col=0; col<num_labels; ++col)
		label_row.matrix[col]=m_labels->get_value(col);

	m_labels_feats=new CDenseFeatures<float64_t>(label_row);
	SG_REF(m_labels_feats);

	// the estimator is set internally, so it must exist at this point
	ASSERT(m_estimator);
	m_estimator->set_q(m_labels_feats);
}
19 changes: 11 additions & 8 deletions src/shogun/preprocessor/DependenceMaximization.h
Expand Up @@ -60,7 +60,7 @@ class CIndependenceTest;
* The estimator cannot be set via user interface, rather its subclasses
* initialize this estimator with appropriate instances internally.
*
* This class also overrides precompute() method to create a feature object from
* This class also overrides set_labels() method to create a feature object from
* the labels and sets this as features \f$\mathbf{Y}\sim q\f$ to the estimator
* which is required to compute the measure.
*/
Expand Down Expand Up @@ -119,6 +119,16 @@ class CDependenceMaximization : public CFeatureSelection<float64_t>
*/
virtual bool init(CFeatures* features);

/**
* Setter for labels. This method is overridden to internally convert the
* labels to a dense feature object (a single row with one column per label)
* and set this feature in the independence test estimator. These labels
* serve as samples \f$\mathbf{Y}\sim q\f$ in the independence test. The
* estimator must already be set when this method is called.
*
* @param labels the labels
*/
virtual void set_labels(CLabels* labels);

/** @return the class name */
virtual const char* get_name() const
{
Expand All @@ -137,13 +147,6 @@ class CDependenceMaximization : public CFeatureSelection<float64_t>
*/
virtual CFeatures* create_transformed_copy(CFeatures* features, index_t idx);

/**
* Creates a dense feature object from the labels provided, #m_labels and
* sets this feature in the independence test estimator. These labels serve
* as samples \f$\mathbf{Y}\sim q\f$ in the independence test
*/
virtual void precompute();

/**
* The estimator for performing statistical tests for independence which
* is used for computing measures
Expand Down

0 comments on commit 80287f4

Please sign in to comment.