Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added CKernelDependenceMaximization and CBAHSIC in feature selection framework #2363

Merged
merged 2 commits into from
Jul 5, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions src/interfaces/modular/Features.i
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
%newobject get_transposed();
%newobject create_merged_copy(CFeatures* other);
%newobject copy_subset(SGVector<index_t> indices);
%newobject copy_dimension_subset(SGVector<index_t> indices);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure this thing even needs to be exposed to modular ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@karlnapf ummm.. well, its in public API of CFeatures.. so.. But since we have kept num_features thing out of CFeatures since it doesn't make sense for all feature types, maybe this method should not be here at all. Maybe a helper method in CFeatureSelection should handle it, like CFeatureSelection::get_num_features. What do you think?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mmmh, no hiding in in such a specialised class is not a good idea. then rather keep it public and expose it. i just though nobody might ever call this, so rather hide to not confuse people. but maybe actually somebody wants to call it, so keep this stuff, sorry for the confusion :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@karlnapf well I thought that since our last discussion with @vigsterkr and @sonney2k regarding this num_feature thing, having a copy_dimension_subset in CFeatures would ultimately result in this method being unimplemented in all feature classes except for CDenseFeatures and CSparseFeatures :(

%newobject get_streamed_features(index_t num_elements);


Expand Down
36 changes: 36 additions & 0 deletions src/interfaces/modular/Preprocessor.i
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@
%rename(SortUlongString) CSortUlongString;
%rename(SortWordString) CSortWordString;

/* Feature selection framework: strip the C prefix for the modular targets */
%rename(DependenceMaximization) CDependenceMaximization;
/* BUGFIX: was renaming CDependenceMaximization a second time, leaving
 * CKernelDependenceMaximization unrenamed in the modular interfaces */
%rename(KernelDependenceMaximization) CKernelDependenceMaximization;
%rename(BAHSIC) CBAHSIC;

/* these factory-style methods return newly owned objects */
%newobject shogun::CFeatureSelection::apply;
%newobject shogun::CFeatureSelection::remove_feats;

%newobject shogun::CKernelPCA::apply_to_string_features;

/* Include Class Headers to make them visible from within the target language */
Expand Down Expand Up @@ -95,6 +103,31 @@ namespace shogun
%template(DecompressCharString) CDecompressString<char>;
#endif
}

/* Instantiate CFeatureSelection wrappers, one per supported primitive
 * feature type; each is guarded by the corresponding USE_* build flag */
%include <shogun/preprocessor/FeatureSelection.h>
namespace shogun
{
#ifdef USE_FLOAT64
%template(RealFeatureSelection) CFeatureSelection<float64_t>;
#endif
#ifdef USE_UINT64
%template(UlongFeatureSelection) CFeatureSelection<uint64_t>;
#endif
#ifdef USE_UINT16
%template(WordFeatureSelection) CFeatureSelection<uint16_t>;
#endif
#ifdef USE_INT16
%template(ShortFeatureSelection) CFeatureSelection<int16_t>;
#endif
#ifdef USE_UINT8
%template(ByteFeatureSelection) CFeatureSelection<uint8_t>;
#endif
#ifdef USE_CHAR
%template(CharFeatureSelection) CFeatureSelection<char>;
#endif
}

%include <shogun/preprocessor/SparsePreprocessor.h>
%include <shogun/preprocessor/NormOne.h>
%include <shogun/preprocessor/SumOne.h>
Expand All @@ -111,3 +144,6 @@ namespace shogun
%include <shogun/preprocessor/SortUlongString.h>
%include <shogun/preprocessor/SortWordString.h>

%include <shogun/preprocessor/DependenceMaximization.h>
%include <shogun/preprocessor/KernelDependenceMaximization.h>
%include <shogun/preprocessor/BAHSIC.h>
5 changes: 5 additions & 0 deletions src/interfaces/modular/Preprocessor_includes.i
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,9 @@
#include <shogun/preprocessor/DecompressString.h>
#include <shogun/preprocessor/SortUlongString.h>
#include <shogun/preprocessor/SortWordString.h>

#include <shogun/preprocessor/FeatureSelection.h>
#include <shogun/preprocessor/DependenceMaximization.h>
#include <shogun/preprocessor/KernelDependenceMaximization.h>
#include <shogun/preprocessor/BAHSIC.h>
%}
2 changes: 2 additions & 0 deletions src/interfaces/modular/Statistics.i
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
%rename(KernelIndependenceTest) CKernelIndependenceTest;
%rename(HSIC) CHSIC;
%rename(KernelMeanMatching) CKernelMeanMatching;
%rename(KernelSelection) CKernelSelection;
%rename(MMDKernelSelection) CMMDKernelSelection;
%rename(MMDKernelSelectionComb) CMMDKernelSelectionComb;
%rename(MMDKernelSelectionMedian) CMMDKernelSelectionMedian;
Expand All @@ -38,6 +39,7 @@
%include <shogun/statistics/KernelIndependenceTest.h>
%include <shogun/statistics/HSIC.h>
%include <shogun/statistics/KernelMeanMatching.h>
%include <shogun/statistics/KernelSelection.h>
%include <shogun/statistics/MMDKernelSelection.h>
%include <shogun/statistics/MMDKernelSelectionComb.h>
%include <shogun/statistics/MMDKernelSelectionMedian.h>
Expand Down
1 change: 1 addition & 0 deletions src/interfaces/modular/Statistics_includes.i
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <shogun/statistics/KernelIndependenceTest.h>
#include <shogun/statistics/HSIC.h>
#include <shogun/statistics/KernelMeanMatching.h>
#include <shogun/statistics/KernelSelection.h>
#include <shogun/statistics/MMDKernelSelection.h>
#include <shogun/statistics/MMDKernelSelectionComb.h>
#include <shogun/statistics/MMDKernelSelectionMedian.h>
Expand Down
62 changes: 62 additions & 0 deletions src/shogun/preprocessor/BAHSIC.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Copyright (c) The Shogun Machine Learning Toolbox
* Written (w) 2014 Soumyajit De
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the Shogun Development Team.
*/

#include <shogun/statistics/HSIC.h>
#include <shogun/preprocessor/BAHSIC.h>

using namespace shogun;

// Default constructor: all set-up happens in init() so the estimator and
// the algorithm are in place right after construction
CBAHSIC::CBAHSIC() : CKernelDependenceMaximization()
{
	init();
}

// Creates the HSIC estimator (ref-counted here, released by the base class
// destructor) and fixes the algorithm to backward elimination, the only
// mode BAHSIC supports
void CBAHSIC::init()
{
	m_estimator=new CHSIC();
	SG_REF(m_estimator);
	m_algorithm=BACKWARD_ELIMINATION;
}

CBAHSIC::~CBAHSIC()
{
	// estimator is SG_UNREF'ed in base CDependenceMaximization destructor
}

// Intentionally ignores its argument: the algorithm is fixed to
// BACKWARD_ELIMINATION in init(), so external requests only produce a log
// message. NOTE(review): SG_WARNING may be more appropriate than SG_INFO
// here since the caller's request is silently dropped — confirm against
// project logging conventions
void CBAHSIC::set_algorithm(EFeatureSelectionAlgorithm algorithm)
{
	SG_INFO("Algorithm is set to BACKWARD_ELIMINATION for %s and therefore "
		"cannot be set externally!\n", get_name());
}

// Identifies this preprocessor within the EPreprocessorType enum
EPreprocessorType CBAHSIC::get_type() const
{
	return P_BAHSIC;
}
92 changes: 92 additions & 0 deletions src/shogun/preprocessor/BAHSIC.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
* Copyright (c) The Shogun Machine Learning Toolbox
* Written (w) 2014 Soumyajit De
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the Shogun Development Team.
*/

#ifndef BAHSIC_H__
#define BAHSIC_H__

#include <shogun/lib/config.h>
#include <shogun/preprocessor/KernelDependenceMaximization.h>

namespace shogun
{

/** @brief Class CBAHSIC, that extends CKernelDependenceMaximization and uses
* HSIC [1] to compute dependence measures for feature selection using a
* backward elimination approach as described in [1]. This class serves as a
* convenience class that initializes the CDependenceMaximization#m_estimator
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe the class header of both of those should mention the memory requirements additional to the HSIC computation itself?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@karlnapf the base class CDependenceMaximization doc already mentions the additional memory requirement. These two classes don't add anything extra on the top of that.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok !

* with an instance of CHSIC and allows only ::BACKWARD_ELIMINATION algorithm
* to use which is set internally. Therefore, trying to use other algorithms
* by set_algorithm() will not work. Please see the class documentation of CHSIC
* and [2] for more details on mathematical description of HSIC.
*
* References:
* [1] Song, Le and Bedo, Justin and Borgwardt, Karsten M. and Gretton, Arthur
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could we try to reproduce an example from the paper in the notebook for this?

* and Smola, Alex. (2007). Gene Selection via the BAHSIC Family of Algorithms.
* Journal Bioinformatics. Volume 23 Issue Pages i490-i498. Oxford University
* Press Oxford, UK
* [2]: Gretton, A., Fukumizu, K., Teo, C., & Song, L. (2008). A kernel
* statistical test of independence. Advances in Neural Information Processing
* Systems, 1-8.
*/
class CBAHSIC : public CKernelDependenceMaximization
{
public:
	/** Default constructor. Internally creates a CHSIC estimator and fixes
	 * the feature selection algorithm to ::BACKWARD_ELIMINATION (see init())
	 */
	CBAHSIC();

	/** Destructor. The internally created estimator is released by the base
	 * class (CDependenceMaximization) destructor */
	virtual ~CBAHSIC();

	/**
	 * Since only the ::BACKWARD_ELIMINATION algorithm is applicable for
	 * BAHSIC and it is set internally, this override prevents the algorithm
	 * from being changed via the public API: the argument is ignored and
	 * only an informational message is logged.
	 *
	 * @param algorithm ignored; BAHSIC always uses ::BACKWARD_ELIMINATION
	 */
	virtual void set_algorithm(EFeatureSelectionAlgorithm algorithm);

	/** @return the preprocessor type (::P_BAHSIC) */
	virtual EPreprocessorType get_type() const;

	/** @return the class name ("BAHSIC") */
	virtual const char* get_name() const
	{
		return "BAHSIC";
	}

private:
	/** Creates the CHSIC estimator (taking a reference on it) and sets the
	 * feature selection algorithm to ::BACKWARD_ELIMINATION */
	void init();

};

}
#endif // BAHSIC_H__
59 changes: 35 additions & 24 deletions src/shogun/preprocessor/DependenceMaximization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,36 +74,18 @@ bool CDependenceMaximization::init(CFeatures* features)
return true;
}

void CDependenceMaximization::precompute()
{
SG_DEBUG("Entering!\n");

REQUIRE(m_labels, "Labels are not initialized!\n");
REQUIRE(m_estimator, "Estimator is not initialized!\n");

// convert the CLabels object to CDenseFeatures
SG_UNREF(m_labels_feats);

SGMatrix<float64_t> labels_matrix(1, m_labels->get_num_labels());
for (index_t i=0; i<labels_matrix.num_cols; ++i)
labels_matrix.matrix[i]=m_labels->get_value(i);

m_labels_feats=new CDenseFeatures<float64_t>(labels_matrix);
SG_REF(m_labels_feats);

// and set this to the estimator
m_estimator->set_q(m_labels_feats);

SG_DEBUG("Leaving!\n");
}

CFeatures* CDependenceMaximization::create_transformed_copy(CFeatures* features,
index_t idx)
{
SG_DEBUG("Entering!\n");

// remove the dimension specified by the index, i.e. get X\X_i
// NULL check is handled in CFeatureSelection::get_num_features call
index_t num_features=get_num_features(features);
REQUIRE(num_features>idx, "Specified dimension to remove (%d) is greater "
"than the total number of current features (%d)!\n",
idx, num_features);

SGVector<index_t> dims(num_features-1);
index_t n_dims=0;
for (index_t i=0; i<num_features; ++i)
Expand All @@ -127,6 +109,7 @@ float64_t CDependenceMaximization::compute_measures(CFeatures* features,

// remove the dimension (feat) specified by the index idx
CFeatures* reduced_feats=create_transformed_copy(features, idx);
ASSERT(reduced_feats);

// perform an independence test for X\X_i ~ p and Y ~ q with
// H_0: P(X\X_i, Y) = P(X\X_i) * P(Y)
Expand All @@ -153,14 +136,22 @@ CFeatures* CDependenceMaximization::remove_feats(CFeatures* features,
REQUIRE(m_policy==N_SMALLEST || m_policy==PERCENTILE_SMALLEST,
"Only N_SMALLEST and PERCENTILE_SMALLEST removal policy can work "
"with %s!\n", get_name());
REQUIRE(features, "Features is not intialized!\n");
REQUIRE(argsorted.vector, "The argsorted vector is not initialized!\n");
// BUGFIX: the format string has two %d specifiers but only one argument was
// supplied, which is undefined behavior when the check fails
REQUIRE(get_num_features(features)==argsorted.vlen,
	"Length of the argsorted vector (%d) should be equal to the number "
	"of features (%d)!\n", argsorted.vlen, get_num_features(features));

// compute a threshold to remove for both the policies
index_t threshold=m_num_remove;
if (m_policy==PERCENTILE_SMALLEST)
threshold*=argsorted.vlen*0.01;

// make sure that the threshold is valid given the current number of feats
ASSERT(threshold<argsorted.vlen)
REQUIRE(threshold<argsorted.vlen, "The threshold of removal is too high "
"(asked to remove %d features out of %d)! Please use a smaller "
"number for removal using set_num_remove() call",
threshold, argsorted.vlen);

// remove the lowest threshold rank holders by storing indices
SGVector<index_t> inds(argsorted.vlen-threshold);
Expand All @@ -186,3 +177,23 @@ void CDependenceMaximization::set_policy(EFeatureRemovalPolicy policy)
"with %s!\n", get_name());
m_policy=policy;
}

// Overridden setter: in addition to storing the labels (base class), it
// converts them into a 1 x num_labels dense feature matrix and feeds that to
// the independence-test estimator as the samples Y ~ q
void CDependenceMaximization::set_labels(CLabels* labels)
{
	// NULL check is handled in base class CFeatureSelection
	CFeatureSelection::set_labels(labels);

	// convert the CLabels object to CDenseFeatures; drop any previously
	// converted label features first
	SG_UNREF(m_labels_feats);

	// one row, one column per example.
	// NOTE(review): assumes get_value(i) yields a meaningful scalar per
	// example for every CLabels subclass used here — confirm for
	// multiclass/structured labels
	SGMatrix<float64_t> labels_matrix(1, m_labels->get_num_labels());
	for (index_t i=0; i<labels_matrix.num_cols; ++i)
		labels_matrix.matrix[i]=m_labels->get_value(i);

	m_labels_feats=new CDenseFeatures<float64_t>(labels_matrix);
	SG_REF(m_labels_feats);

	// we need to set this to the estimator which is set internally by the
	// subclasses (e.g. CBAHSIC creates a CHSIC) before labels can be set
	ASSERT(m_estimator);
	m_estimator->set_q(m_labels_feats);
}
19 changes: 11 additions & 8 deletions src/shogun/preprocessor/DependenceMaximization.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class CIndependenceTest;
* The estimator cannot be set via user interface, rather its subclasses
* initialize this estimator with appropriate instances internally.
*
* This class also overrides precompute() method to create a feature object from
* This class also overrides set_labels() method to create a feature object from
* the labels and sets this as features \f$\mathbf{Y}\sim q\f$ to the estimator
* which is required to compute the measure.
*/
Expand Down Expand Up @@ -119,6 +119,16 @@ class CDependenceMaximization : public CFeatureSelection<float64_t>
*/
virtual bool init(CFeatures* features);

/**
* Setter for labels. This method is overridden to internally convert the
* labels to a dense feature object and set this feature in the
* independence test estimator. These labels serve as samples
* \f$\mathbf{Y}\sim q\f$ in the independence test
*
* @param labels the labels
*/
virtual void set_labels(CLabels* labels);

/** @return the class name */
virtual const char* get_name() const
{
Expand All @@ -137,13 +147,6 @@ class CDependenceMaximization : public CFeatureSelection<float64_t>
*/
virtual CFeatures* create_transformed_copy(CFeatures* features, index_t idx);

/**
* Creates a dense feature object from the labels provided, #m_labels and
* sets this feature in the independence test estimator. These labels serve
* as samples \f$\mathbf{Y}\sim q\f$ in the independence test
*/
virtual void precompute();

/**
* The estimator for performing statistical tests for independence which
* is used for computing measures
Expand Down