From 74d8dba0ab36d1d4b16d98e38d0df63f819391d1 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 30 May 2018 20:49:25 +0800 Subject: [PATCH] Drop apply_preprocessor and preproc states in features --- .../python/features_string_char_compressed.py | 4 +- src/shogun/features/DenseFeatures.cpp | 42 ------------------- src/shogun/features/DenseFeatures.h | 12 ------ src/shogun/features/Features.cpp | 42 +------------------ src/shogun/features/Features.h | 21 ---------- src/shogun/features/SparseFeatures.cpp | 32 -------------- src/shogun/features/SparseFeatures.h | 9 ---- src/shogun/features/StringFeatures.cpp | 24 ----------- src/shogun/features/StringFeatures.h | 7 ---- .../kernel/string/CommWordStringKernel.cpp | 10 ----- .../string/WeightedCommWordStringKernel.cpp | 10 ----- .../GaussianProcessClassification_unittest.cc | 10 +++-- 12 files changed, 10 insertions(+), 213 deletions(-) diff --git a/examples/undocumented/python/features_string_char_compressed.py b/examples/undocumented/python/features_string_char_compressed.py index 462809c66bc..ad8023dcca2 100644 --- a/examples/undocumented/python/features_string_char_compressed.py +++ b/examples/undocumented/python/features_string_char_compressed.py @@ -57,8 +57,8 @@ def features_string_char_compressed (fname): # load compressed data and uncompress via preprocessor f2=StringCharFeatures(RAWBYTE); f2.load_compressed("tmp/foo_lzo.str", False) - f2.add_preprocessor(DecompressCharString(LZO)) - f2.apply_preprocessor() + preproc = DecompressCharString(LZO) + f2 = preproc.transform(f2) #print("lzo strings", f2.get_features()) #print diff --git a/src/shogun/features/DenseFeatures.cpp b/src/shogun/features/DenseFeatures.cpp index a2d6b7c8038..46d939dd25f 100644 --- a/src/shogun/features/DenseFeatures.cpp +++ b/src/shogun/features/DenseFeatures.cpp @@ -388,48 +388,6 @@ template ST* CDenseFeatures::get_transposed(int32_t &num_feat, int return fm; } -template bool CDenseFeatures::apply_preprocessor(bool force_preprocessing) -{ - if (m_subset_stack->has_subsets()) - SG_ERROR("A subset is set, cannot call apply_preproc\n") - - SG_DEBUG("force: %d\n", force_preprocessing) - - if (feature_matrix.matrix && get_num_preprocessors()) - { - for (int32_t i = 0; i < get_num_preprocessors(); i++) - { - if ((!is_preprocessed(i) || force_preprocessing)) - { - set_preprocessed(i); - CDensePreprocessor* p = - (CDensePreprocessor*) get_preprocessor(i); - SG_INFO("preprocessing using preproc %s\n", p->get_name()) - - if (p->apply_to_feature_matrix(this).matrix == NULL) - { - SG_UNREF(p); - return false; - } - SG_UNREF(p); - - } - } - - return true; - } - else - { - if (!feature_matrix.matrix) - SG_ERROR("no feature matrix\n") - - if (!get_num_preprocessors()) - SG_ERROR("no preprocessors available\n") - - return false; - } -} - template int32_t CDenseFeatures::get_num_vectors() const { return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors; diff --git a/src/shogun/features/DenseFeatures.h b/src/shogun/features/DenseFeatures.h index ab34eff1683..48fef017a41 100644 --- a/src/shogun/features/DenseFeatures.h +++ b/src/shogun/features/DenseFeatures.h @@ -261,18 +261,6 @@ template class CDenseFeatures: public CDotFeatures */ ST* get_transposed(int32_t &num_feat, int32_t &num_vec); - /** apply preprocessor - * - * applies preprocessors to ALL features (subset removed before and - * restored afterwards) - * - * not possible with subset - * - * @param force_preprocessing if preprocssing shall be forced - * @return if applying was successful - */ - virtual bool apply_preprocessor(bool force_preprocessing = false); - /** get number of feature vectors * * @return number of feature vectors diff --git a/src/shogun/features/Features.cpp b/src/shogun/features/Features.cpp index a54fbb65a7e..610184932df 100644 --- a/src/shogun/features/Features.cpp +++ b/src/shogun/features/Features.cpp @@ -28,13 +28,10 @@ CFeatures::CFeatures(const CFeatures& orig) { init(); - // Call to init creates new preproc and preprocessed arrays. + // Call to init creates new preproc arrays. SG_UNREF(preproc); - SG_UNREF(preprocessed); preproc = orig.preproc; - preprocessed = orig.preprocessed; SG_REF(preproc); - SG_REF(preprocessed); } CFeatures::CFeatures(CFile* loader) @@ -51,7 +48,6 @@ CFeatures::~CFeatures() clean_preprocessors(); SG_UNREF(m_subset_stack); SG_UNREF(preproc); - SG_UNREF(preprocessed); } void CFeatures::init() @@ -61,8 +57,6 @@ void CFeatures::init() SG_ADD((CSGObject**) &preproc, "preproc", "Array of preprocessors.", MS_NOT_AVAILABLE); - SG_ADD((CSGObject**) &preprocessed, "preprocessed", "Array of preprocessed.", - MS_NOT_AVAILABLE); SG_ADD((CSGObject**)&m_subset_stack, "subset_stack", "Stack of subsets", MS_NOT_AVAILABLE); @@ -73,9 +67,7 @@ void CFeatures::init() properties = FP_NONE; cache_size = 0; preproc = new CDynamicObjectArray(); - preprocessed = new CDynamicArray(); SG_REF(preproc); - SG_REF(preprocessed); } void CFeatures::add_preprocessor(CPreprocessor* p) @@ -83,7 +75,6 @@ void CFeatures::add_preprocessor(CPreprocessor* p) ASSERT(p) preproc->push_back(p); - preprocessed->push_back(false); } CPreprocessor* CFeatures::get_preprocessor(int32_t num) const @@ -96,23 +87,9 @@ CPreprocessor* CFeatures::get_preprocessor(int32_t num) const return NULL; } -int32_t CFeatures::get_num_preprocessed() const -{ - int32_t num=0; - - for (int32_t i=0; iget_num_elements(); i++) - { - if ((*preprocessed)[i]) - num++; - } - - return num; -} - void CFeatures::clean_preprocessors() { preproc->reset_array(); - preprocessed->reset_array(); } void CFeatures::del_preprocessor(int32_t num) @@ -120,7 +97,6 @@ void CFeatures::del_preprocessor(int32_t num) if (numget_num_elements() && num>=0) { preproc->delete_element(num); - preprocessed->delete_element(num); } } @@ -130,24 +106,10 @@ void CFeatures::list_preprocessors() for (int32_t i=0; iget_element(i)->get_name(), - preprocessed->get_element(i) ? "true" : "false"); + SG_INFO("preproc[%d]=%s\n", i, preproc->get_element(i)->get_name()); } } -void CFeatures::set_preprocessed(int32_t num) -{ - ASSERT(numget_num_elements() && num>=0); - (*preprocessed)[num]=true; -} - -bool CFeatures::is_preprocessed(int32_t num) const -{ - ASSERT(numget_num_elements() && num>=0); - return (*preprocessed)[num]; -} - int32_t CFeatures::get_num_preprocessors() const { return preproc->get_num_elements(); diff --git a/src/shogun/features/Features.h b/src/shogun/features/Features.h index 33610cfb1a9..4294d08e18f 100644 --- a/src/shogun/features/Features.h +++ b/src/shogun/features/Features.h @@ -140,24 +140,6 @@ class CFeatures : public CSGObject */ CPreprocessor* get_preprocessor(int32_t num) const; - /** set applied flag for preprocessor - * - * @param num index of preprocessor in list - */ - void set_preprocessed(int32_t num); - - /** get whether specified preprocessor was already applied - * - * @param num index of preprocessor in list - */ - bool is_preprocessed(int32_t num) const; - - /** get the number of applied preprocs - * - * @return number of applied preprocessors - */ - int32_t get_num_preprocessed() const; - /** get number of preprocessors * * @return number of preprocessors @@ -375,9 +357,6 @@ class CFeatures : public CSGObject /** list of preprocessors */ CDynamicObjectArray* preproc; - /** i'th entry is true if features were already preprocessed with preproc i */ - CDynamicArray* preprocessed; - protected: /** subset used for index transformations */ CSubsetStack* m_subset_stack; diff --git a/src/shogun/features/SparseFeatures.cpp b/src/shogun/features/SparseFeatures.cpp index 2252d74328e..e7b0cd8fdf0 100644 --- a/src/shogun/features/SparseFeatures.cpp +++ b/src/shogun/features/SparseFeatures.cpp @@ -301,38 +301,6 @@ template void CSparseFeatures::set_full_feature_matrix(SGMatrix bool CSparseFeatures::apply_preprocessor(bool force_preprocessing) -{ - SG_INFO("force: %d\n", force_preprocessing) - - if (sparse_feature_matrix.sparse_matrix && get_num_preprocessors()) - { - for (int32_t i=0; i* p = (CSparsePreprocessor*) get_preprocessor(i); - SG_INFO("preprocessing using preproc %s\n", p->get_name()) - - if (p->apply_to_sparse_feature_matrix(this) == NULL) - { - SG_UNREF(p); - return false; - } - - SG_UNREF(p); - } - } - return true; - } - else - { - SG_WARNING("no sparse feature matrix available or features already preprocessed - skipping.\n") - return false; - } -} - template int32_t CSparseFeatures::get_num_vectors() const { return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : sparse_feature_matrix.num_vectors; diff --git a/src/shogun/features/SparseFeatures.h b/src/shogun/features/SparseFeatures.h index 4cc3bed5557..b4ef16dbd04 100644 --- a/src/shogun/features/SparseFeatures.h +++ b/src/shogun/features/SparseFeatures.h @@ -243,15 +243,6 @@ template class CSparseFeatures : public CDotFeatures */ virtual void set_full_feature_matrix(SGMatrix full); - /** apply preprocessor - * - * possible with subset - * - * @param force_preprocessing if preprocssing shall be forced - * @return if applying was successful - */ - virtual bool apply_preprocessor(bool force_preprocessing=false); - /** get number of feature vectors, possibly of subset * * @return number of feature vectors diff --git a/src/shogun/features/StringFeatures.cpp b/src/shogun/features/StringFeatures.cpp index dde5598d3b6..109cbff88e5 100644 --- a/src/shogun/features/StringFeatures.cpp +++ b/src/shogun/features/StringFeatures.cpp @@ -1222,30 +1222,6 @@ template bool CStringFeatures::save_compressed(char* dest, E_COMPR return true; } -template bool CStringFeatures::apply_preprocessor(bool force_preprocessing) -{ - SG_DEBUG("force: %d\n", force_preprocessing) - - for (int32_t i=0; i* p=(CStringPreprocessor*) get_preprocessor(i); - SG_INFO("preprocessing using preproc %s\n", p->get_name()) - - if (!p->apply_to_string_features(this)) - { - SG_UNREF(p); - return false; - } - else - SG_UNREF(p); - } - } - return true; -} - template int32_t CStringFeatures::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip) { if (m_subset_stack->has_subsets()) diff --git a/src/shogun/features/StringFeatures.h b/src/shogun/features/StringFeatures.h index f5faa9793d2..98e1c98cccf 100644 --- a/src/shogun/features/StringFeatures.h +++ b/src/shogun/features/StringFeatures.h @@ -488,13 +488,6 @@ template class CStringFeatures : public CFeatures */ virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level); - /** apply preprocessor - * - * @param force_preprocessing if preprocssing shall be forced - * @return if applying was successful - */ - virtual bool apply_preprocessor(bool force_preprocessing=false); - /** slides a window of size window_size over the current single string * step_size is the amount by which the window is shifted. * creates (string_len-window_size)/step_size many feature obj diff --git a/src/shogun/kernel/string/CommWordStringKernel.cpp b/src/shogun/kernel/string/CommWordStringKernel.cpp index 7f6f3c25e1c..bf5ee1ebd42 100644 --- a/src/shogun/kernel/string/CommWordStringKernel.cpp +++ b/src/shogun/kernel/string/CommWordStringKernel.cpp @@ -156,16 +156,6 @@ float64_t CCommWordStringKernel::compute_helper( else bvec=NULL; } - else - { - if ( (l->get_num_preprocessors() != l->get_num_preprocessed()) || - (r->get_num_preprocessors() != r->get_num_preprocessed())) - { - SG_ERROR("not all preprocessors have been applied to training (%d/%d)" - " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preprocessors(), - r->get_num_preprocessed(), r->get_num_preprocessors()); - } - } float64_t result=0; diff --git a/src/shogun/kernel/string/WeightedCommWordStringKernel.cpp b/src/shogun/kernel/string/WeightedCommWordStringKernel.cpp index fd953f4ea01..83ed047178f 100644 --- a/src/shogun/kernel/string/WeightedCommWordStringKernel.cpp +++ b/src/shogun/kernel/string/WeightedCommWordStringKernel.cpp @@ -124,16 +124,6 @@ float64_t CWeightedCommWordStringKernel::compute_helper( else bvec=NULL; } - else - { - if ( (l->get_num_preprocessors() != l->get_num_preprocessed()) || - (r->get_num_preprocessors() != r->get_num_preprocessed())) - { - SG_ERROR("not all preprocessors have been applied to training (%d/%d)" - " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preprocessors(), - r->get_num_preprocessed(), r->get_num_preprocessors()); - } - } float64_t result=0; uint8_t mask=0; diff --git a/tests/unit/classifier/GaussianProcessClassification_unittest.cc b/tests/unit/classifier/GaussianProcessClassification_unittest.cc index 37e200333ec..d94a5bca6d5 100644 --- a/tests/unit/classifier/GaussianProcessClassification_unittest.cc +++ b/tests/unit/classifier/GaussianProcessClassification_unittest.cc @@ -595,11 +595,13 @@ TEST_F(GaussianProcessClassification, apply_preprocessor_and_binary) CRescaleFeatures* preproc=new CRescaleFeatures(); preproc->fit(features_train); - features_train->add_preprocessor(preproc); - features_train->apply_preprocessor(); + features_train = + preproc->transform(features_train)->as>(); + SG_REF(features_train) - features_test->add_preprocessor(preproc); - features_test->apply_preprocessor(); + features_test = + preproc->transform(features_test)->as>(); + SG_REF(features_test); // logit likelihood CLogitLikelihood* likelihood=new CLogitLikelihood();