Skip to content

Commit

Permalink
Implement apply api in string preproc, deprecate apply_to_string_features
Browse files Browse the repository at this point in the history
  • Loading branch information
vinx13 authored and vigsterkr committed Jun 4, 2018
1 parent 11e72d6 commit 4b6f6bb
Show file tree
Hide file tree
Showing 10 changed files with 78 additions and 65 deletions.
18 changes: 16 additions & 2 deletions src/shogun/lib/SGStringList.cpp
@@ -1,7 +1,8 @@
#include <shogun/lib/SGStringList.h>
#include <shogun/lib/SGString.h>
#include <shogun/base/range.h>
#include <shogun/io/File.h>
#include <shogun/io/SGIO.h>
#include <shogun/lib/SGString.h>
#include <shogun/lib/SGStringList.h>

namespace shogun
{
Expand Down Expand Up @@ -87,6 +88,19 @@ void SGStringList<T>::free_data()
max_string_length = 0;
}

/** Clone the string list.
 *
 * Allocates a new array of num_strings SGString<T> entries, fills every
 * entry with the clone() of the corresponding source string, and wraps the
 * array in a new list carrying the same num_strings and max_string_length.
 *
 * @return a new SGStringList<T> built from the cloned strings
 */
template <class T>
SGStringList<T> SGStringList<T>::clone() const
{
	// Distinct name so the member `strings` is not shadowed by the local.
	auto cloned_strings = SG_MALLOC(SGString<T>, num_strings);

	for (auto i : range(num_strings))
		cloned_strings[i] = this->strings[i].clone();

	return SGStringList<T>(cloned_strings, num_strings, max_string_length);
}

template class SGStringList<bool>;
template class SGStringList<char>;
template class SGStringList<int8_t>;
Expand Down
5 changes: 5 additions & 0 deletions src/shogun/lib/SGStringList.h
Expand Up @@ -60,6 +60,11 @@ template <class T> class SGStringList : public SGReferencedData
*/
void save(CFile* saver);

/** Clone the string list.
 *
 * The returned list holds a newly allocated array of num_strings entries,
 * each obtained by calling clone() on the corresponding string of this
 * list; num_strings and max_string_length are carried over unchanged.
 *
 * @return a deep copy of current string list
 */
SGStringList<T> clone() const;

protected:

Expand Down
21 changes: 6 additions & 15 deletions src/shogun/preprocessor/DecompressString.cpp
Expand Up @@ -46,24 +46,15 @@ bool CDecompressString<ST>::save(FILE* f)
}

template <class ST>
bool CDecompressString<ST>::apply_to_string_features(CFeatures* f)
void CDecompressString<ST>::apply_to_string_list(SGStringList<ST> string_list)
{
int32_t i;
auto sf = f->as<CStringFeatures<ST>>();
int32_t num_vec = sf->get_num_vectors();

for (i=0; i<num_vec; i++)
for (auto i : range(string_list.num_strings))
{
int32_t len=0;
bool free_vec;
auto vec = sf->get_feature_vector(i, len, free_vec);

auto decompressed=apply_to_string(vec, len);
sf->free_feature_vector(vec, i, free_vec);
sf->cleanup_feature_vector(i);
sf->set_feature_vector(i, decompressed, len);
auto& vec = string_list.strings[i];
auto decompressed = apply_to_string(vec.string, vec.slen);
SG_FREE(vec.string);
vec.string = decompressed;
}
return true;
}

template <class ST>
Expand Down
8 changes: 3 additions & 5 deletions src/shogun/preprocessor/DecompressString.h
Expand Up @@ -52,11 +52,6 @@ template <class ST> class CDecompressString : public CStringPreprocessor<ST>
/// save preprocessor init-data to file
bool save(FILE* f);

/// apply preproc on feature matrix
/// result in feature matrix
/// return pointer to feature_matrix, i.e. f->get_feature_matrix();
virtual bool apply_to_string_features(CFeatures* f);

/// apply preproc on single feature vector
virtual ST* apply_to_string(ST* f, int32_t &len);

Expand All @@ -67,6 +62,9 @@ template <class ST> class CDecompressString : public CStringPreprocessor<ST>
virtual EPreprocessorType get_type() const;

protected:
/** Decompress every string of the given list.
 *
 * For each entry, apply_to_string() is called with the stored buffer and
 * length; the previous buffer is then released with SG_FREE and replaced
 * by the buffer apply_to_string() returned.
 *
 * @param string_list the string list whose entries are decompressed
 */
virtual void
apply_to_string_list(SGStringList<ST> string_list) override;

/** compressor used to decompress strings */
CCompressor* compressor;
};
Expand Down
20 changes: 5 additions & 15 deletions src/shogun/preprocessor/SortUlongString.cpp
Expand Up @@ -42,27 +42,17 @@ bool CSortUlongString::save(FILE* f)
return false;
}

/// apply preproc on feature matrix
/// result in feature matrix
/// return pointer to feature_matrix, i.e. f->get_feature_matrix();
bool CSortUlongString::apply_to_string_features(CFeatures* f)
void CSortUlongString::apply_to_string_list(SGStringList<uint64_t> string_list)
{
auto sf = f->as<CStringFeatures<uint64_t>>();
auto num_vec = sf->get_num_vectors();

for (auto i : range(num_vec))
for (auto i : range(string_list.num_strings))
{
int32_t len=0;
bool free_vec;
auto vec = sf->get_feature_vector(i, len, free_vec);
ASSERT(!free_vec) // won't work with non-in-memory string features
auto& vec = string_list.strings[i];

SG_DEBUG("sorting string of length %i\n", len)
SG_DEBUG("sorting string of length %i\n", vec.slen);

//CMath::qsort(vec, len);
CMath::radix_sort(vec, len);
CMath::radix_sort(vec.string, vec.slen);
}
return true;
}

/// apply preproc on single feature vector
Expand Down
9 changes: 4 additions & 5 deletions src/shogun/preprocessor/SortUlongString.h
Expand Up @@ -38,11 +38,6 @@ class CSortUlongString : public CStringPreprocessor<uint64_t>
/// save preprocessor init-data to file
virtual bool save(FILE* f);

/// apply preproc to feature matrix
/// result in feature matrix
/// return pointer to feature_matrix, i.e. f->get_feature_matrix();
virtual bool apply_to_string_features(CFeatures* f);

/// apply preproc on single feature vector
/// result in feature matrix
virtual uint64_t* apply_to_string(uint64_t* f, int32_t &len);
Expand All @@ -52,6 +47,10 @@ class CSortUlongString : public CStringPreprocessor<uint64_t>

/// return a type of preprocessor
virtual EPreprocessorType get_type() const { return P_SORTULONGSTRING; }

protected:
/** Sort the content of every string of the given list.
 *
 * Each entry's buffer and length are passed to CMath::radix_sort.
 *
 * @param string_list the string list whose entries are sorted
 */
virtual void
apply_to_string_list(SGStringList<uint64_t> string_list) override;
};
}
#endif
19 changes: 4 additions & 15 deletions src/shogun/preprocessor/SortWordString.cpp
Expand Up @@ -41,27 +41,16 @@ bool CSortWordString::save(FILE* f)
return false;
}

/// apply preproc on feature matrix
/// result in feature matrix
/// return pointer to feature_matrix, i.e. f->get_feature_matrix();
bool CSortWordString::apply_to_string_features(CFeatures* f)
void CSortWordString::apply_to_string_list(SGStringList<uint16_t> string_list)
{
int32_t i;
auto sf = f->as<CStringFeatures<uint16_t>>();
auto num_vec = sf->get_num_vectors();

for (i=0; i<num_vec; i++)
for (auto i : range(string_list.num_strings))
{
int32_t len = 0 ;
bool free_vec;
auto vec = sf->get_feature_vector(i, len, free_vec);
ASSERT(!free_vec) // won't work with non-in-memory string features
auto& vec = string_list.strings[i];

//CMath::qsort(vec, len);
CMath::radix_sort(vec, len);

CMath::radix_sort(vec.string, vec.slen);
}
return true ;
}

/// apply preproc on single feature vector
Expand Down
8 changes: 3 additions & 5 deletions src/shogun/preprocessor/SortWordString.h
Expand Up @@ -39,11 +39,6 @@ class CSortWordString : public CStringPreprocessor<uint16_t>
/// save preprocessor init-data to file
virtual bool save(FILE* f);

/// apply preproc on feature matrix
/// result in feature matrix
/// return pointer to feature_matrix, i.e. f->get_feature_matrix();
virtual bool apply_to_string_features(CFeatures* f);

/// apply preproc on single feature vector
/// result in feature matrix
virtual uint16_t* apply_to_string(uint16_t* f, int32_t &len);
Expand All @@ -54,6 +49,9 @@ class CSortWordString : public CStringPreprocessor<uint16_t>
/// return a type of preprocessor
virtual EPreprocessorType get_type() const { return P_SORTWORDSTRING; }

protected:
/** Sort the content of every string of the given list.
 *
 * Each entry's buffer and length are passed to CMath::radix_sort.
 *
 * @param string_list the string list whose entries are sorted
 */
virtual void
apply_to_string_list(SGStringList<uint16_t> string_list) override;
};
}
#endif
22 changes: 20 additions & 2 deletions src/shogun/preprocessor/StringPreprocessor.cpp
Expand Up @@ -96,8 +96,26 @@ namespace shogun
"has to be of C_STRING (%d) class!\n",
features->get_feature_class(), C_STRING);

apply_to_string_features(features);
return features;
auto string_features = features->as<CStringFeatures<ST>>();
auto string_list = string_features->get_features();

if (!inplace)
string_list = string_list.clone();

apply_to_string_list(string_list);

auto processed = new CStringFeatures<ST>(
string_list, string_features->get_alphabet());
SG_REF(processed);

return processed;
}

/** Deprecated entry point that forwards to apply().
 *
 * NOTE(review): the object returned by apply() is dropped here; apply()
 * appears to hand back a referenced features object, so confirm that
 * nothing needs to release it.
 *
 * @param features features forwarded unchanged to apply()
 * @return always true
 */
template <class ST>
bool CStringPreprocessor<ST>::apply_to_string_features(CFeatures* features)
{
	// The value returned by apply() is not used by this wrapper.
	static_cast<void>(apply(features));
	return true;
}

template class CStringPreprocessor<bool>;
Expand Down
13 changes: 12 additions & 1 deletion src/shogun/preprocessor/StringPreprocessor.h
Expand Up @@ -11,6 +11,7 @@

#include <shogun/features/Features.h>
#include <shogun/features/StringFeatures.h>
#include <shogun/lib/SGStringList.h>
#include <shogun/lib/common.h>
#include <shogun/preprocessor/Preprocessor.h>

Expand Down Expand Up @@ -43,7 +44,11 @@ template <class ST> class CStringPreprocessor : public CPreprocessor
/// apply preproc on feature matrix
/// result in feature matrix
/// return pointer to feature_matrix, i.e. f->get_feature_matrix();
virtual bool apply_to_string_features(CFeatures* f)=0;
/// @deprecated use apply() instead; this wrapper forwards to apply() and
/// always returns true.
#ifndef SWIG
[[deprecated("use apply() instead")]]
#endif
virtual bool
apply_to_string_features(CFeatures* f);

/// apply preproc on single feature vector
virtual ST* apply_to_string(ST* f, int32_t &len)=0;
Expand All @@ -59,6 +64,12 @@ template <class ST> class CStringPreprocessor : public CPreprocessor
/// return a type of preprocessor
virtual EPreprocessorType get_type() const { return P_UNKNOWN; }

protected:
/** Apply the preprocessor to a string list in place.
 *
 * Pure virtual hook implemented by each concrete string preprocessor.
 * apply() invokes it on the list obtained from the input features, or on a
 * clone() of that list when it is not operating in place.
 * NOTE(review): string_list is taken by value; "in place" presumably
 * relies on the copy sharing its strings array with the caller's list —
 * confirm.
 *
 * @param string_list the string list to be preprocessed
 */
virtual void apply_to_string_list(SGStringList<ST> string_list) = 0;
};

}
Expand Down

0 comments on commit 4b6f6bb

Please sign in to comment.