diff --git a/src/shogun/features/HashedSparseFeatures.cpp b/src/shogun/features/HashedSparseFeatures.cpp index 7f4f4ed9d6f..9ad23e84dd9 100644 --- a/src/shogun/features/HashedSparseFeatures.cpp +++ b/src/shogun/features/HashedSparseFeatures.cpp @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -91,7 +92,18 @@ template SGSparseVector CHashedSparseFeatures::get_hashed_feature_vector( int32_t vec_idx) const { - SGSparseVector vec = sparse_feats->get_sparse_feature_vector(vec_idx); + return CHashedSparseFeatures::hash_vector(sparse_feats->get_sparse_feature_vector(vec_idx), dim); +} + +template +SGSparseVector CHashedSparseFeatures::hash_vector(SGVector vec, int32_t dim) +{ + return CHashedDenseFeatures::get_hashed_vector(vec, dim); +} + +template +SGSparseVector CHashedSparseFeatures::hash_vector(SGSparseVector vec, int32_t dim) +{ CDynamicArray indices(vec.num_feat_entries); for (index_t i=0; i class CHashedSparseFeatures : public CDotFeatures * * @param size cache size */ - CHashedSparseFeatures (int32_t size=0); + CHashedSparseFeatures(int32_t size=0); /** constructor * * @param feats the sparse features to use as a base * @param d new feature space dimension */ - CHashedSparseFeatures (CSparseFeatures* feats, int32_t d); + CHashedSparseFeatures(CSparseFeatures* feats, int32_t d); /** constructor * * @param matrix feature matrix * @param d new feature space dimension */ - CHashedSparseFeatures (SGSparseMatrix matrix, int32_t d); + CHashedSparseFeatures(SGSparseMatrix matrix, int32_t d); /** constructor loading features from file * * @param loader File object via which to load data * @param d new feature space dimension */ - CHashedSparseFeatures (CFile* loader, int32_t d); + CHashedSparseFeatures(CFile* loader, int32_t d); /** copy constructor */ - CHashedSparseFeatures (const CHashedSparseFeatures & orig); + CHashedSparseFeatures(const CHashedSparseFeatures & orig); /** duplicate */ virtual CFeatures* duplicate() const; /** destructor */ - 
virtual ~CHashedSparseFeatures (); + virtual ~CHashedSparseFeatures(); /** obtain the dimensionality of the feature space * @@ -178,6 +178,22 @@ template class CHashedSparseFeatures : public CDotFeatures */ SGSparseVector get_hashed_feature_vector(int32_t vec_idx) const; + /** Get the hashed representation of the given vector + * + * @param vec the vector to hash + * @param dim the dimension of the new feature space + * @return the hashed representation of the vector vec + */ + static SGSparseVector hash_vector(SGVector vec, int32_t dim); + + + /** Get the hashed representation of the given sparse vector + * + * @param vec the vector to hash + * @param dim the dimension of the hashed target space + * @return the hashed representation of the vector vec + */ + static SGSparseVector hash_vector(SGSparseVector vec, int32_t dim); protected: void init(CSparseFeatures* feats, int32_t d); diff --git a/src/shogun/features/streaming/StreamingHashedSparseFeatures.cpp b/src/shogun/features/streaming/StreamingHashedSparseFeatures.cpp new file mode 100644 index 00000000000..bd2258bd861 --- /dev/null +++ b/src/shogun/features/streaming/StreamingHashedSparseFeatures.cpp @@ -0,0 +1,225 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. 
+ * + * Written (W) 2013 Evangelos Anagnostopoulos + * Copyright (C) 2013 Evangelos Anagnostopoulos + */ + +#include +#include +#include + +namespace shogun +{ + +template +CStreamingHashedSparseFeatures::CStreamingHashedSparseFeatures() +{ + init(NULL, false, 0, 0); +} + +template +CStreamingHashedSparseFeatures::CStreamingHashedSparseFeatures(CStreamingFile* file, + bool is_labelled, int32_t size, int32_t d) +{ + init(file, is_labelled, size, d); +} + +template +CStreamingHashedSparseFeatures::CStreamingHashedSparseFeatures(CSparseFeatures* dot_features, + int32_t d, float64_t* lab) +{ + ASSERT(dot_features); + + CStreamingFileFromSparseFeatures* file = + new CStreamingFileFromSparseFeatures(dot_features, lab); + bool is_labelled = (lab != NULL); + int32_t size = 1024; + + init(file, is_labelled, size, d); + + parser.set_free_vectors_on_destruct(false); + seekable=true; +} + +template +CStreamingHashedSparseFeatures::~CStreamingHashedSparseFeatures() +{ +} + +template +void CStreamingHashedSparseFeatures::init(CStreamingFile* file, bool is_labelled, + int32_t size, int32_t d) +{ + dim = d; + SG_ADD(&dim, "dim", "Size of target dimension", MS_NOT_AVAILABLE); + + has_labels = is_labelled; + if (file) + { + working_file = file; + SG_REF(working_file); + parser.init(file, is_labelled, size); + seekable = false; + } + else + file = NULL; + + set_read_functions(); + parser.set_free_vector_after_release(false); +} + +template +float32_t CStreamingHashedSparseFeatures::dot(CStreamingDotFeatures* df) +{ + ASSERT(df); + ASSERT(df->get_feature_type() == get_feature_type()) + ASSERT(strcmp(df->get_name(),get_name())==0) + + CStreamingHashedSparseFeatures* hdf = (CStreamingHashedSparseFeatures* ) df; + return current_vector.sparse_dot(hdf->current_vector); +} + +template +float32_t CStreamingHashedSparseFeatures::dense_dot(const float32_t* vec2, int32_t vec2_len) +{ + ASSERT(vec2_len == dim); + + float32_t result = 0; + for (index_t i=0; i +void 
CStreamingHashedSparseFeatures::add_to_dense_vec(float32_t alpha, float32_t* vec2, + int32_t vec2_len, bool abs_val) +{ + ASSERT(vec2_len == dim); + + if (abs_val) + alpha = CMath::abs(alpha); + + for (index_t i=0; i +int32_t CStreamingHashedSparseFeatures::get_dim_feature_space() const +{ + return dim; +} + +template +const char* CStreamingHashedSparseFeatures::get_name() const +{ + return "StreamingHashedSparseFeatures"; +} + +template +int32_t CStreamingHashedSparseFeatures::get_num_vectors() const +{ + return 1; +} + +template +CFeatures* CStreamingHashedSparseFeatures::duplicate() const +{ + return new CStreamingHashedSparseFeatures(*this); +} + +template +void CStreamingHashedSparseFeatures::set_vector_reader() +{ + SG_DEBUG("called inside set_vector_reader\n"); + parser.set_read_vector(&CStreamingFile::get_sparse_vector); +} + +template +void CStreamingHashedSparseFeatures::set_vector_and_label_reader() +{ + parser.set_read_vector_and_label(&CStreamingFile::get_sparse_vector_and_label); +} + +template +EFeatureType CStreamingHashedSparseFeatures::get_feature_type() const +{ + return F_UINT; +} + +template +EFeatureClass CStreamingHashedSparseFeatures::get_feature_class() const +{ + return C_STREAMING_SPARSE; +} + +template +void CStreamingHashedSparseFeatures::start_parser() +{ + if (!parser.is_running()) + parser.start_parser(); +} + +template +void CStreamingHashedSparseFeatures::end_parser() +{ + parser.end_parser(); +} + +template +float64_t CStreamingHashedSparseFeatures::get_label() +{ + return current_label; +} + +template +bool CStreamingHashedSparseFeatures::get_next_example() +{ + SGSparseVector tmp; + if (parser.get_next_example(tmp.features, + tmp.num_feat_entries, current_label)) + { + current_vector = CHashedSparseFeatures::hash_vector(tmp, dim); + tmp.features = NULL; + tmp.num_feat_entries = -1; + return true; + } + return false; +} + +template +void CStreamingHashedSparseFeatures::release_example() +{ + parser.finalize_example(); +} + 
+template +int32_t CStreamingHashedSparseFeatures::get_num_features() +{ + return dim; +} + +template +SGSparseVector CStreamingHashedSparseFeatures::get_vector() +{ + return current_vector; +} + +template class CStreamingHashedSparseFeatures; +template class CStreamingHashedSparseFeatures; +template class CStreamingHashedSparseFeatures; +template class CStreamingHashedSparseFeatures; +template class CStreamingHashedSparseFeatures; +template class CStreamingHashedSparseFeatures; +template class CStreamingHashedSparseFeatures; +template class CStreamingHashedSparseFeatures; +template class CStreamingHashedSparseFeatures; +template class CStreamingHashedSparseFeatures; +template class CStreamingHashedSparseFeatures; +template class CStreamingHashedSparseFeatures; +template class CStreamingHashedSparseFeatures; +} diff --git a/src/shogun/features/streaming/StreamingHashedSparseFeatures.h b/src/shogun/features/streaming/StreamingHashedSparseFeatures.h new file mode 100644 index 00000000000..a3ef67f513a --- /dev/null +++ b/src/shogun/features/streaming/StreamingHashedSparseFeatures.h @@ -0,0 +1,224 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * Written (W) 2013 Evangelos Anagnostopoulos + * Copyright (C) 2013 Evangelos Anagnostopoulos + */ + +#ifndef _STREAMING_HASHED_SPARSEFEATURES__H__ +#define _STREAMING_HASHED_SPARSEFEATURES__H__ + +#include +#include +#include + +namespace shogun +{ +class CStreamingDotFeatures; + +/** @brief This class acts as an alternative to the CStreamingSparseFeatures class + * and their difference is that the current example in this class is hashed into + * a smaller dimension dim. + * + * The current example is stored as a combination of current_vector + * and current_label. 
Call get_next_example() followed by get_vector() + * to iterate through the stream. + */ +template class CStreamingHashedSparseFeatures : public CStreamingDotFeatures +{ +public: + /** Constructor */ + CStreamingHashedSparseFeatures(); + + /** + * Constructor with input information passed. + * + * @param file CStreamingFile to take input from. + * @param is_labelled Whether examples are labelled or not. + * @param size Number of examples to be held in the parser's "ring". + * @param d the dimensionality of the target feature space + */ + CStreamingHashedSparseFeatures (CStreamingFile* file, bool is_labelled, int32_t size, + int32_t d = 512); + + /** + * Constructor taking a CDotFeatures object and optionally, + * labels, as args. + * + * The derived class should implement it so that the + * Streaming*Features class uses the DotFeatures object as the + * input, getting examples one by one from the DotFeatures + * object (and labels, if applicable). + * + * @param dot_features CDotFeatures object + * @param d the dimensionality of the target feature space + * @param lab labels (optional) + */ + CStreamingHashedSparseFeatures (CSparseFeatures* dot_features, int32_t d = 512, + float64_t* lab = NULL); + + /** Destructor */ + virtual ~CStreamingHashedSparseFeatures (); + + /** compute dot product between vectors of two + * StreamingDotFeatures objects. 
+ * + * @param df StreamingDotFeatures (of same kind) to compute + * dot product with + */ + virtual float32_t dot(CStreamingDotFeatures* df); + + /** compute dot product between current vector and a dense vector + * + * @param vec2 real valued vector + * @param vec2_len length of vector + */ + virtual float32_t dense_dot(const float32_t* vec2, int32_t vec2_len); + + /** add current vector multiplied with alpha to dense vector, 'vec' + * + * @param alpha scalar alpha + * @param vec2 real valued vector to add to + * @param vec2_len length of vector + * @param abs_val if true add the absolute value + */ + virtual void add_to_dense_vec(float32_t alpha, float32_t* vec2, + int32_t vec2_len, bool abs_val = false); + + /** obtain the dimensionality of the feature space + * + * (not mix this up with the dimensionality of the input space, usually + * obtained via get_num_features()) + * + * @return dimensionality + */ + virtual int32_t get_dim_feature_space() const; + + /** + * Return the name. + * + * @return the name of the class + */ + virtual const char* get_name() const; + + /** + * Return the number of vectors stored in this object. + * + * @return 1 if current_vector exists, else 0. + */ + virtual int32_t get_num_vectors() const; + + /** + * Duplicate the object. + * + * @return a duplicate object as CFeatures* + */ + virtual CFeatures* duplicate() const; + + /** + * Sets the read function (in case the examples are + * unlabelled) to get_*_vector() from CStreamingFile. + * + * The exact function depends on type ST. + * + * The parser uses the function set by this while reading + * unlabelled examples. + */ + virtual void set_vector_reader(); + + /** + * Sets the read function (in case the examples are labelled) + * to get_*_vector_and_label from CStreamingFile. + * + * The exact function depends on type ST. + * + * The parser uses the function set by this while reading + * labelled examples. 
+ */ + virtual void set_vector_and_label_reader(); + + /** + * Return the feature type, depending on ST. + * + * @return Feature type as EFeatureType + */ + virtual EFeatureType get_feature_type() const; + + /** + * Return the feature class + * + * @return C_STREAMING_SPARSE + */ + virtual EFeatureClass get_feature_class() const; + + /** + * Start the parser. + * It stores parsed examples from the input in a separate thread. + */ + virtual void start_parser(); + + /** + * End the parser. Wait for the parsing thread to complete. + */ + virtual void end_parser(); + + /** + * Return the label of the current example. + * + * Raise an error if the input has been specified as unlabelled. + * + * @return Label (if labelled example) + */ + virtual float64_t get_label(); + + /** + * Indicate to the parser that it must fetch the next example. + * + * @return true on success, false on failure (i.e., no more examples). + */ + virtual bool get_next_example(); + + /** + * Indicate that processing of the current example is done. + * The parser then considers it safe to dispose of that example + * and replace it with another one. + */ + virtual void release_example(); + + /** + * Get the number of features in the current example. 
+ * + * @return number of features in current example + */ + virtual int32_t get_num_features(); + + /** Get the current example + * + * @return a SGSparseVector representing the hashed version of the sparse vector last read + */ + SGSparseVector get_vector(); + +private: + void init(CStreamingFile* file, bool is_labelled, int32_t size, + int32_t d); + +protected: + + /** dimensionality of new feature space */ + int32_t dim; + + /** Current example */ + SGSparseVector current_vector; + + /** The parser */ + CInputParser > parser; + + /** The current example's label */ + float64_t current_label; +}; +} + +#endif // _STREAMING_HASHED_SPARSEFEATURES__H__ diff --git a/src/shogun/io/streaming/StreamingFileFromSparseFeatures.h b/src/shogun/io/streaming/StreamingFileFromSparseFeatures.h index 26af09d21c1..a9406455e4c 100644 --- a/src/shogun/io/streaming/StreamingFileFromSparseFeatures.h +++ b/src/shogun/io/streaming/StreamingFileFromSparseFeatures.h @@ -89,7 +89,7 @@ template class CStreamingFileFromSparseFeatures: public CStreamingFile /** * Initialize members to defaults */ - void init(); + void init(CSparseFeatures* feat); protected: /// SparseFeatures object @@ -104,31 +104,34 @@ template CStreamingFileFromSparseFeatures::CStreamingFileFromSparseFeatures() : CStreamingFileFromFeatures() { - init(); + init(NULL); } template CStreamingFileFromSparseFeatures::CStreamingFileFromSparseFeatures(CSparseFeatures* feat) : CStreamingFileFromFeatures(feat) { - init(); + init(feat); } template CStreamingFileFromSparseFeatures::CStreamingFileFromSparseFeatures(CSparseFeatures* feat, float64_t* lab) : CStreamingFileFromFeatures(feat,lab) { - init(); + init(feat); } template CStreamingFileFromSparseFeatures::~CStreamingFileFromSparseFeatures() { + SG_UNREF(features); } template -void CStreamingFileFromSparseFeatures::init() +void CStreamingFileFromSparseFeatures::init(CSparseFeatures* feat) { + features = feat; + SG_REF(features); vector_num=0; } @@ -145,7 +148,7 @@ void 
CStreamingFileFromSparseFeatures::get_sparse_vector } SGSparseVector vec= - ((CSparseFeatures*)this)->get_sparse_feature_vector(vector_num); + features->get_sparse_feature_vector(vector_num); vector=vec.features; len=vec.num_feat_entries; diff --git a/tests/unit/features/StreamingHashedSparseFeatures_unittest.cc b/tests/unit/features/StreamingHashedSparseFeatures_unittest.cc new file mode 100644 index 00000000000..b74bce63fbb --- /dev/null +++ b/tests/unit/features/StreamingHashedSparseFeatures_unittest.cc @@ -0,0 +1,155 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * Written (W) 2013 Evangelos Anagnostopoulos + */ + +#include +#include +#include +#include +#include + +using namespace shogun; + +TEST(StreamingHashedSparseFeaturesTest, dot) +{ + index_t n=3; + index_t dim=10; + + SGMatrix data(dim,n); + for (index_t i=0; i* d_feats = new CSparseFeatures(data); + CStreamingHashedSparseFeatures* h_feats = + new CStreamingHashedSparseFeatures(d_feats, hashing_dim); + + h_feats->start_parser(); + index_t i; + for (i=0; iget_next_example(); i++) + { + SGVector tmp(hashing_dim); + SGVector::fill_vector(tmp, hashing_dim, 0); + for (index_t j=0; jdot(h_feats); + EXPECT_EQ(feat_dot, dot_product); + h_feats->release_example(); + } + h_feats->end_parser(); + + EXPECT_EQ(i, n); + + SG_UNREF(h_feats); +} + +TEST(StreamingHashedSparseFeaturesTest, dense_dot) +{ + index_t n=3; + index_t dim=10; + + SGMatrix data(dim,n); + for (index_t i=0; i* d_feats = new CSparseFeatures(data); + CStreamingHashedSparseFeatures* h_feats = + new CStreamingHashedSparseFeatures(d_feats, hashing_dim); + + h_feats->start_parser(); + for (index_t i=0; iget_next_example(); i++) + { + SGVector tmp(hashing_dim); + SGVector::fill_vector(tmp, hashing_dim, 0); + for (index_t j=0; 
jdense_dot(tmp.vector, tmp.vlen); + EXPECT_EQ(feat_dot, dot_product); + + h_feats->release_example(); + } + h_feats->end_parser(); + + SG_UNREF(h_feats); +} + +TEST(StreamingHashedSparseFeaturesTest, add_to_dense) +{ + index_t n=3; + index_t dim=10; + + SGMatrix data(dim,n); + for (index_t i=0; i* d_feats = new CSparseFeatures(data); + CStreamingHashedSparseFeatures* h_feats = + new CStreamingHashedSparseFeatures(d_feats, hashing_dim); + + h_feats->start_parser(); + for (index_t i=0; iget_next_example(); i++) + { + SGVector tmp(hashing_dim); + SGVector::fill_vector(tmp, hashing_dim, 0); + for (index_t j=0; j tmp2(hashing_dim); + for (index_t j=0; jadd_to_dense_vec(2, tmp.vector, tmp.vlen); + for (index_t j=0; jrelease_example(); + } + h_feats->end_parser(); + + SG_UNREF(h_feats); +}