Navigation Menu

Skip to content

Commit

Permalink
Merge pull request #1262 from van51/feature/streaming_sparse
Browse files Browse the repository at this point in the history
StreamingHashedSparseFeatures class
  • Loading branch information
Soeren Sonnenburg committed Jul 19, 2013
2 parents 646d0a6 + 9268bff commit 42fbf1b
Show file tree
Hide file tree
Showing 6 changed files with 648 additions and 13 deletions.
14 changes: 13 additions & 1 deletion src/shogun/features/HashedSparseFeatures.cpp
Expand Up @@ -9,6 +9,7 @@
*/

#include <shogun/features/HashedSparseFeatures.h>
#include <shogun/features/HashedDenseFeatures.h>
#include <shogun/base/Parameter.h>
#include <shogun/lib/Hash.h>
#include <shogun/io/SGIO.h>
Expand Down Expand Up @@ -91,7 +92,18 @@ template <class ST>
SGSparseVector<uint32_t> CHashedSparseFeatures<ST>::get_hashed_feature_vector(
		int32_t vec_idx) const
{
	/* Return the hashed representation of the vec_idx-th vector of the
	 * underlying sparse features.
	 *
	 * The vector is fetched exactly once and then handed to the static
	 * hash_vector() helper together with the target dimension dim.
	 */
	SGSparseVector<ST> vec = sparse_feats->get_sparse_feature_vector(vec_idx);
	return CHashedSparseFeatures<ST>::hash_vector(vec, dim);
}

/** Hash a dense vector into a sparse vector.
 *
 * Forwards to CHashedDenseFeatures<ST>::get_hashed_vector().
 *
 * @param vec the dense vector to hash
 * @param dim the dimension of the hashed target space
 * @return the hashed representation of vec
 */
template <class ST>
SGSparseVector<uint32_t> CHashedSparseFeatures<ST>::hash_vector(SGVector<ST> vec, int32_t dim)
{
	SGSparseVector<uint32_t> hashed =
		CHashedDenseFeatures<ST>::get_hashed_vector(vec, dim);
	return hashed;
}

template <class ST>
SGSparseVector<uint32_t> CHashedSparseFeatures<ST>::hash_vector(SGSparseVector<ST> vec, int32_t dim)
{
CDynamicArray<index_t> indices(vec.num_feat_entries);
for (index_t i=0; i<vec.num_feat_entries; i++)
{
Expand Down
28 changes: 22 additions & 6 deletions src/shogun/features/HashedSparseFeatures.h
Expand Up @@ -32,37 +32,37 @@ template <class ST> class CHashedSparseFeatures : public CDotFeatures
*
* @param size cache size
*/
CHashedSparseFeatures (int32_t size=0);
CHashedSparseFeatures(int32_t size=0);

/** constructor
*
* @param feats the sparse features to use as a base
* @param d new feature space dimension
*/
CHashedSparseFeatures (CSparseFeatures<ST>* feats, int32_t d);
CHashedSparseFeatures(CSparseFeatures<ST>* feats, int32_t d);

/** constructor
*
* @param matrix feature matrix
* @param d new feature space dimension
*/
CHashedSparseFeatures (SGSparseMatrix<ST> matrix, int32_t d);
CHashedSparseFeatures(SGSparseMatrix<ST> matrix, int32_t d);

/** constructor loading features from file
*
* @param loader File object via which to load data
* @param d new feature space dimension
*/
CHashedSparseFeatures (CFile* loader, int32_t d);
CHashedSparseFeatures(CFile* loader, int32_t d);

/** copy constructor */
CHashedSparseFeatures (const CHashedSparseFeatures & orig);
CHashedSparseFeatures(const CHashedSparseFeatures & orig);

/** duplicate */
virtual CFeatures* duplicate() const;

/** destructor */
virtual ~CHashedSparseFeatures ();
virtual ~CHashedSparseFeatures();

/** obtain the dimensionality of the feature space
*
Expand Down Expand Up @@ -178,6 +178,22 @@ template <class ST> class CHashedSparseFeatures : public CDotFeatures
*/
SGSparseVector<uint32_t> get_hashed_feature_vector(int32_t vec_idx) const;

/** Get the hashed representation of the given vector
*
* @param vec the vector to hash
* @param dim the dimension of the new feature space
* @return the hashed representation of the vector vec
*/
static SGSparseVector<uint32_t> hash_vector(SGVector<ST> vec, int32_t dim);


/** Get the hashed representation of the given sparse vector
*
* @param vec the vector to hash
* @param dim the dimension of the hashed target space
* @return the hashed representation of the vector vec
*/
static SGSparseVector<uint32_t> hash_vector(SGSparseVector<ST> vec, int32_t dim);
protected:
void init(CSparseFeatures<ST>* feats, int32_t d);

Expand Down
225 changes: 225 additions & 0 deletions src/shogun/features/streaming/StreamingHashedSparseFeatures.cpp
@@ -0,0 +1,225 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2013 Evangelos Anagnostopoulos
* Copyright (C) 2013 Evangelos Anagnostopoulos
*/

#include <shogun/features/streaming/StreamingHashedSparseFeatures.h>
#include <shogun/features/HashedSparseFeatures.h>
#include <shogun/io/streaming/StreamingFileFromSparseFeatures.h>

namespace shogun
{

/** Default constructor.
 *
 * Calls init() with no streaming file, labels disabled, and zero for both
 * the size argument and the target dimension.
 */
template <class ST>
CStreamingHashedSparseFeatures<ST>::CStreamingHashedSparseFeatures()
{
	CStreamingFile* const no_file = NULL;
	init(no_file, false, 0, 0);
}

/** Constructor reading examples from a streaming file.
 *
 * All arguments are forwarded unchanged to init().
 *
 * @param file streaming file to read from
 * @param is_labelled whether the examples carry labels
 * @param size size argument handed to the parser
 * @param d dimension of the hashed target space
 */
template <class ST>
CStreamingHashedSparseFeatures<ST>::CStreamingHashedSparseFeatures(
		CStreamingFile* file, bool is_labelled, int32_t size, int32_t d)
{
	init(file, is_labelled, size, d);
}

/** Constructor streaming from in-memory sparse features.
 *
 * @param dot_features sparse features to stream from; must not be NULL
 * @param d dimension of the hashed target space
 * @param lab optional labels; when NULL the stream is set up as unlabelled
 */
template <class ST>
CStreamingHashedSparseFeatures<ST>::CStreamingHashedSparseFeatures(
		CSparseFeatures<ST>* dot_features, int32_t d, float64_t* lab)
{
	ASSERT(dot_features);

	// Wrap the features in a streaming-file object that init() hands to the parser.
	CStreamingFileFromSparseFeatures<ST>* source =
		new CStreamingFileFromSparseFeatures<ST>(dot_features, lab);

	// 1024 is the size argument forwarded to parser.init() by init().
	init(source, lab != NULL, 1024, d);

	// init() leaves seekable false when a file is given; this source overrides it.
	parser.set_free_vectors_on_destruct(false);
	seekable = true;
}

/** Destructor; performs no work of its own in this class. */
template <class ST>
CStreamingHashedSparseFeatures<ST>::~CStreamingHashedSparseFeatures()
{
	// intentionally empty
}

/** Shared initialisation used by all constructors.
 *
 * Stores the target dimension, registers it as a parameter, records whether
 * examples are labelled and, when a file is given, takes a reference on it
 * and initialises the parser with it.
 *
 * @param file streaming file to read from, or NULL for none
 * @param is_labelled whether the examples carry labels
 * @param size size argument forwarded to parser.init()
 * @param d dimension of the hashed target space
 */
template <class ST>
void CStreamingHashedSparseFeatures<ST>::init(CStreamingFile* file, bool is_labelled,
		int32_t size, int32_t d)
{
	dim = d;
	SG_ADD(&dim, "dim", "Size of target dimension", MS_NOT_AVAILABLE);

	has_labels = is_labelled;
	if (file)
	{
		working_file = file;
		SG_REF(working_file);
		parser.init(file, is_labelled, size);
		seekable = false;
	}
	else
	{
		// Reset the member rather than the by-value parameter, which is
		// already NULL on this path; assigning to `file` had no effect.
		working_file = NULL;
	}

	set_read_functions();
	parser.set_free_vector_after_release(false);
}

/** Dot product between the current hashed vector of this object and the
 * current hashed vector of another streaming feature object.
 *
 * @param df the other feature object; must be non-NULL and report the same
 *        feature type and name as this object
 * @return result of sparse_dot() on the two current vectors
 */
template <class ST>
float32_t CStreamingHashedSparseFeatures<ST>::dot(CStreamingDotFeatures* df)
{
	ASSERT(df);
	ASSERT(df->get_feature_type() == get_feature_type());
	ASSERT(strcmp(df->get_name(), get_name()) == 0);

	// The name and type checks above guard this downcast.
	CStreamingHashedSparseFeatures<ST>* other =
		(CStreamingHashedSparseFeatures<ST>*) df;
	return current_vector.sparse_dot(other->current_vector);
}

/** Dot product between the current hashed vector and a dense vector.
 *
 * @param vec2 dense vector
 * @param vec2_len length of vec2; must equal dim
 * @return sum over the stored entries of vec2[feat_index] * entry
 */
template <class ST>
float32_t CStreamingHashedSparseFeatures<ST>::dense_dot(const float32_t* vec2, int32_t vec2_len)
{
	ASSERT(vec2_len == dim);

	float32_t acc = 0;
	index_t k = 0;
	while (k < current_vector.num_feat_entries)
	{
		acc += vec2[current_vector.features[k].feat_index] * current_vector.features[k].entry;
		k++;
	}

	return acc;
}

/** Add alpha times the current hashed vector onto a dense vector.
 *
 * @param alpha scaling factor
 * @param vec2 dense vector that is updated in place
 * @param vec2_len length of vec2; must equal dim
 * @param abs_val if true, the absolute value of alpha is used instead
 */
template <class ST>
void CStreamingHashedSparseFeatures<ST>::add_to_dense_vec(float32_t alpha, float32_t* vec2,
		int32_t vec2_len, bool abs_val)
{
	ASSERT(vec2_len == dim);

	if (abs_val)
	{
		alpha = CMath::abs(alpha);
	}

	index_t k = 0;
	while (k < current_vector.num_feat_entries)
	{
		vec2[current_vector.features[k].feat_index] += alpha * current_vector.features[k].entry;
		k++;
	}
}

/** Obtain the dimensionality of the feature space.
 *
 * @return the target dimension dim set in init()
 */
template <class ST>
int32_t CStreamingHashedSparseFeatures<ST>::get_dim_feature_space() const
{
	const int32_t feature_space_dim = dim;
	return feature_space_dim;
}

/** @return the object name, "StreamingHashedSparseFeatures" */
template <class ST>
const char* CStreamingHashedSparseFeatures<ST>::get_name() const
{
	const char* const object_name = "StreamingHashedSparseFeatures";
	return object_name;
}

/** Number of vectors reported by this object.
 *
 * @return always 1 (presumably because only the current example is held;
 *         confirm against the streaming base class contract)
 */
template <class ST>
int32_t CStreamingHashedSparseFeatures<ST>::get_num_vectors() const
{
	const int32_t reported_count = 1;
	return reported_count;
}

/** Duplicate this object.
 *
 * @return a newly allocated object copy-constructed from *this
 */
template <class ST>
CFeatures* CStreamingHashedSparseFeatures<ST>::duplicate() const
{
	CStreamingHashedSparseFeatures<ST>* copy =
		new CStreamingHashedSparseFeatures<ST>(*this);
	return copy;
}

/** Register CStreamingFile::get_sparse_vector as the parser's function for
 * reading an unlabelled vector.
 */
template <class ST>
void CStreamingHashedSparseFeatures<ST>::set_vector_reader()
{
	SG_DEBUG("called inside set_vector_reader\n");

	parser.set_read_vector(
		&CStreamingFile::get_sparse_vector);
}

/** Register CStreamingFile::get_sparse_vector_and_label as the parser's
 * function for reading a vector together with its label.
 */
template <class ST>
void CStreamingHashedSparseFeatures<ST>::set_vector_and_label_reader()
{
	parser.set_read_vector_and_label(
		&CStreamingFile::get_sparse_vector_and_label);
}

/** Feature type of the hashed vectors.
 *
 * @return F_UINT for every ST; get_vector() returns uint32_t entries
 */
template <class ST>
EFeatureType CStreamingHashedSparseFeatures<ST>::get_feature_type() const
{
	const EFeatureType hashed_type = F_UINT;
	return hashed_type;
}

/** Feature class of this object.
 *
 * @return C_STREAMING_SPARSE
 */
template <class ST>
EFeatureClass CStreamingHashedSparseFeatures<ST>::get_feature_class() const
{
	const EFeatureClass feature_class = C_STREAMING_SPARSE;
	return feature_class;
}

/** Start the parser unless it reports that it is already running. */
template <class ST>
void CStreamingHashedSparseFeatures<ST>::start_parser()
{
	if (parser.is_running())
		return;

	parser.start_parser();
}

/** End the parser by forwarding to parser.end_parser(). */
template <class ST>
void CStreamingHashedSparseFeatures<ST>::end_parser()
{
	// Unconditional, unlike start_parser(): no is_running() check is made here.
	parser.end_parser();
}

/** Label of the current example.
 *
 * @return current_label as last filled in by get_next_example(); no check
 *         is made here that the stream is labelled
 */
template <class ST>
float64_t CStreamingHashedSparseFeatures<ST>::get_label()
{
	const float64_t label = current_label;
	return label;
}

/** Fetch the next example from the parser and hash it.
 *
 * On success the hashed vector is stored in current_vector and the label
 * in current_label.
 *
 * @return true if an example was obtained, false otherwise
 */
template <class ST>
bool CStreamingHashedSparseFeatures<ST>::get_next_example()
{
	SGSparseVector<ST> raw;
	const bool got_example = parser.get_next_example(raw.features,
			raw.num_feat_entries, current_label);
	if (!got_example)
		return false;

	current_vector = CHashedSparseFeatures<ST>::hash_vector(raw, dim);

	// Detach raw from the parser-provided buffer before it goes out of scope
	// (presumably so raw's destructor does not free it; confirm).
	raw.features = NULL;
	raw.num_feat_entries = -1;
	return true;
}

/** Release the current example by calling parser.finalize_example(). */
template <class ST>
void CStreamingHashedSparseFeatures<ST>::release_example()
{
	// The parser is told the example is finished; current_vector itself is not touched here.
	parser.finalize_example();
}

/** Number of features.
 *
 * @return the target dimension dim, the same value as get_dim_feature_space()
 */
template <class ST>
int32_t CStreamingHashedSparseFeatures<ST>::get_num_features()
{
	const int32_t num_features = dim;
	return num_features;
}

/** Access the current hashed example.
 *
 * @return current_vector, as last set by get_next_example()
 */
template <class ST>
SGSparseVector<uint32_t> CStreamingHashedSparseFeatures<ST>::get_vector()
{
	SGSparseVector<uint32_t> hashed = current_vector;
	return hashed;
}

// Explicit instantiations: the member functions above are defined in this
// translation unit, so the class is instantiated here for each element
// type ST listed below.
template class CStreamingHashedSparseFeatures<bool>;
template class CStreamingHashedSparseFeatures<char>;
template class CStreamingHashedSparseFeatures<int8_t>;
template class CStreamingHashedSparseFeatures<uint8_t>;
template class CStreamingHashedSparseFeatures<int16_t>;
template class CStreamingHashedSparseFeatures<uint16_t>;
template class CStreamingHashedSparseFeatures<int32_t>;
template class CStreamingHashedSparseFeatures<uint32_t>;
template class CStreamingHashedSparseFeatures<int64_t>;
template class CStreamingHashedSparseFeatures<uint64_t>;
template class CStreamingHashedSparseFeatures<float32_t>;
template class CStreamingHashedSparseFeatures<float64_t>;
template class CStreamingHashedSparseFeatures<floatmax_t>;
}

0 comments on commit 42fbf1b

Please sign in to comment.