Skip to content

Commit

Permalink
Quadratic support for the rest of the numerical hashed classes
Browse files Browse the repository at this point in the history
  • Loading branch information
van51 committed Jul 25, 2013
1 parent da8857a commit 627d8cd
Show file tree
Hide file tree
Showing 7 changed files with 388 additions and 52 deletions.
138 changes: 113 additions & 25 deletions src/shogun/features/HashedSparseFeatures.cpp
Expand Up @@ -22,42 +22,50 @@
namespace shogun {

template <class ST>
CHashedSparseFeatures<ST>::CHashedSparseFeatures(int32_t size)
: CDotFeatures(size)
CHashedSparseFeatures<ST>::CHashedSparseFeatures(int32_t size, bool use_quadr,
bool keep_lin_terms) : CDotFeatures(size)
{
init(NULL, 0);
init(NULL, 0, use_quadr, keep_lin_terms);
}

template <class ST>
CHashedSparseFeatures<ST>::CHashedSparseFeatures(CSparseFeatures<ST>* feats, int32_t d)
: CDotFeatures()
CHashedSparseFeatures<ST>::CHashedSparseFeatures(CSparseFeatures<ST>* feats, int32_t d,
bool use_quadr, bool keep_lin_terms) : CDotFeatures()
{
init(feats, d);
init(feats, d, use_quadr, keep_lin_terms);
}

template <class ST>
CHashedSparseFeatures<ST>::CHashedSparseFeatures(SGSparseMatrix<ST> matrix, int32_t d)
: CDotFeatures()
CHashedSparseFeatures<ST>::CHashedSparseFeatures(SGSparseMatrix<ST> matrix, int32_t d,
bool use_quadr, bool keep_lin_terms) : CDotFeatures()
{
CSparseFeatures<ST>* feats = new CSparseFeatures<ST>(matrix);
init(feats, d);
init(feats, d, use_quadr, keep_lin_terms);
}

template <class ST>
CHashedSparseFeatures<ST>::CHashedSparseFeatures(CFile* loader, int32_t d)
: CDotFeatures(loader)
CHashedSparseFeatures<ST>::CHashedSparseFeatures(CFile* loader, int32_t d, bool use_quadr,
bool keep_lin_terms) : CDotFeatures(loader)
{
CSparseFeatures<ST>* feats = new CSparseFeatures<ST>();
feats->load(loader);
init(feats, d);
init(feats, d, use_quadr, keep_lin_terms);
}

template <class ST>
void CHashedSparseFeatures<ST>::init(CSparseFeatures<ST>* feats, int32_t d)
void CHashedSparseFeatures<ST>::init(CSparseFeatures<ST>* feats, int32_t d, bool use_quadr,
bool keep_lin_terms)
{
dim = d;
use_quadratic = use_quadr;
keep_linear_terms = keep_lin_terms;
sparse_feats = feats;
SG_REF(sparse_feats);

SG_ADD(&use_quadratic, "use_quadratic", "Whether to use quadratic features",
MS_NOT_AVAILABLE);
SG_ADD(&keep_linear_terms, "keep_linear_terms", "Whether to keep the linear terms or not",
MS_NOT_AVAILABLE);
SG_ADD(&dim, "dim", "Dimension of new feature space", MS_NOT_AVAILABLE);
SG_ADD((CSGObject** ) &sparse_feats, "sparse_feats ", "Sparse features to work on",
MS_NOT_AVAILABLE);
Expand All @@ -69,7 +77,7 @@ template <class ST>
CHashedSparseFeatures<ST>::CHashedSparseFeatures(const CHashedSparseFeatures& orig)
: CDotFeatures(orig)
{
init(orig.sparse_feats, orig.dim);
init(orig.sparse_feats, orig.dim, orig.use_quadratic, orig.keep_linear_terms);
}

template <class ST>
Expand All @@ -94,25 +102,55 @@ template <class ST>
SGSparseVector<ST> CHashedSparseFeatures<ST>::get_hashed_feature_vector(
int32_t vec_idx) const
{
return CHashedSparseFeatures<ST>::hash_vector(sparse_feats->get_sparse_feature_vector(vec_idx), dim);
return CHashedSparseFeatures<ST>::hash_vector(sparse_feats->get_sparse_feature_vector(vec_idx),
dim, use_quadratic, keep_linear_terms);
}

template <class ST>
SGSparseVector<ST> CHashedSparseFeatures<ST>::hash_vector(SGVector<ST> vec, int32_t dim)
SGSparseVector<ST> CHashedSparseFeatures<ST>::hash_vector(SGVector<ST> vec, int32_t dim,
bool use_quadratic, bool keep_linear_terms)
{
return CHashedDenseFeatures<ST>::hash_vector(vec, dim);
return CHashedDenseFeatures<ST>::hash_vector(vec, dim, use_quadratic, keep_linear_terms);
}

template <class ST>
SGSparseVector<ST> CHashedSparseFeatures<ST>::hash_vector(SGSparseVector<ST> vec, int32_t dim)
SGSparseVector<ST> CHashedSparseFeatures<ST>::hash_vector(SGSparseVector<ST> vec, int32_t dim,
bool use_quadratic, bool keep_linear_terms)
{
SGVector<ST> h_vec(dim);
SGVector<ST>::fill_vector(h_vec, dim, 0);

int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0;
SGVector<uint32_t> hash_cache(hash_cache_size);

for (index_t i=0; i<vec.num_feat_entries; i++)
{
uint32_t h = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
vec.features[i].feat_index);
h_vec[h % dim] += vec.features[i].entry;

if (use_quadratic)
hash_cache[i] = hash;

if ( (!use_quadratic) || keep_linear_terms )
h_vec[hash % dim] += vec.features[i].entry;
}

if (use_quadratic)
{
for (index_t i=0; i<vec.num_feat_entries; i++)
{
index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index;
index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof(index_t),
vec.features[i].feat_index) % dim;

h_vec[idx] += vec.features[i].entry * vec.features[i].entry;

for (index_t j=i+1; j<vec.num_feat_entries; j++)
{
idx = (hash_cache[i] ^ hash_cache[j]) % dim;
h_vec[idx] += vec.features[i].entry * vec.features[j].entry;
}
}
}

int32_t num_nnz_features = 0;
Expand Down Expand Up @@ -162,13 +200,38 @@ float64_t CHashedSparseFeatures<ST>::dense_dot(int32_t vec_idx1, const float64_t

SGSparseVector<ST> vec = sparse_feats->get_sparse_feature_vector(vec_idx1);

int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0;
SGVector<uint32_t> hash_cache(hash_cache_size);

float64_t result = 0;
for (index_t i=0; i<vec.num_feat_entries; i++)
{
uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
vec.features[i].feat_index);
h_idx = h_idx % dim;
result += vec2[h_idx] * vec.features[i].entry;

if (use_quadratic)
hash_cache[i] = hash;

if ( (!use_quadratic) || keep_linear_terms)
result += vec2[hash % dim] * vec.features[i].entry;
}

if (use_quadratic)
{
for (index_t i=0; i<vec.num_feat_entries; i++)
{
index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index;
index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t),
vec.features[i].feat_index) % dim;

result += vec2[idx] * vec.features[i].entry * vec.features[i].entry;

for (index_t j=i+1; j<vec.num_feat_entries; j++)
{
idx = (hash_cache[i] ^ hash_cache[j]) % dim;
result += vec2[idx] * vec.features[i].entry * vec.features[j].entry;
}
}
}

sparse_feats ->free_feature_vector(vec_idx1);
Expand All @@ -184,11 +247,36 @@ void CHashedSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t vec_id

SGSparseVector<ST> vec = sparse_feats->get_sparse_feature_vector(vec_idx1);

int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0;
SGVector<uint32_t> hash_cache(hash_cache_size);

for (index_t i=0; i<vec.num_feat_entries; i++)
{
uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
vec.features[i].feat_index);
vec2[h_idx%dim] += val * vec.features[i].entry;
if (use_quadratic)
hash_cache[i] = hash;

if ( (!use_quadratic) || keep_linear_terms)
vec2[hash % dim] += val * vec.features[i].entry;
}

if (use_quadratic)
{
for (index_t i=0; i<vec.num_feat_entries; i++)
{
index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index;
index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t),
vec.features[i].feat_index) % dim;

vec2[idx] += val * vec.features[i].entry * vec.features[i].entry;

for (index_t j=i+1; j<vec.num_feat_entries; j++)
{
idx = (hash_cache[i] ^ hash_cache[j]) % dim;
vec2[idx] += val * vec.features[i].entry * vec.features[j].entry;
}
}
}
sparse_feats ->free_feature_vector(vec_idx1);
}
Expand Down
37 changes: 30 additions & 7 deletions src/shogun/features/HashedSparseFeatures.h
Expand Up @@ -31,29 +31,40 @@ template <class ST> class CHashedSparseFeatures : public CDotFeatures
/** constructor
*
* @param size cache size
* @param use_quadr whether to use quadratic features or not
* @param keep_lin_terms whether to maintain the linear terms in the computations
*/
CHashedSparseFeatures(int32_t size=0);
CHashedSparseFeatures(int32_t size=0, bool use_quadr = false, bool keep_lin_terms = true);

/** constructor
*
* @param feats the sparse features to use as a base
* @param d new feature space dimension
* @param use_quadr whether to use quadratic features or not
* @param keep_lin_terms whether to maintain the linear terms in the computations
*/
CHashedSparseFeatures(CSparseFeatures<ST>* feats, int32_t d);
CHashedSparseFeatures(CSparseFeatures<ST>* feats, int32_t d, bool use_quadr = false,
bool keep_lin_terms = true);

/** constructor
*
* @param matrix feature matrix
* @param d new feature space dimension
* @param use_quadr whether to use quadratic features or not
* @param keep_lin_terms whether to maintain the linear terms in the computations
*/
CHashedSparseFeatures(SGSparseMatrix<ST> matrix, int32_t d);
CHashedSparseFeatures(SGSparseMatrix<ST> matrix, int32_t d, bool use_quadr = false,
bool keep_lin_terms = true);

/** constructor loading features from file
*
* @param loader File object via which to load data
* @param d new feature space dimension
* @param use_quadr whether to use quadratic features or not
* @param keep_lin_terms whether to maintain the linear terms in the computations
*/
CHashedSparseFeatures(CFile* loader, int32_t d);
CHashedSparseFeatures(CFile* loader, int32_t d, bool use_quadr = false,
bool keep_lin_terms = true);

/** copy constructor */
CHashedSparseFeatures(const CHashedSparseFeatures & orig);
Expand Down Expand Up @@ -182,20 +193,26 @@ template <class ST> class CHashedSparseFeatures : public CDotFeatures
*
* @param vec the vector to hash
* @param dim the dimension of the new feature space
* @param use_quadratic whether to use quadratic features or not
* @param keep_linear_terms whether to maintain the linear terms in the computations
* @return the hashed representation of the vector vec
*/
static SGSparseVector<ST> hash_vector(SGVector<ST> vec, int32_t dim);
static SGSparseVector<ST> hash_vector(SGVector<ST> vec, int32_t dim,
bool use_quadratic = false, bool keep_linear_terms = true);


/** Get the hashed representation of the given sparse vector
*
* @param vec the vector to hash
* @param dim the dimension of the hashed target space
* @param use_quadratic whether to use quadratic features or not
* @param keep_linear_terms whether to maintain the linear terms in the computations
* @return the hashed representation of the vector vec
*/
static SGSparseVector<ST> hash_vector(SGSparseVector<ST> vec, int32_t dim);
static SGSparseVector<ST> hash_vector(SGSparseVector<ST> vec, int32_t dim,
bool use_quadratic = false, bool keep_linear_terms = true);
protected:
void init(CSparseFeatures<ST>* feats, int32_t d);
void init(CSparseFeatures<ST>* feats, int32_t d, bool use_quadr, bool keep_lin_terms);

protected:

Expand All @@ -204,6 +221,12 @@ template <class ST> class CHashedSparseFeatures : public CDotFeatures

/** new feature space dimension */
int32_t dim;

/** use quadratic features */
bool use_quadratic;

/** keep linear terms */
bool keep_linear_terms;
};
}

Expand Down
22 changes: 15 additions & 7 deletions src/shogun/features/streaming/StreamingHashedDenseFeatures.cpp
Expand Up @@ -17,19 +17,19 @@ namespace shogun
template <class ST>
CStreamingHashedDenseFeatures<ST>::CStreamingHashedDenseFeatures()
{
init(NULL, false, 0, 0);
init(NULL, false, 0, 0, false, true);
}

template <class ST>
CStreamingHashedDenseFeatures<ST>::CStreamingHashedDenseFeatures(CStreamingFile* file,
bool is_labelled, int32_t size, int32_t d)
bool is_labelled, int32_t size, int32_t d, bool use_quadr, bool keep_lin_terms)
{
init(file, is_labelled, size, d);
init(file, is_labelled, size, d, use_quadr, keep_lin_terms);
}

template <class ST>
CStreamingHashedDenseFeatures<ST>::CStreamingHashedDenseFeatures(CDenseFeatures<ST>* dot_features,
int32_t d, float64_t* lab)
int32_t d, bool use_quadr, bool keep_lin_terms, float64_t* lab)
{
ASSERT(dot_features);

Expand All @@ -38,7 +38,7 @@ CStreamingHashedDenseFeatures<ST>::CStreamingHashedDenseFeatures(CDenseFeatures<
bool is_labelled = (lab != NULL);
int32_t size = 1024;

init(file, is_labelled, size, d);
init(file, is_labelled, size, d, use_quadr, keep_lin_terms);

parser.set_free_vectors_on_destruct(false);
seekable=true;
Expand All @@ -51,9 +51,16 @@ CStreamingHashedDenseFeatures<ST>::~CStreamingHashedDenseFeatures()

template <class ST>
void CStreamingHashedDenseFeatures<ST>::init(CStreamingFile* file, bool is_labelled,
int32_t size, int32_t d)
int32_t size, int32_t d, bool use_quadr, bool keep_lin_terms)
{
dim = d;
use_quadratic = use_quadr;
keep_linear_terms = keep_lin_terms;

SG_ADD(&use_quadratic, "use_quadratic", "Whether to use quadratic features",
MS_NOT_AVAILABLE);
SG_ADD(&keep_linear_terms, "keep_linear_terms", "Whether to keep the linear terms or not",
MS_NOT_AVAILABLE);
SG_ADD(&dim, "dim", "Size of target dimension", MS_NOT_AVAILABLE);

has_labels = is_labelled;
Expand Down Expand Up @@ -183,7 +190,8 @@ bool CStreamingHashedDenseFeatures<ST>::get_next_example()
if (parser.get_next_example(tmp.vector,
tmp.vlen, current_label))
{
current_vector = CHashedDenseFeatures<ST>::hash_vector(tmp, dim);
current_vector = CHashedDenseFeatures<ST>::hash_vector(tmp, dim, use_quadratic,
keep_linear_terms);
tmp.vector = NULL;
tmp.vlen = -1;
return true;
Expand Down

0 comments on commit 627d8cd

Please sign in to comment.