Quadratic support for the remaining numerical hashed classes

shogun-toolbox · Jul 25, 2013 · 627d8cd · 627d8cd
1 parent da8857a
commit 627d8cd
Show file tree

Hide file tree

Showing 7 changed files with 388 additions and 52 deletions.
diff --git a/src/shogun/features/HashedSparseFeatures.cpp b/src/shogun/features/HashedSparseFeatures.cpp
@@ -22,42 +22,50 @@
 namespace shogun {
 
 template <class ST>
-CHashedSparseFeatures<ST>::CHashedSparseFeatures(int32_t size)
-: CDotFeatures(size)
+CHashedSparseFeatures<ST>::CHashedSparseFeatures(int32_t size, bool use_quadr,
+	bool keep_lin_terms) : CDotFeatures(size)
 {
-	init(NULL, 0);
+	init(NULL, 0, use_quadr, keep_lin_terms);
 }
 
 template <class ST>
-CHashedSparseFeatures<ST>::CHashedSparseFeatures(CSparseFeatures<ST>* feats, int32_t d)
- : CDotFeatures()
+CHashedSparseFeatures<ST>::CHashedSparseFeatures(CSparseFeatures<ST>* feats, int32_t d,
+	bool use_quadr, bool keep_lin_terms) : CDotFeatures()
 {
-	init(feats, d);
+	init(feats, d, use_quadr, keep_lin_terms);
 }
 
 template <class ST>
-CHashedSparseFeatures<ST>::CHashedSparseFeatures(SGSparseMatrix<ST> matrix, int32_t d)
-: CDotFeatures()
+CHashedSparseFeatures<ST>::CHashedSparseFeatures(SGSparseMatrix<ST> matrix, int32_t d,
+	bool use_quadr, bool keep_lin_terms) : CDotFeatures()
 {
 	CSparseFeatures<ST>* feats = new CSparseFeatures<ST>(matrix);
-	init(feats, d);
+	init(feats, d, use_quadr, keep_lin_terms);
 }
 
 template <class ST>
-CHashedSparseFeatures<ST>::CHashedSparseFeatures(CFile* loader, int32_t d)
-: CDotFeatures(loader)
+CHashedSparseFeatures<ST>::CHashedSparseFeatures(CFile* loader, int32_t d, bool use_quadr,
+	bool keep_lin_terms) : CDotFeatures(loader)
 {
 	CSparseFeatures<ST>* feats = new CSparseFeatures<ST>();
 	feats->load(loader);
-	init(feats, d);
+	init(feats, d, use_quadr, keep_lin_terms);
 }
 
 template <class ST>
-void CHashedSparseFeatures<ST>::init(CSparseFeatures<ST>* feats, int32_t d)
+void CHashedSparseFeatures<ST>::init(CSparseFeatures<ST>* feats, int32_t d, bool use_quadr,
+	bool keep_lin_terms)
 {
 	dim = d;
+	use_quadratic = use_quadr;
+	keep_linear_terms = keep_lin_terms;
 	sparse_feats = feats;
 	SG_REF(sparse_feats);
+
+	SG_ADD(&use_quadratic, "use_quadratic", "Whether to use quadratic features",
+		MS_NOT_AVAILABLE);
+	SG_ADD(&keep_linear_terms, "keep_linear_terms", "Whether to keep the linear terms or not",
+		MS_NOT_AVAILABLE);
 	SG_ADD(&dim, "dim", "Dimension of new feature space", MS_NOT_AVAILABLE);
 	SG_ADD((CSGObject** ) &sparse_feats, "sparse_feats ", "Sparse features to work on",
 		MS_NOT_AVAILABLE);
@@ -69,7 +77,7 @@ template <class ST>
 CHashedSparseFeatures<ST>::CHashedSparseFeatures(const CHashedSparseFeatures& orig)
 : CDotFeatures(orig)
 {
-	init(orig.sparse_feats, orig.dim);
+	init(orig.sparse_feats, orig.dim, orig.use_quadratic, orig.keep_linear_terms);
 }
 
 template <class ST>
@@ -94,25 +102,55 @@ template <class ST>
 SGSparseVector<ST> CHashedSparseFeatures<ST>::get_hashed_feature_vector(
 	int32_t vec_idx) const
 {
-	return CHashedSparseFeatures<ST>::hash_vector(sparse_feats->get_sparse_feature_vector(vec_idx), dim);
+	return CHashedSparseFeatures<ST>::hash_vector(sparse_feats->get_sparse_feature_vector(vec_idx),
+		dim, use_quadratic, keep_linear_terms);
 }
 
 template <class ST>
-SGSparseVector<ST> CHashedSparseFeatures<ST>::hash_vector(SGVector<ST> vec, int32_t dim)
+SGSparseVector<ST> CHashedSparseFeatures<ST>::hash_vector(SGVector<ST> vec, int32_t dim,
+	bool use_quadratic, bool keep_linear_terms)
 {
-	return CHashedDenseFeatures<ST>::hash_vector(vec, dim);
+	return CHashedDenseFeatures<ST>::hash_vector(vec, dim, use_quadratic, keep_linear_terms);
 }
 
 template <class ST>
-SGSparseVector<ST> CHashedSparseFeatures<ST>::hash_vector(SGSparseVector<ST> vec, int32_t dim)
+SGSparseVector<ST> CHashedSparseFeatures<ST>::hash_vector(SGSparseVector<ST> vec, int32_t dim,
+	bool use_quadratic, bool keep_linear_terms)
 {
 	SGVector<ST> h_vec(dim);
 	SGVector<ST>::fill_vector(h_vec, dim, 0);
+
+	int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0;
+	SGVector<uint32_t> hash_cache(hash_cache_size);
+
 	for (index_t i=0; i<vec.num_feat_entries; i++)
 	{
-		uint32_t h = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
+		uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
 						vec.features[i].feat_index);
-		h_vec[h % dim] += vec.features[i].entry;
+
+		if (use_quadratic)
+			hash_cache[i] = hash;
+
+		if ( (!use_quadratic) || keep_linear_terms )
+			h_vec[hash % dim] += vec.features[i].entry;
+	}
+
+	if (use_quadratic)
+	{
+		for (index_t i=0; i<vec.num_feat_entries; i++)
+		{
+			index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index;
+			index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof(index_t),
+					vec.features[i].feat_index) % dim;
+
+			h_vec[idx] += vec.features[i].entry * vec.features[i].entry;
+
+			for (index_t j=i+1; j<vec.num_feat_entries; j++)
+			{
+				idx = (hash_cache[i] ^ hash_cache[j]) % dim;
+				h_vec[idx] += vec.features[i].entry * vec.features[j].entry;
+			}
+		}
 	}
 
 	int32_t num_nnz_features = 0; 
@@ -162,13 +200,38 @@ float64_t CHashedSparseFeatures<ST>::dense_dot(int32_t vec_idx1, const float64_t
 
 	SGSparseVector<ST> vec = sparse_feats->get_sparse_feature_vector(vec_idx1);
 
+	int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0;
+	SGVector<uint32_t> hash_cache(hash_cache_size);
+
 	float64_t result = 0;
 	for (index_t i=0; i<vec.num_feat_entries; i++)
 	{
-		uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
+		uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
 					   vec.features[i].feat_index);
-		h_idx = h_idx % dim;
-		result += vec2[h_idx] * vec.features[i].entry;
+
+		if (use_quadratic)
+			hash_cache[i] = hash;
+
+		if ( (!use_quadratic) || keep_linear_terms)
+			result += vec2[hash % dim] * vec.features[i].entry;
+	}
+
+	if (use_quadratic)
+	{
+		for (index_t i=0; i<vec.num_feat_entries; i++)
+		{
+			index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index;
+			index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t),
+						vec.features[i].feat_index) % dim;
+
+			result += vec2[idx] * vec.features[i].entry * vec.features[i].entry;
+
+			for (index_t j=i+1; j<vec.num_feat_entries; j++)
+			{
+				idx = (hash_cache[i] ^ hash_cache[j]) % dim;
+				result += vec2[idx] * vec.features[i].entry * vec.features[j].entry;	
+			}
+		}
 	}
 
 	sparse_feats ->free_feature_vector(vec_idx1);
@@ -184,11 +247,36 @@ void CHashedSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t vec_id
 
 	SGSparseVector<ST> vec = sparse_feats->get_sparse_feature_vector(vec_idx1);
 
+	int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0;
+	SGVector<uint32_t> hash_cache(hash_cache_size);
+
 	for (index_t i=0; i<vec.num_feat_entries; i++)
 	{
-		uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
+		uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
 					   vec.features[i].feat_index);
-		vec2[h_idx%dim] += val * vec.features[i].entry;
+		if (use_quadratic)
+			hash_cache[i] = hash;
+
+		if ( (!use_quadratic) || keep_linear_terms)
+			vec2[hash % dim] += val * vec.features[i].entry;
+	}
+
+	if (use_quadratic)
+	{
+		for (index_t i=0; i<vec.num_feat_entries; i++)
+		{
+			index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index;
+			index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t),
+						vec.features[i].feat_index) % dim;
+
+			vec2[idx] += val * vec.features[i].entry * vec.features[i].entry;
+
+			for (index_t j=i+1; j<vec.num_feat_entries; j++)
+			{
+				idx = (hash_cache[i] ^ hash_cache[j]) % dim;
+				vec2[idx] += val * vec.features[i].entry * vec.features[j].entry;
+			}
+		}
 	}
 	sparse_feats ->free_feature_vector(vec_idx1);	
 }

diff --git a/src/shogun/features/HashedSparseFeatures.h b/src/shogun/features/HashedSparseFeatures.h
@@ -31,29 +31,40 @@ template <class ST> class CHashedSparseFeatures  : public CDotFeatures
 	/** constructor
 	 *
 	 * @param size cache size
+	 * @param use_quadr whether to use quadratic features or not
+	 * @param keep_lin_terms whether to maintain the linear terms in the computations
 	 */
-	CHashedSparseFeatures(int32_t size=0);
+	CHashedSparseFeatures(int32_t size=0, bool use_quadr = false, bool keep_lin_terms = true);
 
 	/** constructor
 	 *
 	 * @param feats	the sparse features to use as a base
 	 * @param d new feature space dimension
+	 * @param use_quadr whether to use quadratic features or not
+	 * @param keep_lin_terms whether to maintain the linear terms in the computations
 	 */
-	CHashedSparseFeatures(CSparseFeatures<ST>* feats, int32_t d);
+	CHashedSparseFeatures(CSparseFeatures<ST>* feats, int32_t d, bool use_quadr = false,
+			bool keep_lin_terms = true);
 
 	/** constructor
 	 *
 	 * @param matrix feature matrix
 	 * @param d new feature space dimension
+	 * @param use_quadr whether to use quadratic features or not
+	 * @param keep_lin_terms whether to maintain the linear terms in the computations
 	 */
-	CHashedSparseFeatures(SGSparseMatrix<ST> matrix, int32_t d);
+	CHashedSparseFeatures(SGSparseMatrix<ST> matrix, int32_t d, bool use_quadr = false,
+			bool keep_lin_terms = true);
 
 	/** constructor loading features from file
 	 *
 	 * @param loader File object via which to load data
 	 * @param d new feature space dimension
+	 * @param use_quadr whether to use quadratic features or not
+	 * @param keep_lin_terms whether to maintain the linear terms in the computations
 	 */
-	CHashedSparseFeatures(CFile* loader, int32_t d);
+	CHashedSparseFeatures(CFile* loader, int32_t d, bool use_quadr = false,
+			bool keep_lin_terms = true);
 
 	/** copy constructor */
 	CHashedSparseFeatures(const CHashedSparseFeatures & orig);
@@ -182,20 +193,26 @@ template <class ST> class CHashedSparseFeatures  : public CDotFeatures
 	 *
 	 * @param vec the vector to hash
 	 * @param dim the dimension of the new feature space
+	 * @param use_quadratic whether to use quadratic features or not
+	 * @param keep_linear_terms whether to maintain the linear terms in the computations
 	 * @return the hashed representation of the vector vec
 	 */
-	static SGSparseVector<ST> hash_vector(SGVector<ST> vec, int32_t dim);
+	static SGSparseVector<ST> hash_vector(SGVector<ST> vec, int32_t dim,
+		bool use_quadratic = false, bool keep_linear_terms = true);
 
 
 	/** Get the hashed representation of the given sparse vector
 	 *
 	 * @param vec the vector to hash
 	 * @param dim the dimension of the hashed target space
+	 * @param use_quadratic whether to use quadratic features or not
+	 * @param keep_linear_terms whether to maintain the linear terms in the computations
 	 * @return the hashed representation of the vector vec
 	 */
-	static SGSparseVector<ST> hash_vector(SGSparseVector<ST> vec, int32_t dim);
+	static SGSparseVector<ST> hash_vector(SGSparseVector<ST> vec, int32_t dim,
+		bool use_quadratic = false, bool keep_linear_terms = true);
 protected:
-	void init(CSparseFeatures<ST>* feats, int32_t d);
+	void init(CSparseFeatures<ST>* feats, int32_t d, bool use_quadr, bool keep_lin_terms);
 
 protected:
 
@@ -204,6 +221,12 @@ template <class ST> class CHashedSparseFeatures  : public CDotFeatures
 
 	/** new feature space dimension */
 	int32_t dim;
+
+	/** use quadratic features */
+	bool use_quadratic;
+
+	/** keep linear terms */
+	bool keep_linear_terms;
 };
 }
 

diff --git a/src/shogun/features/streaming/StreamingHashedDenseFeatures.cpp b/src/shogun/features/streaming/StreamingHashedDenseFeatures.cpp
@@ -17,19 +17,19 @@ namespace shogun
 template <class ST>
 CStreamingHashedDenseFeatures<ST>::CStreamingHashedDenseFeatures()
 {
-	init(NULL, false, 0, 0);	
+	init(NULL, false, 0, 0, false, true);
 }
 
 template <class ST>
 CStreamingHashedDenseFeatures<ST>::CStreamingHashedDenseFeatures(CStreamingFile* file,
-	bool is_labelled, int32_t size, int32_t d)
+	bool is_labelled, int32_t size, int32_t d, bool use_quadr, bool keep_lin_terms)
 {
-	init(file, is_labelled, size, d);
+	init(file, is_labelled, size, d, use_quadr, keep_lin_terms);
 }
 
 template <class ST>
 CStreamingHashedDenseFeatures<ST>::CStreamingHashedDenseFeatures(CDenseFeatures<ST>* dot_features,
-	int32_t d, float64_t* lab)
+	int32_t d, bool use_quadr, bool keep_lin_terms, float64_t* lab)
 {
 	ASSERT(dot_features);
 
@@ -38,7 +38,7 @@ CStreamingHashedDenseFeatures<ST>::CStreamingHashedDenseFeatures(CDenseFeatures<
 	bool is_labelled = (lab != NULL);
 	int32_t size = 1024;
 
-	init(file, is_labelled, size, d); 
+	init(file, is_labelled, size, d, use_quadr, keep_lin_terms); 
 
 	parser.set_free_vectors_on_destruct(false);
 	seekable=true;
@@ -51,9 +51,16 @@ CStreamingHashedDenseFeatures<ST>::~CStreamingHashedDenseFeatures()
 
 template <class ST>
 void CStreamingHashedDenseFeatures<ST>::init(CStreamingFile* file, bool is_labelled,
-	int32_t size, int32_t d)	
+	int32_t size, int32_t d, bool use_quadr, bool keep_lin_terms)
 {
 	dim = d;
+	use_quadratic = use_quadr;
+	keep_linear_terms = keep_lin_terms;
+
+	SG_ADD(&use_quadratic, "use_quadratic", "Whether to use quadratic features",
+		MS_NOT_AVAILABLE);
+	SG_ADD(&keep_linear_terms, "keep_linear_terms", "Whether to keep the linear terms or not",
+		MS_NOT_AVAILABLE);
 	SG_ADD(&dim, "dim", "Size of target dimension", MS_NOT_AVAILABLE);
 
 	has_labels = is_labelled;
@@ -183,7 +190,8 @@ bool CStreamingHashedDenseFeatures<ST>::get_next_example()
 	if (parser.get_next_example(tmp.vector,
 		tmp.vlen, current_label))
 	{
-		current_vector = CHashedDenseFeatures<ST>::hash_vector(tmp, dim);
+		current_vector = CHashedDenseFeatures<ST>::hash_vector(tmp, dim, use_quadratic,
+				keep_linear_terms);
 		tmp.vector = NULL;
 		tmp.vlen = -1;
 		return true;