Add support for SparseFeatures for KNN

The KNN_LSH solver (backed by FALCONN) can work on sparse points, so it now accepts SparseFeatures as well as DenseFeatures.
shogun-toolbox · May 23, 2018 · 6721b61 · vigsterkr · May 24, 2018 · 6721b61
1 parent 470b35c
commit 6721b61
Show file tree

Hide file tree

Showing 5 changed files with 168 additions and 173 deletions.
diff --git a/src/shogun/multiclass/KNN.cpp b/src/shogun/multiclass/KNN.cpp
@@ -1,8 +1,8 @@
 /*
  * This software is distributed under BSD 3-clause license (see LICENSE file).
  *
- * Authors: Soeren Sonnenburg, Fernando Iglesias, Giovanni De Toni, 
- *          Saurabh Mahindre, Sergey Lisitsyn, Weijie Lin, Heiko Strathmann, 
+ * Authors: Soeren Sonnenburg, Fernando Iglesias, Giovanni De Toni,
+ *          Saurabh Mahindre, Sergey Lisitsyn, Weijie Lin, Heiko Strathmann,
  *          Evgeniy Andreev, Viktor Gal, Bjoern Esser
  */
 
@@ -54,10 +54,8 @@ void CKNN::init()
 	m_leaf_size=1;
 	m_knn_solver=KNN_BRUTE;
 	solver=NULL;
-#ifdef HAVE_CXX11
 	m_lsh_l = 0;
 	m_lsh_t = 0;
-#endif
 
 	/* use the method classify_multiply_k to experiment with different values
 	 * of k */
@@ -340,13 +338,11 @@ void CKNN::init_solver(KNN_SOLVER knn_solver)
 		SG_GPL_ONLY
 #endif // USE_GPL_SHOGUN
 	}
-#ifdef HAVE_CXX11
 	case KNN_LSH:
 	{
 		solver = new CLSHKNNSolver(m_k, m_q, m_num_classes, m_min_label, m_train_labels, m_lsh_l, m_lsh_t);
 		SG_REF(solver);
 		break;
 	}
-#endif
 	}
 }
diff --git a/src/shogun/multiclass/KNN.h b/src/shogun/multiclass/KNN.h
@@ -1,8 +1,8 @@
 /*
  * This software is distributed under BSD 3-clause license (see LICENSE file).
  *
- * Authors: Fernando Iglesias, Soeren Sonnenburg, Saurabh Mahindre, 
- *          Sergey Lisitsyn, Heiko Strathmann, Evgeniy Andreev, Yuyu Zhang, 
+ * Authors: Fernando Iglesias, Soeren Sonnenburg, Saurabh Mahindre,
+ *          Sergey Lisitsyn, Heiko Strathmann, Evgeniy Andreev, Yuyu Zhang,
  *          Weijie Lin, Bjoern Esser, Saurabh Goyal
  */
 
@@ -32,9 +32,7 @@ namespace shogun
 		KNN_BRUTE,
 		KNN_KDTREE,
 		KNN_COVER_TREE,
-#ifdef HAVE_CXX11
 		KNN_LSH
-#endif
 	};
 
 class CDistanceMachine;
@@ -48,7 +46,7 @@ class CDistanceMachine;
  * \f[
  *		y_{x} = \arg \max_{l} \sum_{i=1}^{k} I[y_{i} = l],
  * \f]
- * 
+ *
  * where \f$y_{m}\f$ denotes the label of the \f$m^{th}\f$ example, and the
  * indicator function \f$I[a = b]\f$ equals 1 if a = b and zero otherwise.
  *
@@ -174,7 +172,7 @@ class CKNN : public CDistanceMachine
 		 */
 		inline int32_t get_leaf_size() const {return m_leaf_size; }
 
-		/** Set leaf size for KD-Tree 
+		/** Set leaf size for KD-Tree
 		 *	@param leaf_size
 		 */
 		inline void set_leaf_size(int32_t leaf_size)
@@ -202,7 +200,6 @@ class CKNN : public CDistanceMachine
 			m_knn_solver = knn_solver;
 		}
 
-#ifdef HAVE_CXX11
 		/** set parameters for LSH solver
 		  * @param l number of hash tables for LSH
 		  * @param t number of probes per query for LSH
@@ -212,7 +209,6 @@ class CKNN : public CDistanceMachine
 			m_lsh_l = l;
 			m_lsh_t = t;
 		}
-#endif
 
 	protected:
 		/** Stores feature data of underlying model.
@@ -272,7 +268,7 @@ class CKNN : public CDistanceMachine
 		 */
 		void choose_class_for_multiple_k(int32_t* output, int32_t* classes, int32_t* train_lab, int32_t step);
 
-		/** 
+		/**
 		 * To init the solver pointer indicated which solver will been used to classify_objects
 		 */
 		void init_solver(KNN_SOLVER knn_solver);
@@ -300,13 +296,11 @@ class CKNN : public CDistanceMachine
 
 		int32_t m_leaf_size;
 
-#ifdef HAVE_CXX11
 		/* Number of hash tables for LSH */
 		int32_t m_lsh_l;
 
 		/* Number of probes per query for LSH */
 		int32_t m_lsh_t;
-#endif
 };
 
 }

diff --git a/src/shogun/multiclass/LSHKNNSolver.cpp b/src/shogun/multiclass/LSHKNNSolver.cpp
@@ -1,72 +1,82 @@
 /* This software is distributed under BSD 3-clause license (see LICENSE file).
  *
- * Copyright (c) 2012-2013 Sergey Lisitsyn
+ * Copyright (c) 2012-2013 Sergey Lisitsyn, Viktor Gal
  */
 
 #include <shogun/multiclass/LSHKNNSolver.h>
 #include <shogun/mathematics/eigen3.h>
 #include <shogun/features/DenseFeatures.h>
+#include <shogun/features/SparseFeatures.h>
 #include <shogun/lib/Signal.h>
 
 using namespace shogun;
 using namespace Eigen;
 
-#ifdef HAVE_CXX11
 #include <shogun/lib/external/falconn/lsh_nn_table.h>
 
 CLSHKNNSolver::CLSHKNNSolver(const int32_t k, const float64_t q, const int32_t num_classes, const int32_t min_label, const SGVector<int32_t> train_labels, const int32_t lsh_l, const int32_t lsh_t):
 CKNNSolver(k, q, num_classes, min_label, train_labels)
 {
 	init();
 
-	m_lsh_l=lsh_l; 
+	m_lsh_l=lsh_l;
 	m_lsh_t=lsh_t;
 }
 
-CMulticlassLabels* CLSHKNNSolver::classify_objects(CDistance* knn_distance, const int32_t num_lab, SGVector<int32_t>& train_lab, SGVector<float64_t>& classes) const
+template<typename PointType, typename FeatureType>
+PointType get_falconn_point(FeatureType* f, index_t i); // primary template, declaration only: turns vector i of f into a FALCONN point of type PointType; the bodies are the explicit specialisations that follow
+
+template<>
+falconn::DenseVector<double> get_falconn_point(CDenseFeatures<float64_t>* f, index_t i) // dense specialisation: returns feature vector i of f as a FALCONN dense point
 {
-	CMulticlassLabels* output=new CMulticlassLabels(num_lab);
-	CDenseFeatures<float64_t>* features = dynamic_cast<CDenseFeatures<float64_t>*>(knn_distance->get_lhs());
-	std::vector<falconn::DenseVector<double>> feats;
-	for(int32_t i=0; i < features->get_num_vectors(); i++)
-	{
-		int32_t len;
-		bool free;
-		float64_t* vec = features->get_feature_vector(i, len, free);
-		falconn::DenseVector<double> temp = Map<VectorXd> (vec, len);
-		feats.push_back(temp);
-	}
+	index_t len; // filled in by get_feature_vector with the vector length
+	bool free; // filled in by get_feature_vector; NOTE(review): never passed to a matching release call in this function -- confirm that is safe
+	float64_t* vec = f->get_feature_vector(i, len, free);
+	return Map<VectorXd>(vec, len); // NOTE(review): presumably the conversion to falconn::DenseVector<double> copies the mapped data -- confirm
+}
 
-	falconn::LSHConstructionParameters params 
-		= falconn::get_default_parameters<falconn::DenseVector<double>>(features->get_num_vectors(),
-                           features->get_num_features(),
+template<>
+falconn::SparseVector<double> get_falconn_point(CSparseFeatures<float64_t>* f, index_t i) // sparse specialisation: returns vector i of f as a list of (feat_index, entry) pairs
+{
+	// FIXME: every stored entry is copied into a freshly built falconn::SparseVector; no zero-copy mapping is used here
+	auto fv = f->get_sparse_feature_vector(i); // NOTE(review): fv is not handed back through any release call here -- confirm whether the feature object requires one
+	falconn::SparseVector<double> mapped(fv.num_feat_entries); // one slot per stored entry
+	for (index_t j = 0; j < fv.num_feat_entries; ++j)
+		mapped[j] = std::make_pair(fv.features[j].feat_index, fv.features[j].entry); // NOTE(review): presumably FALCONN expects ascending indices -- confirm fv is sorted
+	return mapped;
+}
+
+template<typename PointType, typename FeatureType>
+CMulticlassLabels* CLSHKNNSolver::classify_objects(FeatureType* lhs, FeatureType* query_features, const int32_t num_lab, SGVector<int32_t>& train_lab, SGVector<float64_t>& classes) const
+{
+	auto output = new CMulticlassLabels(num_lab);
+	std::vector<PointType> feats(lhs->get_num_vectors());
+	for(index_t i = 0; i < lhs->get_num_vectors(); ++i)
+		feats[i] = get_falconn_point<PointType>(lhs, i);
+
+	falconn::LSHConstructionParameters params
+		= falconn::get_default_parameters<PointType>(lhs->get_num_vectors(),
+                           lhs->get_num_features(),
                            falconn::DistanceFunction::EuclideanSquared,
                            true);
-	SG_UNREF(features);
+	SG_UNREF(lhs);
 	if (m_lsh_l && m_lsh_t)
 		params.l = m_lsh_l;
 
-	auto lsh_table = falconn::construct_table<falconn::DenseVector<double>>(feats, params);
+	auto lsh_table = falconn::construct_table<PointType>(feats, params);
 	if (m_lsh_t)
 		lsh_table->set_num_probes(m_lsh_t);
 
-	CDenseFeatures<float64_t>* query_features = dynamic_cast<CDenseFeatures<float64_t>*>(knn_distance->get_rhs());
-	std::vector<falconn::DenseVector<double>> query_feats;
-
 	SGMatrix<index_t> NN (m_k, query_features->get_num_vectors());
-	for(index_t i=0; i < query_features->get_num_vectors(); i++)
+	for(index_t i = 0; i < query_features->get_num_vectors(); ++i)
 	{
-		int32_t len;
-		bool free;
-		float64_t* vec = query_features->get_feature_vector(i, len, free);
-		falconn::DenseVector<double> temp = Map<VectorXd> (vec, len);
-		auto indices = new std::vector<int32_t> ();
-		lsh_table->find_k_nearest_neighbors(temp, (int_fast64_t)m_k, indices);
-		sg_memcpy(NN.get_column_vector(i), indices->data(), sizeof(int32_t)*m_k);
+		auto indices = new std::vector<index_t> ();
+		lsh_table->find_k_nearest_neighbors(get_falconn_point<PointType>(query_features, i), (int_fast64_t)m_k, indices);
+		sg_memcpy(NN.get_column_vector(i), indices->data(), sizeof(index_t)*m_k);
 		delete indices;
 	}
 
-	for (index_t i = 0; i < num_lab && (!cancel_computation()); i++)
+	for (index_t i = 0; i < num_lab && (!cancel_computation()); ++i)
 	{
 		//write the labels of the k nearest neighbors from theirs indices
 		for (index_t j=0; j<m_k; j++)
@@ -82,9 +92,30 @@ CMulticlassLabels* CLSHKNNSolver::classify_objects(CDistance* knn_distance, cons
 	return output;
 }
 
+CMulticlassLabels* CLSHKNNSolver::classify_objects(CDistance* knn_distance, const int32_t num_lab, SGVector<int32_t>& train_lab, SGVector<float64_t>& classes) const
+{ // Entry point: forwards to the templated classify_objects overload that matches the feature class of the distance's lhs and rhs.
+	auto lhs = knn_distance->get_lhs(); // passed on as the lhs argument; the templated overload calls SG_UNREF on it
+	auto rhs = knn_distance->get_rhs(); // passed on as query_features; NOTE(review): no SG_UNREF is visible for this pointer -- confirm whether get_rhs() adds a reference that must be released
+	if ((lhs->get_feature_class() == C_DENSE) && (rhs->get_feature_class() == C_DENSE)) // both sides must be dense; mixed dense/sparse falls through to the error branch
+	{
+		auto features = lhs->as<CDenseFeatures<float64_t>>();
+		auto query_features = rhs->as<CDenseFeatures<float64_t>>();
+		return classify_objects<falconn::DenseVector<double>>(features, query_features, num_lab, train_lab, classes); // only PointType is spelled out; FeatureType is deduced from the arguments
+	}
+	else if ((lhs->get_feature_class() == C_SPARSE) && (rhs->get_feature_class() == C_SPARSE)) // both sides must be sparse
+	{
+		auto features = lhs->as<CSparseFeatures<float64_t>>();
+		auto query_features = rhs->as<CSparseFeatures<float64_t>>();
+		return classify_objects<falconn::SparseVector<double>>(features, query_features, num_lab, train_lab, classes);
+	}
+	else
+	{
+		SG_ERROR("Unsupported feature type!") // NOTE(review): lhs/rhs are not released on this path and no return follows -- presumably SG_ERROR throws; confirm
+	}
+}
+
 SGVector<int32_t> CLSHKNNSolver::classify_objects_k(CDistance* d, const int32_t num_lab, SGVector<int32_t>& train_lab, SGVector<int32_t>& classes) const
 {
 	SG_NOTIMPLEMENTED
 	return 0;
 }
-#endif
diff --git a/src/shogun/multiclass/LSHKNNSolver.h b/src/shogun/multiclass/LSHKNNSolver.h
@@ -27,7 +27,7 @@ class CLSHKNNSolver : public CKNNSolver
 		/** default constructor */
 		CLSHKNNSolver() : CKNNSolver()
 		{
-			init(); 
+			init();
 		}
 
 		/** deconstructor */
@@ -55,10 +55,13 @@ class CLSHKNNSolver : public CKNNSolver
 	private:
 		void init()
 		{
-			m_lsh_l=0; 
+			m_lsh_l=0;
 			m_lsh_t=0;
 		}
 
+		template<typename PointType, typename FeatureType>
+		CMulticlassLabels* classify_objects(FeatureType* lhs, FeatureType* query_features, const int32_t num_lab, SGVector<int32_t>& train_lab, SGVector<float64_t>& classes) const; // private typed worker; the classify_objects(CDistance*, ...) overload in LSHKNNSolver.cpp calls it with PointType given explicitly
+
 	protected:
 		/* Number of hash tables for LSH */
 		int32_t m_lsh_l;