Skip to content

Commit

Permalink
Add support for SparseFeatures for KNN
Browse files Browse the repository at this point in the history
as KNN_LSH actually supports SparseFeatures
  • Loading branch information
vigsterkr committed May 23, 2018
1 parent 470b35c commit 6721b61
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 173 deletions.
8 changes: 2 additions & 6 deletions src/shogun/multiclass/KNN.cpp
@@ -1,8 +1,8 @@
/*
* This software is distributed under BSD 3-clause license (see LICENSE file).
*
* Authors: Soeren Sonnenburg, Fernando Iglesias, Giovanni De Toni,
* Saurabh Mahindre, Sergey Lisitsyn, Weijie Lin, Heiko Strathmann,
* Authors: Soeren Sonnenburg, Fernando Iglesias, Giovanni De Toni,
* Saurabh Mahindre, Sergey Lisitsyn, Weijie Lin, Heiko Strathmann,
* Evgeniy Andreev, Viktor Gal, Bjoern Esser
*/

Expand Down Expand Up @@ -54,10 +54,8 @@ void CKNN::init()
m_leaf_size=1;
m_knn_solver=KNN_BRUTE;
solver=NULL;
#ifdef HAVE_CXX11
m_lsh_l = 0;
m_lsh_t = 0;
#endif

/* use the method classify_multiply_k to experiment with different values
* of k */
Expand Down Expand Up @@ -340,13 +338,11 @@ void CKNN::init_solver(KNN_SOLVER knn_solver)
SG_GPL_ONLY
#endif // USE_GPL_SHOGUN
}
#ifdef HAVE_CXX11
case KNN_LSH:
{
solver = new CLSHKNNSolver(m_k, m_q, m_num_classes, m_min_label, m_train_labels, m_lsh_l, m_lsh_t);
SG_REF(solver);
break;
}
#endif
}
}
16 changes: 5 additions & 11 deletions src/shogun/multiclass/KNN.h
@@ -1,8 +1,8 @@
/*
* This software is distributed under BSD 3-clause license (see LICENSE file).
*
* Authors: Fernando Iglesias, Soeren Sonnenburg, Saurabh Mahindre,
* Sergey Lisitsyn, Heiko Strathmann, Evgeniy Andreev, Yuyu Zhang,
* Authors: Fernando Iglesias, Soeren Sonnenburg, Saurabh Mahindre,
* Sergey Lisitsyn, Heiko Strathmann, Evgeniy Andreev, Yuyu Zhang,
* Weijie Lin, Bjoern Esser, Saurabh Goyal
*/

Expand Down Expand Up @@ -32,9 +32,7 @@ namespace shogun
KNN_BRUTE,
KNN_KDTREE,
KNN_COVER_TREE,
#ifdef HAVE_CXX11
KNN_LSH
#endif
};

class CDistanceMachine;
Expand All @@ -48,7 +46,7 @@ class CDistanceMachine;
* \f[
* y_{x} = \arg \max_{l} \sum_{i=1}^{k} I[y_{i} = l],
* \f]
*
*
* where \f$y_{m}\f$ denotes the label of the \f$m^{th}\f$ example, and the
* indicator function \f$I[a = b]\f$ equals 1 if a = b and zero otherwise.
*
Expand Down Expand Up @@ -174,7 +172,7 @@ class CKNN : public CDistanceMachine
*/
inline int32_t get_leaf_size() const {return m_leaf_size; }

/** Set leaf size for KD-Tree
/** Set leaf size for KD-Tree
* @param leaf_size
*/
inline void set_leaf_size(int32_t leaf_size)
Expand Down Expand Up @@ -202,7 +200,6 @@ class CKNN : public CDistanceMachine
m_knn_solver = knn_solver;
}

#ifdef HAVE_CXX11
/** set parameters for LSH solver
* @param l number of hash tables for LSH
* @param t number of probes per query for LSH
Expand All @@ -212,7 +209,6 @@ class CKNN : public CDistanceMachine
m_lsh_l = l;
m_lsh_t = t;
}
#endif

protected:
/** Stores feature data of underlying model.
Expand Down Expand Up @@ -272,7 +268,7 @@ class CKNN : public CDistanceMachine
*/
void choose_class_for_multiple_k(int32_t* output, int32_t* classes, int32_t* train_lab, int32_t step);

/**
/**
* Initializes the solver pointer according to the solver that will be used by classify_objects
*/
void init_solver(KNN_SOLVER knn_solver);
Expand Down Expand Up @@ -300,13 +296,11 @@ class CKNN : public CDistanceMachine

int32_t m_leaf_size;

#ifdef HAVE_CXX11
/* Number of hash tables for LSH */
int32_t m_lsh_l;

/* Number of probes per query for LSH */
int32_t m_lsh_t;
#endif
};

}
Expand Down
97 changes: 64 additions & 33 deletions src/shogun/multiclass/LSHKNNSolver.cpp
@@ -1,72 +1,82 @@
/* This software is distributed under BSD 3-clause license (see LICENSE file).
*
* Copyright (c) 2012-2013 Sergey Lisitsyn
* Copyright (c) 2012-2013 Sergey Lisitsyn, Viktor Gal
*/

#include <shogun/multiclass/LSHKNNSolver.h>
#include <shogun/mathematics/eigen3.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/features/SparseFeatures.h>
#include <shogun/lib/Signal.h>

using namespace shogun;
using namespace Eigen;

#ifdef HAVE_CXX11
#include <shogun/lib/external/falconn/lsh_nn_table.h>

/** Construct an LSH-backed KNN solver.
 *
 * The first five arguments are forwarded unchanged to the CKNNSolver base.
 *
 * @param k forwarded to CKNNSolver
 * @param q forwarded to CKNNSolver
 * @param num_classes forwarded to CKNNSolver
 * @param min_label forwarded to CKNNSolver
 * @param train_labels forwarded to CKNNSolver
 * @param lsh_l number of hash tables for LSH
 * @param lsh_t number of probes per query for LSH
 */
CLSHKNNSolver::CLSHKNNSolver(
	const int32_t k, const float64_t q, const int32_t num_classes,
	const int32_t min_label, const SGVector<int32_t> train_labels,
	const int32_t lsh_l, const int32_t lsh_t)
	: CKNNSolver(k, q, num_classes, min_label, train_labels)
{
	// init() zeroes both LSH parameters, so the caller's values are applied afterwards.
	init();

	m_lsh_t = lsh_t;
	m_lsh_l = lsh_l;
}

CMulticlassLabels* CLSHKNNSolver::classify_objects(CDistance* knn_distance, const int32_t num_lab, SGVector<int32_t>& train_lab, SGVector<float64_t>& classes) const
/**
 * Convert the i-th vector stored in a feature object into a FALCONN point.
 *
 * Only the declaration lives here; the conversions themselves are supplied
 * by explicit specializations for the supported feature classes.
 *
 * @param f feature object to read the vector from
 * @param i index of the vector to convert
 * @return the vector represented as PointType
 */
template<typename PointType, typename FeatureType>
PointType get_falconn_point(FeatureType* f, index_t i);

template<>
falconn::DenseVector<double> get_falconn_point(CDenseFeatures<float64_t>* f, index_t i)
{
CMulticlassLabels* output=new CMulticlassLabels(num_lab);
CDenseFeatures<float64_t>* features = dynamic_cast<CDenseFeatures<float64_t>*>(knn_distance->get_lhs());
std::vector<falconn::DenseVector<double>> feats;
for(int32_t i=0; i < features->get_num_vectors(); i++)
{
int32_t len;
bool free;
float64_t* vec = features->get_feature_vector(i, len, free);
falconn::DenseVector<double> temp = Map<VectorXd> (vec, len);
feats.push_back(temp);
}
index_t len;
bool free;
float64_t* vec = f->get_feature_vector(i, len, free);
return Map<VectorXd>(vec, len);
}

falconn::LSHConstructionParameters params
= falconn::get_default_parameters<falconn::DenseVector<double>>(features->get_num_vectors(),
features->get_num_features(),
/**
 * Sparse specialization: builds a FALCONN sparse point from the i-th sparse
 * feature vector, emitting one (feature index, value) pair per stored entry.
 *
 * @param f sparse feature object to read from
 * @param i index of the vector to convert
 * @return newly built sparse point holding a copy of the entries
 */
template<>
falconn::SparseVector<double> get_falconn_point(CSparseFeatures<float64_t>* f, index_t i)
{
	// FIXME: this basically copies the data :(
	auto sparse_vec = f->get_sparse_feature_vector(i);

	falconn::SparseVector<double> point;
	point.reserve(sparse_vec.num_feat_entries);
	for (index_t idx = 0; idx < sparse_vec.num_feat_entries; ++idx)
	{
		const auto& element = sparse_vec.features[idx];
		point.emplace_back(element.feat_index, element.entry);
	}
	return point;
}

template<typename PointType, typename FeatureType>
CMulticlassLabels* CLSHKNNSolver::classify_objects(FeatureType* lhs, FeatureType* query_features, const int32_t num_lab, SGVector<int32_t>& train_lab, SGVector<float64_t>& classes) const
{
auto output = new CMulticlassLabels(num_lab);
std::vector<PointType> feats(lhs->get_num_vectors());
for(index_t i = 0; i < lhs->get_num_vectors(); ++i)
feats[i] = get_falconn_point<PointType>(lhs, i);

falconn::LSHConstructionParameters params
= falconn::get_default_parameters<PointType>(lhs->get_num_vectors(),
lhs->get_num_features(),
falconn::DistanceFunction::EuclideanSquared,
true);
SG_UNREF(features);
SG_UNREF(lhs);
if (m_lsh_l && m_lsh_t)
params.l = m_lsh_l;

auto lsh_table = falconn::construct_table<falconn::DenseVector<double>>(feats, params);
auto lsh_table = falconn::construct_table<PointType>(feats, params);
if (m_lsh_t)
lsh_table->set_num_probes(m_lsh_t);

CDenseFeatures<float64_t>* query_features = dynamic_cast<CDenseFeatures<float64_t>*>(knn_distance->get_rhs());
std::vector<falconn::DenseVector<double>> query_feats;

SGMatrix<index_t> NN (m_k, query_features->get_num_vectors());
for(index_t i=0; i < query_features->get_num_vectors(); i++)
for(index_t i = 0; i < query_features->get_num_vectors(); ++i)
{
int32_t len;
bool free;
float64_t* vec = query_features->get_feature_vector(i, len, free);
falconn::DenseVector<double> temp = Map<VectorXd> (vec, len);
auto indices = new std::vector<int32_t> ();
lsh_table->find_k_nearest_neighbors(temp, (int_fast64_t)m_k, indices);
sg_memcpy(NN.get_column_vector(i), indices->data(), sizeof(int32_t)*m_k);
auto indices = new std::vector<index_t> ();
lsh_table->find_k_nearest_neighbors(get_falconn_point<PointType>(query_features, i), (int_fast64_t)m_k, indices);
sg_memcpy(NN.get_column_vector(i), indices->data(), sizeof(index_t)*m_k);
delete indices;
}

for (index_t i = 0; i < num_lab && (!cancel_computation()); i++)
for (index_t i = 0; i < num_lab && (!cancel_computation()); ++i)
{
//write the labels of the k nearest neighbors from their indices
for (index_t j=0; j<m_k; j++)
Expand All @@ -82,9 +92,30 @@ CMulticlassLabels* CLSHKNNSolver::classify_objects(CDistance* knn_distance, cons
return output;
}

CMulticlassLabels* CLSHKNNSolver::classify_objects(CDistance* knn_distance, const int32_t num_lab, SGVector<int32_t>& train_lab, SGVector<float64_t>& classes) const
{
auto lhs = knn_distance->get_lhs();
auto rhs = knn_distance->get_rhs();
if ((lhs->get_feature_class() == C_DENSE) && (rhs->get_feature_class() == C_DENSE))

This comment has been minimized.

Copy link
@vigsterkr

vigsterkr May 24, 2018

Author Member

@lisitsyn any idea how to create a nice function/template/whatever so that we don't have to copy-paste this logic into each and every model that supports both dense and sparse features? :)

{
auto features = lhs->as<CDenseFeatures<float64_t>>();
auto query_features = rhs->as<CDenseFeatures<float64_t>>();
return classify_objects<falconn::DenseVector<double>>(features, query_features, num_lab, train_lab, classes);
}
else if ((lhs->get_feature_class() == C_SPARSE) && (rhs->get_feature_class() == C_SPARSE))
{
auto features = lhs->as<CSparseFeatures<float64_t>>();
auto query_features = rhs->as<CSparseFeatures<float64_t>>();
return classify_objects<falconn::SparseVector<double>>(features, query_features, num_lab, train_lab, classes);
}
else
{
SG_ERROR("Unsupported feature type!")
}
}

/**
 * Not implemented for the LSH solver: the body only invokes
 * SG_NOTIMPLEMENTED and none of the arguments are used.
 *
 * NOTE(review): the trailing `return 0;` presumably exists only to give the
 * function a return path for the compiler — confirm that SG_NOTIMPLEMENTED
 * never returns normally.
 */
SGVector<int32_t> CLSHKNNSolver::classify_objects_k(CDistance* d, const int32_t num_lab, SGVector<int32_t>& train_lab, SGVector<int32_t>& classes) const
{
SG_NOTIMPLEMENTED
return 0;
}
#endif
7 changes: 5 additions & 2 deletions src/shogun/multiclass/LSHKNNSolver.h
Expand Up @@ -27,7 +27,7 @@ class CLSHKNNSolver : public CKNNSolver
/** Default constructor; delegates to the CKNNSolver default constructor and
 * resets the LSH parameters through init().
 */
CLSHKNNSolver()
	: CKNNSolver()
{
	init();
}

/** destructor */
Expand Down Expand Up @@ -55,10 +55,13 @@ class CLSHKNNSolver : public CKNNSolver
private:
/** Reset both LSH parameters (hash table count and probe count) to zero. */
void init()
{
	m_lsh_t = 0;
	m_lsh_l = 0;
}

/**
 * Feature-type-generic implementation behind the public classify_objects.
 *
 * @tparam PointType FALCONN point type the feature vectors are converted to
 * @tparam FeatureType concrete feature class of both lhs and query_features
 * @param lhs features the LSH table is built from
 * @param query_features features whose nearest neighbours are looked up
 * @param num_lab number of labels in the returned CMulticlassLabels
 * @param train_lab NOTE(review): presumably scratch space for neighbour labels — confirm against the definition
 * @param classes NOTE(review): presumably per-class working storage — confirm against the definition
 * @return newly allocated labels object
 */
template<typename PointType, typename FeatureType>
CMulticlassLabels* classify_objects(FeatureType* lhs, FeatureType* query_features, const int32_t num_lab, SGVector<int32_t>& train_lab, SGVector<float64_t>& classes) const;

protected:
/* Number of hash tables for LSH */
int32_t m_lsh_l;
Expand Down

0 comments on commit 6721b61

Please sign in to comment.