Skip to content

Commit

Permalink
Add additional check on user input. Closes #3975
Browse files Browse the repository at this point in the history
Add input check and assertion in LMNN regarding k used in KNN and
the number of examples per class.
  • Loading branch information
iglesias committed Apr 20, 2018
1 parent e662cff commit b63e281
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 14 deletions.
5 changes: 3 additions & 2 deletions src/shogun/metric/LMNN.cpp
@@ -1,7 +1,8 @@
/*
* This software is distributed under BSD 3-clause license (see LICENSE file).
*
* Authors: Fernando Iglesias, Heiko Strathmann, Giovanni De Toni, Viktor Gal
* Authors: Fernando Iglesias, Heiko Strathmann, Giovanni De Toni, Viktor Gal,
* Wuwei Lin
*/

#include <shogun/metric/LMNN.h>
Expand Down Expand Up @@ -53,7 +54,7 @@ void CLMNN::train(SGMatrix<float64_t> init_transform)
SG_DEBUG("Entering CLMNN::train().\n")

// Check training data and arguments, initializing, if necessary, init_transform
CLMNNImpl::check_training_setup(m_features, m_labels, init_transform);
CLMNNImpl::check_training_setup(m_features, m_labels, init_transform, m_k);

// Initializations

Expand Down
49 changes: 47 additions & 2 deletions src/shogun/metric/LMNNImpl.cpp
Expand Up @@ -6,7 +6,10 @@

#include <shogun/metric/LMNNImpl.h>

#include <algorithm>
#include <iterator>
#include <unordered_map>

#include <shogun/mathematics/linalg/LinalgNamespace.h>
#include <shogun/multiclass/KNN.h>
#include <shogun/preprocessor/PCA.h>
Expand All @@ -32,8 +35,9 @@ bool CImpostorNode::operator<(const CImpostorNode& rhs) const
return example < rhs.example;
}

void CLMNNImpl::check_training_setup(CFeatures* features, const CLabels* labels,
SGMatrix<float64_t>& init_transform)
void CLMNNImpl::check_training_setup(
CFeatures* features, CLabels* labels, SGMatrix<float64_t>& init_transform,
int32_t k)
{
REQUIRE(features->has_property(FP_DOT),
"LMNN can only be applied to features that support dot products\n")
Expand All @@ -56,6 +60,47 @@ void CLMNNImpl::check_training_setup(CFeatures* features, const CLabels* labels,
init_transform.num_rows==init_transform.num_cols,
"The initial transform must be a square matrix of size equal to the "
"number of features\n")

check_maximum_k(labels, k);
}

void CLMNNImpl::check_maximum_k(CLabels* labels, int32_t k)
{
	CMulticlassLabels* y = CLabelsFactory::to_multiclass(labels);
	SGVector<int32_t> int_labels = y->get_int_labels();

	REQUIRE(
	    int_labels.vlen > 0,
	    "It is not possible to check the value of k with no training "
	    "examples.\n")

	// Histogram of examples per class, built in a single pass. This
	// avoids the copy-backup / sort / unique / index-map round trip and
	// leaves the label vector untouched.
	std::unordered_map<int32_t, int32_t> class_counts;
	for (auto int_label : int_labels)
		++class_counts[int_label];

	// The smallest class determines the largest admissible k.
	auto smallest_class = std::min_element(
	    class_counts.begin(), class_counts.end(),
	    [](const std::pair<const int32_t, int32_t>& a,
	       const std::pair<const int32_t, int32_t>& b) {
		    return a.second < b.second;
	    });
	int32_t min_num_examples = smallest_class->second;

	REQUIRE(
	    min_num_examples > k,
	    "The minimum number of examples of any class (%d) must be larger "
	    "than k (%d); it must be at least k+1 because any example needs "
	    "k *other* neighbors of the same class.\n",
	    min_num_examples, k)
}

SGMatrix<index_t> CLMNNImpl::find_target_nn(CDenseFeatures<float64_t>* x,
Expand Down
11 changes: 10 additions & 1 deletion src/shogun/metric/LMNNImpl.h
Expand Up @@ -78,7 +78,9 @@ class CLMNNImpl
* check feature and label size, dimensions of the initial transform, etc
* if the initial transform has not been initialized, do it using PCA
*/
static void check_training_setup(CFeatures* features, const CLabels* labels, SGMatrix<float64_t>& init_transform);
static void check_training_setup(
CFeatures* features, CLabels* labels,
SGMatrix<float64_t>& init_transform, int32_t k);

/**
* for each feature in x, find its target neighbors; that is, its k
Expand Down Expand Up @@ -160,6 +162,13 @@ class CLMNNImpl
*/
static CEuclideanDistance* setup_distance(CDenseFeatures<float64_t>* x, std::vector<index_t>& a, std::vector<index_t>& b);

/**
* check that k is less than the minimum number of examples in any
* class.
* k must be less than the number of examples in any class because each
* example needs k other examples (the nearest ones) of the same class
*/
static void check_maximum_k(CLabels* labels, int32_t k);

}; /* class CLMNNImpl */

Expand Down
5 changes: 5 additions & 0 deletions src/shogun/multiclass/KNN.cpp
Expand Up @@ -110,6 +110,11 @@ SGMatrix<index_t> CKNN::nearest_neighbors()
{
//number of examples to which kNN is applied
int32_t n=distance->get_num_vec_rhs();

REQUIRE(
n >= m_k,
"K (%d) must not be larger than the number of examples (%d).\n", m_k, n)

//distances to train data
SGVector<float64_t> dists(m_train_labels.vlen);
//indices to train data
Expand Down
21 changes: 12 additions & 9 deletions src/shogun/preprocessor/PCA.cpp
Expand Up @@ -85,7 +85,8 @@ bool CPCA::init(CFeatures* features)
->get_feature_matrix();
int32_t num_vectors = feature_matrix.num_cols;
int32_t num_features = feature_matrix.num_rows;
SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features)
SG_INFO(
"num_examples: %d num_features: %d\n", num_vectors, num_features)

// max target dim allowed
int32_t max_dim_allowed = CMath::min(num_vectors, num_features);
Expand Down Expand Up @@ -134,7 +135,7 @@ void CPCA::init_with_evd(const SGMatrix<float64_t>& feature_matrix, int32_t max_
cov_mat = fmatrix*fmatrix.transpose();
cov_mat /= (num_vectors-1);

SG_INFO("Computing Eigenvalues ... ")
SG_INFO("Computing Eigenvalues\n")
// eigen value computed
SelfAdjointEigenSolver<MatrixXd> eigenSolve =
SelfAdjointEigenSolver<MatrixXd>(cov_mat);
Expand Down Expand Up @@ -171,7 +172,7 @@ void CPCA::init_with_evd(const SGMatrix<float64_t>& feature_matrix, int32_t max_
}
break;
};
SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim)
SG_INFO("Reducing from %i to %i features\n", num_features, num_dim)

m_transformation_matrix = SGMatrix<float64_t>(num_features,num_dim);
Map<MatrixXd> transformMatrix(m_transformation_matrix.matrix,
Expand All @@ -188,10 +189,12 @@ void CPCA::init_with_evd(const SGMatrix<float64_t>& feature_matrix, int32_t max_
if (CMath::fequals_abs<float64_t>(0.0, eigenValues[i+max_dim_allowed-num_dim],
m_eigenvalue_zero_tolerance))
{
SG_WARNING("Covariance matrix has almost zero Eigenvalue (ie "
"Eigenvalue within a tolerance of %E around 0) at "
"dimension %d. Consider reducing its dimension.",
m_eigenvalue_zero_tolerance, i+max_dim_allowed-num_dim+1)
SG_WARNING(
"Covariance matrix has almost zero Eigenvalue (ie "
"Eigenvalue within a tolerance of %E around 0) at "
"dimension %d. Consider reducing its dimension.\n",
m_eigenvalue_zero_tolerance,
i + max_dim_allowed - num_dim + 1)

transformMatrix.col(i) = MatrixXd::Zero(num_features,1);
continue;
Expand Down Expand Up @@ -246,7 +249,7 @@ void CPCA::init_with_svd(const SGMatrix<float64_t> &feature_matrix, int32_t max_
}
break;
};
SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim)
SG_INFO("Reducing from %i to %i features...\n", num_features, num_dim)

// right singular vectors form eigenvectors
m_transformation_matrix = SGMatrix<float64_t>(num_features, num_dim);
Expand Down Expand Up @@ -308,7 +311,7 @@ SGMatrix<float64_t> CPCA::apply_to_feature_matrix(CFeatures* features)
feature_matrix.block(0,0,num_dim,num_vectors) =
transform_matrix.transpose()*feature_matrix;

SG_INFO("Form matrix of target dimension")
SG_INFO("Form matrix of target dimension\n")
for (int32_t col=0; col<num_vectors; col++)
{
for (int32_t row=0; row<num_dim; row++)
Expand Down

0 comments on commit b63e281

Please sign in to comment.