Skip to content

Commit

Permalink
Add additional check on user input. Closes #3975
Browse files Browse the repository at this point in the history
Add input check and assertion in LMNN regarding k used in KNN and
the number of examples per class.
  • Loading branch information
iglesias committed Apr 20, 2018
1 parent e662cff commit b63e281
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 14 deletions.
5 changes: 3 additions & 2 deletions src/shogun/metric/LMNN.cpp
@@ -1,7 +1,8 @@
/*
* This software is distributed under BSD 3-clause license (see LICENSE file).
*
* Authors: Fernando Iglesias, Heiko Strathmann, Giovanni De Toni, Viktor Gal
* Authors: Fernando Iglesias, Heiko Strathmann, Giovanni De Toni, Viktor Gal,
* Wuwei Lin
*/

#include <shogun/metric/LMNN.h>
Expand Down Expand Up @@ -53,7 +54,7 @@ void CLMNN::train(SGMatrix<float64_t> init_transform)
SG_DEBUG("Entering CLMNN::train().\n")

// Check training data and arguments, initializing, if necessary, init_transform
CLMNNImpl::check_training_setup(m_features, m_labels, init_transform);
CLMNNImpl::check_training_setup(m_features, m_labels, init_transform, m_k);

// Initializations

Expand Down
49 changes: 47 additions & 2 deletions src/shogun/metric/LMNNImpl.cpp
Expand Up @@ -6,7 +6,10 @@

#include <shogun/metric/LMNNImpl.h>

#include <algorithm>
#include <iterator>
#include <unordered_map>

#include <shogun/mathematics/linalg/LinalgNamespace.h>
#include <shogun/multiclass/KNN.h>
#include <shogun/preprocessor/PCA.h>
Expand All @@ -32,8 +35,9 @@ bool CImpostorNode::operator<(const CImpostorNode& rhs) const
return example < rhs.example;
}

void CLMNNImpl::check_training_setup(CFeatures* features, const CLabels* labels,
SGMatrix<float64_t>& init_transform)
void CLMNNImpl::check_training_setup(
CFeatures* features, CLabels* labels, SGMatrix<float64_t>& init_transform,
int32_t k)
{
REQUIRE(features->has_property(FP_DOT),
"LMNN can only be applied to features that support dot products\n")
Expand All @@ -56,6 +60,47 @@ void CLMNNImpl::check_training_setup(CFeatures* features, const CLabels* labels,
init_transform.num_rows==init_transform.num_cols,
"The initial transform must be a square matrix of size equal to the "
"number of features\n")

check_maximum_k(labels, k);
}

void CLMNNImpl::check_maximum_k(CLabels* labels, int32_t k)
{
	CMulticlassLabels* y = CLabelsFactory::to_multiclass(labels);
	SGVector<int32_t> int_labels = y->get_int_labels();

	REQUIRE(
	    int_labels.vlen > 0,
	    "It is not possible to check the value of k with no training "
	    "examples.\n")

	// Histogram of examples per class, built in a single pass. This
	// avoids the copy-backup / sort / unique / index-map round trip and
	// leaves the label vector untouched.
	std::unordered_map<int32_t, int32_t> class_counts;
	for (auto int_label : int_labels)
		++class_counts[int_label];

	// The smallest class determines the largest admissible k.
	auto smallest_class = std::min_element(
	    class_counts.begin(), class_counts.end(),
	    [](const std::pair<const int32_t, int32_t>& a,
	       const std::pair<const int32_t, int32_t>& b) {
		    return a.second < b.second;
	    });
	int32_t min_num_examples = smallest_class->second;

	REQUIRE(
	    min_num_examples > k,
	    "The minimum number of examples of any class (%d) must be larger "
	    "than k (%d); it must be at least k+1 because any example needs "
	    "k *other* neighbors of the same class.\n",
	    min_num_examples, k)
}

SGMatrix<index_t> CLMNNImpl::find_target_nn(CDenseFeatures<float64_t>* x,
Expand Down
11 changes: 10 additions & 1 deletion src/shogun/metric/LMNNImpl.h
Expand Up @@ -78,7 +78,9 @@ class CLMNNImpl
* check feature and label size, dimensions of the initial transform, etc
* if the initial transform has not been initialized, do it using PCA
*/
static void check_training_setup(CFeatures* features, const CLabels* labels, SGMatrix<float64_t>& init_transform);
static void check_training_setup(
CFeatures* features, CLabels* labels,
SGMatrix<float64_t>& init_transform, int32_t k);

/**
* for each feature in x, find its target neighbors; that is, its k
Expand Down Expand Up @@ -160,6 +162,13 @@ class CLMNNImpl
*/
static CEuclideanDistance* setup_distance(CDenseFeatures<float64_t>* x, std::vector<index_t>& a, std::vector<index_t>& b);

/**
* check that k is less than the minimum number of examples in any
* class.
* k must be less than the number of examples in any class because each
* example needs k other examples (the nearest ones) of the same class
*/
static void check_maximum_k(CLabels* labels, int32_t k);

}; /* class CLMNNImpl */

Expand Down
5 changes: 5 additions & 0 deletions src/shogun/multiclass/KNN.cpp
Expand Up @@ -110,6 +110,11 @@ SGMatrix<index_t> CKNN::nearest_neighbors()
{
//number of examples to which kNN is applied
int32_t n=distance->get_num_vec_rhs();

REQUIRE(
n >= m_k,
"K (%d) must not be larger than the number of examples (%d).\n", m_k, n)

//distances to train data
SGVector<float64_t> dists(m_train_labels.vlen);
//indices to train data
Expand Down
21 changes: 12 additions & 9 deletions src/shogun/preprocessor/PCA.cpp
Expand Up @@ -85,7 +85,8 @@ bool CPCA::init(CFeatures* features)
->get_feature_matrix();
int32_t num_vectors = feature_matrix.num_cols;
int32_t num_features = feature_matrix.num_rows;
SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features)
SG_INFO(
"num_examples: %d num_features: %d\n", num_vectors, num_features)

// max target dim allowed
int32_t max_dim_allowed = CMath::min(num_vectors, num_features);
Expand Down Expand Up @@ -134,7 +135,7 @@ void CPCA::init_with_evd(const SGMatrix<float64_t>& feature_matrix, int32_t max_
cov_mat = fmatrix*fmatrix.transpose();
cov_mat /= (num_vectors-1);

SG_INFO("Computing Eigenvalues ... ")
SG_INFO("Computing Eigenvalues\n")
// eigen value computed
SelfAdjointEigenSolver<MatrixXd> eigenSolve =
SelfAdjointEigenSolver<MatrixXd>(cov_mat);
Expand Down Expand Up @@ -171,7 +172,7 @@ void CPCA::init_with_evd(const SGMatrix<float64_t>& feature_matrix, int32_t max_
}
break;
};
SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim)
SG_INFO("Reducing from %i to %i features\n", num_features, num_dim)

m_transformation_matrix = SGMatrix<float64_t>(num_features,num_dim);
Map<MatrixXd> transformMatrix(m_transformation_matrix.matrix,
Expand All @@ -188,10 +189,12 @@ void CPCA::init_with_evd(const SGMatrix<float64_t>& feature_matrix, int32_t max_
if (CMath::fequals_abs<float64_t>(0.0, eigenValues[i+max_dim_allowed-num_dim],
m_eigenvalue_zero_tolerance))
{
SG_WARNING("Covariance matrix has almost zero Eigenvalue (ie "
"Eigenvalue within a tolerance of %E around 0) at "
"dimension %d. Consider reducing its dimension.",
m_eigenvalue_zero_tolerance, i+max_dim_allowed-num_dim+1)
SG_WARNING(
"Covariance matrix has almost zero Eigenvalue (ie "
"Eigenvalue within a tolerance of %E around 0) at "
"dimension %d. Consider reducing its dimension.\n",
m_eigenvalue_zero_tolerance,
i + max_dim_allowed - num_dim + 1)

transformMatrix.col(i) = MatrixXd::Zero(num_features,1);
continue;
Expand Down Expand Up @@ -246,7 +249,7 @@ void CPCA::init_with_svd(const SGMatrix<float64_t> &feature_matrix, int32_t max_
}
break;
};
SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim)
SG_INFO("Reducing from %i to %i features...\n", num_features, num_dim)

// right singular vectors form eigenvectors
m_transformation_matrix = SGMatrix<float64_t>(num_features, num_dim);
Expand Down Expand Up @@ -308,7 +311,7 @@ SGMatrix<float64_t> CPCA::apply_to_feature_matrix(CFeatures* features)
feature_matrix.block(0,0,num_dim,num_vectors) =
transform_matrix.transpose()*feature_matrix;

SG_INFO("Form matrix of target dimension")
SG_INFO("Form matrix of target dimension\n")
for (int32_t col=0; col<num_vectors; col++)
{
for (int32_t row=0; row<num_dim; row++)
Expand Down

0 comments on commit b63e281

Please sign in to comment.