Simplify LRR via using CDenseFeatures interface for cov, gram, sum
make bias computation optional
karlnapf committed Jul 23, 2018
1 parent 1be1dbb commit b7c5e58
Showing 11 changed files with 228 additions and 31 deletions.
@@ -10,6 +10,10 @@ We aim to find the linear function that best explains the data, i.e. minimizes t
where :math:`X=\left[{\bf x}_{1},\dots{\bf x}_{N}\right]\in\mathbb{R}^{D\times N}` is the training data matrix, containing :math:`N` training samples of dimension :math:`D`, :math:`y=[y_{1},\dots,y_{N}]^{\top}\in\mathbb{R}^{N}` are the labels, and :math:`\tau>0` scales the regularization term.

Alternatively, if :math:`D>N`, the solution can be written as

.. math::

    {\bf w}=X\left(\tau I_{N}+X^{\top}X\right)^{-1}y

The bias term is computed as :math:`b=\frac{1}{N}\sum_{i=1}^{N}y_{i}-{\bf w}\cdot\bar{\mathbf{x}}`, where :math:`\bar{\mathbf{x}}=\frac{1}{N}\sum_{i=1}^{N}{\bf x}_{i}`.

For the special case when :math:`\tau = 0`, a wrapper class :sgclass:`CLeastSquaresRegression` is available.
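
For reference, the two closed-form expressions for :math:`{\bf w}` agree via the push-through identity,

.. math::

    X\left(\tau I_{N}+X^{\top}X\right)=\left(\tau I_{D}+XX^{\top}\right)X
    \quad\Longrightarrow\quad
    \left(\tau I_{D}+XX^{\top}\right)^{-1}X=X\left(\tau I_{N}+X^{\top}X\right)^{-1},

so both yield the same weights when applied to :math:`y`.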
@@ -34,6 +38,10 @@ After training, we can extract :math:`{\bf w}` and the bias.

.. sgexample:: linear_ridge_regression.sg:extract_w

We could also have trained without the bias term and set it manually afterwards.

.. sgexample:: linear_ridge_regression.sg:manual_bias

Finally, we can evaluate the :sgclass:`CMeanSquaredError`.

.. sgexample:: linear_ridge_regression.sg:evaluate_error
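
For reference, the mean squared error computed here is

.. math::

    \text{MSE}=\frac{1}{N}\sum_{i=1}^{N}\left(\hat{y}_{i}-y_{i}\right)^{2},

where :math:`\hat{y}_{i}` are the predicted labels on the test data.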
11 changes: 10 additions & 1 deletion examples/meta/src/regression/linear_ridge_regression.sg
@@ -24,10 +24,19 @@ real b = lrr.get_real("bias")
RealVector w = lrr.get_real_vector("w")
#![extract_w]

#![manual_bias]
Machine lrr2 = machine("LinearRidgeRegression", tau=0.001, labels=labels_train, use_bias=False)
lrr2.train(features_train)
real my_bias = 0.1
lrr2.put("bias", my_bias)
Labels labels_predict2 = lrr2.apply(features_test)
#![manual_bias]

#![evaluate_error]
Evaluation eval = evaluation("MeanSquaredError")
real mse = eval.evaluate(labels_predict, labels_test)
#![evaluate_error]

# integration testing variables
RealVector output = labels_test.get_real_vector("labels")
RealVector output = labels_predict.get_real_vector("labels")
RealVector output2 = labels_predict2.get_real_vector("labels")
36 changes: 36 additions & 0 deletions src/shogun/features/DenseFeatures.cpp
@@ -1001,6 +1001,42 @@ template< class ST > CDenseFeatures< ST >* CDenseFeatures< ST >::obtain_from_gen
return (CDenseFeatures< ST >*) base_features;
}

template <typename ST>
SGVector<ST> CDenseFeatures<ST>::sum() const
{
// TODO optimize non batch mode, but get_feature_vector is non const :(
SGVector<ST> result = linalg::rowwise_sum(get_feature_matrix());
return result;
}

template <typename ST>
SGMatrix<ST> CDenseFeatures<ST>::cov() const
{
// TODO optimize non batch mode, but get_feature_vector is non const :(
auto mat = get_feature_matrix();
return linalg::matrix_prod(mat, mat, false, true);
}

template <typename ST>
SGMatrix<ST> CDenseFeatures<ST>::gram() const
{
// TODO optimize non batch mode, but get_feature_vector is non const :(
auto mat = get_feature_matrix();
return linalg::matrix_prod(mat, mat, true, false);
}

template <typename ST>
SGVector<ST> CDenseFeatures<ST>::dot(const SGVector<ST>& other) const
{
REQUIRE(
get_num_vectors() == other.size(), "Number of feature vectors (%d) "
"must match provided vector's size "
"(%d).\n",
get_num_vectors(), other.size());
// TODO optimize non batch mode, but get_feature_vector is non const :(
return linalg::matrix_prod(get_feature_matrix(), other, false);
}

template class CDenseFeatures<bool>;
template class CDenseFeatures<char>;
template class CDenseFeatures<int8_t>;
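
A minimal C++ usage sketch of the new helpers (illustrative only: the data values, variable names, and the wrapper function are made up; the CDenseFeatures methods are the ones added in this commit):

#include <shogun/features/DenseFeatures.h>
#include <shogun/lib/SGMatrix.h>
#include <shogun/lib/SGVector.h>

using namespace shogun;

void dense_features_helpers_sketch()
{
	// D = 2 feature dimensions, N = 3 feature vectors, stored column-wise
	SGMatrix<float64_t> X(2, 3);
	X(0, 0) = 1; X(0, 1) = 2; X(0, 2) = 3;
	X(1, 0) = 4; X(1, 1) = 5; X(1, 2) = 6;
	auto feats = new CDenseFeatures<float64_t>(X);

	SGVector<float64_t> s = feats->sum();   // length D: sum over all feature vectors
	SGMatrix<float64_t> C = feats->cov();   // D x D: X * X^T (uncentered, un-normalized)
	SGMatrix<float64_t> G = feats->gram();  // N x N: X^T * X

	SGVector<float64_t> y(3);
	y[0] = 1; y[1] = 0; y[2] = -1;
	SGVector<float64_t> Xy = feats->dot(y); // length D: X * y

	SG_UNREF(feats);
}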
42 changes: 38 additions & 4 deletions src/shogun/features/DenseFeatures.h
@@ -11,14 +11,14 @@

#include <shogun/lib/config.h>

#include <shogun/lib/common.h>
#include <shogun/lib/Cache.h>
#include <shogun/io/File.h>
#include <shogun/features/DotFeatures.h>
#include <shogun/features/StringFeatures.h>
#include <shogun/io/File.h>
#include <shogun/lib/Cache.h>
#include <shogun/lib/DataType.h>

#include <shogun/lib/SGMatrix.h>
#include <shogun/lib/common.h>
#include <shogun/mathematics/linalg/LinalgNamespace.h>

namespace shogun {
template<class ST> class CStringFeatures;
@@ -303,6 +303,40 @@ template<class ST> class CDenseFeatures: public CDotFeatures
virtual float64_t dot(int32_t vec_idx1, CDotFeatures* df,
int32_t vec_idx2);

/** Computes the sum of all feature vectors
* @return Sum of all feature vectors
*/
SGVector<ST> sum() const;

/** Computes the \f$DxD\f$ (uncentered, un-normalized) covariance matrix
*
*\f[
* X X^\top
* \f]
*
* where \f$X\f$ is the \f$DxN\f$ dimensional feature matrix with \f$N\f$
* feature vectors of dimension \f$D\f$.
*/
SGMatrix<ST> cov() const;
/** Computes the \f$NxN\f$ Gram matrix of pairwise dot products, that is
*
*\f[
* X^\top X
* \f]
*
* where \f$X\f$ is the \f$DxN\f$ dimensional feature matrix with \f$N\f$
* feature vectors of dimension \f$D\f$.
*/
SGMatrix<ST> gram() const;

/** Computes the dot product of the feature matrix with a given vector.
*
* @param other Vector to compute dot products with, size must match number
* of feature vectors
* @return Vector with as many entries as there are feature dimensions
*/
SGVector<ST> dot(const SGVector<ST>& other) const;

/** compute dot product between vector1 and a dense vector
*
* possible with subset
14 changes: 14 additions & 0 deletions src/shogun/mathematics/linalg/LinalgBackendBase.h
@@ -140,6 +140,20 @@ namespace shogun
DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_ADD_DIAG, SGMatrix);
#undef BACKEND_GENERIC_ADD_DIAG

/**
* Wrapper method of adding a constant ridge to the diagonal: A.diagonal = A.diagonal + beta.
*
* @see linalg::add_ridge
*/
#define BACKEND_GENERIC_ADD_RIDGE(Type, Container) \
virtual void add_ridge(SGMatrix<Type>& A, Type beta) const \
{ \
SG_SNOTIMPLEMENTED; \
return; \
}
DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_ADD_RIDGE, SGMatrix);
#undef BACKEND_GENERIC_ADD_RIDGE

/**
* Wrapper method of add vector to each column of matrix.
*
10 changes: 10 additions & 0 deletions src/shogun/mathematics/linalg/LinalgBackendEigen.h
@@ -73,6 +73,12 @@ namespace shogun
DEFINE_FOR_NUMERIC_PTYPE(BACKEND_GENERIC_ADD_DIAG, SGMatrix)
#undef BACKEND_GENERIC_ADD_DIAG

/** Implementation of @see LinalgBackendBase::add_ridge */
#define BACKEND_GENERIC_ADD_RIDGE(Type, Container) \
virtual void add_ridge(SGMatrix<Type>& A, Type beta) const;
DEFINE_FOR_NUMERIC_PTYPE(BACKEND_GENERIC_ADD_RIDGE, SGMatrix)
#undef BACKEND_GENERIC_ADD_RIDGE

/** Implementation of @see LinalgBackendBase::add_vector */
#define BACKEND_GENERIC_ADD(Type, Container) \
virtual void add_vector( \
@@ -443,6 +449,10 @@ namespace shogun
void add_diag_impl(
SGMatrix<T>& A, const SGVector<T>& b, T alpha, T beta) const;

/** Eigen3 add ridge (scalar to diagonal) method */
template <typename T>
void add_ridge_impl(SGMatrix<T>& A, T beta) const;

/** Eigen3 add vector to each column of matrix method */
template <typename T>
void add_vector_impl(
15 changes: 15 additions & 0 deletions src/shogun/mathematics/linalg/LinalgNamespace.h
@@ -583,6 +583,21 @@ namespace shogun
infer_backend(A, SGMatrix<T>(b))->add_diag(A, b, alpha, beta);
}

/**
* Performs the operation A.diagonal = A.diagonal + beta.
* The matrix is not required to be square.
*
* @param A The matrix
* @param beta Constant to be added to each diagonal entry
*/
template <typename T>
void add_ridge(SGMatrix<T>& A, T beta)
{
auto diagonal_len = CMath::min(A.num_cols, A.num_rows);
REQUIRE(diagonal_len > 0, "Matrix can't be empty.\n");
infer_backend(A)->add_ridge(A, beta);
}
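
A minimal usage sketch of linalg::add_ridge (illustrative only; values and the wrapper function are made up):

#include <shogun/lib/SGMatrix.h>
#include <shogun/mathematics/linalg/LinalgNamespace.h>

using namespace shogun;

void add_ridge_sketch()
{
	SGMatrix<float64_t> A(2, 2);
	A(0, 0) = 4.0; A(0, 1) = 1.0;
	A(1, 0) = 1.0; A(1, 1) = 3.0;

	// adds 0.1 to every diagonal entry: A(0,0) -> 4.1, A(1,1) -> 3.1
	linalg::add_ridge(A, 0.1);
}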

/**
* Performs the operation result = alpha * A.col(i) + beta * b,
* for each column of A.
16 changes: 16 additions & 0 deletions src/shogun/mathematics/linalg/backend/eigen/BasicOps.cpp
@@ -30,6 +30,7 @@
* Authors: 2016 Pan Deng, Soumyajit De, Heiko Strathmann, Viktor Gal
*/

#include <shogun/base/range.h>
#include <shogun/mathematics/linalg/LinalgBackendEigen.h>
#include <shogun/mathematics/linalg/LinalgMacros.h>

@@ -67,6 +68,14 @@ DEFINE_FOR_NUMERIC_PTYPE(BACKEND_GENERIC_ADD_COL_VEC, SGMatrix)
DEFINE_FOR_NUMERIC_PTYPE(BACKEND_GENERIC_ADD_DIAG, SGMatrix)
#undef BACKEND_GENERIC_ADD_DIAG

#define BACKEND_GENERIC_ADD_RIDGE(Type, Container) \
void LinalgBackendEigen::add_ridge(SGMatrix<Type>& A, Type beta) const \
{ \
add_ridge_impl(A, beta); \
}
DEFINE_FOR_NUMERIC_PTYPE(BACKEND_GENERIC_ADD_RIDGE, SGMatrix)
#undef BACKEND_GENERIC_ADD_RIDGE

#define BACKEND_GENERIC_PINVH(Type, Container) \
void LinalgBackendEigen::pinvh( \
const SGMatrix<Type>& A, SGMatrix<Type>& result) const \
@@ -226,6 +235,13 @@ void LinalgBackendEigen::add_diag_impl(
A_eig.diagonal() = alpha * A_eig.diagonal() + beta * b_eig;
}

template <typename T>
void LinalgBackendEigen::add_ridge_impl(SGMatrix<T>& A, T beta) const
{
for (auto i : range(std::min(A.num_rows, A.num_cols)))
A(i, i) += beta;
}

template <typename T>
void LinalgBackendEigen::pinvh_impl(
const SGMatrix<T>& A, SGMatrix<T>& result) const
72 changes: 46 additions & 26 deletions src/shogun/regression/LinearRidgeRegression.cpp
@@ -32,8 +32,34 @@ CLinearRidgeRegression::CLinearRidgeRegression(float64_t tau, CDenseFeatures<flo
void CLinearRidgeRegression::init()
{
set_tau(1e-6);
m_use_bias = true;

SG_ADD(&m_tau, "tau", "Regularization parameter", MS_AVAILABLE);
SG_ADD(
&m_use_bias, "use_bias", "Whether or not to fit an offset term",
MS_NOT_AVAILABLE);
}

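// Solves (X X^T + tau I_D) w = X y via a D x D Cholesky factorization;
// cheaper when N >= D.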
template <typename T>
SGVector<T> solve_using_covariance(
const CDenseFeatures<T>* feats, const SGVector<T>& y, T tau)
{
SGMatrix<T> cov = feats->cov();
linalg::add_ridge(cov, tau);
auto L = linalg::cholesky_factor(cov);
auto Xy = feats->dot(y);
return linalg::cholesky_solver(L, Xy);
}

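// Computes w = X (X^T X + tau I_N)^{-1} y via an N x N Cholesky factorization;
// cheaper when D > N.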
template <typename T>
SGVector<T>
solve_using_gram(const CDenseFeatures<T>* feats, const SGVector<T>& y, T tau)
{
SGMatrix<T> gram = feats->gram();
linalg::add_ridge(gram, tau);
auto L = linalg::cholesky_factor(gram);
auto b = linalg::cholesky_solver(L, y);
return feats->dot(b);
}

bool CLinearRidgeRegression::train_machine(CFeatures* data)
@@ -57,32 +83,26 @@ bool CLinearRidgeRegression::train_machine(CFeatures* data)
"Expected Real Features (%d) but got (%d)\n",
F_DREAL, data->get_feature_type());

CDenseFeatures<float64_t>* feats=(CDenseFeatures<float64_t>*) data;
int32_t num_feat=feats->get_num_features();

SGMatrix<float64_t> kernel_matrix(num_feat,num_feat);
SGMatrix<float64_t> feats_matrix(feats->get_feature_matrix());
SGVector<float64_t> y(num_feat);
SGVector<float64_t> tau_vector(num_feat);

tau_vector.zero();
tau_vector.add(m_tau);

linalg::matrix_prod(feats_matrix, feats_matrix, kernel_matrix, false, true);
linalg::add_diag(kernel_matrix, tau_vector);

auto labels = regression_labels(m_labels);
auto lab = regression_labels(m_labels)->get_labels();
linalg::matrix_prod(feats_matrix, lab, y);

auto decomposition = linalg::cholesky_factor(kernel_matrix);
y = linalg::cholesky_solver(decomposition, y);
set_w(y);

auto x_bar = linalg::colwise_sum(feats_matrix);
linalg::scale(x_bar, x_bar, 1.0 / ((float64_t)x_bar.size()));
auto intercept = linalg::mean(lab) - linalg::dot(lab, x_bar);
set_bias(intercept);
auto feats = data->as<CDenseFeatures<float64_t>>();
auto y = regression_labels(m_labels)->get_labels();
auto N = feats->get_num_vectors();
auto D = feats->get_num_features();

SGVector<float64_t> w;
if (N >= D)
w = solve_using_covariance<float64_t>(feats, y, m_tau);
else
w = solve_using_gram<float64_t>(feats, y, m_tau);

set_w(w);

if (m_use_bias)
{
auto x_bar = feats->sum();
linalg::scale(x_bar, x_bar, ((float64_t)1.0) / N);
auto intercept = linalg::mean(y) - linalg::dot(w, x_bar);
set_bias(intercept);
}

return true;
}
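
An end-to-end C++ usage sketch of the updated class (illustrative only: data values and the wrapper function are made up; constructor and accessors are the existing public API):

#include <shogun/features/DenseFeatures.h>
#include <shogun/labels/RegressionLabels.h>
#include <shogun/regression/LinearRidgeRegression.h>

using namespace shogun;

void ridge_regression_sketch()
{
	// D = 1, N = 4, with y = 2 * x + 1 and zero-mean x
	SGMatrix<float64_t> X(1, 4);
	X(0, 0) = -1.5; X(0, 1) = -0.5; X(0, 2) = 0.5; X(0, 3) = 1.5;
	SGVector<float64_t> y(4);
	y[0] = -2; y[1] = 0; y[2] = 2; y[3] = 4;

	auto feats = new CDenseFeatures<float64_t>(X);
	auto labels = new CRegressionLabels(y);

	auto lrr = new CLinearRidgeRegression(1e-6, feats, labels);
	lrr->train(feats);                     // N >= D, so the covariance path is used

	SGVector<float64_t> w = lrr->get_w();  // approximately 2
	float64_t b = lrr->get_bias();         // approximately 1

	SG_UNREF(lrr);
}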
10 changes: 10 additions & 0 deletions src/shogun/regression/LinearRidgeRegression.h
@@ -39,6 +39,13 @@ namespace shogun
* and \f$b=\frac{1}{N}\sum_{i=1}^{N}y_{i}-{\bf w}\cdot\bar{\mathbf{x}}\f$
* for
* \f$\bar{\mathbf{x}}=\frac{1}{N}\sum_{i=1}^{N}{\bf x}_{i}\f$.
*
* If the dimension of the data \f$D\f$ is larger than the number of data
* points \f$N\f$, the weights are instead computed via the Gram matrix as
*
* \f[
* {\bf w}=X\left(\tau I_{N}+X^{\top}X\right)^{-1}y,
* \f]
*
* where \f$X=\left[{\bf x}_{1},\dots,{\bf x}_{N}\right]\in\mathbb{R}^{D\times N}\f$
* is the feature matrix.
*/
class CLinearRidgeRegression : public CLinearMachine
{
@@ -107,6 +114,9 @@
protected:
/** regularization parameter tau */
float64_t m_tau;

/** Whether or not to compute an offset term */
bool m_use_bias;
};
}
#endif // _LINEARRIDGEREGRESSION_H__
@@ -225,6 +225,31 @@ TEST(LinalgBackendEigen, add_diag)
EXPECT_THROW(add_diag(A3, b3), ShogunException);
}

TEST(LinalgBackendEigen, add_ridge)
{
SGMatrix<float64_t> A1(2, 3);

A1(0, 0) = 1;
A1(0, 1) = 2;
A1(0, 2) = 3;
A1(1, 0) = 4;
A1(1, 1) = 5;
A1(1, 2) = 6;

add_ridge(A1, 1.0);

EXPECT_NEAR(A1(0, 0), 2, 1e-15);
EXPECT_NEAR(A1(0, 1), 2, 1e-15);
EXPECT_NEAR(A1(0, 2), 3, 1e-15);
EXPECT_NEAR(A1(1, 0), 4, 1e-15);
EXPECT_NEAR(A1(1, 1), 6, 1e-15);
EXPECT_NEAR(A1(1, 2), 6, 1e-15);

// test error cases
SGMatrix<float64_t> A2;
EXPECT_THROW(add_ridge(A2, 1.0), ShogunException);
}

TEST(LinalgBackendEigen, add_vector)
{
const float64_t alpha = 0.7;
