Simplify LRR via using CDenseFeatures interface for cov, gram, sum
karlnapf committed Jul 31, 2018
1 parent 58b914a commit 4641206
Showing 9 changed files with 223 additions and 69 deletions.
@@ -2,15 +2,21 @@
Linear Ridge Regression
=======================

A linear ridge regression model can be defined as :math:`y_i = \bf{w}^\top\bf{x_i} + b` where :math:`y_i` is the predicted value, :math:`\bf{x_i}` is a feature vector, :math:`\bf{w}` is the weight vector, and :math:`b` is a bias term.
A linear ridge regression model can be defined as :math:`y_i = \bf{w}^\top\bf{x_i}`, where :math:`y_i` is the predicted value, :math:`\bf{x_i}` is a feature vector, and :math:`\bf{w}` is the weight vector.
We aim to find the linear function that best explains the data, i.e. the one that minimizes the squared loss plus an :math:`L_2` regularization term,

.. math::
    \min_{\bf w}\sum_{i=1}^{N}\left(y_{i}-{\bf w}^\top{\bf x}_{i}\right)^{2}+\tau\Vert{\bf w}\Vert_{2}^{2}.

One can show that the solution can be written as:

.. math::
    {\bf w}=\left(\tau I_{D}+XX^{\top}\right)^{-1}Xy

where :math:`X=\left[{\bf x}_{1},\dots{\bf x}_{N}\right]\in\mathbb{R}^{D\times N}` is the training data matrix, containing :math:`N` training samples of dimension :math:`D`, :math:`y=[y_{1},\dots,y_{N}]^{\top}\in\mathbb{R}^{N}` are the labels, and :math:`\tau>0` scales the regularization term.

The bias term is computed as :math:`b=\frac{1}{N}\sum_{i=1}^{N}y_{i}-{\bf w}\cdot\bar{\mathbf{x}}`, where :math:`\bar{\mathbf{x}}=\frac{1}{N}\sum_{i=1}^{N}{\bf x}_{i}`.
Alternatively, if :math:`D>N`, the solution can be written as

.. math::
    {\bf w}=X\left(\tau I_{N}+X^{\top}X\right)^{-1}y

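Both expressions give the same :math:`{\bf w}`: from the identity :math:`\left(\tau I_{D}+XX^{\top}\right)X=X\left(\tau I_{N}+X^{\top}X\right)`, multiplying by the respective inverses on the left and on the right yields

.. math::
    \left(\tau I_{D}+XX^{\top}\right)^{-1}X=X\left(\tau I_{N}+X^{\top}X\right)^{-1}.
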
In practice, an additional bias :math:`b=\frac{1}{N}\sum_{i=1}^{N}y_{i}-{\bf w}\cdot\bar{\mathbf{x}}`, where :math:`\bar{\mathbf{x}}=\frac{1}{N}\sum_{i=1}^{N}{\bf x}_{i}`, can also be included; this effectively centers :math:`X` before computing the solution.
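
For illustration, here is a minimal standalone sketch of the primal closed form with a bias term, written against Eigen rather than Shogun's API; the function name ``ridge_solve`` and the layout of :math:`X` (one sample per column, assuming :math:`N \geq D`) are assumptions made for this example.

.. code-block:: cpp

    #include <Eigen/Dense>

    // Illustrative sketch only, not Shogun's implementation.
    // X is D x N (one sample per column), y has N entries, tau > 0.
    Eigen::VectorXd ridge_solve(const Eigen::MatrixXd& X,
                                const Eigen::VectorXd& y,
                                double tau, double& bias)
    {
        const Eigen::Index D = X.rows();

        Eigen::VectorXd x_bar = X.rowwise().mean();  // mean feature vector
        const double y_bar = y.mean();               // mean label

        // Center the data, then solve (tau I + Xc Xc^T) w = Xc yc via Cholesky.
        Eigen::MatrixXd Xc = X.colwise() - x_bar;
        Eigen::VectorXd yc = (y.array() - y_bar).matrix();
        Eigen::MatrixXd A =
            tau * Eigen::MatrixXd::Identity(D, D) + Xc * Xc.transpose();
        Eigen::VectorXd w = A.llt().solve(Xc * yc);

        bias = y_bar - w.dot(x_bar);  // b = mean(y) - w . mean(x)
        return w;
    }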

For the special case when :math:`\tau = 0`, a wrapper class :sgclass:`CLeastSquaresRegression` is available.

@@ -34,6 +40,10 @@ After training, we can extract :math:`{\bf w}` and the bias.

.. sgexample:: linear_ridge_regression.sg:extract_w

We could also have trained without a bias term and set it manually afterwards.

.. sgexample:: linear_ridge_regression.sg:manual_bias

Finally, we can evaluate the :sgclass:`CMeanSquaredError`.

.. sgexample:: linear_ridge_regression.sg:evaluate_error
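
For reference, the mean squared error of predictions :math:`\hat{y}_{i}` against targets :math:`y_{i}` is :math:`\frac{1}{N}\sum_{i=1}^{N}\left(\hat{y}_{i}-y_{i}\right)^{2}`.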
11 changes: 10 additions & 1 deletion examples/meta/src/regression/linear_ridge_regression.sg
@@ -24,10 +24,19 @@ real b = lrr.get_real("bias")
RealVector w = lrr.get_real_vector("w")
#![extract_w]

#![manual_bias]
Machine lrr2 = machine("LinearRidgeRegression", tau=0.001, labels=labels_train, use_bias=False)
lrr2.train(features_train)
real my_bias = 0.1
lrr2.put("bias", my_bias)
Labels labels_predict2 = lrr2.apply(features_test)
#![manual_bias]

#![evaluate_error]
Evaluation eval = evaluation("MeanSquaredError")
real mse = eval.evaluate(labels_predict, labels_test)
#![evaluate_error]

# integration testing variables
RealVector output = labels_test.get_real_vector("labels")
RealVector output = labels_predict.get_real_vector("labels")
RealVector output2 = labels_predict2.get_real_vector("labels")
2 changes: 2 additions & 0 deletions src/interfaces/swig/factory.i
@@ -4,6 +4,8 @@
%include <shogun/util/factory.h>

%template(features) shogun::features<float64_t>;
%template(labels) shogun::labels<float64_t>;


%newobject shogun::string_features(CFile*, EAlphabet alpha = DNA, EPrimitiveType primitive_type = PT_CHAR);
%newobject shogun::transformer(const std::string&);
60 changes: 60 additions & 0 deletions src/shogun/features/DenseFeatures.cpp
@@ -19,6 +19,19 @@
#include <algorithm>
#include <string.h>

#define ASSERT_FLOATING_POINT \
switch (get_feature_type()) \
{ \
case F_SHORTREAL: \
case F_DREAL: \
case F_LONGREAL: \
break; \
default: \
REQUIRE( \
false, "Only defined for %s with real type, not for %s.\n", \
get_name(), demangled_type<ST>().c_str()); \
}

namespace shogun {

template<class ST> CDenseFeatures<ST>::CDenseFeatures(int32_t size) : CDotFeatures(size)
@@ -1001,6 +1014,53 @@ template< class ST > CDenseFeatures< ST >* CDenseFeatures< ST >::obtain_from_gen
return (CDenseFeatures< ST >*) base_features;
}

template <typename ST>
SGVector<ST> CDenseFeatures<ST>::sum() const
{
// TODO optimize non batch mode, but get_feature_vector is non const :(
SGVector<ST> result = linalg::rowwise_sum(get_feature_matrix());
return result;
}

template <typename ST>
SGVector<ST> CDenseFeatures<ST>::mean() const
{
ASSERT_FLOATING_POINT

auto result = sum();
ST scale = ((ST)1.0) / get_num_vectors();
linalg::scale(result, result, scale);
return result;
}

template <typename ST>
SGMatrix<ST> CDenseFeatures<ST>::cov() const
{
// TODO optimize non batch mode, but get_feature_vector is non const :(
auto mat = get_feature_matrix();
return linalg::matrix_prod(mat, mat, false, true);
}

template <typename ST>
SGMatrix<ST> CDenseFeatures<ST>::gram() const
{
// TODO optimize non batch mode, but get_feature_vector is non const :(
auto mat = get_feature_matrix();
return linalg::matrix_prod(mat, mat, true, false);
}

template <typename ST>
SGVector<ST> CDenseFeatures<ST>::dot(const SGVector<ST>& other) const
{
REQUIRE(
get_num_vectors() == other.size(), "Number of feature vectors (%d) "
"must match provided vector's size "
"(%d).\n",
get_num_vectors(), other.size());
// TODO optimize non batch mode, but get_feature_vector is non const :(
return linalg::matrix_prod(get_feature_matrix(), other, false);
}

template class CDenseFeatures<bool>;
template class CDenseFeatures<char>;
template class CDenseFeatures<int8_t>;
48 changes: 44 additions & 4 deletions src/shogun/features/DenseFeatures.h
@@ -11,14 +11,14 @@

#include <shogun/lib/config.h>

#include <shogun/lib/common.h>
#include <shogun/lib/Cache.h>
#include <shogun/io/File.h>
#include <shogun/features/DotFeatures.h>
#include <shogun/features/StringFeatures.h>
#include <shogun/io/File.h>
#include <shogun/lib/Cache.h>
#include <shogun/lib/DataType.h>

#include <shogun/lib/SGMatrix.h>
#include <shogun/lib/common.h>
#include <shogun/mathematics/linalg/LinalgNamespace.h>

namespace shogun {
template<class ST> class CStringFeatures;
@@ -303,6 +303,46 @@ template<class ST> class CDenseFeatures: public CDotFeatures
virtual float64_t dot(int32_t vec_idx1, CDotFeatures* df,
int32_t vec_idx2);

/** Computes the sum of all feature vectors
* @return Sum of all feature vectors
*/
SGVector<ST> sum() const;

/** Computes the empirical mean of all feature vectors
* @return Mean of all feature vectors
*/
SGVector<ST> mean() const;

/** Computes the \f$D \times D\f$ (uncentered, un-normalized) covariance matrix
*
*\f[
* X X^\top
* \f]
*
* where \f$X\f$ is the \f$DxN\f$ dimensional feature matrix with \f$N\f$
* feature vectors of dimension \f$D\f$.
*/
SGMatrix<ST> cov() const;
/** Computes the \f$N \times N\f$ (uncentered, un-normalized) Gram matrix of
* pairwise dot products, that is
*
*\f[
* X^\top X
* \f]
*
* where \f$X\f$ is the \f$DxN\f$ dimensional feature matrix with \f$N\f$
* feature vectors of dimension \f$D\f$.
*/
SGMatrix<ST> gram() const;

/** Computes the dot product of the feature matrix with a given vector.
*
* @param other Vector to compute dot products with; its size must match the
* number of feature vectors
* @return Vector with as many entries as there are feature dimensions
*/
SGVector<ST> dot(const SGVector<ST>& other) const;

/** compute dot product between vector1 and a dense vector
*
* possible with subset
4 changes: 3 additions & 1 deletion src/shogun/machine/LinearMachine.cpp
@@ -37,7 +37,9 @@ void CLinearMachine::init()

SG_ADD(&m_w, "w", "Parameter vector w.", MS_NOT_AVAILABLE);
SG_ADD(&bias, "bias", "Bias b.", MS_NOT_AVAILABLE);
SG_ADD(&features, "features", "Feature object.", MS_NOT_AVAILABLE);
SG_ADD(
(CFeatures**)&features, "features", "Feature object.",
MS_NOT_AVAILABLE);
}


99 changes: 53 additions & 46 deletions src/shogun/regression/LinearRidgeRegression.cpp
@@ -32,57 +32,64 @@ CLinearRidgeRegression::CLinearRidgeRegression(float64_t tau, CDenseFeatures<flo
void CLinearRidgeRegression::init()
{
set_tau(1e-6);
m_use_bias = true;

SG_ADD(&m_tau, "tau", "Regularization parameter", MS_AVAILABLE);
SG_ADD(
&m_use_bias, "use_bias", "Whether or not to fit an offset term",
MS_NOT_AVAILABLE);
}

bool CLinearRidgeRegression::train_machine(CFeatures* data)
template <typename T>
bool CLinearRidgeRegression::train_machine_templated(
const CDenseFeatures<T>* feats)
{
REQUIRE(m_labels,"No labels set\n");

if (!data)
data=features;

REQUIRE(data,"No features provided and no features previously set\n");

REQUIRE(m_labels->get_num_labels() == data->get_num_vectors(),
"Number of training vectors (%d) does not match number of labels (%d)\n",
m_labels->get_num_labels(), data->get_num_vectors());

REQUIRE(data->get_feature_class() == C_DENSE,
"Expected Dense Features (%d) but got (%d)\n",
C_DENSE, data->get_feature_class());

REQUIRE(data->get_feature_type() == F_DREAL,
"Expected Real Features (%d) but got (%d)\n",
F_DREAL, data->get_feature_type());

CDenseFeatures<float64_t>* feats=(CDenseFeatures<float64_t>*) data;
int32_t num_feat=feats->get_num_features();

SGMatrix<float64_t> kernel_matrix(num_feat,num_feat);
SGMatrix<float64_t> feats_matrix(feats->get_feature_matrix());
SGVector<float64_t> y(num_feat);
SGVector<float64_t> tau_vector(num_feat);

tau_vector.zero();
tau_vector.add(m_tau);

linalg::matrix_prod(feats_matrix, feats_matrix, kernel_matrix, false, true);
linalg::add_diag(kernel_matrix, tau_vector);

auto labels = regression_labels(m_labels);
auto lab = regression_labels(m_labels)->get_labels();
linalg::matrix_prod(feats_matrix, lab, y);

auto decomposition = linalg::cholesky_factor(kernel_matrix);
y = linalg::cholesky_solver(decomposition, y);
set_w(y);

auto x_bar = linalg::colwise_sum(feats_matrix);
linalg::scale(x_bar, x_bar, 1.0 / ((float64_t)x_bar.size()));
auto intercept = linalg::mean(lab) - linalg::dot(lab, x_bar);
set_bias(intercept);
auto N = feats->get_num_vectors();
auto D = feats->get_num_features();

auto y = regression_labels(m_labels)->get_labels();

SGVector<T> x_bar;
T y_bar;
if (m_use_bias)
{
x_bar = feats->mean();
y_bar = linalg::mean(y);
}

SGVector<float64_t> w;
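// N >= D: solve the D x D primal system (tau I_D + X X^T) w = X y,
// centered when fitting a bias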
if (N >= D)
{
SGMatrix<T> cov = feats->cov();
linalg::add_ridge(cov, m_tau);
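// With a bias, center via a rank-1 update: X_c X_c^T = X X^T - N * x_bar x_bar^T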
if (m_use_bias)
linalg::rank_update(cov, x_bar, (T)-N);

auto L = linalg::cholesky_factor(cov);
auto Xy = feats->dot(y);
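// With a bias, center the right-hand side: X_c y_c = X y - N * y_bar * x_bar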
if (m_use_bias)
linalg::add(Xy, x_bar, Xy, 1.0, -N * y_bar);

w = linalg::cholesky_solver(L, Xy);
}
else
{
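// D > N: solve the N x N dual system (tau I_N + X^T X) b = y, then w = X b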
if (m_use_bias)
SG_NOTIMPLEMENTED

SGMatrix<T> gram = feats->gram();
linalg::add_ridge(gram, m_tau);
auto L = linalg::cholesky_factor(gram);
auto b = linalg::cholesky_solver(L, y);
w = feats->dot(b);
}
set_w(w);

if (m_use_bias)
{
auto intercept = y_bar - linalg::dot(w, x_bar);
set_bias(intercept);
}

return true;
}
