Simplify LRR via using CDenseFeatures interface for cov, gram, sum
karlnapf committed Jul 31, 2018
1 parent 58b914a commit 5969cad
Showing 10 changed files with 230 additions and 73 deletions.
@@ -2,15 +2,21 @@
Linear Ridge Regression
=======================

A linear ridge regression model can be defined as :math:`y_i = \bf{w}^\top\bf{x_i} + b` where :math:`y_i` is the predicted value, :math:`\bf{x_i}` is a feature vector, :math:`\bf{w}` is the weight vector, and :math:`b` is a bias term.
A linear ridge regression model can be defined as :math:`y_i = \bf{w}^\top\bf{x_i}` where :math:`y_i` is the predicted value, :math:`\bf{x_i}` is a feature vector, and :math:`\bf{w}` is the weight vector.
We aim to find the linear function that best explains the data, i.e. minimizes the squared loss plus an :math:`L_2` regularization term. One can show that the solution can be written as:

.. math::
    {\bf w}=\left(\tau I_{D}+XX^{\top}\right)^{-1}Xy

where :math:`X=\left[{\bf x}_{1},\dots,{\bf x}_{N}\right]\in\mathbb{R}^{D\times N}` is the training data matrix, containing :math:`N` training samples of dimension :math:`D`, :math:`y=[y_{1},\dots,y_{N}]^{\top}\in\mathbb{R}^{N}` are the labels, and :math:`\tau>0` scales the regularization term.

The bias term is computed as :math:`b=\frac{1}{N}\sum_{i=1}^{N}y_{i}-{\bf w}\cdot\bar{\mathbf{x}}`, where :math:`\bar{\mathbf{x}}=\frac{1}{N}\sum_{i=1}^{N}{\bf x}_{i}`.
Alternatively, if :math:`D>N`, the solution can be written as

.. math::
    {\bf w}=X\left(\tau I_{N}+X^{\top}X\right)^{-1}y

In practice, an additional bias :math:`b=\frac{1}{N}\sum_{i=1}^{N}y_{i}-{\bf w}\cdot\bar{\mathbf{x}}` for
:math:`\bar{\mathbf{x}}=\frac{1}{N}\sum_{i=1}^{N}{\bf x}_{i}` can also be included, which effectively centers the data :math:`X` before
computing the solution.
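To make the algebra concrete, here is a small standalone check, not part of the cookbook, that both closed forms yield the same weights via the push-through identity :math:`\left(\tau I_{D}+XX^{\top}\right)^{-1}X=X\left(\tau I_{N}+X^{\top}X\right)^{-1}`. It assumes Eigen rather than Shogun's internal linalg backend, and all names and toy values are illustrative only::

    // Standalone sketch: compare the primal and dual ridge solutions.
    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
        const int D = 3, N = 5;
        const double tau = 0.1;
        Eigen::MatrixXd X = Eigen::MatrixXd::Random(D, N); // columns are samples
        Eigen::VectorXd y = Eigen::VectorXd::Random(N);

        // Primal: w = (tau I_D + X X^T)^{-1} X y
        Eigen::MatrixXd A = tau * Eigen::MatrixXd::Identity(D, D) + X * X.transpose();
        Eigen::VectorXd w_primal = A.ldlt().solve(X * y);

        // Dual: w = X (tau I_N + X^T X)^{-1} y
        Eigen::MatrixXd B = tau * Eigen::MatrixXd::Identity(N, N) + X.transpose() * X;
        Eigen::VectorXd w_dual = X * B.ldlt().solve(y);

        // Bias as in the text: b = mean(y) - w . mean(x)
        double b = y.mean() - w_primal.dot(X.rowwise().mean());

        std::cout << (w_primal - w_dual).norm() << "\n"; // ~1e-15
        std::cout << b << "\n";
        return 0;
    }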

For the special case when :math:`\tau = 0`, a wrapper class :sgclass:`CLeastSquaresRegression` is available.

@@ -34,6 +40,10 @@ After training, we can extract :math:`{\bf w}` and the bias.

.. sgexample:: linear_ridge_regression.sg:extract_w

We could also have trained without the bias term and set it manually afterwards.

.. sgexample:: linear_ridge_regression.sg:manual_bias

Finally, we can evaluate the error via :sgclass:`CMeanSquaredError`.

.. sgexample:: linear_ridge_regression.sg:evaluate_error
11 changes: 10 additions & 1 deletion examples/meta/src/regression/linear_ridge_regression.sg
@@ -24,10 +24,19 @@ real b = lrr.get_real("bias")
RealVector w = lrr.get_real_vector("w")
#![extract_w]

#![manual_bias]
Machine lrr2 = machine("LinearRidgeRegression", tau=0.001, labels=labels_train, use_bias=False)
lrr2.train(features_train)
real my_bias = 0.1
lrr2.put("bias", my_bias)
Labels labels_predict2 = lrr2.apply(features_test)
#![manual_bias]

#![evaluate_error]
Evaluation eval = evaluation("MeanSquaredError")
real mse = eval.evaluate(labels_predict, labels_test)
#![evaluate_error]

# integration testing variables
RealVector output = labels_test.get_real_vector("labels")
RealVector output = labels_predict.get_real_vector("labels")
RealVector output2 = labels_predict2.get_real_vector("labels")
2 changes: 2 additions & 0 deletions src/interfaces/swig/factory.i
@@ -4,6 +4,8 @@
%include <shogun/util/factory.h>

%template(features) shogun::features<float64_t>;
%template(labels) shogun::labels<float64_t>;


%newobject shogun::string_features(CFile*, EAlphabet alpha = DNA, EPrimitiveType primitive_type = PT_CHAR);
%newobject shogun::transformer(const std::string&);
2 changes: 1 addition & 1 deletion src/shogun/classifier/LDA.h
@@ -174,6 +174,7 @@ class CLDA : public CDenseRealDispatch<CLDA, CLinearMachine>
/** @return object name */
virtual const char* get_name() const { return "LDA"; }

protected:
/** train LDA classifier
*
* @param data training data (parameter can be avoided if distance or
@@ -186,7 +187,6 @@ class CLDA : public CDenseRealDispatch<CLDA, CLinearMachine>
std::is_floating_point<ST>::value>>
bool train_machine_templated(CDenseFeatures<ST>* data);

protected:
/**
* Train the machine with the svd-based solver (@see CFisherLDA).
* @param features training data
60 changes: 60 additions & 0 deletions src/shogun/features/DenseFeatures.cpp
@@ -19,6 +19,19 @@
#include <algorithm>
#include <string.h>

#define ASSERT_FLOATING_POINT \
switch (get_feature_type()) \
{ \
case F_SHORTREAL: \
case F_DREAL: \
case F_LONGREAL: \
break; \
default: \
REQUIRE( \
false, "Only defined for %s with real type, not for %s.\n", \
get_name(), demangled_type<ST>().c_str()); \
}

namespace shogun {

template<class ST> CDenseFeatures<ST>::CDenseFeatures(int32_t size) : CDotFeatures(size)
@@ -1001,6 +1014,53 @@ template< class ST > CDenseFeatures< ST >* CDenseFeatures< ST >::obtain_from_gen
return (CDenseFeatures< ST >*) base_features;
}

template <typename ST>
SGVector<ST> CDenseFeatures<ST>::sum() const
{
// TODO optimize non batch mode, but get_feature_vector is non const :(
SGVector<ST> result = linalg::rowwise_sum(get_feature_matrix());
return result;
}

template <typename ST>
SGVector<ST> CDenseFeatures<ST>::mean() const
{
ASSERT_FLOATING_POINT

auto result = sum();
ST scale = ((ST)1.0) / get_num_vectors();
linalg::scale(result, result, scale);
return result;
}

template <typename ST>
SGMatrix<ST> CDenseFeatures<ST>::cov() const
{
// TODO optimize non batch mode, but get_feature_vector is non const :(
auto mat = get_feature_matrix();
return linalg::matrix_prod(mat, mat, false, true);
}

template <typename ST>
SGMatrix<ST> CDenseFeatures<ST>::gram() const
{
// TODO optimize non batch mode, but get_feature_vector is non const :(
auto mat = get_feature_matrix();
return linalg::matrix_prod(mat, mat, true, false);
}

template <typename ST>
SGVector<ST> CDenseFeatures<ST>::dot(const SGVector<ST>& other) const
{
REQUIRE(
get_num_vectors() == other.size(), "Number of feature vectors (%d) "
"must match provided vector's size "
"(%d).\n",
get_num_vectors(), other.size());
// TODO optimize non batch mode, but get_feature_vector is non const :(
return linalg::matrix_prod(get_feature_matrix(), other, false);
}
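For orientation, a minimal usage sketch of the new methods. This is hypothetical, not part of the commit; it assumes the SGMatrix-based constructor and manual reference counting via SG_UNREF, and all names and toy values are illustrative only:

	// Hypothetical usage sketch of the helpers added above.
	SGMatrix<float64_t> X(2, 3); // D=2 features, N=3 vectors
	X.set_const(1.0);
	auto* feats = new CDenseFeatures<float64_t>(X);
	SGVector<float64_t> s = feats->sum();  // row-wise sum over vectors, length D
	SGVector<float64_t> m = feats->mean(); // sum scaled by 1/N
	SGMatrix<float64_t> C = feats->cov();  // X X^T, D x D
	SGMatrix<float64_t> G = feats->gram(); // X^T X, N x N
	SGVector<float64_t> v(3);
	v.set_const(1.0);
	SGVector<float64_t> Xv = feats->dot(v); // X v, length D
	SG_UNREF(feats);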

template class CDenseFeatures<bool>;
template class CDenseFeatures<char>;
template class CDenseFeatures<int8_t>;
48 changes: 44 additions & 4 deletions src/shogun/features/DenseFeatures.h
@@ -11,14 +11,14 @@

#include <shogun/lib/config.h>

#include <shogun/lib/common.h>
#include <shogun/lib/Cache.h>
#include <shogun/io/File.h>
#include <shogun/features/DotFeatures.h>
#include <shogun/features/StringFeatures.h>
#include <shogun/io/File.h>
#include <shogun/lib/Cache.h>
#include <shogun/lib/DataType.h>

#include <shogun/lib/SGMatrix.h>
#include <shogun/lib/common.h>
#include <shogun/mathematics/linalg/LinalgNamespace.h>

namespace shogun {
template<class ST> class CStringFeatures;
@@ -303,6 +303,46 @@ template<class ST> class CDenseFeatures: public CDotFeatures
virtual float64_t dot(int32_t vec_idx1, CDotFeatures* df,
int32_t vec_idx2);

/** Computes the sum of all feature vectors
* @return Sum of all feature vectors
*/
SGVector<ST> sum() const;

/** Computes the empirical mean of all feature vectors
* @return Mean of all feature vectors
*/
SGVector<ST> mean() const;

/** Computes the \f$DxD\f$ (uncentered, un-normalized) covariance matrix
*
*\f[
* X X^\top
* \f]
*
* where \f$X\f$ is the \f$DxN\f$ dimensional feature matrix with \f$N\f$
* feature vectors of dimension \f$D\f$.
*/
SGMatrix<ST> cov() const;
/** Computes the \f$NxN\f$ (uncentered, un-normalized) gram matrix of
* pairwise dot products, that is
*
*\f[
* X^\top X
* \f]
*
* where \f$X\f$ is the \f$DxN\f$ dimensional feature matrix with \f$N\f$
* feature vectors of dimension \f$D\f$.
*/
SGMatrix<ST> gram() const;

/** Computes the dot product of the feature matrix with a given vector.
*
* @param other Vector to compute dot products with, size must match number
* of feature vectors
* @return Vector with as many entries as feature dimensions
*/
SGVector<ST> dot(const SGVector<ST>& other) const;

/** compute dot product between vector1 and a dense vector
*
* possible with subset
4 changes: 3 additions & 1 deletion src/shogun/machine/LinearMachine.cpp
@@ -37,7 +37,9 @@ void CLinearMachine::init()

SG_ADD(&m_w, "w", "Parameter vector w.", MS_NOT_AVAILABLE);
SG_ADD(&bias, "bias", "Bias b.", MS_NOT_AVAILABLE);
SG_ADD(&features, "features", "Feature object.", MS_NOT_AVAILABLE);
SG_ADD(
(CFeatures**)&features, "features", "Feature object.",
MS_NOT_AVAILABLE);
}


106 changes: 57 additions & 49 deletions src/shogun/regression/LinearRidgeRegression.cpp
@@ -14,13 +14,14 @@
using namespace shogun;

CLinearRidgeRegression::CLinearRidgeRegression()
: CLinearMachine()
: CDenseRealDispatch<CLinearRidgeRegression, CLinearMachine>()
{
init();
}

CLinearRidgeRegression::CLinearRidgeRegression(float64_t tau, CDenseFeatures<float64_t>* data, CLabels* lab)
: CLinearMachine()
CLinearRidgeRegression::CLinearRidgeRegression(
float64_t tau, CDenseFeatures<float64_t>* data, CLabels* lab)
: CDenseRealDispatch<CLinearRidgeRegression, CLinearMachine>()
{
init();

@@ -32,57 +33,64 @@ CLinearRidgeRegression::CLinearRidgeRegression(float64_t tau, CDenseFeatures<flo
void CLinearRidgeRegression::init()
{
set_tau(1e-6);
m_use_bias = true;

SG_ADD(&m_tau, "tau", "Regularization parameter", MS_AVAILABLE);
SG_ADD(
&m_use_bias, "use_bias", "Whether or not to fit an offset term",
MS_NOT_AVAILABLE);
}

bool CLinearRidgeRegression::train_machine(CFeatures* data)
template <typename T>
bool CLinearRidgeRegression::train_machine_templated(
const CDenseFeatures<T>* feats)
{
REQUIRE(m_labels,"No labels set\n");

if (!data)
data=features;

REQUIRE(data,"No features provided and no featured previously set\n");

REQUIRE(m_labels->get_num_labels() == data->get_num_vectors(),
"Number of training vectors (%d) does not match number of labels (%d)\n",
m_labels->get_num_labels(), data->get_num_vectors());

REQUIRE(data->get_feature_class() == C_DENSE,
"Expected Dense Features (%d) but got (%d)\n",
C_DENSE, data->get_feature_class());

REQUIRE(data->get_feature_type() == F_DREAL,
"Expected Real Features (%d) but got (%d)\n",
F_DREAL, data->get_feature_type());

CDenseFeatures<float64_t>* feats=(CDenseFeatures<float64_t>*) data;
int32_t num_feat=feats->get_num_features();

SGMatrix<float64_t> kernel_matrix(num_feat,num_feat);
SGMatrix<float64_t> feats_matrix(feats->get_feature_matrix());
SGVector<float64_t> y(num_feat);
SGVector<float64_t> tau_vector(num_feat);

tau_vector.zero();
tau_vector.add(m_tau);

linalg::matrix_prod(feats_matrix, feats_matrix, kernel_matrix, false, true);
linalg::add_diag(kernel_matrix, tau_vector);

auto labels = regression_labels(m_labels);
auto lab = regression_labels(m_labels)->get_labels();
linalg::matrix_prod(feats_matrix, lab, y);

auto decomposition = linalg::cholesky_factor(kernel_matrix);
y = linalg::cholesky_solver(decomposition, y);
set_w(y);

auto x_bar = linalg::colwise_sum(feats_matrix);
linalg::scale(x_bar, x_bar, 1.0 / ((float64_t)x_bar.size()));
auto intercept = linalg::mean(lab) - linalg::dot(lab, x_bar);
set_bias(intercept);
auto N = feats->get_num_vectors();
auto D = feats->get_num_features();

auto y = regression_labels(m_labels)->get_labels();

SGVector<T> x_bar;
T y_bar;
if (m_use_bias)
{
x_bar = feats->mean();
y_bar = linalg::mean(y);
}

SGVector<T> w;
if (N >= D)
{
SGMatrix<T> cov = feats->cov();
linalg::add_ridge(cov, m_tau);
if (m_use_bias)
linalg::rank_update(cov, x_bar, (T)-N);

auto L = linalg::cholesky_factor(cov);
auto Xy = feats->dot(y);
if (m_use_bias)
linalg::add(Xy, x_bar, Xy, 1.0, -N * y_bar);

w = linalg::cholesky_solver(L, Xy);
}
else
{
if (m_use_bias)
SG_NOTIMPLEMENTED

SGMatrix<T> gram = feats->gram();
linalg::add_ridge(gram, m_tau);
auto L = linalg::cholesky_factor(gram);
auto b = linalg::cholesky_solver(L, y);
w = feats->dot(b);
}
set_w(w);

if (m_use_bias)
{
auto intercept = y_bar - linalg::dot(w, x_bar);
set_bias(intercept);
}

return true;
}
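A short sanity check of the bias handling in the N >= D branch, using the identities X\mathbf{1}=N\bar{\mathbf{x}} and \mathbf{1}^{\top}y=N\bar{y}: the rank-one update and the shifted right-hand side above solve

    \left(XX^{\top}-N\bar{\mathbf{x}}\bar{\mathbf{x}}^{\top}+\tau I_{D}\right){\bf w}=Xy-N\bar{y}\,\bar{\mathbf{x}}

which is exactly ridge regression on centered data, since \left(X-\bar{\mathbf{x}}\mathbf{1}^{\top}\right)\left(X-\bar{\mathbf{x}}\mathbf{1}^{\top}\right)^{\top}=XX^{\top}-N\bar{\mathbf{x}}\bar{\mathbf{x}}^{\top} and \left(X-\bar{\mathbf{x}}\mathbf{1}^{\top}\right)\left(y-\bar{y}\mathbf{1}\right)=Xy-N\bar{y}\bar{\mathbf{x}}; the intercept \bar{y}-{\bf w}\cdot\bar{\mathbf{x}} then restores the offset.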
