Cost functions now support Stan Math; kept the previous classes for backward compatibility. #4294
@@ -0,0 +1,140 @@
/*
 * This software is distributed under BSD 3-clause license (see LICENSE file).
 *
 * Authors: Elfarouk
 */

#include <shogun/optimization/StanFirstOrderSAGCostFunction.h>
#include <shogun/base/range.h>
#include <shogun/mathematics/Math.h>

using namespace shogun;
using stan::math::var;
using std::function;
using Eigen::Matrix;
using Eigen::Dynamic;

StanFirstOrderSAGCostFunction::StanFirstOrderSAGCostFunction(
    SGMatrix<float64_t> X, SGMatrix<float64_t> y,
    StanVector* trainable_parameters,
Review comment: still don't get why we need pointers.
    StanFunctionsVector<float64_t>* cost_for_ith_point,
    FunctionReturnsStan<StanVector*>* total_cost)
{
    REQUIRE(X.size() > 0, "Empty X provided");
    REQUIRE(y.size() > 0, "Empty y provided");
    auto num_of_variables = trainable_parameters->rows();
    REQUIRE(
        num_of_variables > 0,
        "Provided %d variables in the parameters, at least 1 is required",
        num_of_variables);
    REQUIRE(cost_for_ith_point != NULL, "Cost for ith point is not provided");
    REQUIRE(total_cost != NULL, "Total cost function is not provided");
    m_X = X;
    m_y = y;
    m_trainable_parameters = trainable_parameters;
    m_cost_for_ith_point = cost_for_ith_point;
    m_total_cost = total_cost;
    m_ref_trainable_parameters = SGVector<float64_t>(num_of_variables);
    for (auto i : range(num_of_variables))
    {
        m_ref_trainable_parameters[i] = (*m_trainable_parameters)(i, 0).val();
    }
}
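For orientation, a minimal sketch of how this constructor might be driven for the least-squares cost from the class documentation. The data values, lambdas, and variable names below are hypothetical illustrations, not part of this patch:

```cpp
#include <shogun/optimization/StanFirstOrderSAGCostFunction.h>
using namespace shogun;

// Hypothetical wiring for f(w) = sum_i (y_i - w^T x_i)^2 / 2.
index_t n = 3, d = 2;
SGMatrix<float64_t> X(d, n); // one sample per column
SGMatrix<float64_t> y(1, n);
X.set_const(1.0);
y.set_const(1.0);

StanVector w(d); // Stan autodiff parameters
for (index_t k = 0; k < d; ++k)
    w(k, 0) = 0.0;

// Per-sample cost: each entry maps (parameters, sample index) -> var.
StanFunctionsVector<float64_t> cost_i(n);
for (index_t i = 0; i < n; ++i)
{
    cost_i(i, 0) = [X, y](StanVector* params, float64_t idx) {
        auto j = static_cast<index_t>(idx);
        stan::math::var wx = 0;
        for (index_t k = 0; k < X.num_rows; ++k)
            wx += (*params)(k, 0) * X(k, j);
        return (y(0, j) - wx) * (y(0, j) - wx) / 2.0;
    };
}

// Total cost: the sum of the per-sample Stan costs.
FunctionReturnsStan<StanVector*> total = [](StanVector* v) {
    stan::math::var sum = 0;
    for (index_t i = 0; i < v->rows(); ++i)
        sum += (*v)(i, 0);
    return sum;
};

// Pointers are passed, matching the constructor signature above.
StanFirstOrderSAGCostFunction cost(X, y, &w, &cost_i, &total);
```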

void StanFirstOrderSAGCostFunction::set_training_data(
    SGMatrix<float64_t> X_new, SGMatrix<float64_t> y_new)
{
    REQUIRE(X_new.size() > 0, "Empty X provided");
    REQUIRE(y_new.size() > 0, "Empty y provided");
    this->m_X = X_new;
    this->m_y = y_new;
}

StanFirstOrderSAGCostFunction::~StanFirstOrderSAGCostFunction()
{
}

void StanFirstOrderSAGCostFunction::begin_sample()
{
    m_index_of_sample = -1;
}

bool StanFirstOrderSAGCostFunction::next_sample()
{
    ++m_index_of_sample;
    return m_index_of_sample < get_sample_size();
}

void StanFirstOrderSAGCostFunction::update_stan_vectors_to_reference_values()
{
    auto num_of_variables = m_trainable_parameters->rows();
    for (auto i : range(num_of_variables))
    {
        (*m_trainable_parameters)(i, 0) = m_ref_trainable_parameters[i];
    }
}

SGVector<float64_t> StanFirstOrderSAGCostFunction::get_gradient()
{
    auto num_of_variables = m_trainable_parameters->rows();
    REQUIRE(
        num_of_variables > 0,
        "Number of variables must be greater than 0, you provided none");

    update_stan_vectors_to_reference_values();
    var f_i = (*m_cost_for_ith_point)(m_index_of_sample, 0)(
        m_trainable_parameters, m_index_of_sample);

    stan::math::set_zero_all_adjoints();
    f_i.grad();

    SGVector<float64_t>::EigenVectorXt gradients =
        m_trainable_parameters->unaryExpr(
            [](stan::math::var x) -> float64_t { return x.adj(); });
    // clone needed because gradients is a local variable
    return SGVector<float64_t>(gradients).clone();
Review comment: you could just simply wrap the EigenVectorXt with SGVector and not do the cloning... but I reckon it'd be better to simply just create an…

Reply: @vigsterkr But the data was being destroyed since the variable was local. I tried the second one, but when I returned gradients, it just contained garbage values?

Reply: if you return SGVector<float64_t>, that should trigger a copy-ctor that should ++ the ref counter and hence shouldn't delete the data itself.

Reply: there should be many examples where this is used in shogun... e.g. https://github.com/shogun-toolbox/shogun/blob/develop/src/shogun/machine/gp/GaussianLikelihood.cpp#L238
}
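To illustrate the suggestion in the thread above, a sketch of a clone-free return, assuming SGVector's copy into the return value bumps the reference count as described:

```cpp
// Sketch: fill an SGVector directly and return it by value; the copy-ctor
// increments the ref counter, so the buffer outlives this scope without
// an extra clone().
SGVector<float64_t> grad(m_trainable_parameters->rows());
for (index_t i = 0; i < grad.vlen; ++i)
    grad[i] = (*m_trainable_parameters)(i, 0).adj();
return grad;
```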

float64_t StanFirstOrderSAGCostFunction::get_cost()
{
    auto n = get_sample_size();
    StanVector cost_argument(n);

    update_stan_vectors_to_reference_values();
    for (auto i : range(n))
    {
        cost_argument(i, 0) =
            (*m_cost_for_ith_point)(i, 0)(m_trainable_parameters, i);
    }
    var cost = (*m_total_cost)(&cost_argument);
    return cost.val();
}
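Taken together, begin_sample(), next_sample(), get_gradient() and obtain_variable_reference() form the iteration protocol a stochastic minimizer drives. A hypothetical single gradient-descent pass, reusing the `cost` object from the earlier sketch, might look like:

```cpp
// Hypothetical driver: one stochastic-gradient pass over all samples.
float64_t learning_rate = 0.01;
SGVector<float64_t> w_ref = cost.obtain_variable_reference(); // shared buffer
cost.begin_sample();
while (cost.next_sample())
{
    SGVector<float64_t> g = cost.get_gradient();
    for (index_t j = 0; j < w_ref.vlen; ++j)
        w_ref[j] -= learning_rate * g[j]; // in-place update on the reference
}
// get_cost() copies the updated reference values back into the Stan
// parameters (via update_stan_vectors_to_reference_values) before evaluating.
float64_t current_cost = cost.get_cost();
```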

index_t StanFirstOrderSAGCostFunction::get_sample_size()
{
    return m_X.num_cols;
}

SGVector<float64_t> StanFirstOrderSAGCostFunction::get_average_gradient()
{
    int32_t params_num = m_trainable_parameters->rows();
Review comment: use `auto`.
    SGVector<float64_t> average_gradients(params_num);

    auto old_index_sample = m_index_of_sample;
    auto n = get_sample_size();
    REQUIRE(
        n > 0,
        "Number of samples must be greater than 0, you provided no samples");

    for (index_t i = 0; i < n; ++i)
    {
        m_index_of_sample = i;
Review comment: mmmm this feels very very strange... as this definitely makes the whole cost function not thread-safe. I'm not saying that it should be, but I'm not convinced atm that this is actually required, or the only way to solve what you want.

Reply: This is one of the things I'm discussing with you today.

Reply: This should be addressed in the next set of commits.
        average_gradients += get_gradient();
    }
    average_gradients.scale(1.0 / n);
Review comment: plz use…
    m_index_of_sample = old_index_sample;
    return average_gradients;
}
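On the thread-safety concern above: one conceivable refactor, purely hypothetical and not part of this patch, is an overload that takes the sample index explicitly instead of mutating m_index_of_sample. Note that Stan's global autodiff tape would still need care before any real concurrent use:

```cpp
// Hypothetical overload: no shared mutable sample index. The Stan autodiff
// tape itself is still global state, so this alone does not make concurrent
// calls safe.
SGVector<float64_t> StanFirstOrderSAGCostFunction::get_gradient(index_t index)
{
    update_stan_vectors_to_reference_values();
    var f_i = (*m_cost_for_ith_point)(index, 0)(m_trainable_parameters, index);
    stan::math::set_zero_all_adjoints();
    f_i.grad();
    SGVector<float64_t> grad(m_trainable_parameters->rows());
    for (index_t i = 0; i < grad.vlen; ++i)
        grad[i] = (*m_trainable_parameters)(i, 0).adj();
    return grad;
}
```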

SGVector<float64_t> StanFirstOrderSAGCostFunction::obtain_variable_reference()
{
    return m_ref_trainable_parameters;
}
@@ -0,0 +1,155 @@
/*
 * This software is distributed under BSD 3-clause license (see LICENSE file).
 *
 * Authors: Elfarouk
 */

#ifndef StanFirstOrderSAGCostFunction_H
#define StanFirstOrderSAGCostFunction_H

#include <stan/math.hpp>

#include <functional>

#include <shogun/lib/SGMatrix.h>
#include <shogun/lib/SGVector.h>
#include <shogun/lib/config.h>
#include <shogun/mathematics/eigen3.h>
#include <shogun/optimization/FirstOrderSAGCostFunction.h>

using StanVector = Eigen::Matrix<stan::math::var, Eigen::Dynamic, 1>;

template <class T>
using FunctionReturnsStan = std::function<stan::math::var(T)>;

template <class T>
using FunctionStanVectorArg = std::function<stan::math::var(StanVector*, T)>;

template <class S>
using StanFunctionsVector =
    Eigen::Matrix<FunctionStanVectorArg<S>, Eigen::Dynamic, 1>;
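To make the aliases concrete, a small hypothetical illustration of the two function types:

```cpp
// FunctionStanVectorArg<float64_t>: a callback mapping (parameters, sample
// index) to the Stan autodiff scalar for that sample's cost.
FunctionStanVectorArg<float64_t> one_cost =
    [](StanVector* params, float64_t /*i*/) -> stan::math::var {
        return (*params)(0, 0) * (*params)(0, 0); // e.g. w_0^2
    };

// StanFunctionsVector<float64_t>: an Eigen column vector of such callbacks,
// one per training sample.
StanFunctionsVector<float64_t> per_sample_costs(2);
per_sample_costs(0, 0) = one_cost;
per_sample_costs(1, 0) = one_cost;
```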
namespace shogun
{
    /** @brief The first order stochastic cost function base class for
     * implementing the SAG cost function
     *
     * The class gives the implementation used in first order stochastic
     * minimizers
     *
     * The cost function must be written as a finite sample-specific sum of
     * costs.
     * For example, the least squares cost function
     * \f[
     * f(w)=\frac{ \sum_i{ (y_i-w^T x_i)^2 } }{2}
     * \f]
     * where \f$(y_i,x_i)\f$ is the i-th sample,
     * \f$y_i\f$ is the label and \f$x_i\f$ is the features
     */
    class StanFirstOrderSAGCostFunction : public FirstOrderSAGCostFunction
Review comment: Not sure if this relationship between…

Reply: Agreed. I've changed the parent class to be FirstOrderStochasticCostFunction in the next set of commits, since all these implemented functions come directly from it. (FirstOrderSAGCostFunction also inherits from FirstOrderStochasticCostFunction, so it makes sense here to inherit from it, as the Stan version is an alternative to FirstOrderSAGCostFunction.)
    {
    public:
        StanFirstOrderSAGCostFunction(
            SGMatrix<float64_t> X, SGMatrix<float64_t> y,
            StanVector* trainable_parameters,
            StanFunctionsVector<float64_t>* cost_for_ith_point,
            FunctionReturnsStan<StanVector*>* total_cost);

        StanFirstOrderSAGCostFunction(){};

        /** Setter for the training data X and y */
        virtual void
        set_training_data(SGMatrix<float64_t> X_new, SGMatrix<float64_t> y_new);

        virtual ~StanFirstOrderSAGCostFunction();

        /** Initialize to generate a sample sequence
         */
        virtual void begin_sample();

        /** Get next sample
         *
         * @return false if the end of the sample sequence is reached
         */
        virtual bool next_sample();

        /** Get the sample gradient value wrt target variables
         *
         * WARNING
         * This method returns
         * \f$ \frac{\partial f_i(w) }{\partial w} \f$,
         * instead of
         * \f$\sum_i{ \frac{\partial f_i(w) }{\partial w} }\f$
         *
         * For the least squares cost function, that is the value of
         * \f$\frac{\partial f_i(w) }{\partial w}\f$ given \f$w\f$ is known,
         * where the index \f$i\f$ is obtained by next_sample()
         *
         * @return sample gradient of variables
         */
        virtual SGVector<float64_t> get_gradient();

        /** Get the cost given current target variables
         *
         * For least squares, that is the value of \f$f(w)\f$.
         *
         * @return cost
         */
        virtual float64_t get_cost();

        /** Get the sample size
         *
         * @return the sample size
         */
        virtual index_t get_sample_size();

        /** Get the average gradient value wrt target variables
         *
         * Note that the average gradient is the mean of the sample gradients
         * from get_gradient()
         * if samples are generated (uniformly) at random.
         *
         * WARNING
         * This method returns
         * \f$ \frac{\sum_i^n{ \frac{\partial f_i(w) }{\partial w} }}{n}\f$
         *
         * For least squares, that is the value of
         * \f$ \frac{\frac{\partial f(w) }{\partial w}}{n} \f$ given \f$w\f$ is
         * known,
         * where \f$f(w)=\frac{ \sum_i^n{ (y_i-w^T x_i)^2 } }{2}\f$
         *
         * @return average gradient of target variables
         */
        virtual SGVector<float64_t> get_average_gradient();

        virtual SGVector<float64_t> obtain_variable_reference();

        /** Updates m_trainable_parameters values to m_ref_trainable_parameters
         */
        void update_stan_vectors_to_reference_values();

    protected:
        /** X is the training data in column major matrix format */
        SGMatrix<float64_t> m_X;

        /** y is the ground truth, or the correct prediction */
        SGMatrix<float64_t> m_y;
Review comment: @iglesias In the unit test of FirstOrderSAGCostFunction, we implement an additional class called CRegressionExample that wraps FirstOrderSAGCostFunction and contains the training data. I've simply removed the necessity for that class, since it just acted as a wrapper around the cost function and data, and have included the data in the loss function, or at least a reference to the data.

Reply: We must be careful and we must not add unnecessary relationships. If it turns out that the training data is a member both in the cost function, and in the neural network, and in etc. etc., that is going to cause a lot of usage confusion. It does not sound unreasonable that the tests have a facility packaged to prepare input data and avoid code duplication (e.g. inside a class such as the CRegressionExample you are mentioning). What did you find wrong with it?

        /** trainable_parameters are the variables that are optimized for */
        StanVector* m_trainable_parameters;

        /** cost_for_ith_point is the cost contributed by each point in the
         * training data */
        StanFunctionsVector<float64_t>* m_cost_for_ith_point;

        /** total_cost is the total cost to be minimized; in this case it is a
         * sum over cost_for_ith_point */
        FunctionReturnsStan<StanVector*>* m_total_cost;

        /** Reference values for trainable_parameters so that minimizers can
         * perform in-place updates */
        SGVector<float64_t> m_ref_trainable_parameters;

        /** index_of_sample is the index of the column in X for the current
         * sample */
        index_t m_index_of_sample;
    };
}

#endif /* StanFirstOrderSAGCostFunction_H */
Review comment: As you are already touching this file, it would be nice to make a tiny extra effort and update the copyright header to the more modern, shorter version.