Commit

[TMVA] Add API Support for Adagrad Optimizer.

ravikiran0606 committed Aug 7, 2018
1 parent 6e27f3c commit 1eeafac
Showing 8 changed files with 262 additions and 5 deletions.
179 changes: 179 additions & 0 deletions tmva/tmva/inc/TMVA/DNN/Adagrad.h
@@ -0,0 +1,179 @@
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
* Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
* Package: TMVA *
* Class : TAdagrad *
* Web : http://tmva.sourceforge.net *
* *
* Description: *
* Adagrad Optimizer Class *
* *
* Authors (alphabetical): *
* Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
* *
* Copyright (c) 2005-2018: *
* CERN, Switzerland *
* U. of Victoria, Canada *
* MPI-K Heidelberg, Germany *
* U. of Bonn, Germany *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted according to the terms listed in LICENSE *
* (http://tmva.sourceforge.net/LICENSE) *
**********************************************************************************/

#ifndef TMVA_DNN_ADAGRAD
#define TMVA_DNN_ADAGRAD

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

namespace TMVA {
namespace DNN {

/** \class TAdagrad
* Adagrad Optimizer class
*
* This class represents the Adagrad Optimizer.
*/
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdagrad : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
using Matrix_t = typename Architecture_t::Matrix_t;
using Scalar_t = typename Architecture_t::Scalar_t;

protected:
Scalar_t fEpsilon; ///< The smoothing term used to avoid division by zero.

std::vector<std::vector<Matrix_t>>
fPastSquaredWeightGradients; ///< The sum of the square of the past weight gradients associated with the deep net.
std::vector<std::vector<Matrix_t>>
fPastSquaredBiasGradients; ///< The sum of the square of the past bias gradients associated with the deep net.

/*! Update the weights, given the current weight gradients. */
void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

/*! Update the biases, given the current bias gradients. */
void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
/*! Constructor. */
TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate = 0.01, Scalar_t epsilon = 1e-8);

/*! Destructor. */
~TAdagrad() = default;

/*! Getters */
Scalar_t GetEpsilon() const { return fEpsilon; }

std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }
};

//
//
// The Adagrad Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdagrad<Architecture_t, Layer_t, DeepNet_t>::TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t epsilon)
: VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fEpsilon(epsilon)
{
std::vector<Layer_t *> &layers = deepNet.GetLayers();
const size_t layersNSlices = layers.size();
fPastSquaredWeightGradients.resize(layersNSlices);
fPastSquaredBiasGradients.resize(layersNSlices);

for (size_t i = 0; i < layersNSlices; i++) {
const size_t weightsNSlices = (layers[i]->GetWeights()).size();

for (size_t j = 0; j < weightsNSlices; j++) {
Matrix_t &currentWeights = layers[i]->GetWeightsAt(j);
const size_t weightsNRows = currentWeights.GetNrows();
const size_t weightsNCols = currentWeights.GetNcols();

fPastSquaredWeightGradients[i].emplace_back(weightsNRows, weightsNCols);
initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
}

const size_t biasesNSlices = (layers[i]->GetBiases()).size();

for (size_t j = 0; j < biasesNSlices; j++) {
Matrix_t &currentBiases = layers[i]->GetBiasesAt(j);
const size_t biasesNRows = currentBiases.GetNrows();
const size_t biasesNCols = currentBiases.GetNcols();

fPastSquaredBiasGradients[i].emplace_back(biasesNRows, biasesNCols);
initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
}
}
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
const std::vector<Matrix_t> &weightGradients) -> void
{
std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);

for (size_t k = 0; k < currentLayerPastSquaredWeightGradients.size(); k++) {

// Vt = Vt-1 + currentSquaredWeightGradients
Matrix_t currentSquaredWeightGradients(weightGradients[k].GetNrows(), weightGradients[k].GetNcols());
Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
Architecture_t::SquareElementWise(currentSquaredWeightGradients);
Architecture_t::ScaleAdd(currentLayerPastSquaredWeightGradients[k], currentSquaredWeightGradients, 1.0);
}

// updating the weights.
// theta = theta - learningRate * currentWeightGradients / (sqrt(Vt + epsilon))
for (size_t i = 0; i < weights.size(); i++) {
Matrix_t currentWeightUpdates(weights[i].GetNrows(), weights[i].GetNcols());
Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
Architecture_t::SqrtElementWise(currentWeightUpdates);
Architecture_t::ReciprocalElementWise(currentWeightUpdates);
Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());
}
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
const std::vector<Matrix_t> &biasGradients) -> void
{
std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);

for (size_t k = 0; k < currentLayerPastSquaredBiasGradients.size(); k++) {

// Vt = Vt-1 + currentSquaredBiasGradients
Matrix_t currentSquaredBiasGradients(biasGradients[k].GetNrows(), biasGradients[k].GetNcols());
Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
Architecture_t::SquareElementWise(currentSquaredBiasGradients);
Architecture_t::ScaleAdd(currentLayerPastSquaredBiasGradients[k], currentSquaredBiasGradients, 1.0);
}

// updating the biases.
// theta = theta - learningRate * currentBiasGradients / (sqrt(Vt + epsilon))
for (size_t i = 0; i < biases.size(); i++) {
Matrix_t currentBiasUpdates(biases[i].GetNrows(), biases[i].GetNcols());
Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
Architecture_t::SqrtElementWise(currentBiasUpdates);
Architecture_t::ReciprocalElementWise(currentBiasUpdates);
Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());
}
}

} // namespace DNN
} // namespace TMVA

#endif
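
For reference, the update rule implemented by UpdateWeights and UpdateBiases above can be written as a standalone sketch. This is a minimal illustration on plain std::vector<double> storage, not part of the TMVA API; the function name adagradStep and its default arguments are made up for this example. Note that, as in the class above, epsilon is added inside the square root.

#include <cmath>
#include <vector>

// Illustrative Adagrad step (not TMVA code): accumulate Vt = Vt-1 + g^2,
// then update theta = theta - learningRate * g / sqrt(Vt + epsilon).
void adagradStep(std::vector<double> &theta, const std::vector<double> &gradient,
                 std::vector<double> &pastSquaredGradient,
                 double learningRate = 0.01, double epsilon = 1e-8)
{
   for (size_t i = 0; i < theta.size(); ++i) {
      pastSquaredGradient[i] += gradient[i] * gradient[i];
      theta[i] -= learningRate * gradient[i] / std::sqrt(pastSquaredGradient[i] + epsilon);
   }
}
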
1 change: 1 addition & 0 deletions tmva/tmva/inc/TMVA/DNN/Functions.h
@@ -80,6 +80,7 @@ enum class EInitialization {
enum class EOptimizer {
kSGD = 0,
kAdam = 1,
kAdagrad = 2,
};

//______________________________________________________________________________
8 changes: 8 additions & 0 deletions tmva/tmva/src/MethodDL.cxx
@@ -41,6 +41,7 @@
#include "TMVA/DNN/DLMinimizers.h"
#include "TMVA/DNN/SGD.h"
#include "TMVA/DNN/Adam.h"
#include "TMVA/DNN/Adagrad.h"
#include "TStopwatch.h"

#include <chrono>
@@ -351,6 +352,8 @@ void MethodDL::ProcessOptions()
settings.optimizer = DNN::EOptimizer::kSGD;
} else if (optimizer == "ADAM") {
settings.optimizer = DNN::EOptimizer::kAdam;
} else if (optimizer == "ADAGRAD") {
settings.optimizer = DNN::EOptimizer::kAdagrad;
} else {
// Make Adam the default choice if the input string is
// incorrect.
@@ -1124,6 +1127,11 @@ void MethodDL::TrainDeepNet()
optimizer = std::unique_ptr<DNN::TAdam<Architecture_t, Layer_t, DeepNet_t>>(
new DNN::TAdam<Architecture_t, Layer_t, DeepNet_t>(deepNet, settings.learningRate));
break;

case EOptimizer::kAdagrad:
optimizer = std::unique_ptr<DNN::TAdagrad<Architecture_t, Layer_t, DeepNet_t>>(
new DNN::TAdagrad<Architecture_t, Layer_t, DeepNet_t>(deepNet, settings.learningRate));
break;
}

// Initialize the vector of batches, one batch for one slave network
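
With the option parsing above in place, the new optimizer can be requested from user code through the Optimizer key of a TrainingStrategy string, mirroring what the tests below do. A minimal booking sketch; the factory and dataloader objects, the method name DL_Adagrad, and the Layout/ErrorStrategy values are illustrative placeholders and are not taken from this commit:

// Illustrative MethodDL booking with the Adagrad optimizer (assumes an existing
// TMVA::Factory *factory and TMVA::DataLoader *dataloader).
TString trainingStrategy("LearningRate=1e-2,Optimizer=ADAGRAD,Repetitions=1,"
                         "ConvergenceSteps=20,BatchSize=256,TestRepetitions=1,"
                         "WeightDecay=1e-4,Regularization=L2,Multithreading=True");
TString options("!H:V:ErrorStrategy=CROSSENTROPY:Architecture=CPU:"
                "Layout=DENSE|64|TANH,DENSE|1|LINEAR:TrainingStrategy=" + trainingStrategy);
factory->BookMethod(dataloader, TMVA::Types::kDL, "DL_Adagrad", options);
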
5 changes: 5 additions & 0 deletions tmva/tmva/test/DNN/CMakeLists.txt
@@ -118,6 +118,11 @@ if ( (BLAS_FOUND OR mathmore) AND imt AND tmva-cpu)
LIBRARIES ${Libraries})
ROOT_ADD_TEST(TMVA-DNN-MethodDL-Adam-Optimization-Cpu COMMAND testMethodDLAdamOptimizationCpu)

# DNN - MethodDL Adagrad Optimization CPU
ROOT_EXECUTABLE(testMethodDLAdagradOptimizationCpu TestMethodDLAdagradOptimizationCpu.cxx
LIBRARIES ${Libraries})
ROOT_ADD_TEST(TMVA-DNN-MethodDL-Adagrad-Optimization-Cpu COMMAND testMethodDLAdagradOptimizationCpu)


endif ()

40 changes: 40 additions & 0 deletions tmva/tmva/test/DNN/TestMethodDLAdagradOptimizationCpu.cxx
@@ -0,0 +1,40 @@
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
* Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
* Package: TMVA *
* Class : *
* Web : http://tmva.sourceforge.net *
* *
* Description: *
* Testing MethodDL with DNN for Adagrad optimizer ( CPU backend ) *
* *
* Authors (alphabetical): *
* Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
* *
* Copyright (c) 2005-2018: *
* CERN, Switzerland *
* U. of Victoria, Canada *
* MPI-K Heidelberg, Germany *
* U. of Bonn, Germany *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted according to the terms listed in LICENSE *
* (http://tmva.sourceforge.net/LICENSE) *
**********************************************************************************/

#include "TestMethodDLOptimization.h"
#include "TString.h"

int main()
{
std::cout << "Testing Method DL with Adagrad Optimizer for CPU backend: " << std::endl;

// CPU Architecture:
TString archCPU = "CPU";

testMethodDL_DNN(archCPU, "ADAGRAD");

return 0;
}
2 changes: 1 addition & 1 deletion tmva/tmva/test/DNN/TestMethodDLOptimization.h
@@ -99,7 +99,7 @@ void testMethodDL_DNN(TString architectureStr, TString optimizerStr)
// Training strategies.
TString training0("LearningRate=1e-2,Optimizer=" + optimizerStr +
",Momentum=0.9,Repetitions=1,"
"ConvergenceSteps=20,BatchSize=256,TestRepetitions=10,"
"ConvergenceSteps=20,BatchSize=256,TestRepetitions=1,"
"WeightDecay=1e-4,Regularization=L2,"
"DropConfig=0.0+0.5+0.5+0.5,Multithreading=True");

11 changes: 8 additions & 3 deletions tmva/tmva/test/DNN/TestOptimization.h
@@ -8,7 +8,7 @@
* Web : http://tmva.sourceforge.net *
* *
* Description: *
* Testing Stochastic Batch Gradient Descent Optimizer *
* Testing Various Optimizers for training DeepNet *
* *
* Authors (alphabetical): *
* Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
@@ -42,6 +42,7 @@

#include "TMVA/DNN/SGD.h"
#include "TMVA/DNN/Adam.h"
#include "TMVA/DNN/Adagrad.h"
#include "TMVA/DNN/TensorDataLoader.h"

#include <limits>
@@ -52,9 +53,9 @@ using namespace TMVA::DNN;
using TMVA::DNN::EOptimizer;

/** Train a linear neural network on a randomly generated linear mapping
* from an 8-dimensional input space to a 1-dimensional output space.
* from a 32-dimensional input space to a 1-dimensional output space.
* Returns the error of the response of the network to the input containing
* only ones to the 1x8 matrix used to generate the training data.
* only ones to the 1x32 matrix used to generate the training data.
*/
template <typename Architecture_t>
auto testOptimization(typename Architecture_t::Scalar_t momentum, EOptimizer optimizerType, Bool_t debug) ->
@@ -151,6 +152,10 @@ auto testOptimization(typename Architecture_t::Scalar_t momentum, EOptimizer opt
optimizer = std::unique_ptr<TAdam<Architecture_t, Layer_t, DeepNet_t>>(
new TAdam<Architecture_t, Layer_t, DeepNet_t>(deepNet, 0.001));
break;
case EOptimizer::kAdagrad:
optimizer = std::unique_ptr<TAdagrad<Architecture_t, Layer_t, DeepNet_t>>(
new TAdagrad<Architecture_t, Layer_t, DeepNet_t>(deepNet, 0.01));
break;
}

// Initialize the variables related to training procedure
21 changes: 20 additions & 1 deletion tmva/tmva/test/DNN/TestOptimizationCpu.cxx
@@ -8,7 +8,7 @@
* Web : http://tmva.sourceforge.net *
* *
* Description: *
* Testing Stochastic Batch Gradient Descent Optimizer for Cpu Backend *
* Testing Various Optimizers for Cpu Backend *
* *
* Authors (alphabetical): *
* Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
@@ -61,6 +61,15 @@ int main()
return 1;
}

momentumSinglePrecision = 0.0;
std::cout << "Adagrad Optimizer: ";
// Adagrad does not use momentum; the value is passed only to match the function prototype.
error = testOptimization<TCpu<Real_t>>(momentumSinglePrecision, EOptimizer::kAdagrad, false);
std::cout << "Mean Absolute error = " << error << std::endl;
if (error > 1e-3) {
return 1;
}

std::cout << std::endl << "Testing optimization: (double precision)" << std::endl;

Double_t momentumDoublePrecision = 0.0;
@@ -89,5 +98,15 @@
return 1;
}

momentumDoublePrecision = 0.0;
std::cout << "Adagrad Optimizer: ";

// Adagrad does not use momentum; the value is passed only to match the function prototype.
error = testOptimization<TCpu<Double_t>>(momentumDoublePrecision, EOptimizer::kAdagrad, false);
std::cout << "Mean Absolute error = " << error << std::endl;
if (error > 1e-5) {
return 1;
}

return 0;
}
