Commit

[TMVA] Add API Support for Adagrad Optimizer.

ravikiran0606 committed Aug 7, 2018
1 parent 6e27f3c commit 1eeafac
Showing 8 changed files with 262 additions and 5 deletions.
179 changes: 179 additions & 0 deletions tmva/tmva/inc/TMVA/DNN/Adagrad.h
@@ -0,0 +1,179 @@
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
* Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
* Package: TMVA *
* Class : TAdagrad *
* Web : http://tmva.sourceforge.net *
* *
* Description: *
* Adagrad Optimizer Class *
* *
* Authors (alphabetical): *
* Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
* *
* Copyright (c) 2005-2018: *
* CERN, Switzerland *
* U. of Victoria, Canada *
* MPI-K Heidelberg, Germany *
* U. of Bonn, Germany *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted according to the terms listed in LICENSE *
* (http://tmva.sourceforge.net/LICENSE) *
**********************************************************************************/

#ifndef TMVA_DNN_ADAGRAD
#define TMVA_DNN_ADAGRAD

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

namespace TMVA {
namespace DNN {

/** \class TAdagrad
* Adagrad Optimizer class
*
* This class represents the Adagrad Optimizer.
*/
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdagrad : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
using Matrix_t = typename Architecture_t::Matrix_t;
using Scalar_t = typename Architecture_t::Scalar_t;

protected:
Scalar_t fEpsilon; ///< The smoothing term used to avoid division by zero.

std::vector<std::vector<Matrix_t>>
fPastSquaredWeightGradients; ///< The sum of the square of the past weight gradients associated with the deep net.
std::vector<std::vector<Matrix_t>>
fPastSquaredBiasGradients; ///< The sum of the square of the past bias gradients associated with the deep net.

/*! Update the weights, given the current weight gradients. */
void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

/*! Update the biases, given the current bias gradients. */
void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
/*! Constructor. */
TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate = 0.01, Scalar_t epsilon = 1e-8);

/*! Destructor. */
~TAdagrad() = default;

/*! Getters */
Scalar_t GetEpsilon() const { return fEpsilon; }

std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }
};

//
//
// The Adagrad Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdagrad<Architecture_t, Layer_t, DeepNet_t>::TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t epsilon)
: VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fEpsilon(epsilon)
{
std::vector<Layer_t *> &layers = deepNet.GetLayers();
const size_t layersNSlices = layers.size();
fPastSquaredWeightGradients.resize(layersNSlices);
fPastSquaredBiasGradients.resize(layersNSlices);

for (size_t i = 0; i < layersNSlices; i++) {
const size_t weightsNSlices = (layers[i]->GetWeights()).size();

for (size_t j = 0; j < weightsNSlices; j++) {
Matrix_t &currentWeights = layers[i]->GetWeightsAt(j);
const size_t weightsNRows = currentWeights.GetNrows();
const size_t weightsNCols = currentWeights.GetNcols();

fPastSquaredWeightGradients[i].emplace_back(weightsNRows, weightsNCols);
initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
}

const size_t biasesNSlices = (layers[i]->GetBiases()).size();

for (size_t j = 0; j < biasesNSlices; j++) {
Matrix_t &currentBiases = layers[i]->GetBiasesAt(j);
const size_t biasesNRows = currentBiases.GetNrows();
const size_t biasesNCols = currentBiases.GetNcols();

fPastSquaredBiasGradients[i].emplace_back(biasesNRows, biasesNCols);
initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
}
}
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
const std::vector<Matrix_t> &weightGradients) -> void
{
std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);

for (size_t k = 0; k < currentLayerPastSquaredWeightGradients.size(); k++) {

// Vt = Vt-1 + currentSquaredWeightGradients
Matrix_t currentSquaredWeightGradients(weightGradients[k].GetNrows(), weightGradients[k].GetNcols());
Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
Architecture_t::SquareElementWise(currentSquaredWeightGradients);
Architecture_t::ScaleAdd(currentLayerPastSquaredWeightGradients[k], currentSquaredWeightGradients, 1.0);
}

// updating the weights.
// theta = theta - learningRate * currentWeightGradients / (sqrt(Vt + epsilon))
for (size_t i = 0; i < weights.size(); i++) {
Matrix_t currentWeightUpdates(weights[i].GetNrows(), weights[i].GetNcols());
Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
Architecture_t::SqrtElementWise(currentWeightUpdates);
Architecture_t::ReciprocalElementWise(currentWeightUpdates);
Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());
}
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
const std::vector<Matrix_t> &biasGradients) -> void
{
std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);

for (size_t k = 0; k < currentLayerPastSquaredBiasGradients.size(); k++) {

// Vt = Vt-1 + currentSquaredBiasGradients
Matrix_t currentSquaredBiasGradients(biasGradients[k].GetNrows(), biasGradients[k].GetNcols());
Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
Architecture_t::SquareElementWise(currentSquaredBiasGradients);
Architecture_t::ScaleAdd(currentLayerPastSquaredBiasGradients[k], currentSquaredBiasGradients, 1.0);
}

// updating the biases.
// theta = theta - learningRate * currentBiasGradients / (sqrt(Vt + epsilon))
for (size_t i = 0; i < biases.size(); i++) {
Matrix_t currentBiasUpdates(biases[i].GetNrows(), biases[i].GetNcols());
Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
Architecture_t::SqrtElementWise(currentBiasUpdates);
Architecture_t::ReciprocalElementWise(currentBiasUpdates);
Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());
}
}

} // namespace DNN
} // namespace TMVA

#endif
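
For reference, the update rule implemented by UpdateWeights and UpdateBiases above can be written as a standalone sketch. This is a minimal illustration on plain std::vector<double> storage, not part of the TMVA API; the function name adagradStep and its default arguments are made up for this example. Note that, as in the class above, epsilon is added inside the square root.

#include <cmath>
#include <vector>

// Illustrative Adagrad step (not TMVA code): accumulate Vt = Vt-1 + g^2,
// then update theta = theta - learningRate * g / sqrt(Vt + epsilon).
void adagradStep(std::vector<double> &theta, const std::vector<double> &gradient,
                 std::vector<double> &pastSquaredGradient,
                 double learningRate = 0.01, double epsilon = 1e-8)
{
   for (size_t i = 0; i < theta.size(); ++i) {
      pastSquaredGradient[i] += gradient[i] * gradient[i];
      theta[i] -= learningRate * gradient[i] / std::sqrt(pastSquaredGradient[i] + epsilon);
   }
}
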
1 change: 1 addition & 0 deletions tmva/tmva/inc/TMVA/DNN/Functions.h
@@ -80,6 +80,7 @@ enum class EInitialization {
enum class EOptimizer {
kSGD = 0,
kAdam = 1,
kAdagrad = 2,
};

//______________________________________________________________________________
8 changes: 8 additions & 0 deletions tmva/tmva/src/MethodDL.cxx
@@ -41,6 +41,7 @@
#include "TMVA/DNN/DLMinimizers.h"
#include "TMVA/DNN/SGD.h"
#include "TMVA/DNN/Adam.h"
#include "TMVA/DNN/Adagrad.h"
#include "TStopwatch.h"

#include <chrono>
@@ -351,6 +352,8 @@ void MethodDL::ProcessOptions()
settings.optimizer = DNN::EOptimizer::kSGD;
} else if (optimizer == "ADAM") {
settings.optimizer = DNN::EOptimizer::kAdam;
} else if (optimizer == "ADAGRAD") {
settings.optimizer = DNN::EOptimizer::kAdagrad;
} else {
// Make Adam the default choice if the input string is
// incorrect.
@@ -1124,6 +1127,11 @@ void MethodDL::TrainDeepNet()
optimizer = std::unique_ptr<DNN::TAdam<Architecture_t, Layer_t, DeepNet_t>>(
new DNN::TAdam<Architecture_t, Layer_t, DeepNet_t>(deepNet, settings.learningRate));
break;

case EOptimizer::kAdagrad:
optimizer = std::unique_ptr<DNN::TAdagrad<Architecture_t, Layer_t, DeepNet_t>>(
new DNN::TAdagrad<Architecture_t, Layer_t, DeepNet_t>(deepNet, settings.learningRate));
break;
}

// Initialize the vector of batches, one batch for one slave network
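
With the option parsing above in place, the new optimizer can be requested from user code through the Optimizer key of a TrainingStrategy string, mirroring what the tests below do. A minimal booking sketch; the factory and dataloader objects, the method name DL_Adagrad, and the Layout/ErrorStrategy values are illustrative placeholders and are not taken from this commit:

// Illustrative MethodDL booking with the Adagrad optimizer (assumes an existing
// TMVA::Factory *factory and TMVA::DataLoader *dataloader).
TString trainingStrategy("LearningRate=1e-2,Optimizer=ADAGRAD,Repetitions=1,"
                         "ConvergenceSteps=20,BatchSize=256,TestRepetitions=1,"
                         "WeightDecay=1e-4,Regularization=L2,Multithreading=True");
TString options("!H:V:ErrorStrategy=CROSSENTROPY:Architecture=CPU:"
                "Layout=DENSE|64|TANH,DENSE|1|LINEAR:TrainingStrategy=" + trainingStrategy);
factory->BookMethod(dataloader, TMVA::Types::kDL, "DL_Adagrad", options);
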
5 changes: 5 additions & 0 deletions tmva/tmva/test/DNN/CMakeLists.txt
@@ -118,6 +118,11 @@ if ( (BLAS_FOUND OR mathmore) AND imt AND tmva-cpu)
LIBRARIES ${Libraries})
ROOT_ADD_TEST(TMVA-DNN-MethodDL-Adam-Optimization-Cpu COMMAND testMethodDLAdamOptimizationCpu)

# DNN - MethodDL Adagrad Optimization CPU
ROOT_EXECUTABLE(testMethodDLAdagradOptimizationCpu TestMethodDLAdagradOptimizationCpu.cxx
LIBRARIES ${Libraries})
ROOT_ADD_TEST(TMVA-DNN-MethodDL-Adagrad-Optimization-Cpu COMMAND testMethodDLAdagradOptimizationCpu)


endif ()

40 changes: 40 additions & 0 deletions tmva/tmva/test/DNN/TestMethodDLAdagradOptimizationCpu.cxx
@@ -0,0 +1,40 @@
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
* Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
* Package: TMVA *
* Class : *
* Web : http://tmva.sourceforge.net *
* *
* Description: *
* Testing MethodDL with DNN for Adagrad optimizer ( CPU backend ) *
* *
* Authors (alphabetical): *
* Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
* *
* Copyright (c) 2005-2018: *
* CERN, Switzerland *
* U. of Victoria, Canada *
* MPI-K Heidelberg, Germany *
* U. of Bonn, Germany *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted according to the terms listed in LICENSE *
* (http://tmva.sourceforge.net/LICENSE) *
**********************************************************************************/

#include "TestMethodDLOptimization.h"
#include "TString.h"

int main()
{
std::cout << "Testing Method DL with Adagrad Optimizer for CPU backend: " << std::endl;

// CPU Architecture:
TString archCPU = "CPU";

testMethodDL_DNN(archCPU, "ADAGRAD");

return 0;
}
2 changes: 1 addition & 1 deletion tmva/tmva/test/DNN/TestMethodDLOptimization.h
@@ -99,7 +99,7 @@ void testMethodDL_DNN(TString architectureStr, TString optimizerStr)
// Training strategies.
TString training0("LearningRate=1e-2,Optimizer=" + optimizerStr +
",Momentum=0.9,Repetitions=1,"
"ConvergenceSteps=20,BatchSize=256,TestRepetitions=10,"
"ConvergenceSteps=20,BatchSize=256,TestRepetitions=1,"
"WeightDecay=1e-4,Regularization=L2,"
"DropConfig=0.0+0.5+0.5+0.5,Multithreading=True");

11 changes: 8 additions & 3 deletions tmva/tmva/test/DNN/TestOptimization.h
@@ -8,7 +8,7 @@
* Web : http://tmva.sourceforge.net *
* *
* Description: *
* Testing Stochastic Batch Gradient Descent Optimizer *
* Testing Various Optimizers for training DeepNet *
* *
* Authors (alphabetical): *
* Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
@@ -42,6 +42,7 @@

#include "TMVA/DNN/SGD.h"
#include "TMVA/DNN/Adam.h"
#include "TMVA/DNN/Adagrad.h"
#include "TMVA/DNN/TensorDataLoader.h"

#include <limits>
@@ -52,9 +53,9 @@ using namespace TMVA::DNN;
using TMVA::DNN::EOptimizer;

/** Train a linear neural network on a randomly generated linear mapping
* from an 8-dimensional input space to a 1-dimensional output space.
* from a 32-dimensional input space to a 1-dimensional output space.
* Returns the error of the response of the network to the input containing
* only ones to the 1x8 matrix used to generate the training data.
* only ones to the 1x32 matrix used to generate the training data.
*/
template <typename Architecture_t>
auto testOptimization(typename Architecture_t::Scalar_t momentum, EOptimizer optimizerType, Bool_t debug) ->
@@ -151,6 +152,10 @@ auto testOptimization(typename Architecture_t::Scalar_t momentum, EOptimizer opt
optimizer = std::unique_ptr<TAdam<Architecture_t, Layer_t, DeepNet_t>>(
new TAdam<Architecture_t, Layer_t, DeepNet_t>(deepNet, 0.001));
break;
case EOptimizer::kAdagrad:
optimizer = std::unique_ptr<TAdagrad<Architecture_t, Layer_t, DeepNet_t>>(
new TAdagrad<Architecture_t, Layer_t, DeepNet_t>(deepNet, 0.01));
break;
}

// Initialize the variables related to training procedure
21 changes: 20 additions & 1 deletion tmva/tmva/test/DNN/TestOptimizationCpu.cxx
@@ -8,7 +8,7 @@
* Web : http://tmva.sourceforge.net *
* *
* Description: *
* Testing Stochastic Batch Gradient Descent Optimizer for Cpu Backend *
* Testing Various Optimizers for Cpu Backend *
* *
* Authors (alphabetical): *
* Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
@@ -61,6 +61,15 @@ int main()
return 1;
}

momentumSinglePrecision = 0.0;
std::cout << "Adagrad Optimizer: ";
// Adagrad does not use momentum; the value is passed only to match the function prototype.
error = testOptimization<TCpu<Real_t>>(momentumSinglePrecision, EOptimizer::kAdagrad, false);
std::cout << "Mean Absolute error = " << error << std::endl;
if (error > 1e-3) {
return 1;
}

std::cout << std::endl << "Testing optimization: (double precision)" << std::endl;

Double_t momentumDoublePrecision = 0.0;
@@ -89,5 +98,15 @@
return 1;
}

momentumDoublePrecision = 0.0;
std::cout << "Adagrad Optimizer: ";

// Adagrad does not use momentum; the value is passed only to match the function prototype.
error = testOptimization<TCpu<Double_t>>(momentumDoublePrecision, EOptimizer::kAdagrad, false);
std::cout << "Mean Absolute error = " << error << std::endl;
if (error > 1e-5) {
return 1;
}

return 0;
}
