[TMVA] API-Support for SGD Optimizer:

* Implement Base Class VOptimizer
* Implement Class TSGD with Momentum method
* Add Unit Tests for TSGD
* Add Parsing Options for Optimizers
* Modify MethodDL to include the optimizer TSGD
* Add ROOT Style docs and clang-format the code
root-project · Jul 19, 2018 · 5bf9b81 · 5bf9b81
1 parent a7fac60
commit 5bf9b81
Show file tree

Hide file tree

Showing 9 changed files with 731 additions and 45 deletions.
diff --git a/tmva/tmva/inc/TMVA/DNN/Functions.h b/tmva/tmva/inc/TMVA/DNN/Functions.h
@@ -76,6 +76,11 @@ enum class EInitialization {
     kGlorotUniform = 'F',
 };
 
+/// Enum representing the optimizer used for training.
+/// kSGD (stochastic gradient descent) is the only optimizer defined here.
+enum class EOptimizer {
+   kSGD = 0,
+};
+
 //______________________________________________________________________________
 //
 //  Activation Functions

diff --git a/tmva/tmva/inc/TMVA/DNN/Optimizer.h b/tmva/tmva/inc/TMVA/DNN/Optimizer.h
@@ -0,0 +1,114 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Ravi Kiran S
+
+/**********************************************************************************
+ * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
+ * Package: TMVA                                                                  *
+ * Class  : VOptimizer                                                            *
+ * Web    : http://tmva.sourceforge.net                                           *
+ *                                                                                *
+ * Description:                                                                   *
+ *      General Optimizer Class                                                   *
+ *                                                                                *
+ * Authors (alphabetical):                                                        *
+ *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland         *
+ *                                                                                *
+ * Copyright (c) 2005-2018 :                                                      *
+ *      CERN, Switzerland                                                         *
+ *      U. of Victoria, Canada                                                    *
+ *      MPI-K Heidelberg, Germany                                                 *
+ *      U. of Bonn, Germany                                                       *
+ *                                                                                *
+ * Redistribution and use in source and binary forms, with or without             *
+ * modification, are permitted according to the terms listed in LICENSE           *
+ * (http://tmva.sourceforge.net/LICENSE)                                          *
+ **********************************************************************************/
+
+#ifndef TMVA_DNN_OPTIMIZER
+#define TMVA_DNN_OPTIMIZER
+
+#include "TMVA/DNN/GeneralLayer.h"
+#include "TMVA/DNN/DeepNet.h"
+
+namespace TMVA {
+namespace DNN {
+
+/** \class VOptimizer
+    Generic Optimizer class
+
+    This class represents the general class for all optimizers in the Deep Learning
+    Module. It stores the learning rate, a global step counter and a reference to
+    the deep net being trained. Concrete optimizers provide the per-layer update
+    rules by implementing UpdateWeights() and UpdateBiases(); Step() applies them
+    to every layer of the net.
+
+    Only a reference to the deep net is held, so the net must outlive the optimizer.
+ */
+template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
+          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
+class VOptimizer {
+public:
+   using Matrix_t = typename Architecture_t::Matrix_t;
+   using Scalar_t = typename Architecture_t::Scalar_t;
+
+protected:
+   Scalar_t fLearningRate; ///< The learning rate used for training.
+   size_t fGlobalStep;     ///< The current global step count during training.
+   DeepNet_t &fDeepNet;    ///< The reference to the deep net.
+
+   /*! Update the weights of the layer at \p layerIndex in place, given the current weight gradients. */
+   virtual void
+   UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients) = 0;
+
+   /*! Update the biases of the layer at \p layerIndex in place, given the current bias gradients. */
+   virtual void
+   UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients) = 0;
+
+public:
+   /*! Constructor. Stores the learning rate and binds the optimizer to \p deepNet. */
+   VOptimizer(Scalar_t learningRate, DeepNet_t &deepNet);
+
+   /*! Performs one step of optimization over all layers of the deep net. */
+   void Step();
+
+   /*! Virtual Destructor. */
+   virtual ~VOptimizer() = default;
+
+   /*! Increments the global step. Step() does not do this itself. */
+   void IncrementGlobalStep() { this->fGlobalStep++; }
+
+   /*! Getters */
+   Scalar_t GetLearningRate() const { return fLearningRate; }
+   size_t GetGlobalStep() const { return fGlobalStep; }
+   std::vector<Layer_t *> &GetLayers() { return fDeepNet.GetLayers(); }
+   Layer_t *GetLayerAt(size_t i) { return fDeepNet.GetLayerAt(i); }
+
+   /*! Setters */
+   // The parameter is a Scalar_t: an integral parameter type would truncate
+   // fractional learning rates (e.g. 0.01 -> 0) before they reach fLearningRate.
+   void SetLearningRate(Scalar_t learningRate) { fLearningRate = learningRate; }
+};
+
+//
+//
+//  The General Optimizer Class - Implementation
+//_________________________________________________________________________________________________
+/// Constructs the optimizer: stores \p learningRate, binds the optimizer to
+/// \p deepNet by reference and starts the global step counter at zero.
+template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
+VOptimizer<Architecture_t, Layer_t, DeepNet_t>::VOptimizer(Scalar_t learningRate, DeepNet_t &deepNet)
+   : fLearningRate(learningRate), fGlobalStep(0), fDeepNet(deepNet)
+{
+}
+
+// //_________________________________________________________________________________________________
+// template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
+// VOptimizer<Architecture_t, Layer_t, DeepNet_t>::~VOptimizer()
+// {
+// }
+
+//_________________________________________________________________________________________________
+/// Performs one optimization step: walks the layers of the deep net in order and,
+/// for each one, applies the weight update followed by the bias update using the
+/// gradients currently stored in that layer. The global step counter is untouched.
+template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
+void VOptimizer<Architecture_t, Layer_t, DeepNet_t>::Step()
+{
+   for (size_t layerIndex = 0; layerIndex < this->GetLayers().size(); ++layerIndex) {
+      Layer_t *layer = this->GetLayerAt(layerIndex);
+      this->UpdateWeights(layerIndex, layer->GetWeights(), layer->GetWeightGradients());
+      this->UpdateBiases(layerIndex, layer->GetBiases(), layer->GetBiasGradients());
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/DNN/SGD.h b/tmva/tmva/inc/TMVA/DNN/SGD.h
@@ -0,0 +1,174 @@
+// @(#)root/tmva/tmva/dnn:$Id$
+// Author: Ravi Kiran S
+
+/**********************************************************************************
+ * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
+ * Package: TMVA                                                                  *
+ * Class  : TSGD                                                                  *
+ * Web    : http://tmva.sourceforge.net                                           *
+ *                                                                                *
+ * Description:                                                                   *
+ *      Stochastic Batch Gradient Descent Optimizer Class                         *
+ *                                                                                *
+ * Authors (alphabetical):                                                        *
+ *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland         *
+ *                                                                                *
+ * Copyright (c) 2005-2018:                                                       *
+ *      CERN, Switzerland                                                         *
+ *      U. of Victoria, Canada                                                    *
+ *      MPI-K Heidelberg, Germany                                                 *
+ *      U. of Bonn, Germany                                                       *
+ *                                                                                *
+ * Redistribution and use in source and binary forms, with or without             *
+ * modification, are permitted according to the terms listed in LICENSE           *
+ * (http://tmva.sourceforge.net/LICENSE)                                          *
+ **********************************************************************************/
+
+#ifndef TMVA_DNN_SGD
+#define TMVA_DNN_SGD
+
+#include "TMatrix.h"
+#include "TMVA/DNN/Optimizer.h"
+#include "TMVA/DNN/Functions.h"
+
+namespace TMVA {
+namespace DNN {
+
+/** \class TSGD
+    Stochastic Batch Gradient Descent Optimizer class
+
+    This class represents the Stochastic Batch Gradient Descent Optimizer with
+    classical momentum. For every weight and bias matrix of the deep net it keeps
+    an accumulated gradient V and, on each update, applies
+
+       V     = momentum * V + currentGradients
+       theta = theta - learningRate * V
+
+    With a momentum of zero this reduces to plain gradient descent. Nesterov
+    momentum is not implemented by this class.
+ */
+template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
+          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
+class TSGD : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
+public:
+   using Matrix_t = typename Architecture_t::Matrix_t;
+   using Scalar_t = typename Architecture_t::Scalar_t;
+
+protected:
+   Scalar_t fMomentum; ///< The momentum used for training.
+   std::vector<std::vector<Matrix_t>>
+      fPastWeightGradients; ///< The sum of the past weight gradients associated with the deep net.
+   std::vector<std::vector<Matrix_t>>
+      fPastBiasGradients; ///< The sum of the past bias gradients associated with the deep net.
+
+   /*! Update the weights, given the current weight gradients. */
+   // `override` makes the compiler reject any drift from the base-class signature.
+   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
+                      const std::vector<Matrix_t> &weightGradients) override;
+
+   /*! Update the biases, given the current bias gradients. */
+   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
+                     const std::vector<Matrix_t> &biasGradients) override;
+
+public:
+   /*! Constructor. Allocates one zero-initialized accumulator per weight and bias matrix of \p deepNet. */
+   TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum);
+
+   /*! Destructor. */
+   ~TSGD() = default;
+
+   /*! Getters */
+   Scalar_t GetMomentum() const { return fMomentum; }
+
+   /*! Accumulated weight gradients, indexed by layer and then by weight matrix. */
+   std::vector<std::vector<Matrix_t>> &GetPastWeightGradients() { return fPastWeightGradients; }
+   std::vector<Matrix_t> &GetPastWeightGradientsAt(size_t i) { return fPastWeightGradients[i]; }
+
+   /*! Accumulated bias gradients, indexed by layer and then by bias matrix. */
+   std::vector<std::vector<Matrix_t>> &GetPastBiasGradients() { return fPastBiasGradients; }
+   std::vector<Matrix_t> &GetPastBiasGradientsAt(size_t i) { return fPastBiasGradients[i]; }
+};
+
+//
+//
+//  The Stochastic Gradient Descent Optimizer Class - Implementation
+//_________________________________________________________________________________________________
+/// Constructs the SGD optimizer. For every layer of \p deepNet, one accumulator
+/// matrix is created per weight matrix and per bias matrix, with the same number
+/// of rows and columns as its counterpart and initialized with EInitialization::kZero.
+template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
+TSGD<Architecture_t, Layer_t, DeepNet_t>::TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum)
+   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum)
+{
+   std::vector<Layer_t *> &netLayers = deepNet.GetLayers();
+   const size_t nLayers = netLayers.size();
+
+   // One (initially empty) list of accumulators per layer.
+   fPastWeightGradients.resize(nLayers);
+   fPastBiasGradients.resize(nLayers);
+
+   for (size_t layerIdx = 0; layerIdx < nLayers; ++layerIdx) {
+      Layer_t *layer = netLayers[layerIdx];
+
+      // Accumulators mirroring the shapes of this layer's weight matrices.
+      std::vector<Matrix_t> &weightAccumulators = fPastWeightGradients[layerIdx];
+      const size_t nWeightMatrices = (layer->GetWeights()).size();
+      for (size_t w = 0; w < nWeightMatrices; ++w) {
+         Matrix_t &layerWeights = layer->GetWeightsAt(w);
+         const size_t nRows = layerWeights.GetNrows();
+         const size_t nCols = layerWeights.GetNcols();
+
+         weightAccumulators.emplace_back(nRows, nCols);
+         initialize<Architecture_t>(weightAccumulators.back(), EInitialization::kZero);
+      }
+
+      // Accumulators mirroring the shapes of this layer's bias matrices.
+      std::vector<Matrix_t> &biasAccumulators = fPastBiasGradients[layerIdx];
+      const size_t nBiasMatrices = (layer->GetBiases()).size();
+      for (size_t b = 0; b < nBiasMatrices; ++b) {
+         Matrix_t &layerBiases = layer->GetBiasesAt(b);
+         const size_t nRows = layerBiases.GetNrows();
+         const size_t nCols = layerBiases.GetNcols();
+
+         biasAccumulators.emplace_back(nRows, nCols);
+         initialize<Architecture_t>(biasAccumulators.back(), EInitialization::kZero);
+      }
+   }
+}
+
+// //_________________________________________________________________________________________________
+// template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
+// TSGD<Architecture_t, Layer_t, DeepNet_t>::~TSGD()
+// {
+// }
+
+//_________________________________________________________________________________________________
+/// Applies the momentum update to the weights of the layer at \p layerIndex.
+/// First every accumulator of that layer is refreshed as
+///    Vt = momentum * Vt-1 + currentGradients
+/// and then every weight matrix is moved as
+///    theta = theta - learningRate * Vt
+template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
+void TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
+                                                             const std::vector<Matrix_t> &weightGradients)
+{
+   std::vector<Matrix_t> &accumulators = this->GetPastWeightGradientsAt(layerIndex);
+   const Scalar_t momentum = this->GetMomentum();
+
+   // Refresh the accumulators. The new value is assembled in a zeroed scratch
+   // matrix and then copied back over the stored accumulator.
+   for (size_t idx = 0; idx < accumulators.size(); ++idx) {
+      Matrix_t &accumulator = accumulators[idx];
+      Matrix_t scratch(accumulator.GetNrows(), accumulator.GetNcols());
+      initialize<Architecture_t>(scratch, EInitialization::kZero);
+      Architecture_t::ScaleAdd(scratch, accumulator, momentum);
+      Architecture_t::ScaleAdd(scratch, weightGradients[idx], 1.0);
+      Architecture_t::Copy(accumulator, scratch);
+   }
+
+   // Move the weights against the accumulated gradients.
+   const Scalar_t negativeRate = -this->GetLearningRate();
+   for (size_t idx = 0; idx < weights.size(); ++idx) {
+      Architecture_t::ScaleAdd(weights[idx], accumulators[idx], negativeRate);
+   }
+}
+
+//_________________________________________________________________________________________________
+/// Applies the momentum update to the biases of the layer at \p layerIndex.
+/// First every accumulator of that layer is refreshed as
+///    Vt = momentum * Vt-1 + currentGradients
+/// and then every bias matrix is moved as
+///    theta = theta - learningRate * Vt
+template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
+void TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
+                                                            const std::vector<Matrix_t> &biasGradients)
+{
+   std::vector<Matrix_t> &accumulators = this->GetPastBiasGradientsAt(layerIndex);
+   const Scalar_t momentum = this->GetMomentum();
+
+   // Refresh the accumulators. The new value is assembled in a zeroed scratch
+   // matrix and then copied back over the stored accumulator.
+   for (size_t idx = 0; idx < accumulators.size(); ++idx) {
+      Matrix_t &accumulator = accumulators[idx];
+      Matrix_t scratch(accumulator.GetNrows(), accumulator.GetNcols());
+      initialize<Architecture_t>(scratch, EInitialization::kZero);
+      Architecture_t::ScaleAdd(scratch, accumulator, momentum);
+      Architecture_t::ScaleAdd(scratch, biasGradients[idx], 1.0);
+      Architecture_t::Copy(accumulator, scratch);
+   }
+
+   // Move the biases against the accumulated gradients.
+   const Scalar_t negativeRate = -this->GetLearningRate();
+   for (size_t idx = 0; idx < biases.size(); ++idx) {
+      Architecture_t::ScaleAdd(biases[idx], accumulators[idx], negativeRate);
+   }
+}
+
+} // namespace DNN
+} // namespace TMVA
+
+#endif
diff --git a/tmva/tmva/inc/TMVA/MethodDL.h b/tmva/tmva/inc/TMVA/MethodDL.h
@@ -65,6 +65,7 @@ struct TTrainingSettings {
    size_t convergenceSteps;
    size_t maxEpochs; 
    DNN::ERegularization regularization;
+   DNN::EOptimizer optimizer;
    Double_t learningRate;
    Double_t momentum;
    Double_t weightDecay;