Commit cac4901
optimize SGD and ADAM update calculations. For ADAM define specific update functions which are re-implemented for each architecture
moneta authored and lmoneta committed Nov 2, 2018
1 parent 68cdebc commit cac4901
Showing 9 changed files with 226 additions and 74 deletions.
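For reference, a minimal scalar sketch (illustrative only, not part of this commit) of the ADAM step that the new per-architecture AdamUpdateFirstMom / AdamUpdateSecondMom / AdamUpdate functions apply element-wise to each weight and bias matrix; the function name and the flat std::vector storage are assumptions for the example:

#include <cmath>
#include <vector>

// One ADAM step at global step t for a flat parameter vector w with gradient g,
// first moment m and second moment v (all of the same length).
void AdamStep(std::vector<double> &w, const std::vector<double> &g,
              std::vector<double> &m, std::vector<double> &v,
              double lr, double beta1, double beta2, double eps, size_t t)
{
   // alpha = learningRate * sqrt(1 - beta2^t) / (1 - beta1^t)
   const double alpha = lr * std::sqrt(1. - std::pow(beta2, t)) / (1. - std::pow(beta1, t));
   for (size_t i = 0; i < w.size(); ++i) {
      m[i] = beta1 * m[i] + (1. - beta1) * g[i];        // Mt = beta1 * Mt-1 + (1-beta1) * g
      v[i] = beta2 * v[i] + (1. - beta2) * g[i] * g[i]; // Vt = beta2 * Vt-1 + (1-beta2) * g^2
      w[i] -= alpha * m[i] / (std::sqrt(v[i]) + eps);   // theta = theta - alpha * Mt / (sqrt(Vt) + eps)
   }
}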
78 changes: 17 additions & 61 deletions tmva/tmva/inc/TMVA/DNN/Adam.h
@@ -147,41 +147,19 @@ auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex,
std::vector<Matrix_t> &currentLayerFirstMomentWeights = this->GetFirstMomentWeightsAt(layerIndex);
std::vector<Matrix_t> &currentLayerSecondMomentWeights = this->GetSecondMomentWeightsAt(layerIndex);

for (size_t k = 0; k < currentLayerFirstMomentWeights.size(); k++) {

// accumulation matrix used for temporary storing of the current accumulation
Matrix_t accumulation(currentLayerFirstMomentWeights[k].GetNrows(), currentLayerFirstMomentWeights[k].GetNcols());

// Mt = beta1 * Mt-1 + (1-beta1) * currentWeightGradients
initialize<Architecture_t>(accumulation, EInitialization::kZero);
Architecture_t::ScaleAdd(accumulation, currentLayerFirstMomentWeights[k], this->GetBeta1());
Architecture_t::ScaleAdd(accumulation, weightGradients[k], 1 - (this->GetBeta1()));
Architecture_t::Copy(currentLayerFirstMomentWeights[k], accumulation);

// Vt = beta2 * Vt-1 + (1-beta2) * currentSquaredWeightGradients
initialize<Architecture_t>(accumulation, EInitialization::kZero);
Matrix_t currentSquaredWeightGradients(weightGradients[k].GetNrows(), weightGradients[k].GetNcols());
Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
Architecture_t::SquareElementWise(currentSquaredWeightGradients);
Architecture_t::ScaleAdd(accumulation, currentLayerSecondMomentWeights[k], this->GetBeta2());
Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetBeta2()));
Architecture_t::Copy(currentLayerSecondMomentWeights[k], accumulation);
}

// alpha = learningRate * sqrt(1 - beta2^t) / (1-beta1^t)
Scalar_t alpha = (this->GetLearningRate()) * (sqrt(1 - pow(this->GetBeta2(), this->GetGlobalStep()))) /
(1 - pow(this->GetBeta1(), this->GetGlobalStep()));

// updating the weights.
// theta = theta - alpha * Mt / (sqrt(Vt) + epsilon)
/// Adam update of the first and second moments of the weights
for (size_t i = 0; i < weights.size(); i++) {
Matrix_t currentWeightUpdates(weights[i].GetNrows(), weights[i].GetNcols());
Architecture_t::Copy(currentWeightUpdates, currentLayerSecondMomentWeights[i]);
Architecture_t::SqrtElementWise(currentWeightUpdates);
Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
Architecture_t::ReciprocalElementWise(currentWeightUpdates);
Architecture_t::Hadamard(currentWeightUpdates, currentLayerFirstMomentWeights[i]);
Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -alpha);
// Mt = beta1 * Mt-1 + (1-beta1) * WeightGradients
Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentWeights[i], weightGradients[i], this->GetBeta1() );
// Vt = beta2 * Vt-1 + (1-beta2) * WeightGradients^2
Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentWeights[i], weightGradients[i], this->GetBeta2() );
// Weight = Weight - alpha * Mt / (sqrt(Vt) + epsilon)
Architecture_t::AdamUpdate(weights[i], currentLayerFirstMomentWeights[i], currentLayerSecondMomentWeights[i],
alpha, this->GetEpsilon() );
}
}

@@ -193,45 +171,23 @@ auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex,
std::vector<Matrix_t> &currentLayerFirstMomentBiases = this->GetFirstMomentBiasesAt(layerIndex);
std::vector<Matrix_t> &currentLayerSecondMomentBiases = this->GetSecondMomentBiasesAt(layerIndex);

for (size_t k = 0; k < currentLayerFirstMomentBiases.size(); k++) {

// accumulation matrix used for temporary storing of the current accumulation
Matrix_t accumulation(currentLayerFirstMomentBiases[k].GetNrows(), currentLayerFirstMomentBiases[k].GetNcols());

// Mt = beta1 * Mt-1 + (1-beta1) * currentBiasGradients
initialize<Architecture_t>(accumulation, EInitialization::kZero);
Architecture_t::ScaleAdd(accumulation, currentLayerFirstMomentBiases[k], this->GetBeta1());
Architecture_t::ScaleAdd(accumulation, biasGradients[k], 1 - (this->GetBeta1()));
Architecture_t::Copy(currentLayerFirstMomentBiases[k], accumulation);

// Vt = beta2 * Vt-1 + (1-beta2) * currentSquaredBiasGradients
initialize<Architecture_t>(accumulation, EInitialization::kZero);
Matrix_t currentSquaredBiasGradients(biasGradients[k].GetNrows(), biasGradients[k].GetNcols());
Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
Architecture_t::SquareElementWise(currentSquaredBiasGradients);
Architecture_t::ScaleAdd(accumulation, currentLayerSecondMomentBiases[k], this->GetBeta2());
Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetBeta2()));
Architecture_t::Copy(currentLayerSecondMomentBiases[k], accumulation);
}

// alpha = learningRate * sqrt(1 - beta2^t) / (1-beta1^t)
Scalar_t alpha = (this->GetLearningRate()) * (sqrt(1 - pow(this->GetBeta2(), this->GetGlobalStep()))) /
(1 - pow(this->GetBeta1(), this->GetGlobalStep()));

// updating the biases.
// theta = theta - alpha * Mt / (sqrt(Vt) + epsilon)
// updating the biases.
for (size_t i = 0; i < biases.size(); i++) {
Matrix_t currentBiasUpdates(biases[i].GetNrows(), biases[i].GetNcols());
Architecture_t::Copy(currentBiasUpdates, currentLayerSecondMomentBiases[i]);
Architecture_t::SqrtElementWise(currentBiasUpdates);
Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
Architecture_t::ReciprocalElementWise(currentBiasUpdates);
Architecture_t::Hadamard(currentBiasUpdates, currentLayerFirstMomentBiases[i]);
Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -alpha);
// Mt = beta1 * Mt-1 + (1-beta1) * BiasGradients
Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentBiases[i], biasGradients[i], this->GetBeta1() );
// Vt = beta2 * Vt-1 + (1-beta2) * BiasGradients^2
Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentBiases[i], biasGradients[i], this->GetBeta2() );
// theta = theta - alpha * Mt / (sqrt(Vt) + epsilon)
Architecture_t::AdamUpdate(biases[i], currentLayerFirstMomentBiases[i], currentLayerSecondMomentBiases[i],
alpha, this->GetEpsilon() );
}
}

} // namespace DNN
} // namespace TMVA

#endif
#endif
6 changes: 6 additions & 0 deletions tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h
@@ -518,6 +518,12 @@ class TCpu
* \p A
*/
static void SqrtElementWise(TCpuMatrix<Scalar_t> &A);

// optimizer update functions for ADAM (element-wise):
//   AdamUpdate:          A = A - alpha * M / (sqrt(V) + eps)
//   AdamUpdateFirstMom:  A = beta * A + (1 - beta) * B
//   AdamUpdateSecondMom: A = beta * A + (1 - beta) * B * B
static void AdamUpdate(TCpuMatrix<Scalar_t> & A, const TCpuMatrix<Scalar_t> & M, const TCpuMatrix<Scalar_t> & V, Scalar_t alpha, Scalar_t eps);
static void AdamUpdateFirstMom(TCpuMatrix<Scalar_t> & A, const TCpuMatrix<Scalar_t> & B, Scalar_t beta);
static void AdamUpdateSecondMom(TCpuMatrix<Scalar_t> & A, const TCpuMatrix<Scalar_t> & B, Scalar_t beta);

};

//____________________________________________________________________________
6 changes: 6 additions & 0 deletions tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h
@@ -525,6 +525,12 @@ class TCuda
* \p A
*/
static void SqrtElementWise(TCudaMatrix<AFloat> &A);

// optimizer functions
static void AdamUpdate(TCudaMatrix<AFloat> & A, const TCudaMatrix<AFloat> & M, const TCudaMatrix<AFloat> & V, AFloat alpha, AFloat eps);
static void AdamUpdateFirstMom(TCudaMatrix<AFloat> & A, const TCudaMatrix<AFloat> & B, AFloat beta);
static void AdamUpdateSecondMom(TCudaMatrix<AFloat> & A, const TCudaMatrix<AFloat> & B, AFloat beta);

};

//____________________________________________________________________________
9 changes: 9 additions & 0 deletions tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h
@@ -498,6 +498,15 @@ class TReference
*/
static void SqrtElementWise(TMatrixT<AReal> &A);

// optimizer update functions

/// Update functions for ADAM optimizer
static void AdamUpdate(TMatrixT<AReal> & A, const TMatrixT<AReal> & M, const TMatrixT<AReal> & V, AReal alpha, AReal eps);
static void AdamUpdateFirstMom(TMatrixT<AReal> & A, const TMatrixT<AReal> & B, AReal beta);
static void AdamUpdateSecondMom(TMatrixT<AReal> & A, const TMatrixT<AReal> & B, AReal beta);



//____________________________________________________________________________
//
// AutoEncoder Propagation
23 changes: 11 additions & 12 deletions tmva/tmva/inc/TMVA/DNN/SGD.h
@@ -115,21 +115,21 @@ TSGD<Architecture_t, Layer_t, DeepNet_t>::TSGD(Scalar_t learningRate, DeepNet_t
}
}



//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
const std::vector<Matrix_t> &weightGradients) -> void
{
// accumulating the current layer past weight gradients to include the current weight gradients.
// Vt = momentum * Vt-1 + currentGradients

std::vector<Matrix_t> &currentLayerPastWeightGradients = this->GetPastWeightGradientsAt(layerIndex);

for (size_t k = 0; k < currentLayerPastWeightGradients.size(); k++) {
Matrix_t accumulation(currentLayerPastWeightGradients[k].GetNrows(),
currentLayerPastWeightGradients[k].GetNcols());
initialize<Architecture_t>(accumulation, EInitialization::kZero);
Architecture_t::ScaleAdd(accumulation, currentLayerPastWeightGradients[k], this->GetMomentum());
Architecture_t::ScaleAdd(accumulation, weightGradients[k], 1.0);
Architecture_t::Copy(currentLayerPastWeightGradients[k], accumulation);
Architecture_t::ConstMult(currentLayerPastWeightGradients[k], this->GetMomentum());
Architecture_t::ScaleAdd(currentLayerPastWeightGradients[k], weightGradients[k], 1.0);
}

// updating the weights.
@@ -146,13 +146,12 @@ auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, s
{
// accumulating the current layer past bias gradients to include the current bias gradients.
// Vt = momentum * Vt-1 + currentGradients

std::vector<Matrix_t> &currentLayerPastBiasGradients = this->GetPastBiasGradientsAt(layerIndex);

for (size_t k = 0; k < currentLayerPastBiasGradients.size(); k++) {
Matrix_t accumulation(currentLayerPastBiasGradients[k].GetNrows(), currentLayerPastBiasGradients[k].GetNcols());
initialize<Architecture_t>(accumulation, EInitialization::kZero);
Architecture_t::ScaleAdd(accumulation, currentLayerPastBiasGradients[k], this->GetMomentum());
Architecture_t::ScaleAdd(accumulation, biasGradients[k], 1.0);
Architecture_t::Copy(currentLayerPastBiasGradients[k], accumulation);
Architecture_t::ConstMult(currentLayerPastBiasGradients[k], this->GetMomentum());
Architecture_t::ScaleAdd(currentLayerPastBiasGradients[k], biasGradients[k], 1.0);
}

// updating the biases
@@ -165,4 +164,4 @@ auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, s
} // namespace DNN
} // namespace TMVA

#endif
#endif
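For reference, a minimal sketch (illustrative only, not part of this commit) of the in-place momentum accumulation that the revised TSGD loops now perform with ConstMult followed by ScaleAdd, avoiding the temporary accumulation matrix; the flat-array form and the function name are assumptions for the example:

#include <vector>

// Vt = momentum * Vt-1 + currentGradients, computed in place.
void MomentumAccumulate(std::vector<double> &past, const std::vector<double> &grad, double momentum)
{
   for (size_t i = 0; i < past.size(); ++i)
      past[i] = momentum * past[i] + grad[i];
}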
41 changes: 41 additions & 0 deletions tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx
@@ -254,5 +254,46 @@ void TCpu<Real_t>::SqrtElementWise(TCpuMatrix<Real_t> &A)
A.Map(f);
}

/// Adam updates
//____________________________________________________________________________
template<typename Real_t>
void TCpu<Real_t>::AdamUpdate(TCpuMatrix<Real_t> &A, const TCpuMatrix<Real_t> & M, const TCpuMatrix<Real_t> & V, Real_t alpha, Real_t eps)
{
// ADAM update the weights.
// Weight = Weight - alpha * M / (sqrt(V) + epsilon)
Real_t * a = A.GetRawDataPointer();
const Real_t * m = M.GetRawDataPointer();
const Real_t * v = V.GetRawDataPointer();
for (size_t index = 0; index < A.GetNoElements() ; ++index) {
a[index] = a[index] - alpha * m[index]/( sqrt(v[index]) + eps);
}
}

//____________________________________________________________________________
template<typename Real_t>
void TCpu<Real_t>::AdamUpdateFirstMom(TCpuMatrix<Real_t> &A, const TCpuMatrix<Real_t> & B, Real_t beta)
{
// First momentum weight gradient update for ADAM
// Mt = beta1 * Mt-1 + (1-beta1) * WeightGradients
Real_t * a = A.GetRawDataPointer();
const Real_t * b = B.GetRawDataPointer();
for (size_t index = 0; index < A.GetNoElements() ; ++index) {
a[index] = beta * a[index] + (1.-beta) * b[index];
}
}
//____________________________________________________________________________
template<typename Real_t>
void TCpu<Real_t>::AdamUpdateSecondMom(TCpuMatrix<Real_t> &A, const TCpuMatrix<Real_t> & B, Real_t beta)
{
// Second momentum weight gradient update for ADAM
// Vt = beta2 * Vt-1 + (1-beta2) * WeightGradients^2
Real_t * a = A.GetRawDataPointer();
const Real_t * b = B.GetRawDataPointer();
for (size_t index = 0; index < A.GetNoElements() ; ++index) {
a[index] = beta * a[index] + (1.-beta) * b[index] * b[index];
}
}


} // DNN
} // TMVA
51 changes: 51 additions & 0 deletions tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
@@ -401,6 +401,57 @@ void TCuda<AFloat>::SqrtElementWise(TCudaMatrix<AFloat> &A)
(int) A.GetNcols());
}

/// Adam updates
//____________________________________________________________________________
template<typename AFloat>
void TCuda<AFloat>::AdamUpdate(TCudaMatrix<AFloat> &A, const TCudaMatrix<AFloat> & M, const TCudaMatrix<AFloat> & V, AFloat alpha, AFloat eps)
{
dim3 blockDims = TDevice::BlockDims2D();
dim3 gridDims = TDevice::GridDims2D(A);
cudaStream_t s = A.GetComputeStream();
::TMVA::DNN::Cuda::AdamUpdate<<<gridDims, blockDims, 0, s>>>(
A.GetDataPointer(),
M.GetDataPointer(),
V.GetDataPointer(),
(int) A.GetNrows(),
(int) A.GetNcols(),
alpha, eps);
}

//____________________________________________________________________________
template<typename AFloat>
void TCuda<AFloat>::AdamUpdateFirstMom(TCudaMatrix<AFloat> &A, const TCudaMatrix<AFloat> & B, AFloat beta)
{
dim3 blockDims = TDevice::BlockDims2D();
dim3 gridDims = TDevice::GridDims2D(A);
cudaStream_t s = A.GetComputeStream();
::TMVA::DNN::Cuda::AdamUpdateFirstMom<<<gridDims, blockDims, 0, s>>>(
A.GetDataPointer(),
B.GetDataPointer(),
(int) A.GetNrows(),
(int) A.GetNcols(), beta);
}

//____________________________________________________________________________
template<typename AFloat>
void TCuda<AFloat>::AdamUpdateSecondMom(TCudaMatrix<AFloat> &A, const TCudaMatrix<AFloat> & B, AFloat beta)
{
dim3 blockDims = TDevice::BlockDims2D();
dim3 gridDims = TDevice::GridDims2D(A);
cudaStream_t s = A.GetComputeStream();
::TMVA::DNN::Cuda::AdamUpdateSecondMom<<<gridDims, blockDims, 0, s>>>(
A.GetDataPointer(),
B.GetDataPointer(),
(int) A.GetNrows(),
(int) A.GetNcols(), beta);
}








} // DNN
} // TMVA
47 changes: 46 additions & 1 deletion tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh
@@ -400,10 +400,55 @@ __global__ void SqrtElementWise(AFloat * A,
}
}


/// optimizer kernel functions

//____________________________________________________________________________
template<typename AFloat>
__global__ void AdamUpdate(AFloat * A, const AFloat * M, const AFloat * V,
int m, int n, AFloat alpha, AFloat eps)
{
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int index = j * m + i;
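// column-major indexing: the element in row i, column j of the m x n matrix sits at offset j * m + i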

if ((i < m) && (j < n)) {
A[index] = A[index] - alpha * M[index]/( sqrt(V[index]) + eps);
}
}

//____________________________________________________________________________
template<typename AFloat>
__global__ void AdamUpdateFirstMom(AFloat * A, const AFloat * B,
int m, int n, AFloat beta)
{
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int index = j * m + i;

if ((i < m) && (j < n)) {
A[index] = beta * A[index] + (1.-beta) * B[index];
}
}

//____________________________________________________________________________
template<typename AFloat>
__global__ void AdamUpdateSecondMom(AFloat * A, const AFloat * B,
int m, int n, AFloat beta)
{
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int index = j * m + i;

if ((i < m) && (j < n)) {
A[index] = beta * A[index] + (1.-beta) * B[index] * B[index];
}
}

//____________________________________________________________________________
template<typename AFloat>
__global__ void IdentityDerivative(AFloat * A,
int m, int n)
int m, int n)
{
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;