Commit cac4901
optimize SGD and ADAM update calculations. For ADAM define specific update functions which are re-implemented for each architecture
moneta authored and lmoneta committed Nov 2, 2018
1 parent 68cdebc commit cac4901
Showing 9 changed files with 226 additions and 74 deletions.
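For reference, a minimal scalar sketch (illustrative only, not part of this commit) of the ADAM step that the new per-architecture AdamUpdateFirstMom / AdamUpdateSecondMom / AdamUpdate functions apply element-wise to each weight and bias matrix; the function name and the flat std::vector storage are assumptions for the example:

#include <cmath>
#include <vector>

// One ADAM step at global step t for a flat parameter vector w with gradient g,
// first moment m and second moment v (all of the same length).
void AdamStep(std::vector<double> &w, const std::vector<double> &g,
              std::vector<double> &m, std::vector<double> &v,
              double lr, double beta1, double beta2, double eps, size_t t)
{
   // alpha = learningRate * sqrt(1 - beta2^t) / (1 - beta1^t)
   const double alpha = lr * std::sqrt(1. - std::pow(beta2, t)) / (1. - std::pow(beta1, t));
   for (size_t i = 0; i < w.size(); ++i) {
      m[i] = beta1 * m[i] + (1. - beta1) * g[i];        // Mt = beta1 * Mt-1 + (1-beta1) * g
      v[i] = beta2 * v[i] + (1. - beta2) * g[i] * g[i]; // Vt = beta2 * Vt-1 + (1-beta2) * g^2
      w[i] -= alpha * m[i] / (std::sqrt(v[i]) + eps);   // theta = theta - alpha * Mt / (sqrt(Vt) + eps)
   }
}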
78 changes: 17 additions & 61 deletions tmva/tmva/inc/TMVA/DNN/Adam.h
@@ -147,41 +147,19 @@ auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex,
std::vector<Matrix_t> &currentLayerFirstMomentWeights = this->GetFirstMomentWeightsAt(layerIndex);
std::vector<Matrix_t> &currentLayerSecondMomentWeights = this->GetSecondMomentWeightsAt(layerIndex);

for (size_t k = 0; k < currentLayerFirstMomentWeights.size(); k++) {

// accumulation matrix used for temporary storing of the current accumulation
Matrix_t accumulation(currentLayerFirstMomentWeights[k].GetNrows(), currentLayerFirstMomentWeights[k].GetNcols());

// Mt = beta1 * Mt-1 + (1-beta1) * currentWeightGradients
initialize<Architecture_t>(accumulation, EInitialization::kZero);
Architecture_t::ScaleAdd(accumulation, currentLayerFirstMomentWeights[k], this->GetBeta1());
Architecture_t::ScaleAdd(accumulation, weightGradients[k], 1 - (this->GetBeta1()));
Architecture_t::Copy(currentLayerFirstMomentWeights[k], accumulation);

// Vt = beta2 * Vt-1 + (1-beta2) * currentSquaredWeightGradients
initialize<Architecture_t>(accumulation, EInitialization::kZero);
Matrix_t currentSquaredWeightGradients(weightGradients[k].GetNrows(), weightGradients[k].GetNcols());
Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
Architecture_t::SquareElementWise(currentSquaredWeightGradients);
Architecture_t::ScaleAdd(accumulation, currentLayerSecondMomentWeights[k], this->GetBeta2());
Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetBeta2()));
Architecture_t::Copy(currentLayerSecondMomentWeights[k], accumulation);
}

// alpha = learningRate * sqrt(1 - beta2^t) / (1-beta1^t)
Scalar_t alpha = (this->GetLearningRate()) * (sqrt(1 - pow(this->GetBeta2(), this->GetGlobalStep()))) /
(1 - pow(this->GetBeta1(), this->GetGlobalStep()));

// updating the weights.
// theta = theta - alpha * Mt / (sqrt(Vt) + epsilon)
/// Adam update of the first and second moments of the weights
for (size_t i = 0; i < weights.size(); i++) {
Matrix_t currentWeightUpdates(weights[i].GetNrows(), weights[i].GetNcols());
Architecture_t::Copy(currentWeightUpdates, currentLayerSecondMomentWeights[i]);
Architecture_t::SqrtElementWise(currentWeightUpdates);
Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
Architecture_t::ReciprocalElementWise(currentWeightUpdates);
Architecture_t::Hadamard(currentWeightUpdates, currentLayerFirstMomentWeights[i]);
Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -alpha);
// Mt = beta1 * Mt-1 + (1-beta1) * WeightGradients
Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentWeights[i], weightGradients[i], this->GetBeta1() );
// Vt = beta2 * Vt-1 + (1-beta2) * WeightGradients^2
Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentWeights[i], weightGradients[i], this->GetBeta2() );
// Weight = Weight - alpha * Mt / (sqrt(Vt) + epsilon)
Architecture_t::AdamUpdate(weights[i], currentLayerFirstMomentWeights[i], currentLayerSecondMomentWeights[i],
alpha, this->GetEpsilon() );
}
}

@@ -193,45 +171,23 @@ auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex,
std::vector<Matrix_t> &currentLayerFirstMomentBiases = this->GetFirstMomentBiasesAt(layerIndex);
std::vector<Matrix_t> &currentLayerSecondMomentBiases = this->GetSecondMomentBiasesAt(layerIndex);

for (size_t k = 0; k < currentLayerFirstMomentBiases.size(); k++) {

// accumulation matrix used for temporary storing of the current accumulation
Matrix_t accumulation(currentLayerFirstMomentBiases[k].GetNrows(), currentLayerFirstMomentBiases[k].GetNcols());

// Mt = beta1 * Mt-1 + (1-beta1) * currentBiasGradients
initialize<Architecture_t>(accumulation, EInitialization::kZero);
Architecture_t::ScaleAdd(accumulation, currentLayerFirstMomentBiases[k], this->GetBeta1());
Architecture_t::ScaleAdd(accumulation, biasGradients[k], 1 - (this->GetBeta1()));
Architecture_t::Copy(currentLayerFirstMomentBiases[k], accumulation);

// Vt = beta2 * Vt-1 + (1-beta2) * currentSquaredBiasGradients
initialize<Architecture_t>(accumulation, EInitialization::kZero);
Matrix_t currentSquaredBiasGradients(biasGradients[k].GetNrows(), biasGradients[k].GetNcols());
Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
Architecture_t::SquareElementWise(currentSquaredBiasGradients);
Architecture_t::ScaleAdd(accumulation, currentLayerSecondMomentBiases[k], this->GetBeta2());
Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetBeta2()));
Architecture_t::Copy(currentLayerSecondMomentBiases[k], accumulation);
}

// alpha = learningRate * sqrt(1 - beta2^t) / (1-beta1^t)
Scalar_t alpha = (this->GetLearningRate()) * (sqrt(1 - pow(this->GetBeta2(), this->GetGlobalStep()))) /
(1 - pow(this->GetBeta1(), this->GetGlobalStep()));

// updating the biases.
// theta = theta - alpha * Mt / (sqrt(Vt) + epsilon)
// updating the biases.
for (size_t i = 0; i < biases.size(); i++) {
Matrix_t currentBiasUpdates(biases[i].GetNrows(), biases[i].GetNcols());
Architecture_t::Copy(currentBiasUpdates, currentLayerSecondMomentBiases[i]);
Architecture_t::SqrtElementWise(currentBiasUpdates);
Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
Architecture_t::ReciprocalElementWise(currentBiasUpdates);
Architecture_t::Hadamard(currentBiasUpdates, currentLayerFirstMomentBiases[i]);
Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -alpha);
// Mt = beta1 * Mt-1 + (1-beta1) * BiasGradients
Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentBiases[i], biasGradients[i], this->GetBeta1() );
// Vt = beta2 * Vt-1 + (1-beta2) * BiasGradients^2
Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentBiases[i], biasGradients[i], this->GetBeta2() );
// theta = theta - alpha * Mt / (sqrt(Vt) + epsilon)
Architecture_t::AdamUpdate(biases[i], currentLayerFirstMomentBiases[i], currentLayerSecondMomentBiases[i],
alpha, this->GetEpsilon() );
}
}

} // namespace DNN
} // namespace TMVA

#endif
#endif
6 changes: 6 additions & 0 deletions tmva/tmva/inc/TMVA/DNN/Architectures/Cpu.h
@@ -518,6 +518,12 @@ class TCpu
* \p A
*/
static void SqrtElementWise(TCpuMatrix<Scalar_t> &A);

// optimizer update functions for ADAM (element-wise):
//   AdamUpdate:          A = A - alpha * M / (sqrt(V) + eps)
//   AdamUpdateFirstMom:  A = beta * A + (1 - beta) * B
//   AdamUpdateSecondMom: A = beta * A + (1 - beta) * B * B
static void AdamUpdate(TCpuMatrix<Scalar_t> & A, const TCpuMatrix<Scalar_t> & M, const TCpuMatrix<Scalar_t> & V, Scalar_t alpha, Scalar_t eps);
static void AdamUpdateFirstMom(TCpuMatrix<Scalar_t> & A, const TCpuMatrix<Scalar_t> & B, Scalar_t beta);
static void AdamUpdateSecondMom(TCpuMatrix<Scalar_t> & A, const TCpuMatrix<Scalar_t> & B, Scalar_t beta);

};

//____________________________________________________________________________
6 changes: 6 additions & 0 deletions tmva/tmva/inc/TMVA/DNN/Architectures/Cuda.h
@@ -525,6 +525,12 @@ class TCuda
* \p A
*/
static void SqrtElementWise(TCudaMatrix<AFloat> &A);

// optimizer functions
static void AdamUpdate(TCudaMatrix<AFloat> & A, const TCudaMatrix<AFloat> & M, const TCudaMatrix<AFloat> & V, AFloat alpha, AFloat eps);
static void AdamUpdateFirstMom(TCudaMatrix<AFloat> & A, const TCudaMatrix<AFloat> & B, AFloat beta);
static void AdamUpdateSecondMom(TCudaMatrix<AFloat> & A, const TCudaMatrix<AFloat> & B, AFloat beta);

};

//____________________________________________________________________________
9 changes: 9 additions & 0 deletions tmva/tmva/inc/TMVA/DNN/Architectures/Reference.h
@@ -498,6 +498,15 @@ class TReference
*/
static void SqrtElementWise(TMatrixT<AReal> &A);

// optimizer update functions

/// Update functions for ADAM optimizer
static void AdamUpdate(TMatrixT<AReal> & A, const TMatrixT<AReal> & M, const TMatrixT<AReal> & V, AReal alpha, AReal eps);
static void AdamUpdateFirstMom(TMatrixT<AReal> & A, const TMatrixT<AReal> & B, AReal beta);
static void AdamUpdateSecondMom(TMatrixT<AReal> & A, const TMatrixT<AReal> & B, AReal beta);



//____________________________________________________________________________
//
// AutoEncoder Propagation
23 changes: 11 additions & 12 deletions tmva/tmva/inc/TMVA/DNN/SGD.h
@@ -115,21 +115,21 @@ TSGD<Architecture_t, Layer_t, DeepNet_t>::TSGD(Scalar_t learningRate, DeepNet_t
}
}



//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
const std::vector<Matrix_t> &weightGradients) -> void
{
// accumulating the current layer past weight gradients to include the current weight gradients.
// Vt = momentum * Vt-1 + currentGradients

std::vector<Matrix_t> &currentLayerPastWeightGradients = this->GetPastWeightGradientsAt(layerIndex);

for (size_t k = 0; k < currentLayerPastWeightGradients.size(); k++) {
Matrix_t accumulation(currentLayerPastWeightGradients[k].GetNrows(),
currentLayerPastWeightGradients[k].GetNcols());
initialize<Architecture_t>(accumulation, EInitialization::kZero);
Architecture_t::ScaleAdd(accumulation, currentLayerPastWeightGradients[k], this->GetMomentum());
Architecture_t::ScaleAdd(accumulation, weightGradients[k], 1.0);
Architecture_t::Copy(currentLayerPastWeightGradients[k], accumulation);
Architecture_t::ConstMult(currentLayerPastWeightGradients[k], this->GetMomentum());
Architecture_t::ScaleAdd(currentLayerPastWeightGradients[k], weightGradients[k], 1.0);
}

// updating the weights.
@@ -146,13 +146,12 @@ auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, s
{
// accumulating the current layer past bias gradients to include the current bias gradients.
// Vt = momentum * Vt-1 + currentGradients

std::vector<Matrix_t> &currentLayerPastBiasGradients = this->GetPastBiasGradientsAt(layerIndex);

for (size_t k = 0; k < currentLayerPastBiasGradients.size(); k++) {
Matrix_t accumulation(currentLayerPastBiasGradients[k].GetNrows(), currentLayerPastBiasGradients[k].GetNcols());
initialize<Architecture_t>(accumulation, EInitialization::kZero);
Architecture_t::ScaleAdd(accumulation, currentLayerPastBiasGradients[k], this->GetMomentum());
Architecture_t::ScaleAdd(accumulation, biasGradients[k], 1.0);
Architecture_t::Copy(currentLayerPastBiasGradients[k], accumulation);
Architecture_t::ConstMult(currentLayerPastBiasGradients[k], this->GetMomentum());
Architecture_t::ScaleAdd(currentLayerPastBiasGradients[k], biasGradients[k], 1.0);
}

// updating the biases
@@ -165,4 +164,4 @@ auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, s
} // namespace DNN
} // namespace TMVA

#endif
#endif
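For reference, a minimal sketch (illustrative only, not part of this commit) of the in-place momentum accumulation that the revised TSGD loops now perform with ConstMult followed by ScaleAdd, avoiding the temporary accumulation matrix; the flat-array form and the function name are assumptions for the example:

#include <vector>

// Vt = momentum * Vt-1 + currentGradients, computed in place.
void MomentumAccumulate(std::vector<double> &past, const std::vector<double> &grad, double momentum)
{
   for (size_t i = 0; i < past.size(); ++i)
      past[i] = momentum * past[i] + grad[i];
}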
41 changes: 41 additions & 0 deletions tmva/tmva/src/DNN/Architectures/Cpu/Arithmetic.cxx
@@ -254,5 +254,46 @@ void TCpu<Real_t>::SqrtElementWise(TCpuMatrix<Real_t> &A)
A.Map(f);
}

/// Adam updates
//____________________________________________________________________________
template<typename Real_t>
void TCpu<Real_t>::AdamUpdate(TCpuMatrix<Real_t> &A, const TCpuMatrix<Real_t> & M, const TCpuMatrix<Real_t> & V, Real_t alpha, Real_t eps)
{
// ADAM update the weights.
// Weight = Weight - alpha * M / (sqrt(V) + epsilon)
Real_t * a = A.GetRawDataPointer();
const Real_t * m = M.GetRawDataPointer();
const Real_t * v = V.GetRawDataPointer();
for (size_t index = 0; index < A.GetNoElements() ; ++index) {
a[index] = a[index] - alpha * m[index]/( sqrt(v[index]) + eps);
}
}

//____________________________________________________________________________
template<typename Real_t>
void TCpu<Real_t>::AdamUpdateFirstMom(TCpuMatrix<Real_t> &A, const TCpuMatrix<Real_t> & B, Real_t beta)
{
// First momentum weight gradient update for ADAM
// Mt = beta1 * Mt-1 + (1-beta1) * WeightGradients
Real_t * a = A.GetRawDataPointer();
const Real_t * b = B.GetRawDataPointer();
for (size_t index = 0; index < A.GetNoElements() ; ++index) {
a[index] = beta * a[index] + (1.-beta) * b[index];
}
}
//____________________________________________________________________________
template<typename Real_t>
void TCpu<Real_t>::AdamUpdateSecondMom(TCpuMatrix<Real_t> &A, const TCpuMatrix<Real_t> & B, Real_t beta)
{
// Second momentum weight gradient update for ADAM
// Vt = beta2 * Vt-1 + (1-beta2) * WeightGradients^2
Real_t * a = A.GetRawDataPointer();
const Real_t * b = B.GetRawDataPointer();
for (size_t index = 0; index < A.GetNoElements() ; ++index) {
a[index] = beta * a[index] + (1.-beta) * b[index] * b[index];
}
}


} // DNN
} // TMVA
51 changes: 51 additions & 0 deletions tmva/tmva/src/DNN/Architectures/Cuda/Arithmetic.cu
@@ -401,6 +401,57 @@ void TCuda<AFloat>::SqrtElementWise(TCudaMatrix<AFloat> &A)
(int) A.GetNcols());
}

/// Adam updates
//____________________________________________________________________________
template<typename AFloat>
void TCuda<AFloat>::AdamUpdate(TCudaMatrix<AFloat> &A, const TCudaMatrix<AFloat> & M, const TCudaMatrix<AFloat> & V, AFloat alpha, AFloat eps)
{
dim3 blockDims = TDevice::BlockDims2D();
dim3 gridDims = TDevice::GridDims2D(A);
cudaStream_t s = A.GetComputeStream();
::TMVA::DNN::Cuda::AdamUpdate<<<gridDims, blockDims, 0, s>>>(
A.GetDataPointer(),
M.GetDataPointer(),
V.GetDataPointer(),
(int) A.GetNrows(),
(int) A.GetNcols(),
alpha, eps);
}

//____________________________________________________________________________
template<typename AFloat>
void TCuda<AFloat>::AdamUpdateFirstMom(TCudaMatrix<AFloat> &A, const TCudaMatrix<AFloat> & B, AFloat beta)
{
dim3 blockDims = TDevice::BlockDims2D();
dim3 gridDims = TDevice::GridDims2D(A);
cudaStream_t s = A.GetComputeStream();
::TMVA::DNN::Cuda::AdamUpdateFirstMom<<<gridDims, blockDims, 0, s>>>(
A.GetDataPointer(),
B.GetDataPointer(),
(int) A.GetNrows(),
(int) A.GetNcols(), beta);
}

//____________________________________________________________________________
template<typename AFloat>
void TCuda<AFloat>::AdamUpdateSecondMom(TCudaMatrix<AFloat> &A, const TCudaMatrix<AFloat> & B, AFloat beta)
{
dim3 blockDims = TDevice::BlockDims2D();
dim3 gridDims = TDevice::GridDims2D(A);
cudaStream_t s = A.GetComputeStream();
::TMVA::DNN::Cuda::AdamUpdateSecondMom<<<gridDims, blockDims, 0, s>>>(
A.GetDataPointer(),
B.GetDataPointer(),
(int) A.GetNrows(),
(int) A.GetNcols(), beta);
}








} // DNN
} // TMVA
47 changes: 46 additions & 1 deletion tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh
@@ -400,10 +400,55 @@ __global__ void SqrtElementWise(AFloat * A,
}
}


/// optimizer kernel functions

//____________________________________________________________________________
template<typename AFloat>
__global__ void AdamUpdate(AFloat * A, const AFloat * M, const AFloat * V,
int m, int n, AFloat alpha, AFloat eps)
{
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int index = j * m + i;
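// column-major indexing: the element in row i, column j of the m x n matrix sits at offset j * m + i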

if ((i < m) && (j < n)) {
A[index] = A[index] - alpha * M[index]/( sqrt(V[index]) + eps);
}
}

//____________________________________________________________________________
template<typename AFloat>
__global__ void AdamUpdateFirstMom(AFloat * A, const AFloat * B,
int m, int n, AFloat beta)
{
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int index = j * m + i;

if ((i < m) && (j < n)) {
A[index] = beta * A[index] + (1.-beta) * B[index];
}
}

//____________________________________________________________________________
template<typename AFloat>
__global__ void AdamUpdateSecondMom(AFloat * A, const AFloat * B,
int m, int n, AFloat beta)
{
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int index = j * m + i;

if ((i < m) && (j < n)) {
A[index] = beta * A[index] + (1.-beta) * B[index] * B[index];
}
}

//____________________________________________________________________________
template<typename AFloat>
__global__ void IdentityDerivative(AFloat * A,
int m, int n)
int m, int n)
{
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;