// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 05/07/16
/*************************************************************************
* Copyright (C) 2016, Simon Pfreundschuh *
* All rights reserved. *
* *
* For the licensing terms see $ROOTSYS/LICENSE. *
* For the list of contributors see $ROOTSYS/README/CREDITS. *
*************************************************************************/
///////////////////////////////////////////////////////////////////
// Definition of the TCuda architecture class, which provides an //
// implementation of the low-level functionality for neural //
// networks for the CUDA computing architectures. //
///////////////////////////////////////////////////////////////////
#ifndef TMVA_DNN_ARCHITECTURES_CUDA
#define TMVA_DNN_ARCHITECTURES_CUDA
#include "TMVA/DNN/Functions.h"
#include "TMVA/DNN/CNN/ConvLayer.h"
#include "cuda.h"
#include "Cuda/CudaBuffers.h"
#include "Cuda/CudaMatrix.h"
#include "TMVA/DNN/DataLoader.h"
#include <utility>
#include <vector>
class TRandom;
namespace TMVA
{
namespace DNN
{
/** The TCuda architecture class.
*
 * Low-level interface class for CUDA computing architectures. Declares
 * as public types the scalar, matrix and buffer types for this
 * architecture and provides the functions of the low-level interface
 * as static members.
*/
template<typename AFloat = Real_t>
class TCuda
{
private:
static TRandom * fgRandomGen;
public:
using Scalar_t = AFloat;
using Matrix_t = TCudaMatrix<AFloat>;
using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>;
using HostBuffer_t = TCudaHostBuffer<AFloat>;
//____________________________________________________________________________
//
// Propagation
//____________________________________________________________________________
/** @name Forward Propagation
* Low-level functions required for the forward propagation of activations
* through the network.
*/
///@{
/** Matrix-multiply \p input with the transpose of \p weights and
 * write the results into \p output. */
static void MultiplyTranspose(TCudaMatrix<AFloat> &output,
const TCudaMatrix<AFloat> &input,
const TCudaMatrix<AFloat> &weights);
/** Add the vector \p biases row-wise to the matrix \p output. */
static void AddRowWise(TCudaMatrix<AFloat> &output,
const TCudaMatrix<AFloat> &biases);
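/* A minimal usage sketch (an illustration, not part of the interface):
 * a single dense-layer forward pass. The dimensions batchSize, nInputs
 * and nOutputs are hypothetical.
 *
 * \code
 * using Arch = TMVA::DNN::TCuda<float>;
 * Arch::Matrix_t input(batchSize, nInputs);        // previous-layer activations
 * Arch::Matrix_t weights(nOutputs, nInputs);       // layer weight matrix
 * Arch::Matrix_t biases(nOutputs, 1);              // layer bias vector
 * Arch::Matrix_t output(batchSize, nOutputs);      // result buffer
 * Arch::MultiplyTranspose(output, input, weights); // output = input * weights^T
 * Arch::AddRowWise(output, biases);                // add biases to each row
 * \endcode
 */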
///@}
/** @name Backward Propagation
 * Low-level functions required for the backward propagation of gradients
 * through the network.
 */
///@{
/** Perform the complete backward propagation step. If the provided
* \p activationGradientsBackward matrix is not empty, compute the
* gradients of the objective function with respect to the activations
* of the previous layer (backward direction).
* Also compute the weight and the bias gradients. Modifies the values
* in \p df and thus produces a valid result only the first time it is
* applied after the corresponding forward propagation has been
* performed. */
static void Backward(TCudaMatrix<AFloat> & activationGradientsBackward,
TCudaMatrix<AFloat> & weightGradients,
TCudaMatrix<AFloat> & biasGradients,
TCudaMatrix<AFloat> & df,
const TCudaMatrix<AFloat> & activationGradients,
const TCudaMatrix<AFloat> & weights,
const TCudaMatrix<AFloat> & activationBackward);
/** Backward pass for Recurrent Networks */
static Matrix_t & RecurrentLayerBackward(TCudaMatrix<AFloat> & state_gradients_backward, // BxH
TCudaMatrix<AFloat> & input_weight_gradients,
TCudaMatrix<AFloat> & state_weight_gradients,
TCudaMatrix<AFloat> & bias_gradients,
TCudaMatrix<AFloat> & df, //DxH
const TCudaMatrix<AFloat> & state, // BxH
const TCudaMatrix<AFloat> & weights_input, // HxD
const TCudaMatrix<AFloat> & weights_state, // HxH
const TCudaMatrix<AFloat> & input, // BxD
TCudaMatrix<AFloat> & input_gradient);
/** Adds the elements in matrix \p B, scaled by \p beta, to the elements
 * in the matrix \p A. This is required for the weight update in the
 * gradient descent step. */
static void ScaleAdd(TCudaMatrix<AFloat> & A,
const TCudaMatrix<AFloat> & B,
Scalar_t beta = 1.0);
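/* A minimal sketch of a plain gradient descent update built on ScaleAdd,
 * assuming weights and weightGradients have matching shapes and
 * learningRate is a hypothetical scalar:
 *
 * \code
 * using Arch = TMVA::DNN::TCuda<float>;
 * Arch::ScaleAdd(weights, weightGradients, -learningRate); // W <- W - lr * dW
 * \endcode
 */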
/** Copy the elements of matrix A into matrix B. */
static void Copy(TCudaMatrix<AFloat> & B,
const TCudaMatrix<AFloat> & A);
// copy from another type of matrix
template<typename AMatrix_t>
static void CopyDiffArch(TCudaMatrix<Scalar_t> & B, const AMatrix_t & A);
/** Above functions extended to vectors */
static void ScaleAdd(std::vector<TCudaMatrix<Scalar_t>> & A,
const std::vector<TCudaMatrix<Scalar_t>> & B,
Scalar_t beta = 1.0);
static void Copy(std::vector<TCudaMatrix<Scalar_t>> & A,
const std::vector<TCudaMatrix<Scalar_t>> & B);
// copy from another architecture
template<typename AMatrix_t>
static void CopyDiffArch(std::vector<TCudaMatrix<Scalar_t>> & A,
const std::vector<AMatrix_t> & B);
///@}
//____________________________________________________________________________
//
// Activation Functions
//____________________________________________________________________________
/** @name Activation Functions
 * For each activation function, the low-level interface contains two routines:
 * one that applies the activation function to a matrix and one that evaluates
 * the derivatives of the activation function at the elements of a given matrix
 * and writes the results into the result matrix.
*/
///@{
static void Identity(TCudaMatrix<AFloat> & B);
static void IdentityDerivative(TCudaMatrix<AFloat> & B,
const TCudaMatrix<AFloat> & A);
static void Relu(TCudaMatrix<AFloat> & B);
static void ReluDerivative(TCudaMatrix<AFloat> & B,
const TCudaMatrix<AFloat> & A);
static void Sigmoid(TCudaMatrix<AFloat> & B);
static void SigmoidDerivative(TCudaMatrix<AFloat> & B,
const TCudaMatrix<AFloat> & A);
static void Tanh(TCudaMatrix<AFloat> & B);
static void TanhDerivative(TCudaMatrix<AFloat> & B,
const TCudaMatrix<AFloat> & A);
static void SymmetricRelu(TCudaMatrix<AFloat> & B);
static void SymmetricReluDerivative(TCudaMatrix<AFloat> & B,
const TCudaMatrix<AFloat> & A);
static void SoftSign(TCudaMatrix<AFloat> & B);
static void SoftSignDerivative(TCudaMatrix<AFloat> & B,
const TCudaMatrix<AFloat> & A);
static void Gauss(TCudaMatrix<AFloat> & B);
static void GaussDerivative(TCudaMatrix<AFloat> & B,
const TCudaMatrix<AFloat> & A);
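/* A minimal sketch of how the two routines pair up, assuming \p A holds
 * the pre-activations of a layer. Because the forward routines work in
 * place, the derivative is evaluated before \p A is overwritten:
 *
 * \code
 * using Arch = TMVA::DNN::TCuda<float>;
 * Arch::Matrix_t dA(A.GetNrows(), A.GetNcols());
 * Arch::ReluDerivative(dA, A); // dA_ij = ReLU'(A_ij)
 * Arch::Relu(A);               // A_ij  = ReLU(A_ij), in place
 * \endcode
 */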
///@}
//____________________________________________________________________________
//
// Loss Functions
//____________________________________________________________________________
/** @name Loss Functions
 * Loss functions compute a scalar value, given the \p output of the network
 * for a given training input and the expected network prediction \p Y, that
 * quantifies the quality of the prediction. For each loss function, a routine
 * that computes the gradients (suffixed by Gradients) must also be provided
 * to start the backpropagation algorithm.
*/
///@{
static AFloat MeanSquaredError(const TCudaMatrix<AFloat> &Y, const TCudaMatrix<AFloat> &output,
const TCudaMatrix<AFloat> &weights);
static void MeanSquaredErrorGradients(TCudaMatrix<AFloat> &dY, const TCudaMatrix<AFloat> &Y,
const TCudaMatrix<AFloat> &output, const TCudaMatrix<AFloat> &weights);
/** Sigmoid transformation is implicitly applied, thus \p output should
* hold the linear activations of the last layer in the net. */
static AFloat CrossEntropy(const TCudaMatrix<AFloat> &Y, const TCudaMatrix<AFloat> &output,
const TCudaMatrix<AFloat> &weights);
static void CrossEntropyGradients(TCudaMatrix<AFloat> &dY, const TCudaMatrix<AFloat> &Y,
const TCudaMatrix<AFloat> &output, const TCudaMatrix<AFloat> &weights);
/** Softmax transformation is implicitly applied, thus \p output should
* hold the linear activations of the last layer in the net. */
static AFloat SoftmaxCrossEntropy(const TCudaMatrix<AFloat> &Y, const TCudaMatrix<AFloat> &output,
const TCudaMatrix<AFloat> &weights);
static void SoftmaxCrossEntropyGradients(TCudaMatrix<AFloat> &dY, const TCudaMatrix<AFloat> &Y,
const TCudaMatrix<AFloat> &output, const TCudaMatrix<AFloat> &weights);
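/* A minimal sketch of loss evaluation and gradient seeding, assuming
 * Y holds the targets, output the linear activations of the last layer,
 * and weights the per-event weights (all hypothetical names):
 *
 * \code
 * using Arch = TMVA::DNN::TCuda<float>;
 * float loss = Arch::CrossEntropy(Y, output, weights);
 * Arch::Matrix_t dY(Y.GetNrows(), Y.GetNcols());
 * Arch::CrossEntropyGradients(dY, Y, output, weights); // seeds backpropagation
 * \endcode
 */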
///@}
//____________________________________________________________________________
//
// Output Functions
//____________________________________________________________________________
/** @name Output Functions
* Output functions transform the activations \p output of the
* output layer in the network to a valid prediction \p YHat for
* the desired usage of the network, e.g. the identity function
* for regression or the sigmoid transformation for two-class
* classification.
*/
///@{
static void Sigmoid(TCudaMatrix<AFloat> &YHat,
const TCudaMatrix<AFloat> & );
static void Softmax(TCudaMatrix<AFloat> &YHat,
const TCudaMatrix<AFloat> & );
///@}
//____________________________________________________________________________
//
// Regularization
//____________________________________________________________________________
/** @name Regularization
* For each regularization type two functions are required, one named
* <tt><Type>Regularization</tt> that evaluates the corresponding
* regularization functional for a given weight matrix and the
* <tt>Add<Type>RegularizationGradients</tt>, that adds the regularization
* component in the gradients to the provided matrix.
*/
///@{
static AFloat L1Regularization(const TCudaMatrix<AFloat> & W);
static void AddL1RegularizationGradients(TCudaMatrix<AFloat> & A,
const TCudaMatrix<AFloat> & W,
AFloat weightDecay);
static AFloat L2Regularization(const TCudaMatrix<AFloat> & W);
static void AddL2RegularizationGradients(TCudaMatrix<AFloat> & A,
const TCudaMatrix<AFloat> & W,
AFloat weightDecay);
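/* A minimal sketch combining the two L2 entry points, with weightDecay
 * a hypothetical scalar hyperparameter:
 *
 * \code
 * using Arch = TMVA::DNN::TCuda<float>;
 * float penalty = weightDecay * Arch::L2Regularization(W); // add to the loss
 * Arch::AddL2RegularizationGradients(weightGradients, W, weightDecay);
 * \endcode
 */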
///@}
//____________________________________________________________________________
//
// Initialization
//____________________________________________________________________________
/** @name Initialization
* For each initialization method, one function in the low-level interface
 * is provided. The naming scheme is <tt>Initialize<Type></tt> for a given
* initialization method Type.
*/
///@{
static void InitializeGauss(TCudaMatrix<AFloat> & A);
static void InitializeUniform(TCudaMatrix<AFloat> & A);
static void InitializeIdentity(TCudaMatrix<AFloat> & A);
static void InitializeZero(TCudaMatrix<AFloat> & A);
static void InitializeGlorotUniform(TCudaMatrix<AFloat> & A);
static void InitializeGlorotNormal(TCudaMatrix<AFloat> & A);
// return the static instance of the random generator used for initialization;
// if the generator does not exist, it is created the first time with a random seed (e.g. seed = 0)
static TRandom & GetRandomGenerator();
// set the random seed for the static generator;
// if the static generator does not exist, it is created
static void SetRandomSeed(size_t seed);
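/* A minimal sketch of reproducible initialization: seed the shared
 * generator once, then initialize the weight matrices (shapes are
 * illustrative):
 *
 * \code
 * using Arch = TMVA::DNN::TCuda<float>;
 * Arch::SetRandomSeed(42);
 * Arch::Matrix_t W(nOutputs, nInputs);
 * Arch::InitializeGlorotUniform(W);
 * \endcode
 */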
///@}
//____________________________________________________________________________
//
// Dropout
//____________________________________________________________________________
/** @name Dropout
*/
///@{
/** Apply dropout with activation probability \p p to the given
 * matrix \p A and scale the result by the reciprocal of \p p. */
static void Dropout(TCudaMatrix<AFloat> & A, AFloat p);
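/* A minimal sketch: with p = 0.5 each element survives with probability
 * 0.5 and survivors are scaled by 1/0.5 = 2, so the expected activation
 * is unchanged. Apply during training only:
 *
 * \code
 * using Arch = TMVA::DNN::TCuda<float>;
 * Arch::Dropout(A, 0.5); // in place
 * \endcode
 */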
///@}
//____________________________________________________________________________
//
// Convolutional Layer Propagation
//____________________________________________________________________________
/** @name Forward Propagation in Convolutional Layer
*/
///@{
/** Attaches a CUDA stream to each matrix in order to accommodate parallel kernel launches. */
static void PrepareInternals(std::vector<TCudaMatrix<AFloat>> & inputPrime);
/** Calculate how many neurons "fit" in the output layer, given the input as well as the layer's hyperparameters. */
static size_t calculateDimension(size_t imgDim, size_t fltDim, size_t padding, size_t stride);
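/* Presumably the standard convolution output formula,
 * outputDim = (imgDim - fltDim + 2 * padding) / stride + 1.
 * For example, a 32-pixel axis, a 5-pixel filter, padding 2 and stride 1
 * give (32 - 5 + 4) / 1 + 1 = 32, i.e. a "same"-size output:
 *
 * \code
 * size_t outWidth = TMVA::DNN::TCuda<float>::calculateDimension(32, 5, 2, 1); // == 32
 * \endcode
 */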
/** Transform the matrix \p B into local view format, suitable for
 * convolution, and store it in matrix \p A. */
static void Im2col(TCudaMatrix<AFloat> &A,
const TCudaMatrix<AFloat> &B,
size_t imgHeight,
size_t imgWidth,
size_t fltHeight,
size_t fltWidth,
size_t strideRows,
size_t strideCols,
size_t zeroPaddingHeight,
size_t zeroPaddingWidth);
static void Im2colIndices(std::vector<int> & /* V */, const TCudaMatrix<AFloat> & /* B */, size_t /* nLocalViews */,
size_t /* imgHeight */, size_t /* imgWidth */, size_t /* fltHeight */,
size_t /* fltWidth */, size_t /* strideRows */, size_t /* strideCols */,
size_t /* zeroPaddingHeight */, size_t /* zeroPaddingWidth */) {}
static void Im2colFast(TCudaMatrix<AFloat> & /* A */, const TCudaMatrix<AFloat> & /* B */,
const std::vector<int> & /* V */) {}
/** Rotates the matrix \p B, which represents the weights,
 * and stores the result in the matrix \p A. */
static void RotateWeights(TCudaMatrix<AFloat> &A, const TCudaMatrix<AFloat> &B, size_t filterDepth,
size_t filterHeight, size_t filterWidth, size_t numFilters);
/** Add the biases in the Convolutional Layer. */
static void AddConvBiases(TCudaMatrix<AFloat> &output, const TCudaMatrix<AFloat> &biases);
///@}
/** Forward propagation in the Convolutional layer */
static void ConvLayerForward(std::vector<TCudaMatrix<AFloat>> & output,
std::vector<TCudaMatrix<AFloat>> & derivatives,
const std::vector<TCudaMatrix<AFloat>> &input,
const TCudaMatrix<AFloat> &weights, const TCudaMatrix<AFloat> & biases,
const DNN::CNN::TConvParams & params, EActivationFunction activFunc,
std::vector<TCudaMatrix<AFloat>> & inputPrime);
/** @name Backward Propagation in Convolutional Layer
*/
///@{
/** Perform the complete backward propagation step in a Convolutional Layer.
* If the provided \p activationGradientsBackward matrix is not empty, compute the
* gradients of the objective function with respect to the activations
* of the previous layer (backward direction).
* Also compute the weight and the bias gradients. Modifies the values
* in \p df and thus produces a valid result only the first time it is
* applied after the corresponding forward propagation has been
* performed. */
static void ConvLayerBackward(std::vector<TCudaMatrix<AFloat>> &activationGradientsBackward,
TCudaMatrix<AFloat> &weightGradients, TCudaMatrix<AFloat> &biasGradients,
std::vector<TCudaMatrix<AFloat>> &df,
const std::vector<TCudaMatrix<AFloat>> &activationGradients,
const TCudaMatrix<AFloat> &weights,
const std::vector<TCudaMatrix<AFloat>> &activationBackward, size_t batchSize,
size_t inputHeight, size_t inputWidth, size_t depth, size_t height, size_t width,
size_t filterDepth, size_t filterHeight, size_t filterWidth, size_t nLocalViews);
/** Utility function for calculating the activation gradients of the layer
* before the convolutional layer. */
static void CalculateConvActivationGradients(std::vector<TCudaMatrix<AFloat>> &activationGradientsBackward,
std::vector<TCudaMatrix<AFloat>> &df,
const TCudaMatrix<AFloat> &weights, size_t batchSize,
size_t inputHeight, size_t inputWidth, size_t depth, size_t height,
size_t width, size_t filterDepth, size_t filterHeight,
size_t filterWidth);
/** Utility function for calculating the weight gradients of the convolutional
* layer. */
static void CalculateConvWeightGradients(TCudaMatrix<AFloat> &weightGradients, std::vector<TCudaMatrix<AFloat>> &df,
const std::vector<TCudaMatrix<AFloat>> &activations_backward,
size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth,
size_t height, size_t width, size_t filterDepth, size_t filterHeight,
size_t filterWidth, size_t nLocalViews);
/** Utility function for calculating the bias gradients of the convolutional
* layer */
static void CalculateConvBiasGradients(TCudaMatrix<AFloat> &biasGradients, std::vector<TCudaMatrix<AFloat>> &df,
size_t batchSize, size_t depth, size_t nLocalViews);
///@}
//____________________________________________________________________________
//
// Max Pooling Layer Propagation
//____________________________________________________________________________
/** @name Forward Propagation in Max Pooling Layer
*/
///@{
/** Downsample the matrix \p C to the matrix \p A, using the max
 * operation, such that the winning indices are stored in matrix
 * \p B. */
static void Downsample(TCudaMatrix<AFloat> &A, TCudaMatrix<AFloat> &B, const TCudaMatrix<AFloat> &C,
size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth,
size_t strideRows, size_t strideCols);
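/* A minimal sketch of 2x2 max pooling with stride 2 over one feature
 * matrix C; A receives the pooled values and B the winning indices that
 * the backward pass needs later (imgHeight and imgWidth are hypothetical):
 *
 * \code
 * using Arch = TMVA::DNN::TCuda<float>;
 * Arch::Downsample(A, B, C, imgHeight, imgWidth, 2, 2, 2, 2);
 * \endcode
 */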
///@}
/** @name Backward Propagation in Max Pooling Layer
*/
///@{
/** Perform the complete backward propagation step in a Pooling Layer. Based on the
 * winning indices stored in the index matrix, it just forwards the activation
 * gradients to the previous layer. */
static void MaxPoolLayerBackward(TCudaMatrix<AFloat> &activationGradientsBackward,
const TCudaMatrix<AFloat> &activationGradients,
const TCudaMatrix<AFloat> &indexMatrix,
size_t imgHeight,
size_t imgWidth,
size_t fltHeight,
size_t fltWidth,
size_t strideRows,
size_t strideCols,
size_t nLocalViews);
///@}
//____________________________________________________________________________
//
// Reshape Layer Propagation
//____________________________________________________________________________
/** @name Forward and Backward Propagation in Reshape Layer
*/
///@{
/** Transform the matrix \p B into the matrix \p A, which has different dimensions. */
static void Reshape(TCudaMatrix<AFloat> &A, const TCudaMatrix<AFloat> &B);
/** Flattens the tensor \p B, such that each matrix is stretched into
 * one row, resulting in the matrix \p A. */
static void Flatten(TCudaMatrix<AFloat> &A, const std::vector<TCudaMatrix<AFloat>> &B, size_t size, size_t nRows,
size_t nCols);
/** Transforms each row of \p B to a matrix and stores it in the tensor \p A. */
static void Deflatten(std::vector<TCudaMatrix<AFloat>> &A, const TCudaMatrix<AFloat> &B, size_t index, size_t nRows,
size_t nCols);
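/* A minimal round-trip sketch: flatten a tensor of size matrices, each
 * nRows x nCols, into one matrix with one row per input matrix, then
 * restore it (here the third argument of Deflatten is taken to be the
 * number of matrices; all names are illustrative):
 *
 * \code
 * using Arch = TMVA::DNN::TCuda<float>;
 * Arch::Matrix_t flat(size, nRows * nCols);
 * Arch::Flatten(flat, tensor, size, nRows, nCols);
 * Arch::Deflatten(tensor, flat, size, nRows, nCols);
 * \endcode
 */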
/** Rearrange data according to time: fill the B x T x D output with the T x B x D input. */
static void Rearrange(std::vector<TCudaMatrix<AFloat>> &out, const std::vector<TCudaMatrix<AFloat>> &in);
///@}
//____________________________________________________________________________
//
// Additional Arithmetic Functions
//____________________________________________________________________________
/** @name Additional Arithmetic Functions
*
* Additional arithmetic on CUDA matrices used to implement the low-level
* interface.
*/
///@{
/** Standard multiplication of two matrices \p A and \p B with the result being
 * written into \p C.
 */
static void Multiply(TCudaMatrix<AFloat> & C,
const TCudaMatrix<AFloat> & A,
const TCudaMatrix<AFloat> & B);
/** Matrix multiplication of the transpose of \p input with \p Weights,
 * with the result being written into \p output.
 */
static void TransposeMultiply(TCudaMatrix<AFloat> & output,
const TCudaMatrix<AFloat> & input,
const TCudaMatrix<AFloat> & Weights);
/** In-place Hadamard (element-wise) product of matrices \p A and \p B
* with the result being written into \p A.
*/
static void Hadamard(TCudaMatrix<AFloat> & A, const TCudaMatrix<AFloat> & B);
/** Sum the columns of the (m x n) matrix \p A and write the resulting
 * n column sums into the first n elements of \p B.
 */
static void SumColumns(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A);
/** Sum rows of (m x n) matrix \p A and write the results into the first
* m elements in \p B.
*/
static void SumRows(TCudaMatrix<AFloat> & B, const TCudaMatrix<AFloat> & A);
/** Compute the sum of all elements in \p A */
static AFloat Sum(const TCudaMatrix<AFloat> &A);
/** Check two matrices for equality, taking floating point arithmetic errors into account. */
static bool AlmostEquals(const TCudaMatrix<AFloat> &A, const TCudaMatrix<AFloat> &B, double epsilon = 0.1);
/** Add the constant \p beta to all the elements of matrix \p A and write the
* result into \p A.
*/
static void ConstAdd(TCudaMatrix<AFloat> &A, AFloat beta);
/** Multiply all the elements of matrix \p A by the constant \p beta and write the
 * result into \p A.
 */
static void ConstMult(TCudaMatrix<AFloat> &A, AFloat beta);
/** Take the reciprocal of each element of the matrix \p A and write the
 * result into \p A.
 */
static void ReciprocalElementWise(TCudaMatrix<AFloat> &A);
/** Square each element of the matrix \p A and write the result into
 * \p A.
 */
static void SquareElementWise(TCudaMatrix<AFloat> &A);
/** Take the square root of each element of the matrix \p A and write the
 * result into \p A.
 */
static void SqrtElementWise(TCudaMatrix<AFloat> &A);
// optimizer functions
static void AdamUpdate(TCudaMatrix<AFloat> & A, const TCudaMatrix<AFloat> & M, const TCudaMatrix<AFloat> & V, AFloat alpha, AFloat eps);
static void AdamUpdateFirstMom(TCudaMatrix<AFloat> & A, const TCudaMatrix<AFloat> & B, AFloat beta);
static void AdamUpdateSecondMom(TCudaMatrix<AFloat> & A, const TCudaMatrix<AFloat> & B, AFloat beta);
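/* A minimal sketch of one Adam step composed from the three updates,
 * assuming the conventional Adam semantics (M and V are the running
 * first and second moments, G the current gradients; alpha, beta1,
 * beta2 and eps are hypothetical hyperparameters):
 *
 * \code
 * using Arch = TMVA::DNN::TCuda<float>;
 * Arch::AdamUpdateFirstMom(M, G, beta1);  // M <- beta1*M + (1-beta1)*G
 * Arch::AdamUpdateSecondMom(V, G, beta2); // V <- beta2*V + (1-beta2)*G*G
 * Arch::AdamUpdate(W, M, V, alpha, eps);  // W <- W - alpha*M/(sqrt(V)+eps)
 * \endcode
 */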
};
//____________________________________________________________________________
template <typename AFloat>
template <typename AMatrix_t>
void TCuda<AFloat>::CopyDiffArch(TCudaMatrix<AFloat> &B,
const AMatrix_t &A)
{
// copy from another architecture using the reference one;
// this is not very efficient since it creates temporary objects
TMatrixT<AFloat> tmp = A;
Copy(B, TCudaMatrix<AFloat>(tmp) );
}
//____________________________________________________________________________
template <typename AFloat>
template <typename AMatrix_t>
void TCuda<AFloat>::CopyDiffArch(std::vector<TCudaMatrix<AFloat>> &B,
const std::vector<AMatrix_t> &A)
{
for (size_t i = 0; i < B.size(); ++i) {
CopyDiffArch(B[i], A[i]);
}
}
} // namespace DNN
} // namespace TMVA
#endif