Factorize CUDA_KERNEL_LOOP used in CUDA kernels (apache#16197)
* Factorize CUDA_KERNEL_LOOP used in CUDA kernels

Signed-off-by: Serge Panev <spanev@nvidia.com>

* Retrigger CI
Kh4L authored and sojiadeshina committed Sep 30, 2019
1 parent 0ac05e6 commit bc01a86
Showing 5 changed files with 6 additions and 23 deletions.
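
All five files previously carried their own copy of the same grid-stride loop macro; this commit deletes the local copies and pulls a shared definition in from mxnet_op.h instead (note the two `#include "mxnet_op.h"` additions below). The header's contents are not part of this diff, but judging from the per-file copies being deleted, the factored-out macro is presumably of this form:

// Presumed shared definition, reconstructed from the per-file copies
// deleted below; the actual definition lives in mxnet_op.h, which this
// diff does not show.
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)
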
4 changes: 0 additions & 4 deletions src/operator/contrib/count_sketch.cu
@@ -33,10 +33,6 @@
 #define WARPS_PER_BLOCK 1
 #define THREADS_PER_BLOCK 512
 
-#define CUDA_KERNEL_LOOP(i, n) \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
-       i < (n); \
-       i += blockDim.x * gridDim.x)
 namespace mshadow {
 namespace cuda {
 // wrappers to deal with atomic add
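
As a reminder of what the deleted macro does: it implements a grid-stride loop, so a launch with any grid size covers all n elements, each thread handling indices gridDim.x * blockDim.x apart. A minimal usage sketch (hypothetical kernel, not part of this commit):

__global__ void ScaleKernel(const int n, const float alpha,
                            const float* in, float* out) {
  // Each thread starts at its global index and strides by the total
  // thread count, so the kernel is correct for any n and launch shape.
  CUDA_KERNEL_LOOP(i, n) {
    out[i] = alpha * in[i];
  }
}
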
4 changes: 0 additions & 4 deletions src/operator/contrib/deformable_psroi_pooling.cu
@@ -38,10 +38,6 @@
     cudaError_t error = condition; \
     CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
   } while (0)
-#define CUDA_KERNEL_LOOP(i, n) \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
-       i < (n); \
-       i += blockDim.x * gridDim.x)
 
 namespace mshadow {
 namespace cuda {
4 changes: 0 additions & 4 deletions src/operator/contrib/psroi_pooling.cu
@@ -39,10 +39,6 @@
     cudaError_t error = condition; \
     CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
   } while (0)
-#define CUDA_KERNEL_LOOP(i, n) \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
-       i < (n); \
-       i += blockDim.x * gridDim.x)
 
 namespace mshadow {
 namespace cuda {
11 changes: 4 additions & 7 deletions src/operator/contrib/roi_align.cu
@@ -24,15 +24,12 @@
  * Adapted from Caffe2
  */
 #include "./roi_align-inl.h"
+#include "../mxnet_op.h"
 
 
 namespace mxnet {
 namespace op {
 
-#define CUDA_1D_KERNEL_LOOP(i, n) \
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 using namespace mshadow::cuda;
 
 // The maximum number of blocks to use in the default kernel call.
@@ -120,7 +117,7 @@ __global__ void RoIAlignForwardKernel(
     const int sampling_ratio,
     const T* bottom_rois,
     T* top_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
@@ -259,7 +256,7 @@ __global__ void RoIAlignBackwardKernel(
     const int sampling_ratio,
     T* bottom_diff,
     const T* bottom_rois) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     // (n, c, ph, pw) is an element in the pooled output
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
@@ -353,7 +350,7 @@ __global__ void RoIAlignBackwardKernel(
         }  // if
       }  // ix
     }  // iy
-  }  // CUDA_1D_KERNEL_LOOP
+  }  // CUDA_KERNEL_LOOP
 }  // RoIAlignBackward
 
 template<typename xpu>
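
One subtlety in this file: the deleted CUDA_1D_KERNEL_LOOP declared its index as size_t, whereas the copies deleted in the other files (and thus, presumably, the shared CUDA_KERNEL_LOOP) use int. Assuming the shared macro matches the int form, the index in these kernels changes from unsigned to signed 32-bit, which is worth noting for very large element counts:

// Old, roi_align-local form (deleted above): unsigned, 64-bit on most platforms.
#define CUDA_1D_KERNEL_LOOP(i, n) \
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

// Form used by the copies deleted elsewhere in this commit: signed 32-bit,
// so the element count must stay below INT_MAX.
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)
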
6 changes: 2 additions & 4 deletions src/operator/correlation.cu
@@ -28,6 +28,7 @@
 #include <mshadow/cuda/reduce.cuh>
 #include <algorithm>
 #include <vector>
+#include "./mxnet_op.h"
 
 #define ROUND_OFF 50000
 #define WARPS_PER_BLOCK 1
@@ -38,10 +39,7 @@
     cudaError_t error = condition; \
     CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
   } while (0)
-#define CUDA_KERNEL_LOOP(i, n) \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
-       i < (n); \
-       i += blockDim.x * gridDim.x)
 
 namespace mshadow {
 namespace cuda {
 // == Correlation Kernel
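
Call sites need no change from this refactor: a grid-stride kernel is typically launched with the block count capped, letting the loop absorb any remainder. A hedged sketch of such a launch, reusing the hypothetical ScaleKernel above (n, alpha, d_in, d_out, and stream are assumed to be set up by the caller; THREADS_PER_BLOCK is the 512 these files already #define):

// Hypothetical launch: cap the 1-D grid at a conservative limit and let
// CUDA_KERNEL_LOOP stride over whatever the grid does not cover directly.
const int threads = THREADS_PER_BLOCK;  // 512 in these files
const int max_blocks = 65535;           // conservative 1-D grid cap
const int blocks = std::min((n + threads - 1) / threads, max_blocks);
ScaleKernel<<<blocks, threads, 0, stream>>>(n, alpha, d_in, d_out);
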
