Skip to content

Commit

Permalink
fix asserts in cuda code (#39047)
Browse files Browse the repository at this point in the history
Summary:
Gets rid of some in-kernel asserts where they can be replaced with static_asserts
Replaces bare in-kernel `assert` in one case with `CUDA_KERNEL_ASSERT` where necessary
Replaces host code `assert`s with `TORCH_INTERNAL_ASSERT`
Another group of asserts is in the fractional max pooling kernels, which should be fixed regardless (see #39044); the problems there are not just asserts.
I've audited remaining cases of in-kernel asserts, and they are more like `TORCH_INTERNAL_ASSERT`, so they should not happen with invalid user data. I think it's ok to leave them as is.
Pull Request resolved: #39047

Differential Revision: D21750392

Pulled By: ngimel

fbshipit-source-id: e9417523a2c672284de3515933cb7ed166e56719
  • Loading branch information
ngimel authored and gchanan committed Jun 3, 2020
1 parent 82f549b commit 5d01f87
Show file tree
Hide file tree
Showing 8 changed files with 14 additions and 14 deletions.
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cuda/EmbeddingBag.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ __global__ void EmbeddingBag_updateOutputKernel(
scalar_t *weightFeat = weight + featureDim * weight_stride1;
int64_t begin = bag == 0 ? 0 : offsets[bag]; // forces first offset to be 0 instead of asserting on it
int64_t end = (bag < numBags - 1) ? (offsets[bag + 1]) : numIndices;
assert(end >= begin);
CUDA_KERNEL_ASSERT(end >= begin);

accscalar_t weightFeatSum = 0;
scalar_t weightFeatMax;
Expand Down
6 changes: 3 additions & 3 deletions aten/src/ATen/native/cuda/SortingRadixSelect.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ struct TopKTypeConfig<int16_t> {
typedef uint32_t RadixType;

static inline __device__ RadixType convert(int16_t v) {
assert(sizeof(short) == 2);
static_assert(sizeof(short) == 2, "");
return 32768u + v;
}

Expand All @@ -79,7 +79,7 @@ struct TopKTypeConfig<int32_t> {
typedef uint32_t RadixType;

static inline __device__ RadixType convert(int32_t v) {
assert(sizeof(int) == 4);
static_assert(sizeof(int) == 4, "");
return 2147483648u + v;
}

Expand All @@ -93,7 +93,7 @@ struct TopKTypeConfig<int64_t> {
typedef uint64_t RadixType;

static inline __device__ RadixType convert(int64_t v) {
assert(sizeof(int64_t) == 8);
static_assert(sizeof(int64_t) == 8, "");
return 9223372036854775808ull + v;
}

Expand Down
4 changes: 2 additions & 2 deletions aten/src/THC/THCTensorInfo.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,15 @@ TensorInfo<T, IndexType>::TensorInfo(T* p,
template <typename T, typename IndexType>
void
TensorInfo<T, IndexType>::reduceDim(int dim) {
assert(dim < dims && dim >= 0);
TORCH_INTERNAL_ASSERT(dim < dims && dim >= 0);
sizes[dim] = 1;
}

template <typename T, typename IndexType>
int
TensorInfo<T, IndexType>::collapseDims(const int excludeDim) {

assert(excludeDim >= -1 && excludeDim < dims);
TORCH_INTERNAL_ASSERT(excludeDim >= -1 && excludeDim < dims);

int stopDim = (excludeDim == -1) ? dims : excludeDim;
int newIndex = -1;
Expand Down
2 changes: 1 addition & 1 deletion aten/src/THC/generic/THCTensorMode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ void THCTensor_(mode)(THCState *state,
break;
case 1:
default:
assert(false);
TORCH_INTERNAL_ASSERT(false);
}
THCudaCheck(cudaGetLastError());

Expand Down
2 changes: 1 addition & 1 deletion aten/src/THC/generic/THCTensorSort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
/* Nothing to do, data already sorted */ \
break; \
default: \
assert(false); \
TORCH_INTERNAL_ASSERT(false); \
} \
}

Expand Down
4 changes: 0 additions & 4 deletions caffe2/core/common_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -261,10 +261,6 @@ CAFFE2_CUDA_API const char* curandGetErrorString(curandStatus_t error);
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
j += blockDim.y * gridDim.y)

#if defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__)
#define CUDA_KERNEL_ASSERT(...)
#endif

// The following helper functions are here so that you can write a kernel call
// when you are not particularly interested in maxing out the kernels'
// performance. Usually, this will give you a reasonable speed, but if you
Expand Down
5 changes: 3 additions & 2 deletions caffe2/operators/top_k_radix_selection.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ struct TopKTypeConfig<short> {
typedef unsigned int RadixType;

static inline __device__ RadixType convert(short v) {
CUDA_KERNEL_ASSERT(sizeof(short) == 2);
static_assert(sizeof(short) == 2, "");
return 32768u + v;
}

Expand All @@ -90,7 +90,7 @@ struct TopKTypeConfig<int> {
typedef unsigned int RadixType;

static inline __device__ RadixType convert(int v) {
CUDA_KERNEL_ASSERT(sizeof(int) == 4);
static_assert(sizeof(int) == 4, "");
return 2147483648u + v;
}

Expand All @@ -104,6 +104,7 @@ struct TopKTypeConfig<long> {
typedef unsigned long long int RadixType;

static inline __device__ RadixType convert(long v) {
//static_assert fails on windows, so leave it as CUDA_KERNEL_ASSERT
CUDA_KERNEL_ASSERT(sizeof(long) == 8);
return 9223372036854775808ull + v;
}
Expand Down
3 changes: 3 additions & 0 deletions torch/utils/hipify/cuda_to_hip_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8010,6 +8010,9 @@
("curandGenerateUniform", ("hiprandGenerateUniform", API_CAFFE2)),
("curand_generator", ("hiprand_generator", API_CAFFE2)),
("CaffeCudaGetDevice", ("CaffeHipGetDevice", API_CAFFE2)),
# do not rename CUDA_KERNEL_ASSERT,
# the ordered dict guarantees this pattern will match first, before "CUDA"
("CUDA_KERNEL_ASSERT", ("CUDA_KERNEL_ASSERT", API_CAFFE2)),
("CUDA", ("HIP", API_CAFFE2)),
("Cuda", ("Hip", API_CAFFE2)),
("cuda_", ("hip_", API_CAFFE2)),
Expand Down

0 comments on commit 5d01f87

Please sign in to comment.