Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cuda/EmbeddingBag.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ __global__ void EmbeddingBag_updateOutputKernel(
scalar_t *weightFeat = weight + featureDim * weight_stride1;
int64_t begin = bag == 0 ? 0 : offsets[bag]; // forces first offset to be 0 instead of asserting on it
int64_t end = (bag < numBags - 1) ? (offsets[bag + 1]) : numIndices;
assert(end >= begin);
CUDA_KERNEL_ASSERT(end >= begin);

accscalar_t weightFeatSum = 0;
scalar_t weightFeatMax;
Expand Down
6 changes: 3 additions & 3 deletions aten/src/ATen/native/cuda/SortingRadixSelect.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ struct TopKTypeConfig<int16_t> {
typedef uint32_t RadixType;

static inline __device__ RadixType convert(int16_t v) {
assert(sizeof(short) == 2);
static_assert(sizeof(short) == 2, "");
return 32768u + v;
}

Expand All @@ -79,7 +79,7 @@ struct TopKTypeConfig<int32_t> {
typedef uint32_t RadixType;

static inline __device__ RadixType convert(int32_t v) {
assert(sizeof(int) == 4);
static_assert(sizeof(int) == 4, "");
return 2147483648u + v;
}

Expand All @@ -93,7 +93,7 @@ struct TopKTypeConfig<int64_t> {
typedef uint64_t RadixType;

static inline __device__ RadixType convert(int64_t v) {
assert(sizeof(int64_t) == 8);
static_assert(sizeof(int64_t) == 8, "");
return 9223372036854775808ull + v;
}

Expand Down
4 changes: 2 additions & 2 deletions aten/src/THC/THCTensorInfo.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,15 @@ TensorInfo<T, IndexType>::TensorInfo(T* p,
template <typename T, typename IndexType>
void
TensorInfo<T, IndexType>::reduceDim(int dim) {
assert(dim < dims && dim >= 0);
TORCH_INTERNAL_ASSERT(dim < dims && dim >= 0);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, these aren't run in CUDA?! Intriguing.

sizes[dim] = 1;
}

template <typename T, typename IndexType>
int
TensorInfo<T, IndexType>::collapseDims(const int excludeDim) {

assert(excludeDim >= -1 && excludeDim < dims);
TORCH_INTERNAL_ASSERT(excludeDim >= -1 && excludeDim < dims);

int stopDim = (excludeDim == -1) ? dims : excludeDim;
int newIndex = -1;
Expand Down
2 changes: 1 addition & 1 deletion aten/src/THC/generic/THCTensorMode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ void THCTensor_(mode)(THCState *state,
break;
case 1:
default:
assert(false);
TORCH_INTERNAL_ASSERT(false);
}
THCudaCheck(cudaGetLastError());

Expand Down
2 changes: 1 addition & 1 deletion aten/src/THC/generic/THCTensorSort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ void THCTensor_(sortKeyValueInplace)(THCState* state,
/* Nothing to do, data already sorted */ \
break; \
default: \
assert(false); \
TORCH_INTERNAL_ASSERT(false); \
} \
}

Expand Down
4 changes: 0 additions & 4 deletions caffe2/core/common_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,10 +255,6 @@ CAFFE2_CUDA_API const char* curandGetErrorString(curandStatus_t error);
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
j += blockDim.y * gridDim.y)

#if defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__)
#define CUDA_KERNEL_ASSERT(...)
#endif

// The following helper functions are here so that you can write a kernel call
// when you are not particularly interested in maxing out the kernels'
// performance. Usually, this will give you a reasonable speed, but if you
Expand Down
5 changes: 3 additions & 2 deletions caffe2/operators/top_k_radix_selection.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ struct TopKTypeConfig<short> {
typedef unsigned int RadixType;

static inline __device__ RadixType convert(short v) {
CUDA_KERNEL_ASSERT(sizeof(short) == 2);
static_assert(sizeof(short) == 2, "");
return 32768u + v;
}

Expand All @@ -90,7 +90,7 @@ struct TopKTypeConfig<int> {
typedef unsigned int RadixType;

static inline __device__ RadixType convert(int v) {
CUDA_KERNEL_ASSERT(sizeof(int) == 4);
static_assert(sizeof(int) == 4, "");
return 2147483648u + v;
}

Expand All @@ -104,6 +104,7 @@ struct TopKTypeConfig<long> {
typedef unsigned long long int RadixType;

static inline __device__ RadixType convert(long v) {
// static_assert fails on Windows, so leave it as CUDA_KERNEL_ASSERT
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

...how does the CUDA_KERNEL_ASSERT not fail on windows?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will fail if someone tries to run the caffe2 radix sort on long inputs in a Windows PyTorch build. Hopefully no one will do that, but I can't make it a static_assert, because with static_assert the Windows build itself fails.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we just make this into int64_t?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not familiar with caffe2 code and don't know if it's possible. It is also probably untested?

Copy link
Collaborator

@peterjc123 peterjc123 May 29, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it will break anything because of the assert sizeof(long) == 8 here.

CUDA_KERNEL_ASSERT(sizeof(long) == 8);
return 9223372036854775808ull + v;
}
Expand Down
3 changes: 3 additions & 0 deletions torch/utils/hipify/cuda_to_hip_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8010,6 +8010,9 @@
("curandGenerateUniform", ("hiprandGenerateUniform", API_CAFFE2)),
("curand_generator", ("hiprand_generator", API_CAFFE2)),
("CaffeCudaGetDevice", ("CaffeHipGetDevice", API_CAFFE2)),
# do not rename CUDA_KERNEL_ASSERT,
# the ordered dict guarantees this pattern will match first, before "CUDA"
("CUDA_KERNEL_ASSERT", ("CUDA_KERNEL_ASSERT", API_CAFFE2)),
("CUDA", ("HIP", API_CAFFE2)),
("Cuda", ("Hip", API_CAFFE2)),
("cuda_", ("hip_", API_CAFFE2)),
Expand Down