Speedup bincount and histc on CUDA
yuantailing committed Mar 18, 2023
1 parent 4805441 commit 8c0ae4d
Showing 3 changed files with 25 additions and 81 deletions.
7 changes: 2 additions & 5 deletions aten/src/ATen/cuda/Atomic.cuh
@@ -203,11 +203,8 @@ static inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) {
#if defined(USE_ROCM)
__atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#else
AtomicAddIntegerImpl<int64_t, sizeof(int64_t)>()(address,
val,
[](int64_t a, int64_t b) {
return a + b;
});
static_assert(sizeof(unsigned long long int) == sizeof(int64_t), "bitwidth change is not allowed");
atomicAdd(reinterpret_cast<unsigned long long int *>(address), static_cast<unsigned long long int>(val));
#endif
}

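A note on the hunk above: the generic AtomicAddIntegerImpl path (a compare-and-swap loop) is replaced by the hardware atomicAdd intrinsic on unsigned long long int; because two's-complement addition wraps the same way as unsigned addition, the result is bit-identical to a signed 64-bit add. A minimal standalone sketch of the same idea (add64 and sumKernel are illustrative names, not part of this commit):

// Sketch: signed 64-bit atomic add built on the unsigned long long atomicAdd.
#include <cstdint>

__device__ void add64(int64_t* address, int64_t val) {
  static_assert(sizeof(unsigned long long int) == sizeof(int64_t),
                "bitwidth change is not allowed");
  atomicAdd(reinterpret_cast<unsigned long long int*>(address),
            static_cast<unsigned long long int>(val));
}

__global__ void sumKernel(int64_t* out, const int64_t* in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) add64(out, in[i]);  // every thread accumulates into one counter
}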
87 changes: 19 additions & 68 deletions aten/src/ATen/native/cuda/SummaryOps.cu
@@ -20,8 +20,7 @@

namespace at {
namespace cuda {
#define THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM 100
#define THRESH_NUMBER_BINS_FOR_GLOBAL_MEM 1000
#define RATIO_OF_GMEM_ATOMIC_ADD_TO_SMEM_ATOMIC_ADD 8
#define FOR_KERNEL_LOOP(i, lim) \
for (IndexType i = blockIdx.x * blockDim.x + threadIdx.x; i < lim; \
i += gridDim.x * blockDim.x)
@@ -30,7 +29,7 @@ namespace cuda {
Memory types used for the 3 histogram implementations.
See `CUDA_tensor_histogram` below.
*/
enum class CUDAHistogramMemoryType { SHARED, MULTI_BLOCK, GLOBAL };
enum class CUDAHistogramMemoryType { SHARED, GLOBAL };
namespace {
template <typename input_t, typename IndexType>
__device__ static IndexType getBin(
@@ -60,7 +59,7 @@ template <
int ADims,
int PDims,
int BDims,
CUDAHistogramMemoryType MemoryType = CUDAHistogramMemoryType::MULTI_BLOCK,
CUDAHistogramMemoryType MemoryType,
typename Op>
C10_LAUNCH_BOUNDS_1(cuda::getApplyBlockSize())
__global__ void kernelHistogram1D(
@@ -106,39 +105,6 @@ __global__ void kernelHistogram1D(
gpuAtomicAddNoReturn(&a.data[aOffset], smem[i]);
}

} else if (MemoryType == CUDAHistogramMemoryType::MULTI_BLOCK) {
////////////////////////// Multi Block memory //////////////////////////
// atomically add to block specific global tensor
// then atomically add to the global output tensor
// compute histogram for the block
FOR_KERNEL_LOOP(linearIndex, totalElements) {
// Convert `linearIndex` into an offset of `b`
const IndexType bOffset =
detail::IndexToOffset<input_t, IndexType, BDims>::get(linearIndex, b);
const auto bVal = b.data[bOffset];
if (bVal >= minvalue && bVal <= maxvalue) {
// Use value at `b` as an offset of `p`
const IndexType bin =
getBin<input_t, IndexType>(bVal, minvalue, maxvalue, nbins);
const IndexType pIdx = p.strides[0] * blockIdx.x + bin;
const IndexType pOffset =
detail::IndexToOffset<output_t, IndexType, PDims>::get(pIdx, p);
gpuAtomicAddNoReturn(&p.data[pOffset], getOp(linearIndex));
}
}
__syncthreads();
// NOTE: atomically update output bin count.
// Atomic update is imp since __syncthread() will only synchronize threads
// in a given block, not across blocks.
const IndexType pIdx = p.strides[0] * blockIdx.x;
const IndexType pOffset =
detail::IndexToOffset<output_t, IndexType, PDims>::get(pIdx, p);
for (IndexType i = threadIdx.x; i < a.sizes[0]; i += blockDim.x) {
const IndexType aOffset =
detail::IndexToOffset<output_t, IndexType, ADims>::get(i, a);
gpuAtomicAddNoReturn(&a.data[aOffset], p.data[pOffset + i]);
}

} else {
////////////////////////// Global memory //////////////////////////
// atomically add to the output tensor
@@ -184,9 +150,6 @@ __global__ void kernelHistogram1D(
case CUDAHistogramMemoryType::SHARED: \
HANDLE_CASE(CUDAHistogramMemoryType::SHARED, getOp, sharedMem); \
break; \
case CUDAHistogramMemoryType::MULTI_BLOCK: \
HANDLE_CASE(CUDAHistogramMemoryType::MULTI_BLOCK, getOp, 0); \
break; \
default: \
HANDLE_CASE(CUDAHistogramMemoryType::GLOBAL, getOp, 0); \
}
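For readers unfamiliar with the SHARED path of kernelHistogram1D above, here is a simplified standalone sketch of the same pattern: a grid-stride loop (as in FOR_KERNEL_LOOP), per-block bins in shared memory, and one merge into the global histogram per block. It is not the ATen kernel; histSmem and its parameters are illustrative:

// Simplified shared-memory histogram: each block counts into its own shared
// copy of the bins, then atomically merges that copy into the global result.
__global__ void histSmem(int* hist, const int* bins, int n, int nbins) {
  extern __shared__ int smem[];                 // nbins counters per block
  for (int i = threadIdx.x; i < nbins; i += blockDim.x)
    smem[i] = 0;
  __syncthreads();

  // grid-stride loop over the input, as in FOR_KERNEL_LOOP
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    int bin = bins[i];                          // assume input is already a bin index
    if (bin >= 0 && bin < nbins)
      atomicAdd(&smem[bin], 1);                 // contention stays within the block
  }
  __syncthreads();

  // one round of global atomics per block instead of one per element
  for (int i = threadIdx.x; i < nbins; i += blockDim.x)
    atomicAdd(&hist[i], smem[i]);
}
// launch: histSmem<<<grid, block, nbins * sizeof(int)>>>(hist, bins, n, nbins);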
@@ -210,13 +173,10 @@ inline int64_t getFreeGlobalMemory() {
See `help torch.bincount` for details on the math.
3 implementations based on input size and memory usage:
case: #bins < THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM and enough shared mem
case: enough shared mem
SHARED: Each block atomically adds to its own **shared** hist copy,
then atomically updates the global tensor.
case: #bins < THRESH_NUMBER_BINS_FOR_GLOBAL_MEM and enough global mem
MULTI_BLOCK: Each block atomically adds to it's own **global** hist
copy, then atomically updates the global tensor.
case: THRESH_NUMBER_BINS_FOR_GLOBAL_MEM <= #bins
case: not enough shared mem
GLOBAL: all threads atomically update to a single **global** hist copy.
*/
template <typename output_t, typename input_t, bool HasWeights>
@@ -250,35 +210,27 @@ bool CUDA_tensor_histogram(
CUDAHistogramMemoryType memType = CUDAHistogramMemoryType::GLOBAL;
auto maxSharedMem = getCurrentDeviceProperties()->sharedMemPerBlock;
auto sharedMem = nbins * sizeof(output_t) + 8; // 8 guard bytes
auto maxGlobalMem = getFreeGlobalMemory();
auto multiBlockMem = nbins * grid.x * sizeof(output_t) + 8; // 8 guard bytes
// determine memory type to use in the kernel
if (nbins < THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM &&
sharedMem < maxSharedMem) {
if (sharedMem < maxSharedMem) {
// Solve equations:
// (1) #(smem atomicAdd per SM) = totalElements / min(grid.x, #SM)
// (2) #(gmem atomicAdd) = grid.x * nbins
// (3) RATIO_OF_GMEM_ATOMIC_ADD_TO_SMEM_ATOMIC_ADD = #(gmem atomicAdd) / #(smem atomicAdd per SM)
unsigned optimalGrid = ceil_div<size_t>(RATIO_OF_GMEM_ATOMIC_ADD_TO_SMEM_ATOMIC_ADD * totalElements,
(nbins * getCurrentDeviceProperties()->multiProcessorCount));
if (optimalGrid < (unsigned)getCurrentDeviceProperties()->multiProcessorCount) {
optimalGrid = 1 + (unsigned)std::sqrt(RATIO_OF_GMEM_ATOMIC_ADD_TO_SMEM_ATOMIC_ADD * totalElements / nbins);
}
auto optimalSteps = ceil_div<size_t>(totalElements, optimalGrid * block.x);
optimalGrid = ceil_div<size_t>(totalElements, optimalSteps * block.x);
grid.x = std::min(grid.x, optimalGrid);
memType = CUDAHistogramMemoryType::SHARED;
} else if (
nbins < THRESH_NUMBER_BINS_FOR_GLOBAL_MEM &&
multiBlockMem < static_cast<size_t>(maxGlobalMem / 2)) {
// check against half of free mem to be extra safe
// due to cached allocator, we may anyway have slightly more free mem
memType = CUDAHistogramMemoryType::MULTI_BLOCK;
}
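A worked example of the grid-sizing heuristic in the added block above, mirrored in plain host code. The numbers (100000 elements, 500 bins, 80 SMs, 512 threads per block, an initial grid.x of 196) are assumed for illustration and do not come from this commit:

// Host-side mirror of the SHARED-path grid sizing, with sample values.
#include <algorithm>
#include <cmath>
#include <cstdio>

static size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; }

int main() {
  const size_t kRatio = 8;      // RATIO_OF_GMEM_ATOMIC_ADD_TO_SMEM_ATOMIC_ADD
  const size_t totalElements = 100000, nbins = 500, numSM = 80, blockX = 512;
  unsigned gridX = 196;         // e.g. one block per 512 elements

  unsigned optimalGrid = ceil_div(kRatio * totalElements, nbins * numSM);             // 20
  if (optimalGrid < numSM)      // too few blocks to keep the SMs busy
    optimalGrid = 1 + (unsigned)std::sqrt((double)(kRatio * totalElements / nbins));  // 41
  const size_t optimalSteps = ceil_div(totalElements, (size_t)optimalGrid * blockX);  // 5
  optimalGrid = ceil_div(totalElements, optimalSteps * blockX);                       // 40
  gridX = std::min(gridX, optimalGrid);                                               // 40
  std::printf("grid.x = %u\n", gridX);
  return 0;
}

The effect is to cap grid.x when nbins is large, so that the final grid.x * nbins global atomic adds stay roughly kRatio times the per-SM count of shared-memory atomic adds.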

// alloc memory for MULTI_BLOCK
using IndexType = int64_t;
auto aInfo = detail::getTensorInfo<output_t, IndexType>(a);
auto bInfo = detail::getTensorInfo<input_t, IndexType>(b);
detail::TensorInfo<output_t, IndexType> pInfo(nullptr, 0, {}, {});
Tensor partial_output;
if (memType == CUDAHistogramMemoryType::MULTI_BLOCK) {
partial_output = at::zeros(
{grid.x, nbins},
optTypeMetaToScalarType(a.options().dtype_opt()),
a.options().layout_opt(),
a.options().device_opt(),
a.options().pinned_memory_opt());
pInfo = detail::getTensorInfo<output_t, IndexType>(partial_output);
}

if (HasWeights) {
auto cInfo = detail::getTensorInfo<output_t, IndexType>(c);
@@ -298,8 +250,7 @@
#undef HANDLE_CASE
#undef HANDLE_SWITCH_CASE
#undef FOR_KERNEL_LOOP
#undef THRESH_NUMBER_BINS_FOR_GLOBAL_MEM
#undef THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM
#undef RATIO_OF_GMEM_ATOMIC_ADD_TO_SMEM_ATOMIC_ADD
} // namespace cuda

namespace {
12 changes: 4 additions & 8 deletions test/test_cuda.py
@@ -1709,21 +1709,17 @@ def test_nvtx(self):

def test_bincount_ext(self):
# ensure CUDA code coverage
input_size = (5000,)
input_size = (100000,)
w = torch.randn(input_size, dtype=torch.double, device='cuda')
w_cpu = w.cpu()
# test shared memory impl
t = torch.randint(50, input_size, dtype=torch.int8, device='cuda')
self.assertEqual(t.cpu().bincount(), t.bincount())
self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w))
# test multi block memory impl
# see `THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM` in SummaryOps.cu
t = torch.randint(500, input_size, dtype=torch.int64, device='cuda')
self.assertEqual(t.cpu().bincount(), t.bincount())
self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w))
# test global memory impl
# see `THRESH_NUMBER_BINS_FOR_GLOBAL_MEM` in SummaryOps.cu
t = torch.randint(2000, input_size, dtype=torch.int64, device='cuda')
# see `CUDAHistogramMemoryType` in SummaryOps.cu
# 50000 * sizeof(int64_t) == 390 KiB, which should exceed smem of any known GPU
t = torch.randint(50000, input_size, dtype=torch.int64, device='cuda')
self.assertEqual(t.cpu().bincount(), t.bincount())
self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w))

