From cfd8c58b453c42bedaeaae50411637d85feee904 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 13 Jun 2019 16:18:46 -0700 Subject: [PATCH] Tune elementwise ops for ROCm (#21754) Summary: ``` The stride calculation using OffsetCalculator performs poorly with MAX_DIMS=25. This reduces MAX_DIMS (after coalescing) to 16 on ROCm. I think it's unlikely that anyone will exceed this limit. If they do, we can add additional specializations for ROCm with more dimensions. ``` I'm not sure about the underlying cause. With MAX_DIM=25, the add kernel's params is ~648 bytes vs. ~424 bytes with MAX_DIM=16. The kernel instruction footprint is bigger too, but most of these instructions are never executed and most kernel parameters are never loaded because the typical dimensionality is much smaller. Mini benchmark here: https://gist.github.com/colesbury/1e917ae6a0ca9d24712121b92fed4c8f (broadcasting operations are much faster) cc iotamudelta Pull Request resolved: https://github.com/pytorch/pytorch/pull/21754 Reviewed By: bddppq Differential Revision: D15811906 Pulled By: colesbury fbshipit-source-id: 063f92c083d26e2ef2edc98df7ff0400f9432b9d --- aten/src/ATen/cuda/detail/OffsetCalculator.cuh | 10 +++++++--- aten/src/ATen/native/cuda/Loops.cuh | 2 +- aten/src/THC/THCIntegerDivider.cuh | 5 ++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh index af45bae837f9..e43967db4d0b 100644 --- a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh +++ b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh @@ -9,15 +9,19 @@ /// OffsetCalculator calculates the offset in bytes of a linear index for NARGS /// operands that share the same shape, but may have different strides. +#ifdef __HIP_PLATFORM_HCC__ +constexpr int MAX_DIMS = 16; +#else +constexpr int MAX_DIMS = 25; +#endif + template struct OffsetCalculator { - static constexpr int MAX_DIMS = 25; - // The offset for each argument (in bytes). Wrapper around fixed-size array. using offset_type = at::detail::Array; OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides) : dims(dims) { - TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>25) dims"); + TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>", MAX_DIMS, ") dims"); for (int i = 0; i < MAX_DIMS; ++i) { if (i < dims) { sizes_[i] = IntDivider(sizes[i]); diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index bee9474aa102..6ab08ffd3941 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -25,7 +25,7 @@ #ifdef __HIP_PLATFORM_HCC__ static constexpr int launch_size_1d = 1024; static constexpr int launch_size_nd = 1024; -static constexpr int launch_bound2 = 16; +static constexpr int launch_bound2 = 1; #else static constexpr int launch_size_1d = 512; static constexpr int launch_size_nd = 128; diff --git a/aten/src/THC/THCIntegerDivider.cuh b/aten/src/THC/THCIntegerDivider.cuh index cf71deb3efd7..75c0a5079ff7 100644 --- a/aten/src/THC/THCIntegerDivider.cuh +++ b/aten/src/THC/THCIntegerDivider.cuh @@ -2,6 +2,9 @@ #define THC_INTEGER_DIVIDER_INC #include +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +#include +#endif // A utility class to implement integer division by muliplication, given a fixed // divisor. @@ -91,7 +94,7 @@ struct IntDivider { } __host__ __device__ inline unsigned int div(unsigned int n) const { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and // 'm1'. unsigned int t = __umulhi(n, m1);