
Commit 8f658d7

don't produce invalid grid configs (#166973) (#167158)
Proper fix for #164048; fixes gather too; reverts #164049.
Pull Request resolved: #166974
Approved by: https://github.com/eqy
1 parent 3d27d95 commit 8f658d7
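
CUDA caps gridDim.y (and gridDim.z) at 65535; only gridDim.x goes up to 2^31 - 1. The old vectorized-gather launcher set grid.y directly to ceil_div(slice_size_in_bytes, max_num_threads * Alignment), so a sufficiently large slice asked for more y-blocks than the hardware allows and the launch failed with an invalid-configuration error. As a rough illustration (assuming 256 threads per block and 16-byte vectors), a 256 MiB slice needs ceil(268435456 / 4096) = 65536 y-blocks, one past the limit. The fix clamps grid.y to the device's maxGridSize[1] and turns the kernel body into a grid-stride loop in y, so whatever the clamp leaves uncovered is picked up in later iterations.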

File tree: 3 files changed, +15 -10 lines

3 files changed

+15
-10
lines changed

aten/src/ATen/native/cuda/IndexKernel.cu

Lines changed: 1 addition & 3 deletions
@@ -73,7 +73,6 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
 
   char* const out_ptr = static_cast<char*>(iter.data_ptr(0));
   char* const in_ptr = static_cast<char*>(iter.data_ptr(1));
-
   if (is_gather_like && num_indices==1) {
     const size_t element_size = iter.element_size(0);
     constexpr size_t alignment = 16;
@@ -83,11 +82,10 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
       auto ind_dim_size = index_size[0];
       auto inp_stride_bytes = index_stride[0];
       auto out_stride_bytes = iter.strides(0)[1];
-      if (iter.numel() == 0) return;
       at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
                                                   slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true);
       return;
-      }
+    }
   }
 
   auto sizes = std::array<int64_t, MAX_DIMS>{};
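
This hunk is the revert half of the change: the early "if (iter.numel() == 0) return;" guard that #164049 added as a stopgap is removed, along with stray blank lines and a misindented closing brace. The substantive fix is in the launcher below.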

aten/src/ATen/native/cuda/IndexKernelUtils.cu

Lines changed: 8 additions & 5 deletions
@@ -14,10 +14,11 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx,
     ind = (ind < 0) ? ind + ind_dim_size : ind;
   }
   CUDA_KERNEL_ASSERT(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds");
-  int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits
-  if (off >= slice_size) return;
-  auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);
-  at::native::memory::st_vec<Alignment>(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits
+  // off is guaranteed to be within int32 limits
+  for (int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; off < slice_size; off += blockDim.x * gridDim.y * Alignment) {
+    auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);
+    at::native::memory::st_vec<Alignment>(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits
+  }
 }
 
 
@@ -30,7 +31,9 @@ void vectorized_gather_kernel_launch(char * out, char * inp, index_t * idx, int
   auto num_threads = at::round_up(
       at::ceil_div(slice_size_in_bytes, Alignment),
       static_cast<int64_t>(C10_WARP_SIZE));
-  dim3 grid = {static_cast<uint32_t>(num_ind), static_cast<uint32_t>(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), 1};
+  uint32_t grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
+  grid_y = std::min(static_cast<uint32_t>(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), grid_y);
+  dim3 grid = {static_cast<uint32_t>(num_ind), grid_y, 1};
   auto block = std::min(max_num_threads, num_threads);
   vectorized_gather_kernel<Alignment, index_t><<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(out, inp, idx, num_ind, slice_size_in_bytes,
       ind_dim_size, inp_stride_bytes, out_stride_bytes, allow_neg_indices);
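
The same pattern, reduced to a self-contained sketch: compute the natural y-extent, clamp it to the device's maxGridSize[1], and let a y-direction grid-stride loop in the kernel cover whatever the clamp cut off. All names here (copy_rows, launch_copy_rows) are hypothetical illustration code, not PyTorch internals.

// Sketch only: clamp gridDim.y to the device limit, then grid-stride in y.
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdint>
#include <cstdio>

__global__ void copy_rows(const float* in, float* out, int64_t row_elems) {
  // One row per blockIdx.x; the y dimension tiles the row. Because gridDim.y
  // may be smaller than the row needs, each thread strides by the full
  // y extent (blockDim.x * gridDim.y) until the row is covered.
  const float* src = in + static_cast<int64_t>(blockIdx.x) * row_elems;
  float* dst = out + static_cast<int64_t>(blockIdx.x) * row_elems;
  for (int64_t off = static_cast<int64_t>(blockDim.x) * blockIdx.y + threadIdx.x;
       off < row_elems;
       off += static_cast<int64_t>(blockDim.x) * gridDim.y) {
    dst[off] = src[off];
  }
}

void launch_copy_rows(const float* in, float* out, int num_rows, int64_t row_elems) {
  constexpr int threads = 256;
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  // Naive y-extent; for very long rows this exceeds 65535 and, used directly,
  // would make the launch fail with cudaErrorInvalidConfiguration.
  int64_t want_y = (row_elems + threads - 1) / threads;
  // Clamp to the hardware maximum so the config is always valid.
  uint32_t grid_y = static_cast<uint32_t>(std::min<int64_t>(want_y, prop.maxGridSize[1]));
  dim3 grid(static_cast<uint32_t>(num_rows), grid_y, 1);
  copy_rows<<<grid, threads>>>(in, out, row_elems);
}

int main() {
  const int rows = 2;
  const int64_t row_elems = 1 << 20;
  float *in, *out;
  cudaMalloc(&in, rows * row_elems * sizeof(float));
  cudaMalloc(&out, rows * row_elems * sizeof(float));
  launch_copy_rows(in, out, rows, row_elems);
  printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));
  cudaDeviceSynchronize();
  cudaFree(in);
  cudaFree(out);
  return 0;
}

One difference from the kernel above: PyTorch keeps off in int32_t because the caller bounds the slice size (the in-source comments note this guarantee); the sketch uses int64_t so it stays correct for arbitrary row lengths.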

test/test_scatter_gather_ops.py

Lines changed: 6 additions & 2 deletions
@@ -6,7 +6,7 @@
 
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import \
-    (parametrize, run_tests, TestCase, DeterministicGuard, TEST_WITH_ROCM)
+    (parametrize, run_tests, TestCase, DeterministicGuard, TEST_WITH_ROCM, serialTest)
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA,
      toleranceOverride, tol,)
@@ -65,10 +65,12 @@ def test_gather(self, device, dtype):
             actual = torch.gather(src, 2, idx)
             self.assertEqual(actual, expected, atol=0, rtol=0)
 
+    @serialTest()
     @dtypes(torch.int8, torch.bfloat16)
     def test_gather_large(self, device, dtype):
         # test larger shapes to check vectorized implementation
-        for (m, n, k) in ((4096, 3072, 4096), (4096, 3072, 4100)):
+        for (m, n, k) in ((4096, 3072, 4096), (4096, 3072, 4100), (4, 4, 16384 * 8192)):
+            torch.cuda.empty_cache()
             src = make_tensor((m, k), device=device, dtype=dtype)
             alloc0 = torch.empty(src.nelement() * 2, device=device, dtype=dtype)
             discontig = alloc0.view(m, 2 * k)[:, ::2].copy_(src)
@@ -111,6 +113,8 @@ def test_gather_large(self, device, dtype):
             self.assertEqual(res_ind, ref, atol=0, rtol=0)
             res_gather = torch.gather(misaligned1, dim=dim, index=ind)
             self.assertEqual(res_gather, ref, atol=0, rtol=0)
+            del src, alloc0, alloc1, alloc2
+            del discontig, misaligned, misaligned1
         # test gather along 1st dim that can accidentally trigger fast path
         # because due to index dimension in the gather dim being 1
         # an unexpected squashing in tensorIterator happens
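
The new (4, 4, 16384 * 8192) shape is the regression case: its slices are large enough to push the old, unclamped launcher's computed gridDim.y past the 65535 limit (the failure in #164048). Since tensors of that shape run to a gibibyte or more each, the test also frees its intermediates with del, calls torch.cuda.empty_cache() at the top of each iteration, and is marked @serialTest() so it is not scheduled alongside other tests that could push the device out of memory.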
