From f79687a5903478033cae1adbe93c921fd5b412e2 Mon Sep 17 00:00:00 2001
From: Li Li <li.li3@amd.com>
Date: Wed, 12 Nov 2025 22:56:39 +0000
Subject: [PATCH 1/5] optimize embedding_bounds_check_v2: reduce warning
 counts per block before the atomic add

---
 .../utils/embedding_bounds_check_v2.cu        | 22 ++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu b/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
index 2f7bfc8bb7..ed21c8c5e3 100644
--- a/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
+++ b/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
@@ -31,6 +31,10 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v2(
   index_t invalid_i = -1, invalid_idx = -1;
   int32_t invalid_b_t = -1;
   int64_t warning_inc = 0;
+  extern __shared__ int64_t block_warning_buffer[];
+  const int linear_tid = threadIdx.z * (blockDim.y * blockDim.x) +
+      threadIdx.y * blockDim.x + threadIdx.x;
+  const int active_threads = blockDim.x * blockDim.y * blockDim.z;
 
   // Check the last element
   if (b_t_start == 0 && threadIdx.x == 0) {
@@ -142,9 +146,21 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v2(
     }
   } // for b_t
 
-  if (warning_inc > 0) {
-    gpuAtomicAdd(&warning[0], warning_inc);
+  // Accumulate per-thread warning counts in shared memory and reduce once per block.
+  block_warning_buffer[linear_tid] = warning_inc;
+  __syncthreads();
+
+  if (linear_tid == 0) {
+    int64_t block_warning_sum = 0;
+    for (int idx = 0; idx < active_threads; ++idx) {
+      block_warning_sum += block_warning_buffer[idx];
+    }
+    block_warning_buffer[0] = block_warning_sum;
+    if (block_warning_sum > 0) {
+      gpuAtomicAdd(&warning[0], block_warning_sum);
+    }
   }
+  __syncthreads();
   if (bounds_check_mode == BoundsCheckMode::WARNING && invalid_i != -1 &&
       static_cast<int64_t>(atomicAdd(
           reinterpret_cast<unsigned long long int*>(&warning[0]), 0)) == 0) {
@@ -227,7 +243,7 @@ void _bounds_check_indices_cuda_v2(
               grid_dim,                                                        \
               dim3(                                                            \
                   fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize), \
-              0,                                                               \
+              sizeof(int64_t) * kNumThreads,                               \
               at::cuda::getCurrentCUDAStream(),                                \
               PTA_B(rows_per_table, int64_t, 1, 32),                           \
               PTA_B(indices, index_t, 1, 32),                                  \

From 25b936177930e64259ddbed04a0ff5d3f4202e68 Mon Sep 17 00:00:00 2001
From: Li Li <li.li3@amd.com>
Date: Thu, 13 Nov 2025 17:42:24 +0000
Subject: [PATCH 2/5] use a fixed-size (static) shared memory buffer

---
 fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu b/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
index ed21c8c5e3..f4f8d74363 100644
--- a/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
+++ b/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
@@ -31,7 +31,7 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v2(
   index_t invalid_i = -1, invalid_idx = -1;
   int32_t invalid_b_t = -1;
   int64_t warning_inc = 0;
-  extern __shared__ int64_t block_warning_buffer[];
+  __shared__ int64_t block_warning_buffer[kMaxThreads];
   const int linear_tid = threadIdx.z * (blockDim.y * blockDim.x) +
       threadIdx.y * blockDim.x + threadIdx.x;
   const int active_threads = blockDim.x * blockDim.y * blockDim.z;
@@ -243,7 +243,7 @@ void _bounds_check_indices_cuda_v2(
               grid_dim,                                                        \
               dim3(                                                            \
                   fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize), \
-              sizeof(int64_t) * kNumThreads,                               \
+              0                                ,                               \
               at::cuda::getCurrentCUDAStream(),                                \
               PTA_B(rows_per_table, int64_t, 1, 32),                           \
               PTA_B(indices, index_t, 1, 32),                                  \

From b6956747f078a49883b15095f29c251b1c5d0358 Mon Sep 17 00:00:00 2001
From: Aryaman Gupta <aryaman.gupta@amd.com>
Date: Thu, 13 Nov 2025 13:23:09 -0600
Subject: [PATCH 3/5] use parallel tree reduction for shared warning buffer
 within thread block

---
 .../codegen/utils/embedding_bounds_check_v2.cu    | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu b/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
index f4f8d74363..f1e2420601 100644
--- a/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
+++ b/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
@@ -150,12 +150,17 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v2(
   block_warning_buffer[linear_tid] = warning_inc;
   __syncthreads();
 
-  if (linear_tid == 0) {
-    int64_t block_warning_sum = 0;
-    for (int idx = 0; idx < active_threads; ++idx) {
-      block_warning_sum += block_warning_buffer[idx];
+  // Parallel tree reduction
+  for (int stride = active_threads / 2; stride > 0; stride >>= 1) {
+    if (linear_tid < stride) {
+      block_warning_buffer[linear_tid] += block_warning_buffer[linear_tid + stride];
     }
-    block_warning_buffer[0] = block_warning_sum;
+    __syncthreads();
+  }
+
+  // Thread 0 has the final sum
+  if (linear_tid == 0) {
+    int64_t block_warning_sum = block_warning_buffer[0];
     if (block_warning_sum > 0) {
       gpuAtomicAdd(&warning[0], block_warning_sum);
     }

From 4cdac24e74a87896dd2a2a66073d577a9a53f74b Mon Sep 17 00:00:00 2001
From: Li Li <li.li3@amd.com>
Date: Thu, 13 Nov 2025 19:44:18 +0000
Subject: [PATCH 4/5] minor format fix

---
 fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu b/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
index f1e2420601..ae13a5afcb 100644
--- a/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
+++ b/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
@@ -248,7 +248,7 @@ void _bounds_check_indices_cuda_v2(
               grid_dim,                                                        \
               dim3(                                                            \
                   fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize), \
-              0                                ,                               \
+              0,                                                               \
               at::cuda::getCurrentCUDAStream(),                                \
               PTA_B(rows_per_table, int64_t, 1, 32),                           \
               PTA_B(indices, index_t, 1, 32),                                  \

From 2aaaee361af464331760c15b402ed69a586856fd Mon Sep 17 00:00:00 2001
From: Li Li <li.li3@amd.com>
Date: Fri, 21 Nov 2025 05:05:47 +0000
Subject: [PATCH 5/5] guard the block-level warning reduction in
 embedding_bounds_check_v2.cu with #ifdef USE_ROCM

---
 .../utils/embedding_bounds_check_v2.cu        | 22 +++++++++++++------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu b/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
index ae13a5afcb..27811cb0ec 100644
--- a/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
+++ b/fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu
@@ -28,13 +28,15 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v2(
 
   const index_t num_indices = indices.size(0);
   const auto b_t_start = blockIdx.x * blockDim.y + threadIdx.y;
-  index_t invalid_i = -1, invalid_idx = -1;
-  int32_t invalid_b_t = -1;
-  int64_t warning_inc = 0;
-  __shared__ int64_t block_warning_buffer[kMaxThreads];
-  const int linear_tid = threadIdx.z * (blockDim.y * blockDim.x) +
-      threadIdx.y * blockDim.x + threadIdx.x;
-  const int active_threads = blockDim.x * blockDim.y * blockDim.z;
+  #ifdef USE_ROCM
+    index_t invalid_i = -1, invalid_idx = -1;
+    int32_t invalid_b_t = -1;
+    int64_t warning_inc = 0;
+    __shared__ int64_t block_warning_buffer[kMaxThreads];
+    const int linear_tid = threadIdx.z * (blockDim.y * blockDim.x) +
+        threadIdx.y * blockDim.x + threadIdx.x;
+    const int active_threads = blockDim.x * blockDim.y * blockDim.z;
+  #endif
 
   // Check the last element
   if (b_t_start == 0 && threadIdx.x == 0) {
@@ -146,6 +148,7 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v2(
     }
   } // for b_t
 
+#ifdef USE_ROCM
   // Accumulate per-thread warning counts in shared memory and reduce once per block.
   block_warning_buffer[linear_tid] = warning_inc;
   __syncthreads();
@@ -166,6 +169,11 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v2(
     }
   }
   __syncthreads();
+#else
+  if (warning_inc > 0) {
+    gpuAtomicAdd(&warning[0], warning_inc);
+  }
+#endif
   if (bounds_check_mode == BoundsCheckMode::WARNING && invalid_i != -1 &&
       static_cast<int64_t>(atomicAdd(
           reinterpret_cast<unsigned long long int*>(&warning[0]), 0)) == 0) {