pytorch · jiayisunx · Sep 17, 2025 · Sep 17, 2025 · Sep 18, 2025 · Oct 11, 2025
diff --git a/test/inductor/test_compiled_autograd.py b/test/inductor/test_compiled_autograd.py
@@ -2617,7 +2617,7 @@ def test_autograd_cpp_node_saved_dynamic(self, load_inline, is_traceable):
         )
 
         def fn():
-            for i in [10, 100, 10, 20, 10]:
+            for i in [10, 30, 10, 20, 10]:
                 x = torch.ones(i, i, requires_grad=True)
                 out = module.custom_op_backed_by_autograd_fn(x)
                 loss = out.sum()

diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py
@@ -2729,6 +2729,18 @@ def test_large_mean(self):
             actual = torch.compile(op)(t)
             self.assertEqual(expected, actual)
 
+    def test_outer_mean_large_size(self):  # compiled row-wise mean must match the eager result
+        def fn(x):
+            x = x.flatten()  # collapse the input to 1-D
+            x_one = torch.ones_like(x)
+            x = torch.outer(x, x_one)  # row i holds x[i] repeated len(x) times
+            return torch.mean(x, dim=1)  # reduce along each row
+
+        x = torch.randn(2, 2, 64, 64)  # 2 * 2 * 64 * 64 = 16384 elements once flattened
+        expected = fn(x)  # reference: fn called directly, without torch.compile
+        actual = torch.compile(fn)(x)
+        self.assertEqual(expected, actual, atol=1e-4, rtol=1e-4)
+
     @unittest.skipIf(IS_FBCODE, "Not yet runnable in fbcode")
     @requires_vectorization
     @patch("torch.cuda.is_available", lambda: False)

diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
@@ -2205,28 +2205,22 @@ def need_use_acc_helper(self, reduction_type, dtype, use_scalar):
             reduction_size = functools.reduce(
                 operator.mul, self.call_ranges[self.reduction_depth :]
             )
-            if config.cpp.dynamic_threads:
-                # If dynamic threads, to be conservative,
-                # use reduction_size as the range size
-                rt_size = reduction_size
-            else:
-                rt_size = CeilDiv(reduction_size, parallel_num_threads())
 
             # chunk size to balance accuracy and performance
-            chunk_size = 2**20
+            chunk_size = 4096
 
             # use acc helper If cannot get size_hint
             try:
-                rt_size_hint = V.graph.sizevars.size_hint(rt_size)
+                reduction_size_hint = V.graph.sizevars.size_hint(reduction_size)
             except Exception:
                 return True
 
-            if rt_size_hint > chunk_size:
+            if reduction_size_hint > chunk_size:
                 # use helper if the reduction size is too large
-                V.graph.sizevars.check_lt(chunk_size, rt_size)
+                V.graph.sizevars.check_lt(chunk_size, reduction_size)
                 return True
             else:
-                V.graph.sizevars.check_leq(rt_size, chunk_size)
+                V.graph.sizevars.check_leq(reduction_size, chunk_size)
         return False
 
     def _acc_helper_init(
@@ -2243,7 +2237,7 @@ def _acc_helper_init(
         )
         num_range_thread_expr = cexpr_index(num_range_thread)
         assert reduction_type in ["welford_reduce", "sum"]
-        chunk_size = 4096 if reduction_type == "welford_reduce" else 2**20
+        chunk_size = 4096
         num_chunks = CeilDiv(num_range_thread, chunk_size)
         helper_type = (
             "WelfordHelper"