Add 0dim Tensor overload for _foreach_div #113688

Closed · wants to merge 5 commits
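As a quick illustration of what the new overload enables (a minimal sketch, assuming a CUDA build; the sample values are made up for this example):

import torch

# _foreach_div now accepts a 0-dim Tensor divisor, mirroring the existing
# _foreach_add/_foreach_mul Tensor overloads.
tensors = [torch.randn(3, device="cuda") for _ in range(4)]
divisor = torch.tensor(2.0, device="cuda")  # 0-dim tensor

out = torch._foreach_div(tensors, divisor)   # out-of-place, returns a new list
torch._foreach_div_(tensors, divisor)        # in-place variant

# After the in-place call, each input tensor matches the out-of-place result.
assert all(torch.allclose(o, t) for o, t in zip(out, tensors))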
1 change: 1 addition & 0 deletions aten/src/ATen/native/ForeachOpsKernels.cpp
@@ -334,6 +334,7 @@ FOREACH_BINARY_OP_LIST_ALPHA(lerp);

FOREACH_BINARY_OP_TENSOR_ALPHA(add);
FOREACH_BINARY_OP_TENSOR(mul);
FOREACH_BINARY_OP_TENSOR(div);

FOREACH_BINARY_OP_SCALAR(add);
FOREACH_BINARY_OP_SCALAR(sub);
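FOREACH_BINARY_OP_TENSOR(div) registers the slow-path (reference) kernels, which behave like a straightforward per-tensor loop. A rough Python-level sketch of those semantics (illustrative only, not the actual C++ implementation):

import torch

def foreach_div_reference(tensors, other):
    # Slow-path semantics: apply div tensor-by-tensor; `other` is a 0-dim tensor.
    return [t.div(other) for t in tensors]

ts = [torch.randn(3) for _ in range(4)]
other = torch.tensor(2.0)
expected = foreach_div_reference(ts, other)
actual = torch._foreach_div(ts, other)
assert all(torch.allclose(a, b) for a, b in zip(actual, expected))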
16 changes: 16 additions & 0 deletions aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu
@@ -9,6 +9,7 @@
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_foreach_add_native.h>
#include <ATen/ops/_foreach_div_native.h>
#include <ATen/ops/_foreach_mul_native.h>

#include <ATen/ops/empty_like_native.h>
@@ -98,6 +99,10 @@ void foreach_binary_op_(
#define FOREACH_BINARY_OP_SCALAR_TENSOR(FUNCTION, NAME, OP, DIVISION_OP) \
void foreach_tensor_##NAME##_tensor_kernel_cuda_( \
TensorList tensors, const Tensor& scalar) { \
if (scalar.device().type() == DeviceType::CPU) { \
return at::native::foreach_tensor_##NAME##_scalar_kernel_cuda_( \
tensors, scalar.item()); \
} \
check_foreach_api_restrictions(tensors); \
if (!(can_use_fast_route( \
ArrayRef<TensorList>{tensors}, {}, DIVISION_OP) && \
@@ -111,6 +116,10 @@ void foreach_binary_op_(
\
std::vector<Tensor> foreach_tensor_##NAME##_tensor_kernel_cuda( \
TensorList tensors, const Tensor& scalar) { \
if (scalar.device().type() == DeviceType::CPU) { \
return at::native::foreach_tensor_##NAME##_scalar_kernel_cuda( \
tensors, scalar.item()); \
} \
check_foreach_api_restrictions(tensors); \
if (!(can_use_fast_route( \
ArrayRef<TensorList>{tensors}, {}, DIVISION_OP) && \
@@ -187,4 +196,11 @@ FOREACH_BINARY_OP_SCALAR_TENSOR(
mul,
std::multiplies,
/* div_op */ false);

FOREACH_BINARY_OP_SCALAR_TENSOR(
all_types_complex_bool_half_bfloat16,
div,
std::divides,
/* div_op */ true);

} // namespace at::native
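The DeviceType::CPU branches above route a 0-dim CPU divisor to the existing Scalar kernels via .item() instead of the tensor fast path. A small sketch of the user-visible consequence (assuming a CUDA device is available):

import torch

cuda_tensors = [torch.ones(2, 2, device="cuda") for _ in range(3)]
cpu_scalar = torch.tensor(4.0)  # 0-dim tensor on CPU

# With the fallback, these two calls should produce the same result: the CUDA
# kernel unwraps the CPU tensor with .item() and dispatches to the Scalar
# overload rather than requiring the divisor to live on the GPU.
a = torch._foreach_div(cuda_tensors, cpu_scalar)
b = torch._foreach_div(cuda_tensors, cpu_scalar.item())
assert all(torch.allclose(x, y) for x, y in zip(a, b))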
15 changes: 15 additions & 0 deletions aten/src/ATen/native/native_functions.yaml
@@ -10343,6 +10343,21 @@
CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
autogen: _foreach_div.ScalarList_out

- func: _foreach_div.Tensor(Tensor[] self, Tensor other) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
CPU: foreach_tensor_div_tensor_kernel_slow
CUDA: foreach_tensor_div_tensor_kernel_cuda

- func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
dispatch:
CPU: foreach_tensor_div_tensor_kernel_slow_
CUDA: foreach_tensor_div_tensor_kernel_cuda_
autogen: _foreach_div.Tensor_out

- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
variants: function
3 changes: 3 additions & 0 deletions test/expect/HasDecompTest.test_has_decomposition.expect
@@ -200,9 +200,12 @@ aten::_foreach_div.Scalar
aten::_foreach_div.ScalarList
aten::_foreach_div.ScalarList_out
aten::_foreach_div.Scalar_out
aten::_foreach_div.Tensor
aten::_foreach_div.Tensor_out
aten::_foreach_div_.List
aten::_foreach_div_.Scalar
aten::_foreach_div_.ScalarList
aten::_foreach_div_.Tensor
aten::_foreach_erf
aten::_foreach_erf.out
aten::_foreach_erf_
34 changes: 24 additions & 10 deletions test/test_foreach.py
@@ -48,7 +48,7 @@ def __init__(self, func):
# Some foreach functions don't have in-place implementations.
self.is_inplace = False if func is None else func.__name__.endswith('_')

def __call__(self, inputs, is_cuda, is_fastpath, **kwargs):
def __call__(self, inputs, is_cuda, expect_fastpath, **kwargs):
[Contributor Author comment on this line: rename to expect_fastpath, which is what it really is and is less confusing]
actual = None
zero_size = kwargs.pop("zero_size", False)
if (
Expand All @@ -60,7 +60,7 @@ def __call__(self, inputs, is_cuda, is_fastpath, **kwargs):
actual = self.func(*inputs, **kwargs)
keys = tuple([e.key for e in p.key_averages()])
mta_called = any("multi_tensor_apply_kernel" in k for k in keys)
assert mta_called == (is_fastpath and (not zero_size))
assert mta_called == (expect_fastpath and (not zero_size))
else:
actual = self.func(*inputs, **kwargs)
# note(mkozuki): inplace foreach functions are void functions.
@@ -127,9 +127,9 @@ def test_all_zero_size_tensors_do_not_launch_kernel(self, device, dtype, op):

for sample in op.sample_zero_size_inputs(device, dtype):
if not op.has_no_out_of_place:
wrapped_op((sample.input, *sample.args), is_cuda=self.is_cuda, is_fastpath=True, zero_size=True)
wrapped_op((sample.input, *sample.args), is_cuda=self.is_cuda, expect_fastpath=True, zero_size=True)
with InplaceForeachVersionBumpCheck(self, sample.input):
inplace_op((sample.input, *sample.args), is_cuda=self.is_cuda, is_fastpath=True, zero_size=True)
inplace_op((sample.input, *sample.args), is_cuda=self.is_cuda, expect_fastpath=True, zero_size=True)

@unittest.skipIf(TEST_WITH_ROCM, "Skipped on ROCm, since it is failing on ROCm 5.7")
@ops(
@@ -150,7 +150,9 @@ def test_parity(self, device, dtype, op, noncontiguous, inplace):
for sample in op.sample_inputs(device, dtype, noncontiguous=noncontiguous):
ref_kwargs = sample.kwargs
kwargs = ref_kwargs.copy()
expect_fastpath = not (noncontiguous or sample.disable_fastpath)
# div promotes ints to floats, so we cannot go on the fastpath there
div_slowpath = dtype in integral_types_and(torch.bool) and op.name == '_foreach_div'
expect_fastpath = not (noncontiguous or sample.disable_fastpath or div_slowpath)
if op in foreach_pointwise_op_db:
values = kwargs.pop("values", None)
if values is not None:
@@ -242,7 +244,7 @@ def clone(arg):
(rhs_arg,) = transformed_sample.args
ref_tensors, ref_rhs_arg = clone(tensors), clone(rhs_arg)
sum(wrapped_op(
[rhs_arg, tensors], is_cuda=False, is_fastpath=False
[rhs_arg, tensors], is_cuda=False, expect_fastpath=False
)).mean().backward()
sum([ref.func(ref_rhs_arg, t) for t in ref_tensors]).mean().backward()
self.assertEqual([t.grad for t in tensors], [t.grad for t in ref_tensors])
@@ -628,7 +630,7 @@ def test_foreach_l2_large_value_input(self, device, dtype, op):
# make sure that the min. of squared L2 norm value per tensor is greater than the max value of `dtype`.
self.assertTrue(scaler * scaler * N > max_value)
fn, ref_fn, *_ = self._get_funcs(op)
actual = fn(inputs, is_cuda=True, is_fastpath=True, ord=ord, zero_size=False)
actual = fn(inputs, is_cuda=True, expect_fastpath=True, ord=ord, zero_size=False)
expect = ref_fn(inputs, ord=ord)

if dtype == torch.float16:
@@ -692,7 +694,7 @@ def test_outplace_with_invalid_grads(self, device, dtype, op):
self.assertTrue(all(t.requires_grad for t in sample.input))
if func.func in foreach_pointwise_op_db:
sample.kwargs.pop("values", None)
(out1, out2) = func([sample.input, *sample.args], is_cuda=False, is_fastpath=False, **sample.kwargs)
(out1, out2) = func([sample.input, *sample.args], is_cuda=False, expect_fastpath=False, **sample.kwargs)
out1.backward(torch.ones_like(out1))
self.assertIsNotNone(sample.input[0].grad)
self.assertIsNone(sample.input[1].grad)
@@ -710,7 +712,7 @@ def get_ref(func, sample):
class Foo:
pass

out = func((sample.input, *sample.args), is_cuda=False, is_fastpath=False, **sample.kwargs)
out = func((sample.input, *sample.args), is_cuda=False, expect_fastpath=False, **sample.kwargs)
foo = Foo()
meta_dict = out[0].grad_fn.metadata
meta_dict[0] = foo
@@ -772,12 +774,24 @@ def test_tensors_grouping(self):
self.assertEqual(l3[i], list3[index])
self.assertEqual(num_tensors_seen, 2 * num_tensors_per_list)

@onlyCUDA
def test_0dim_tensor_overload_cpu_ok(self):
tensors = [torch.ones((), device="cuda", dtype=torch.float32) for _ in range(2)]
scalar_cpu_tensor = torch.tensor(4.0, device="cpu")

# For mul and div, the scalar is allowed to be on CPU too
actual = torch._foreach_mul(tensors, scalar_cpu_tensor)
self.assertEqual(actual, [t.mul(scalar_cpu_tensor) for t in tensors])
actual = torch._foreach_div(tensors, scalar_cpu_tensor)
self.assertEqual(actual, [t.div(scalar_cpu_tensor) for t in tensors])


@onlyCUDA
def test_0dim_tensor_overload_exception(self):
# check exceptions of fast path
tensors = [make_tensor((2, 2), dtype=torch.float, device="cuda") for _ in range(2)]
with self.assertRaisesRegex(RuntimeError, "scalar tensor expected to be on"):
torch._foreach_mul(tensors, torch.tensor(1.0, device="cpu"))
torch._foreach_add(tensors, torch.tensor(1.0, device="cpu"), alpha=1.0)
[Contributor Author comment on this line: This works now, but made me realize I didn't add a case for _foreach_add when I added the overload. Adding that now.]


tensors = [make_tensor((2, 2), dtype=torch.float, device=d) for d in ("cpu", "cuda")]
with self.assertRaisesRegex(RuntimeError, "scalar tensor expected to be 0 dim but"):
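The expect_fastpath change in test_parity above reflects that true division promotes integer and bool inputs to floating point, so the multi_tensor_apply fast path is not expected for those dtypes. A short illustration of the promotion (a sketch; the exact output dtype depends on torch.get_default_dtype()):

import torch

int_tensors = [torch.arange(4, dtype=torch.int64) for _ in range(2)]
out = torch._foreach_div(int_tensors, 2)

# True division promotes integral inputs to a floating dtype, unlike add/mul,
# which is why the test no longer expects the fast path for int/bool inputs.
print([t.dtype for t in out])  # e.g. [torch.float32, torch.float32]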
2 changes: 1 addition & 1 deletion torch/testing/_internal/common_methods_invocations.py
@@ -9479,7 +9479,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
"div",
dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
sample_inputs_func=foreach_inputs_sample_func(2, True, True),
sample_inputs_func=foreach_inputs_sample_func(2, True, True, True),
skips=(
DecorateInfo(unittest.expectedFailure, "TestMeta", "test_dispatch_meta_inplace"),
DecorateInfo(unittest.expectedFailure, "TestMeta", "test_dispatch_symbolic_meta_inplace"),
1 change: 1 addition & 0 deletions torchgen/api/autograd.py
@@ -319,6 +319,7 @@ def is_foreach_func(f: NativeFunction) -> bool:
_foreach_with_tensor_overload = {
"_foreach_add.Tensor",
"_foreach_mul.Tensor",
"_foreach_div.Tensor",
}

