
Add division overload with rounding_mode selection #50280

Closed. Wants to merge 26 commits.
Changes shown are from 7 commits.
c712bbc  Add division overload with rounding_mode selection (peterbell10, Jan 8, 2021)
5e8b4a7  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 8, 2021)
8bbfdfa  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 8, 2021)
c582a55  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 8, 2021)
6c6bda9  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 8, 2021)
0687fe7  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 8, 2021)
af02f1c  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 9, 2021)
a838dd9  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 12, 2021)
496cb93  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 12, 2021)
c28ef0f  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 13, 2021)
78f46ad  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 13, 2021)
2617933  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 13, 2021)
983d643  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 15, 2021)
703e0b7  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 15, 2021)
8784e96  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 16, 2021)
2af43d7  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 18, 2021)
69aac40  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 20, 2021)
8529096  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 28, 2021)
7e7b1d3  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 28, 2021)
84f755f  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 28, 2021)
61750ba  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 28, 2021)
bb33706  Update on "Add division overload with rounding_mode selection" (peterbell10, Jan 29, 2021)
6bd0e9f  Update on "Add division overload with rounding_mode selection" (peterbell10, Feb 1, 2021)
0220e1c  Update on "Add division overload with rounding_mode selection" (peterbell10, Feb 2, 2021)
2fcb3a5  Update on "Add division overload with rounding_mode selection" (peterbell10, Feb 2, 2021)
71b0cfe  trial fix for mobile manifest issue (Feb 3, 2021)
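In short, the PR adds a `rounding_mode` string argument to division. A minimal usage sketch of the new ATen overloads, based on the signatures in the diff below (standalone illustration; the `at::tensor` helper calls are assumed, not part of this PR):

```cpp
#include <ATen/ATen.h>
#include <cstdio>

int main() {
  at::Tensor a = at::tensor({7.0, -7.0});
  at::Tensor b = at::tensor({2.0, 2.0});

  // "true": ordinary (true) division, promoting integer inputs to float.
  at::Tensor t = at::div(a, b, "true");   // {3.5, -3.5}
  // "trunc": rounds the quotient toward zero.
  at::Tensor r = at::div(a, b, "trunc");  // {3.0, -3.0}
  // "floor": rounds the quotient toward negative infinity.
  at::Tensor f = at::div(a, b, "floor");  // {3.0, -4.0}

  std::printf("%f %f %f\n", t[1].item<double>(), r[1].item<double>(),
              f[1].item<double>());       // -3.5 -3.0 -4.0
  return 0;
}
```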
6 changes: 6 additions & 0 deletions aten/src/ATen/BatchingRegistrations.cpp
@@ -1129,6 +1129,12 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
BINARY_POINTWISE_VA(rsub, Scalar);
BINARY_POINTWISE(mul);
BINARY_POINTWISE(div);
{
using Binop = Tensor (*)(const Tensor&, const Tensor&, std::string);
[Review comment, Collaborator] @rzou would you take a look here?
[Review comment, Collaborator (author)] @mruberry I think you got the wrong user. Was that meant to be @zou3519?
[Review comment, Collaborator] It was, thanks @peterbell10. Darn autocomplete! cc @zou3519
[Review comment, Contributor] this lgtm!
using Unop = Tensor (*)(const Tensor&, Scalar, std::string);
m.impl("div.Tensor_mode", binary_pointwise_batching_rule<Binop, at::div, std::string>);
m.impl("div.Scalar_mode", unwrap_and_call<Unop, at::div, Scalar, std::string>);
}

// at::pow has three out-of-place overloads
m.impl("pow.Tensor_Tensor", binary_pointwise_batching_rule<TensorTensorType, at::pow>);
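For context on the registrations above: pointwise ops need no per-sample logic under vmap, so the new div overloads can reuse the generic `binary_pointwise_batching_rule` and `unwrap_and_call` templates. A rough conceptual sketch of what such a rule reduces to (illustrative only; the real rule also unwraps `BatchedTensorImpl` and realigns batch dimensions):

```cpp
#include <ATen/ATen.h>

// Illustrative sketch: for a pointwise op, once batch dims are treated as
// ordinary leading dims, the op can be applied directly and broadcasting
// does the rest. The parameter names here are placeholders, not vmap API.
at::Tensor div_mode_batching_rule(
    const at::Tensor& self_with_batch_dims_in_front,
    const at::Tensor& other_with_batch_dims_in_front,
    std::string rounding_mode) {
  return at::div(self_with_batch_dims_in_front,
                 other_with_batch_dims_in_front,
                 std::move(rounding_mode));
}
```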
114 changes: 89 additions & 25 deletions aten/src/ATen/native/BinaryOps.cpp
@@ -29,7 +29,9 @@ DEFINE_DISPATCH(add_stub);
DEFINE_DISPATCH(add_clamp_stub);
DEFINE_DISPATCH(sub_stub);
DEFINE_DISPATCH(mul_stub);
DEFINE_DISPATCH(div_stub);
DEFINE_DISPATCH(div_true_stub);
DEFINE_DISPATCH(div_floor_stub);
DEFINE_DISPATCH(div_trunc_stub);
DEFINE_DISPATCH(remainder_stub);
DEFINE_DISPATCH(atan2_stub);
DEFINE_DISPATCH(bitwise_and_stub);
@@ -148,21 +150,45 @@ Tensor& copysign_(Tensor& self, Scalar other) {
return native::copysign_(self, wrapped_scalar_tensor(other));
}

Tensor& div_out(const Tensor& self, const Tensor& other, Tensor& result) {
Tensor& div_true_out(const Tensor& self, const Tensor& other, Tensor& result) {
auto iter = TensorIterator::binary_float_op(result, self, other);
div_stub(iter.device_type(), iter);
div_true_stub(iter.device_type(), iter);
if (!result.defined()) {
result = iter.output();
}
return result;
}

Tensor& div_trunc_out(const Tensor& self, const Tensor& other, Tensor& result) {
auto iter = TensorIterator::binary_op(result, self, other);
div_trunc_stub(iter.device_type(), iter);
if (!result.defined()) {
result = iter.output();
}
return result;
}

Tensor& div_floor_out(const Tensor& self, const Tensor& other, Tensor& result) {
auto iter = TensorIterator::binary_op(result, self, other);
div_floor_stub(iter.device_type(), iter);
if (!result.defined()) {
result = iter.output();
}
return result;
}

Tensor& div_out(const Tensor& self, const Tensor& other, Tensor& result) {
return div_true_out(self, other, result);
}

Tensor div(const Tensor& self, const Tensor& other) {
Tensor result;
auto iter = TensorIterator::binary_float_op(result, self, other);
div_stub(iter.device_type(), iter);
return iter.output();
div_true_out(self, other, result);
return result;
}

Tensor& div_(Tensor& self, const Tensor& other) {
return native::div_out(self, other, self);
return div_true_out(self, other, self);
}

// WARNING: There doesn't appear to be any testing for this function
@@ -179,6 +205,38 @@ Tensor& div_(Tensor& self, Scalar other) {
return self.div_(wrapped_scalar_tensor(other)); // redispatch!
}

Tensor& div_out(const Tensor& self, const Tensor& other, std::string rounding_mode, Tensor& result) {
if (rounding_mode == "true") {
return div_true_out(self, other, result);
} else if (rounding_mode == "trunc") {
return div_trunc_out(self, other, result);
} else if (rounding_mode == "floor") {
return div_floor_out(self, other, result);
}

AT_ERROR("div expected rounding_mode to be one of 'true', 'trunc', or 'floor' "
"but found '", rounding_mode, "'");
}

Tensor div(const Tensor& self, const Tensor& other, std::string rounding_mode) {
Tensor result;
native::div_out(self, other, std::move(rounding_mode), result);
TORCH_INTERNAL_ASSERT(result.defined());
return result;
}

Tensor& div_(Tensor& self, const Tensor& other, std::string rounding_mode) {
return native::div_out(self, other, std::move(rounding_mode), self);
}

Tensor div(const Tensor& self, Scalar other, std::string rounding_mode) {
return self.div(wrapped_scalar_tensor(other), std::move(rounding_mode)); // redispatch!
}

Tensor& div_(Tensor& self, Scalar other, std::string rounding_mode) {
return self.div_(wrapped_scalar_tensor(other), std::move(rounding_mode)); // redispatch!
}

// divide, alias for div
Tensor& divide_out(Tensor& result, const Tensor& self, const Tensor& other) {
return at::div_out(result, self, other);
@@ -200,6 +258,26 @@ Tensor& divide_(Tensor& self, Scalar other) {
return self.div_(other);
}

Tensor& divide_out(Tensor& result, const Tensor& self, const Tensor& other, std::string rounding_mode) {
return at::div_out(result, self, other, std::move(rounding_mode));
}

Tensor divide(const Tensor& self, const Tensor& other, std::string rounding_mode) {
return self.div(other, std::move(rounding_mode));
}

Tensor& divide_(Tensor& self, const Tensor& other, std::string rounding_mode) {
return self.div_(other, std::move(rounding_mode));
}

Tensor divide(const Tensor& self, Scalar other, std::string rounding_mode) {
return self.div(other, std::move(rounding_mode));
}

Tensor& divide_(Tensor& self, Scalar other, std::string rounding_mode) {
return self.div_(other, std::move(rounding_mode));
}

// true_divide, an alias for div
Tensor& true_divide_out(Tensor& result, const Tensor& self, const Tensor& divisor) {
return at::div_out(result, self, divisor);
@@ -239,28 +317,14 @@ Tensor& remainder_(Tensor& self, const Tensor& other) {
}

Tensor& floor_divide_out(Tensor& result, const Tensor& self, const Tensor& other) {
auto iter = TensorIterator::binary_op(result, self, other);
div_stub(iter.device_type(), iter);

if (result.is_floating_point()) {
result.trunc_();
}

return result;
// FIXME: Not actually doing floor division
return div_trunc_out(self, other, result);
}

Tensor floor_divide(const Tensor& self, const Tensor& other) {
Tensor result;
auto iter = TensorIterator::binary_op(result, self, other);

div_stub(iter.device_type(), iter);

auto out = iter.output();
if (out.is_floating_point()) {
out.trunc_();
}

return out;
native::floor_divide_out(result, self, other);
return result;
}

Tensor& floor_divide_(Tensor& self, const Tensor& other) {
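A small usage sketch of the out-variants and aliases added above (assumed call patterns, standalone; not part of the diff):

```cpp
#include <ATen/ATen.h>

int main() {
  at::Tensor a = at::tensor({5, -5});
  at::Tensor b = at::tensor({3, 3});
  at::Tensor out = at::empty_like(a);

  // divide_out forwards to at::div_out with the same rounding_mode.
  at::divide_out(out, a, b, "floor");  // {1, -2}
  at::divide_out(out, a, b, "trunc");  // {1, -1}

  // Anything other than "true", "trunc", or "floor" hits the AT_ERROR
  // path above; e.g. at::div(a, b, "round") would throw.
  return 0;
}
```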
4 changes: 3 additions & 1 deletion aten/src/ATen/native/BinaryOps.h
@@ -37,7 +37,9 @@ DECLARE_DISPATCH(structured_binary_fn_alpha, add_stub);
DECLARE_DISPATCH(binary_clamp_fn_alpha, add_clamp_stub);
DECLARE_DISPATCH(binary_fn_alpha, sub_stub);
DECLARE_DISPATCH(binary_fn, mul_stub);
DECLARE_DISPATCH(binary_fn, div_stub);
DECLARE_DISPATCH(binary_fn, div_true_stub);
DECLARE_DISPATCH(binary_fn, div_floor_stub);
DECLARE_DISPATCH(binary_fn, div_trunc_stub);
DECLARE_DISPATCH(binary_fn, remainder_stub);
DECLARE_DISPATCH(binary_fn, atan2_stub);
DECLARE_DISPATCH(binary_fn, bitwise_and_stub);
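For readers unfamiliar with the stub machinery in this header, a minimal analog of the DECLARE_DISPATCH / REGISTER_DISPATCH pattern (simplified sketch; ATen's real stubs also key on device type and CPU capability):

```cpp
#include <cstdio>

// Minimal analog: a per-op function-pointer slot, declared centrally and
// filled in by the backend kernel file at static-initialization time.
struct TensorIterator { /* opaque in this sketch */ };
using binary_fn = void (*)(TensorIterator&);

struct DispatchStub {
  binary_fn cpu_impl = nullptr;
  void operator()(TensorIterator& iter) const { cpu_impl(iter); }
};

DispatchStub div_floor_stub;  // DECLARE_DISPATCH + DEFINE_DISPATCH analog

// This would live in the CPU kernel translation unit:
void div_floor_kernel(TensorIterator&) { std::printf("floor kernel\n"); }

struct RegisterDivFloor {  // REGISTER_DISPATCH analog
  RegisterDivFloor() { div_floor_stub.cpu_impl = &div_floor_kernel; }
} register_div_floor;

int main() {
  TensorIterator iter;
  div_floor_stub(iter);  // dispatches to the registered kernel
  return 0;
}
```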
68 changes: 61 additions & 7 deletions aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@@ -92,24 +92,76 @@ void mul_kernel(TensorIterator& iter) {
}
}

void div_kernel(TensorIterator& iter) {
if (isIntegralType(iter.dtype(), /*includeBool*/ false)) {
void div_true_kernel(TensorIterator& iter) {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "div_true_cpu", [&]() {
cpu_kernel_vec(iter,
[](scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t {
return a / b;
},
[](Vec256<scalar_t> a, Vec256<scalar_t> b) {
return a / b;
});
});
}

void div_trunc_kernel(TensorIterator& iter) {
auto dtype = iter.common_dtype();
if (isIntegralType(dtype, /*includeBool*/ false)) {
// There's no SIMD integer division, so don't try to vectorize it.
// TODO: if the divisor is a scalar, rewrite as multiplication by a constant.
AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "div_cpu", [&]() {
AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "div_trunc_cpu", [&]() {
cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t {
TORCH_CHECK(b != 0, "ZeroDivisionError");
return a / b;
});
});
} else {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "div_cpu", [&]() {
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, dtype, "div_trunc_cpu", [&]() {
cpu_kernel_vec(iter,
[](scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t {
return std::trunc(a / b);
},
[](Vec256<scalar_t> a, Vec256<scalar_t> b) {
return (a / b).trunc();
});
});
}
}

void div_floor_kernel(TensorIterator& iter) {
const auto dtype = iter.common_dtype();
if (dtype == kByte) {
// In the special case of unsigned integer division, floor division is
// equivalent to truncation division (since the signs of the divisor and
// dividend are always the same)
return div_trunc_kernel(iter);
} else if (isIntegralType(dtype, /*includeBool*/ false)) {
// There's no SIMD integer division, so don't try to vectorize it.
// TODO: if the divisor is a scalar, rewrite as multiplication by a constant.
AT_DISPATCH_INTEGRAL_TYPES(iter.common_dtype(), "div_floor_cpu", [&]() {
[Review comment, Collaborator] This is inconsistent between using dtype and iter.common_dtype().
[Review comment, Collaborator (author)] I have removed all uses of iter.dtype(). If instead you meant the variable dtype, then I would note that it's assigned from iter.common_dtype() above. Just a bit less to type.
[Review comment, Collaborator] I realize the value is the same, just for readability the code might want to stick to either dtype or iter.common_dtype(). No big deal either way.

cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t {

TORCH_CHECK(b != 0, "ZeroDivisionError");
if ((a < 0) != (b < 0)) {
// Subtracts one from the results of truncation division if the
// divisor and dividend have different sign(bit)s and the remainder of
// the division is nonzero
const auto quot = a / b;
const auto rem = a % b;
return rem ? quot - 1 : quot;
}

return a / b;
});
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "div_floor_cpu", [&]() {
[Review comment, Collaborator] Same dtype vs iter.common_dtype here, too.

cpu_kernel_vec(iter,
[](scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t {
return a / b;
return std::floor(a / b);
},
[](Vec256<scalar_t> a, Vec256<scalar_t> b) {
return a / b;
return (a / b).floor();
});
});
}
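A standalone check of the floored-division correction implemented above (plain C++ sketch, not part of the diff):

```cpp
#include <cassert>
#include <cstdio>

// C++ integer division truncates toward zero, so when the operands have
// different signs and the remainder is nonzero, subtract one, exactly as
// the integer branch of div_floor_kernel does.
long long floor_div(long long a, long long b) {
  const long long quot = a / b;
  const long long rem = a % b;
  return ((a < 0) != (b < 0)) && rem != 0 ? quot - 1 : quot;
}

int main() {
  assert(floor_div(7, 2) == 3);    // signs match: same as truncation
  assert(floor_div(-7, 2) == -4);  // truncation gives -3; corrected to -4
  assert(floor_div(7, -2) == -4);  // floor(-3.5) = -4
  assert(floor_div(-6, 2) == -3);  // exact division: no correction
  // For unsigned types (the kByte fast path above), signs always match,
  // so floor division coincides with truncation.
  std::printf("all checks passed\n");
  return 0;
}
```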
@@ -838,7 +890,9 @@ REGISTER_DISPATCH(add_stub, &add_kernel);
REGISTER_DISPATCH(add_clamp_stub, &add_clamp_kernel);
REGISTER_DISPATCH(sub_stub, &sub_kernel);
REGISTER_DISPATCH(mul_stub, &mul_kernel);
REGISTER_DISPATCH(div_stub, &div_kernel);
REGISTER_DISPATCH(div_true_stub, &div_true_kernel);
REGISTER_DISPATCH(div_trunc_stub, &div_trunc_kernel);
REGISTER_DISPATCH(div_floor_stub, &div_floor_kernel);
REGISTER_DISPATCH(remainder_stub, &remainder_kernel);
REGISTER_DISPATCH(atan2_stub, &atan2_kernel);
REGISTER_DISPATCH(bitwise_and_stub, &bitwise_and_kernel);
86 changes: 81 additions & 5 deletions aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
@@ -44,26 +44,100 @@ struct MulFunctor<bool> {
};


void div_kernel_cuda(TensorIterator& iter) {
if (!isIntegralType(iter.common_dtype(), /*includeBool*/ false) && iter.is_cpu_scalar(2)) {
void div_true_kernel_cuda(TensorIterator& iter) {
if (iter.is_cpu_scalar(2)) {
// optimization for floating-point types: if the second operand is a CPU
// scalar, compute a * reciprocal(b). Note that this may lose one bit of
// precision compared to computing the division.
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_cuda", [&]() {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_true_cuda", [&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
auto inv_b = accscalar_t(1.0) / iter.scalar_value<accscalar_t>(2);
iter.remove_operand(2);
MulScalarFunctor<scalar_t, decltype(inv_b)> f(inv_b);
gpu_kernel(iter, f);
});
} else {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_cuda", [&]() {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_true_cuda", [&]() {
DivFunctor<scalar_t> f;
gpu_kernel_with_scalars(iter, f);
});
}
}

void div_trunc_kernel_cuda(TensorIterator& iter) {
auto dtype = iter.common_dtype();
if (isIntegralType(dtype, /*includeBool*/ false)) {
AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_trunc_cuda", [&]() {
gpu_kernel_with_scalars(iter, [] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t {
return a / b;
});
});
} else if (iter.is_cpu_scalar(2)) {
// optimization for floating-point types: if the second operand is a CPU
// scalar, compute a * reciprocal(b). Note that this may lose one bit of
// precision compared to computing the division.
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, dtype, "div_trunc_cuda", [&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
auto inv_b = accscalar_t(1.0) / iter.scalar_value<accscalar_t>(2);
iter.remove_operand(2);
gpu_kernel(iter, [inv_b] GPU_LAMBDA (scalar_t a) -> scalar_t {
return std::trunc(a * inv_b);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, dtype, "div_trunc_cuda", [&]() {
gpu_kernel_with_scalars(iter, [] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t {
return std::trunc(a / b);
});
});
}
}

void div_floor_kernel_cuda(TensorIterator& iter) {
const auto dtype = iter.common_dtype();
if (dtype == kByte) {
// In the special case of unsigned integer division, floor division is
// equivalent to truncation division (since the signs of the divisor and
// dividend are always the same)
return div_trunc_kernel_cuda(iter);
} else if (isIntegralType(dtype, /*includeBool*/ false)) {
// There's no SIMD integer division, so don't try to vectorize it.
// TODO: if the divisor is a scalar, rewrite as multiplication by a constant.
AT_DISPATCH_INTEGRAL_TYPES(iter.common_dtype(), "div_floor_cuda", [&]() {
gpu_kernel_with_scalars(iter, [] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t {
if ((a < 0) != (b < 0)) {
// Subtracts one from the results of truncation division if the
// divisor and dividend have different sign(bit)s and the remainder of
// the division is nonzero
const auto quot = a / b;
const auto rem = a % b;
return rem ? quot - 1 : quot;
}

return a / b;
});
});
} else if (iter.is_cpu_scalar(2)) {
// optimization for floating-point types: if the second operand is a CPU
// scalar, compute a * reciprocal(b). Note that this may lose one bit of
// precision compared to computing the division.
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, dtype, "div_floor_cuda", [&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
auto inv_b = accscalar_t(1.0) / iter.scalar_value<accscalar_t>(2);
iter.remove_operand(2);
gpu_kernel(iter, [inv_b] GPU_LAMBDA (scalar_t a) -> scalar_t {
return std::floor(a * inv_b);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, dtype, "div_floor_cuda", [&]() {
gpu_kernel_with_scalars(iter, [] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t {
return std::floor(a / b);
});
});
}
}

void mul_kernel_cuda(TensorIterator& iter) {
if (!isIntegralType(iter.common_dtype(), /*includeBool*/ true) &&
(iter.is_cpu_scalar(1) || iter.is_cpu_scalar(2))) {
@@ -86,7 +160,9 @@ void mul_kernel_cuda(TensorIterator& iter) {
}
}

REGISTER_DISPATCH(div_stub, &div_kernel_cuda);
REGISTER_DISPATCH(div_true_stub, &div_true_kernel_cuda);
REGISTER_DISPATCH(div_trunc_stub, &div_trunc_kernel_cuda);
REGISTER_DISPATCH(div_floor_stub, &div_floor_kernel_cuda);
REGISTER_DISPATCH(mul_stub, &mul_kernel_cuda);

}} // namespace at::native
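As a footnote on the a * (1/b) optimization used in the CUDA kernels above: because the reciprocal is itself rounded before the multiply, the product can differ from the directly rounded quotient by one ulp. A small host-side sketch of the effect (illustrative only; not part of the diff):

```cpp
#include <cstdio>

int main() {
  // Scan a small range for a pair where a / b and a * (1.0f / b) round
  // differently. Whether such a pair exists in this range is not
  // guaranteed; the scan is purely illustrative.
  for (int ai = 1; ai <= 1000; ++ai) {
    for (int bi = 1; bi <= 1000; ++bi) {
      const float a = static_cast<float>(ai);
      const float b = static_cast<float>(bi);
      if (a / b != a * (1.0f / b)) {
        std::printf("%d / %d: direct=%.9g, via reciprocal=%.9g\n",
                    ai, bi, a / b, a * (1.0f / b));
        return 0;
      }
    }
  }
  std::printf("no one-ulp discrepancy found in scanned range\n");
  return 0;
}
```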