Implement fast path for CPU scalars / number literals (#29915)
Summary:
The main changes in this PR are:
- Skip device dispatch for CPU scalars (number literals also fall into this category). In most cases scalars should stay on the CPU for the best performance, but if a user explicitly places one on another device, we respect that setting and exit the fast path (a short sketch after this list illustrates both cases).
- Directly manipulate the Tensor's data_ptr when filling a scalar into a 1-element tensor.
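
To make the first bullet concrete, here is a minimal sketch (not part of this commit; it assumes a C++ program built against libtorch with the headers shown) of how `scalar_to_tensor` chooses between the two paths:

```cpp
#include <ATen/ATen.h>
#include <ATen/ScalarOps.h>

int main() {
  // Default device is kCPU: takes the new fast track and builds the 0-dim
  // tensor directly via at::native::scalar_tensor, skipping device dispatch.
  at::Tensor cpu_scalar = c10::scalar_to_tensor(2.0);

  // Explicitly requesting another device is respected: the fast track is
  // skipped and the regular dispatch path is taken (needs a CUDA build).
  // at::Tensor cuda_scalar = c10::scalar_to_tensor(2.0, at::kCUDA);

  return 0;
}
```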

Some perf benchmark numbers:
```
## Before
In [4]: def test(x):
   ...:     x = x + 2
   ...:     return x
   ...:

In [5]: with torch.no_grad():
   ...:     x = torch.ones(100)
   ...:     %timeit {test(x)}
   ...:
79.8 µs ± 127 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

## After
In [2]: def test(x):
   ...:     x = x + 2
   ...:     return x
   ...:

In [3]: with torch.no_grad():
   ...:     x = torch.ones(100)
   ...:     %timeit {test(x)}
   ...:
60.5 µs ± 334 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
```

Before the patch, `tensor_slow` accounted for 15.74% of the total time.
<img width="1186" alt="Screen Shot 2019-11-15 at 12 49 51 PM" src="https://user-images.githubusercontent.com/5248122/68976895-cc808c00-07ab-11ea-8f3c-7f15597d12cf.png">
After the patch, `tensor_slow` accounts for 3.84% of the total time.
<img width="1190" alt="Screen Shot 2019-11-15 at 1 13 03 PM" src="https://user-images.githubusercontent.com/5248122/68976925-e28e4c80-07ab-11ea-94c0-91172fc3bb53.png">

cc roosephu, who originally reported this issue to me.
Pull Request resolved: #29915

Differential Revision: D18584251

Pulled By: ailzhang

fbshipit-source-id: 2353c8012450a81872e1e09717b3b181362be401
Ailing Zhang authored and facebook-github-bot committed Nov 19, 2019
1 parent e88d096 commit 2b02d15
Showing 3 changed files with 43 additions and 0 deletions.
aten/src/ATen/ScalarOps.h: 13 additions & 0 deletions
@@ -10,6 +10,19 @@ namespace c10 {
// FIXME: this should be (and was) Scalar::toTensor, but there is currently no way
// to implement this without going through Derived Types (which are not part of core).
inline at::Tensor scalar_to_tensor(Scalar s, const Device device = at::kCPU) {
  // This is the fast track we have for CPU scalar tensors.
  if (device == at::kCPU) {
    if (s.isFloatingPoint()) {
      return at::native::scalar_tensor(s, at::device(at::kCPU).dtype(at::kDouble));
    } else if (s.isBoolean()) {
      return at::native::scalar_tensor(s, at::device(at::kCPU).dtype(at::kBool));
    } else if (s.isComplex()) {
      return at::native::scalar_tensor(s, at::device(at::kCPU).dtype(at::kComplexDouble));
    } else {
      AT_ASSERT(s.isIntegral(false));
      return at::native::scalar_tensor(s, at::device(at::kCPU).dtype(at::kLong));
    }
  }
  if (s.isFloatingPoint()) {
    return at::scalar_tensor(s, at::device(device).dtype(at::kDouble));
  } else if (s.isBoolean()) {
aten/src/ATen/native/Fill.cpp: 18 additions & 0 deletions
@@ -9,8 +9,26 @@ namespace at {
namespace native {

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fill ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
namespace {
template <typename scalar_t>
inline void fill_fast(Tensor& self, Scalar value_scalar) {
  auto value = value_scalar.to<scalar_t>();
  scalar_t * dptr = static_cast<scalar_t *>(self.data_ptr());
  *dptr = value;
}
} // namespace

Tensor& fill_out(Tensor& self, Scalar value) {
  // When filling a 1-element CPU tensor with a number, we want to skip
  // everything else and manipulate the data ptr directly.
  // Ideally this fast path should be implemented in TensorIterator,
  // but we also want to skip compute_types, which is not avoidable
  // in TensorIterator for now.
  if (self.device() == at::kCPU && self.numel() == 1 && !value.isComplex()) {
    AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "fill_out", [&]() {
      fill_fast<scalar_t>(self, value);});
    return self;
  }
  auto iter = TensorIterator::nullary_op(self);
  fill_stub(iter.device_type(), iter, value);
  return self;
aten/src/ATen/native/TensorFactories.cpp: 12 additions & 0 deletions
@@ -415,6 +415,18 @@ Tensor ones_like(
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ scalar_tensor ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Tensor scalar_tensor(Scalar s, const TensorOptions& options) {
  if (options.device() == at::kCPU) {
    // This is a fast track to skip device dispatch for making a scalar tensor on CPU.
    // See https://github.com/pytorch/pytorch/pull/29915 for a more detailed perf
    // comparison.
    // In the future, when we remove the overhead of device dispatch, we'll happily
    // revert this to the following:
    //   auto result = at::empty({}, options);
    at::AutoNonVariableTypeMode non_var_type_mode(true);
    auto result = empty_cpu({}, options);
    at::native::fill_(result, s);
    return result;
  }
  return at::empty({}, options).fill_(s);
}
