pytorch · malfet · Nov 2, 2022 · Nov 3, 2022 · kulinseth · Nov 3, 2022
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -7,6 +7,13 @@
 #include <ATen/native/mps/OperationUtils.h>
 #include <torch/library.h>
 
+// TODO: Remove me when moved to MacOS 13
+@interface MPSGraph (VenturaOps)
+- (MPSGraphTensor *)cumulativeSumWithTensor:(MPSGraphTensor *)tensor
+                                       axis:(NSInteger)axis
+                                       name:(NSString *)name;
+@end
+
 namespace at {
 namespace native {
 namespace mps {
@@ -30,7 +37,7 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
   }
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
   @autoreleasepool {
-    string key = op_name + getTensorsStringKey({self}, /*use_scalar_value*/ false);
+    string key = op_name + getTensorsStringKey({self, output}, /*use_scalar_value*/ false);
     auto cachedGraph = cache_->LookUpAs<MPSUnaryCachedGraph>(key);
 
     if(!cachedGraph) {
@@ -263,5 +270,42 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
                 });
 }
 
+
+static bool mpsSupportsCumsum() {
+  id mpsCD = NSClassFromString(@"MPSGraph");
+  return [mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == YES;
+}
+
+
+TORCH_IMPL_FUNC(cumsum_out_mps)
+(const Tensor& self,
+ int64_t dim,
+ c10::optional<ScalarType> dtype,
+ const Tensor& result) {
+  TORCH_CHECK(dim >=0 && dim < std::max(1LL, self.ndimension()), "Expected dim to be between 0 and ", self.ndimension(), " but got ", dim);
+  if (!mpsSupportsCumsum()) {
+    TORCH_WARN_ONCE("torch.cumsum supported by MPS on MacOS 13+, please upgrade");
+    auto cpu_result = self.to(at::Device(kCPU)).cumsum(dim, dtype);
+    at::_copy_from_and_resize(cpu_result, result);
+    return;
+  }
+  auto input = dtype.has_value() ? self.to(dtype.value()) : self;
+  mps::unary_op(input, result, "cumsum_out_mp" + std::to_string(dim),
+                ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
+       // cumsum is horribly broken for int8, int16 and as chances for overflow is pretty high, cast to int32
+       if (isIntegralType(input.scalar_type()) && input.scalar_type() !=ScalarType::Int) {
+           inputTensor = mps::castMPSTensor(mpsGraph, inputTensor, result.scalar_type());
+       }
+       auto rc = [mpsGraph cumulativeSumWithTensor: inputTensor
+                                              axis: dim
+                                              name: nil];
+       if (result.scalar_type()!= input.scalar_type() ||
+           (isIntegralType(input.scalar_type()) && input.scalar_type() !=ScalarType::Int)) {
+         return mps::castMPSTensor(mpsGraph, rc, result.scalar_type());
+       }
+       return rc;
+    });
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
@@ -1776,6 +1776,7 @@
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: cumsum_out
+    MPS: cumsum_out_mps
 
 - func: cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
   device_check: NoCheck   # TensorIterator

diff --git a/test/test_mps.py b/test/test_mps.py
@@ -6592,8 +6592,7 @@ def test_index_put_accumulate_duplicate_indices(self, device="mps"):
             # lots of duplicates interleaved with each other
             delta = torch.empty(i, dtype=torch.float32, device=device).uniform_(-1, 1)
 
-            # cumsum not supported on 'mps', fallback on 'cpu'
-            indices = delta.cpu().cumsum(0).long().to("mps")
+            indices = delta.cumsum(0).long().to("mps")
 
             # abs for int64 is not supported on mps, fallback on 'cpu' to calculate it
             input = torch.randn(indices.cpu().abs().max().to("mps") + 1, device=device)
@@ -7220,6 +7219,7 @@ class TestConsistency(TestCase):
         'cos': ['f32', 'i16', 'i32', 'u8', 'i64'],
         'cosh': ['f32', 'i16', 'i32', 'u8', 'i64'],
         'cov': ['f32'],
+        'cumsum': ['f16', 'f32', 'int16', 'int32'],
         'deg2rad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'diag': ['f32', 'i32'],
         'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
@@ -7447,6 +7447,7 @@ class TestConsistency(TestCase):
         'corrcoef': ['f32'],
         'cos': ['f32'],
         'cosh': ['f32'],
+        'cumsum': ['f16', 'f32'],
         'deg2rad': ['f16', 'f32'],
         'diag': ['f32'],
         'diag_embed': ['f16', 'f32'],