AOTAutograd: avoid intermediate_base logic when all aliased outputs came from a multi_output_view #111411

Closed · wants to merge 5 commits
Changes from 2 commits
2 changes: 1 addition & 1 deletion aten/src/ATen/FunctionalStorageImpl.cpp
@@ -10,7 +10,7 @@ namespace at::functionalization {

ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
if (out_idx == this->out_index) return *this;
return ViewMeta(forward_fn, reverse_fn, out_idx);
return ViewMeta(forward_fn, reverse_fn, is_multi_output, out_idx);
}

// Note [Functionalization: Alias Removal Part 2]
7 changes: 6 additions & 1 deletion aten/src/ATen/FunctionalStorageImpl.h
@@ -31,16 +31,21 @@ struct ViewMeta {
ViewMeta(
std::function<Tensor(const Tensor&, int64_t)> forward,
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
bool is_multi_output = false,
int64_t out_idx = 0)
: forward_fn(std::move(forward)),
reverse_fn(std::move(reverse)),
out_index(out_idx) {}
out_index(out_idx),
is_multi_output(is_multi_output) {}

std::function<Tensor(const Tensor&, int64_t)> forward_fn;
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
// See Note [out_idx in ViewMeta]
int64_t out_index;

// Tells us if this is a multi-output view
bool is_multi_output;

// Returns a copy of the current ViewMeta, if out_idx matches the current
// out_index. Otherwise, returns a new ViewMeta with the same forward/reverse
// functions, but a new out index.
3 changes: 2 additions & 1 deletion aten/src/ATen/FunctionalTensorWrapper.cpp
@@ -135,7 +135,8 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
view_value.dtype(),
view_value.device()
),
value_(view_value)
value_(view_value),
is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output)
{
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_));
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
5 changes: 5 additions & 0 deletions aten/src/ATen/FunctionalTensorWrapper.h
@@ -126,6 +126,10 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
// replace_() swaps out the wrapped tensor, value_, with tmp.
void replace_(const Tensor& other);

bool is_multi_output_view() {
return is_multi_output_view_;
}

// See Note[resize_() in functionalization pass]
void maybe_replace_storage(const Tensor& other);

@@ -173,6 +177,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
Tensor value_;
int64_t level_;
bool has_metadata_mutation_ = false;
bool is_multi_output_view_ = false;

size_t generation_ = 0;
std::vector<at::functionalization::ViewMeta> view_metas_;
104 changes: 104 additions & 0 deletions test/functorch/test_aotdispatch.py
@@ -825,6 +825,110 @@ def forward(self, primals_1):
view = torch.ops.aten.view.default(mul, [-1]); mul = None
return [view]""")

def test_output_aliases_intermediate_multi_output_view(self):
# All aliased outs are from multi-output views, so AOTAutograd will hide the aliasing from autograd.
def f1(a):
out = torch.mul(a, 3)
return list(out.unbind(0))

inp = torch.ones(3, 3, requires_grad=True)
inp_ref = torch.ones(3, 3, requires_grad=True)
f1_compiled = aot_function(f1, nop)

out_ref = f1(inp_ref)
out_test = f1_compiled(inp)
# Assert that we get CompiledFunctionBackward in the backward graph,
# and not AsStridedBackward. No view-regeneration is necessary for this multi-output view case.
# See Note: [AOTAutograd: differentiable outputs that alias each other from a multi-output view call]
self.assertTrue(all('CompiledFunctionBackward' in str(o.grad_fn) for o in out_test))

sum(out_ref).sum().backward()
sum(out_test).sum().backward()
self.assertEqual(inp_ref.grad, inp.grad)

# All aliased outs but one are from multi-output views, so AOTAutograd will hide the aliasing from autograd.
def f2(a):
out = torch.mul(a, 3)
return *list(out.unbind(0)), out

inp = torch.ones(3, 3, requires_grad=True)
inp_ref = torch.ones(3, 3, requires_grad=True)
f2_compiled = aot_function(f2, nop)

out_ref = f2(inp_ref)
out_test = f2_compiled(inp)
# Assert that we get CompiledFunctionBackward in the backward graph,
# and not AsStridedBackward. No view-regeneration is necessary for this multi-output view case.
# See Note: [AOTAutograd: differentiable outputs that alias each other from a multi-output view call]
self.assertTrue(all('CompiledFunctionBackward' in str(o.grad_fn) for o in out_test))

# The last output is not from a multi-output view, so autograd will let us mutate it.
out_ref[-1].mul_(2)
out_test[-1].mul_(2)
out_ref[-1].sum().backward()
out_test[-1].sum().backward()
self.assertEqual(inp_ref.grad, inp.grad)

# All aliased outs but one are from multi-output views, so AOTAutograd will hide the aliasing from autograd.
def f3(a):
out = torch.mul(a, 3)
return *list(out.unbind(0)), out.view(out.shape)

inp = torch.ones(3, 3, requires_grad=True)
inp_ref = torch.ones(3, 3, requires_grad=True)
f3_compiled = aot_function(f3, nop)

out_ref = f3(inp_ref)
out_test = f3_compiled(inp)
# Assert that we get CompiledFunctionBackward in the backward graph,
# and not AsStridedBackward. No view-regeneration is necessary for this multi-output view case.
# See Note: [AOTAutograd: differentiable outputs that alias each other from a multi-output view call]
self.assertTrue(all('CompiledFunctionBackward' in str(o.grad_fn) for o in out_test))

# The last output is not from a multi-output view, so autograd will let us mutate it.
out_ref[-1].mul_(2)
out_test[-1].mul_(2)
out_ref[-1].sum().backward()
out_test[-1].sum().backward()
self.assertEqual(inp_ref.grad, inp.grad)

# There are 5 outputs that all alias each other.
# 3 of them come from multi-output views, but the other 2 are "ordinary" aliases.
# Therefore, AOTAutograd will not attempt the multi-output-view optimization,
# and apply the intermediate_base logic to all aliases.
# (In theory we could probably get AOTAutograd to only apply the intermediate base
# logic to the last 2 outputs and not the first 3. We should probably
# just do the graph partitioning defined in this doc instead though).
# https://docs.google.com/document/d/1DlfFq8TKbuAn2zyJxLfoW-X1qkkm5PLdHFtySo03QAk/edit
def f4(a):
out = torch.mul(a, 3)
# also return the graph intermediate directly,
# which will force AOTAutograd to do the "intermediate base" logic.
# (Why? The user can mutate "out", which should change the autograd metadata
# of the other aliased outputs)
return *list(out.unbind(0)), out, out.view(out.shape)

inp = torch.ones(3, 3, requires_grad=True)
inp_ref = torch.ones(3, 3, requires_grad=True)
f4_compiled = aot_function(f4, nop)

out_ref = f4(inp_ref)
out_test = f4_compiled(inp)
# Mutate the last output of f4 (autograd will allow this, since it is not a multi-output view,
# as long as *only* the non-multi-output views participate in the backward)
# Note: We could probably try to hide **only** the multi-output views from autograd here
# and only do the intermediate base logic for the last two aliases.
# Longer term solution of graph partitioning is probably cleaner though (see the note).
out_ref[-1].mul_(2)
out_test[-1].mul_(2)

out_ref_sum = out_ref[-1] + out_ref[-2]
out_test_sum = out_test[-1] + out_test[-2]
out_ref_sum.sum().backward()
out_test_sum.sum().backward()
self.assertEqual(inp_ref.grad, inp.grad)


def test_output_aliases_intermediate_mutation_linear(self):
def f(x):
return (x + 1).view(-1)
78 changes: 77 additions & 1 deletion torch/_functorch/aot_autograd.py
@@ -1078,11 +1078,83 @@ def inner(*flat_args):

# Keep track of which outputs alias other outputs
out_tensor_alias_counts = collections.defaultdict(int)
# This tells us, for a given group of outputs that alias each other,
# whether they e.g. all came from an unbind call
num_aliased_tensors_that_are_multi_output_views = collections.defaultdict(int)
out_storage_to_tensors = collections.defaultdict(set)
for o in flat_f_outs:
if isinstance(o, torch.Tensor):
curr_storage = StorageWeakRef(o.untyped_storage())
out_tensor_alias_counts[curr_storage] += 1
# Note: [AOTAutograd: differentiable outputs that alias each other from a multi-output view call]
# This is an optimization on top of the "alias of intermediates" logic,
# which you can read more about under Note [AOT Autograd: outputs aliasing inputs or intermediates!]
#
# What is this optimization? Consider the below case:
# def f(x):
# intermediate = x.mul(2)
# # x and intermediate here require grad
# o1, o2, ... o10 = intermediate.unbind(-1)
# return intermediate, o1, o2, ... o10
# Now, the "intermediate base" handling in AOTAutograd implies that we must do the following:
# (1) return "intermediate as an extra output of the compiled graph
# (2) regenerate each aliased output off of "intermediate", **outside** of the autograd.Function.
# The reason AOTAutograd ordinarily does this is for safety: the autograd engine needs to know
# that o1 through o10 are all aliased, and if we blindly return o1 through o10 from the autograd.Function,
# this information will be hidden.
# In particular, mutating one alias might require autograd to update autograd metadata on the other aliases
# (like their grad_fn, for example, when the autograd engine needs to do view-replay).
#
# However, intermediate_base logic can be bad for backward performance (we sometimes generate
# as_strided calls during the intermediate base logic, which can have a slow backward formula).
# Is it possible to find a set of conditions where it is **safe** to hide the output aliasing from autograd?
#
# For a set of outputs of the graph that alias each other, o_1...o_k, consider:
# (1) They came from the same multi-output view op, e.g. o_1, ..., o_k = intermediate.unbind(0)
bdhirsh marked this conversation as resolved.
# (2) If there are any other aliases of o_1 through o_k (in the example above, intermediate),
# **at most** 1 can escape from the graph (e.g. there is not some other graph input/output
Contributor

sorry, is this at most one can escape, ON TOP of the original multi-output view op's outputs o_1, ... ?

Contributor Author

yes - if I have K multi-output view aliases of some tensor (say they all came from an unbind call), then this comment is saying that at most K+1 aliases are allowed to escape the graph
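
A minimal sketch of that shape, just for concreteness (the function and shapes here are illustrative, not taken from this PR):

import torch

def f(a):
    intermediate = a.mul(2)              # requires grad
    o1, o2, o3 = intermediate.unbind(0)  # K = 3 multi-output view aliases
    # K + 1 = 4 aliases of the same storage escape the graph:
    # the three unbind outputs plus the intermediate itself.
    return intermediate, o1, o2, o3

outs = f(torch.ones(3, 3, requires_grad=True))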

# o_other, that aliases these outputs)
Contributor

I don't like this. I think condition (2) is too lax and it is making it hard for me to verify that this optimization is correct.

Here is a graph for which I think it is obviously correct to attach grad_fns to the outputs:

def f(x):
    y = x + 2
    return y.unbind()

Here is a graph where I am not sure:

def f(x):
    y = x + 2
    return y, y.unbind()

In the first graph, no outputs actually aliased each other (the unbind says they alias, but this is because autograd isn't smart enough to know that actually these views are guaranteed to reference disjoint spots of memory). In the second graph, the outputs DO alias, and so we have to make sure that if you mutate y, the grad_fns for y.unbind() get updated.

Maybe there is a way we can get the second case to work, but if the first case is good enough to solve the problem we're facing in prod, I would prefer to do it first, as it is much more obviously correct.

Contributor Author (@bdhirsh, Oct 21, 2023)

Yeah, after I made this PR I realized that the second case also shows up in prod sadly (returning both an intermediate, and its unbinded tensors).

So given that, I think our options are:

(1) convince ourselves that this is safe, even in the case where we return one extra alias (like the intermediate here).

(2) decide that this either violates safety, or this feels too dangerous to allow, and instead just add some one-off logic for the prod case to specifically handle regenerating unbind() at runtime.

Let me know what you think

# (3) o_1...o_k all require_grad, they all share the same ._base, and their ._base requires grad.
# This condition is important because it's what causes slowness in the intermediate_base
# codepath of aot_autograd. Ordinarily, o_1...o_k would all get a grad_fn, and
# aot_autograd's view-replay might give each output an AsStridedBackward as its grad_fn.
# "K" AsStridedBackward calls will be *much* slower than a single UnbindBackward.
# In this setup, is it possible to mutate one of the outputs o_i in a way that would affect the autograd meta
# of the other aliases?
#
# Claim: No! Consider a few examples (which I'm pretty sure cover all cases of mutation w.r.t. autograd):
# (a) What happens if we mutate any of o_1 through o_k directly?
# Autograd raises an error:
# "RuntimeError: Output 0 of UnbindBackward0 is a view and is being modified inplace. This view is
# the output of a function that returns multiple views. Such functions do not allow the output
# views to be modified inplace. You should replace the inplace operation by an out-of-place one."
# (b) What if we take a view of o_k and mutate it, o_k.view(o_k.shape).mul_(2)?
# Autograd raises the same error: the "multi-output-view"-ness of an alias propagates to future views.
# (c) What if we mutate o_k under no_grad?
# Autograd raises the same error
# (d) What if we detach and mutate, e.g. o_k.detach().mul_(2)?
# Autograd allows this, *but* it updates the grad_fn of every alias to be an error function,
# so the same error is raised as soon as any alias's grad_fn is accessed.
# (e) What if we try to mutate another alias of o_1...o_k, that was **not** created from a multi-output view?
# We promised that there is at most **one** such alias, e.g. intermediate in the example above.
# You can mutate intermediate, but in eager mode this will change the grad_fn of o_1...o_k
# to be error functions.
# Since intermediate was the *only* non-multi-output-alias, there are no other aliases
# of `intermediate` around that were produced by the compiled fn and have a valid grad_fn.
#
# Coming back to this optimization:
# Given that it is not possible for mutating one of these aliases to affect the autograd metadata of another alias
# without causing an error in eager mode, we will simply hide the aliasing from autograd during torch.compile
# if all of the above conditions are met.
# This has the slight downside that it's possible to write some "bad" code that autograd will raise an error on
# in eager mode but that torch.compile will fail to raise on; the benefit is that this code gets much better performance.
# NOTE: if and when we eventually update AOTAutograd to do the "view graph slicing" defined here:
# https://docs.google.com/document/d/1DlfFq8TKbuAn2zyJxLfoW-X1qkkm5PLdHFtySo03QAk/edit,
# then this optimization will probably matter less and might be ok to remove.
is_cur_tensor_multi_out_view = isinstance(o, FunctionalTensor) \
and torch._functionalize_is_multi_output_view(o.elem)
if is_cur_tensor_multi_out_view:
num_aliased_tensors_that_are_multi_output_views[curr_storage] += 1
out_storage_to_tensors[curr_storage].add(o)

# maps the id of an intermediate base to its index in the output of the compiled forward
@@ -1125,7 +1197,11 @@ def inner(*flat_args):
and o.requires_grad
and o._base.requires_grad
):
if out_tensor_alias_counts[curr_storage] == 1:
num_aliased_outs = out_tensor_alias_counts[curr_storage]
num_multi_output_view_outs = num_aliased_tensors_that_are_multi_output_views[curr_storage]
num_aliased_outs_that_are_not_multi_output_views = num_aliased_outs - num_multi_output_view_outs
# Note: [AOTAutograd: differentiable outputs that alias each other from a multi-output view call]
if out_tensor_alias_counts[curr_storage] == 1 or num_aliased_outs_that_are_not_multi_output_views <= 1:
Contributor

Can we add a check at line 1184, elif curr_storage in inp_storage_refs:? Sometimes we have multi_output_views that are alias_of_input; they would fall into that branch, and this would speed things up quite a bit.

Contributor Author

I agree that if we're going to apply this "hide multi-output view aliasing from autograd" to aliases of intermediates, we should also apply the same logic to aliases of inputs.

This is especially true because autograd's view-replay will generate equally inefficient backward code for the alias-of-input case, if our aliases are multi-output views (view-replay will always generate an as_strided).

Let me spend some time trying to add it and add better testing for it.
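
A minimal sketch of the alias-of-input case being described (illustrative only, not code from this PR):

import torch

def f(x):
    # Every output is a multi-output view of the graph *input* x, so these
    # hit the alias-of-input path rather than the alias-of-intermediate path
    # that this PR optimizes.
    return x.unbind(0)

outs = f(torch.ones(3, 3, requires_grad=True))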

# Note [Intermediate Bases Optimization]
# Normally if we have an output that aliases an intermediate,
# we need to add the extra "intermediate base" logic further down
26 changes: 26 additions & 0 deletions torch/csrc/autograd/python_torch_functions_manual.cpp
@@ -535,6 +535,27 @@ static PyObject* THPVariable__functionalize_enable_reapply_views(
END_HANDLE_TH_ERRORS
}

static PyObject* THPVariable__functionalize_is_multi_output_view(
PyObject* self,
PyObject* args,
PyObject* kwargs) {
HANDLE_TH_ERRORS
static PythonArgParser parser(
{"_functionalize_is_multi_output_view(Tensor t)"},
/*traceable=*/true);
ParsedArgs<1> parsed_args;
auto r = parser.parse(args, kwargs, parsed_args);
auto t = r.tensor(0);
TORCH_CHECK(at::functionalization::impl::isFunctionalTensor(t));
auto t_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(t);
if (t_impl->is_multi_output_view()) {
Py_RETURN_TRUE;
} else {
Py_RETURN_FALSE;
}
END_HANDLE_TH_ERRORS
}

static PyObject* THPVariable__disable_functionalization(
PyObject* self,
PyObject* args,
@@ -664,6 +685,11 @@ static PyMethodDef torch_functions_manual[] = {
THPVariable__functionalize_has_metadata_mutation),
METH_VARARGS | METH_KEYWORDS | METH_STATIC,
nullptr},
{"_functionalize_is_multi_output_view",
castPyCFunctionWithKeywords(
THPVariable__functionalize_is_multi_output_view),
METH_VARARGS | METH_KEYWORDS | METH_STATIC,
nullptr},
{"_functionalize_enable_reapply_views",
castPyCFunctionWithKeywords(
THPVariable__functionalize_enable_reapply_views),
4 changes: 3 additions & 1 deletion torchgen/gen_functionalization_type.py
@@ -377,6 +377,7 @@ def emit_view_functionalization_body(
"""

else:
is_multi_output_view = isinstance(f.func.returns[0].type, ListType)
Contributor

nyeh, I think you have to be more selective about this. If hypothetically you have a multi-output view where the outputs alias each other, e.g., some sort of sliding window + unbind, then it wouldn't be right to do the logic you've introduced here, because if you did a mutation it would be necessary to error to prevent incorrect gradients from the other rules

Contributor Author

because if you did a mutation it would be necessary to error to prevent incorrect gradients from the other rules

I agree that this is more complicated in the overlapping case (Alban pointed out that even the unbind case can be overlapping, e.g. torch.ones(4).expand(10, 4).unbind(0)).

I would agree with this, but my thought was that autograd should never let this happen, because it doesn't allow you to change the grad_fn of multi-output views (it raises an error, or replaces the multi-output-view grad_fn with an error grad_fn).

I agree this does feel hairy though, so lmk if you think this isn't water-tight / we're risking some correctness issues slipping through.
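
To make the overlapping-unbind example concrete (a small sketch, not part of this PR's diff):

import torch

base = torch.ones(4).expand(10, 4)  # all 10 rows share the same 4 elements
views = base.unbind(0)              # 10 multi-output-view outputs
# Every output aliases the exact same storage location, so they fully overlap.
assert all(v.data_ptr() == views[0].data_ptr() for v in views)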

Contributor

You're basically saying that a user will never pass torch.compile a program that doesn't run in eager mode. If this is true, I agree that the user cannot have written a naughty program. But we definitely have users passing malformed programs to torch.compile without having checked the eager version first...
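
For reference, a sketch of the eager-mode behavior the claim leans on (the error text is the one quoted in the aot_autograd.py note above; whether the compiled program also raises depends on this optimization):

import torch

x = torch.ones(3, 3, requires_grad=True)
o1, o2, o3 = x.mul(2).unbind(0)
try:
    o1.mul_(2)  # in-place mutation of a multi-output view's output
except RuntimeError as e:
    # Eager autograd raises: "Output 0 of UnbindBackward0 is a view and is
    # being modified inplace. ..." Under torch.compile with the aliasing
    # hidden from autograd, this error may not be raised.
    print(e)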

return f"""
{dispatcher_sig.defn(name=wrapper_name(f.func), is_redispatching_fn=True)} {{
{unwrap_tensor_args_str}
Expand Down Expand Up @@ -415,7 +416,8 @@ def emit_view_functionalization_body(
}},
{reverse_lambda.decl()} {{
return {reverse_lambda.inner_call()}
}}
}},
/*is_multi_output=*/{str(is_multi_output_view).lower()}
);
auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, {view_tensor_name}, view_meta);
// See Note [Propagating strides in the functionalization pass]