[Autograd] Cond Higher-Order Operation #126911

Open
wants to merge 47 commits into base: main
Changes from 10 commits
Commits (47)
cdc9081
Changes required to be able to build PyTorch in conda environment wit…
bohnstingl May 9, 2024
38f8ae3
Merge branch 'main' of github.com:pytorch/pytorch into cond_autograd
bohnstingl May 9, 2024
856abe5
WIP: Cond Autograd
bohnstingl May 9, 2024
4ff9e91
WIP: cond autograd forward
bohnstingl May 10, 2024
e3245b4
Fixed cond autograd
bohnstingl May 21, 2024
0729ac5
Executed 'test_control_flow.py' and 'test_higher_order_ops.py'
bohnstingl May 22, 2024
8a70403
Added additional testcases
bohnstingl May 22, 2024
e83e375
Merge branch 'main' of github.com:pytorch/pytorch into cond_autograd
bohnstingl May 24, 2024
f6cfa96
Fixed import issues
bohnstingl May 24, 2024
9617494
Fixed test_cond_vmap_single_input_with_closure testcase
bohnstingl May 24, 2024
cf6f1b1
Introduced lower level function for create_fw_bw_graph
bohnstingl May 25, 2024
40d584c
Updated low-level create_fw_bw_graph function (compatible with while_…
bohnstingl May 25, 2024
6ea5d04
Cleanup
bohnstingl May 25, 2024
34ac604
Added some more test cases with loops
bohnstingl May 26, 2024
908f7f8
Cleaned utils.py
bohnstingl May 27, 2024
25c9262
Fixed lintrunner and some of the cond testcases
bohnstingl May 30, 2024
8e2203f
Updated low-level create_fw_bw_graph function
bohnstingl Jun 1, 2024
630549a
Updated low-level create_fw_bw_graph function
bohnstingl Jun 3, 2024
d2d77eb
Merge branch 'main' of github.com:pytorch/pytorch into cond_autograd
bohnstingl Jul 8, 2024
0af022b
Merge branch 'main' of github.com:pytorch/pytorch into cond_autograd
bohnstingl Jul 9, 2024
016e1d6
Merge branch 'pytorch:main' into cond_autograd
bohnstingl Jul 9, 2024
9d4b4c4
Updated testcases for cond
bohnstingl Jul 9, 2024
a547f55
Merge branch 'cond_autograd' of github.com:bohnstingl/pytorch into co…
bohnstingl Jul 9, 2024
9ae1e77
cond autograd
ydwu4 Jul 12, 2024
9e1388d
Integrated additions from https://github.com/pytorch/pytorch/pull/130607
bohnstingl Jul 12, 2024
e50f0cf
Updated cond.py
bohnstingl Jul 12, 2024
70b2ed2
Updated cpuinfo to most recent version
bohnstingl Jul 12, 2024
15e3ddd
Reverted cpuinfo version
bohnstingl Jul 12, 2024
e3e5636
Updated third_party modules to pytorch:main version
bohnstingl Jul 12, 2024
5698b41
Merge branch 'cond_autograd' of github.com:bohnstingl/pytorch into HEAD
bohnstingl Jul 12, 2024
aa3ef8c
Merge branch 'gh/ydwu4/132/base' of github.com:pytorch/pytorch into c…
bohnstingl Jul 12, 2024
969e79e
Merge branch 'cond_autograd' of github.com:bohnstingl/pytorch into co…
bohnstingl Jul 12, 2024
ddccf10
Update on "[NOT FOR REVIEW] cond autograd."
ydwu4 Jul 12, 2024
6458271
Update base for Update on "[NOT FOR REVIEW] cond autograd."
ydwu4 Jul 13, 2024
4c1567a
Update on "[NOT FOR REVIEW] cond autograd."
ydwu4 Jul 13, 2024
f8f697c
Update base for Update on "[NOT FOR REVIEW] cond autograd."
ydwu4 Jul 13, 2024
78778ad
Update on "[NOT FOR REVIEW] cond autograd."
ydwu4 Jul 13, 2024
3313990
Update base for Update on "[NOT FOR REVIEW] cond autograd."
ydwu4 Jul 13, 2024
d3d3254
Update on "[NOT FOR REVIEW] cond autograd."
ydwu4 Jul 13, 2024
14b3582
Merge branch 'gh/ydwu4/132/head' of github.com:pytorch/pytorch into c…
bohnstingl Jul 13, 2024
a1e98f9
Merge branch 'gh/ydwu4/132/base' of github.com:pytorch/pytorch into c…
bohnstingl Jul 13, 2024
8d03c22
Fixed some more testcases
bohnstingl Jul 14, 2024
d812140
Update base for Update on "[NOT FOR REVIEW] cond autograd."
ydwu4 Jul 15, 2024
e0169c6
Update on "[NOT FOR REVIEW] cond autograd."
ydwu4 Jul 15, 2024
f1e1412
Merge branch 'gh/ydwu4/132/base' of github.com:pytorch/pytorch into c…
bohnstingl Jul 15, 2024
2456978
Merge branch 'gh/ydwu4/132/head' of github.com:pytorch/pytorch into c…
bohnstingl Jul 15, 2024
7328e71
Final cleanup
bohnstingl Jul 16, 2024
2 changes: 1 addition & 1 deletion functorch/experimental/control_flow.py
@@ -5,4 +5,4 @@
_stack_pytree,
_unstack_pytree,
map,
)
)
417 changes: 391 additions & 26 deletions test/functorch/test_control_flow.py

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion torch/__init__.py
@@ -1912,7 +1912,6 @@ def fn(model: Callable):


from torch import export as export
Contributor: We should revert the change to this file.


from torch._higher_order_ops import cond

def _register_device_module(device_type, module):
145 changes: 137 additions & 8 deletions torch/_higher_order_ops/cond.py
@@ -6,6 +6,7 @@
import torch.utils._pytree as pytree

from torch._C import DispatchKey
from torch._dispatch.python import suspend_functionalization
from torch._C._functorch import (
_add_batch_dim,
get_unwrapped,
@@ -19,22 +20,27 @@
_has_potential_branch_input_alias,
_has_potential_branch_input_mutation,
_set_compilation_env,
autograd_not_implemented,
reenter_make_fx,
unique_graph_id,
UnsupportedAliasMutationException,
)

from torch._ops import HigherOrderOperator
from torch._subclasses.fake_tensor import FakeTensorMode
from torch._subclasses.functional_tensor import (
disable_functional_mode,
)
from torch.fx.experimental.proxy_tensor import (
disable_proxy_modes_tracing,
make_fx,
_temp_remove_pre_dispatch_torch_function_mode,
ProxyTorchDispatchMode,
track_tensor_tree,
)
from torch.fx.passes.shape_prop import _extract_tensor_metadata
from torch.utils._python_dispatch import _get_current_dispatch_mode

from torch.fx.experimental.proxy_tensor import _temp_remove_pre_dispatch_torch_function_mode
from .utils import _from_fun, clone_outputs_aliasing_inputs, prepare_fw_with_masks

@exposed_in("torch")
def cond(pred, true_fn, false_fn, operands):
@@ -101,8 +107,6 @@ def false_fn(x: torch.Tensor):
.. warning::
Temporal Limitations:

- `cond` only supports **inference** right now. Autograd will be supported in the future.

- The **output** of branches must be a **single Tensor**. Pytree of tensors will be supported in the future.

"""
@@ -142,12 +146,111 @@ def _validate_input(pred, true_fn, false_fn, operands):
pred, true_fn, false_fn, operands
)


"""
We're going to define a `cond_op` operation.
In order to do this, we need implementations for each of the dispatch keys.
"""
cond_op = HigherOrderOperator("cond")
cond_op.__module__ = "torch.ops.higher_order"

def create_fw_bw_graph(true_fn, false_fn, *operands):

from torch._functorch.aot_autograd import AOTConfig, create_joint
dummy_aot_config = AOTConfig(
fw_compiler=None, # type: ignore[arg-type]
bw_compiler=None, # type: ignore[arg-type]
partition_fn=None, # type: ignore[arg-type]
decompositions={},
num_params_buffers=0,
aot_id=0,
keep_inference_input_mutations=False,
)

# Note: [HOP create fw_bw graph] We create "clean" environments for make_fx by suspending all dispatch keys
# between the Autograd and Python keys. Currently, we only suspend functionalization, but more can be
# added when required. We would hit two problems if we did not suspend functionalization:
#
# 1. make_fx fails to capture operations on inputs: the inputs are wrapped as _to_functional_tensor_wrapper,
# but they will be unwrapped before entering ProxyTorchDispatchMode as part of the dispatching.
# However, it is the outside wrapper that the tracer creates proxies for. This causes the tracer to fail to
# fetch the proxy for the inputs and to miss any operations on them.
#
# 2. make_fx fails to capture outputs: the outputs after ProxyTorchDispatchMode are further
# wrapped as FunctionalTensorWrapper in the Functionalize key after return. However, the tracer
# only associates the inner tensor with a proxy in ProxyTorchDispatchMode. Therefore,
# when creating the output node, it fails to associate the wrapped tensor with its proxy.
# Instead, it creates a _tensor_constant as output.

with suspend_functionalization(), disable_functional_mode():
with disable_proxy_modes_tracing():

num_mapped_args = len(operands)
unwrapped_mapped_operands = pytree.tree_map(_from_fun, operands)
example_operands = unwrapped_mapped_operands

# Note: true_fn and false_fn produce outputs with the same shape,
# so we can generate the example outputs from true_fn alone.
example_flat_out = pytree.tree_map(
_from_fun, true_fn(*example_operands)
)
if any(
not isinstance(out, torch.Tensor)
for out in example_flat_out
if out is not None
):
raise RuntimeError(
"Expect outputs of map only contains tensors or None. "
f"Got types {[type(out) for out in example_flat_out]}."
)
example_grad = [_from_fun(out) for out in example_flat_out]

fw_true_graph = make_fx(true_fn)(*example_operands)
fw_false_graph = make_fx(false_fn)(*example_operands)

def joint_f_true(*joint_mapped_args):
mapped_input = joint_mapped_args[:num_mapped_args]
mapped_grads = joint_mapped_args[num_mapped_args:]

joint = create_joint(prepare_fw_with_masks(true_fn), aot_config=dummy_aot_config)
_, grads = joint(
list(mapped_input),
[
grad
for grad in mapped_grads
if grad is not None and grad.requires_grad
],
)

# In order to keep cond functional for the backward graph,
# we clone outputs that alias inputs.
maybe_clone = clone_outputs_aliasing_inputs(joint_mapped_args)

return pytree.tree_map(maybe_clone, grads)

def joint_f_false(*joint_mapped_args):
mapped_input = joint_mapped_args[:num_mapped_args]
mapped_grads = joint_mapped_args[num_mapped_args:]

joint = create_joint(prepare_fw_with_masks(false_fn), aot_config=dummy_aot_config)
_, grads = joint(
list(mapped_input),
[
grad
for grad in mapped_grads
if grad is not None and grad.requires_grad
],
)

# In order to keep cond functional for the backward graph,
# we clone outputs that alias inputs.
maybe_clone = clone_outputs_aliasing_inputs(joint_mapped_args)

return pytree.tree_map(maybe_clone, grads)

joint_operands_grads = list(example_operands) + list(example_grad)
joint_true_graph = make_fx(joint_f_true)(*joint_operands_grads)
joint_false_graph = make_fx(joint_f_false)(*joint_operands_grads)
return fw_true_graph, fw_false_graph, joint_true_graph, joint_false_graph
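
For intuition (not part of the diff): each joint graph traced above bundles a branch's forward and backward passes. It takes the branch operands followed by the output gradients and returns gradients with respect to the operands. In plain eager autograd, the true-branch computation corresponds roughly to this sketch (illustrative only, not the traced graph itself):

```python
import torch

def true_fn(x: torch.Tensor):
    return x.sin()

x = torch.randn(3, requires_grad=True)
grad_out = torch.ones(3)

# What joint_true_graph computes, expressed with eager autograd:
out = true_fn(x)                                      # forward of the true branch
(grad_x,) = torch.autograd.grad(out, (x,), grad_out)  # gradient w.r.t. the operand
```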


def trace_cond(proxy_mode, func_overload, pred, true_fn, false_fn, operands):
@@ -243,10 +346,36 @@ def cond_op_dense(pred, true_fn, false_fn, operands):
return false_fn(*operands)


cond_op.py_impl(DispatchKey.Autograd)(
autograd_not_implemented(cond_op, deferred_error=True)
)
class CondAutogradOp(torch.autograd.Function):
@staticmethod
def forward(ctx, pred, fw_true_graph, fw_false_graph, joint_true_graph, joint_false_graph, num_mapped_args, *operands):
ctx._pred = pred
ctx._joint_true_graph = joint_true_graph
ctx._joint_false_graph = joint_false_graph
ctx.save_for_backward(*operands)

with torch._C._AutoDispatchBelowAutograd():
with _set_compilation_env(), torch._dynamo.utils.disable_cache_limit():
return torch.compile(cond_op, backend="eager", fullgraph=True)(
pred, fw_true_graph, fw_false_graph, operands
)

@staticmethod
def backward(ctx, *flat_grads):
operands = ctx.saved_tensors

with _set_compilation_env(), torch._dynamo.utils.disable_cache_limit():
grads = torch.compile(cond_op, backend="eager", fullgraph=True)(
ctx._pred, ctx._joint_true_graph, ctx._joint_false_graph, operands + flat_grads
)
return None, None, None, None, None, None, *grads

@cond_op.py_impl(DispatchKey.Autograd)
def cond_autograd(pred, true_fn, false_fn, operands):
num_mapped_args = len(operands)
fw_true_graph, fw_false_graph, joint_true_graph, joint_false_graph = create_fw_bw_graph(true_fn, false_fn, *operands)
flat_out = CondAutogradOp.apply(pred, fw_true_graph, fw_false_graph, joint_true_graph, joint_false_graph, num_mapped_args, *operands)
return flat_out
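
The registration above follows the standard torch.autograd.Function pattern: non-differentiable arguments (the predicate, the traced graphs, and num_mapped_args) receive None gradients in backward. A stripped-down, hypothetical analogue of the same pattern, not the operator itself:

```python
import torch

class SelectBranch(torch.autograd.Function):
    """Toy analogue of CondAutogradOp: run one of two computations in forward
    and replay the matching derivative in backward."""

    @staticmethod
    def forward(ctx, pred, x):
        ctx._pred = bool(pred)
        ctx.save_for_backward(x)
        return x.sin() if ctx._pred else x.cos()

    @staticmethod
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        grad_x = grad_out * (x.cos() if ctx._pred else -x.sin())
        # The predicate gets no gradient, mirroring the Nones returned above.
        return None, grad_x

x = torch.randn(4, requires_grad=True)
y = SelectBranch.apply(torch.tensor(True), x)
y.sum().backward()  # x.grad equals x.cos()
```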

@cond_op.py_impl(ProxyTorchDispatchMode)
def inner(mode, pred, true_fn, false_fn, operands):
96 changes: 4 additions & 92 deletions torch/_higher_order_ops/map.py
@@ -2,7 +2,7 @@
import torch.utils._pytree as pytree
from torch._C import DispatchKey
from torch._dispatch.python import suspend_functionalization
from torch._functorch.aot_autograd import AOTConfig, create_joint, from_fun
from torch._functorch.aot_autograd import AOTConfig, create_joint

from torch._higher_order_ops.utils import (
_has_potential_branch_input_alias,
@@ -14,15 +14,14 @@
from torch._subclasses.fake_tensor import FakeTensorMode
from torch._subclasses.functional_tensor import (
disable_functional_mode,
FunctionalTensor,
)
from torch.fx.experimental.proxy_tensor import (
disable_proxy_modes_tracing,
make_fx,
ProxyTorchDispatchMode,
track_tensor_tree,
)
from torch.multiprocessing.reductions import StorageWeakRef
from .utils import _from_fun, clone_outputs_aliasing_inputs, prepare_fw_with_masks, _unstack_pytree, _stack_pytree


# TODO: We add this to prevent dymamo from tracing into map_wrapper,
@@ -68,31 +67,6 @@ def create_fw_bw_graph(f, num_mapped_args, *args):
with suspend_functionalization(), disable_functional_mode():
with disable_proxy_modes_tracing():

def _from_fun(t):
if isinstance(t, torch.Tensor):
if t.dtype != torch.bool:
return torch.empty_strided(
t.size(),
t.stride(),
dtype=t.dtype,
requires_grad=t.requires_grad,
)
else:
# clone of a functional tensor produces a functional tensor
# but we want to avoid it so we clone a non-functional version
maybe_unfunc_t = t
if isinstance(t, FunctionalTensor):
torch._sync(t)
maybe_unfunc_t = from_fun(t)
elif torch._is_functional_tensor(t):
# need to handle both types of functionalization here:
# these are the tensors that came from the user,
# which could be either FunctionalTensorWrapper or FunctionalTensor
torch._sync(t)
maybe_unfunc_t = torch._from_functional_tensor(t)
return maybe_unfunc_t.clone()
return t

unwrapped_mapped_xs = pytree.tree_map(_from_fun, mapped_xs)
example_xs = _unstack_pytree(unwrapped_mapped_xs)[0]

@@ -123,16 +97,7 @@ def joint_f(*example_args):
mapped_input = joint_mapped_args[:num_mapped_args]
mapped_grads = joint_mapped_args[num_mapped_args:]

def fw_with_masks(*args):
fw_out = f(*args)
return fw_out, [
True
if isinstance(ret, torch.Tensor) and ret.requires_grad
else False
for ret in fw_out
]

joint = create_joint(fw_with_masks, aot_config=dummy_aot_config)
joint = create_joint(prepare_fw_with_masks(f), aot_config=dummy_aot_config)
_, grads = joint(
list(mapped_input) + list(args),
[
@@ -144,19 +109,7 @@ def fw_with_masks(*args):

# In order to keep map functional for backward graph,
# we clone outputs that are aliasing inputs
input_storage = {
StorageWeakRef(arg._typed_storage())
for arg in example_args
if isinstance(arg, torch.Tensor)
}

def maybe_clone(t):
if (
isinstance(t, torch.Tensor)
and StorageWeakRef(t._typed_storage()) in input_storage
):
return t.clone()
return t
maybe_clone = clone_outputs_aliasing_inputs(example_args)

return pytree.tree_map(maybe_clone, grads)
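
The inlined helpers removed in this file (fw_with_masks and the aliasing-aware maybe_clone) are replaced by prepare_fw_with_masks and clone_outputs_aliasing_inputs imported from torch/_higher_order_ops/utils.py. A sketch reconstructed from the removed code; the actual utils implementations may differ in detail:

```python
import torch
from torch.multiprocessing.reductions import StorageWeakRef

def prepare_fw_with_masks(fn):
    # Wrap fn so it also reports which outputs require grad.
    def fw_with_masks(*args):
        fw_out = fn(*args)
        return fw_out, [
            isinstance(ret, torch.Tensor) and ret.requires_grad for ret in fw_out
        ]

    return fw_with_masks

def clone_outputs_aliasing_inputs(args):
    # Return a callable that clones any tensor sharing storage with an input,
    # which keeps the traced backward graph functional.
    input_storage = {
        StorageWeakRef(arg._typed_storage())
        for arg in args
        if isinstance(arg, torch.Tensor)
    }

    def maybe_clone(t):
        if (
            isinstance(t, torch.Tensor)
            and StorageWeakRef(t._typed_storage()) in input_storage
        ):
            return t.clone()
        return t

    return maybe_clone
```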

@@ -255,47 +208,6 @@ def expand_tensor(t):
expanded_outs, out_proxy, constant=None, tracer=proxy_mode.tracer
)


def _unstack_pytree(xs):
flat_xs, inspec = pytree.tree_flatten(xs)
if not all(isinstance(xs, torch.Tensor) for xs in flat_xs):
raise RuntimeError(f"Leaves of xs must be Tensor {flat_xs}")

if not all(xs.shape[0] == flat_xs[0].shape[0] for xs in flat_xs):
raise RuntimeError(
f"Leaves of xs must have same leading dimension size {[xs.shape for xs in flat_xs]}"
)

a = zip(*flat_xs)

pytrees = []
for tuple in a:
pytrees.append(pytree.tree_unflatten(tuple, inspec))
return pytrees


def _stack_pytree(pytrees):
flat_out = []
out_spec = None
for pt in pytrees:
flat_pt, out_spec = pytree.tree_flatten(pt)
flat_out.append(flat_pt)
assert out_spec is not None
b = zip(*flat_out)
stacked_out = []
for leaves in b:
if all(isinstance(leaf, torch.Tensor) for leaf in leaves):
stacked_out.append(torch.stack(leaves))
elif all(leaf is None for leaf in leaves):
# Backward graph can return None output when forward inputs doesn't require grad.
# When we eagerly execute backward graph, we need to call _stack_pytree on its output,
# therefore we need to deal with None output.
stacked_out.append(None) # type: ignore[arg-type]
else:
raise RuntimeError(f"Cannot stack {leaves}.")
return pytree.tree_unflatten(stacked_out, out_spec)


@map_impl.py_impl(DispatchKey.CompositeExplicitAutograd)
def map_dense(f, xs, pos_args):
pytrees = []