Higher order op for preserving leaf functions through trace, particularly for getting user defined hooks to compiled autograd #109690

Closed. Wants to merge 16 commits.

Commits (16):
2208e96  Higher order op for preserving leaf functions through trace, particul…  (voznesenskym, Sep 20, 2023)
fcc5d22  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 20, 2023)
d5ed407  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 20, 2023)
f15a8b8  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 20, 2023)
90fb411  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 20, 2023)
5909aa5  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 21, 2023)
7564ee8  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 21, 2023)
d3e0685  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 21, 2023)
e793656  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 21, 2023)
722d3de  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 21, 2023)
18c134b  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 22, 2023)
9b5f464  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 22, 2023)
78a4b1f  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 22, 2023)
7fd2e88  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 26, 2023)
735a68d  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 26, 2023)
807641b  Update on "Higher order op for preserving leaf functions through trac…  (voznesenskym, Sep 27, 2023)
251 changes: 251 additions & 0 deletions test/dynamo/test_backward_higher_order_ops.py
@@ -0,0 +1,251 @@
# Owner(s): ["module: dynamo"]
# flake8: noqa

import functools

import torch

import torch._dynamo.test_case
import torch._dynamo.testing
import torch._dynamo.utils
from torch import _inductor as inductor
from torch._dynamo import compiled_autograd
from torch._dynamo._trace_wrapped_higher_order_op import trace_wrapped
from torch._dynamo.testing import normalize_gm
from torch._dynamo.utils import counters
from torch.fx.experimental.proxy_tensor import make_fx


def _multiply(x):
return x * x


def _multiply_invoke(grad):
return trace_wrapped(grad, fn=_multiply)


class BackwardHigherOrderOpTests(torch._dynamo.test_case.TestCase):
def test_invoke_in_eager(self):
x = torch.tensor([0.5, 0.5], requires_grad=True)
y = torch.tensor([0.5, 0.5], requires_grad=True)

def fn(x, y):
x.register_hook(_multiply_invoke)
return x * y

out = fn(x, y)
grad_out = torch.tensor([2.0, 2.0])
out.backward(grad_out)
self.assertEqual(x.grad, y * grad_out)

def test_invoke_in_pt2(self):
for backend in ["eager", "aot_eager", "inductor"]:
torch._dynamo.reset()
x = torch.tensor([0.5, 0.5], requires_grad=True)
y = torch.tensor([0.5, 0.5], requires_grad=True)

def fn(x, y):
x.register_hook(_multiply_invoke)
return x * y

fn = torch._dynamo.optimize(backend)(fn)
out = fn(x, y)
grad_out = torch.tensor([2.0, 2.0])
out.backward(grad_out)
self.assertEqual(x.grad, grad_out * y)

def test_invoke_make_fx_forward_contrived(self):
x = torch.tensor([0.5, 0.5], requires_grad=True)
out = make_fx(_multiply_invoke)(x)
self.assertEqual(out(x), torch.tensor([0.25, 0.25]))
actual = normalize_gm(out.print_readable(False))

expected = """\
class _multiply_invoke(torch.nn.Module):
def forward(self, grad_1: f32[2]):
invocation: f32[2] = torch__dynamo__trace_wrapped_higher_order_op_self_invoke(grad_1); grad_1 = None
assert_1: f32[2] = torch__dynamo__trace_wrapped_higher_order_op__assert_meta(invocation, (2,), (1,), torch.float32); invocation = None
detach: f32[2] = torch.ops.aten.detach.default(assert_1); assert_1 = None
detach_1: f32[2] = torch.ops.aten.detach.default(detach); detach = None
detach_2: f32[2] = torch.ops.aten.detach.default(detach_1); detach_1 = None
detach_3: f32[2] = torch.ops.aten.detach.default(detach_2); detach_2 = None
return detach_3
"""
self.assertExpectedInline(actual, expected)

def test_invoke_make_bw(self):
x = torch.tensor([0.5, 0.5], requires_grad=True)

def fwd(x):
z = x * x
return z + z

res = fwd(x)
res.backward(torch.tensor([1.0, 1.0]))
out = make_fx(_multiply_invoke)(x.grad)
self.assertEqual(out(x.grad), torch.tensor([4.0, 4.0]))
actual = normalize_gm(out.print_readable(False))

expected = """\
class _multiply_invoke(torch.nn.Module):
def forward(self, grad_1: f32[2]):
invocation: f32[2] = torch__dynamo__trace_wrapped_higher_order_op_self_invoke(grad_1); grad_1 = None
assert_1: f32[2] = torch__dynamo__trace_wrapped_higher_order_op__assert_meta(invocation, (2,), (1,), torch.float32); invocation = None
return assert_1
"""
self.assertExpectedInline(actual, expected)

def test_invoke_in_pt2_compiled_autograd(self):
graph = None

def compiler_fn(gm):
def inner_compiler(gm_, example_inputs_):
nonlocal graph
self.assertEqual(graph, None)
graph = gm_
return inductor.compile(gm_, example_inputs_)

return torch.compile(
gm, backend=inner_compiler, fullgraph=True, dynamic=True
)

for backend in ["eager", "aot_eager", "inductor"]:
torch._dynamo.reset()
x = torch.tensor([0.5, 0.5], requires_grad=True)
y = torch.tensor([0.5, 0.5], requires_grad=True)

def fn(x, y):
x.register_hook(_multiply_invoke)
return x + y

fn = torch._dynamo.optimize(backend)(fn)
out = fn(x, y)
grad_out = torch.tensor([2.0, 2.0])
with compiled_autograd.enable(compiler_fn):
out.backward(grad_out)
actual = normalize_gm(graph.print_readable(False))
self.assertEqual(x.grad, grad_out * grad_out)
expected = """\
class GraphModule(torch.nn.Module):
def forward(self, L_inputs_0_ : torch.Tensor):
getitem = L_inputs_0_

new_empty_strided = torch.ops.aten.new_empty_strided.default(getitem, [2], [1], dtype = torch.float32, layout = torch.strided, device = device(type='cpu'))

copy_ = torch.ops.aten.copy_.default(new_empty_strided, getitem); new_empty_strided = None

call_hook = getitem * getitem; getitem = None

new_empty_strided_1 = torch.ops.aten.new_empty_strided.default(call_hook, [2], [1], dtype = torch.float32, layout = torch.strided, device = device(type='cpu'))

copy__1 = torch.ops.aten.copy_.default(new_empty_strided_1, call_hook); new_empty_strided_1 = call_hook = None
return (copy_, copy__1)
"""
self.assertExpectedInline(actual, expected)

graph = None

def test_invoke_in_pt2_compiled_autograd_side_effect(self):
def _side_effect_stateful_fn2(x, obj):
obj.counter = obj.counter + 1
return _multiply(x)

def _side_effectful_invoke2(grad, fn):
return trace_wrapped(grad, fn=fn)

graph = None

def compiler_fn(gm):
def inner_compiler(gm_, example_inputs_):
nonlocal graph
self.assertEqual(graph, None)
graph = gm_
return inductor.compile(gm_, example_inputs_)

return torch.compile(
gm, backend=inner_compiler, fullgraph=True, dynamic=True
)

for backend in ["eager", "aot_eager", "inductor"]:
torch._dynamo.reset()
x = torch.tensor([0.5, 0.5], requires_grad=True)
y = torch.tensor([0.5, 0.5], requires_grad=True)

class MyObj:
def __init__(self):
self.counter = 0

obj = MyObj()
inner_fn = functools.partial(_side_effect_stateful_fn2, obj=obj)
hook_fn = functools.partial(_side_effectful_invoke2, fn=inner_fn)
x.register_hook(hook_fn)

def fn(x, y):
return x + y

fn = torch._dynamo.optimize(backend, nopython=True)(fn)
out = fn(x, y)
grad_out = torch.tensor([2.0, 2.0])
with compiled_autograd.enable(compiler_fn):
out.backward(grad_out)
actual = normalize_gm(graph.print_readable(False))
self.assertEqual(obj.counter, 1)
self.assertEqual(x.grad, grad_out + grad_out)
expected = """\
class GraphModule(torch.nn.Module):
def forward(self, L_inputs_0_ : torch.Tensor):
getitem = L_inputs_0_

new_empty_strided = torch.ops.aten.new_empty_strided.default(getitem, [2], [1], dtype = torch.float32, layout = torch.strided, device = device(type='cpu'))

copy_ = torch.ops.aten.copy_.default(new_empty_strided, getitem); new_empty_strided = None

call_hook = getitem * getitem; getitem = None

new_empty_strided_1 = torch.ops.aten.new_empty_strided.default(call_hook, [2], [1], dtype = torch.float32, layout = torch.strided, device = device(type='cpu'))

copy__1 = torch.ops.aten.copy_.default(new_empty_strided_1, call_hook); new_empty_strided_1 = call_hook = None
return (copy_, copy__1)
"""
self.assertExpectedInline(actual, expected)

out = fn(x, y)
out.backward(grad_out)
self.assertEqual(obj.counter, 2)

out = fn(x, y)
out.backward(grad_out)
self.assertEqual(obj.counter, 3)
graph = None

def test_invoke_in_pt2_compiled_autograd_graph_breaks(self):
def _graph_breaking_fn(x):
print("Boo!")
return _multiply(x)

def _graph_break_invoke(grad):
return trace_wrapped(grad, fn=_graph_breaking_fn)

def compiler_fn(gm):
return torch.compile(gm, backend="inductor", fullgraph=True, dynamic=True)

for backend in ["eager", "aot_eager", "inductor"]:
torch._dynamo.reset()
x = torch.tensor([0.5, 0.5], requires_grad=True)
y = torch.tensor([0.5, 0.5], requires_grad=True)

def fn(x, y):
x.register_hook(_graph_break_invoke)
return x + y

fn = torch._dynamo.optimize(backend, nopython=True)(fn)
out = fn(x, y)
grad_out = torch.tensor([2.0, 2.0])
with self.assertRaisesRegex(
torch._dynamo.exc.Unsupported,
"print",
):
with compiled_autograd.enable(compiler_fn):
out.backward(grad_out)

graph = None
111 changes: 111 additions & 0 deletions torch/_dynamo/_trace_wrapped_higher_order_op.py
@@ -0,0 +1,111 @@
import torch.utils._pytree as pytree
from torch._C import DispatchKey
from torch._higher_order_ops.utils import autograd_not_implemented

from torch._ops import HigherOrderOperator
from torch._subclasses import FakeTensorMode

from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode, track_tensor_tree
from torch.utils._python_dispatch import _get_current_dispatch_mode


__all__ = ["trace_wrapped"]


# trace_wrapped is a higher-order op meant both for invoking a bound function
# and for registering it as a call_function in the graph.
# This allows us to re-enter Dynamo during compiled autograd to trace (or graph break in)
# the functions as needed. While there is nothing backward-specific about this op, the way it is
# written means we can support backward functions written in complex Python. It can be thought of
# as an allow_in_graph for our aten graph. If we did not do this, the functions would get inlined
# into their composing aten ops, and we would lose the Python state mutation.
Review thread:

ezyang (Contributor), Sep 27, 2023:
Here is a proposed rewrite of the top-level comment:

trace_wrapped(*args, fn) is equivalent to fn(*args), but with a twist: if you make_fx trace through this call, we will not actually trace into fn; instead, we will directly insert it as a call_function to fn in the graph. (Unlike make_fx, Dynamo WILL inline into fn.) You can think of this as a one-off allow_in_graph equivalent for proxy tensor tracing.

Because proxy tensor tracing does not actually run the function, there are requirements on the behavior of fn. We are still figuring them out, but here is the current state:

  • fn can only take a single argument, which must be a tensor
  • fn must return a new tensor with the same metadata as the original tensor (e.g., empty_like(input) is a permissible implementation of fn). This is verified via an extra assert that is inserted into the traced graph.
  • fn MAY have side effects, but it MAY NOT perform metadata mutation on other tensors participating in proxy tensor tracing (it MAY mutate other tensors, it MAY mutate Python state)

These requirements stem from the need to continue performing proxy tensor tracing, which assumes accurate fake tensor metadata, without actually running fn. In the future, we may allow for a "meta" function associated with fn to allow for more interesting input-output patterns.

Note that tensors / Python state are allowed to be mutated. This relaxed constraint is not always sound, but it is sound for backward tracing with fake tensors as it takes place in AOTAutograd: the backward pass is guaranteed not to depend on concrete tensor values (via fake tensor) or Python state (because the autograd engine doesn't depend on Python).

The intended use case for this function is to allow AOTAutograd to defer complex backward hooks to compiled autograd. AOTAutograd performs a make_fx trace that preserves the function call as-is in the graph, and only when we Dynamo through the backward graph in compiled autograd do we inline into the function.

voznesenskym (Collaborator, Author):
Sure, this is better. Thanks for rewriting it. E.g., zeros_like(input), I suppose.

voznesenskym (Collaborator, Author):
If you are using MAY and MUST as in https://www.rfc-editor.org/rfc/rfc2119, let's use SHOULD and MUST ;) Thank you again.
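To make the proposed semantics concrete, here is a minimal usage sketch closely mirroring test_invoke_make_fx_forward_contrived in this PR's test file; _double and _double_invoke are illustrative names of my own, not part of the diff.

import torch
from torch._dynamo._trace_wrapped_higher_order_op import trace_wrapped
from torch.fx.experimental.proxy_tensor import make_fx


def _double(grad):
    # The wrapped fn must return a tensor with the same metadata as its input.
    return grad * 2


def _double_invoke(grad):
    return trace_wrapped(grad, fn=_double)


x = torch.tensor([0.5, 0.5], requires_grad=True)
gm = make_fx(_double_invoke)(x)
print(gm(x))  # runs _double for real: tensor([1., 1.], ...)
# The traced graph records a call_function to the wrapper (plus the metadata
# assert); the body of _double is not traced into.
print(gm.print_readable(False))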

def trace_wrapped(*args, fn):
Review thread:

Reviewer (Contributor):
Is there any reason to take this variadically? You only support one argument 🤔

voznesenskym (Collaborator, Author):
I went back and forth. This feels better in case we want to use it in autograd.Function, where we take multiple args.

return _trace_wrapped_op(*args, fn=fn)


_trace_wrapped_op = HigherOrderOperator("trace_wrapped")


def _assert_meta(grad, size, stride, dtype):
Review thread:

Reviewer (Contributor):
Noob Dynamo question: how does Dynamo know to execute these asserts at compile time (while Dynamo is tracing), instead of automatically trying to add these asserts and metadata calls as proxies into the backward graph?

Reviewer (Contributor):
So long as this function is not allowed in graph, Dynamo must inline into it.

Reviewer (Contributor):
Oh right, thanks!

assert grad.size() == size, "size mismatch"
assert grad.stride() == stride, "stride mismatch"
assert grad.dtype == dtype, "dtype mismatch"
return grad
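To illustrate the answer in the thread above, here is a small standalone example of my own (not part of this PR) showing that when Dynamo inlines a function, a metadata assert like these is evaluated while tracing and leaves no node in the captured graph:

import torch


def checked_add(x, y):
    # Evaluated at trace time: x.dtype is statically known to Dynamo.
    assert x.dtype == torch.float32, "dtype mismatch"
    return x + y


captured = []


def capture_backend(gm, example_inputs):
    captured.append(gm)
    return gm.forward


compiled = torch.compile(checked_add, backend=capture_backend)
compiled(torch.ones(2), torch.ones(2))
# The captured graph contains the add but no assert node.
print(captured[0].graph)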


@_trace_wrapped_op.py_impl(ProxyTorchDispatchMode)
def inner_trace(mode, *args, fn):
import torch

assert len(args) == 1
grad = args[0]
assert isinstance(grad, torch.Tensor)

# This higher-order operator stays opaque under proxy tensor tracing; Dynamo, however,
# is aware of it and traces through to the wrapped function's real behavior.
# The operator's purpose is to let us invoke non-traceable functions and embed them
# directly into the graph. Essentially, this turns function calls into "leaf modules"
# in traditional FX terminology.
# Note: We did not name it "allow_in_graph" because that name might imply the function is
# traceable, whereas this function is intrinsically non-traceable.
# Note2: I hate this name
Review thread:

Reviewer (Contributor):
This would be subsumed by the comment above I think.

def self_invoke(*args):
return _trace_wrapped_op(*args, fn=fn)

proxy_args = (mode.tracer.unwrap_proxy(grad),)
out_proxy = mode.tracer.create_proxy(
"call_function", self_invoke, proxy_args, {}, name="invocation"
Review thread:

Reviewer (Contributor):
nit: call this trace_wrapped instead?

)
grad = torch.zeros_like(grad)
grad = track_tensor_tree(grad, out_proxy, constant=None, tracer=mode.tracer)

# We take a small shortcut here: we do NOT yet run a meta function, and so we assume
# that input and output metadata match. As such, we must introduce a runtime assert.
proxy_args = pytree.tree_map(
mode.tracer.unwrap_proxy, (grad, grad.size(), grad.stride(), grad.dtype)
)
Review thread:

Reviewer (Contributor):
nit: the tree_map here is also unnecessary, just s/grad/out_proxy/

out_proxy = mode.tracer.create_proxy(
"call_function",
_assert_meta,
proxy_args,
{},
name="assert",
)
grad = torch.empty_like(grad)
grad = track_tensor_tree(grad, out_proxy, constant=None, tracer=mode.tracer)
Review thread:

Reviewer (Contributor):
I'm confused, why do you need to do this twice?

Reviewer (Contributor):
I think I understand why you did this (you need to prevent the assert from getting DCE'd), but I don't think this is the right way to do it. Let me think...

Reviewer (Contributor):
I don't think you need to prevent this from being DCE'd. Like, the assert can just have no data deps and you don't have to track at all. What happens when you do that?

voznesenskym (Collaborator, Author):
Hmm, let me try. I thought it was because of DCE, but now I do not remember.

voznesenskym (Collaborator, Author):
Yeah, you get DCE if you don't track with create_proxy. However, if we change it to create_node, it breaks in other ways because none of the rest of this is nodes; it's all proxies. Is there a way to pass proxies to node creation? It seems like crossing streams...

Reviewer (Contributor):
Every proxy has a node, so you can extract the node from it.

voznesenskym (Collaborator, Author):
Of course, but is that kosher here? Is that better than just repeating the proxy binding code? Does it actually make a difference? I defer to you.

Reviewer (Contributor):
If there is some DCE thing, it will happen whether or not you create_proxy or create_node. I guess this is fine. Actually, why don't you just shove this into self_invoke? That will also prevent DCE.

return grad
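As a footnote to the DCE thread above, here is a small toy example of my own (not this PR's code) showing why an untracked node with no users would disappear: FX's eliminate_dead_code drops pure nodes whose results are unused, which is the behavior the second create_proxy/track_tensor_tree pair is working around.

import torch
import torch.fx


def f(x):
    x.size()  # pure call whose result is never used
    return x + 1


gm = torch.fx.symbolic_trace(f)
num_before = len(gm.graph.nodes)
gm.graph.eliminate_dead_code()
gm.recompile()
num_after = len(gm.graph.nodes)
# num_after < num_before: the unused size() node was eliminated, just as an
# assert node with no users (and no tracking) would be.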


@_trace_wrapped_op.py_impl(FakeTensorMode)
def inner_fake(*args, fn):
raise RuntimeError("This op should never be invoked here")


@_trace_wrapped_op.py_impl(DispatchKey.CompositeExplicitAutograd)
def _trace_wrapped_op_dense(*args, fn):
mode = _get_current_dispatch_mode()
assert mode is None, "Mode should never be enabled for CPU/CUDA key"
return fn(*args)


_trace_wrapped_op.py_impl(DispatchKey.Autograd)(
autograd_not_implemented(_trace_wrapped_op, deferred_error=True)
)


@_trace_wrapped_op.py_functionalize_impl
def _trace_wrapped_functionalized(ctx, *args, fn):
unwrapped_args = ctx.unwrap_tensors(args)
wrapped_fn = ctx.functionalize(fn)
with ctx.redispatch_to_next():
return ctx.wrap_tensors(_trace_wrapped_op(*unwrapped_args, fn=wrapped_fn))


# TODO(voz): Make this automatic for keys, this is very ugly atm
_trace_wrapped_op.fallthrough(DispatchKey.PythonDispatcher) # type: ignore[attr-defined]
_trace_wrapped_op.fallthrough(DispatchKey.PythonTLSSnapshot) # type: ignore[attr-defined]
_trace_wrapped_op.fallthrough(DispatchKey.ADInplaceOrView)
_trace_wrapped_op.fallthrough(DispatchKey.BackendSelect)
_trace_wrapped_op.fallthrough(DispatchKey.AutocastCPU) # type: ignore[attr-defined]
_trace_wrapped_op.fallthrough(DispatchKey.AutocastCUDA) # type: ignore[attr-defined]
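Regarding the TODO above, one possible cleanup sketch (my suggestion, not part of the PR, and it does not make the registration fully automatic) is to register the fallthrough keys in a loop, reusing the DispatchKey import and _trace_wrapped_op already defined in this module:

_FALLTHROUGH_KEYS = [
    DispatchKey.PythonDispatcher,
    DispatchKey.PythonTLSSnapshot,
    DispatchKey.ADInplaceOrView,
    DispatchKey.BackendSelect,
    DispatchKey.AutocastCPU,
    DispatchKey.AutocastCUDA,
]

for _key in _FALLTHROUGH_KEYS:
    _trace_wrapped_op.fallthrough(_key)  # type: ignore[attr-defined]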
1 change: 1 addition & 0 deletions torch/_dynamo/side_effects.py
@@ -522,6 +522,7 @@ def is_empty(self):
return not (
any(map(self.is_modified, self.id_to_variable.values()))
or self.save_for_backward
or self.tensor_hooks
)

def clear(self):
4 changes: 4 additions & 0 deletions torch/_dynamo/skipfiles.py
@@ -175,6 +175,10 @@ def _module_dir(m: types.ModuleType):
_module_dir(torch) + "distributed/_tensor/device_mesh.py",
}

FILENAME_ALLOWLIST |= {
_module_dir(torch) + "_dynamo/_trace_wrapped_higher_order_op.py",
}

SKIP_DIRS_RE = None

is_fbcode = importlib.import_module("torch._inductor.config").is_fbcode()