Update on "recompile fx.GraphModule lazily"
Context: eellison's review comment [here](#103642 (comment)) complains about my code calling `torch.fx.GraphModule.recompile` after I changed the graph. We didn't simply remove the call to `recompile` at the time, since that would increase the risk of users seeing or running stale Python code. In this PR, I recompile the GraphModule lazily without increasing that risk.
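
To illustrate the staleness risk, here is a minimal sketch using stock `torch.fx` (not code from this PR): if you mutate a `GraphModule`'s graph and skip `recompile`, the previously generated `forward`/`code` keep executing the old graph.

```python
import torch
from torch import fx

def f(x):
    return x.sin()

gm = fx.symbolic_trace(f)

# Mutate the graph: swap sin for cos.
for node in gm.graph.nodes:
    if node.op == "call_method" and node.target == "sin":
        node.target = "cos"

# gm.forward (and gm.code) were generated before the mutation,
# so they still run sin() -- stale relative to gm.graph.
x = torch.randn(3)
assert torch.allclose(gm(x), x.sin())  # stale behavior

gm.recompile()  # regenerate forward/code from the mutated graph
assert torch.allclose(gm(x), x.cos())
```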

When training BertForMaskedLM, `GraphModule.recompile` is called 707 times and takes 1.8s in total, out of roughly 60 seconds for the whole compilation.

By spot checking, I found that the main reason we call `recompile` so frequently is the inductor pattern matcher. E.g., to replace `src_fn` with `dst_fn`, we need to trace both `src_fn` and `dst_fn`. Each trace produces a `GraphModule`, and the `__init__` method of `GraphModule` calls `recompile`.
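
A rough sketch of why pattern replacement is recompile-heavy (illustrative, not the inductor pattern matcher itself):

```python
import torch
from torch import fx

def src_fn(x):
    return torch.sin(x) + 1

def dst_fn(x):
    return torch.cos(x) + 1

def f(x):
    return torch.sin(x) + 1

# Every symbolic_trace builds a GraphModule, and GraphModule.__init__
# eagerly calls recompile() -- codegen runs even though the generated
# Python for these tiny pattern graphs may never be executed.
src_gm = fx.symbolic_trace(src_fn)
dst_gm = fx.symbolic_trace(dst_fn)

# fx.replace_pattern traces its pattern/replacement callables
# internally too, so each rewrite pays the construction + recompile
# cost again.
gm = fx.symbolic_trace(f)
fx.replace_pattern(gm, src_fn, dst_fn)
```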

By recompiling lazily, we reduce the number of calls to `GraphModule._real_recompile` to 37 and its total execution time to 0.045s. (In this PR, `recompile` just marks the module as needing recompilation and is very lightweight; `_real_recompile` does the actual recompilation.)
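
The deferral follows a familiar dirty-flag pattern. A hypothetical sketch of the bookkeeping (the names mirror the PR, but the bodies are illustrative, not the actual implementation):

```python
class LazyRecompileMixin:
    """Hypothetical sketch of the dirty-flag scheme used for lazy recompile."""

    _needs_recompile_flag: bool = False

    def recompile(self):
        # Cheap: only record that the generated code is out of date.
        self._needs_recompile_flag = True

    def _needs_recompile(self) -> bool:
        return self._needs_recompile_flag

    def real_recompile(self):
        # Called from entry points that expose generated code
        # (forward, .code, in_spec/out_spec).
        if self._needs_recompile():
            self._real_recompile()

    def _real_recompile(self):
        # Expensive: regenerate the Python code from the graph.
        self._needs_recompile_flag = False
        # ... real GraphModule codegen would happen here ...
```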



[ghstack-poisoned]
shunting314 committed Jul 17, 2023
2 parents 01a5e32 + 90c8cd5 commit 4694c6d
Showing 4 changed files with 38 additions and 3 deletions.
10 changes: 10 additions & 0 deletions test/fx/test_lazy_recompile.py
@@ -3,6 +3,7 @@
from torch.testing._internal.common_utils import TestCase, run_tests
from torch import fx
import torch
import torch._export

class TestLazyRecompile(TestCase):
def test_replace_sin_with_cos(self):
@@ -42,6 +43,15 @@ def f(x):
print(f"sin {x.sin()}, cos {x.cos()}, expected {expected}, actual {actual}")
self.assertTrue(torch.allclose(expected, actual))

def test_export(self):
"""
torch.export will access GraphModule._out_spec. Make sure we generate it
if we have not done so yet.
"""
def f(x):
return x.sin()
gm = torch._export.export(f, (torch.randn(2, 3),))
self.assertTrue(isinstance(gm, torch._export.ExportedProgram))

if __name__ == "__main__":
run_tests()
2 changes: 1 addition & 1 deletion torch/_export/__init__.py
@@ -212,7 +212,7 @@ def export(
# because aot_export expects a tuple as return type
return_val = f(*args)
flat_args, in_spec = pytree.tree_flatten(args)
out_spec = orig_out_spec = gm_torch_level._out_spec
out_spec = orig_out_spec = gm_torch_level.out_spec
# this means it is scalar return value, so will make it tuple
if not isinstance(return_val, (list, tuple)):
out_spec = pytree.tree_flatten((return_val,))[1]
5 changes: 5 additions & 0 deletions torch/_functorch/compilers.py
@@ -183,6 +183,11 @@ def debug_nop(fx_g: fx.GraphModule, _) -> Callable:
@make_boxed_compiler
def simple_ts_compile(fx_g, _):
strip_overloads(fx_g)

# realize the lazy recompilation to make jit.script happy.
if fx_g._needs_recompile():
fx_g._real_recompile()

f = torch.jit.script(fx_g)
f = torch.jit.freeze(f.eval())
return f
24 changes: 22 additions & 2 deletions torch/fx/graph_module.py
@@ -642,13 +642,22 @@ def code(self) -> str:
Return the Python code generated from the ``Graph`` underlying this
``GraphModule``.
"""
if self._needs_recompile():
self._real_recompile()

self.real_recompile()
if not hasattr(self, '_code'):
raise RuntimeError('Code has not been generated! Please report a bug to PyTorch')
return self._code

@property
def in_spec(self):
self.real_recompile()
return getattr(self, "_in_spec", None)

@property
def out_spec(self):
self.real_recompile()
return getattr(self, "_out_spec", None)

@compatibility(is_backward_compatible=True)
@classmethod
def recompile(cls):

[GitHub Actions / bc_linter notice on line 663 in torch/fx/graph_module.py: Function GraphModule.recompile: self was renamed to cls]
@@ -665,6 +674,17 @@ def _lazy_forward(self, *args, **kwargs):

forward = _lazy_forward

def real_recompile(self):
"""
A TorchScript-safe wrapper around _real_recompile.
Call _real_recompile only if we have not done so since the last
change to the fx.Graph.
"""
# JIT scripting cannot handle `_needs_recompile` or `_real_recompile`.
if not torch.jit.is_scripting():
if self._needs_recompile():
self._real_recompile()

def _real_recompile(self) -> PythonCode:
"""
Recompile this GraphModule from its ``graph`` attribute. This should be
