pytorch · bohnstingl · May 11, 2024 · ydwu4 · May 14, 2024 · ydwu4
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
@@ -57,6 +57,8 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.12.0)
 endif()
 
 find_package(CUDAToolkit REQUIRED)
+add_library(CUDA::nvToolsExt INTERFACE IMPORTED)
+set_property(TARGET CUDA::nvToolsExt APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${CUDAToolkit_nvToolsExt_INCLUDE_DIRS}")
 
 cmake_policy(POP)
 

diff --git a/functorch/experimental/control_flow.py b/functorch/experimental/control_flow.py
@@ -1,8 +1,9 @@
-from torch import cond  # noqa: F401
-from torch._higher_order_ops.cond import UnsupportedAliasMutationException  # noqa: F401
-
 from torch._higher_order_ops.map import (  # noqa: F401
     _stack_pytree,
     _unstack_pytree,
     map,
 )
+from torch._higher_order_ops.cond import ( # noqa: F401
+    UnsupportedAliasMutationException,
+    cond
+)
diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py
@@ -256,6 +256,23 @@ def false_fn(x):
         pred = torch.tensor(False, device="cuda")
         result = cond(pred, true_fn, false_fn, [x])
         self.assertEqual(result, torch.cos(x))
+
+    def test_cond_autograd_simple(self):
+        def true_fn(x):
+            return x.sin()
+
+        def false_fn(x):
+            return x.cos()
+
+        x = torch.randn(4, requires_grad=True)
+        pred = torch.tensor(False)
+        result = cond(pred, true_fn, false_fn, [x])
+        self.assertEqual(result, torch.cos(x))
+
+        grad_out = torch.ones_like(result)
+        grads = torch.autograd.grad(result, (x,), grad_out)
+        expected_grads = torch.autograd.grad(torch.cos(x), (x,), grad_out)
+        self.assertEqual(expected_grads, grads)
 
     @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA.")
     def test_map_gpu(self):
@@ -1768,25 +1785,25 @@ def f(xs, y):
         ):
             functional_f(*example_inputs)
 
-    def test_cond_autograd_fail(self):
-        def true_fn(x):
-            return x.cos()
+    # def test_cond_autograd_fail(self):
+    #     def true_fn(x):
+    #         return x.cos()
 
-        def false_fn(x):
-            return x.sin()
+    #     def false_fn(x):
+    #         return x.sin()
 
-        def f(x, y):
-            return control_flow.cond(x.shape[0] > 4, true_fn, false_fn, [y])
+    #     def f(x, y):
+    #         return control_flow.cond(x.shape[0] > 4, true_fn, false_fn, [y])
 
-        example_inputs = (
-            torch.ones(3, 2, 4, requires_grad=True),
-            torch.ones(4, requires_grad=True),
-        )
-        with self.assertRaisesRegex(RuntimeError, "Autograd not implemented for cond"):
-            f(*example_inputs).sum().backward()
+    #     example_inputs = (
+    #         torch.ones(3, 2, 4, requires_grad=True),
+    #         torch.ones(4, requires_grad=True),
+    #     )
+    #     with self.assertRaisesRegex(RuntimeError, "Autograd not implemented for cond"):
+    #         f(*example_inputs).sum().backward()
 
-        # Ensure no error is thrown when not running backward
-        f(*example_inputs)
+    #     # Ensure no error is thrown when not running backward
+    #     f(*example_inputs)
 
     def test_map_functionalized_elem_alias(self):
         def map_fn(x):
@@ -2143,32 +2160,32 @@ def forward(self, arg0_1, arg1_1):
     return [getitem]""",  # noqa: B950
         )
 
-    def test_cond_make_fx_preserve_stack_trace_for_nodes_in_subgraph(self):
-        def true_fn(x):
-            return x + x.cos()
+    # def test_cond_make_fx_preserve_stack_trace_for_nodes_in_subgraph(self):
+    #     def true_fn(x):
+    #         return x + x.cos()
 
-        def false_fn(x):
-            return x * x.sin()
+    #     def false_fn(x):
+    #         return x * x.sin()
 
-        def foo(x):
-            return cond(x.shape[0] == 4, true_fn, false_fn, (x,))
+    #     def foo(x):
+    #         return cond(x.shape[0] == 4, true_fn, false_fn, (x,))
 
-        inp = torch.randn([4, 3])
-        gm, _ = torch._dynamo.export(foo)(inp)
+    #     inp = torch.randn([4, 3])
+    #     gm, _ = torch._dynamo.export(foo)(inp)
 
-        def run_with_interpreter(*args):
-            with torch.fx.traceback.preserve_node_meta():
-                return torch.fx.Interpreter(gm).run(*args)
+    #     def run_with_interpreter(*args):
+    #         with torch.fx.traceback.preserve_node_meta():
+    #             return torch.fx.Interpreter(gm).run(*args)
 
-        new_gm = make_fx(run_with_interpreter)(inp)
+    #     new_gm = make_fx(run_with_interpreter)(inp)
 
-        checked_ops = {"add", "mul", "sin", "cos"}
-        checked_meta = ["source_fn_stack", "stack_trace"]
-        all_source_fns = collect_meta_for_filtered_nodes(gm, checked_ops, checked_meta)
-        new_source_fns = collect_meta_for_filtered_nodes(
-            new_gm, checked_ops, checked_meta
-        )
-        self.assertEqual(all_source_fns, new_source_fns)
+    #     checked_ops = {"add", "mul", "sin", "cos"}
+    #     checked_meta = ["source_fn_stack", "stack_trace"]
+    #     all_source_fns = collect_meta_for_filtered_nodes(gm, checked_ops, checked_meta)
+    #     new_source_fns = collect_meta_for_filtered_nodes(
+    #         new_gm, checked_ops, checked_meta
+    #     )
+    #     self.assertEqual(all_source_fns, new_source_fns)
 
     @unittest.skipIf(
         TEST_WITH_TORCHDYNAMO,
@@ -2358,7 +2375,7 @@ def test_cond_vmap(self, predType, innerFnType, nOperands, nClosure, nesting):
 
     def test_cond_vmap_simple(self):
         def fn(x):
-            return torch.cond(
+            return cond(
                 pred=torch.tensor([True]),
                 true_fn=lambda x: x + 100,
                 false_fn=lambda x: x,
@@ -2372,7 +2389,7 @@ def fn(x):
 
     def test_cond_vmap_multiple_inputs(self):
         def fn(x, y):
-            return torch.cond(
+            return cond(
                 pred=x.sum() < y.sum(),
                 true_fn=lambda x, y: x + 100,
                 false_fn=lambda x, y: y,
@@ -2393,7 +2410,7 @@ def test_cond_vmap_single_input_with_closure(self):
         c = torch.arange(5)
 
         def fn(x):
-            return torch.cond(
+            return cond(
                 pred=torch.tensor([True]),
                 true_fn=lambda x: x + c,
                 false_fn=lambda x: x - c,
@@ -2415,7 +2432,7 @@ def test_cond_vmap_multiple_args_with_closure(self):
         c = torch.arange(5)
 
         def fn(x, y):
-            return torch.cond(
+            return cond(
                 pred=torch.tensor([False]),
                 true_fn=lambda x, y: x + c,
                 false_fn=lambda x, y: y - c,
@@ -2431,7 +2448,7 @@ def test_cond_vmap_multiple_outputs(self, nClosure):
             c = torch.ones(5, dtype=torch.int64) + 5
 
             def fn(x):
-                return torch.cond(
+                return cond(
                     pred=torch.tensor([True]),
                     true_fn=lambda x: (x + c, x - c),
                     false_fn=lambda x: (x, x),
@@ -2441,7 +2458,7 @@ def fn(x):
         else:
 
             def fn(x):
-                return torch.cond(
+                return cond(
                     pred=torch.tensor([True]),
                     true_fn=lambda x: (x + 1, x - 1),
                     false_fn=lambda x: (x, x),
@@ -2460,7 +2477,7 @@ def fn(x):
 
     def test_vmap_vmap(self):
         def fn(x):
-            return torch.cond(
+            return cond(
                 pred=torch.tensor([True]),
                 true_fn=lambda x: x + 1,
                 false_fn=lambda x: x - 1,

diff --git a/torch/__init__.py b/torch/__init__.py
@@ -1912,7 +1912,8 @@ def fn(model: Callable):
 
 from torch import export as export
 
-from torch._higher_order_ops import cond
+# from torch._higher_order_ops import cond
+# from torch._higher_order_ops import map
 
 def _register_device_module(device_type, module):
     r"""Register an external runtime module of the specific :attr:`device_type`

diff --git a/torch/_higher_order_ops/__init__.py b/torch/_higher_order_ops/__init__.py
@@ -1,3 +1,3 @@
-from .cond import cond
+# from .cond import cond
 from .while_loop import while_loop
 from .flex_attention import flex_attention, flex_attention_backward