Disable dynamo on some opt methods and differentiable optimizer tests #103066

Closed
wants to merge 5 commits into from
Changes from 3 commits
28 changes: 26 additions & 2 deletions test/optim/test_optim.py
@@ -27,6 +27,9 @@
skipIfRocm,
skipIfTorchDynamo
)

from torch._dynamo import disable

from torch.testing._internal.common_cuda import TEST_MULTIGPU, TEST_CUDA
from torch.testing._internal.common_device_type import largeTensorTest
from typing import Dict, Any, Tuple
@@ -191,12 +194,23 @@ def fn():
else:
self.assertLess(fn().item(), initial_value)

# Note: disable dynamo on this function
# This lets us keep running the actual optimizer test logic under dynamo
# without tracing this test harness code, which has a lot of unsupported
# behavior
@disable(recursive=False)
def _test_state_dict(self, weight, bias, input, constructor, atol=None, rtol=None):
weight = Parameter(weight)
bias = Parameter(bias)
with torch.no_grad():
input = input.clone().detach().requires_grad_()

# Note: Disable dynamo on this function
# This avoids a bug where input_cuda is not detected in the environment
# because it is currently not defined in the local environment. We have been
# unable to repro this anywhere else, and this is test code that we don't need
# to spend time getting dynamo to trace unless the issue repros in real models.
@disable(recursive=False)
def fn_base(optimizer, weight, bias):
optimizer.zero_grad()
i = input_cuda if weight.is_cuda else input
@@ -220,7 +234,7 @@ def fn_base(optimizer, weight, bias):
state_dict = deepcopy(optimizer.state_dict())
state_dict_c = deepcopy(optimizer.state_dict())
optimizer_c.load_state_dict(state_dict_c)
# Run both optimizations in parallel
# Run both optimizers in parallel
for _ in range(20):
optimizer.step(fn)
optimizer_c.step(fn_c)
@@ -1073,6 +1087,7 @@ def test_sparse_adam(self):
optim.SparseAdam([{"params": [torch.zeros(3, layout=torch.sparse_coo)]}])

# ROCm precision is too low to pass this test
@skipIfTorchDynamo("Unsupported mutation of step")
def test_adadelta(self):
# Handles https://github.com/pytorch/pytorch/issues/69698
self.rel_tol = 4e-3
@@ -1115,6 +1130,7 @@ def test_adadelta(self):
with self.assertRaisesRegex(ValueError, "Invalid rho value: 1.1"):
optim.Adadelta(None, lr=1e-2, rho=1.1)

@skipIfTorchDynamo("Unsupported mutation of step")
def test_adadelta_complex(self):
# Handles https://github.com/pytorch/pytorch/issues/69698
self.rel_tol = 2e-2
@@ -1323,6 +1339,7 @@ def test_radam(self):
with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"):
optim.RAdam(None, lr=1e-2, weight_decay=-1)

@skipIfTorchDynamo("Unsupported mutation of step")
def test_rmsprop(self):
for foreach in (False, True):
self._test_basic_cases(
@@ -1783,7 +1800,9 @@ def _diff_fn(p, grad, opt_differentiable_state, opt_class, kwargs, *ignored):
)


@skipIfTorchDynamo("Differentiable optimizers not supported")
class TestDifferentiableOptimizer(TestCase):

def test_sgd(self):
p = torch.rand(10, requires_grad=True, dtype=torch.float64)
grad = torch.rand(10, requires_grad=True, dtype=torch.float64)
@@ -1801,6 +1820,7 @@ def test_sgd(self):
),
)


def test_adam(self):
state = {}
p = torch.rand(10, requires_grad=True, dtype=torch.float64)
@@ -1826,6 +1846,7 @@ def test_adam(self):
),
)


def test_rmsprop(self):
state = {}
p = torch.rand(10, requires_grad=True, dtype=torch.float64)
@@ -1858,6 +1879,7 @@ def test_rmsprop(self):
),
)


def test_adadelta(self):
state = {}
p = torch.rand(10, requires_grad=True, dtype=torch.float64)
@@ -1879,6 +1901,7 @@ def test_adadelta(self):
),
)


def test_adagrad(self):
state = {}
p = torch.rand(10, requires_grad=True, dtype=torch.float64)
@@ -1899,6 +1922,7 @@ def test_adagrad(self):
),
)


def test_adamax(self):
state = {}
p = torch.rand(10, requires_grad=True, dtype=torch.float64)
@@ -1920,6 +1944,7 @@ def test_adamax(self):
),
)


@skipIfTorchDynamo("The inplace mu update fails with dynamo, "
"since this is only happening when differentiable is enabled, skipping for now")
def test_asgd(self):
@@ -1945,7 +1970,6 @@ def test_asgd(self):
),
)

@skipIfTorchDynamo()
def test_rprop(self):
state = {}
p = torch.rand(10, requires_grad=True, dtype=torch.float64)
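The test changes above rely on two different mechanisms: `@disable(recursive=False)` skips dynamo for that one frame only, so code called from inside it can still be traced, while `@skipIfTorchDynamo(...)` skips a test (or every test in a decorated class) entirely when the suite runs under dynamo. The sketch below is illustrative only and is not part of this PR; the helper and test-class names are hypothetical.

```python
# Illustrative sketch (not part of this PR) of the two mechanisms used above.
# `helper`, `inner`, and `ToyOptimTests` are hypothetical names.
import torch
from torch._dynamo import disable
from torch.testing._internal.common_utils import TestCase, run_tests, skipIfTorchDynamo


@disable(recursive=False)
def helper(x):
    # This frame is skipped by dynamo, but `inner` can still be traced
    # because recursive=False only disables the decorated frame itself.
    return inner(x) + 1


def inner(x):
    return x * 2


@skipIfTorchDynamo("Differentiable optimizers not supported")
class ToyOptimTests(TestCase):
    # Every test in this class is skipped when the suite runs under dynamo.

    @skipIfTorchDynamo("Unsupported mutation of step")
    def test_step(self):
        # A method-level skip works the same way for a single test.
        self.assertEqual(helper(torch.ones(2)).sum().item(), 6.0)


if __name__ == "__main__":
    run_tests()
```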
3 changes: 3 additions & 0 deletions torch/_dynamo/eval_frame.py
@@ -1274,6 +1274,9 @@ def patch():
opt.step = disable(opt.step)

opt.zero_grad = disable(opt.zero_grad)
opt.state_dict = disable(opt.state_dict)
opt.load_state_dict = disable(opt.load_state_dict)
opt.add_param_group = disable(opt.add_param_group)

# disable any currently set hooks
# Note: we only want to disable the profiling hook
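For reference, the eval_frame.py hunk extends the existing optimizer patching so that `state_dict`, `load_state_dict`, and `add_param_group` are wrapped in `disable`, just as `step` and `zero_grad` already were. The snippet below is a simplified, standalone sketch of that wrapping idea, not the actual `patch()` implementation; `patch_optimizer_methods` is a hypothetical helper name.

```python
# Simplified sketch of the wrapping idea from the hunk above; this is not the
# real torch/_dynamo/eval_frame.py code. `patch_optimizer_methods` is hypothetical.
import torch
from torch._dynamo import disable


def patch_optimizer_methods(opt: torch.optim.Optimizer) -> None:
    # Once wrapped, calling these methods from compiled code graph-breaks and
    # runs them eagerly instead of having dynamo trace their Python bodies.
    opt.step = disable(opt.step)
    opt.zero_grad = disable(opt.zero_grad)
    opt.state_dict = disable(opt.state_dict)
    opt.load_state_dict = disable(opt.load_state_dict)
    opt.add_param_group = disable(opt.add_param_group)


model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
patch_optimizer_methods(optimizer)

# These calls now run outside of any dynamo tracing.
sd = optimizer.state_dict()
optimizer.load_state_dict(sd)
optimizer.add_param_group({"params": [torch.nn.Parameter(torch.zeros(2))]})
```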