26 changes: 12 additions & 14 deletions test/cpp/test_aten_xla_tensor_2.cpp
@@ -1555,20 +1555,18 @@ TEST_F(AtenXlaTensorTest, TestGroupNormBackward) {
/*cudnn_enabled=*/false);
};
torch::Tensor undef;
ForEachDevice({XlaDeviceType::CUDA, XlaDeviceType::TPU},
[&](const torch::Device& device) {
TestBackward({input, undef_weight ? undef : weight,
undef_weight ? undef : bias},
device, testfn,
/*rtol=*/1e-3, /*atol=*/1e-3,
/*derivative_level=*/2);
ExpectCounterNotChanged("aten::.*",
cpp_test::GetIgnoredCounters());
ExpectCounterChanged("xla::native_batch_norm",
cpp_test::GetIgnoredCounters());
ExpectCounterChanged("xla::native_batch_norm_backward",
cpp_test::GetIgnoredCounters());
});
ForEachDevice({XlaDeviceType::TPU}, [&](const torch::Device& device) {
TestBackward(
{input, undef_weight ? undef : weight, undef_weight ? undef : bias},
device, testfn,
/*rtol=*/1e-3, /*atol=*/1e-3,
/*derivative_level=*/2);
ExpectCounterNotChanged("aten::.*", cpp_test::GetIgnoredCounters());
ExpectCounterChanged("xla::native_batch_norm",
cpp_test::GetIgnoredCounters());
ExpectCounterChanged("xla::native_batch_norm_backward",
cpp_test::GetIgnoredCounters());
});
}
}
}
2 changes: 1 addition & 1 deletion test/cpp/test_aten_xla_tensor_6.cpp
@@ -873,7 +873,7 @@ TEST_F(AtenXlaTensorTest, TestEmbeddingBackward) {
TEST_F(AtenXlaTensorTest, TestAmpUpdateScale) {
XlaDeviceType hw_type =
static_cast<XlaDeviceType>(bridge::GetDefaultDevice()->type());
if (hw_type != XlaDeviceType::CUDA && hw_type != XlaDeviceType::CPU) {
if (hw_type != XlaDeviceType::CPU) {
return;
}
torch::Tensor growth_tracker =
4 changes: 1 addition & 3 deletions test/cpp/test_replication.cpp
@@ -98,9 +98,7 @@ void TestSingleReplication(

class ReplicationTest : public AtenXlaTensorTestBase {};

// Parallelism for DataParallel uses multi-threads. But cuda assumes one GPU
// device per process instead of relying on threads so we will not run the test
// on GPU.
// Parallelism for DataParallel uses multi-threads.
TEST_F(ReplicationTest, TestNSingleReplication) {
WithAllDevices(
{XlaDeviceType::TPU},
2 changes: 1 addition & 1 deletion test/ds/test_dynamic_shapes.py
@@ -186,7 +186,7 @@ def test_masked_select_shape(self):
def test_nonzero_cast(self):
t1 = torch.ones(5, 2, device='xla')
# Result of the nonzero should be the index type. Currently
# index type is s64 on cpu and gpu, but s32 on TPU. We should be
# index type is s64 on cpu, but s32 on TPU. We should be
# able to cast it to any other type without error.
t2 = torch.nonzero(t1.int()).float()
torch_xla.sync()
101 changes: 14 additions & 87 deletions test/dynamo/test_dynamo.py
@@ -148,17 +148,6 @@ def fn_simple(self, x, y):
b = torch.sin(y)
return a + b

def _choose_proper_device(self, initialize_on_cuda):
if not initialize_on_cuda:
return torch_xla.device()

assert initialize_on_cuda
if xr.device_type() != "CUDA" or not torch.cuda.is_available():
self.skipTest(
"Skip this test because it requires xr.device_type()=='CUDA' and torch.cuda.is_available()."
)
return "cuda:0"

@skipOnNeuron
def test_simple_model(self):
device = torch_xla.device()
@@ -193,71 +182,23 @@ def test_simple_model(self):
# Dynamo has to sync the inputs since they are intermediate IR (xla_xy and xla_y3)
self.assertEqual(met.counter_value('DynamoSyncInputExecuteTime'), 1)

# Tests that the dynamo bridge automatically moves tensors to XLA device,
# then back to the original device.
@unittest.skipIf(xr.device_type() != "CUDA" or not torch.cuda.is_available(),
f"GPU tests should only run on GPU devices.")
@parameterized.parameters(
"0",
"1",
)
def test_simple_model_automoves_tensors(self, zero_copy_enabled):
x = torch.tensor(100.0, requires_grad=True, device="cuda:0")
y = torch.tensor(200.0, requires_grad=True, device="cuda:0")
original_device = x.device
eager_result = self.fn_simple(x, y)

# Since all tests run in the same process, have to reset the metrics report.
met.clear_all()
torch._dynamo.reset()

fn_simple_dynamo = torch.compile(self.fn_simple, backend="openxla")
res_xla_dynamo = fn_simple_dynamo(x, y)
self.assertIn('xla::add', met.counter_names())
self.assertTrue(res_xla_dynamo.device == original_device)
self.assertTrue(torch.allclose(eager_result, res_xla_dynamo))

# verify that tracing is skipped in following runs
met.clear_counters()
res_xla_dynamo_reused = fn_simple_dynamo(x, y)
self.assertNotIn('xla::add', met.counter_names())
self.assertTrue(res_xla_dynamo_reused.device == original_device)
self.assertTrue(torch.allclose(eager_result, res_xla_dynamo_reused))

# verify that dynamo can handle different inputs
res_xla_dynamo_different = fn_simple_dynamo(x + y, y * 3)
res_cpu_3 = self.fn_simple(x + y, y * 3)
self.assertTrue(res_xla_dynamo_different.device == original_device)
self.assertTrue(torch.allclose(res_cpu_3, res_xla_dynamo_different))

# There should not be any fallbacks.
self.assertEqual(torch_xla._XLAC._get_executed_fallback_ops(), [])

@parameterized.parameters(
True,
False,
)
def test_fn_without_input(self, initialize_on_cuda):
def test_fn_without_input(self):

def fn_without_input(device):
constant = 0.835
expanded = torch.full((4, 4), constant, device=device)
arange = torch.arange(16, device=device).reshape(4, 4)
return expanded + arange

device = self._choose_proper_device(initialize_on_cuda)
device = torch_xla.device()

compiled_fn = torch.compile(fn_without_input, backend='openxla')
res_cpu = fn_without_input('cpu')
res_xla_dynamo = compiled_fn(device)
self.assertTrue(torch.allclose(res_cpu, res_xla_dynamo.cpu()))

@parameterized.parameters(
(True, 'openxla'),
(False, dynamo_backend2.dynamo_backend),
(False, 'openxla'),
)
def test_simple_model_with_in_place_ops(self, initialize_on_cuda, backend):
@parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
def test_simple_model_with_in_place_ops(self, backend):

class TestModel(nn.Module):

@@ -279,7 +220,7 @@ def forward(self, index, copy_tensor, input_tensor, op_name):
output = input_tensor + self.self_tensor
return output

device = self._choose_proper_device(initialize_on_cuda)
device = torch_xla.device()

torch._dynamo.reset()
met.clear_all()
@@ -306,18 +247,14 @@ def forward(self, index, copy_tensor, input_tensor, op_name):
op_name=in_place_op)
self.assertTrue(torch.allclose(res_cpu, res_device_dynamo.cpu()))

@parameterized.parameters(
(True, 'openxla'),
(False, dynamo_backend2.dynamo_backend),
(False, 'openxla'),
)
def test_einsum(self, initialize_on_cuda, backend):
@parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
def test_einsum(self, backend):
# einsum currently does not have a meta function to compute the shape, hence it
# will fall back to XLA with a FakeTensor input to infer the output shape.
def einsum_mm(a, b):
return torch.einsum('ijkl,ijlm->ijkm', a, b)

device = self._choose_proper_device(initialize_on_cuda)
device = torch_xla.device()
a = torch.randn(4, 4, 4, 4).to(device)
b = torch.randn(4, 4, 4, 4).to(device)
torch_xla.sync()
@@ -328,16 +265,10 @@ def einsum_mm(a, b):
self.assertTrue(
torch.allclose(res_device_non_dynamo.cpu(), res_device_dynamo.cpu()))

@parameterized.parameters(
True,
False,
)
def test_simple_model_with_different_input_shape(self, initialize_on_cuda):
def test_simple_model_with_different_input_shape(self):
met.clear_all()
device = self._choose_proper_device(initialize_on_cuda)
# We need to make `dim` depend on `initialize_on_cuda` because the XLA compilation cache
# does not clean itself between the parameterized tests.
dim = 5 + int(initialize_on_cuda)
device = torch_xla.device()
dim = 5
device_x = torch.randn(dim, dim).to(device)
device_y = torch.randn(dim, dim).to(device)
new_dim = 2 * dim
@@ -369,13 +300,9 @@ def get_loader(self, device, sample_count, batch_size=4):

@skipOnTpu
@skipOnNeuron
@parameterized.parameters(
(True, 'openxla'),
(False, dynamo_backend2.dynamo_backend),
(False, 'openxla'),
)
def test_resnet18(self, initialize_on_cuda, backend):
device = self._choose_proper_device(initialize_on_cuda)
@parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
def test_resnet18(self, backend):
device = torch_xla.device()
sample_count = xu.getenv_as('SAMPLE_COUNT', int, defval=10)
loader = self.get_loader(device, sample_count, batch_size=4)
resnet18 = torchvision.models.resnet18()
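
With the CUDA path gone, every dynamo test above targets the default XLA device and the `openxla` backend. A minimal sketch of that pattern, using a hypothetical `fn_simple` stand-in rather than code from this diff:

# Illustrative only: compile for the default XLA device with the "openxla"
# dynamo backend and compare against an eager CPU reference.
import torch
import torch_xla

def fn_simple(x, y):
  return torch.cos(x) + torch.sin(y)

device = torch_xla.device()  # default XLA device (CPU, TPU, Neuron, ...)
compiled = torch.compile(fn_simple, backend="openxla")

x = torch.randn(4, 4)
y = torch.randn(4, 4)
res_cpu = fn_simple(x, y)                       # eager reference on CPU
res_xla = compiled(x.to(device), y.to(device))  # traced and compiled via XLA
assert torch.allclose(res_cpu, res_xla.cpu(), atol=1e-5)
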
2 changes: 1 addition & 1 deletion test/dynamo/test_traceable_collectives.py
@@ -20,7 +20,7 @@ def collective_broadcast_and_cos(input, src):
def _mp_fn(index):
device = torch_xla.device()
world_size = xr.world_size()
if xm.xla_device_hw(device) not in ('TPU', 'CUDA', 'NEURON'):
if xm.xla_device_hw(device) not in ('TPU', 'NEURON'):
print(f'skip this test for hw {xm.xla_device_hw(device)}')
return
ordinal_tensor = torch.tensor([index], dtype=torch.float).to(device)
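
The collective test above now runs only on TPU and NEURON hardware. A hedged sketch of the same guard-then-collective pattern; `xm.all_reduce` and `torch_xla.launch` are assumptions standing in for the traceable collective and launcher actually used in the file:

# Illustrative sketch, not code from this diff.
import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr

def _mp_fn(index):
  device = torch_xla.device()
  # Same hardware guard style as the test above.
  if xm.xla_device_hw(device) not in ('TPU', 'NEURON'):
    print(f'skip this test for hw {xm.xla_device_hw(device)}')
    return
  t = torch.tensor([index], dtype=torch.float, device=device)
  # all_reduce stands in here for the traceable collective under test.
  summed = xm.all_reduce(xm.REDUCE_SUM, t)
  torch_xla.sync()
  assert summed.item() == sum(range(xr.world_size()))

if __name__ == '__main__':
  torch_xla.launch(_mp_fn, args=())
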
9 changes: 2 additions & 7 deletions test/pjrt/test_runtime.py
@@ -17,7 +17,7 @@ class TestExperimentalPjrt(parameterized.TestCase):
def setUp(self):
xr.set_device_type('CPU')

@parameterized.parameters(('CPU', 'CPU'), ('CUDA', 'CUDA'), ('TPU', 'TPU'))
@parameterized.parameters(('CPU', 'CPU'), ('TPU', 'TPU'))
def test_device_type(self, pjrt_device, expected):
with mock.patch.dict(os.environ, {'PJRT_DEVICE': pjrt_device}, clear=True):
self.assertEqual(xr.device_type(), expected)
@@ -69,11 +69,6 @@ def test_xla_device_error(self):
}, True), ('pjrt_tpu_precedence', {
'PJRT_DEVICE': 'TPU',
'XRT_TPU_CONFIG': 'localservice;0;localhost:51011',
}, True), ('gpu_num_devives', {
'GPU_NUM_DEVICES': '4'
}, True), ('pjrt_gpu', {
'PJRT_DEVICE': 'CUDA',
'GPU_NUM_DEVICES': '4'
}, True))
def test_pjrt_default_device(self, env_vars, expect_using_pjrt):
# Prevent flag checking during reinitialization of PJRT backend.
@@ -86,7 +81,7 @@ def test_pjrt_default_device(self, env_vars, expect_using_pjrt):
reload(torch_xla)
logs_context = contextlib.nullcontext()
if expect_using_pjrt:
self.assertIn(xr.device_type(), ['CPU', 'CUDA', 'TPU', 'NEURON'])
self.assertIn(xr.device_type(), ['CPU', 'TPU', 'NEURON'])
else:
self.assertIsNone(xr.device_type())

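
Since the CUDA cases were dropped, device-type selection in these runtime tests is driven entirely by the `PJRT_DEVICE` environment variable. A small illustrative sketch of that mapping, mirroring the `mock.patch.dict` pattern used in the test above:

# Illustrative only: check the PJRT_DEVICE -> xr.device_type() mapping
# for the remaining device types, without touching the real environment.
import os
from unittest import mock

import torch_xla.runtime as xr

for pjrt_device, expected in (('CPU', 'CPU'), ('TPU', 'TPU')):
  with mock.patch.dict(os.environ, {'PJRT_DEVICE': pjrt_device}, clear=True):
    assert xr.device_type() == expected
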
17 changes: 1 addition & 16 deletions test/pytorch_test_base.py
@@ -295,7 +295,7 @@
'test_leaky_relu_inplace_with_neg_slope_xla', # expecting a specific error message
'test_upsamplingBicubic2d_correctness_xla', # FIXME! Got dtypes torch.float32 and torch.float64
'test_CTCLoss_no_batch_dim_xla', # Value out of range
'test_upsamplingBilinear2d_xla', # precision on GPU/TPU, slow compilation on CPU
'test_upsamplingBilinear2d_xla', # precision on TPU, slow compilation on CPU
# torch.autograd.gradcheck.GradcheckError: Jacobian mismatch for output 0 with respect to input 0
'test_GRU_grad_and_gradgrad_xla_float64', # grad check failure
'test_LSTM_grad_and_gradgrad_xla_float64', # grad check failure
@@ -475,18 +475,6 @@
},
}

DISABLED_TORCH_TESTS_GPU_ONLY = {
# test_torch.py
'TestTorchDeviceTypeXLA': {
'test_maximum_minimum_float_nan_and_inf', # maximum(nan,inf) = inf on GPU
},

# test_indexing.py
'TestIndexingXLA': {
'test_index_put_accumulate_large_tensor_xla', # illegal memory access was encountered
},
}


class MatchSet(object):

@@ -526,15 +514,12 @@ def union_of_disabled_tests(sets):


DISABLED_TORCH_TESTS_CPU = DISABLED_TORCH_TESTS_ANY
DISABLED_TORCH_TESTS_GPU = union_of_disabled_tests(
[DISABLED_TORCH_TESTS_ANY, DISABLED_TORCH_TESTS_GPU_ONLY])
DISABLED_TORCH_TESTS_TPU = union_of_disabled_tests(
[DISABLED_TORCH_TESTS_ANY, DISABLED_TORCH_TESTS_TPU_ONLY])

DISABLED_TORCH_TESTS = {
'TPU': prepare_match_set(DISABLED_TORCH_TESTS_TPU),
'CPU': prepare_match_set(DISABLED_TORCH_TESTS_CPU),
'CUDA': prepare_match_set(DISABLED_TORCH_TESTS_GPU),
}


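
With the GPU-only list removed, the disabled-test table reduces to the shared set plus the TPU overlay. A hedged sketch of how such per-class dict-of-sets unions can be merged; `union_of_disabled_tests_sketch` is an illustrative re-implementation, not the helper actually defined in pytorch_test_base.py:

# Illustrative sketch of merging per-test-class "disabled test" dicts.
from collections import defaultdict

def union_of_disabled_tests_sketch(sets):
  merged = defaultdict(set)
  for disabled in sets:
    for test_class, names in disabled.items():
      merged[test_class] |= set(names)
  return dict(merged)

DISABLED_ANY = {'TestTorchDeviceTypeXLA': {'test_a'}}
DISABLED_TPU_ONLY = {'TestTorchDeviceTypeXLA': {'test_b'}}
assert union_of_disabled_tests_sketch(
    [DISABLED_ANY, DISABLED_TPU_ONLY]) == {
        'TestTorchDeviceTypeXLA': {'test_a', 'test_b'}}
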
1 change: 0 additions & 1 deletion test/run_tests.sh
@@ -254,7 +254,6 @@ function run_xla_op_tests3 {
run_test "$_TEST_DIR/test_devices.py"
run_test "$_TEST_DIR/test_manual_xla_registration.py"
run_test_multi_devices "$_TEST_DIR/spmd/test_xla_dtensor_placements.py"
# NOTE: this line below is testing export and don't care about GPU
PJRT_DEVICE=CPU CPU_NUM_DEVICES=1 run_coverage "$_TEST_DIR/test_core_aten_ops.py"
run_test "$_TEST_DIR/test_pallas.py"
run_xla_ir_hlo_debug run_test "$_TEST_DIR/test_user_computation_debug_cache.py"
76 changes: 0 additions & 76 deletions test/test_autocast.py
@@ -152,82 +152,6 @@ def __init__(self, dev):
self.methods_bf16 = [("__matmul__", mat0_bf16 + mat1_fp32)]


class AutocastCudaTestExtraLists(object):

def __init__(self, dev):
super().__init__()
n = 8
dimsets = ((n, n, n), (n, n, n, n), (n, n, n, n, n))
conv_args_fp32 = [(torch.randn(dimset, dtype=torch.float32, device=dev),
torch.randn(dimset, dtype=torch.float32, device=dev))
for dimset in dimsets]

mat0_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
mat1_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
mat2_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
mat3_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)

pointwise0_fp32 = (torch.randn(n, dtype=torch.float32, device=dev),)

element0_fp32 = (torch.randn(1, dtype=torch.float32, device=dev),)

# This is currently not part of AutocastTestLists and excludes `relu`, `addbmm`
self.torch_bf16 = [
("conv1d", conv_args_fp32[0]),
("conv2d", conv_args_fp32[1]),
("conv3d", conv_args_fp32[2]),
("bmm", (torch.randn((n, n, n), device=dev, dtype=torch.float32),
torch.randn((n, n, n), device=dev, dtype=torch.float32))),
("mm", mat0_fp32 + mat1_fp32),
("matmul",
torch.matmul(
torch.ones([2, 3], device=dev, dtype=torch.float32),
torch.ones([3, 2], device=dev, dtype=torch.float32))),
("baddbmm", (torch.randn((n, n, n), device=dev, dtype=torch.float32),
torch.randn((n, n, n), device=dev, dtype=torch.float32),
torch.randn((n, n, n), device=dev, dtype=torch.float32))),
("addmm", mat1_fp32 + mat2_fp32 + mat3_fp32),
("conv_tbc", (torch.randn((10, 7, 3), device=dev, dtype=torch.float32),
torch.randn((5, 3, 5), device=dev, dtype=torch.float32),
torch.randn(5, device=dev, dtype=torch.float32), 0)),
("conv_transpose1d", conv_args_fp32[0]),
("conv_transpose2d", conv_args_fp32[1]),
("conv_transpose3d", conv_args_fp32[2]),
("prelu", pointwise0_fp32 + element0_fp32),
]


class AutocastCudaTestUnsupportedLists(object):

def __init__(self):
super().__init__()
# Utility arguments, created as one-element tuples
self.torch_expect_builtin_promote = [
"cat", # requires all input tensors to be the same type
"equal", # requires all input tensors to be the same type
"stack", # return f16 instead of f32
]
self.methods_expect_builtin_promote = []

# The remaining lists organize ops that autocast treats explicitly.
self.torch_fp16 = [
"_convolution_nogroup", # need lowering
"addmv", # need lowering
]
self.torch_fp32 = [
"norm", # produce f16 instead of f32
]
self.torch_need_autocast_promote = [
"scatter_add", # cat currently requires all input tensors to be the same type
]
self.nn_fp16 = []
self.nn_fp32 = []
self.linalg_fp16 = []
self.methods_fp16 = []
self.methods_fp32 = []
self.banned = []


class TestAutocastBase(unittest.TestCase):

@classmethod
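
The deleted classes enumerated CUDA-specific autocast coverage; the remaining XLA autocast tests exercise the bf16 path. A hedged sketch of that path, assuming `torch_xla.amp.autocast` accepts the XLA device and defaults to bfloat16 on TPU:

# Illustrative only: bf16 autocast on the XLA device.
import torch
import torch_xla
from torch_xla.amp import autocast

device = torch_xla.device()
a = torch.randn(8, 8, device=device)
b = torch.randn(8, 8, device=device)
with autocast(device):   # assumed to default to bfloat16 autocast on TPU
  out = torch.mm(a, b)   # matmul-style ops run in the lower precision
torch_xla.sync()
print(out.dtype)         # expected: torch.bfloat16 on TPU
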