diff --git a/test/cpp/test_aten_xla_tensor_2.cpp b/test/cpp/test_aten_xla_tensor_2.cpp
index 31491249f618..dc3d605da34a 100644
--- a/test/cpp/test_aten_xla_tensor_2.cpp
+++ b/test/cpp/test_aten_xla_tensor_2.cpp
@@ -1555,20 +1555,18 @@ TEST_F(AtenXlaTensorTest, TestGroupNormBackward) {
                                /*cudnn_enabled=*/false);
       };
       torch::Tensor undef;
-      ForEachDevice({XlaDeviceType::CUDA, XlaDeviceType::TPU},
-                    [&](const torch::Device& device) {
-                      TestBackward({input, undef_weight ? undef : weight,
-                                    undef_weight ? undef : bias},
-                                   device, testfn,
-                                   /*rtol=*/1e-3, /*atol=*/1e-3,
-                                   /*derivative_level=*/2);
-                      ExpectCounterNotChanged("aten::.*",
-                                              cpp_test::GetIgnoredCounters());
-                      ExpectCounterChanged("xla::native_batch_norm",
-                                           cpp_test::GetIgnoredCounters());
-                      ExpectCounterChanged("xla::native_batch_norm_backward",
-                                           cpp_test::GetIgnoredCounters());
-                    });
+      ForEachDevice({XlaDeviceType::TPU}, [&](const torch::Device& device) {
+        TestBackward(
+            {input, undef_weight ? undef : weight, undef_weight ? undef : bias},
+            device, testfn,
+            /*rtol=*/1e-3, /*atol=*/1e-3,
+            /*derivative_level=*/2);
+        ExpectCounterNotChanged("aten::.*", cpp_test::GetIgnoredCounters());
+        ExpectCounterChanged("xla::native_batch_norm",
+                             cpp_test::GetIgnoredCounters());
+        ExpectCounterChanged("xla::native_batch_norm_backward",
+                             cpp_test::GetIgnoredCounters());
+      });
     }
   }
 }
diff --git a/test/cpp/test_aten_xla_tensor_6.cpp b/test/cpp/test_aten_xla_tensor_6.cpp
index df3e1280b6e2..b9a669760b1b 100644
--- a/test/cpp/test_aten_xla_tensor_6.cpp
+++ b/test/cpp/test_aten_xla_tensor_6.cpp
@@ -873,7 +873,7 @@ TEST_F(AtenXlaTensorTest, TestEmbeddingBackward) {
 TEST_F(AtenXlaTensorTest, TestAmpUpdateScale) {
   XlaDeviceType hw_type =
       static_cast<XlaDeviceType>(bridge::GetDefaultDevice()->type());
-  if (hw_type != XlaDeviceType::CUDA && hw_type != XlaDeviceType::CPU) {
+  if (hw_type != XlaDeviceType::CPU) {
     return;
   }
   torch::Tensor growth_tracker =
diff --git a/test/cpp/test_replication.cpp b/test/cpp/test_replication.cpp
index 00ec937bf8a1..73db4ab42392 100644
--- a/test/cpp/test_replication.cpp
+++ b/test/cpp/test_replication.cpp
@@ -98,9 +98,7 @@ void TestSingleReplication(
 
 class ReplicationTest : public AtenXlaTensorTestBase {};
 
-// Parallelism for DataParallel uses multi-threads. But cuda assumes one GPU
-// device per process instead of relying on threads so we will not run the test
-// on GPU.
+// Parallelism for DataParallel uses multiple threads.
 TEST_F(ReplicationTest, TestNSingleReplication) {
   WithAllDevices(
       {XlaDeviceType::TPU},
diff --git a/test/ds/test_dynamic_shapes.py b/test/ds/test_dynamic_shapes.py
index 46f329de4537..355a795bd14e 100644
--- a/test/ds/test_dynamic_shapes.py
+++ b/test/ds/test_dynamic_shapes.py
@@ -186,7 +186,7 @@ def test_masked_select_shape(self):
   def test_nonzero_cast(self):
     t1 = torch.ones(5, 2, device='xla')
     # Result of the nonzero should be the index type. Currently
-    # index type is s64 on cpu and gpu, but s32 on TPU. We should be
+    # index type is s64 on cpu, but s32 on TPU. We should be
     # able to cast it to any other type without error.
     t2 = torch.nonzero(t1.int()).float()
     torch_xla.sync()
diff --git a/test/dynamo/test_dynamo.py b/test/dynamo/test_dynamo.py
index 2c05adf7716c..c106483e1a37 100644
--- a/test/dynamo/test_dynamo.py
+++ b/test/dynamo/test_dynamo.py
@@ -148,17 +148,6 @@ def fn_simple(self, x, y):
     b = torch.sin(y)
     return a + b
 
-  def _choose_proper_device(self, initialize_on_cuda):
-    if not initialize_on_cuda:
-      return torch_xla.device()
-
-    assert initialize_on_cuda
-    if xr.device_type() != "CUDA" or not torch.cuda.is_available():
-      self.skipTest(
-          "Skip this test because it requires xr.device_type()=='CUDA' and torch.cuda.is_available()."
-      )
-    return "cuda:0"
-
   @skipOnNeuron
   def test_simple_model(self):
     device = torch_xla.device()
@@ -193,51 +182,7 @@ def test_simple_model(self):
     # Dynamo has to sync the input since they are intermedate IR(xla_xy and xla_y3)
     self.assertEqual(met.counter_value('DynamoSyncInputExecuteTime'), 1)
 
-  # Tests that the dynamo bridge automatically moves tensors to XLA device,
-  # then back to the original device.
-  @unittest.skipIf(xr.device_type() != "CUDA" or not torch.cuda.is_available(),
-                   f"GPU tests should only run on GPU devices.")
-  @parameterized.parameters(
-      "0",
-      "1",
-  )
-  def test_simple_model_automoves_tensors(self, zero_copy_enabled):
-    x = torch.tensor(100.0, requires_grad=True, device="cuda:0")
-    y = torch.tensor(200.0, requires_grad=True, device="cuda:0")
-    original_device = x.device
-    eager_result = self.fn_simple(x, y)
-
-    # Since all tests run in the same process, have to reset the metrics report.
-    met.clear_all()
-    torch._dynamo.reset()
-
-    fn_simple_dynamo = torch.compile(self.fn_simple, backend="openxla")
-    res_xla_dynamo = fn_simple_dynamo(x, y)
-    self.assertIn('xla::add', met.counter_names())
-    self.assertTrue(res_xla_dynamo.device == original_device)
-    self.assertTrue(torch.allclose(eager_result, res_xla_dynamo))
-
-    # verify that tracing is skipped in following runs
-    met.clear_counters()
-    res_xla_dynamo_reused = fn_simple_dynamo(x, y)
-    self.assertNotIn('xla::add', met.counter_names())
-    self.assertTrue(res_xla_dynamo_reused.device == original_device)
-    self.assertTrue(torch.allclose(eager_result, res_xla_dynamo_reused))
-
-    # verify that dynamo can handle different inputs
-    res_xla_dynamo_different = fn_simple_dynamo(x + y, y * 3)
-    res_cpu_3 = self.fn_simple(x + y, y * 3)
-    self.assertTrue(res_xla_dynamo_different.device == original_device)
-    self.assertTrue(torch.allclose(res_cpu_3, res_xla_dynamo_different))
-
-    # There should not be any fallbacks.
-    self.assertEqual(torch_xla._XLAC._get_executed_fallback_ops(), [])
-
-  @parameterized.parameters(
-      True,
-      False,
-  )
-  def test_fn_without_input(self, initialize_on_cuda):
+  def test_fn_without_input(self):
 
     def fn_without_input(device):
       constant = 0.835
@@ -245,19 +190,15 @@ def fn_without_input(device):
       arange = torch.arange(16, device=device).reshape(4, 4)
       return expanded + arange
 
-    device = self._choose_proper_device(initialize_on_cuda)
+    device = torch_xla.device()
 
     compiled_fn = torch.compile(fn_without_input, backend='openxla')
     res_cpu = fn_without_input('cpu')
     res_xla_dynamo = compiled_fn(device)
     self.assertTrue(torch.allclose(res_cpu, res_xla_dynamo.cpu()))
 
-  @parameterized.parameters(
-      (True, 'openxla'),
-      (False, dynamo_backend2.dynamo_backend),
-      (False, 'openxla'),
-  )
-  def test_simple_model_with_in_place_ops(self, initialize_on_cuda, backend):
+  @parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
+  def test_simple_model_with_in_place_ops(self, backend):
 
     class TestModel(nn.Module):
 
@@ -279,7 +220,7 @@ def forward(self, index, copy_tensor, input_tensor, op_name):
         output = input_tensor + self.self_tensor
         return output
 
-    device = self._choose_proper_device(initialize_on_cuda)
+    device = torch_xla.device()
 
     torch._dynamo.reset()
     met.clear_all()
@@ -306,18 +247,14 @@ def forward(self, index, copy_tensor, input_tensor, op_name):
           op_name=in_place_op)
       self.assertTrue(torch.allclose(res_cpu, res_device_dynamo.cpu()))
 
-  @parameterized.parameters(
-      (True, 'openxla'),
-      (False, dynamo_backend2.dynamo_backend),
-      (False, 'openxla'),
-  )
-  def test_einsum(self, initialize_on_cuda, backend):
+  @parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
+  def test_einsum(self, backend):
    # einsum currently does not have meta function to compute the shape hence
    # will fallback to XLA with FakeTensor as input to infer the output shape.
    def einsum_mm(a, b):
      return torch.einsum('ijkl,ijlm->ijkm', a, b)
 
-    device = self._choose_proper_device(initialize_on_cuda)
+    device = torch_xla.device()
     a = torch.randn(4, 4, 4, 4).to(device)
     b = torch.randn(4, 4, 4, 4).to(device)
     torch_xla.sync()
@@ -328,16 +265,10 @@ def einsum_mm(a, b):
     self.assertTrue(
         torch.allclose(res_device_non_dynamo.cpu(), res_device_dynamo.cpu()))
 
-  @parameterized.parameters(
-      True,
-      False,
-  )
-  def test_simple_model_with_different_input_shape(self, initialize_on_cuda):
+  def test_simple_model_with_different_input_shape(self):
     met.clear_all()
-    device = self._choose_proper_device(initialize_on_cuda)
-    # We need to make `dim` depend on `initialize_on_cuda` because the XLA compilation cache
-    # does not clean itself between the parameterized tests.
-    dim = 5 + int(initialize_on_cuda)
+    device = torch_xla.device()
+    dim = 5
     device_x = torch.randn(dim, dim).to(device)
     device_y = torch.randn(dim, dim).to(device)
     new_dim = 2 * dim
@@ -369,13 +300,9 @@ def get_loader(self, device, sample_count, batch_size=4):
 
   @skipOnTpu
   @skipOnNeuron
-  @parameterized.parameters(
-      (True, 'openxla'),
-      (False, dynamo_backend2.dynamo_backend),
-      (False, 'openxla'),
-  )
-  def test_resnet18(self, initialize_on_cuda, backend):
-    device = self._choose_proper_device(initialize_on_cuda)
+  @parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
+  def test_resnet18(self, backend):
+    device = torch_xla.device()
     sample_count = xu.getenv_as('SAMPLE_COUNT', int, defval=10)
     loader = self.get_loader(device, sample_count, batch_size=4)
     resnet18 = torchvision.models.resnet18()
diff --git a/test/dynamo/test_traceable_collectives.py b/test/dynamo/test_traceable_collectives.py
index 45bd89266604..e9416dcb2e05 100644
--- a/test/dynamo/test_traceable_collectives.py
+++ b/test/dynamo/test_traceable_collectives.py
@@ -20,7 +20,7 @@ def collective_broadcast_and_cos(input, src):
 def _mp_fn(index):
   device = torch_xla.device()
   world_size = xr.world_size()
-  if xm.xla_device_hw(device) not in ('TPU', 'CUDA', 'NEURON'):
+  if xm.xla_device_hw(device) not in ('TPU', 'NEURON'):
     print(f'skip this test for hw {xm.xla_device_hw(device)}')
     return
   ordinal_tensor = torch.tensor([index], dtype=torch.float).to(device)
diff --git a/test/pjrt/test_runtime.py b/test/pjrt/test_runtime.py
index 6529b5e826e1..31af6acd4e36 100644
--- a/test/pjrt/test_runtime.py
+++ b/test/pjrt/test_runtime.py
@@ -17,7 +17,7 @@ class TestExperimentalPjrt(parameterized.TestCase):
   def setUp(self):
     xr.set_device_type('CPU')
 
-  @parameterized.parameters(('CPU', 'CPU'), ('CUDA', 'CUDA'), ('TPU', 'TPU'))
+  @parameterized.parameters(('CPU', 'CPU'), ('TPU', 'TPU'))
   def test_device_type(self, pjrt_device, expected):
     with mock.patch.dict(os.environ, {'PJRT_DEVICE': pjrt_device}, clear=True):
       self.assertEqual(xr.device_type(), expected)
@@ -69,11 +69,6 @@ def test_xla_device_error(self):
   }, True), ('pjrt_tpu_precedence', {
       'PJRT_DEVICE': 'TPU',
       'XRT_TPU_CONFIG': 'localservice;0;localhost:51011',
-  }, True), ('gpu_num_devives', {
-      'GPU_NUM_DEVICES': '4'
-  }, True), ('pjrt_gpu', {
-      'PJRT_DEVICE': 'CUDA',
-      'GPU_NUM_DEVICES': '4'
   }, True))
   def test_pjrt_default_device(self, env_vars, expect_using_pjrt):
     # Prevent flag checking during reinitialization of PJRT backend.
@@ -86,7 +81,7 @@ def test_pjrt_default_device(self, env_vars, expect_using_pjrt):
     reload(torch_xla)
     logs_context = contextlib.nullcontext()
     if expect_using_pjrt:
-      self.assertIn(xr.device_type(), ['CPU', 'CUDA', 'TPU', 'NEURON'])
+      self.assertIn(xr.device_type(), ['CPU', 'TPU', 'NEURON'])
     else:
       self.assertIsNone(xr.device_type())
 
diff --git a/test/pytorch_test_base.py b/test/pytorch_test_base.py
index 3355f8efba99..7c426deca1e0 100644
--- a/test/pytorch_test_base.py
+++ b/test/pytorch_test_base.py
@@ -295,7 +295,7 @@
     'test_leaky_relu_inplace_with_neg_slope_xla',  # expecting a specific error message
     'test_upsamplingBicubic2d_correctness_xla',  # FIXME! Got dtypes torch.float32 and torch.float64
     'test_CTCLoss_no_batch_dim_xla',  # Value out of range
-    'test_upsamplingBilinear2d_xla',  # precision on GPU/TPU, slow compilation on CPU
+    'test_upsamplingBilinear2d_xla',  # precision on TPU, slow compilation on CPU
     # torch.autograd.gradcheck.GradcheckError: Jacobian mismatch for output 0 with respect to input 0
     'test_GRU_grad_and_gradgrad_xla_float64',  # grad check failure
     'test_LSTM_grad_and_gradgrad_xla_float64',  # grad check failure
@@ -475,18 +475,6 @@
     },
 }
 
-DISABLED_TORCH_TESTS_GPU_ONLY = {
-    # test_torch.py
-    'TestTorchDeviceTypeXLA': {
-        'test_maximum_minimum_float_nan_and_inf',  # maximum(nan,inf) = inf on GPU
-    },
-
-    # test_indexing.py
-    'TestIndexingXLA': {
-        'test_index_put_accumulate_large_tensor_xla',  # illegal memory access was encountered
-    },
-}
-
 
 class MatchSet(object):
 
@@ -526,15 +514,12 @@ def union_of_disabled_tests(sets):
 
 
 DISABLED_TORCH_TESTS_CPU = DISABLED_TORCH_TESTS_ANY
-DISABLED_TORCH_TESTS_GPU = union_of_disabled_tests(
-    [DISABLED_TORCH_TESTS_ANY, DISABLED_TORCH_TESTS_GPU_ONLY])
 DISABLED_TORCH_TESTS_TPU = union_of_disabled_tests(
     [DISABLED_TORCH_TESTS_ANY, DISABLED_TORCH_TESTS_TPU_ONLY])
 
 DISABLED_TORCH_TESTS = {
     'TPU': prepare_match_set(DISABLED_TORCH_TESTS_TPU),
     'CPU': prepare_match_set(DISABLED_TORCH_TESTS_CPU),
-    'CUDA': prepare_match_set(DISABLED_TORCH_TESTS_GPU),
 }
 
diff --git a/test/run_tests.sh b/test/run_tests.sh
index 033089d651f5..bb03d7abe161 100755
--- a/test/run_tests.sh
+++ b/test/run_tests.sh
@@ -254,7 +254,6 @@ function run_xla_op_tests3 {
   run_test "$_TEST_DIR/test_devices.py"
   run_test "$_TEST_DIR/test_manual_xla_registration.py"
   run_test_multi_devices "$_TEST_DIR/spmd/test_xla_dtensor_placements.py"
-  # NOTE: this line below is testing export and don't care about GPU
   PJRT_DEVICE=CPU CPU_NUM_DEVICES=1 run_coverage "$_TEST_DIR/test_core_aten_ops.py"
   run_test "$_TEST_DIR/test_pallas.py"
   run_xla_ir_hlo_debug run_test "$_TEST_DIR/test_user_computation_debug_cache.py"
diff --git a/test/test_autocast.py b/test/test_autocast.py
index 32f72ae9762a..c16fee30a725 100644
--- a/test/test_autocast.py
+++ b/test/test_autocast.py
@@ -152,82 +152,6 @@ def __init__(self, dev):
     self.methods_bf16 = [("__matmul__", mat0_bf16 + mat1_fp32)]
 
 
-class AutocastCudaTestExtraLists(object):
-
-  def __init__(self, dev):
-    super().__init__()
-    n = 8
-    dimsets = ((n, n, n), (n, n, n, n), (n, n, n, n, n))
-    conv_args_fp32 = [(torch.randn(dimset, dtype=torch.float32, device=dev),
-                       torch.randn(dimset, dtype=torch.float32, device=dev))
-                      for dimset in dimsets]
-
-    mat0_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
-    mat1_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
-    mat2_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
-    mat3_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
-
-    pointwise0_fp32 = (torch.randn(n, dtype=torch.float32, device=dev),)
-
-    element0_fp32 = (torch.randn(1, dtype=torch.float32, device=dev),)
-
-    # This is currently not part of AutocastTestLists and excludes `relu`, `addbmm`
-    self.torch_bf16 = [
-        ("conv1d", conv_args_fp32[0]),
-        ("conv2d", conv_args_fp32[1]),
-        ("conv3d", conv_args_fp32[2]),
-        ("bmm", (torch.randn((n, n, n), device=dev, dtype=torch.float32),
-                 torch.randn((n, n, n), device=dev, dtype=torch.float32))),
-        ("mm", mat0_fp32 + mat1_fp32),
-        ("matmul",
-         torch.matmul(
-             torch.ones([2, 3], device=dev, dtype=torch.float32),
-             torch.ones([3, 2], device=dev, dtype=torch.float32))),
-        ("baddbmm", (torch.randn((n, n, n), device=dev, dtype=torch.float32),
-                     torch.randn((n, n, n), device=dev, dtype=torch.float32),
-                     torch.randn((n, n, n), device=dev, dtype=torch.float32))),
-        ("addmm", mat1_fp32 + mat2_fp32 + mat3_fp32),
-        ("conv_tbc", (torch.randn((10, 7, 3), device=dev, dtype=torch.float32),
-                      torch.randn((5, 3, 5), device=dev, dtype=torch.float32),
-                      torch.randn(5, device=dev, dtype=torch.float32), 0)),
-        ("conv_transpose1d", conv_args_fp32[0]),
-        ("conv_transpose2d", conv_args_fp32[1]),
-        ("conv_transpose3d", conv_args_fp32[2]),
-        ("prelu", pointwise0_fp32 + element0_fp32),
-    ]
-
-
-class AutocastCudaTestUnsupportedLists(object):
-
-  def __init__(self):
-    super().__init__()
-    # Utility arguments, created as one-element tuples
-    self.torch_expect_builtin_promote = [
-        "cat",  # requires all input tensors to be the same type
-        "equal",  # requires all input tensors to be the same type
-        "stack",  # return f16 instead of f32
-    ]
-    self.methods_expect_builtin_promote = []
-
-    # The remaining lists organize ops that autocast treats explicitly.
-    self.torch_fp16 = [
-        "_convolution_nogroup",  # need lowering
-        "addmv",  # need lowering
-    ]
-    self.torch_fp32 = [
-        "norm",  # produce f16 instead of f32
-    ]
-    self.torch_need_autocast_promote = [
-        "scatter_add",  # cat currently requires all input tensors to be the same type
-    ]
-    self.nn_fp16 = []
-    self.nn_fp32 = []
-    self.linalg_fp16 = []
-    self.methods_fp16 = []
-    self.methods_fp32 = []
-    self.banned = []
-
-
 class TestAutocastBase(unittest.TestCase):
 
   @classmethod
diff --git a/test/test_compilation_cache_utils.py b/test/test_compilation_cache_utils.py
index 0ac8a013d814..1113d3b67a83 100644
--- a/test/test_compilation_cache_utils.py
+++ b/test/test_compilation_cache_utils.py
@@ -73,7 +73,7 @@ def _test_num_graph_hash(self, use_dynamo, use_persistent):
       use_persistent=(True, False),
   )
   def test_num_graph_hash(self, use_dynamo, use_persistent):
-    if use_persistent and (xr.device_type() not in {'TPU', 'CUDA', 'NEURON'}):
+    if use_persistent and (xr.device_type() not in {'TPU', 'NEURON'}):
       raise absltest.SkipTest('Device type does not support persistent caching')
     _test_spawn(self._test_num_graph_hash, (use_dynamo, use_persistent))
 
diff --git a/test/test_operations.py b/test/test_operations.py
index 9d377083da54..08c2f39d63c3 100644
--- a/test/test_operations.py
+++ b/test/test_operations.py
@@ -90,12 +90,7 @@ def skipIfFunctionalizationDisabled(reason):
 
 def onlyOnCPU(fn):
   accelerator = os.environ.get("PJRT_DEVICE").lower()
-  return unittest.skipIf(accelerator != "cpu", "PJRT_DEVICE=CUDA required")(fn)
-
-
-def onlyOnCUDA(fn):
-  accelerator = os.environ.get("PJRT_DEVICE").lower()
-  return unittest.skipIf(accelerator != "cuda", "PJRT_DEVICE=CUDA required")(fn)
+  return unittest.skipIf(accelerator != "cpu", "PJRT_DEVICE=CPU required")(fn)
 
 
 def onlyIfXLAExperimentalContains(feat):
@@ -108,79 +103,10 @@ def _gen_tensor(*args, **kwargs):
   return torch.randn(*args, **kwargs)
 
 
-def _gen_int_tensor(*args, **kwargs):
-  return torch.randint(*args, **kwargs)
-
-
 def _gen_mask(size):
   return torch.randint(0, 2, size, dtype=torch.bool)
 
 
-def _get_device_support(devname):
-  devices = torch_xla._XLAC._xla_get_devices()
-  num_devices = 0
-  for device in devices:
-    if re.match(devname + r':\d+$', device):
-      num_devices += 1
-  return DeviceSupport(num_devices=num_devices) if num_devices > 0 else None
-
-
-def _support_replicated(devname, num_devices):
-  devsup = _get_device_support(devname)
-  if not devsup:
-    return False
-  return devsup.num_devices >= num_devices
-
-
-def _random_inputs(shapes, num_replicas=1):
-  random_tensors = []
-  for _ in range(0, num_replicas):
-    replica_inputs = []
-    for shape in shapes:
-      replica_inputs.append(_gen_tensor(*shape))
-    random_tensors.append(tuple(replica_inputs))
-  return tuple(random_tensors)
-
-
-def _random_like(tensor_list):
-  random_tensors = []
-  for o in tensor_list:
-    if o.dtype == torch.float32 or o.dtype == torch.float64:
-      random_tensors += [_gen_tensor(*o.shape, dtype=o.dtype)]
-    elif o.dtype == torch.int64:
-      # TODO remove this, we shouldn't be needing to pass random_tensor for long types
-      random_tensors += [torch.empty_like(o)]
-    else:
-      raise RuntimeError('Unsupported type: ', o.dtype)
-  return random_tensors
-
-
-def _zeros_like(tensor_list):
-  zeros_tensors = []
-  for o in tensor_list:
-    if o.dtype == torch.float32 or o.dtype == torch.float64:
-      zeros_tensors += [torch.zeros(*o.shape, dtype=o.dtype)]
-    elif o.dtype == torch.int64:
-      # TODO remove this, we shouldn't be needing to pass zeros_tensor for long types
-      zeros_tensors += [torch.zeros_like(o)]
-    else:
-      raise RuntimeError('Unsupported type: ', o.dtype)
-  return zeros_tensors
-
-
-def onlyIfTorchSupportsCUDA(fn):
-  return unittest.skipIf(
-      not torch.cuda.is_available(), reason="requires PyTorch CUDA support")(
-          fn)
-
-
-def onlyIfPJRTDeviceIsCUDA(fn):
-  return unittest.skipIf(
-      os.environ.get("PJRT_DEVICE") not in ("GPU", "CUDA"),
-      reason="requires CUDA as PJRT_DEVICE")(
-          fn)
-
-
 class TestToXlaTensorArena(test_utils.XlaTestCase):
 
   def test(self):
@@ -274,10 +200,6 @@ def forward(self, x):
     return F.log_softmax(x, dim=1)
 
 
-@unittest.skipIf(
-    xr.device_type() == 'CUDA',
-    'Parallelism for DataParallel uses multi-threads. But cuda assumes one GPU device per process instead of relying on threads.'
-)
 class TestParallelTensorMNIST(test_utils.XlaTestCase):
 
   def test(self):
@@ -436,7 +358,7 @@ def test_masked_select_shape(self):
   def test_nonzero_cast(self):
     t1 = torch.ones(5, 2, device='xla')
     # Result of the nonzero should be the index type. Currently
-    # index type is s64 on cpu and gpu, but s32 on TPU. We should be
+    # index type is s64 on cpu, but s32 on TPU. We should be
     # able to cast it to any other type without error.
     t2 = torch.nonzero(t1.int()).float()
     torch_xla.sync()
@@ -3036,27 +2958,6 @@ def test_as_strided_input_larger(self):
 
     self.assertEqual(a, former_a)
 
-  def _test_move_tensor_cuda_to_xla(self, cpu_tensor):
-    # Assumes CPU-XLA data movement works.
-    cuda_tensor = cpu_tensor.to("cuda")
-    # Move tensor CUDA -> XLA.
-    xla_tensor = cuda_tensor.to('xla')
-    # Move the XLA tensor back to CPU, and check that it is the same as
-    # the original CPU tensor.
-    self.assertTrue(torch.equal(cpu_tensor, xla_tensor.cpu()))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_aten_move_cuda_to_xla(self):
-    self._test_move_tensor_cuda_to_xla(torch.arange(5))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_aten_move_scalar_cuda_to_xla(self):
-    # 0-dimensional scalar-tensor
-    # Has a different execution path than other tensors.
-    self._test_move_tensor_cuda_to_xla(torch.tensor(42))
-
   def test_unsafe_buffer_pointer(self):
     xla_device = torch_xla.device()
     xla_tensor_0 = torch.tensor(42).to(xla_device)
@@ -3083,178 +2984,6 @@ def test_unsafe_buffer_pointer(self):
     self.assertGreaterEqual(buf_ptr_3, 0)
 
 
-class TestDLPack(parameterized.TestCase):
-
-  def _test_dlpack_capsule_conversion_helper(self, xla_tensor):
-    dlpt = xdlpack.to_dlpack(xla_tensor)  # dlpt1 has type PyCapsule
-    xla_tensor2 = xdlpack.from_dlpack(dlpt)
-
-    self.assertEqual(xla_tensor.device, xla_tensor2.device)
-    self.assertTrue(torch.allclose(xla_tensor.cpu(), xla_tensor2.cpu()))
-    self.assertRaisesRegex(RuntimeError,
-                           "DLTensor capsule can be consumed only once",
-                           lambda: xdlpack.from_dlpack(dlpt))
-
-    self.assertEqual(
-        torch_xla._XLAC._unsafe_buffer_pointer(xla_tensor),
-        torch_xla._XLAC._unsafe_buffer_pointer(xla_tensor2))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  @parameterized.parameters(*all_types_and(torch.half, torch.bfloat16))
-  def test_dlpack_roundtrip_tensor(self, dtype):
-    xla_device = torch_xla.device()
-    # xtensor->CurrentDataHandle() == nullptr but xtensor->CurrentIrValue().node != nullptr and device_data != nullptr
-    # xla_tensor_2 uses XLANativeFunctions::_to_copy
-    xla_tensor_2 = torch.arange(5, dtype=dtype).to(xla_device)
-    self._test_dlpack_capsule_conversion_helper(xla_tensor_2)
-
-    # xla_tensor_3 uses arange_out IR node.
-    xla_tensor_3 = torch.arange(5, dtype=dtype, device='xla')
-    torch_xla.sync()
-    self._test_dlpack_capsule_conversion_helper(xla_tensor_3)
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  @parameterized.parameters(
-      *all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool,
-                                 torch.uint16, torch.uint32, torch.uint64))
-  def test_dlpack_roundtrip_scalar(self, dtype):
-    xla_device = torch_xla.device()
-    xla_tensor_0 = torch.tensor(42, dtype=dtype).to(xla_device)
-    # `torch_xla.sync()` ensures xtensor->CurrentDataHandle() != nullptr
-    torch_xla.sync()
-    self._test_dlpack_capsule_conversion_helper(xla_tensor_0)
-
-    xla_tensor_1 = torch.tensor(42, dtype=dtype).to(xla_device)
-    # xtensor->CurrentDataHandle() == nullptr but xtensor->CurrentIrValue().node != nullptr and device_data != nullptr
-    self._test_dlpack_capsule_conversion_helper(xla_tensor_1)
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_dlpack_roundtrip_bool(self):
-    xla_tensor = torch.ones(1, dtype=torch.bool).to('xla')
-    self._test_dlpack_capsule_conversion_helper(xla_tensor)
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_dlpack_pytorch_cuda_to_xla(self):
-    t1_cuda = torch.arange(5).cuda()
-    dlt1 = torch.utils.dlpack.to_dlpack(t1_cuda)
-    xla_t1 = xdlpack.from_dlpack(dlt1)
-    self.assertEqual(xla_t1.device.type, 'xla')
-    self.assertEqual(xla_t1.device.index, t1_cuda.device.index)
-    t1_cuda[0] = t1_cuda[0] + 20
-    self.assertTrue(torch.allclose(xla_t1.cpu(), t1_cuda.cpu()))
-
-    t2_cuda = torch.tensor(5).cuda()
-    dlt2 = torch.utils.dlpack.to_dlpack(t2_cuda)
-    xla_t2 = xdlpack.from_dlpack(dlt2)
-    self.assertEqual(xla_t2.device.type, 'xla')
-    self.assertEqual(xla_t2.device.index, t2_cuda.device.index)
-    t2_cuda.fill_(6)
-    self.assertTrue(torch.allclose(xla_t2.cpu(), t2_cuda.cpu()))
-
-    cuda1 = torch.device('cuda:1')
-    t3_cuda = torch.tensor(5, device=cuda1)
-    dlt3 = torch.utils.dlpack.to_dlpack(t3_cuda)
-    xla_t3 = xdlpack.from_dlpack(dlt3)
-    self.assertEqual(xla_t3.device.type, 'xla')
-    self.assertEqual(
-        xla_t3.device.index,
-        t3_cuda.device.index,
-        msg='both value should 1. xla_t3.device should be xla:1.')
-    t3_cuda.fill_(6)
-    self.assertTrue(torch.allclose(xla_t3.cpu(), t3_cuda.cpu()))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_dlpack_pytorch_cuda_to_xla_protocol_conversion(self):
-    # Unlike the test_dlpack_pytorch_cuda_to_xla,
-    # torch_cuda_tensor has attribute __dlpack__ and __dlpack_device__.
-    # From cuda tensors to xla tensors, the synchronization is handdled implicitly.
-    t1_cuda = torch.arange(5).cuda()
-    xla_t1 = xdlpack.from_dlpack(t1_cuda)
-    self.assertEqual(xla_t1.device.type, 'xla')
-    self.assertEqual(xla_t1.device.index, t1_cuda.device.index)
-    t1_cuda[0] = t1_cuda[0] + 20
-    self.assertTrue(torch.allclose(xla_t1.cpu(), t1_cuda.cpu()))
-
-    t2_cuda = torch.tensor(5).cuda()
-    xla_t2 = xdlpack.from_dlpack(t2_cuda)
-    self.assertEqual(xla_t2.device.type, 'xla')
-    self.assertEqual(xla_t2.device.index, t2_cuda.device.index)
-    t2_cuda.fill_(6)
-    self.assertTrue(torch.allclose(xla_t2.cpu(), t2_cuda.cpu()))
-
-    cuda1 = torch.device('cuda:1')
-    t3_cuda = torch.tensor(5, device=cuda1)
-    xla_t3 = xdlpack.from_dlpack(t3_cuda)
-    self.assertEqual(xla_t3.device.type, 'xla')
-    self.assertEqual(
-        xla_t3.device.index,
-        t3_cuda.device.index,
-        msg='both value should 1. xla_t3.device should be xla:1.')
-    t3_cuda.fill_(6)
-    self.assertTrue(torch.allclose(xla_t3.cpu(), t3_cuda.cpu()))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_dlpack_xla_to_pytorch_cuda(self):
-    xla_t1 = torch.arange(5).to('xla')
-    dlt1 = xdlpack.to_dlpack(xla_t1)
-    cuda_t1 = torch.utils.dlpack.from_dlpack(dlt1)
-    self.assertEqual(cuda_t1.device.type, 'cuda')
-    self.assertEqual(cuda_t1.device.index, xla_t1.device.index)
-    cuda_t1[0] = cuda_t1[0] + 20
-    self.assertTrue(torch.allclose(xla_t1.cpu(), cuda_t1.cpu()))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_dlpack_xla_to_pytorch_cuda_protocol_conversion(self):
-    xla_t1 = torch.arange(5).to('xla')
-    cuda_t1 = torch.utils.dlpack.from_dlpack(xla_t1)
-    self.assertEqual(cuda_t1.device.type, 'cuda')
-    self.assertEqual(cuda_t1.device.index, xla_t1.device.index)
-    cuda_t1[0] = cuda_t1[0] + 20
-    self.assertTrue(torch.allclose(xla_t1.cpu(), cuda_t1.cpu()))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_dlpack_non_default_layout(self):
-    cuda_t = torch.arange(25, device=torch.device('cuda')).reshape(5, 5)
-
-    t1 = cuda_t.t()
-    xla_t1 = xdlpack.from_dlpack(t1.__dlpack__())
-    self.assertEqual(xla_t1.device.type, 'xla')
-    self.assertEqual(xla_t1.device.index, t1.device.index)
-    self.assertTrue(torch.allclose(t1.cpu(), xla_t1.cpu()))
-
-    t2 = cuda_t[0]
-    xla_t2 = xdlpack.from_dlpack(t2.__dlpack__())
-    self.assertEqual(xla_t2.device.type, 'xla')
-    self.assertEqual(xla_t2.device.index, t2.device.index)
-    self.assertTrue(torch.allclose(t2.cpu(), xla_t2.cpu()))
-
-    t3 = cuda_t[:, 0]
-    self.assertRaisesRegex(
-        RuntimeError,
-        r"Only DLPack tensors with trivial \(compact\) striding are supported",
-        lambda: xdlpack.from_dlpack(t3.__dlpack__()))
-
-    t4 = cuda_t[1, :]
-    xla_t4 = xdlpack.from_dlpack(t4.__dlpack__())
-    self.assertEqual(xla_t4.device.type, 'xla')
-    self.assertEqual(xla_t4.device.index, t4.device.index)
-    self.assertTrue(torch.allclose(t4.cpu(), xla_t4.cpu()))
-
-    t5 = cuda_t[1]
-    xla_t5 = xdlpack.from_dlpack(t5.__dlpack__())
-    self.assertEqual(xla_t5.device.type, 'xla')
-    self.assertEqual(xla_t5.device.index, t5.device.index)
-    self.assertTrue(torch.allclose(t5.cpu(), xla_t5.cpu()))
-
-
 class SimpleModelWithDropout(torch.nn.Module):
 
   def __init__(self):
@@ -3308,7 +3037,7 @@ def test_opt_barrier(self):
         opt_barrier = line
        break
-    # Somehow the CPU/GPU CI will not have the opt-barrier.
+    # Somehow the CPU CI will not have the opt-barrier.
     if opt_barrier != "":
       self.assertEqual(opt_barrier.count("f32[128,128]"), 6)
       self.assertEqual(opt_barrier.count("f32[128]"), 2)
diff --git a/test/test_ops.py b/test/test_ops.py
index 167b2024d07f..2e71948b76a1 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -126,6 +126,7 @@ def get_allowed_ops_map(
         AllowedOpInfoEntry('gt'),
         AllowedOpInfoEntry('imag'),
         AllowedOpInfoEntry('inverse'),
+        AllowedOpInfoEntry('index_put'),
         AllowedOpInfoEntry('isin'),
         AllowedOpInfoEntry('isneginf'),
         AllowedOpInfoEntry('le'),
@@ -365,11 +366,7 @@ def get_allowed_ops_map(
         # AllowedOpInfoEntry('logdet'), xla::lodget does not handle empty input
         # AllowedOpInfoEntry('qr'),  # Slice dim size 1 greater than dynamic slice dimension: 0
 
-        # Failed on CUDA CI only (investigate)
-        # app.circleci.com/pipelines/github/pytorch/xla/9088/workflows/2d59c649-db2b-4384-921e-5e43eba1b51a/jobs/17875
-        # AllowedOpInfoEntry('index_put'),
-
-        # Worked locally (but failing on CI both CPU and CUDA)
+        # Worked locally (but failing on CPU CI)
         # app.circleci.com/pipelines/github/pytorch/xla/9130/workflows/71c74f3d-1735-4328-81b5-784d6e6744da/jobs/17998
         # AllowedOpInfoEntry('var_mean'),
         # AllowedOpInfoEntry('pow'),  # for int64 don't work, likely rounding issue
diff --git a/test/test_persistent_cache.py b/test/test_persistent_cache.py
index a7cfa7ab1fd7..21bee1472cae 100644
--- a/test/test_persistent_cache.py
+++ b/test/test_persistent_cache.py
@@ -94,8 +94,6 @@ def _spmd_sharded_test(tmpdir, metrics):
   _assert_correctness_and_metrics(t, xt, metrics)
 
 
-# Skip CUDA, the on disk cache cannot be deserialized after XLA pin update in
-# #8908
 @absltest.skipUnless(xr.device_type() in {'TPU', 'NEURON'},
                      'Device type does not support persistent caching')
 class PersistentCacheTest(parameterized.TestCase):
@@ -133,9 +131,7 @@ def test_persistent_cache_mp(self):
       ('spmd_replicated', _spmd_replicated_test),
       ('spmd_sharded', _spmd_sharded_test),
   )
-  @absltest.skipUnless(
-      xr.device_type() == 'TPU',
-      'TPU required for SPMD; single-device GPU is pending #6023')
+  @absltest.skipUnless(xr.device_type() == 'TPU', 'TPU required for SPMD')
   def test_persistent_cache(self, test_fn):
     self._run_test(_test_spawn, test_fn)
 
diff --git a/test/test_profiler.py b/test/test_profiler.py
index 3f2a1482ffb2..2017de1d67e7 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -77,11 +77,6 @@ def _check_trace_namespace_exists(self, path):
                   f'Expected "build_graph" trace in: {path}')
 
   def test_trace_and_metrics(self):
-    # Create a new context for forking processes with the spawn method.
-    # This is necessary so as to avoid CUDA initialization issues when
-    # both PyTorch and PyTorch/XLA were compiled with CUDA support.
-    context = multiprocessing.get_context("spawn")
-
     port = xu.get_free_tcp_ports()[0]
     training_started = context.Event()
     p = context.Process(
diff --git a/test/test_python_ops.py b/test/test_python_ops.py
index 9dc145947f62..a9cdd816fe85 100644
--- a/test/test_python_ops.py
+++ b/test/test_python_ops.py
@@ -53,7 +53,6 @@ def ref_put(dst, idx, src, accumulate):
       src = make_arg(src_size, noncontiguous=not src_contig)
 
       # If accumulate=True, `put_` should be deterministic regardless of the inputs on CPU
-      # On CUDA it may not be, but the test has enough tolerance to account for this
       if accumulate:
         idx = make_idx(src_size, high=dst.numel())
       else:
diff --git a/test/test_zero1.py b/test/test_zero1.py
index 8bb2fbc3d822..62aa13b4ecc6 100644
--- a/test/test_zero1.py
+++ b/test/test_zero1.py
@@ -154,13 +154,12 @@ def test_zero1_load(self):
 
 def _mp_fn(index):
   device = torch_xla.device()
-  if xm.xla_device_hw(device) in ('TPU', 'CUDA'):
+  if xm.xla_device_hw(device) in ('TPU',):
     test = unittest.main(exit=False)
     sys.exit(0 if test.result.wasSuccessful() else 1)
   else:
     print(
-        'Default device {} is not a TPU or CUDA device'.format(device),
-        file=sys.stderr)
+        'Default device {} is not a TPU device'.format(device), file=sys.stderr)
 
 
 if __name__ == '__main__':