diff --git a/test/cpp/test_aten_xla_tensor_2.cpp b/test/cpp/test_aten_xla_tensor_2.cpp
index 31491249f618..dc3d605da34a 100644
--- a/test/cpp/test_aten_xla_tensor_2.cpp
+++ b/test/cpp/test_aten_xla_tensor_2.cpp
@@ -1555,20 +1555,18 @@ TEST_F(AtenXlaTensorTest, TestGroupNormBackward) {
                                /*cudnn_enabled=*/false);
       };
       torch::Tensor undef;
-      ForEachDevice({XlaDeviceType::CUDA, XlaDeviceType::TPU},
-                    [&](const torch::Device& device) {
-                      TestBackward({input, undef_weight ? undef : weight,
-                                    undef_weight ? undef : bias},
-                                   device, testfn,
-                                   /*rtol=*/1e-3, /*atol=*/1e-3,
-                                   /*derivative_level=*/2);
-                      ExpectCounterNotChanged("aten::.*",
-                                              cpp_test::GetIgnoredCounters());
-                      ExpectCounterChanged("xla::native_batch_norm",
-                                           cpp_test::GetIgnoredCounters());
-                      ExpectCounterChanged("xla::native_batch_norm_backward",
-                                           cpp_test::GetIgnoredCounters());
-                    });
+      ForEachDevice({XlaDeviceType::TPU}, [&](const torch::Device& device) {
+        TestBackward(
+            {input, undef_weight ? undef : weight, undef_weight ? undef : bias},
+            device, testfn,
+            /*rtol=*/1e-3, /*atol=*/1e-3,
+            /*derivative_level=*/2);
+        ExpectCounterNotChanged("aten::.*", cpp_test::GetIgnoredCounters());
+        ExpectCounterChanged("xla::native_batch_norm",
+                             cpp_test::GetIgnoredCounters());
+        ExpectCounterChanged("xla::native_batch_norm_backward",
+                             cpp_test::GetIgnoredCounters());
+      });
     }
   }
 }
diff --git a/test/cpp/test_aten_xla_tensor_6.cpp b/test/cpp/test_aten_xla_tensor_6.cpp
index df3e1280b6e2..b9a669760b1b 100644
--- a/test/cpp/test_aten_xla_tensor_6.cpp
+++ b/test/cpp/test_aten_xla_tensor_6.cpp
@@ -873,7 +873,7 @@ TEST_F(AtenXlaTensorTest, TestEmbeddingBackward) {
 TEST_F(AtenXlaTensorTest, TestAmpUpdateScale) {
   XlaDeviceType hw_type =
       static_cast<XlaDeviceType>(bridge::GetDefaultDevice()->type());
-  if (hw_type != XlaDeviceType::CUDA && hw_type != XlaDeviceType::CPU) {
+  if (hw_type != XlaDeviceType::CPU) {
     return;
   }
   torch::Tensor growth_tracker =
diff --git a/test/cpp/test_replication.cpp b/test/cpp/test_replication.cpp
index 00ec937bf8a1..73db4ab42392 100644
--- a/test/cpp/test_replication.cpp
+++ b/test/cpp/test_replication.cpp
@@ -98,9 +98,7 @@ void TestSingleReplication(
 
 class ReplicationTest : public AtenXlaTensorTestBase {};
 
-// Parallelism for DataParallel uses multi-threads. But cuda assumes one GPU
-// device per process instead of relying on threads so we will not run the test
-// on GPU.
+// Parallelism for DataParallel uses multiple threads.
 TEST_F(ReplicationTest, TestNSingleReplication) {
   WithAllDevices(
       {XlaDeviceType::TPU},
diff --git a/test/ds/test_dynamic_shapes.py b/test/ds/test_dynamic_shapes.py
index 46f329de4537..355a795bd14e 100644
--- a/test/ds/test_dynamic_shapes.py
+++ b/test/ds/test_dynamic_shapes.py
@@ -186,7 +186,7 @@ def test_masked_select_shape(self):
   def test_nonzero_cast(self):
     t1 = torch.ones(5, 2, device='xla')
     # Result of the nonzero should be the index type. Currently
-    # index type is s64 on cpu and gpu, but s32 on TPU. We should be
+    # index type is s64 on cpu, but s32 on TPU. We should be
     # able to cast it to any other type without error.
     t2 = torch.nonzero(t1.int()).float()
     torch_xla.sync()
diff --git a/test/dynamo/test_dynamo.py b/test/dynamo/test_dynamo.py
index 2c05adf7716c..c106483e1a37 100644
--- a/test/dynamo/test_dynamo.py
+++ b/test/dynamo/test_dynamo.py
@@ -148,17 +148,6 @@ def fn_simple(self, x, y):
     b = torch.sin(y)
     return a + b
 
-  def _choose_proper_device(self, initialize_on_cuda):
-    if not initialize_on_cuda:
-      return torch_xla.device()
-
-    assert initialize_on_cuda
-    if xr.device_type() != "CUDA" or not torch.cuda.is_available():
-      self.skipTest(
-          "Skip this test because it requires xr.device_type()=='CUDA' and torch.cuda.is_available()."
-      )
-    return "cuda:0"
-
   @skipOnNeuron
   def test_simple_model(self):
     device = torch_xla.device()
@@ -193,51 +182,7 @@ def test_simple_model(self):
     # Dynamo has to sync the input since they are intermedate IR(xla_xy and xla_y3)
     self.assertEqual(met.counter_value('DynamoSyncInputExecuteTime'), 1)
 
-  # Tests that the dynamo bridge automatically moves tensors to XLA device,
-  # then back to the original device.
-  @unittest.skipIf(xr.device_type() != "CUDA" or not torch.cuda.is_available(),
-                   f"GPU tests should only run on GPU devices.")
-  @parameterized.parameters(
-      "0",
-      "1",
-  )
-  def test_simple_model_automoves_tensors(self, zero_copy_enabled):
-    x = torch.tensor(100.0, requires_grad=True, device="cuda:0")
-    y = torch.tensor(200.0, requires_grad=True, device="cuda:0")
-    original_device = x.device
-    eager_result = self.fn_simple(x, y)
-
-    # Since all tests run in the same process, have to reset the metrics report.
-    met.clear_all()
-    torch._dynamo.reset()
-
-    fn_simple_dynamo = torch.compile(self.fn_simple, backend="openxla")
-    res_xla_dynamo = fn_simple_dynamo(x, y)
-    self.assertIn('xla::add', met.counter_names())
-    self.assertTrue(res_xla_dynamo.device == original_device)
-    self.assertTrue(torch.allclose(eager_result, res_xla_dynamo))
-
-    # verify that tracing is skipped in following runs
-    met.clear_counters()
-    res_xla_dynamo_reused = fn_simple_dynamo(x, y)
-    self.assertNotIn('xla::add', met.counter_names())
-    self.assertTrue(res_xla_dynamo_reused.device == original_device)
-    self.assertTrue(torch.allclose(eager_result, res_xla_dynamo_reused))
-
-    # verify that dynamo can handle different inputs
-    res_xla_dynamo_different = fn_simple_dynamo(x + y, y * 3)
-    res_cpu_3 = self.fn_simple(x + y, y * 3)
-    self.assertTrue(res_xla_dynamo_different.device == original_device)
-    self.assertTrue(torch.allclose(res_cpu_3, res_xla_dynamo_different))
-
-    # There should not be any fallbacks.
-    self.assertEqual(torch_xla._XLAC._get_executed_fallback_ops(), [])
-
-  @parameterized.parameters(
-      True,
-      False,
-  )
-  def test_fn_without_input(self, initialize_on_cuda):
+  def test_fn_without_input(self):
 
     def fn_without_input(device):
       constant = 0.835
@@ -245,19 +190,15 @@ def fn_without_input(device):
       arange = torch.arange(16, device=device).reshape(4, 4)
       return expanded + arange
 
-    device = self._choose_proper_device(initialize_on_cuda)
+    device = torch_xla.device()
 
     compiled_fn = torch.compile(fn_without_input, backend='openxla')
     res_cpu = fn_without_input('cpu')
     res_xla_dynamo = compiled_fn(device)
     self.assertTrue(torch.allclose(res_cpu, res_xla_dynamo.cpu()))
 
-  @parameterized.parameters(
-      (True, 'openxla'),
-      (False, dynamo_backend2.dynamo_backend),
-      (False, 'openxla'),
-  )
-  def test_simple_model_with_in_place_ops(self, initialize_on_cuda, backend):
+  @parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
+  def test_simple_model_with_in_place_ops(self, backend):
 
     class TestModel(nn.Module):
 
@@ -279,7 +220,7 @@ def forward(self, index, copy_tensor, input_tensor, op_name):
         output = input_tensor + self.self_tensor
         return output
 
-    device = self._choose_proper_device(initialize_on_cuda)
+    device = torch_xla.device()
 
     torch._dynamo.reset()
     met.clear_all()
@@ -306,18 +247,14 @@ def forward(self, index, copy_tensor, input_tensor, op_name):
           op_name=in_place_op)
       self.assertTrue(torch.allclose(res_cpu, res_device_dynamo.cpu()))
 
-  @parameterized.parameters(
-      (True, 'openxla'),
-      (False, dynamo_backend2.dynamo_backend),
-      (False, 'openxla'),
-  )
-  def test_einsum(self, initialize_on_cuda, backend):
+  @parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
+  def test_einsum(self, backend):
    # einsum currently does not have meta function to compute the shape hence
    # will fallback to XLA with FakeTensor as input to infer the output shape.
    def einsum_mm(a, b):
      return torch.einsum('ijkl,ijlm->ijkm', a, b)
 
-    device = self._choose_proper_device(initialize_on_cuda)
+    device = torch_xla.device()
     a = torch.randn(4, 4, 4, 4).to(device)
     b = torch.randn(4, 4, 4, 4).to(device)
     torch_xla.sync()
@@ -328,16 +265,10 @@ def einsum_mm(a, b):
     self.assertTrue(
         torch.allclose(res_device_non_dynamo.cpu(), res_device_dynamo.cpu()))
 
-  @parameterized.parameters(
-      True,
-      False,
-  )
-  def test_simple_model_with_different_input_shape(self, initialize_on_cuda):
+  def test_simple_model_with_different_input_shape(self):
     met.clear_all()
-    device = self._choose_proper_device(initialize_on_cuda)
-    # We need to make `dim` depend on `initialize_on_cuda` because the XLA compilation cache
-    # does not clean itself between the parameterized tests.
-    dim = 5 + int(initialize_on_cuda)
+    device = torch_xla.device()
+    dim = 5
     device_x = torch.randn(dim, dim).to(device)
     device_y = torch.randn(dim, dim).to(device)
     new_dim = 2 * dim
@@ -369,13 +300,9 @@ def get_loader(self, device, sample_count, batch_size=4):
 
   @skipOnTpu
   @skipOnNeuron
-  @parameterized.parameters(
-      (True, 'openxla'),
-      (False, dynamo_backend2.dynamo_backend),
-      (False, 'openxla'),
-  )
-  def test_resnet18(self, initialize_on_cuda, backend):
-    device = self._choose_proper_device(initialize_on_cuda)
+  @parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
+  def test_resnet18(self, backend):
+    device = torch_xla.device()
     sample_count = xu.getenv_as('SAMPLE_COUNT', int, defval=10)
     loader = self.get_loader(device, sample_count, batch_size=4)
     resnet18 = torchvision.models.resnet18()
diff --git a/test/dynamo/test_traceable_collectives.py b/test/dynamo/test_traceable_collectives.py
index 45bd89266604..e9416dcb2e05 100644
--- a/test/dynamo/test_traceable_collectives.py
+++ b/test/dynamo/test_traceable_collectives.py
@@ -20,7 +20,7 @@ def collective_broadcast_and_cos(input, src):
 def _mp_fn(index):
   device = torch_xla.device()
   world_size = xr.world_size()
-  if xm.xla_device_hw(device) not in ('TPU', 'CUDA', 'NEURON'):
+  if xm.xla_device_hw(device) not in ('TPU', 'NEURON'):
     print(f'skip this test for hw {xm.xla_device_hw(device)}')
     return
   ordinal_tensor = torch.tensor([index], dtype=torch.float).to(device)
diff --git a/test/pjrt/test_runtime.py b/test/pjrt/test_runtime.py
index 6529b5e826e1..31af6acd4e36 100644
--- a/test/pjrt/test_runtime.py
+++ b/test/pjrt/test_runtime.py
@@ -17,7 +17,7 @@ class TestExperimentalPjrt(parameterized.TestCase):
   def setUp(self):
     xr.set_device_type('CPU')
 
-  @parameterized.parameters(('CPU', 'CPU'), ('CUDA', 'CUDA'), ('TPU', 'TPU'))
+  @parameterized.parameters(('CPU', 'CPU'), ('TPU', 'TPU'))
   def test_device_type(self, pjrt_device, expected):
     with mock.patch.dict(os.environ, {'PJRT_DEVICE': pjrt_device}, clear=True):
       self.assertEqual(xr.device_type(), expected)
@@ -69,11 +69,6 @@ def test_xla_device_error(self):
   }, True), ('pjrt_tpu_precedence', {
       'PJRT_DEVICE': 'TPU',
       'XRT_TPU_CONFIG': 'localservice;0;localhost:51011',
-  }, True), ('gpu_num_devives', {
-      'GPU_NUM_DEVICES': '4'
-  }, True), ('pjrt_gpu', {
-      'PJRT_DEVICE': 'CUDA',
-      'GPU_NUM_DEVICES': '4'
   }, True))
   def test_pjrt_default_device(self, env_vars, expect_using_pjrt):
     # Prevent flag checking during reinitialization of PJRT backend.
@@ -86,7 +81,7 @@ def test_pjrt_default_device(self, env_vars, expect_using_pjrt):
     reload(torch_xla)
     logs_context = contextlib.nullcontext()
     if expect_using_pjrt:
-      self.assertIn(xr.device_type(), ['CPU', 'CUDA', 'TPU', 'NEURON'])
+      self.assertIn(xr.device_type(), ['CPU', 'TPU', 'NEURON'])
     else:
       self.assertIsNone(xr.device_type())
 
diff --git a/test/pytorch_test_base.py b/test/pytorch_test_base.py
index 3355f8efba99..7c426deca1e0 100644
--- a/test/pytorch_test_base.py
+++ b/test/pytorch_test_base.py
@@ -295,7 +295,7 @@
     'test_leaky_relu_inplace_with_neg_slope_xla',  # expecting a specific error message
     'test_upsamplingBicubic2d_correctness_xla',  # FIXME! Got dtypes torch.float32 and torch.float64
     'test_CTCLoss_no_batch_dim_xla',  # Value out of range
-    'test_upsamplingBilinear2d_xla',  # precision on GPU/TPU, slow compilation on CPU
+    'test_upsamplingBilinear2d_xla',  # precision on TPU, slow compilation on CPU
     # torch.autograd.gradcheck.GradcheckError: Jacobian mismatch for output 0 with respect to input 0
     'test_GRU_grad_and_gradgrad_xla_float64',  # grad check failure
     'test_LSTM_grad_and_gradgrad_xla_float64',  # grad check failure
@@ -475,18 +475,6 @@
     },
 }
 
-DISABLED_TORCH_TESTS_GPU_ONLY = {
-    # test_torch.py
-    'TestTorchDeviceTypeXLA': {
-        'test_maximum_minimum_float_nan_and_inf',  # maximum(nan,inf) = inf on GPU
-    },
-
-    # test_indexing.py
-    'TestIndexingXLA': {
-        'test_index_put_accumulate_large_tensor_xla',  # illegal memory access was encountered
-    },
-}
-
 
 class MatchSet(object):
 
@@ -526,15 +514,12 @@ def union_of_disabled_tests(sets):
 
 
 DISABLED_TORCH_TESTS_CPU = DISABLED_TORCH_TESTS_ANY
-DISABLED_TORCH_TESTS_GPU = union_of_disabled_tests(
-    [DISABLED_TORCH_TESTS_ANY, DISABLED_TORCH_TESTS_GPU_ONLY])
 DISABLED_TORCH_TESTS_TPU = union_of_disabled_tests(
     [DISABLED_TORCH_TESTS_ANY, DISABLED_TORCH_TESTS_TPU_ONLY])
 
 DISABLED_TORCH_TESTS = {
     'TPU': prepare_match_set(DISABLED_TORCH_TESTS_TPU),
     'CPU': prepare_match_set(DISABLED_TORCH_TESTS_CPU),
-    'CUDA': prepare_match_set(DISABLED_TORCH_TESTS_GPU),
 }
 
diff --git a/test/run_tests.sh b/test/run_tests.sh
index 033089d651f5..bb03d7abe161 100755
--- a/test/run_tests.sh
+++ b/test/run_tests.sh
@@ -254,7 +254,6 @@ function run_xla_op_tests3 {
   run_test "$_TEST_DIR/test_devices.py"
   run_test "$_TEST_DIR/test_manual_xla_registration.py"
   run_test_multi_devices "$_TEST_DIR/spmd/test_xla_dtensor_placements.py"
-  # NOTE: this line below is testing export and don't care about GPU
   PJRT_DEVICE=CPU CPU_NUM_DEVICES=1 run_coverage "$_TEST_DIR/test_core_aten_ops.py"
   run_test "$_TEST_DIR/test_pallas.py"
   run_xla_ir_hlo_debug run_test "$_TEST_DIR/test_user_computation_debug_cache.py"
diff --git a/test/test_autocast.py b/test/test_autocast.py
index 32f72ae9762a..c16fee30a725 100644
--- a/test/test_autocast.py
+++ b/test/test_autocast.py
@@ -152,82 +152,6 @@ def __init__(self, dev):
     self.methods_bf16 = [("__matmul__", mat0_bf16 + mat1_fp32)]
 
 
-class AutocastCudaTestExtraLists(object):
-
-  def __init__(self, dev):
-    super().__init__()
-    n = 8
-    dimsets = ((n, n, n), (n, n, n, n), (n, n, n, n, n))
-    conv_args_fp32 = [(torch.randn(dimset, dtype=torch.float32, device=dev),
-                       torch.randn(dimset, dtype=torch.float32, device=dev))
-                      for dimset in dimsets]
-
-    mat0_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
-    mat1_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
-    mat2_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
-    mat3_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
-
-    pointwise0_fp32 = (torch.randn(n, dtype=torch.float32, device=dev),)
-
-    element0_fp32 = (torch.randn(1, dtype=torch.float32, device=dev),)
-
-    # This is currently not part of AutocastTestLists and excludes `relu`, `addbmm`
-    self.torch_bf16 = [
-        ("conv1d", conv_args_fp32[0]),
-        ("conv2d", conv_args_fp32[1]),
-        ("conv3d", conv_args_fp32[2]),
-        ("bmm", (torch.randn((n, n, n), device=dev, dtype=torch.float32),
-                 torch.randn((n, n, n), device=dev, dtype=torch.float32))),
-        ("mm", mat0_fp32 + mat1_fp32),
-        ("matmul",
-         torch.matmul(
-             torch.ones([2, 3], device=dev, dtype=torch.float32),
-             torch.ones([3, 2], device=dev, dtype=torch.float32))),
-        ("baddbmm", (torch.randn((n, n, n), device=dev, dtype=torch.float32),
-                     torch.randn((n, n, n), device=dev, dtype=torch.float32),
-                     torch.randn((n, n, n), device=dev, dtype=torch.float32))),
-        ("addmm", mat1_fp32 + mat2_fp32 + mat3_fp32),
-        ("conv_tbc", (torch.randn((10, 7, 3), device=dev, dtype=torch.float32),
-                      torch.randn((5, 3, 5), device=dev, dtype=torch.float32),
-                      torch.randn(5, device=dev, dtype=torch.float32), 0)),
-        ("conv_transpose1d", conv_args_fp32[0]),
-        ("conv_transpose2d", conv_args_fp32[1]),
-        ("conv_transpose3d", conv_args_fp32[2]),
-        ("prelu", pointwise0_fp32 + element0_fp32),
-    ]
-
-
-class AutocastCudaTestUnsupportedLists(object):
-
-  def __init__(self):
-    super().__init__()
-    # Utility arguments, created as one-element tuples
-    self.torch_expect_builtin_promote = [
-        "cat",  # requires all input tensors to be the same type
-        "equal",  # requires all input tensors to be the same type
-        "stack",  # return f16 instead of f32
-    ]
-    self.methods_expect_builtin_promote = []
-
-    # The remaining lists organize ops that autocast treats explicitly.
-    self.torch_fp16 = [
-        "_convolution_nogroup",  # need lowering
-        "addmv",  # need lowering
-    ]
-    self.torch_fp32 = [
-        "norm",  # produce f16 instead of f32
-    ]
-    self.torch_need_autocast_promote = [
-        "scatter_add",  # cat currently requires all input tensors to be the same type
-    ]
-    self.nn_fp16 = []
-    self.nn_fp32 = []
-    self.linalg_fp16 = []
-    self.methods_fp16 = []
-    self.methods_fp32 = []
-    self.banned = []
-
-
 class TestAutocastBase(unittest.TestCase):
 
   @classmethod
diff --git a/test/test_compilation_cache_utils.py b/test/test_compilation_cache_utils.py
index 0ac8a013d814..1113d3b67a83 100644
--- a/test/test_compilation_cache_utils.py
+++ b/test/test_compilation_cache_utils.py
@@ -73,7 +73,7 @@ def _test_num_graph_hash(self, use_dynamo, use_persistent):
       use_persistent=(True, False),
   )
   def test_num_graph_hash(self, use_dynamo, use_persistent):
-    if use_persistent and (xr.device_type() not in {'TPU', 'CUDA', 'NEURON'}):
+    if use_persistent and (xr.device_type() not in {'TPU', 'NEURON'}):
       raise absltest.SkipTest('Device type does not support persistent caching')
     _test_spawn(self._test_num_graph_hash, (use_dynamo, use_persistent))
 
diff --git a/test/test_operations.py b/test/test_operations.py
index 9d377083da54..08c2f39d63c3 100644
--- a/test/test_operations.py
+++ b/test/test_operations.py
@@ -90,12 +90,7 @@ def skipIfFunctionalizationDisabled(reason):
 
 def onlyOnCPU(fn):
   accelerator = os.environ.get("PJRT_DEVICE").lower()
-  return unittest.skipIf(accelerator != "cpu", "PJRT_DEVICE=CUDA required")(fn)
-
-
-def onlyOnCUDA(fn):
-  accelerator = os.environ.get("PJRT_DEVICE").lower()
-  return unittest.skipIf(accelerator != "cuda", "PJRT_DEVICE=CUDA required")(fn)
+  return unittest.skipIf(accelerator != "cpu", "PJRT_DEVICE=CPU required")(fn)
 
 
 def onlyIfXLAExperimentalContains(feat):
@@ -108,79 +103,10 @@ def _gen_tensor(*args, **kwargs):
   return torch.randn(*args, **kwargs)
 
 
-def _gen_int_tensor(*args, **kwargs):
-  return torch.randint(*args, **kwargs)
-
-
 def _gen_mask(size):
   return torch.randint(0, 2, size, dtype=torch.bool)
 
 
-def _get_device_support(devname):
-  devices = torch_xla._XLAC._xla_get_devices()
-  num_devices = 0
-  for device in devices:
-    if re.match(devname + r':\d+$', device):
-      num_devices += 1
-  return DeviceSupport(num_devices=num_devices) if num_devices > 0 else None
-
-
-def _support_replicated(devname, num_devices):
-  devsup = _get_device_support(devname)
-  if not devsup:
-    return False
-  return devsup.num_devices >= num_devices
-
-
-def _random_inputs(shapes, num_replicas=1):
-  random_tensors = []
-  for _ in range(0, num_replicas):
-    replica_inputs = []
-    for shape in shapes:
-      replica_inputs.append(_gen_tensor(*shape))
-    random_tensors.append(tuple(replica_inputs))
-  return tuple(random_tensors)
-
-
-def _random_like(tensor_list):
-  random_tensors = []
-  for o in tensor_list:
-    if o.dtype == torch.float32 or o.dtype == torch.float64:
-      random_tensors += [_gen_tensor(*o.shape, dtype=o.dtype)]
-    elif o.dtype == torch.int64:
-      # TODO remove this, we shouldn't be needing to pass random_tensor for long types
-      random_tensors += [torch.empty_like(o)]
-    else:
-      raise RuntimeError('Unsupported type: ', o.dtype)
-  return random_tensors
-
-
-def _zeros_like(tensor_list):
-  zeros_tensors = []
-  for o in tensor_list:
-    if o.dtype == torch.float32 or o.dtype == torch.float64:
-      zeros_tensors += [torch.zeros(*o.shape, dtype=o.dtype)]
-    elif o.dtype == torch.int64:
-      # TODO remove this, we shouldn't be needing to pass zeros_tensor for long types
-      zeros_tensors += [torch.zeros_like(o)]
-    else:
-      raise RuntimeError('Unsupported type: ', o.dtype)
-  return zeros_tensors
-
-
-def onlyIfTorchSupportsCUDA(fn):
-  return unittest.skipIf(
-      not torch.cuda.is_available(), reason="requires PyTorch CUDA support")(
-          fn)
-
-
-def onlyIfPJRTDeviceIsCUDA(fn):
-  return unittest.skipIf(
-      os.environ.get("PJRT_DEVICE") not in ("GPU", "CUDA"),
-      reason="requires CUDA as PJRT_DEVICE")(
-          fn)
-
-
 class TestToXlaTensorArena(test_utils.XlaTestCase):
 
   def test(self):
@@ -274,10 +200,6 @@ def forward(self, x):
     return F.log_softmax(x, dim=1)
 
 
-@unittest.skipIf(
-    xr.device_type() == 'CUDA',
-    'Parallelism for DataParallel uses multi-threads. But cuda assumes one GPU device per process instead of relying on threads.'
-)
 class TestParallelTensorMNIST(test_utils.XlaTestCase):
 
   def test(self):
@@ -436,7 +358,7 @@ def test_masked_select_shape(self):
   def test_nonzero_cast(self):
     t1 = torch.ones(5, 2, device='xla')
     # Result of the nonzero should be the index type. Currently
-    # index type is s64 on cpu and gpu, but s32 on TPU. We should be
+    # index type is s64 on cpu, but s32 on TPU. We should be
     # able to cast it to any other type without error.
     t2 = torch.nonzero(t1.int()).float()
     torch_xla.sync()
@@ -3036,27 +2958,6 @@ def test_as_strided_input_larger(self):
 
     self.assertEqual(a, former_a)
 
-  def _test_move_tensor_cuda_to_xla(self, cpu_tensor):
-    # Assumes CPU-XLA data movement works.
-    cuda_tensor = cpu_tensor.to("cuda")
-    # Move tensor CUDA -> XLA.
-    xla_tensor = cuda_tensor.to('xla')
-    # Move the XLA tensor back to CPU, and check that it is the same as
-    # the original CPU tensor.
-    self.assertTrue(torch.equal(cpu_tensor, xla_tensor.cpu()))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_aten_move_cuda_to_xla(self):
-    self._test_move_tensor_cuda_to_xla(torch.arange(5))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_aten_move_scalar_cuda_to_xla(self):
-    # 0-dimensional scalar-tensor
-    # Has a different execution path than other tensors.
-    self._test_move_tensor_cuda_to_xla(torch.tensor(42))
-
   def test_unsafe_buffer_pointer(self):
     xla_device = torch_xla.device()
     xla_tensor_0 = torch.tensor(42).to(xla_device)
@@ -3083,178 +2984,6 @@ def test_unsafe_buffer_pointer(self):
     self.assertGreaterEqual(buf_ptr_3, 0)
 
 
-class TestDLPack(parameterized.TestCase):
-
-  def _test_dlpack_capsule_conversion_helper(self, xla_tensor):
-    dlpt = xdlpack.to_dlpack(xla_tensor)  # dlpt1 has type PyCapsule
-    xla_tensor2 = xdlpack.from_dlpack(dlpt)
-
-    self.assertEqual(xla_tensor.device, xla_tensor2.device)
-    self.assertTrue(torch.allclose(xla_tensor.cpu(), xla_tensor2.cpu()))
-    self.assertRaisesRegex(RuntimeError,
-                           "DLTensor capsule can be consumed only once",
-                           lambda: xdlpack.from_dlpack(dlpt))
-
-    self.assertEqual(
-        torch_xla._XLAC._unsafe_buffer_pointer(xla_tensor),
-        torch_xla._XLAC._unsafe_buffer_pointer(xla_tensor2))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  @parameterized.parameters(*all_types_and(torch.half, torch.bfloat16))
-  def test_dlpack_roundtrip_tensor(self, dtype):
-    xla_device = torch_xla.device()
-    # xtensor->CurrentDataHandle() == nullptr but xtensor->CurrentIrValue().node != nullptr and device_data != nullptr
-    # xla_tensor_2 uses XLANativeFunctions::_to_copy
-    xla_tensor_2 = torch.arange(5, dtype=dtype).to(xla_device)
-    self._test_dlpack_capsule_conversion_helper(xla_tensor_2)
-
-    # xla_tensor_3 uses arange_out IR node.
-    xla_tensor_3 = torch.arange(5, dtype=dtype, device='xla')
-    torch_xla.sync()
-    self._test_dlpack_capsule_conversion_helper(xla_tensor_3)
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  @parameterized.parameters(
-      *all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool,
-                                 torch.uint16, torch.uint32, torch.uint64))
-  def test_dlpack_roundtrip_scalar(self, dtype):
-    xla_device = torch_xla.device()
-    xla_tensor_0 = torch.tensor(42, dtype=dtype).to(xla_device)
-    # `torch_xla.sync()` ensures xtensor->CurrentDataHandle() != nullptr
-    torch_xla.sync()
-    self._test_dlpack_capsule_conversion_helper(xla_tensor_0)
-
-    xla_tensor_1 = torch.tensor(42, dtype=dtype).to(xla_device)
-    # xtensor->CurrentDataHandle() == nullptr but xtensor->CurrentIrValue().node != nullptr and device_data != nullptr
-    self._test_dlpack_capsule_conversion_helper(xla_tensor_1)
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_dlpack_roundtrip_bool(self):
-    xla_tensor = torch.ones(1, dtype=torch.bool).to('xla')
-    self._test_dlpack_capsule_conversion_helper(xla_tensor)
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_dlpack_pytorch_cuda_to_xla(self):
-    t1_cuda = torch.arange(5).cuda()
-    dlt1 = torch.utils.dlpack.to_dlpack(t1_cuda)
-    xla_t1 = xdlpack.from_dlpack(dlt1)
-    self.assertEqual(xla_t1.device.type, 'xla')
-    self.assertEqual(xla_t1.device.index, t1_cuda.device.index)
-    t1_cuda[0] = t1_cuda[0] + 20
-    self.assertTrue(torch.allclose(xla_t1.cpu(), t1_cuda.cpu()))
-
-    t2_cuda = torch.tensor(5).cuda()
-    dlt2 = torch.utils.dlpack.to_dlpack(t2_cuda)
-    xla_t2 = xdlpack.from_dlpack(dlt2)
-    self.assertEqual(xla_t2.device.type, 'xla')
-    self.assertEqual(xla_t2.device.index, t2_cuda.device.index)
-    t2_cuda.fill_(6)
-    self.assertTrue(torch.allclose(xla_t2.cpu(), t2_cuda.cpu()))
-
-    cuda1 = torch.device('cuda:1')
-    t3_cuda = torch.tensor(5, device=cuda1)
-    dlt3 = torch.utils.dlpack.to_dlpack(t3_cuda)
-    xla_t3 = xdlpack.from_dlpack(dlt3)
-    self.assertEqual(xla_t3.device.type, 'xla')
-    self.assertEqual(
-        xla_t3.device.index,
-        t3_cuda.device.index,
-        msg='both value should 1. xla_t3.device should be xla:1.')
-    t3_cuda.fill_(6)
-    self.assertTrue(torch.allclose(xla_t3.cpu(), t3_cuda.cpu()))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_dlpack_pytorch_cuda_to_xla_protocol_conversion(self):
-    # Unlike the test_dlpack_pytorch_cuda_to_xla,
-    # torch_cuda_tensor has attribute __dlpack__ and __dlpack_device__.
-    # From cuda tensors to xla tensors, the synchronization is handdled implicitly.
-    t1_cuda = torch.arange(5).cuda()
-    xla_t1 = xdlpack.from_dlpack(t1_cuda)
-    self.assertEqual(xla_t1.device.type, 'xla')
-    self.assertEqual(xla_t1.device.index, t1_cuda.device.index)
-    t1_cuda[0] = t1_cuda[0] + 20
-    self.assertTrue(torch.allclose(xla_t1.cpu(), t1_cuda.cpu()))
-
-    t2_cuda = torch.tensor(5).cuda()
-    xla_t2 = xdlpack.from_dlpack(t2_cuda)
-    self.assertEqual(xla_t2.device.type, 'xla')
-    self.assertEqual(xla_t2.device.index, t2_cuda.device.index)
-    t2_cuda.fill_(6)
-    self.assertTrue(torch.allclose(xla_t2.cpu(), t2_cuda.cpu()))
-
-    cuda1 = torch.device('cuda:1')
-    t3_cuda = torch.tensor(5, device=cuda1)
-    xla_t3 = xdlpack.from_dlpack(t3_cuda)
-    self.assertEqual(xla_t3.device.type, 'xla')
-    self.assertEqual(
-        xla_t3.device.index,
-        t3_cuda.device.index,
-        msg='both value should 1. xla_t3.device should be xla:1.')
-    t3_cuda.fill_(6)
-    self.assertTrue(torch.allclose(xla_t3.cpu(), t3_cuda.cpu()))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_dlpack_xla_to_pytorch_cuda(self):
-    xla_t1 = torch.arange(5).to('xla')
-    dlt1 = xdlpack.to_dlpack(xla_t1)
-    cuda_t1 = torch.utils.dlpack.from_dlpack(dlt1)
-    self.assertEqual(cuda_t1.device.type, 'cuda')
-    self.assertEqual(cuda_t1.device.index, xla_t1.device.index)
-    cuda_t1[0] = cuda_t1[0] + 20
-    self.assertTrue(torch.allclose(xla_t1.cpu(), cuda_t1.cpu()))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_dlpack_xla_to_pytorch_cuda_protocol_conversion(self):
-    xla_t1 = torch.arange(5).to('xla')
-    cuda_t1 = torch.utils.dlpack.from_dlpack(xla_t1)
-    self.assertEqual(cuda_t1.device.type, 'cuda')
-    self.assertEqual(cuda_t1.device.index, xla_t1.device.index)
-    cuda_t1[0] = cuda_t1[0] + 20
-    self.assertTrue(torch.allclose(xla_t1.cpu(), cuda_t1.cpu()))
-
-  @onlyIfTorchSupportsCUDA
-  @onlyIfPJRTDeviceIsCUDA
-  def test_dlpack_non_default_layout(self):
-    cuda_t = torch.arange(25, device=torch.device('cuda')).reshape(5, 5)
-
-    t1 = cuda_t.t()
-    xla_t1 = xdlpack.from_dlpack(t1.__dlpack__())
-    self.assertEqual(xla_t1.device.type, 'xla')
-    self.assertEqual(xla_t1.device.index, t1.device.index)
-    self.assertTrue(torch.allclose(t1.cpu(), xla_t1.cpu()))
-
-    t2 = cuda_t[0]
-    xla_t2 = xdlpack.from_dlpack(t2.__dlpack__())
-    self.assertEqual(xla_t2.device.type, 'xla')
-    self.assertEqual(xla_t2.device.index, t2.device.index)
-    self.assertTrue(torch.allclose(t2.cpu(), xla_t2.cpu()))
-
-    t3 = cuda_t[:, 0]
-    self.assertRaisesRegex(
-        RuntimeError,
-        r"Only DLPack tensors with trivial \(compact\) striding are supported",
-        lambda: xdlpack.from_dlpack(t3.__dlpack__()))
-
-    t4 = cuda_t[1, :]
-    xla_t4 = xdlpack.from_dlpack(t4.__dlpack__())
-    self.assertEqual(xla_t4.device.type, 'xla')
-    self.assertEqual(xla_t4.device.index, t4.device.index)
-    self.assertTrue(torch.allclose(t4.cpu(), xla_t4.cpu()))
-
-    t5 = cuda_t[1]
-    xla_t5 = xdlpack.from_dlpack(t5.__dlpack__())
-    self.assertEqual(xla_t5.device.type, 'xla')
-    self.assertEqual(xla_t5.device.index, t5.device.index)
-    self.assertTrue(torch.allclose(t5.cpu(), xla_t5.cpu()))
-
-
 class SimpleModelWithDropout(torch.nn.Module):
 
   def __init__(self):
@@ -3308,7 +3037,7 @@ def test_opt_barrier(self):
         opt_barrier = line
        break
-    # Somehow the CPU/GPU CI will not have the opt-barrier.
+    # Somehow the CPU CI will not have the opt-barrier.
     if opt_barrier != "":
       self.assertEqual(opt_barrier.count("f32[128,128]"), 6)
       self.assertEqual(opt_barrier.count("f32[128]"), 2)
diff --git a/test/test_ops.py b/test/test_ops.py
index 167b2024d07f..2e71948b76a1 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -126,6 +126,7 @@ def get_allowed_ops_map(
         AllowedOpInfoEntry('gt'),
         AllowedOpInfoEntry('imag'),
         AllowedOpInfoEntry('inverse'),
+        AllowedOpInfoEntry('index_put'),
         AllowedOpInfoEntry('isin'),
         AllowedOpInfoEntry('isneginf'),
         AllowedOpInfoEntry('le'),
@@ -365,11 +366,7 @@ def get_allowed_ops_map(
         # AllowedOpInfoEntry('logdet'), xla::lodget does not handle empty input
         # AllowedOpInfoEntry('qr'),  # Slice dim size 1 greater than dynamic slice dimension: 0
 
-        # Failed on CUDA CI only (investigate)
-        # app.circleci.com/pipelines/github/pytorch/xla/9088/workflows/2d59c649-db2b-4384-921e-5e43eba1b51a/jobs/17875
-        # AllowedOpInfoEntry('index_put'),
-
-        # Worked locally (but failing on CI both CPU and CUDA)
+        # Worked locally (but failing on CPU CI)
         # app.circleci.com/pipelines/github/pytorch/xla/9130/workflows/71c74f3d-1735-4328-81b5-784d6e6744da/jobs/17998
         # AllowedOpInfoEntry('var_mean'),
         # AllowedOpInfoEntry('pow'),  # for int64 don't work, likely rounding issue
diff --git a/test/test_persistent_cache.py b/test/test_persistent_cache.py
index a7cfa7ab1fd7..21bee1472cae 100644
--- a/test/test_persistent_cache.py
+++ b/test/test_persistent_cache.py
@@ -94,8 +94,6 @@ def _spmd_sharded_test(tmpdir, metrics):
   _assert_correctness_and_metrics(t, xt, metrics)
 
 
-# Skip CUDA, the on disk cache cannot be deserialized after XLA pin update in
-# #8908
 @absltest.skipUnless(xr.device_type() in {'TPU', 'NEURON'},
                      'Device type does not support persistent caching')
 class PersistentCacheTest(parameterized.TestCase):
@@ -133,9 +131,7 @@ def test_persistent_cache_mp(self):
       ('spmd_replicated', _spmd_replicated_test),
       ('spmd_sharded', _spmd_sharded_test),
   )
-  @absltest.skipUnless(
-      xr.device_type() == 'TPU',
-      'TPU required for SPMD; single-device GPU is pending #6023')
+  @absltest.skipUnless(xr.device_type() == 'TPU', 'TPU required for SPMD')
   def test_persistent_cache(self, test_fn):
     self._run_test(_test_spawn, test_fn)
 
diff --git a/test/test_profiler.py b/test/test_profiler.py
index 3f2a1482ffb2..2017de1d67e7 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -77,11 +77,6 @@ def _check_trace_namespace_exists(self, path):
                   f'Expected "build_graph" trace in: {path}')
 
   def test_trace_and_metrics(self):
-    # Create a new context for forking processes with the spawn method.
-    # This is necessary so as to avoid CUDA initialization issues when
-    # both PyTorch and PyTorch/XLA were compiled with CUDA support.
-    context = multiprocessing.get_context("spawn")
-
     port = xu.get_free_tcp_ports()[0]
     training_started = context.Event()
     p = context.Process(
diff --git a/test/test_python_ops.py b/test/test_python_ops.py
index 9dc145947f62..a9cdd816fe85 100644
--- a/test/test_python_ops.py
+++ b/test/test_python_ops.py
@@ -53,7 +53,6 @@ def ref_put(dst, idx, src, accumulate):
       src = make_arg(src_size, noncontiguous=not src_contig)
 
       # If accumulate=True, `put_` should be deterministic regardless of the inputs on CPU
-      # On CUDA it may not be, but the test has enough tolerance to account for this
       if accumulate:
         idx = make_idx(src_size, high=dst.numel())
       else:
diff --git a/test/test_zero1.py b/test/test_zero1.py
index 8bb2fbc3d822..62aa13b4ecc6 100644
--- a/test/test_zero1.py
+++ b/test/test_zero1.py
@@ -154,13 +154,12 @@ def test_zero1_load(self):
 
 def _mp_fn(index):
   device = torch_xla.device()
-  if xm.xla_device_hw(device) in ('TPU', 'CUDA'):
+  if xm.xla_device_hw(device) in ('TPU',):
     test = unittest.main(exit=False)
     sys.exit(0 if test.result.wasSuccessful() else 1)
   else:
     print(
-        'Default device {} is not a TPU or CUDA device'.format(device),
-        file=sys.stderr)
+        'Default device {} is not a TPU device'.format(device), file=sys.stderr)
 
 
 if __name__ == '__main__':