26 changes: 12 additions & 14 deletions test/cpp/test_aten_xla_tensor_2.cpp
@@ -1555,20 +1555,18 @@ TEST_F(AtenXlaTensorTest, TestGroupNormBackward) {
/*cudnn_enabled=*/false);
};
torch::Tensor undef;
ForEachDevice({XlaDeviceType::CUDA, XlaDeviceType::TPU},
[&](const torch::Device& device) {
TestBackward({input, undef_weight ? undef : weight,
undef_weight ? undef : bias},
device, testfn,
/*rtol=*/1e-3, /*atol=*/1e-3,
/*derivative_level=*/2);
ExpectCounterNotChanged("aten::.*",
cpp_test::GetIgnoredCounters());
ExpectCounterChanged("xla::native_batch_norm",
cpp_test::GetIgnoredCounters());
ExpectCounterChanged("xla::native_batch_norm_backward",
cpp_test::GetIgnoredCounters());
});
ForEachDevice({XlaDeviceType::TPU}, [&](const torch::Device& device) {
TestBackward(
{input, undef_weight ? undef : weight, undef_weight ? undef : bias},
device, testfn,
/*rtol=*/1e-3, /*atol=*/1e-3,
/*derivative_level=*/2);
ExpectCounterNotChanged("aten::.*", cpp_test::GetIgnoredCounters());
ExpectCounterChanged("xla::native_batch_norm",
cpp_test::GetIgnoredCounters());
ExpectCounterChanged("xla::native_batch_norm_backward",
cpp_test::GetIgnoredCounters());
});
}
}
}
2 changes: 1 addition & 1 deletion test/cpp/test_aten_xla_tensor_6.cpp
@@ -873,7 +873,7 @@ TEST_F(AtenXlaTensorTest, TestEmbeddingBackward) {
TEST_F(AtenXlaTensorTest, TestAmpUpdateScale) {
XlaDeviceType hw_type =
static_cast<XlaDeviceType>(bridge::GetDefaultDevice()->type());
if (hw_type != XlaDeviceType::CUDA && hw_type != XlaDeviceType::CPU) {
if (hw_type != XlaDeviceType::CPU) {
return;
}
torch::Tensor growth_tracker =
4 changes: 1 addition & 3 deletions test/cpp/test_replication.cpp
@@ -98,9 +98,7 @@ void TestSingleReplication(

class ReplicationTest : public AtenXlaTensorTestBase {};

// Parallelism for DataParallel uses multi-threads. But cuda assumes one GPU
// device per process instead of relying on threads so we will not run the test
// on GPU.
// Parallelism for DataParallel uses multi-threads.
TEST_F(ReplicationTest, TestNSingleReplication) {
WithAllDevices(
{XlaDeviceType::TPU},
2 changes: 1 addition & 1 deletion test/ds/test_dynamic_shapes.py
@@ -186,7 +186,7 @@ def test_masked_select_shape(self):
def test_nonzero_cast(self):
t1 = torch.ones(5, 2, device='xla')
# Result of the nonzero should be the index type. Currently
# index type is s64 on cpu and gpu, but s32 on TPU. We should be
# index type is s64 on cpu, but s32 on TPU. We should be
# able to cast it to any other type without error.
t2 = torch.nonzero(t1.int()).float()
torch_xla.sync()
101 changes: 14 additions & 87 deletions test/dynamo/test_dynamo.py
@@ -148,17 +148,6 @@ def fn_simple(self, x, y):
b = torch.sin(y)
return a + b

def _choose_proper_device(self, initialize_on_cuda):
if not initialize_on_cuda:
return torch_xla.device()

assert initialize_on_cuda
if xr.device_type() != "CUDA" or not torch.cuda.is_available():
self.skipTest(
"Skip this test because it requires xr.device_type()=='CUDA' and torch.cuda.is_available()."
)
return "cuda:0"

@skipOnNeuron
def test_simple_model(self):
device = torch_xla.device()
@@ -193,71 +182,23 @@ def test_simple_model(self):
# Dynamo has to sync the inputs since they are intermediate IR (xla_xy and xla_y3)
self.assertEqual(met.counter_value('DynamoSyncInputExecuteTime'), 1)

# Tests that the dynamo bridge automatically moves tensors to XLA device,
# then back to the original device.
@unittest.skipIf(xr.device_type() != "CUDA" or not torch.cuda.is_available(),
f"GPU tests should only run on GPU devices.")
@parameterized.parameters(
"0",
"1",
)
def test_simple_model_automoves_tensors(self, zero_copy_enabled):
x = torch.tensor(100.0, requires_grad=True, device="cuda:0")
y = torch.tensor(200.0, requires_grad=True, device="cuda:0")
original_device = x.device
eager_result = self.fn_simple(x, y)

# Since all tests run in the same process, have to reset the metrics report.
met.clear_all()
torch._dynamo.reset()

fn_simple_dynamo = torch.compile(self.fn_simple, backend="openxla")
res_xla_dynamo = fn_simple_dynamo(x, y)
self.assertIn('xla::add', met.counter_names())
self.assertTrue(res_xla_dynamo.device == original_device)
self.assertTrue(torch.allclose(eager_result, res_xla_dynamo))

# verify that tracing is skipped in following runs
met.clear_counters()
res_xla_dynamo_reused = fn_simple_dynamo(x, y)
self.assertNotIn('xla::add', met.counter_names())
self.assertTrue(res_xla_dynamo_reused.device == original_device)
self.assertTrue(torch.allclose(eager_result, res_xla_dynamo_reused))

# verify that dynamo can handle different inputs
res_xla_dynamo_different = fn_simple_dynamo(x + y, y * 3)
res_cpu_3 = self.fn_simple(x + y, y * 3)
self.assertTrue(res_xla_dynamo_different.device == original_device)
self.assertTrue(torch.allclose(res_cpu_3, res_xla_dynamo_different))

# There should not be any fallbacks.
self.assertEqual(torch_xla._XLAC._get_executed_fallback_ops(), [])

@parameterized.parameters(
True,
False,
)
def test_fn_without_input(self, initialize_on_cuda):
def test_fn_without_input(self):

def fn_without_input(device):
constant = 0.835
expanded = torch.full((4, 4), constant, device=device)
arange = torch.arange(16, device=device).reshape(4, 4)
return expanded + arange

device = self._choose_proper_device(initialize_on_cuda)
device = torch_xla.device()

compiled_fn = torch.compile(fn_without_input, backend='openxla')
res_cpu = fn_without_input('cpu')
res_xla_dynamo = compiled_fn(device)
self.assertTrue(torch.allclose(res_cpu, res_xla_dynamo.cpu()))

@parameterized.parameters(
(True, 'openxla'),
(False, dynamo_backend2.dynamo_backend),
(False, 'openxla'),
)
def test_simple_model_with_in_place_ops(self, initialize_on_cuda, backend):
@parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
def test_simple_model_with_in_place_ops(self, backend):

class TestModel(nn.Module):

@@ -279,7 +220,7 @@ def forward(self, index, copy_tensor, input_tensor, op_name):
output = input_tensor + self.self_tensor
return output

device = self._choose_proper_device(initialize_on_cuda)
device = torch_xla.device()

torch._dynamo.reset()
met.clear_all()
@@ -306,18 +247,14 @@ def forward(self, index, copy_tensor, input_tensor, op_name):
op_name=in_place_op)
self.assertTrue(torch.allclose(res_cpu, res_device_dynamo.cpu()))

@parameterized.parameters(
(True, 'openxla'),
(False, dynamo_backend2.dynamo_backend),
(False, 'openxla'),
)
def test_einsum(self, initialize_on_cuda, backend):
@parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
def test_einsum(self, backend):
# einsum currently does not have a meta function to compute the shape, hence it
# will fall back to XLA with a FakeTensor input to infer the output shape.
def einsum_mm(a, b):
return torch.einsum('ijkl,ijlm->ijkm', a, b)

device = self._choose_proper_device(initialize_on_cuda)
device = torch_xla.device()
a = torch.randn(4, 4, 4, 4).to(device)
b = torch.randn(4, 4, 4, 4).to(device)
torch_xla.sync()
@@ -328,16 +265,10 @@ def einsum_mm(a, b):
self.assertTrue(
torch.allclose(res_device_non_dynamo.cpu(), res_device_dynamo.cpu()))

@parameterized.parameters(
True,
False,
)
def test_simple_model_with_different_input_shape(self, initialize_on_cuda):
def test_simple_model_with_different_input_shape(self):
met.clear_all()
device = self._choose_proper_device(initialize_on_cuda)
# We need to make `dim` depend on `initialize_on_cuda` because the XLA compilation cache
# does not clean itself between the parameterized tests.
dim = 5 + int(initialize_on_cuda)
device = torch_xla.device()
dim = 5
device_x = torch.randn(dim, dim).to(device)
device_y = torch.randn(dim, dim).to(device)
new_dim = 2 * dim
@@ -369,13 +300,9 @@ def get_loader(self, device, sample_count, batch_size=4):

@skipOnTpu
@skipOnNeuron
@parameterized.parameters(
(True, 'openxla'),
(False, dynamo_backend2.dynamo_backend),
(False, 'openxla'),
)
def test_resnet18(self, initialize_on_cuda, backend):
device = self._choose_proper_device(initialize_on_cuda)
@parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
def test_resnet18(self, backend):
device = torch_xla.device()
sample_count = xu.getenv_as('SAMPLE_COUNT', int, defval=10)
loader = self.get_loader(device, sample_count, batch_size=4)
resnet18 = torchvision.models.resnet18()
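
With the CUDA path gone, every dynamo test above targets the default XLA device and the `openxla` backend. A minimal sketch of that pattern, using a hypothetical `fn_simple` stand-in rather than code from this diff:

# Illustrative only: compile for the default XLA device with the "openxla"
# dynamo backend and compare against an eager CPU reference.
import torch
import torch_xla

def fn_simple(x, y):
  return torch.cos(x) + torch.sin(y)

device = torch_xla.device()  # default XLA device (CPU, TPU, Neuron, ...)
compiled = torch.compile(fn_simple, backend="openxla")

x = torch.randn(4, 4)
y = torch.randn(4, 4)
res_cpu = fn_simple(x, y)                       # eager reference on CPU
res_xla = compiled(x.to(device), y.to(device))  # traced and compiled via XLA
assert torch.allclose(res_cpu, res_xla.cpu(), atol=1e-5)
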
2 changes: 1 addition & 1 deletion test/dynamo/test_traceable_collectives.py
@@ -20,7 +20,7 @@ def collective_broadcast_and_cos(input, src):
def _mp_fn(index):
device = torch_xla.device()
world_size = xr.world_size()
if xm.xla_device_hw(device) not in ('TPU', 'CUDA', 'NEURON'):
if xm.xla_device_hw(device) not in ('TPU', 'NEURON'):
print(f'skip this test for hw {xm.xla_device_hw(device)}')
return
ordinal_tensor = torch.tensor([index], dtype=torch.float).to(device)
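
The collective test above now runs only on TPU and NEURON hardware. A hedged sketch of the same guard-then-collective pattern; `xm.all_reduce` and `torch_xla.launch` are assumptions standing in for the traceable collective and launcher actually used in the file:

# Illustrative sketch, not code from this diff.
import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr

def _mp_fn(index):
  device = torch_xla.device()
  # Same hardware guard style as the test above.
  if xm.xla_device_hw(device) not in ('TPU', 'NEURON'):
    print(f'skip this test for hw {xm.xla_device_hw(device)}')
    return
  t = torch.tensor([index], dtype=torch.float, device=device)
  # all_reduce stands in here for the traceable collective under test.
  summed = xm.all_reduce(xm.REDUCE_SUM, t)
  torch_xla.sync()
  assert summed.item() == sum(range(xr.world_size()))

if __name__ == '__main__':
  torch_xla.launch(_mp_fn, args=())
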
9 changes: 2 additions & 7 deletions test/pjrt/test_runtime.py
@@ -17,7 +17,7 @@ class TestExperimentalPjrt(parameterized.TestCase):
def setUp(self):
xr.set_device_type('CPU')

@parameterized.parameters(('CPU', 'CPU'), ('CUDA', 'CUDA'), ('TPU', 'TPU'))
@parameterized.parameters(('CPU', 'CPU'), ('TPU', 'TPU'))
def test_device_type(self, pjrt_device, expected):
with mock.patch.dict(os.environ, {'PJRT_DEVICE': pjrt_device}, clear=True):
self.assertEqual(xr.device_type(), expected)
@@ -69,11 +69,6 @@ def test_xla_device_error(self):
}, True), ('pjrt_tpu_precedence', {
'PJRT_DEVICE': 'TPU',
'XRT_TPU_CONFIG': 'localservice;0;localhost:51011',
}, True), ('gpu_num_devives', {
'GPU_NUM_DEVICES': '4'
}, True), ('pjrt_gpu', {
'PJRT_DEVICE': 'CUDA',
'GPU_NUM_DEVICES': '4'
}, True))
def test_pjrt_default_device(self, env_vars, expect_using_pjrt):
# Prevent flag checking during reinitialization of PJRT backend.
@@ -86,7 +81,7 @@ def test_pjrt_default_device(self, env_vars, expect_using_pjrt):
reload(torch_xla)
logs_context = contextlib.nullcontext()
if expect_using_pjrt:
self.assertIn(xr.device_type(), ['CPU', 'CUDA', 'TPU', 'NEURON'])
self.assertIn(xr.device_type(), ['CPU', 'TPU', 'NEURON'])
else:
self.assertIsNone(xr.device_type())

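
Since the CUDA cases were dropped, device-type selection in these runtime tests is driven entirely by the `PJRT_DEVICE` environment variable. A small illustrative sketch of that mapping, mirroring the `mock.patch.dict` pattern used in the test above:

# Illustrative only: check the PJRT_DEVICE -> xr.device_type() mapping
# for the remaining device types, without touching the real environment.
import os
from unittest import mock

import torch_xla.runtime as xr

for pjrt_device, expected in (('CPU', 'CPU'), ('TPU', 'TPU')):
  with mock.patch.dict(os.environ, {'PJRT_DEVICE': pjrt_device}, clear=True):
    assert xr.device_type() == expected
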
17 changes: 1 addition & 16 deletions test/pytorch_test_base.py
@@ -295,7 +295,7 @@
'test_leaky_relu_inplace_with_neg_slope_xla', # expecting a specific error message
'test_upsamplingBicubic2d_correctness_xla', # FIXME! Got dtypes torch.float32 and torch.float64
'test_CTCLoss_no_batch_dim_xla', # Value out of range
'test_upsamplingBilinear2d_xla', # precision on GPU/TPU, slow compilation on CPU
'test_upsamplingBilinear2d_xla', # precision on TPU, slow compilation on CPU
# torch.autograd.gradcheck.GradcheckError: Jacobian mismatch for output 0 with respect to input 0
'test_GRU_grad_and_gradgrad_xla_float64', # grad check failure
'test_LSTM_grad_and_gradgrad_xla_float64', # grad check failure
@@ -475,18 +475,6 @@
},
}

DISABLED_TORCH_TESTS_GPU_ONLY = {
# test_torch.py
'TestTorchDeviceTypeXLA': {
'test_maximum_minimum_float_nan_and_inf', # maximum(nan,inf) = inf on GPU
},

# test_indexing.py
'TestIndexingXLA': {
'test_index_put_accumulate_large_tensor_xla', # illegal memory access was encountered
},
}


class MatchSet(object):

@@ -526,15 +514,12 @@ def union_of_disabled_tests(sets):


DISABLED_TORCH_TESTS_CPU = DISABLED_TORCH_TESTS_ANY
DISABLED_TORCH_TESTS_GPU = union_of_disabled_tests(
[DISABLED_TORCH_TESTS_ANY, DISABLED_TORCH_TESTS_GPU_ONLY])
DISABLED_TORCH_TESTS_TPU = union_of_disabled_tests(
[DISABLED_TORCH_TESTS_ANY, DISABLED_TORCH_TESTS_TPU_ONLY])

DISABLED_TORCH_TESTS = {
'TPU': prepare_match_set(DISABLED_TORCH_TESTS_TPU),
'CPU': prepare_match_set(DISABLED_TORCH_TESTS_CPU),
'CUDA': prepare_match_set(DISABLED_TORCH_TESTS_GPU),
}


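
With the GPU-only list removed, the disabled-test table reduces to the shared set plus the TPU overlay. A hedged sketch of how such per-class dict-of-sets unions can be merged; `union_of_disabled_tests_sketch` is an illustrative re-implementation, not the helper actually defined in pytorch_test_base.py:

# Illustrative sketch of merging per-test-class "disabled test" dicts.
from collections import defaultdict

def union_of_disabled_tests_sketch(sets):
  merged = defaultdict(set)
  for disabled in sets:
    for test_class, names in disabled.items():
      merged[test_class] |= set(names)
  return dict(merged)

DISABLED_ANY = {'TestTorchDeviceTypeXLA': {'test_a'}}
DISABLED_TPU_ONLY = {'TestTorchDeviceTypeXLA': {'test_b'}}
assert union_of_disabled_tests_sketch(
    [DISABLED_ANY, DISABLED_TPU_ONLY]) == {
        'TestTorchDeviceTypeXLA': {'test_a', 'test_b'}}
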
1 change: 0 additions & 1 deletion test/run_tests.sh
@@ -254,7 +254,6 @@ function run_xla_op_tests3 {
run_test "$_TEST_DIR/test_devices.py"
run_test "$_TEST_DIR/test_manual_xla_registration.py"
run_test_multi_devices "$_TEST_DIR/spmd/test_xla_dtensor_placements.py"
# NOTE: this line below is testing export and don't care about GPU
PJRT_DEVICE=CPU CPU_NUM_DEVICES=1 run_coverage "$_TEST_DIR/test_core_aten_ops.py"
run_test "$_TEST_DIR/test_pallas.py"
run_xla_ir_hlo_debug run_test "$_TEST_DIR/test_user_computation_debug_cache.py"
76 changes: 0 additions & 76 deletions test/test_autocast.py
@@ -152,82 +152,6 @@ def __init__(self, dev):
self.methods_bf16 = [("__matmul__", mat0_bf16 + mat1_fp32)]


class AutocastCudaTestExtraLists(object):

def __init__(self, dev):
super().__init__()
n = 8
dimsets = ((n, n, n), (n, n, n, n), (n, n, n, n, n))
conv_args_fp32 = [(torch.randn(dimset, dtype=torch.float32, device=dev),
torch.randn(dimset, dtype=torch.float32, device=dev))
for dimset in dimsets]

mat0_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
mat1_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
mat2_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
mat3_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)

pointwise0_fp32 = (torch.randn(n, dtype=torch.float32, device=dev),)

element0_fp32 = (torch.randn(1, dtype=torch.float32, device=dev),)

# This is currently not part of AutocastTestLists and excludes `relu`, `addbmm`
self.torch_bf16 = [
("conv1d", conv_args_fp32[0]),
("conv2d", conv_args_fp32[1]),
("conv3d", conv_args_fp32[2]),
("bmm", (torch.randn((n, n, n), device=dev, dtype=torch.float32),
torch.randn((n, n, n), device=dev, dtype=torch.float32))),
("mm", mat0_fp32 + mat1_fp32),
("matmul",
torch.matmul(
torch.ones([2, 3], device=dev, dtype=torch.float32),
torch.ones([3, 2], device=dev, dtype=torch.float32))),
("baddbmm", (torch.randn((n, n, n), device=dev, dtype=torch.float32),
torch.randn((n, n, n), device=dev, dtype=torch.float32),
torch.randn((n, n, n), device=dev, dtype=torch.float32))),
("addmm", mat1_fp32 + mat2_fp32 + mat3_fp32),
("conv_tbc", (torch.randn((10, 7, 3), device=dev, dtype=torch.float32),
torch.randn((5, 3, 5), device=dev, dtype=torch.float32),
torch.randn(5, device=dev, dtype=torch.float32), 0)),
("conv_transpose1d", conv_args_fp32[0]),
("conv_transpose2d", conv_args_fp32[1]),
("conv_transpose3d", conv_args_fp32[2]),
("prelu", pointwise0_fp32 + element0_fp32),
]


class AutocastCudaTestUnsupportedLists(object):

def __init__(self):
super().__init__()
# Utility arguments, created as one-element tuples
self.torch_expect_builtin_promote = [
"cat", # requires all input tensors to be the same type
"equal", # requires all input tensors to be the same type
"stack", # return f16 instead of f32
]
self.methods_expect_builtin_promote = []

# The remaining lists organize ops that autocast treats explicitly.
self.torch_fp16 = [
"_convolution_nogroup", # need lowering
"addmv", # need lowering
]
self.torch_fp32 = [
"norm", # produce f16 instead of f32
]
self.torch_need_autocast_promote = [
"scatter_add", # cat currently requires all input tensors to be the same type
]
self.nn_fp16 = []
self.nn_fp32 = []
self.linalg_fp16 = []
self.methods_fp16 = []
self.methods_fp32 = []
self.banned = []


class TestAutocastBase(unittest.TestCase):

@classmethod
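
The deleted classes enumerated CUDA-specific autocast coverage; the remaining XLA autocast tests exercise the bf16 path. A hedged sketch of that path, assuming `torch_xla.amp.autocast` accepts the XLA device and defaults to bfloat16 on TPU:

# Illustrative only: bf16 autocast on the XLA device.
import torch
import torch_xla
from torch_xla.amp import autocast

device = torch_xla.device()
a = torch.randn(8, 8, device=device)
b = torch.randn(8, 8, device=device)
with autocast(device):   # assumed to default to bfloat16 autocast on TPU
  out = torch.mm(a, b)   # matmul-style ops run in the lower precision
torch_xla.sync()
print(out.dtype)         # expected: torch.bfloat16 on TPU
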