
Commit 2c846bb

wincent8 authored and pytorchmergebot committed
[xpu][test] Port embedding, indexing and native_mha test files for Intel GPU (#165886)
We port test_indexing, test_native_mha and test_embedding for Intel GPU in this PR. We enable Intel GPU with the following methods, keeping the original code style as much as possible:

- Use torch.accelerator for generic GPU handling.
- Skip cases with known issues when running on XPU.
- Use torch.nn.attention.sdpa_kernel() in place of torch.backends.cuda.sdp_kernel() for Intel GPU, since torch.backends.cuda.sdp_kernel() is deprecated and Intel XPU does not support it.

Pull Request resolved: #165886
Approved by: https://github.com/guangyey, https://github.com/albanD
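The heart of the port is a device-generic `device_type` string resolved once via torch.accelerator, which the diff below substitutes for hard-coded "cuda" strings. A minimal sketch of that pattern as it appears in the diff (the tensor allocation is illustrative, not taken from the PR):

```python
import torch

# Resolve the active accelerator once; fall back to CPU when none is present.
# Passing True asks current_accelerator() to also check that the device is
# actually available, so the same line works on CUDA, XPU, and CPU-only hosts.
device_type = (
    acc.type if (acc := torch.accelerator.current_accelerator(True)) else "cpu"
)

# Tests then allocate on device_type instead of a literal "cuda":
w = torch.randn(4, 3, device=device_type)  # illustrative shape
```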
1 parent 8c86ccf · commit 2c846bb

File tree

3 files changed: +143 −46 lines

test/nn/test_embedding.py

Lines changed: 77 additions & 19 deletions
```diff
@@ -7,16 +7,17 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_device_type import (
     dtypes,
     dtypesIfCUDA,
+    dtypesIfXPU,
     instantiate_device_type_tests,
     largeTensorTest,
-    onlyCUDA,
     onlyNativeDeviceTypes,
+    onlyOn,
     skipCUDAIf,
     skipMeta,
+    skipXPUIf,
     TEST_WITH_ROCM,
 )
 from torch.testing._internal.common_nn import NNTestCase
@@ -29,14 +30,21 @@
     run_tests,
     set_default_dtype,
     skipIfTorchDynamo,
+    TEST_CUDA,
+    TEST_XPU,
+)
+
+
+device_type = (
+    acc.type if (acc := torch.accelerator.current_accelerator(True)) else "cpu"
 )
 
 
 class TestEmbeddingNN(NNTestCase):
     _do_cuda_memory_leak_check = True
     _do_cuda_non_default_stream = True
 
-    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    @unittest.skipIf(not TEST_CUDA and not TEST_XPU, "CUDA/XPU unavailable")
     def test_embedding_max_norm_unsorted_repeating_indices(self):
         def create_embedding(device):
             # Seed RNG so we get the same Embedding each time
@@ -48,8 +56,8 @@ def create_embedding(device):
         ix = torch.arange(2, device="cpu", dtype=torch.long).repeat(2000)
         out_cpu = create_embedding("cpu")(ix)
 
-        ix = ix.to("cuda")
-        out = create_embedding("cuda")(ix)
+        ix = ix.to(device_type)
+        out = create_embedding(device_type)(ix)
         self.assertEqual(out.cpu(), out_cpu)
 
     def test_embedding_sparse_basic(self):
@@ -81,9 +89,9 @@ def test_move_sparse_half_embedding(self):
         self.assertEqual(embedding.embedding_dim, 3)
         self.assertEqual(embedding.num_embeddings, 10)
 
-        if torch.cuda.is_available():
-            embedding.to("cuda")
-            self.assertEqual(embedding.weight.device.type, "cuda")
+        if not torch.accelerator.is_available():
+            embedding.to(device_type)
+            self.assertEqual(embedding.weight.device.type, device_type)
             embedding.to("cpu")
             self.assertEqual(embedding.weight.device.type, "cpu")
 
@@ -182,11 +190,11 @@ def test_embedding_functional(self):
         self.assertEqual(res_old, res_F)
 
     # https://github.com/pytorch/pytorch/issues/130806
-    @unittest.skipIf(not TEST_CUDA, "CUDA not available")
-    @largeTensorTest("40GB", device="cuda")
+    @unittest.skipIf(not TEST_CUDA and not TEST_XPU, "CUDA/XPU not available")
+    @largeTensorTest("40GB", device=device_type)
     def test_large_tensors(self):
-        input = torch.randint(low=0, high=16032, size=[131072], device="cuda")
-        w = torch.randn([16032, 16384], device="cuda")
+        input = torch.randint(low=0, high=16032, size=[131072], device=device_type)
+        w = torch.randn([16032, 16384], device=device_type)
         out = torch.nn.functional.embedding(input, w)
         self.assertEqual(out.dim(), 2)
         self.assertEqual(out.numel(), 2147483648)
@@ -308,6 +316,7 @@ def test_embedding_scalar_weight_error(self, device):
             torch.nn.functional.embedding(indices, weight)
 
     @dtypesIfCUDA(torch.float16, torch.float64)
+    @dtypesIfXPU(torch.float16, torch.float64)
     @dtypes(torch.float64)
     def test_embedding_backward(self, device, dtype):
         embedding = nn.Embedding(10, 3, sparse=True)
@@ -348,6 +357,7 @@ def test_embedding_backward(self, device, dtype):
             else (torch.float, torch.double, torch.half)
         )
     )
+    @dtypesIfXPU(torch.float32, torch.double, torch.half)
    @dtypes(torch.float32)
     def test_embedding_max_norm_backward(self, device, dtype):
         # can't use gradcheck since in place renorm makes analytical gradients different from produced ones
@@ -372,6 +382,7 @@ def test_embedding_max_norm_backward(self, device, dtype):
             else (torch.float, torch.double, torch.half)
         )
     )
+    @dtypesIfXPU(torch.float32, torch.double, torch.half)
     @dtypes(torch.float32)
     def test_embedding_max_norm_fwd_AD(self, device, dtype):
         if torch.device(device).type == "xla":
@@ -396,6 +407,7 @@ def test_embedding_max_norm_fwd_AD(self, device, dtype):
             else (torch.float, torch.double, torch.half)
         )
     )
+    @dtypesIfXPU(torch.float32, torch.double, torch.half)
     @dtypes(torch.float32)
     def test_embedding_padding_idx(self, device, dtype):
         embedding = nn.Embedding(10, 20, padding_idx=0).to(device, dtype)
@@ -488,6 +500,7 @@ def test_embedding_padding_idx(self, device, dtype):
     @onlyNativeDeviceTypes
     @dtypes(torch.float32, torch.float64)
     @dtypesIfCUDA(torch.half, torch.bfloat16)
+    @dtypesIfXPU(torch.half, torch.bfloat16)
     def test_embedding_bag_1D_padding_idx(self, device, dtype):
         num_features = 3
         max_indices_per_bag = 10
@@ -632,11 +645,12 @@ def gen_2D_indices_from_1D(
             weights.grad, weights_check.grad, msg=msg, atol=atol, rtol=rtol
         )
 
-    @onlyCUDA
+    @onlyOn(["cuda", "xpu"])
     @dtypes(
         torch.bfloat16,
     )
     @largeTensorTest("80GB", device="cuda")
+    @largeTensorTest("80GB", device="xpu")
     def test_embedding_backward_large_batch_overflow(self, device, dtype):
         """
         Test that embedding_dense_backward handles large batches that exceed INT32_MAX thread IDs.
@@ -708,6 +722,7 @@ def test_embedding_backward_large_batch_overflow(self, device, dtype):
     @onlyNativeDeviceTypes
     @dtypes(torch.float32, torch.float64)
     @dtypesIfCUDA(torch.half, torch.bfloat16)
+    @dtypesIfXPU(torch.half, torch.bfloat16)
     def test_embedding_bag_2D_padding_idx(self, device, dtype):
         # Use a Python implementation of embedding_bag with padding_idx support
         # to check torch.nn.functional.embedding_bag correctness
@@ -818,7 +833,7 @@ def embedding_bag_check(indices, weights, mode, sparse, padding_idx):
             rtol = None
         self.assertEqual(grad, grad_check, msg=msg, atol=atol, rtol=rtol)
 
-    @onlyCUDA
+    @onlyOn(["cuda", "xpu"])
     @dtypes(
         *(
             (torch.float, torch.double, torch.bfloat16, torch.half)
@@ -854,6 +869,7 @@ def test_embedding_bag_empty_input(self, device, dtypes):
         self.assertEqual(output, torch.zeros_like(output))
 
     @skipCUDAIf(True, "no out-of-bounds check on CUDA for perf.")
+    @skipXPUIf(True, "no out-of-bounds check on XPU for perf.")
     @dtypes(*itertools.product((torch.float, torch.double), (torch.int, torch.long)))
     @parametrize_test("padding_idx", [None, 0])
     @parametrize_test("mode", ["sum", "mean", "max"])
@@ -1066,6 +1082,13 @@ def _embedding_bag_reference_impl(
             (torch.float, torch.double, torch.half),
         )
     )
+    @dtypesIfXPU(
+        *itertools.product(
+            (torch.int, torch.long),
+            (torch.int, torch.long),
+            (torch.float32, torch.double, torch.half),
+        )
+    )
     def test_EmbeddingBag_empty_per_sample_weights_and_offsets(self, device, dtypes):
         # Test empty input and per sample weight, and backward pass. There was a CUDA
         # invalid configuration bug (more context in #46572)
@@ -1132,6 +1155,13 @@ def test_per_sample_weights(mode, trainable_scale):
             (torch.float, torch.double, torch.half),
         )
     )
+    @dtypesIfXPU(
+        *itertools.product(
+            (torch.int, torch.long),
+            (torch.int, torch.long),
+            (torch.float32, torch.double, torch.half),
+        )
+    )
     def test_EmbeddingBag_per_sample_weights_and_offsets(self, device, dtypes):
         def test_per_sample_weights(mode, trainable_scale):
             es = nn.EmbeddingBag(5, 2, mode=mode).to(dtype=dtypes[2], device=device)
@@ -1193,6 +1223,13 @@ def test_per_sample_weights(mode, trainable_scale):
             (torch.float, torch.double, torch.half),
         )
     )
+    @dtypesIfXPU(
+        *itertools.product(
+            (torch.int, torch.long),
+            (torch.int, torch.long),
+            (torch.float32, torch.double, torch.half),
+        )
+    )
     def test_EmbeddingBag_per_sample_weights_and_new_offsets(self, device, dtypes):
         def test_per_sample_weights_new_offsets(
             mode, trainable_scale, include_last_offset, has_weight=True
@@ -1357,6 +1394,11 @@ def _test_EmbeddingBag_vs_Embedding(
             (torch.int, torch.long), (torch.half, torch.float, torch.double)
         )
     )
+    @dtypesIfXPU(
+        *itertools.product(
+            (torch.int, torch.long), (torch.half, torch.float32, torch.double)
+        )
+    )
     @dtypes(*itertools.product((torch.int, torch.long), (torch.float, torch.double)))
     def test_EmbeddingBag_per_sample_weights_and_no_offsets(self, device, dtypes):
         def run_tests(mode, sparse, trainable_per_sample_weights):
@@ -1390,8 +1432,8 @@ def run_tests(mode, sparse, trainable_per_sample_weights):
         ):
             run_tests(mode, sparse, trainable_per_sample_weights)
 
-        # Test CUDA Dense on half precision
-        if device == "cuda":
+        # Test CUDA/XPU Dense on half precision
+        if device != "cpu":
             modes = ("sum",)
             sparsity = (False,)
             trainable_scale = (True, False)
@@ -1552,9 +1594,18 @@ def _test_EmbeddingBag(
             (torch.float, torch.double, torch.half),
         )
     )
+    @dtypesIfXPU(
+        *itertools.product(
+            (torch.int, torch.long),
+            (torch.int, torch.long),
+            (torch.float32, torch.double, torch.half),
+        )
+    )
     def test_embedding_bag_device(self, device, dtypes):
         if IS_JETSON and torch.bfloat16 in dtypes and device == "cpu":
             self.skipTest("bfloat16 not supported with Jetson cpu")
+        if dtypes[2] == torch.float64 and "xpu" in device:
+            self.skipTest("https://github.com/intel/torch-xpu-ops/issues/2295")
         with set_default_dtype(torch.double):
             self._test_EmbeddingBag(
                 device,
@@ -1582,10 +1633,10 @@ def test_embedding_bag_device(self, device, dtypes):
         )
 
         test_backward = False
-        if self.device_type == "cuda":
+        if self.device_type != "cpu":
             # see 'todo' in test_embedding_bag.
             test_backward = dtypes[2] is not torch.float16
-        elif self.device_type == "cpu":
+        else:
             # TODO: figure out why precision on sparse embeddings isn't the
             # same as for dense.
             test_backward = (
@@ -1626,6 +1677,13 @@ def test_embedding_bag_device(self, device, dtypes):
             (torch.float, torch.double, torch.half),
         )
     )
+    @dtypesIfXPU(
+        *itertools.product(
+            (torch.int, torch.long),
+            (torch.int, torch.long),
+            (torch.float32, torch.double, torch.half),
+        )
+    )
     def test_embedding_bag_non_contiguous_weight(self, device, dtypes):
         weight_tensor = torch.randn(3, 4, dtype=dtypes[2], device=device)
 
@@ -1703,7 +1761,7 @@ def test_embedding_bag_per_sample_weights_grad(
             bag(x, per_sample_weights=F.softmax(w, dim=-1))
 
 
-instantiate_device_type_tests(TestEmbeddingNNDeviceType, globals())
+instantiate_device_type_tests(TestEmbeddingNNDeviceType, globals(), allow_xpu=True)
 instantiate_parametrized_tests(TestEmbeddingNN)
 
 if __name__ == "__main__":
```
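The test_native_mha and test_indexing portions of the commit are not shown in this section. Per the commit message, the key change in test_native_mha replaces the deprecated, CUDA-only torch.backends.cuda.sdp_kernel() context manager with the device-generic torch.nn.attention.sdpa_kernel(). A hedged sketch of that migration; shapes and the backend choice are illustrative, not taken from the PR:

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# Same device-resolution pattern as in test_embedding above.
device_type = (
    acc.type if (acc := torch.accelerator.current_accelerator(True)) else "cpu"
)

# (batch, heads, seq_len, head_dim) -- illustrative shapes
q = torch.randn(2, 4, 8, 16, device=device_type)
k = torch.randn(2, 4, 8, 16, device=device_type)
v = torch.randn(2, 4, 8, 16, device=device_type)

# Deprecated, CUDA-only spelling this commit moves away from:
#   with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=True,
#                                       enable_mem_efficient=False):
#       out = F.scaled_dot_product_attention(q, k, v)

# Device-generic replacement: sdpa_kernel() restricts scaled_dot_product_attention
# to the listed backends and works on XPU as well as CUDA.
with sdpa_kernel([SDPBackend.MATH]):
    out = F.scaled_dot_product_attention(q, k, v)
```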
