|
6 | 6 |
|
7 | 7 | from torch.testing import make_tensor |
8 | 8 | from torch.testing._internal.common_utils import \ |
9 | | - (parametrize, run_tests, TestCase, DeterministicGuard, TEST_WITH_ROCM) |
| 9 | + (parametrize, run_tests, TestCase, DeterministicGuard, TEST_WITH_ROCM, serialTest) |
10 | 10 | from torch.testing._internal.common_device_type import \ |
11 | 11 | (instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA, |
12 | 12 | toleranceOverride, tol,) |
@@ -65,10 +65,12 @@ def test_gather(self, device, dtype): |
65 | 65 | actual = torch.gather(src, 2, idx) |
66 | 66 | self.assertEqual(actual, expected, atol=0, rtol=0) |
67 | 67 |
|
| 68 | + @serialTest() |
68 | 69 | @dtypes(torch.int8, torch.bfloat16) |
69 | 70 | def test_gather_large(self, device, dtype): |
70 | 71 | # test larger shapes to check vectorized implementation |
71 | | - for (m, n, k) in ((4096, 3072, 4096), (4096, 3072, 4100)): |
| 72 | + for (m, n, k) in ((4096, 3072, 4096), (4096, 3072, 4100), (4, 4, 16384 * 8192)): |
| 73 | + torch.cuda.empty_cache() |
72 | 74 | src = make_tensor((m, k), device=device, dtype=dtype) |
73 | 75 | alloc0 = torch.empty(src.nelement() * 2, device=device, dtype=dtype) |
74 | 76 | discontig = alloc0.view(m, 2 * k)[:, ::2].copy_(src) |
@@ -111,6 +113,8 @@ def test_gather_large(self, device, dtype): |
111 | 113 | self.assertEqual(res_ind, ref, atol=0, rtol=0) |
112 | 114 | res_gather = torch.gather(misaligned1, dim=dim, index=ind) |
113 | 115 | self.assertEqual(res_gather, ref, atol=0, rtol=0) |
| 116 | + del src, alloc0, alloc1, alloc2 |
| 117 | + del discontig, misaligned, misaligned1 |
114 | 118 | # test gather along 1st dim, which can accidentally trigger the fast path: |
115 | 119 | # because the index dimension in the gather dim is 1, |
116 | 120 | # an unexpected squashing in TensorIterator happens |
|
0 commit comments