Normalize DLPack stride to 1 where shape < 2 #83158
Closed
Commits (6)
d581c78  mattip  TEST: move dlpack tests to separate file
3dd5220  mattip  ENH: test, fix normalizing strides in toDLPack
94d947b  mattip  cleanup test file
964a7c5  mattip  use unittest assertEqual, not bare pytest asserts
1a00f37  mattip  fixes from review
d117e02  mattip  from review: use as_strided() to create a view
@@ -0,0 +1,193 @@
```python
# -*- coding: utf-8 -*-
# Owner(s): ["module: tests"]

import torch
from torch.testing import make_tensor
from torch.testing._internal.common_utils import TestCase, run_tests
from torch.testing._internal.common_device_type import (
    instantiate_device_type_tests, onlyCUDA, dtypes, skipMeta,
    onlyNativeDeviceTypes)
from torch.testing._internal.common_dtype import all_types_and_complex_and
from torch.utils.dlpack import from_dlpack, to_dlpack


class TestTorchDlPack(TestCase):
    exact_dtype = True

    @skipMeta
    @onlyNativeDeviceTypes
    @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16))
    def test_dlpack_capsule_conversion(self, device, dtype):
        # DLPack does not explicitly support bool (xref dmlc/dlpack#75)
        x = make_tensor((5,), dtype=dtype, device=device)
        z = from_dlpack(to_dlpack(x))
        self.assertEqual(z, x)

    @skipMeta
    @onlyNativeDeviceTypes
    @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16))
    def test_dlpack_protocol_conversion(self, device, dtype):
        x = make_tensor((5,), dtype=dtype, device=device)
        z = from_dlpack(x)
        self.assertEqual(z, x)

    @skipMeta
    @onlyNativeDeviceTypes
    def test_dlpack_shared_storage(self, device):
        x = make_tensor((5,), dtype=torch.float64, device=device)
        z = from_dlpack(to_dlpack(x))
        z[0] = z[0] + 20.0
        self.assertEqual(z, x)

    @skipMeta
    @onlyCUDA
    @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16))
    def test_dlpack_conversion_with_streams(self, device, dtype):
        # Create a stream where the tensor will reside
        stream = torch.cuda.Stream()
        with torch.cuda.stream(stream):
            # Do an operation in the actual stream
            x = make_tensor((5,), dtype=dtype, device=device) + 1
        # DLPack protocol helps establish a correct stream order
        # (hence data dependency) at the exchange boundary.
        # DLPack manages this synchronization for us, so we don't need to
        # explicitly wait until x is populated
        stream = torch.cuda.Stream()
        with torch.cuda.stream(stream):
            z = from_dlpack(x)
        stream.synchronize()
        self.assertEqual(z, x)

    @skipMeta
    @onlyNativeDeviceTypes
    @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16))
    def test_from_dlpack(self, device, dtype):
        x = make_tensor((5,), dtype=dtype, device=device)
        y = torch.from_dlpack(x)
        self.assertEqual(x, y)

    @skipMeta
    @onlyNativeDeviceTypes
    @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16))
    def test_from_dlpack_noncontinguous(self, device, dtype):
        x = make_tensor((25,), dtype=dtype, device=device).reshape(5, 5)

        y1 = x[0]
        y1_dl = torch.from_dlpack(y1)
        self.assertEqual(y1, y1_dl)

        y2 = x[:, 0]
        y2_dl = torch.from_dlpack(y2)
        self.assertEqual(y2, y2_dl)

        y3 = x[1, :]
        y3_dl = torch.from_dlpack(y3)
        self.assertEqual(y3, y3_dl)

        y4 = x[1]
        y4_dl = torch.from_dlpack(y4)
        self.assertEqual(y4, y4_dl)

        y5 = x.t()
        y5_dl = torch.from_dlpack(y5)
        self.assertEqual(y5, y5_dl)

    @skipMeta
    @onlyCUDA
    @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16))
    def test_dlpack_conversion_with_diff_streams(self, device, dtype):
        stream_a = torch.cuda.Stream()
        stream_b = torch.cuda.Stream()
        # DLPack protocol helps establish a correct stream order
        # (hence data dependency) at the exchange boundary.
        # the `tensor.__dlpack__` method will insert a synchronization event
        # in the current stream to make sure that it was correctly populated.
        with torch.cuda.stream(stream_a):
            x = make_tensor((5,), dtype=dtype, device=device) + 1
            z = torch.from_dlpack(x.__dlpack__(stream_b.cuda_stream))
            stream_a.synchronize()
        stream_b.synchronize()
        self.assertEqual(z, x)

    @skipMeta
    @onlyNativeDeviceTypes
    @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16))
    def test_from_dlpack_dtype(self, device, dtype):
        x = make_tensor((5,), dtype=dtype, device=device)
        y = torch.from_dlpack(x)
        assert x.dtype == y.dtype

    @skipMeta
    @onlyCUDA
    def test_dlpack_default_stream(self, device):
        class DLPackTensor:
            def __init__(self, tensor):
                self.tensor = tensor

            def __dlpack_device__(self):
                return self.tensor.__dlpack_device__()

            def __dlpack__(self, stream=None):
                if torch.version.hip is None:
                    assert stream == 1
                else:
                    assert stream == 0
                capsule = self.tensor.__dlpack__(stream)
                return capsule

        # CUDA-based tests run on non-default streams
        with torch.cuda.stream(torch.cuda.default_stream()):
            x = DLPackTensor(make_tensor((5,), dtype=torch.float32, device=device))
            from_dlpack(x)

    @skipMeta
    @onlyNativeDeviceTypes
    @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16))
    def test_dlpack_tensor_invalid_stream(self, device, dtype):
        with self.assertRaises(TypeError):
            x = make_tensor((5,), dtype=dtype, device=device)
            x.__dlpack__(stream=object())

    @skipMeta
    def test_dlpack_error_on_bool_tensor(self):
        x = torch.tensor([True], dtype=torch.bool)
        with self.assertRaises(RuntimeError):
            to_dlpack(x)

    # TODO: add interchange tests once NumPy 1.22 (dlpack support) is required
    @skipMeta
    def test_dlpack_export_requires_grad(self):
        x = torch.zeros(10, dtype=torch.float32, requires_grad=True)
        with self.assertRaisesRegex(RuntimeError, r"require gradient"):
            x.__dlpack__()

    @skipMeta
    def test_dlpack_export_is_conj(self):
        x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j])
        y = torch.conj(x)
        with self.assertRaisesRegex(RuntimeError, r"conjugate bit"):
            y.__dlpack__()

    @skipMeta
    def test_dlpack_export_non_strided(self):
        x = torch.sparse_coo_tensor([[0]], [1], size=(1,))
        y = torch.conj(x)
        with self.assertRaisesRegex(RuntimeError, r"strided"):
            y.__dlpack__()

    @skipMeta
    def test_dlpack_normalize_strides(self):
        # [inline review comment] for other reviewers: this is the new test
        # (the rest of the tests are just moving code around)
        x = torch.rand(16)
        y = x[::3][:1]
        self.assertEqual(y.shape, (1,))
        self.assertEqual(y.stride(), (3,))
        z = from_dlpack(y)
        self.assertEqual(z.shape, (1,))
        # gh-83069, make sure __dlpack__ normalizes strides
        self.assertEqual(z.stride(), (1,))


instantiate_device_type_tests(TestTorchDlPack, globals())

if __name__ == '__main__':
    run_tests()
```
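
For context, the rule the new test pins down is small: when a dimension holds fewer than two elements, its stride is never used in offset arithmetic, so the exporter can rewrite it to 1 before handing the tensor across the DLPack boundary. A minimal standalone C++ sketch of that rule (the helper name is illustrative; the actual fix lives in PyTorch's C++ DLPack conversion code):

```cpp
#include <cstdint>
#include <vector>

// Sketch of the gh-83069 normalization: a dimension with 0 or 1 elements
// never contributes stride * index with index > 0, so its stride can be
// rewritten to 1 without changing which bytes the tensor addresses.
std::vector<int64_t> normalizeStrides(const std::vector<int64_t>& shape,
                                      const std::vector<int64_t>& strides) {
  std::vector<int64_t> out(strides);
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] < 2) {
      out[i] = 1;
    }
  }
  return out;
}
```

In the test above, `y = x[::3][:1]` reports stride `(3,)` for a shape-`(1,)` tensor; a strict DLPack consumer could reject that as non-contiguous even though a single element is trivially contiguous, so after normalization the importer sees stride `(1,)`.
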
Do you guys have something like a `MAX_DIM` concept? Perhaps you could add a slot `strides[MAX_DIM]` in the `ATenDLMTensor` structure to store the strides. That way you can implement this whole thing with a single `new` allocation:

```cpp
int64_t *strides = ATenDLMTensor->strides;
```

PS: This is just a suggestion for an alternative implementation. I'm totally fine with things as they are now.
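
Roughly what this suggestion amounts to (the `MAX_DIMS` value and the surrounding fields are assumptions for illustration, not the actual `ATenDLMTensor` layout):

```cpp
#include <cstdint>

// Illustrative only: a fixed-size strides slot embedded in the wrapper
// struct, so the strides share the struct's single `new` allocation instead
// of needing their own. 64 is a placeholder cap on tensor rank.
constexpr int MAX_DIMS = 64;

struct ATenDLMTensorSketch {
  // at::Tensor handle;       // would keep the source tensor alive
  // DLManagedTensor tensor;  // dl_tensor.strides would point into `strides`
  int64_t strides[MAX_DIMS];
};
```
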
There is a concept of `MAX_DIMS`, but I am not sure `src.sizes.data()` is forced to be an `int64_t[MAX_DIMS]`. If it is shorter, arbitrary memory would be copied into `ATenDLMTensor.strides`.
It does not have to be, and it does not matter. No arbitrary memory is copied, simply because the `for` loop below fills up to `src.dim()` entries in the `stride` buffer; any entries beyond `src.dim()` will be unused.

The real issue is whether you are willing to pay the extra 1 KiB needed to store the `MAX_DIMS` strides vs saving one `new` allocation call. Am I making myself clear enough?
Ahh right, I misunderstood. I will let other reviewers weigh in on whether they want to avoid the `new` call.
I don't really care about perf here; I just want to minimize manual memory management. Probably the slickest way to do this is to create a new Tensor that has the normalized strides and then use the fields on that tensor directly from dlpack. WDYT?
Yeah, `as_strided` is the ticket here.
In order to use `as_strided` I need to allocate an int vector for the new strides, right? I am not sure what advantage creating a new container gives over using the existing capsule deallocate mechanism.
As I said, the benefit is that you don't have to manually memory-manage the strides separately from the Tensor object: the Tensor is the owning object that is managed, and everything else hangs off it.
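
This is the approach the final commit ("from review: use as_strided() to create a view") adopts. A hedged sketch of the idea, reusing the hypothetical `normalizeStrides()` helper from above (the real change is inside `toDLPack` and may differ in detail):

```cpp
#include <torch/torch.h>

// as_strided() returns a view sharing storage with `src`; the normalized
// stride metadata is owned by the view's TensorImpl, so nothing needs to be
// allocated or freed separately by the DLPack capsule deleter.
torch::Tensor viewWithNormalizedStrides(const torch::Tensor& src) {
  auto shape = src.sizes().vec();
  auto strides = normalizeStrides(shape, src.strides().vec());
  return src.as_strided(shape, strides, src.storage_offset());
}
```
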
Done (assuming CI passes).