diff --git a/docs/source/conf.py b/docs/source/conf.py index 66138c2d12e..d9d64591a43 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -88,8 +88,8 @@ def __init__(self, src_dir): "plot_transforms_e2e.py", "plot_cutmix_mixup.py", "plot_custom_transforms.py", - "plot_datapoints.py", - "plot_custom_datapoints.py", + "plot_vision_tensors.py", + "plot_custom_vision_tensors.py", ] def __call__(self, filename): diff --git a/docs/source/index.rst b/docs/source/index.rst index bc38fdb0307..e4822718392 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -32,7 +32,7 @@ architectures, and common image transformations for computer vision. :caption: Package Reference transforms - datapoints + vision_tensors models datasets utils diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 74ab20605b1..1d20bee5717 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -30,12 +30,12 @@ tasks (image classification, detection, segmentation, video classification). .. code:: python # Detection (re-using imports and transforms from above) - from torchvision import datapoints + from torchvision import vision_tensors img = torch.randint(0, 256, size=(3, H, W), dtype=torch.uint8) bboxes = torch.randint(0, H // 2, size=(3, 4)) bboxes[:, 2:] += bboxes[:, :2] - bboxes = datapoints.BoundingBoxes(bboxes, format="XYXY", canvas_size=(H, W)) + bboxes = vision_tensors.BoundingBoxes(bboxes, format="XYXY", canvas_size=(H, W)) # The same transforms can be used! img, bboxes = transforms(img, bboxes) @@ -183,8 +183,8 @@ Transforms are available as classes like This is very much like the :mod:`torch.nn` package which defines both classes and functional equivalents in :mod:`torch.nn.functional`. -The functionals support PIL images, pure tensors, or :ref:`datapoints -`, e.g. both ``resize(image_tensor)`` and ``resize(bboxes)`` are +The functionals support PIL images, pure tensors, or :ref:`vision_tensors +`, e.g. both ``resize(image_tensor)`` and ``resize(bboxes)`` are valid. .. note:: diff --git a/docs/source/datapoints.rst b/docs/source/vision_tensors.rst similarity index 50% rename from docs/source/datapoints.rst rename to docs/source/vision_tensors.rst index 2ecfdec54c2..f75454336f3 100644 --- a/docs/source/datapoints.rst +++ b/docs/source/vision_tensors.rst @@ -1,13 +1,13 @@ -.. _datapoints: +.. _vision_tensors: -Datapoints -========== +VisionTensors +============= -.. currentmodule:: torchvision.datapoints +.. currentmodule:: torchvision.vision_tensors -Datapoints are tensor subclasses which the :mod:`~torchvision.transforms.v2` v2 transforms use under the hood to +VisionTensors are tensor subclasses which the :mod:`~torchvision.transforms.v2` v2 transforms use under the hood to dispatch their inputs to the appropriate lower-level kernels. Most users do not -need to manipulate datapoints directly and can simply rely on dataset wrapping - +need to manipulate vision_tensors directly and can simply rely on dataset wrapping - see e.g. :ref:`sphx_glr_auto_examples_transforms_plot_transforms_e2e.py`. .. autosummary:: @@ -19,6 +19,6 @@ see e.g. :ref:`sphx_glr_auto_examples_transforms_plot_transforms_e2e.py`. 
BoundingBoxFormat BoundingBoxes Mask - Datapoint + VisionTensor set_return_type wrap diff --git a/gallery/others/plot_video_api.py b/gallery/others/plot_video_api.py index ac9eb0ba27d..1ef46ecd67c 100644 --- a/gallery/others/plot_video_api.py +++ b/gallery/others/plot_video_api.py @@ -86,7 +86,7 @@ print("PTS for first five frames ", ptss[:5]) print("Total number of frames: ", len(frames)) approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] -print("Approx total number of datapoints we can expect: ", approx_nf) +print("Approx total number of samples we can expect: ", approx_nf) print("Read data size: ", frames[0].size(0) * len(frames)) # %% @@ -170,7 +170,7 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() -# Total number of frames should be 327 for video and 523264 datapoints for audio +# Total number of frames should be 327 for video and 523264 samples for audio vf, af, info, meta = example_read_video(video) print(vf.size(), af.size()) diff --git a/gallery/transforms/helpers.py b/gallery/transforms/helpers.py index 957d9bcb709..809ab89ed87 100644 --- a/gallery/transforms/helpers.py +++ b/gallery/transforms/helpers.py @@ -1,7 +1,7 @@ import matplotlib.pyplot as plt import torch from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms.v2 import functional as F @@ -22,7 +22,7 @@ def plot(imgs, row_title=None, **imshow_kwargs): if isinstance(target, dict): boxes = target.get("boxes") masks = target.get("masks") - elif isinstance(target, datapoints.BoundingBoxes): + elif isinstance(target, vision_tensors.BoundingBoxes): boxes = target else: raise ValueError(f"Unexpected target type: {type(target)}") diff --git a/gallery/transforms/plot_custom_transforms.py b/gallery/transforms/plot_custom_transforms.py index 55e8e3f060f..c16227bb632 100644 --- a/gallery/transforms/plot_custom_transforms.py +++ b/gallery/transforms/plot_custom_transforms.py @@ -13,7 +13,7 @@ # %% import torch -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms import v2 @@ -62,7 +62,7 @@ def forward(self, img, bboxes, label): # we assume inputs are always structured H, W = 256, 256 img = torch.rand(3, H, W) -bboxes = datapoints.BoundingBoxes( +bboxes = vision_tensors.BoundingBoxes( torch.tensor([[0, 10, 10, 20], [50, 50, 70, 70]]), format="XYXY", canvas_size=(H, W) @@ -74,9 +74,9 @@ def forward(self, img, bboxes, label): # we assume inputs are always structured print(f"Output image shape: {out_img.shape}\nout_bboxes = {out_bboxes}\n{out_label = }") # %% # ..
note:: -# While working with datapoint classes in your code, make sure to +# While working with vision_tensor classes in your code, make sure to # familiarize yourself with this section: -# :ref:`datapoint_unwrapping_behaviour` +# :ref:`vision_tensor_unwrapping_behaviour` # # Supporting arbitrary input structures # ===================================== @@ -111,7 +111,7 @@ def forward(self, img, bboxes, label): # we assume inputs are always structured # In brief, the core logic is to unpack the input into a flat list using `pytree # `_, and # then transform only the entries that can be transformed (the decision is made -# based on the **class** of the entries, as all datapoints are +# based on the **class** of the entries, as all vision_tensors are # tensor-subclasses) plus some custom logic that is out of score here - check the # code for details. The (potentially transformed) entries are then repacked and # returned, in the same structure as the input. diff --git a/gallery/transforms/plot_custom_datapoints.py b/gallery/transforms/plot_custom_vision_tensors.py similarity index 70% rename from gallery/transforms/plot_custom_datapoints.py rename to gallery/transforms/plot_custom_vision_tensors.py index 674aceb6e5b..c4ec6cf1a13 100644 --- a/gallery/transforms/plot_custom_datapoints.py +++ b/gallery/transforms/plot_custom_vision_tensors.py @@ -1,62 +1,62 @@ """ -===================================== -How to write your own Datapoint class -===================================== +======================================== +How to write your own VisionTensor class +======================================== .. note:: - Try on `collab `_ - or :ref:`go to the end ` to download the full example code. + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. This guide is intended for advanced users and downstream library maintainers. We explain how to -write your own datapoint class, and how to make it compatible with the built-in +write your own vision_tensor class, and how to make it compatible with the built-in Torchvision v2 transforms. Before continuing, make sure you have read -:ref:`sphx_glr_auto_examples_transforms_plot_datapoints.py`. +:ref:`sphx_glr_auto_examples_transforms_plot_vision_tensors.py`. """ # %% import torch -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms import v2 # %% # We will create a very simple class that just inherits from the base -# :class:`~torchvision.datapoints.Datapoint` class. It will be enough to cover +# :class:`~torchvision.vision_tensors.VisionTensor` class. It will be enough to cover # what you need to know to implement your more elaborate uses-cases. If you need # to create a class that carries meta-data, take a look at how the -# :class:`~torchvision.datapoints.BoundingBoxes` class is `implemented -# `_. +# :class:`~torchvision.vision_tensors.BoundingBoxes` class is `implemented +# `_. -class MyDatapoint(datapoints.Datapoint): +class MyVisionTensor(vision_tensors.VisionTensor): pass -my_dp = MyDatapoint([1, 2, 3]) +my_dp = MyVisionTensor([1, 2, 3]) my_dp # %% -# Now that we have defined our custom Datapoint class, we want it to be +# Now that we have defined our custom VisionTensor class, we want it to be # compatible with the built-in torchvision transforms, and the functional API. 
# For that, we need to implement a kernel which performs the core of the # transformation, and then "hook" it to the functional that we want to support # via :func:`~torchvision.transforms.v2.functional.register_kernel`. # # We illustrate this process below: we create a kernel for the "horizontal flip" -# operation of our MyDatapoint class, and register it to the functional API. +# operation of our MyVisionTensor class, and register it to the functional API. from torchvision.transforms.v2 import functional as F -@F.register_kernel(functional="hflip", datapoint_cls=MyDatapoint) -def hflip_my_datapoint(my_dp, *args, **kwargs): +@F.register_kernel(functional="hflip", vision_tensor_cls=MyVisionTensor) +def hflip_my_vision_tensor(my_dp, *args, **kwargs): print("Flipping!") out = my_dp.flip(-1) - return datapoints.wrap(out, like=my_dp) + return vision_tensors.wrap(out, like=my_dp) # %% -# To understand why :func:`~torchvision.datapoints.wrap` is used, see -# :ref:`datapoint_unwrapping_behaviour`. Ignore the ``*args, **kwargs`` for now, +# To understand why :func:`~torchvision.vision_tensors.wrap` is used, see +# :ref:`vision_tensor_unwrapping_behaviour`. Ignore the ``*args, **kwargs`` for now, # we will explain it below in :ref:`param_forwarding`. # # .. note:: @@ -67,9 +67,9 @@ def hflip_my_datapoint(my_dp, *args, **kwargs): # ``@register_kernel(functional=F.hflip, ...)``. # # Now that we have registered our kernel, we can call the functional API on a -# ``MyDatapoint`` instance: +# ``MyVisionTensor`` instance: -my_dp = MyDatapoint(torch.rand(3, 256, 256)) +my_dp = MyVisionTensor(torch.rand(3, 256, 256)) _ = F.hflip(my_dp) # %% @@ -102,10 +102,10 @@ def hflip_my_datapoint(my_dp, *args, **kwargs): # to its :func:`~torchvision.transforms.v2.functional.hflip` functional. If you # already defined and registered your own kernel as -def hflip_my_datapoint(my_dp): # noqa +def hflip_my_vision_tensor(my_dp): # noqa print("Flipping!") out = my_dp.flip(-1) - return datapoints.wrap(out, like=my_dp) + return vision_tensors.wrap(out, like=my_dp) # %% diff --git a/gallery/transforms/plot_transforms_e2e.py b/gallery/transforms/plot_transforms_e2e.py index 313c7b7e606..38693d9678c 100644 --- a/gallery/transforms/plot_transforms_e2e.py +++ b/gallery/transforms/plot_transforms_e2e.py @@ -23,7 +23,7 @@ import torch import torch.utils.data -from torchvision import models, datasets, datapoints +from torchvision import models, datasets, vision_tensors from torchvision.transforms import v2 torch.manual_seed(0) @@ -72,7 +72,7 @@ # %% # We used the ``target_keys`` parameter to specify the kind of output we're # interested in. Our dataset now returns a target which is dict where the values -# are :ref:`Datapoints ` (all are :class:`torch.Tensor` +# are :ref:`VisionTensors ` (all are :class:`torch.Tensor` # subclasses). We're dropped all unncessary keys from the previous output, but # if you need any of the original keys e.g. "image_id", you can still ask for # it. 
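For readers who want to see the wrapping step described above in one place, here is a minimal sketch. It assumes the rename introduced by this patch (the subclasses live under ``torchvision.vision_tensors``) and uses placeholder dataset paths; ``wrap_dataset_for_transforms_v2`` and ``target_keys`` are the helpers already referenced in this example.

    import torch
    from torchvision import vision_tensors
    from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2

    # Placeholder paths: point these at a local COCO copy.
    dataset = CocoDetection("path/to/images", "path/to/instances.json")
    # Keep only the target entries the training loop needs.
    dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=("boxes", "labels", "masks"))

    img, target = dataset[0]
    # The wrapped target values are VisionTensor subclasses, i.e. still plain torch.Tensors.
    assert isinstance(target["boxes"], vision_tensors.BoundingBoxes)
    assert isinstance(target["boxes"], torch.Tensor)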
@@ -103,7 +103,7 @@ [ v2.ToImage(), v2.RandomPhotometricDistort(p=1), - v2.RandomZoomOut(fill={datapoints.Image: (123, 117, 104), "others": 0}), + v2.RandomZoomOut(fill={vision_tensors.Image: (123, 117, 104), "others": 0}), v2.RandomIoUCrop(), v2.RandomHorizontalFlip(p=1), v2.SanitizeBoundingBoxes(), diff --git a/gallery/transforms/plot_transforms_getting_started.py b/gallery/transforms/plot_transforms_getting_started.py index f44fbb2efd5..be949370659 100644 --- a/gallery/transforms/plot_transforms_getting_started.py +++ b/gallery/transforms/plot_transforms_getting_started.py @@ -88,9 +88,9 @@ # # Let's briefly look at a detection example with bounding boxes. -from torchvision import datapoints # we'll describe this a bit later, bare with us +from torchvision import vision_tensors # we'll describe this a bit later, bare with us -boxes = datapoints.BoundingBoxes( +boxes = vision_tensors.BoundingBoxes( [ [15, 10, 370, 510], [275, 340, 510, 510], @@ -111,44 +111,44 @@ # %% # # The example above focuses on object detection. But if we had masks -# (:class:`torchvision.datapoints.Mask`) for object segmentation or semantic -# segmentation, or videos (:class:`torchvision.datapoints.Video`), we could have +# (:class:`torchvision.vision_tensors.Mask`) for object segmentation or semantic +# segmentation, or videos (:class:`torchvision.vision_tensors.Video`), we could have # passed them to the transforms in exactly the same way. # -# By now you likely have a few questions: what are these datapoints, how do we +# By now you likely have a few questions: what are these vision_tensors, how do we # use them, and what is the expected input/output of those transforms? We'll # answer these in the next sections. # %% # -# .. _what_are_datapoints: +# .. _what_are_vision_tensors: # -# What are Datapoints? -# -------------------- +# What are VisionTensors? +# ----------------------- # -# Datapoints are :class:`torch.Tensor` subclasses. The available datapoints are -# :class:`~torchvision.datapoints.Image`, -# :class:`~torchvision.datapoints.BoundingBoxes`, -# :class:`~torchvision.datapoints.Mask`, and -# :class:`~torchvision.datapoints.Video`. +# VisionTensors are :class:`torch.Tensor` subclasses. The available vision_tensors are +# :class:`~torchvision.vision_tensors.Image`, +# :class:`~torchvision.vision_tensors.BoundingBoxes`, +# :class:`~torchvision.vision_tensors.Mask`, and +# :class:`~torchvision.vision_tensors.Video`. # -# Datapoints look and feel just like regular tensors - they **are** tensors. +# VisionTensors look and feel just like regular tensors - they **are** tensors. # Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()`` -# or any ``torch.*`` operator will also work on a datapoint: +# or any ``torch.*`` operator will also work on a vision_tensor: -img_dp = datapoints.Image(torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8)) +img_dp = vision_tensors.Image(torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8)) print(f"{isinstance(img_dp, torch.Tensor) = }") print(f"{img_dp.dtype = }, {img_dp.shape = }, {img_dp.sum() = }") # %% -# These Datapoint classes are at the core of the transforms: in order to +# These VisionTensor classes are at the core of the transforms: in order to # transform a given input, the transforms first look at the **class** of the # object, and dispatch to the appropriate implementation accordingly. 
# -# You don't need to know much more about datapoints at this point, but advanced +# You don't need to know much more about vision_tensors at this point, but advanced # users who want to learn more can refer to -# :ref:`sphx_glr_auto_examples_transforms_plot_datapoints.py`. +# :ref:`sphx_glr_auto_examples_transforms_plot_vision_tensors.py`. # # What do I pass as input? # ------------------------ @@ -196,17 +196,17 @@ # Pure :class:`torch.Tensor` objects are, in general, treated as images (or # as videos for video-specific transforms). Indeed, you may have noticed # that in the code above we haven't used the -# :class:`~torchvision.datapoints.Image` class at all, and yet our images +# :class:`~torchvision.vision_tensors.Image` class at all, and yet our images # got transformed properly. Transforms follow the following logic to # determine whether a pure Tensor should be treated as an image (or video), # or just ignored: # -# * If there is an :class:`~torchvision.datapoints.Image`, -# :class:`~torchvision.datapoints.Video`, +# * If there is an :class:`~torchvision.vision_tensors.Image`, +# :class:`~torchvision.vision_tensors.Video`, # or :class:`PIL.Image.Image` instance in the input, all other pure # tensors are passed-through. -# * If there is no :class:`~torchvision.datapoints.Image` or -# :class:`~torchvision.datapoints.Video` instance, only the first pure +# * If there is no :class:`~torchvision.vision_tensors.Image` or +# :class:`~torchvision.vision_tensors.Video` instance, only the first pure # :class:`torch.Tensor` will be transformed as image or video, while all # others will be passed-through. Here "first" means "first in a depth-wise # traversal". @@ -234,9 +234,9 @@ # Torchvision also supports datasets for object detection or segmentation like # :class:`torchvision.datasets.CocoDetection`. Those datasets predate # the existence of the :mod:`torchvision.transforms.v2` module and of the -# datapoints, so they don't return datapoints out of the box. +# vision_tensors, so they don't return vision_tensors out of the box. # -# An easy way to force those datasets to return datapoints and to make them +# An easy way to force those datasets to return vision_tensors and to make them # compatible with v2 transforms is to use the # :func:`torchvision.datasets.wrap_dataset_for_transforms_v2` function: # @@ -246,14 +246,14 @@ # # dataset = CocoDetection(..., transforms=my_transforms) # dataset = wrap_dataset_for_transforms_v2(dataset) -# # Now the dataset returns datapoints! +# # Now the dataset returns vision_tensors! # # Using your own datasets # ^^^^^^^^^^^^^^^^^^^^^^^ # # If you have a custom dataset, then you'll need to convert your objects into -# the appropriate Datapoint classes. Creating Datapoint instances is very easy, -# refer to :ref:`datapoint_creation` for more details. +# the appropriate VisionTensor classes. Creating VisionTensor instances is very easy, +# refer to :ref:`vision_tensor_creation` for more details. # # There are two main places where you can implement that conversion logic: # diff --git a/gallery/transforms/plot_datapoints.py b/gallery/transforms/plot_vision_tensors.py similarity index 50% rename from gallery/transforms/plot_datapoints.py rename to gallery/transforms/plot_vision_tensors.py index 726046097a9..ae984ffc2e3 100644 --- a/gallery/transforms/plot_datapoints.py +++ b/gallery/transforms/plot_vision_tensors.py @@ -1,43 +1,43 @@ """ -============== -Datapoints FAQ -============== +================= +VisionTensors FAQ +================= .. 
note:: - Try on `collab `_ - or :ref:`go to the end ` to download the full example code. + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. -Datapoints are Tensor subclasses introduced together with -``torchvision.transforms.v2``. This example showcases what these datapoints are +VisionTensors are Tensor subclasses introduced together with +``torchvision.transforms.v2``. This example showcases what these vision_tensors are and how they behave. .. warning:: - **Intended Audience** Unless you're writing your own transforms or your own datapoints, you + **Intended Audience** Unless you're writing your own transforms or your own vision_tensors, you probably do not need to read this guide. This is a fairly low-level topic that most users will not need to worry about: you do not need to understand - the internals of datapoints to efficiently rely on + the internals of vision_tensors to efficiently rely on ``torchvision.transforms.v2``. It may however be useful for advanced users trying to implement their own datasets, transforms, or work directly with - the datapoints. + the vision_tensors. """ # %% import PIL.Image import torch -from torchvision import datapoints +from torchvision import vision_tensors # %% -# What are datapoints? -# -------------------- +# What are vision_tensors? +# ------------------------ # -# Datapoints are zero-copy tensor subclasses: +# VisionTensors are zero-copy tensor subclasses: tensor = torch.rand(3, 256, 256) -image = datapoints.Image(tensor) +image = vision_tensors.Image(tensor) assert isinstance(image, torch.Tensor) assert image.data_ptr() == tensor.data_ptr() @@ -46,33 +46,33 @@ # Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function # for the input data. # -# :mod:`torchvision.datapoints` supports four types of datapoints: +# :mod:`torchvision.vision_tensors` supports four types of vision_tensors: # -# * :class:`~torchvision.datapoints.Image` -# * :class:`~torchvision.datapoints.Video` -# * :class:`~torchvision.datapoints.BoundingBoxes` -# * :class:`~torchvision.datapoints.Mask` +# * :class:`~torchvision.vision_tensors.Image` +# * :class:`~torchvision.vision_tensors.Video` +# * :class:`~torchvision.vision_tensors.BoundingBoxes` +# * :class:`~torchvision.vision_tensors.Mask` # -# What can I do with a datapoint? -# ------------------------------- +# What can I do with a vision_tensor? +# ----------------------------------- # -# Datapoints look and feel just like regular tensors - they **are** tensors. +# VisionTensors look and feel just like regular tensors - they **are** tensors. # Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()`` or -# any ``torch.*`` operator will also work on datapoints. See -# :ref:`datapoint_unwrapping_behaviour` for a few gotchas. +# any ``torch.*`` operator will also work on vision_tensors. See +# :ref:`vision_tensor_unwrapping_behaviour` for a few gotchas. # %% -# .. _datapoint_creation: +# .. _vision_tensor_creation: # -# How do I construct a datapoint? -# ------------------------------- +# How do I construct a vision_tensor? 
+# ----------------------------------- # # Using the constructor # ^^^^^^^^^^^^^^^^^^^^^ # -# Each datapoint class takes any tensor-like data that can be turned into a :class:`~torch.Tensor` +# Each vision_tensor class takes any tensor-like data that can be turned into a :class:`~torch.Tensor` -image = datapoints.Image([[[[0, 1], [1, 0]]]]) +image = vision_tensors.Image([[[[0, 1], [1, 0]]]]) print(image) @@ -80,64 +80,64 @@ # Similar to other PyTorch creations ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad`` # parameters. -float_image = datapoints.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True) +float_image = vision_tensors.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True) print(float_image) # %% -# In addition, :class:`~torchvision.datapoints.Image` and :class:`~torchvision.datapoints.Mask` can also take a +# In addition, :class:`~torchvision.vision_tensors.Image` and :class:`~torchvision.vision_tensors.Mask` can also take a # :class:`PIL.Image.Image` directly: -image = datapoints.Image(PIL.Image.open("../assets/astronaut.jpg")) +image = vision_tensors.Image(PIL.Image.open("../assets/astronaut.jpg")) print(image.shape, image.dtype) # %% -# Some datapoints require additional metadata to be passed in ordered to be constructed. For example, -# :class:`~torchvision.datapoints.BoundingBoxes` requires the coordinate format as well as the size of the +# Some vision_tensors require additional metadata to be passed in ordered to be constructed. For example, +# :class:`~torchvision.vision_tensors.BoundingBoxes` requires the coordinate format as well as the size of the # corresponding image (``canvas_size``) alongside the actual values. These # metadata are required to properly transform the bounding boxes. -bboxes = datapoints.BoundingBoxes( +bboxes = vision_tensors.BoundingBoxes( [[17, 16, 344, 495], [0, 10, 0, 10]], - format=datapoints.BoundingBoxFormat.XYXY, + format=vision_tensors.BoundingBoxFormat.XYXY, canvas_size=image.shape[-2:] ) print(bboxes) # %% -# Using ``datapoints.wrap()`` -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# Using ``vision_tensors.wrap()`` +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # -# You can also use the :func:`~torchvision.datapoints.wrap` function to wrap a tensor object -# into a datapoint. This is useful when you already have an object of the +# You can also use the :func:`~torchvision.vision_tensors.wrap` function to wrap a tensor object +# into a vision_tensor. This is useful when you already have an object of the # desired type, which typically happens when writing transforms: you just want # to wrap the output like the input. new_bboxes = torch.tensor([0, 20, 30, 40]) -new_bboxes = datapoints.wrap(new_bboxes, like=bboxes) -assert isinstance(new_bboxes, datapoints.BoundingBoxes) +new_bboxes = vision_tensors.wrap(new_bboxes, like=bboxes) +assert isinstance(new_bboxes, vision_tensors.BoundingBoxes) assert new_bboxes.canvas_size == bboxes.canvas_size # %% # The metadata of ``new_bboxes`` is the same as ``bboxes``, but you could pass # it as a parameter to override it. # -# .. _datapoint_unwrapping_behaviour: +# .. _vision_tensor_unwrapping_behaviour: # -# I had a Datapoint but now I have a Tensor. Help! -# ------------------------------------------------ +# I had a VisionTensor but now I have a Tensor. Help! 
+# --------------------------------------------------- # -# By default, operations on :class:`~torchvision.datapoints.Datapoint` objects +# By default, operations on :class:`~torchvision.vision_tensors.VisionTensor` objects # will return a pure Tensor: -assert isinstance(bboxes, datapoints.BoundingBoxes) +assert isinstance(bboxes, vision_tensors.BoundingBoxes) # Shift bboxes by 3 pixels in both H and W new_bboxes = bboxes + 3 assert isinstance(new_bboxes, torch.Tensor) -assert not isinstance(new_bboxes, datapoints.BoundingBoxes) +assert not isinstance(new_bboxes, vision_tensors.BoundingBoxes) # %% # .. note:: @@ -145,36 +145,36 @@ # This behavior only affects native ``torch`` operations. If you are using # the built-in ``torchvision`` transforms or functionals, you will always get # as output the same type that you passed as input (pure ``Tensor`` or -# ``Datapoint``). +# ``VisionTensor``). # %% -# But I want a Datapoint back! -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# But I want a VisionTensor back! +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # -# You can re-wrap a pure tensor into a datapoint by just calling the datapoint -# constructor, or by using the :func:`~torchvision.datapoints.wrap` function -# (see more details above in :ref:`datapoint_creation`): +# You can re-wrap a pure tensor into a vision_tensor by just calling the vision_tensor +# constructor, or by using the :func:`~torchvision.vision_tensors.wrap` function +# (see more details above in :ref:`vision_tensor_creation`): new_bboxes = bboxes + 3 -new_bboxes = datapoints.wrap(new_bboxes, like=bboxes) -assert isinstance(new_bboxes, datapoints.BoundingBoxes) +new_bboxes = vision_tensors.wrap(new_bboxes, like=bboxes) +assert isinstance(new_bboxes, vision_tensors.BoundingBoxes) # %% -# Alternatively, you can use the :func:`~torchvision.datapoints.set_return_type` +# Alternatively, you can use the :func:`~torchvision.vision_tensors.set_return_type` # as a global config setting for the whole program, or as a context manager # (read its docs to learn more about caveats): -with datapoints.set_return_type("datapoint"): +with vision_tensors.set_return_type("vision_tensor"): new_bboxes = bboxes + 3 -assert isinstance(new_bboxes, datapoints.BoundingBoxes) +assert isinstance(new_bboxes, vision_tensors.BoundingBoxes) # %% # Why is this happening? # ^^^^^^^^^^^^^^^^^^^^^^ # -# **For performance reasons**. :class:`~torchvision.datapoints.Datapoint` +# **For performance reasons**. :class:`~torchvision.vision_tensors.VisionTensor` # classes are Tensor subclasses, so any operation involving a -# :class:`~torchvision.datapoints.Datapoint` object will go through the +# :class:`~torchvision.vision_tensors.VisionTensor` object will go through the # `__torch_function__ # `_ # protocol. This induces a small overhead, which we want to avoid when possible. @@ -183,12 +183,12 @@ # ``forward``. # # **The alternative isn't much better anyway.** For every operation where -# preserving the :class:`~torchvision.datapoints.Datapoint` type makes +# preserving the :class:`~torchvision.vision_tensors.VisionTensor` type makes # sense, there are just as many operations where returning a pure Tensor is -# preferable: for example, is ``img.sum()`` still an :class:`~torchvision.datapoints.Image`? -# If we were to preserve :class:`~torchvision.datapoints.Datapoint` types all +# preferable: for example, is ``img.sum()`` still an :class:`~torchvision.vision_tensors.Image`? 
+# If we were to preserve :class:`~torchvision.vision_tensors.VisionTensor` types all # the way, even model's logits or the output of the loss function would end up -# being of type :class:`~torchvision.datapoints.Image`, and surely that's not +# being of type :class:`~torchvision.vision_tensors.Image`, and surely that's not # desirable. # # .. note:: @@ -203,22 +203,22 @@ # There are a few exceptions to this "unwrapping" rule: # :meth:`~torch.Tensor.clone`, :meth:`~torch.Tensor.to`, # :meth:`torch.Tensor.detach`, and :meth:`~torch.Tensor.requires_grad_` retain -# the datapoint type. +# the vision_tensor type. # -# Inplace operations on datapoints like ``obj.add_()`` will preserve the type of +# Inplace operations on vision_tensors like ``obj.add_()`` will preserve the type of # ``obj``. However, the **returned** value of inplace operations will be a pure # tensor: -image = datapoints.Image([[[0, 1], [1, 0]]]) +image = vision_tensors.Image([[[0, 1], [1, 0]]]) new_image = image.add_(1).mul_(2) -# image got transformed in-place and is still an Image datapoint, but new_image +# image got transformed in-place and is still an Image vision_tensor, but new_image # is a Tensor. They share the same underlying data and they're equal, just # different classes. -assert isinstance(image, datapoints.Image) +assert isinstance(image, vision_tensors.Image) print(image) -assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image) +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, vision_tensors.Image) assert (new_image == image).all() assert new_image.data_ptr() == image.data_ptr() diff --git a/mypy.ini b/mypy.ini index 07b9c75c516..fbe89b4f35b 100644 --- a/mypy.ini +++ b/mypy.ini @@ -7,7 +7,7 @@ allow_redefinition = True no_implicit_optional = True warn_redundant_casts = True -[mypy-torchvision.prototype.datapoints.*] +[mypy-torchvision.prototype.vision_tensors.*] ; untyped definitions and calls disallow_untyped_defs = True diff --git a/references/detection/presets.py b/references/detection/presets.py index e7b2ca35792..190cb4eada5 100644 --- a/references/detection/presets.py +++ b/references/detection/presets.py @@ -7,10 +7,10 @@ def get_modules(use_v2): # We need a protected import to avoid the V2 warning in case just V1 is used if use_v2: - import torchvision.datapoints import torchvision.transforms.v2 + import torchvision.vision_tensors - return torchvision.transforms.v2, torchvision.datapoints + return torchvision.transforms.v2, torchvision.vision_tensors else: return reference_transforms, None @@ -28,16 +28,16 @@ def __init__( use_v2=False, ): - T, datapoints = get_modules(use_v2) + T, vision_tensors = get_modules(use_v2) transforms = [] backend = backend.lower() - if backend == "datapoint": + if backend == "vision_tensor": transforms.append(T.ToImage()) elif backend == "tensor": transforms.append(T.PILToTensor()) elif backend != "pil": - raise ValueError(f"backend can be 'datapoint', 'tensor' or 'pil', but got {backend}") + raise ValueError(f"backend can be 'vision_tensor', 'tensor' or 'pil', but got {backend}") if data_augmentation == "hflip": transforms += [T.RandomHorizontalFlip(p=hflip_prob)] @@ -54,7 +54,7 @@ def __init__( T.RandomHorizontalFlip(p=hflip_prob), ] elif data_augmentation == "ssd": - fill = defaultdict(lambda: mean, {datapoints.Mask: 0}) if use_v2 else list(mean) + fill = defaultdict(lambda: mean, {vision_tensors.Mask: 0}) if use_v2 else list(mean) transforms += [ T.RandomPhotometricDistort(), T.RandomZoomOut(fill=fill), @@ 
-77,7 +77,7 @@ def __init__( if use_v2: transforms += [ - T.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.XYXY), + T.ConvertBoundingBoxFormat(vision_tensors.BoundingBoxFormat.XYXY), T.SanitizeBoundingBoxes(), T.ToPureTensor(), ] @@ -98,10 +98,10 @@ def __init__(self, backend="pil", use_v2=False): transforms += [T.ToImage() if use_v2 else T.PILToTensor()] elif backend == "tensor": transforms += [T.PILToTensor()] - elif backend == "datapoint": + elif backend == "vision_tensor": transforms += [T.ToImage()] else: - raise ValueError(f"backend can be 'datapoint', 'tensor' or 'pil', but got {backend}") + raise ValueError(f"backend can be 'vision_tensor', 'tensor' or 'pil', but got {backend}") transforms += [T.ToDtype(torch.float, scale=True)] diff --git a/references/detection/train.py b/references/detection/train.py index 892ffbbbc1c..f128e1d2f97 100644 --- a/references/detection/train.py +++ b/references/detection/train.py @@ -180,8 +180,8 @@ def get_args_parser(add_help=True): def main(args): - if args.backend.lower() == "datapoint" and not args.use_v2: - raise ValueError("Use --use-v2 if you want to use the datapoint backend.") + if args.backend.lower() == "vision_tensor" and not args.use_v2: + raise ValueError("Use --use-v2 if you want to use the vision_tensor backend.") if args.dataset not in ("coco", "coco_kp"): raise ValueError(f"Dataset should be coco or coco_kp, got {args.dataset}") if "keypoint" in args.model and args.dataset != "coco_kp": diff --git a/references/segmentation/presets.py b/references/segmentation/presets.py index b0539fcca3f..692685d3480 100644 --- a/references/segmentation/presets.py +++ b/references/segmentation/presets.py @@ -4,11 +4,11 @@ def get_modules(use_v2): # We need a protected import to avoid the V2 warning in case just V1 is used if use_v2: - import torchvision.datapoints import torchvision.transforms.v2 + import torchvision.vision_tensors import v2_extras - return torchvision.transforms.v2, torchvision.datapoints, v2_extras + return torchvision.transforms.v2, torchvision.vision_tensors, v2_extras else: import transforms @@ -27,16 +27,16 @@ def __init__( backend="pil", use_v2=False, ): - T, datapoints, v2_extras = get_modules(use_v2) + T, vision_tensors, v2_extras = get_modules(use_v2) transforms = [] backend = backend.lower() - if backend == "datapoint": + if backend == "vision_tensor": transforms.append(T.ToImage()) elif backend == "tensor": transforms.append(T.PILToTensor()) elif backend != "pil": - raise ValueError(f"backend can be 'datapoint', 'tensor' or 'pil', but got {backend}") + raise ValueError(f"backend can be 'vision_tensor', 'tensor' or 'pil', but got {backend}") transforms += [T.RandomResize(min_size=int(0.5 * base_size), max_size=int(2.0 * base_size))] @@ -46,7 +46,7 @@ def __init__( if use_v2: # We need a custom pad transform here, since the padding we want to perform here is fundamentally # different from the padding in `RandomCrop` if `pad_if_needed=True`. 
- transforms += [v2_extras.PadIfSmaller(crop_size, fill={datapoints.Mask: 255, "others": 0})] + transforms += [v2_extras.PadIfSmaller(crop_size, fill={vision_tensors.Mask: 255, "others": 0})] transforms += [T.RandomCrop(crop_size)] @@ -54,9 +54,9 @@ def __init__( transforms += [T.PILToTensor()] if use_v2: - img_type = datapoints.Image if backend == "datapoint" else torch.Tensor + img_type = vision_tensors.Image if backend == "vision_tensor" else torch.Tensor transforms += [ - T.ToDtype(dtype={img_type: torch.float32, datapoints.Mask: torch.int64, "others": None}, scale=True) + T.ToDtype(dtype={img_type: torch.float32, vision_tensors.Mask: torch.int64, "others": None}, scale=True) ] else: # No need to explicitly convert masks as they're magically int64 already @@ -82,10 +82,10 @@ def __init__( backend = backend.lower() if backend == "tensor": transforms += [T.PILToTensor()] - elif backend == "datapoint": + elif backend == "vision_tensor": transforms += [T.ToImage()] elif backend != "pil": - raise ValueError(f"backend can be 'datapoint', 'tensor' or 'pil', but got {backend}") + raise ValueError(f"backend can be 'vision_tensor', 'tensor' or 'pil', but got {backend}") if use_v2: transforms += [T.Resize(size=(base_size, base_size))] diff --git a/references/segmentation/train.py b/references/segmentation/train.py index 7ca4bd1c592..dbd5f43691a 100644 --- a/references/segmentation/train.py +++ b/references/segmentation/train.py @@ -128,7 +128,7 @@ def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, devi def main(args): if args.backend.lower() != "pil" and not args.use_v2: # TODO: Support tensor backend in V1? - raise ValueError("Use --use-v2 if you want to use the datapoint or tensor backend.") + raise ValueError("Use --use-v2 if you want to use the vision_tensor or tensor backend.") if args.use_v2 and args.dataset != "coco": raise ValueError("v2 is only support supported for coco dataset for now.") diff --git a/references/segmentation/v2_extras.py b/references/segmentation/v2_extras.py index ae55f0727a4..caa2e36ac08 100644 --- a/references/segmentation/v2_extras.py +++ b/references/segmentation/v2_extras.py @@ -1,6 +1,6 @@ """This file only exists to be lazy-imported and avoid V2-related import warnings when just using V1.""" import torch -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms import v2 @@ -80,4 +80,4 @@ def forward(self, image, target): if segmentation_mask is None: segmentation_mask = torch.zeros(v2.functional.get_size(image), dtype=torch.uint8) - return image, datapoints.Mask(segmentation_mask) + return image, vision_tensors.Mask(segmentation_mask) diff --git a/test/common_utils.py b/test/common_utils.py index 61f06994801..d2258d2f3ec 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -19,7 +19,7 @@ from PIL import Image from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair -from torchvision import datapoints, io +from torchvision import io, vision_tensors from torchvision.transforms._functional_tensor import _max_value as get_max_value from torchvision.transforms.v2.functional import to_image, to_pil_image @@ -391,7 +391,7 @@ def make_image( if color_space in {"GRAY_ALPHA", "RGBA"}: data[..., -1, :, :] = max_value - return datapoints.Image(data) + return vision_tensors.Image(data) def make_image_tensor(*args, **kwargs): @@ -405,7 +405,7 @@ def make_image_pil(*args, **kwargs): def make_bounding_boxes( canvas_size=DEFAULT_SIZE, *, - 
format=datapoints.BoundingBoxFormat.XYXY, + format=vision_tensors.BoundingBoxFormat.XYXY, dtype=None, device="cpu", ): @@ -415,7 +415,7 @@ def sample_position(values, max_value): return torch.stack([torch.randint(max_value - v, ()) for v in values.tolist()]) if isinstance(format, str): - format = datapoints.BoundingBoxFormat[format] + format = vision_tensors.BoundingBoxFormat[format] dtype = dtype or torch.float32 @@ -424,21 +424,21 @@ def sample_position(values, max_value): y = sample_position(h, canvas_size[0]) x = sample_position(w, canvas_size[1]) - if format is datapoints.BoundingBoxFormat.XYWH: + if format is vision_tensors.BoundingBoxFormat.XYWH: parts = (x, y, w, h) - elif format is datapoints.BoundingBoxFormat.XYXY: + elif format is vision_tensors.BoundingBoxFormat.XYXY: x1, y1 = x, y x2 = x1 + w y2 = y1 + h parts = (x1, y1, x2, y2) - elif format is datapoints.BoundingBoxFormat.CXCYWH: + elif format is vision_tensors.BoundingBoxFormat.CXCYWH: cx = x + w / 2 cy = y + h / 2 parts = (cx, cy, w, h) else: raise ValueError(f"Format {format} is not supported") - return datapoints.BoundingBoxes( + return vision_tensors.BoundingBoxes( torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size ) @@ -446,7 +446,7 @@ def sample_position(values, max_value): def make_detection_mask(size=DEFAULT_SIZE, *, dtype=None, device="cpu"): """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" num_objects = 1 - return datapoints.Mask( + return vision_tensors.Mask( torch.testing.make_tensor( (num_objects, *size), low=0, @@ -459,7 +459,7 @@ def make_detection_mask(size=DEFAULT_SIZE, *, dtype=None, device="cpu"): def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): """Make a "segmentation" mask, i.e. 
(*, H, W), where the category is encoded as pixel value""" - return datapoints.Mask( + return vision_tensors.Mask( torch.testing.make_tensor( (*batch_dims, *size), low=0, @@ -471,7 +471,7 @@ def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=( def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): - return datapoints.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) + return vision_tensors.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) def make_video_tensor(*args, **kwargs): diff --git a/test/datasets_utils.py b/test/datasets_utils.py index f7a1b8dd3de..9482a52cfd5 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -568,7 +568,7 @@ def test_transforms(self, config): @test_all_configs def test_transforms_v2_wrapper(self, config): - from torchvision import datapoints + from torchvision import vision_tensors from torchvision.datasets import wrap_dataset_for_transforms_v2 try: @@ -590,7 +590,7 @@ def test_transforms_v2_wrapper(self, config): wrapped_sample = wrapped_dataset[0] assert tree_any( - lambda item: isinstance(item, (datapoints.Datapoint, PIL.Image.Image)), wrapped_sample + lambda item: isinstance(item, (vision_tensors.VisionTensor, PIL.Image.Image)), wrapped_sample ) except TypeError as error: msg = f"No wrapper exists for dataset class {type(dataset).__name__}" @@ -717,7 +717,7 @@ def check_transforms_v2_wrapper_spawn(dataset): pytest.skip("Multiprocessing spawning is only checked on macOS.") from torch.utils.data import DataLoader - from torchvision import datapoints + from torchvision import vision_tensors from torchvision.datasets import wrap_dataset_for_transforms_v2 wrapped_dataset = wrap_dataset_for_transforms_v2(dataset) @@ -726,7 +726,7 @@ def check_transforms_v2_wrapper_spawn(dataset): for wrapped_sample in dataloader: assert tree_any( - lambda item: isinstance(item, (datapoints.Image, datapoints.Video, PIL.Image.Image)), wrapped_sample + lambda item: isinstance(item, (vision_tensors.Image, vision_tensors.Video, PIL.Image.Image)), wrapped_sample ) diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index acbe1a6a77a..ee7a6751acd 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -6,7 +6,7 @@ import torch from torch.nn.functional import one_hot -from torchvision.prototype import datapoints +from torchvision.prototype import vision_tensors from transforms_v2_legacy_utils import combinations_grid, DEFAULT_EXTRA_DIMS, from_loader, from_loaders, TensorLoader @@ -40,7 +40,7 @@ def fn(shape, dtype, device): # The idiom `make_tensor(..., dtype=torch.int64).to(dtype)` is intentional to only get integer values, # regardless of the requested dtype, e.g. 
0 or 0.0 rather than 0 or 0.123 data = torch.testing.make_tensor(shape, low=0, high=num_categories, dtype=torch.int64, device=device).to(dtype) - return datapoints.Label(data, categories=categories) + return vision_tensors.Label(data, categories=categories) return LabelLoader(fn, shape=extra_dims, dtype=dtype, categories=categories) @@ -64,7 +64,7 @@ def fn(shape, dtype, device): # since `one_hot` only supports int64 label = make_label_loader(extra_dims=extra_dims, categories=num_categories, dtype=torch.int64).load(device) data = one_hot(label, num_classes=num_categories).to(dtype) - return datapoints.OneHotLabel(data, categories=categories) + return vision_tensors.OneHotLabel(data, categories=categories) return OneHotLabelLoader(fn, shape=(*extra_dims, num_categories), dtype=dtype, categories=categories) diff --git a/test/test_datasets.py b/test/test_datasets.py index 265316264f8..ad5bd622eba 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -3387,11 +3387,11 @@ def test_missing_wrapper(self): datasets.wrap_dataset_for_transforms_v2(dataset) def test_subclass(self, mocker): - from torchvision import datapoints + from torchvision import vision_tensors sentinel = object() mocker.patch.dict( - datapoints._dataset_wrapper.WRAPPER_FACTORIES, + vision_tensors._dataset_wrapper.WRAPPER_FACTORIES, clear=False, values={datasets.FakeData: lambda dataset, target_keys: lambda idx, sample: sentinel}, ) diff --git a/test/test_prototype_datasets_builtin.py b/test/test_prototype_datasets_builtin.py index 8497ea27b54..d015c191445 100644 --- a/test/test_prototype_datasets_builtin.py +++ b/test/test_prototype_datasets_builtin.py @@ -19,12 +19,12 @@ from torchdata.dataloader2.graph.utils import traverse_dps from torchdata.datapipes.iter import ShardingFilter, Shuffler from torchdata.datapipes.utils import StreamWrapper -from torchvision import datapoints +from torchvision import vision_tensors from torchvision._utils import sequence_to_str from torchvision.prototype import datasets -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import EncodedImage from torchvision.prototype.datasets.utils._internal import INFINITE_BUFFER_SIZE +from torchvision.prototype.vision_tensors import Label from torchvision.transforms.v2._utils import is_pure_tensor @@ -147,7 +147,7 @@ def test_no_unaccompanied_pure_tensors(self, dataset_mock, config): pure_tensors = {key for key, value in sample.items() if is_pure_tensor(value)} if pure_tensors and not any( - isinstance(item, (datapoints.Image, datapoints.Video, EncodedImage)) for item in sample.values() + isinstance(item, (vision_tensors.Image, vision_tensors.Video, EncodedImage)) for item in sample.values() ): raise AssertionError( f"The values of key(s) " @@ -276,7 +276,7 @@ def test_sample_content(self, dataset_mock, config): assert "image" in sample assert "label" in sample - assert isinstance(sample["image"], datapoints.Image) + assert isinstance(sample["image"], vision_tensors.Image) assert isinstance(sample["label"], Label) assert sample["image"].shape == (1, 16, 16) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index b4e1d108748..4f27bddb174 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -7,11 +7,11 @@ from common_utils import assert_equal from prototype_common_utils import make_label - -from torchvision.datapoints import BoundingBoxes, BoundingBoxFormat, Image, Mask, Video -from torchvision.prototype import datapoints, transforms 
+from torchvision.prototype import transforms, vision_tensors from torchvision.transforms.v2._utils import check_type, is_pure_tensor from torchvision.transforms.v2.functional import clamp_bounding_boxes, InterpolationMode, pil_to_tensor, to_pil_image + +from torchvision.vision_tensors import BoundingBoxes, BoundingBoxFormat, Image, Mask, Video from transforms_v2_legacy_utils import ( DEFAULT_EXTRA_DIMS, make_bounding_boxes, @@ -51,7 +51,7 @@ def test__extract_image_targets_assertion(self, mocker): # images, batch size = 2 self.create_fake_image(mocker, Image), # labels, bboxes, masks - mocker.MagicMock(spec=datapoints.Label), + mocker.MagicMock(spec=vision_tensors.Label), mocker.MagicMock(spec=BoundingBoxes), mocker.MagicMock(spec=Mask), # labels, bboxes, masks @@ -63,7 +63,7 @@ def test__extract_image_targets_assertion(self, mocker): transform._extract_image_targets(flat_sample) @pytest.mark.parametrize("image_type", [Image, PIL.Image.Image, torch.Tensor]) - @pytest.mark.parametrize("label_type", [datapoints.Label, datapoints.OneHotLabel]) + @pytest.mark.parametrize("label_type", [vision_tensors.Label, vision_tensors.OneHotLabel]) def test__extract_image_targets(self, image_type, label_type, mocker): transform = transforms.SimpleCopyPaste() @@ -101,7 +101,7 @@ def test__extract_image_targets(self, image_type, label_type, mocker): assert isinstance(target[key], type_) assert target[key] in flat_sample - @pytest.mark.parametrize("label_type", [datapoints.Label, datapoints.OneHotLabel]) + @pytest.mark.parametrize("label_type", [vision_tensors.Label, vision_tensors.OneHotLabel]) def test__copy_paste(self, label_type): image = 2 * torch.ones(3, 32, 32) masks = torch.zeros(2, 32, 32) @@ -111,7 +111,7 @@ def test__copy_paste(self, label_type): blending = True resize_interpolation = InterpolationMode.BILINEAR antialias = None - if label_type == datapoints.OneHotLabel: + if label_type == vision_tensors.OneHotLabel: labels = torch.nn.functional.one_hot(labels, num_classes=5) target = { "boxes": BoundingBoxes( @@ -126,7 +126,7 @@ def test__copy_paste(self, label_type): paste_masks[0, 13:19, 12:18] = 1 paste_masks[1, 15:19, 1:8] = 1 paste_labels = torch.tensor([3, 4]) - if label_type == datapoints.OneHotLabel: + if label_type == vision_tensors.OneHotLabel: paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5) paste_target = { "boxes": BoundingBoxes( @@ -148,7 +148,7 @@ def test__copy_paste(self, label_type): torch.testing.assert_close(output_target["boxes"][2:, :], paste_target["boxes"]) expected_labels = torch.tensor([1, 2, 3, 4]) - if label_type == datapoints.OneHotLabel: + if label_type == vision_tensors.OneHotLabel: expected_labels = torch.nn.functional.one_hot(expected_labels, num_classes=5) torch.testing.assert_close(output_target["labels"], label_type(expected_labels)) @@ -258,10 +258,10 @@ def test__transform_bounding_boxes_clamping(self, mocker): class TestLabelToOneHot: def test__transform(self): categories = ["apple", "pear", "pineapple"] - labels = datapoints.Label(torch.tensor([0, 1, 2, 1]), categories=categories) + labels = vision_tensors.Label(torch.tensor([0, 1, 2, 1]), categories=categories) transform = transforms.LabelToOneHot() ohe_labels = transform(labels) - assert isinstance(ohe_labels, datapoints.OneHotLabel) + assert isinstance(ohe_labels, vision_tensors.OneHotLabel) assert ohe_labels.shape == (4, 3) assert ohe_labels.categories == labels.categories == categories @@ -383,7 +383,7 @@ def import_transforms_from_references(reference): def 
test_fixed_sized_crop_against_detection_reference(): - def make_datapoints(): + def make_vision_tensors(): size = (600, 800) num_objects = 22 @@ -405,19 +405,19 @@ def make_datapoints(): yield (tensor_image, target) - datapoint_image = make_image(size=size, color_space="RGB") + vision_tensor_image = make_image(size=size, color_space="RGB") target = { "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } - yield (datapoint_image, target) + yield (vision_tensor_image, target) t = transforms.FixedSizeCrop((1024, 1024), fill=0) t_ref = det_transforms.FixedSizeCrop((1024, 1024), fill=0) - for dp in make_datapoints(): + for dp in make_vision_tensors(): # We should use prototype transform first as reference transform performs inplace target update torch.manual_seed(12) output = t(dp) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 982c86d0426..6ea07487f4e 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -13,7 +13,7 @@ from common_utils import assert_equal, cpu_and_cuda from torch.utils._pytree import tree_flatten, tree_unflatten -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.ops.boxes import box_iou from torchvision.transforms.functional import to_pil_image from torchvision.transforms.v2 import functional as F @@ -66,10 +66,10 @@ def auto_augment_adapter(transform, input, device): adapted_input = {} image_or_video_found = False for key, value in input.items(): - if isinstance(value, (datapoints.BoundingBoxes, datapoints.Mask)): + if isinstance(value, (vision_tensors.BoundingBoxes, vision_tensors.Mask)): # AA transforms don't support bounding boxes or masks continue - elif check_type(value, (datapoints.Image, datapoints.Video, is_pure_tensor, PIL.Image.Image)): + elif check_type(value, (vision_tensors.Image, vision_tensors.Video, is_pure_tensor, PIL.Image.Image)): if image_or_video_found: # AA transforms only support a single image or video continue @@ -99,7 +99,7 @@ def normalize_adapter(transform, input, device): if isinstance(value, PIL.Image.Image): # normalize doesn't support PIL images continue - elif check_type(value, (datapoints.Image, datapoints.Video, is_pure_tensor)): + elif check_type(value, (vision_tensors.Image, vision_tensors.Video, is_pure_tensor)): # normalize doesn't support integer images value = F.to_dtype(value, torch.float32, scale=True) adapted_input[key] = value @@ -142,7 +142,7 @@ class TestSmoke: (transforms.Resize([16, 16], antialias=True), None), (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2), antialias=True), None), (transforms.ClampBoundingBoxes(), None), - (transforms.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.CXCYWH), None), + (transforms.ConvertBoundingBoxFormat(vision_tensors.BoundingBoxFormat.CXCYWH), None), (transforms.ConvertImageDtype(), None), (transforms.GaussianBlur(kernel_size=3), None), ( @@ -178,19 +178,19 @@ def test_common(self, transform, adapter, container_type, image_or_video, de_ser canvas_size = F.get_size(image_or_video) input = dict( image_or_video=image_or_video, - image_datapoint=make_image(size=canvas_size), - video_datapoint=make_video(size=canvas_size), + image_vision_tensor=make_image(size=canvas_size), + video_vision_tensor=make_video(size=canvas_size), image_pil=next(make_pil_images(sizes=[canvas_size], 
color_spaces=["RGB"])), bounding_boxes_xyxy=make_bounding_boxes( - format=datapoints.BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(3,) + format=vision_tensors.BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(3,) ), bounding_boxes_xywh=make_bounding_boxes( - format=datapoints.BoundingBoxFormat.XYWH, canvas_size=canvas_size, batch_dims=(4,) + format=vision_tensors.BoundingBoxFormat.XYWH, canvas_size=canvas_size, batch_dims=(4,) ), bounding_boxes_cxcywh=make_bounding_boxes( - format=datapoints.BoundingBoxFormat.CXCYWH, canvas_size=canvas_size, batch_dims=(5,) + format=vision_tensors.BoundingBoxFormat.CXCYWH, canvas_size=canvas_size, batch_dims=(5,) ), - bounding_boxes_degenerate_xyxy=datapoints.BoundingBoxes( + bounding_boxes_degenerate_xyxy=vision_tensors.BoundingBoxes( [ [0, 0, 0, 0], # no height or width [0, 0, 0, 1], # no height @@ -199,10 +199,10 @@ def test_common(self, transform, adapter, container_type, image_or_video, de_ser [0, 2, 1, 1], # x1 < x2, y1 > y2 [2, 2, 1, 1], # x1 > x2, y1 > y2 ], - format=datapoints.BoundingBoxFormat.XYXY, + format=vision_tensors.BoundingBoxFormat.XYXY, canvas_size=canvas_size, ), - bounding_boxes_degenerate_xywh=datapoints.BoundingBoxes( + bounding_boxes_degenerate_xywh=vision_tensors.BoundingBoxes( [ [0, 0, 0, 0], # no height or width [0, 0, 0, 1], # no height @@ -211,10 +211,10 @@ def test_common(self, transform, adapter, container_type, image_or_video, de_ser [0, 0, -1, 1], # negative width [0, 0, -1, -1], # negative height and width ], - format=datapoints.BoundingBoxFormat.XYWH, + format=vision_tensors.BoundingBoxFormat.XYWH, canvas_size=canvas_size, ), - bounding_boxes_degenerate_cxcywh=datapoints.BoundingBoxes( + bounding_boxes_degenerate_cxcywh=vision_tensors.BoundingBoxes( [ [0, 0, 0, 0], # no height or width [0, 0, 0, 1], # no height @@ -223,7 +223,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, de_ser [0, 0, -1, 1], # negative width [0, 0, -1, -1], # negative height and width ], - format=datapoints.BoundingBoxFormat.CXCYWH, + format=vision_tensors.BoundingBoxFormat.CXCYWH, canvas_size=canvas_size, ), detection_mask=make_detection_mask(size=canvas_size), @@ -262,7 +262,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, de_ser else: assert output_item is input_item - if isinstance(input_item, datapoints.BoundingBoxes) and not isinstance( + if isinstance(input_item, vision_tensors.BoundingBoxes) and not isinstance( transform, transforms.ConvertBoundingBoxFormat ): assert output_item.format == input_item.format @@ -270,9 +270,9 @@ def test_common(self, transform, adapter, container_type, image_or_video, de_ser # Enforce that the transform does not turn a degenerate box marked by RandomIoUCrop (or any other future # transform that does this), back into a valid one. 
# TODO: we should test that against all degenerate boxes above - for format in list(datapoints.BoundingBoxFormat): + for format in list(vision_tensors.BoundingBoxFormat): sample = dict( - boxes=datapoints.BoundingBoxes([[0, 0, 0, 0]], format=format, canvas_size=(224, 244)), + boxes=vision_tensors.BoundingBoxes([[0, 0, 0, 0]], format=format, canvas_size=(224, 244)), labels=torch.tensor([3]), ) assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4) @@ -652,7 +652,7 @@ def test__get_params(self, value): class TestTransform: @pytest.mark.parametrize( "inpt_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBoxes, str, int], + [torch.Tensor, PIL.Image.Image, vision_tensors.Image, np.ndarray, vision_tensors.BoundingBoxes, str, int], ) def test_check_transformed_types(self, inpt_type, mocker): # This test ensures that we correctly handle which types to transform and which to bypass @@ -670,7 +670,7 @@ def test_check_transformed_types(self, inpt_type, mocker): class TestToImage: @pytest.mark.parametrize( "inpt_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBoxes, str, int], + [torch.Tensor, PIL.Image.Image, vision_tensors.Image, np.ndarray, vision_tensors.BoundingBoxes, str, int], ) def test__transform(self, inpt_type, mocker): fn = mocker.patch( @@ -681,7 +681,7 @@ def test__transform(self, inpt_type, mocker): inpt = mocker.MagicMock(spec=inpt_type) transform = transforms.ToImage() transform(inpt) - if inpt_type in (datapoints.BoundingBoxes, datapoints.Image, str, int): + if inpt_type in (vision_tensors.BoundingBoxes, vision_tensors.Image, str, int): assert fn.call_count == 0 else: fn.assert_called_once_with(inpt) @@ -690,7 +690,7 @@ def test__transform(self, inpt_type, mocker): class TestToPILImage: @pytest.mark.parametrize( "inpt_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBoxes, str, int], + [torch.Tensor, PIL.Image.Image, vision_tensors.Image, np.ndarray, vision_tensors.BoundingBoxes, str, int], ) def test__transform(self, inpt_type, mocker): fn = mocker.patch("torchvision.transforms.v2.functional.to_pil_image") @@ -698,7 +698,7 @@ def test__transform(self, inpt_type, mocker): inpt = mocker.MagicMock(spec=inpt_type) transform = transforms.ToPILImage() transform(inpt) - if inpt_type in (PIL.Image.Image, datapoints.BoundingBoxes, str, int): + if inpt_type in (PIL.Image.Image, vision_tensors.BoundingBoxes, str, int): assert fn.call_count == 0 else: fn.assert_called_once_with(inpt, mode=transform.mode) @@ -707,7 +707,7 @@ def test__transform(self, inpt_type, mocker): class TestToTensor: @pytest.mark.parametrize( "inpt_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, np.ndarray, datapoints.BoundingBoxes, str, int], + [torch.Tensor, PIL.Image.Image, vision_tensors.Image, np.ndarray, vision_tensors.BoundingBoxes, str, int], ) def test__transform(self, inpt_type, mocker): fn = mocker.patch("torchvision.transforms.functional.to_tensor") @@ -716,7 +716,7 @@ def test__transform(self, inpt_type, mocker): with pytest.warns(UserWarning, match="deprecated and will be removed"): transform = transforms.ToTensor() transform(inpt) - if inpt_type in (datapoints.Image, torch.Tensor, datapoints.BoundingBoxes, str, int): + if inpt_type in (vision_tensors.Image, torch.Tensor, vision_tensors.BoundingBoxes, str, int): assert fn.call_count == 0 else: fn.assert_called_once_with(inpt) @@ -757,7 +757,7 @@ class TestRandomIoUCrop: def test__get_params(self, device, 
options): orig_h, orig_w = size = (24, 32) image = make_image(size) - bboxes = datapoints.BoundingBoxes( + bboxes = vision_tensors.BoundingBoxes( torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), format="XYXY", canvas_size=size, @@ -792,8 +792,8 @@ def test__get_params(self, device, options): def test__transform_empty_params(self, mocker): transform = transforms.RandomIoUCrop(sampler_options=[2.0]) - image = datapoints.Image(torch.rand(1, 3, 4, 4)) - bboxes = datapoints.BoundingBoxes(torch.tensor([[1, 1, 2, 2]]), format="XYXY", canvas_size=(4, 4)) + image = vision_tensors.Image(torch.rand(1, 3, 4, 4)) + bboxes = vision_tensors.BoundingBoxes(torch.tensor([[1, 1, 2, 2]]), format="XYXY", canvas_size=(4, 4)) label = torch.tensor([1]) sample = [image, bboxes, label] # Let's mock transform._get_params to control the output: @@ -827,11 +827,11 @@ def test__transform(self, mocker): # check number of bboxes vs number of labels: output_bboxes = output[1] - assert isinstance(output_bboxes, datapoints.BoundingBoxes) + assert isinstance(output_bboxes, vision_tensors.BoundingBoxes) assert (output_bboxes[~is_within_crop_area] == 0).all() output_masks = output[2] - assert isinstance(output_masks, datapoints.Mask) + assert isinstance(output_masks, vision_tensors.Mask) class TestScaleJitter: @@ -899,7 +899,7 @@ def test_assertions(self): [ 122 * torch.ones(1, 3, 8, 8), 122.0 * torch.ones(1, 3, 8, 8), - datapoints.Image(122 * torch.ones(1, 3, 8, 8)), + vision_tensors.Image(122 * torch.ones(1, 3, 8, 8)), PIL.Image.new("RGB", (8, 8), (122, 122, 122)), ], ) @@ -941,7 +941,7 @@ class TestUniformTemporalSubsample: [ torch.zeros(10, 3, 8, 8), torch.zeros(1, 10, 3, 8, 8), - datapoints.Video(torch.zeros(1, 10, 3, 8, 8)), + vision_tensors.Video(torch.zeros(1, 10, 3, 8, 8)), ], ) def test__transform(self, inpt): @@ -971,12 +971,12 @@ def test_antialias_warning(): transforms.RandomResize(10, 20)(tensor_img) with pytest.warns(UserWarning, match=match): - F.resized_crop(datapoints.Image(tensor_img), 0, 0, 10, 10, (20, 20)) + F.resized_crop(vision_tensors.Image(tensor_img), 0, 0, 10, 10, (20, 20)) with pytest.warns(UserWarning, match=match): - F.resize(datapoints.Video(tensor_video), (20, 20)) + F.resize(vision_tensors.Video(tensor_video), (20, 20)) with pytest.warns(UserWarning, match=match): - F.resized_crop(datapoints.Video(tensor_video), 0, 0, 10, 10, (20, 20)) + F.resized_crop(vision_tensors.Video(tensor_video), 0, 0, 10, 10, (20, 20)) with warnings.catch_warnings(): warnings.simplefilter("error") @@ -990,17 +990,17 @@ def test_antialias_warning(): transforms.RandomShortestSize((20, 20), antialias=True)(tensor_img) transforms.RandomResize(10, 20, antialias=True)(tensor_img) - F.resized_crop(datapoints.Image(tensor_img), 0, 0, 10, 10, (20, 20), antialias=True) - F.resized_crop(datapoints.Video(tensor_video), 0, 0, 10, 10, (20, 20), antialias=True) + F.resized_crop(vision_tensors.Image(tensor_img), 0, 0, 10, 10, (20, 20), antialias=True) + F.resized_crop(vision_tensors.Video(tensor_video), 0, 0, 10, 10, (20, 20), antialias=True) -@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, datapoints.Image)) +@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, vision_tensors.Image)) @pytest.mark.parametrize("label_type", (torch.Tensor, int)) @pytest.mark.parametrize("dataset_return_type", (dict, tuple)) @pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImage)) def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): - 
image = datapoints.Image(torch.randint(0, 256, size=(1, 3, 250, 250), dtype=torch.uint8)) + image = vision_tensors.Image(torch.randint(0, 256, size=(1, 3, 250, 250), dtype=torch.uint8)) if image_type is PIL.Image: image = to_pil_image(image[0]) elif image_type is torch.Tensor: @@ -1056,7 +1056,7 @@ def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): assert out_label == label -@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, datapoints.Image)) +@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, vision_tensors.Image)) @pytest.mark.parametrize("data_augmentation", ("hflip", "lsj", "multiscale", "ssd", "ssdlite")) @pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImage)) @pytest.mark.parametrize("sanitize", (True, False)) @@ -1082,7 +1082,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): # leaving FixedSizeCrop in prototype for now, and it expects Label # classes which we won't release yet. # transforms.FixedSizeCrop( - # size=(1024, 1024), fill=defaultdict(lambda: (123.0, 117.0, 104.0), {datapoints.Mask: 0}) + # size=(1024, 1024), fill=defaultdict(lambda: (123.0, 117.0, 104.0), {vision_tensors.Mask: 0}) # ), transforms.RandomCrop((1024, 1024), pad_if_needed=True), transforms.RandomHorizontalFlip(p=1), @@ -1101,7 +1101,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): elif data_augmentation == "ssd": t = [ transforms.RandomPhotometricDistort(p=1), - transforms.RandomZoomOut(fill={"others": (123.0, 117.0, 104.0), datapoints.Mask: 0}, p=1), + transforms.RandomZoomOut(fill={"others": (123.0, 117.0, 104.0), vision_tensors.Mask: 0}, p=1), transforms.RandomIoUCrop(), transforms.RandomHorizontalFlip(p=1), to_tensor, @@ -1121,7 +1121,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): num_boxes = 5 H = W = 250 - image = datapoints.Image(torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8)) + image = vision_tensors.Image(torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8)) if image_type is PIL.Image: image = to_pil_image(image[0]) elif image_type is torch.Tensor: @@ -1133,9 +1133,9 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): boxes = torch.randint(0, min(H, W) // 2, size=(num_boxes, 4)) boxes[:, 2:] += boxes[:, :2] boxes = boxes.clamp(min=0, max=min(H, W)) - boxes = datapoints.BoundingBoxes(boxes, format="XYXY", canvas_size=(H, W)) + boxes = vision_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=(H, W)) - masks = datapoints.Mask(torch.randint(0, 2, size=(num_boxes, H, W), dtype=torch.uint8)) + masks = vision_tensors.Mask(torch.randint(0, 2, size=(num_boxes, H, W), dtype=torch.uint8)) sample = { "image": image, @@ -1146,10 +1146,10 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): out = t(sample) - if isinstance(to_tensor, transforms.ToTensor) and image_type is not datapoints.Image: + if isinstance(to_tensor, transforms.ToTensor) and image_type is not vision_tensors.Image: assert is_pure_tensor(out["image"]) else: - assert isinstance(out["image"], datapoints.Image) + assert isinstance(out["image"], vision_tensors.Image) assert isinstance(out["label"], type(sample["label"])) num_boxes_expected = { @@ -1204,13 +1204,13 @@ def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): boxes = torch.tensor(boxes) labels = torch.arange(boxes.shape[0]) - boxes = datapoints.BoundingBoxes( + boxes = vision_tensors.BoundingBoxes( 
boxes, - format=datapoints.BoundingBoxFormat.XYXY, + format=vision_tensors.BoundingBoxFormat.XYXY, canvas_size=(H, W), ) - masks = datapoints.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) + masks = vision_tensors.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) whatever = torch.rand(10) input_img = torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8) sample = { @@ -1244,8 +1244,8 @@ def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): assert out_image is input_img assert out_whatever is whatever - assert isinstance(out_boxes, datapoints.BoundingBoxes) - assert isinstance(out_masks, datapoints.Mask) + assert isinstance(out_boxes, vision_tensors.BoundingBoxes) + assert isinstance(out_masks, vision_tensors.Mask) if labels_getter is None or (callable(labels_getter) and labels_getter({"labels": "blah"}) is None): assert out_labels is labels @@ -1266,15 +1266,15 @@ def test_sanitize_bounding_boxes_no_label(): transforms.SanitizeBoundingBoxes()(img, boxes) out_img, out_boxes = transforms.SanitizeBoundingBoxes(labels_getter=None)(img, boxes) - assert isinstance(out_img, datapoints.Image) - assert isinstance(out_boxes, datapoints.BoundingBoxes) + assert isinstance(out_img, vision_tensors.Image) + assert isinstance(out_boxes, vision_tensors.BoundingBoxes) def test_sanitize_bounding_boxes_errors(): - good_bbox = datapoints.BoundingBoxes( + good_bbox = vision_tensors.BoundingBoxes( [[0, 0, 10, 10]], - format=datapoints.BoundingBoxFormat.XYXY, + format=vision_tensors.BoundingBoxFormat.XYXY, canvas_size=(20, 20), ) diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index a06ecb74824..d5d53e41bec 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -13,7 +13,7 @@ import torchvision.transforms.v2 as v2_transforms from common_utils import assert_close, assert_equal, set_rng_seed from torch import nn -from torchvision import datapoints, transforms as legacy_transforms +from torchvision import transforms as legacy_transforms, vision_tensors from torchvision._utils import sequence_to_str from torchvision.transforms import functional as legacy_F @@ -478,15 +478,15 @@ def check_call_consistency( output_prototype_image = prototype_transform(image) except Exception as exc: raise AssertionError( - f"Transforming a image datapoint with shape {image_repr} failed in the prototype transform with " + f"Transforming a image vision_tensor with shape {image_repr} failed in the prototype transform with " f"the error above. This means there is a consistency bug either in `_get_params` or in the " - f"`datapoints.Image` path in `_transform`." + f"`vision_tensors.Image` path in `_transform`." 
) from exc assert_close( output_prototype_image, output_prototype_tensor, - msg=lambda msg: f"Output for datapoint and tensor images is not equal: \n\n{msg}", + msg=lambda msg: f"Output for vision_tensor and tensor images is not equal: \n\n{msg}", **closeness_kwargs, ) @@ -747,7 +747,7 @@ class TestAATransforms: [ torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), PIL.Image.new("RGB", (256, 256), 123), - datapoints.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), + vision_tensors.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), ], ) @pytest.mark.parametrize( @@ -812,7 +812,7 @@ def test_randaug_jit(self, interpolation): [ torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), PIL.Image.new("RGB", (256, 256), 123), - datapoints.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), + vision_tensors.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), ], ) @pytest.mark.parametrize( @@ -887,7 +887,7 @@ def test_trivial_aug_jit(self, interpolation): [ torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), PIL.Image.new("RGB", (256, 256), 123), - datapoints.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), + vision_tensors.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), ], ) @pytest.mark.parametrize( @@ -964,7 +964,7 @@ def test_augmix_jit(self, interpolation): [ torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), PIL.Image.new("RGB", (256, 256), 123), - datapoints.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), + vision_tensors.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), ], ) @pytest.mark.parametrize( @@ -1030,7 +1030,7 @@ def import_transforms_from_references(reference): class TestRefDetTransforms: - def make_datapoints(self, with_mask=True): + def make_vision_tensors(self, with_mask=True): size = (600, 800) num_objects = 22 @@ -1057,7 +1057,7 @@ def make_label(extra_dims, categories): yield (tensor_image, target) - datapoint_image = make_image(size=size, color_space="RGB", dtype=torch.float32) + vision_tensor_image = make_image(size=size, color_space="RGB", dtype=torch.float32) target = { "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), @@ -1065,7 +1065,7 @@ def make_label(extra_dims, categories): if with_mask: target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) - yield (datapoint_image, target) + yield (vision_tensor_image, target) @pytest.mark.parametrize( "t_ref, t, data_kwargs", @@ -1095,7 +1095,7 @@ def make_label(extra_dims, categories): ], ) def test_transform(self, t_ref, t, data_kwargs): - for dp in self.make_datapoints(**data_kwargs): + for dp in self.make_vision_tensors(**data_kwargs): # We should use prototype transform first as reference transform performs inplace target update torch.manual_seed(12) @@ -1135,7 +1135,7 @@ def _transform(self, inpt, params): class TestRefSegTransforms: - def make_datapoints(self, supports_pil=True, image_dtype=torch.uint8): + def make_vision_tensors(self, supports_pil=True, image_dtype=torch.uint8): size = (256, 460) num_categories = 21 @@ -1145,13 +1145,13 @@ def make_datapoints(self, supports_pil=True, image_dtype=torch.uint8): conv_fns.extend([torch.Tensor, lambda x: x]) for conv_fn in conv_fns: - datapoint_image = make_image(size=size, color_space="RGB", 
dtype=image_dtype) - datapoint_mask = make_segmentation_mask(size=size, num_categories=num_categories, dtype=torch.uint8) + vision_tensor_image = make_image(size=size, color_space="RGB", dtype=image_dtype) + vision_tensor_mask = make_segmentation_mask(size=size, num_categories=num_categories, dtype=torch.uint8) - dp = (conv_fn(datapoint_image), datapoint_mask) + dp = (conv_fn(vision_tensor_image), vision_tensor_mask) dp_ref = ( - to_pil_image(datapoint_image) if supports_pil else datapoint_image.as_subclass(torch.Tensor), - to_pil_image(datapoint_mask), + to_pil_image(vision_tensor_image) if supports_pil else vision_tensor_image.as_subclass(torch.Tensor), + to_pil_image(vision_tensor_mask), ) yield dp, dp_ref @@ -1161,7 +1161,7 @@ def set_seed(self, seed=12): random.seed(seed) def check(self, t, t_ref, data_kwargs=None): - for dp, dp_ref in self.make_datapoints(**data_kwargs or dict()): + for dp, dp_ref in self.make_vision_tensors(**data_kwargs or dict()): self.set_seed() actual = actual_image, actual_mask = t(dp) @@ -1192,7 +1192,7 @@ def check(self, t, t_ref, data_kwargs=None): seg_transforms.RandomCrop(size=480), v2_transforms.Compose( [ - PadIfSmaller(size=480, fill={datapoints.Mask: 255, "others": 0}), + PadIfSmaller(size=480, fill={vision_tensors.Mask: 255, "others": 0}), v2_transforms.RandomCrop(size=480), ] ), diff --git a/test/test_transforms_v2_functional.py b/test/test_transforms_v2_functional.py index 315993c750e..75de56fba4f 100644 --- a/test/test_transforms_v2_functional.py +++ b/test/test_transforms_v2_functional.py @@ -10,7 +10,7 @@ from common_utils import assert_close, cache, cpu_and_cuda, needs_cuda, set_rng_seed from torch.utils._pytree import tree_map -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms.functional import _get_perspective_coeffs from torchvision.transforms.v2 import functional as F from torchvision.transforms.v2._utils import is_pure_tensor @@ -164,22 +164,22 @@ def _unbatch(self, batch, *, data_dims): def test_batched_vs_single(self, test_id, info, args_kwargs, device): (batched_input, *other_args), kwargs = args_kwargs.load(device) - datapoint_type = datapoints.Image if is_pure_tensor(batched_input) else type(batched_input) + vision_tensor_type = vision_tensors.Image if is_pure_tensor(batched_input) else type(batched_input) # This dictionary contains the number of rightmost dimensions that contain the actual data. # Everything to the left is considered a batch dimension. data_dims = { - datapoints.Image: 3, - datapoints.BoundingBoxes: 1, + vision_tensors.Image: 3, + vision_tensors.BoundingBoxes: 1, # `Mask`'s are special in the sense that the data dimensions depend on the type of mask. For detection masks # it is 3 `(*, N, H, W)`, but for segmentation masks it is 2 `(*, H, W)`. Since both a grouped under one # type all kernels should also work without differentiating between the two. Thus, we go with 2 here as # common ground. - datapoints.Mask: 2, - datapoints.Video: 4, - }.get(datapoint_type) + vision_tensors.Mask: 2, + vision_tensors.Video: 4, + }.get(vision_tensor_type) if data_dims is None: raise pytest.UsageError( - f"The number of data dimensions cannot be determined for input of type {datapoint_type.__name__}." + f"The number of data dimensions cannot be determined for input of type {vision_tensor_type.__name__}." 
) from None elif batched_input.ndim <= data_dims: pytest.skip("Input is not batched.") @@ -305,8 +305,8 @@ def make_spy(fn, *, module=None, name=None): class TestDispatchers: image_sample_inputs = make_info_args_kwargs_parametrization( - [info for info in DISPATCHER_INFOS if datapoints.Image in info.kernels], - args_kwargs_fn=lambda info: info.sample_inputs(datapoints.Image), + [info for info in DISPATCHER_INFOS if vision_tensors.Image in info.kernels], + args_kwargs_fn=lambda info: info.sample_inputs(vision_tensors.Image), ) @make_info_args_kwargs_parametrization( @@ -328,8 +328,8 @@ def test_logging(self, spy_on, info, args_kwargs, device): def test_scripted_smoke(self, info, args_kwargs, device): dispatcher = script(info.dispatcher) - (image_datapoint, *other_args), kwargs = args_kwargs.load(device) - image_pure_tensor = torch.Tensor(image_datapoint) + (image_vision_tensor, *other_args), kwargs = args_kwargs.load(device) + image_pure_tensor = torch.Tensor(image_vision_tensor) dispatcher(image_pure_tensor, *other_args, **kwargs) @@ -355,25 +355,25 @@ def test_scriptable(self, dispatcher): @image_sample_inputs def test_pure_tensor_output_type(self, info, args_kwargs): - (image_datapoint, *other_args), kwargs = args_kwargs.load() - image_pure_tensor = image_datapoint.as_subclass(torch.Tensor) + (image_vision_tensor, *other_args), kwargs = args_kwargs.load() + image_pure_tensor = image_vision_tensor.as_subclass(torch.Tensor) output = info.dispatcher(image_pure_tensor, *other_args, **kwargs) - # We cannot use `isinstance` here since all datapoints are instances of `torch.Tensor` as well + # We cannot use `isinstance` here since all vision_tensors are instances of `torch.Tensor` as well assert type(output) is torch.Tensor @make_info_args_kwargs_parametrization( [info for info in DISPATCHER_INFOS if info.pil_kernel_info is not None], - args_kwargs_fn=lambda info: info.sample_inputs(datapoints.Image), + args_kwargs_fn=lambda info: info.sample_inputs(vision_tensors.Image), ) def test_pil_output_type(self, info, args_kwargs): - (image_datapoint, *other_args), kwargs = args_kwargs.load() + (image_vision_tensor, *other_args), kwargs = args_kwargs.load() - if image_datapoint.ndim > 3: + if image_vision_tensor.ndim > 3: pytest.skip("Input is batched") - image_pil = F.to_pil_image(image_datapoint) + image_pil = F.to_pil_image(image_vision_tensor) output = info.dispatcher(image_pil, *other_args, **kwargs) @@ -383,38 +383,44 @@ def test_pil_output_type(self, info, args_kwargs): DISPATCHER_INFOS, args_kwargs_fn=lambda info: info.sample_inputs(), ) - def test_datapoint_output_type(self, info, args_kwargs): - (datapoint, *other_args), kwargs = args_kwargs.load() + def test_vision_tensor_output_type(self, info, args_kwargs): + (vision_tensor, *other_args), kwargs = args_kwargs.load() - output = info.dispatcher(datapoint, *other_args, **kwargs) + output = info.dispatcher(vision_tensor, *other_args, **kwargs) - assert isinstance(output, type(datapoint)) + assert isinstance(output, type(vision_tensor)) - if isinstance(datapoint, datapoints.BoundingBoxes) and info.dispatcher is not F.convert_bounding_box_format: - assert output.format == datapoint.format + if ( + isinstance(vision_tensor, vision_tensors.BoundingBoxes) + and info.dispatcher is not F.convert_bounding_box_format + ): + assert output.format == vision_tensor.format @pytest.mark.parametrize( - ("dispatcher_info", "datapoint_type", "kernel_info"), + ("dispatcher_info", "vision_tensor_type", "kernel_info"), [ pytest.param( - dispatcher_info, 
datapoint_type, kernel_info, id=f"{dispatcher_info.id}-{datapoint_type.__name__}" + dispatcher_info, + vision_tensor_type, + kernel_info, + id=f"{dispatcher_info.id}-{vision_tensor_type.__name__}", ) for dispatcher_info in DISPATCHER_INFOS - for datapoint_type, kernel_info in dispatcher_info.kernel_infos.items() + for vision_tensor_type, kernel_info in dispatcher_info.kernel_infos.items() ], ) - def test_dispatcher_kernel_signatures_consistency(self, dispatcher_info, datapoint_type, kernel_info): + def test_dispatcher_kernel_signatures_consistency(self, dispatcher_info, vision_tensor_type, kernel_info): dispatcher_signature = inspect.signature(dispatcher_info.dispatcher) dispatcher_params = list(dispatcher_signature.parameters.values())[1:] kernel_signature = inspect.signature(kernel_info.kernel) kernel_params = list(kernel_signature.parameters.values())[1:] - # We filter out metadata that is implicitly passed to the dispatcher through the input datapoint, but has to be + # We filter out metadata that is implicitly passed to the dispatcher through the input vision_tensor, but has to be # explicitly passed to the kernel. input_type = {v: k for k, v in dispatcher_info.kernels.items()}.get(kernel_info.kernel) explicit_metadata = { - datapoints.BoundingBoxes: {"format", "canvas_size"}, + vision_tensors.BoundingBoxes: {"format", "canvas_size"}, } kernel_params = [param for param in kernel_params if param.name not in explicit_metadata.get(input_type, set())] @@ -445,9 +451,9 @@ def test_unkown_type(self, info): [ info for info in DISPATCHER_INFOS - if datapoints.BoundingBoxes in info.kernels and info.dispatcher is not F.convert_bounding_box_format + if vision_tensors.BoundingBoxes in info.kernels and info.dispatcher is not F.convert_bounding_box_format ], - args_kwargs_fn=lambda info: info.sample_inputs(datapoints.BoundingBoxes), + args_kwargs_fn=lambda info: info.sample_inputs(vision_tensors.BoundingBoxes), ) def test_bounding_boxes_format_consistency(self, info, args_kwargs): (bounding_boxes, *other_args), kwargs = args_kwargs.load() @@ -497,7 +503,7 @@ class TestClampBoundingBoxes: "metadata", [ dict(), - dict(format=datapoints.BoundingBoxFormat.XYXY), + dict(format=vision_tensors.BoundingBoxFormat.XYXY), dict(canvas_size=(1, 1)), ], ) @@ -510,16 +516,16 @@ def test_pure_tensor_insufficient_metadata(self, metadata): @pytest.mark.parametrize( "metadata", [ - dict(format=datapoints.BoundingBoxFormat.XYXY), + dict(format=vision_tensors.BoundingBoxFormat.XYXY), dict(canvas_size=(1, 1)), - dict(format=datapoints.BoundingBoxFormat.XYXY, canvas_size=(1, 1)), + dict(format=vision_tensors.BoundingBoxFormat.XYXY, canvas_size=(1, 1)), ], ) - def test_datapoint_explicit_metadata(self, metadata): - datapoint = next(make_multiple_bounding_boxes()) + def test_vision_tensor_explicit_metadata(self, metadata): + vision_tensor = next(make_multiple_bounding_boxes()) with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` must not be passed")): - F.clamp_bounding_boxes(datapoint, **metadata) + F.clamp_bounding_boxes(vision_tensor, **metadata) class TestConvertFormatBoundingBoxes: @@ -527,7 +533,7 @@ class TestConvertFormatBoundingBoxes: ("inpt", "old_format"), [ (next(make_multiple_bounding_boxes()), None), - (next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor), datapoints.BoundingBoxFormat.XYXY), + (next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor), vision_tensors.BoundingBoxFormat.XYXY), ], ) def test_missing_new_format(self, inpt, old_format): @@ -538,14 +544,14 @@ 
def test_pure_tensor_insufficient_metadata(self): pure_tensor = next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor) with pytest.raises(ValueError, match=re.escape("`old_format` has to be passed")): - F.convert_bounding_box_format(pure_tensor, new_format=datapoints.BoundingBoxFormat.CXCYWH) + F.convert_bounding_box_format(pure_tensor, new_format=vision_tensors.BoundingBoxFormat.CXCYWH) - def test_datapoint_explicit_metadata(self): - datapoint = next(make_multiple_bounding_boxes()) + def test_vision_tensor_explicit_metadata(self): + vision_tensor = next(make_multiple_bounding_boxes()) with pytest.raises(ValueError, match=re.escape("`old_format` must not be passed")): F.convert_bounding_box_format( - datapoint, old_format=datapoint.format, new_format=datapoints.BoundingBoxFormat.CXCYWH + vision_tensor, old_format=vision_tensor.format, new_format=vision_tensors.BoundingBoxFormat.CXCYWH ) @@ -579,7 +585,11 @@ def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "format", - [datapoints.BoundingBoxFormat.XYXY, datapoints.BoundingBoxFormat.XYWH, datapoints.BoundingBoxFormat.CXCYWH], + [ + vision_tensors.BoundingBoxFormat.XYXY, + vision_tensors.BoundingBoxFormat.XYWH, + vision_tensors.BoundingBoxFormat.CXCYWH, + ], ) @pytest.mark.parametrize( "top, left, height, width, expected_bboxes", @@ -602,7 +612,7 @@ def test_correctness_crop_bounding_boxes(device, format, top, left, height, widt # out_box = denormalize_bbox(n_out_box, height, width) # expected_bboxes.append(out_box) - format = datapoints.BoundingBoxFormat.XYXY + format = vision_tensors.BoundingBoxFormat.XYXY canvas_size = (64, 76) in_boxes = [ [10.0, 15.0, 25.0, 35.0], @@ -610,11 +620,11 @@ def test_correctness_crop_bounding_boxes(device, format, top, left, height, widt [45.0, 46.0, 56.0, 62.0], ] in_boxes = torch.tensor(in_boxes, device=device) - if format != datapoints.BoundingBoxFormat.XYXY: - in_boxes = convert_bounding_box_format(in_boxes, datapoints.BoundingBoxFormat.XYXY, format) + if format != vision_tensors.BoundingBoxFormat.XYXY: + in_boxes = convert_bounding_box_format(in_boxes, vision_tensors.BoundingBoxFormat.XYXY, format) expected_bboxes = clamp_bounding_boxes( - datapoints.BoundingBoxes(expected_bboxes, format="XYXY", canvas_size=canvas_size) + vision_tensors.BoundingBoxes(expected_bboxes, format="XYXY", canvas_size=canvas_size) ).tolist() output_boxes, output_canvas_size = F.crop_bounding_boxes( @@ -626,8 +636,8 @@ def test_correctness_crop_bounding_boxes(device, format, top, left, height, widt canvas_size[1], ) - if format != datapoints.BoundingBoxFormat.XYXY: - output_boxes = convert_bounding_box_format(output_boxes, format, datapoints.BoundingBoxFormat.XYXY) + if format != vision_tensors.BoundingBoxFormat.XYXY: + output_boxes = convert_bounding_box_format(output_boxes, format, vision_tensors.BoundingBoxFormat.XYXY) torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) torch.testing.assert_close(output_canvas_size, canvas_size) @@ -648,7 +658,11 @@ def test_correctness_vertical_flip_segmentation_mask_on_fixed_input(device): @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "format", - [datapoints.BoundingBoxFormat.XYXY, datapoints.BoundingBoxFormat.XYWH, datapoints.BoundingBoxFormat.CXCYWH], + [ + vision_tensors.BoundingBoxFormat.XYXY, + vision_tensors.BoundingBoxFormat.XYWH, + vision_tensors.BoundingBoxFormat.CXCYWH, + ], ) @pytest.mark.parametrize( "top, left, height, width, 
size", @@ -666,7 +680,7 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): bbox[3] = (bbox[3] - top_) * size_[0] / height_ return bbox - format = datapoints.BoundingBoxFormat.XYXY + format = vision_tensors.BoundingBoxFormat.XYXY canvas_size = (100, 100) in_boxes = [ [10.0, 10.0, 20.0, 20.0], @@ -677,16 +691,16 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): expected_bboxes.append(_compute_expected_bbox(list(in_box), top, left, height, width, size)) expected_bboxes = torch.tensor(expected_bboxes, device=device) - in_boxes = datapoints.BoundingBoxes( - in_boxes, format=datapoints.BoundingBoxFormat.XYXY, canvas_size=canvas_size, device=device + in_boxes = vision_tensors.BoundingBoxes( + in_boxes, format=vision_tensors.BoundingBoxFormat.XYXY, canvas_size=canvas_size, device=device ) - if format != datapoints.BoundingBoxFormat.XYXY: - in_boxes = convert_bounding_box_format(in_boxes, datapoints.BoundingBoxFormat.XYXY, format) + if format != vision_tensors.BoundingBoxFormat.XYXY: + in_boxes = convert_bounding_box_format(in_boxes, vision_tensors.BoundingBoxFormat.XYXY, format) output_boxes, output_canvas_size = F.resized_crop_bounding_boxes(in_boxes, format, top, left, height, width, size) - if format != datapoints.BoundingBoxFormat.XYXY: - output_boxes = convert_bounding_box_format(output_boxes, format, datapoints.BoundingBoxFormat.XYXY) + if format != vision_tensors.BoundingBoxFormat.XYXY: + output_boxes = convert_bounding_box_format(output_boxes, format, vision_tensors.BoundingBoxFormat.XYXY) torch.testing.assert_close(output_boxes, expected_bboxes) torch.testing.assert_close(output_canvas_size, size) @@ -713,14 +727,14 @@ def _compute_expected_bbox(bbox, format, padding_): dtype = bbox.dtype bbox = ( bbox.clone() - if format == datapoints.BoundingBoxFormat.XYXY - else convert_bounding_box_format(bbox, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY) + if format == vision_tensors.BoundingBoxFormat.XYXY + else convert_bounding_box_format(bbox, old_format=format, new_format=vision_tensors.BoundingBoxFormat.XYXY) ) bbox[0::2] += pad_left bbox[1::2] += pad_up - bbox = convert_bounding_box_format(bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format) + bbox = convert_bounding_box_format(bbox, old_format=vision_tensors.BoundingBoxFormat.XYXY, new_format=format) if bbox.dtype != dtype: # Temporary cast to original dtype # e.g. 
float32 -> int @@ -785,7 +799,9 @@ def _compute_expected_bbox(bbox, format_, canvas_size_, pcoeffs_): ] ) - bbox_xyxy = convert_bounding_box_format(bbox, old_format=format_, new_format=datapoints.BoundingBoxFormat.XYXY) + bbox_xyxy = convert_bounding_box_format( + bbox, old_format=format_, new_format=vision_tensors.BoundingBoxFormat.XYXY + ) points = np.array( [ [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], @@ -807,7 +823,7 @@ def _compute_expected_bbox(bbox, format_, canvas_size_, pcoeffs_): ) out_bbox = torch.from_numpy(out_bbox) out_bbox = convert_bounding_box_format( - out_bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format_ + out_bbox, old_format=vision_tensors.BoundingBoxFormat.XYXY, new_format=format_ ) return clamp_bounding_boxes(out_bbox, format=format_, canvas_size=canvas_size_).to(bbox) @@ -846,7 +862,7 @@ def _compute_expected_bbox(bbox, format_, canvas_size_, pcoeffs_): def test_correctness_center_crop_bounding_boxes(device, output_size): def _compute_expected_bbox(bbox, format_, canvas_size_, output_size_): dtype = bbox.dtype - bbox = convert_bounding_box_format(bbox.float(), format_, datapoints.BoundingBoxFormat.XYWH) + bbox = convert_bounding_box_format(bbox.float(), format_, vision_tensors.BoundingBoxFormat.XYWH) if len(output_size_) == 1: output_size_.append(output_size_[-1]) @@ -860,7 +876,7 @@ def _compute_expected_bbox(bbox, format_, canvas_size_, output_size_): bbox[3].item(), ] out_bbox = torch.tensor(out_bbox) - out_bbox = convert_bounding_box_format(out_bbox, datapoints.BoundingBoxFormat.XYWH, format_) + out_bbox = convert_bounding_box_format(out_bbox, vision_tensors.BoundingBoxFormat.XYWH, format_) out_bbox = clamp_bounding_boxes(out_bbox, format=format_, canvas_size=output_size) return out_bbox.to(dtype=dtype, device=bbox.device) @@ -958,7 +974,7 @@ def test_correctness_gaussian_blur_image_tensor(device, canvas_size, dt, ksize, torch.tensor(true_cv2_results[gt_key]).reshape(shape[-2], shape[-1], shape[-3]).permute(2, 0, 1).to(tensor) ) - image = datapoints.Image(tensor) + image = vision_tensors.Image(tensor) out = fn(image, kernel_size=ksize, sigma=sigma) torch.testing.assert_close(out, true_out, rtol=0.0, atol=1.0, msg=f"{ksize}, {sigma}") diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index 2fea19e8190..d4929f40f94 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -36,7 +36,7 @@ from torch.testing import assert_close from torch.utils._pytree import tree_map from torch.utils.data import DataLoader, default_collate -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms._functional_tensor import _max_value as get_max_value from torchvision.transforms.functional import pil_modes_mapping @@ -167,7 +167,7 @@ def check_kernel( def _check_functional_scripted_smoke(functional, input, *args, **kwargs): """Checks if the functional can be scripted and the scripted version can be called without error.""" - if not isinstance(input, datapoints.Image): + if not isinstance(input, vision_tensors.Image): return functional_scripted = _script(functional) @@ -187,7 +187,7 @@ def check_functional(functional, input, *args, check_scripted_smoke=True, **kwar assert isinstance(output, type(input)) - if isinstance(input, datapoints.BoundingBoxes): + if isinstance(input, vision_tensors.BoundingBoxes): assert output.format == input.format if check_scripted_smoke: @@ -199,11 +199,11 @@ def 
check_functional_kernel_signature_match(functional, *, kernel, input_type): functional_params = list(inspect.signature(functional).parameters.values())[1:] kernel_params = list(inspect.signature(kernel).parameters.values())[1:] - if issubclass(input_type, datapoints.Datapoint): - # We filter out metadata that is implicitly passed to the functional through the input datapoint, but has to be + if issubclass(input_type, vision_tensors.VisionTensor): + # We filter out metadata that is implicitly passed to the functional through the input vision_tensor, but has to be # explicitly passed to the kernel. explicit_metadata = { - datapoints.BoundingBoxes: {"format", "canvas_size"}, + vision_tensors.BoundingBoxes: {"format", "canvas_size"}, } kernel_params = [param for param in kernel_params if param.name not in explicit_metadata.get(input_type, set())] @@ -264,7 +264,7 @@ def check_transform(transform, input, check_v1_compatibility=True): output = transform(input) assert isinstance(output, type(input)) - if isinstance(input, datapoints.BoundingBoxes): + if isinstance(input, vision_tensors.BoundingBoxes): assert output.format == input.format if check_v1_compatibility: @@ -362,7 +362,7 @@ def affine_bounding_boxes(bounding_boxes): input_xyxy = F.convert_bounding_box_format( bounding_boxes.to(torch.float64, copy=True), old_format=format, - new_format=datapoints.BoundingBoxFormat.XYXY, + new_format=vision_tensors.BoundingBoxFormat.XYXY, inplace=True, ) x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist() @@ -387,7 +387,7 @@ def affine_bounding_boxes(bounding_boxes): ) output = F.convert_bounding_box_format( - output_xyxy, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format + output_xyxy, old_format=vision_tensors.BoundingBoxFormat.XYXY, new_format=format ) if clamp: @@ -400,7 +400,7 @@ def affine_bounding_boxes(bounding_boxes): return output - return datapoints.BoundingBoxes( + return vision_tensors.BoundingBoxes( torch.cat([affine_bounding_boxes(b) for b in bounding_boxes.reshape(-1, 4).unbind()], dim=0).reshape( bounding_boxes.shape ), @@ -479,7 +479,7 @@ def test_kernel_image_tensor(self, size, interpolation, use_max_size, antialias, check_scripted_vs_eager=not isinstance(size, int), ) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("size", OUTPUT_SIZES) @pytest.mark.parametrize("use_max_size", [True, False]) @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @@ -529,10 +529,10 @@ def test_functional(self, size, make_input): [ (F.resize_image, torch.Tensor), (F._resize_image_pil, PIL.Image.Image), - (F.resize_image, datapoints.Image), - (F.resize_bounding_boxes, datapoints.BoundingBoxes), - (F.resize_mask, datapoints.Mask), - (F.resize_video, datapoints.Video), + (F.resize_image, vision_tensors.Image), + (F.resize_bounding_boxes, vision_tensors.BoundingBoxes), + (F.resize_mask, vision_tensors.Mask), + (F.resize_video, vision_tensors.Video), ], ) def test_functional_signature(self, kernel, input_type): @@ -605,7 +605,7 @@ def _reference_resize_bounding_boxes(self, bounding_boxes, *, size, max_size=Non new_canvas_size=(new_height, new_width), ) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("size", OUTPUT_SIZES) @pytest.mark.parametrize("use_max_size", [True, False]) @pytest.mark.parametrize("fn", [F.resize, 
transform_cls_to_functional(transforms.Resize)]) @@ -734,9 +734,9 @@ def test_noop(self, size, make_input): # This identity check is not a requirement. It is here to avoid breaking the behavior by accident. If there # is a good reason to break this, feel free to downgrade to an equality check. - if isinstance(input, datapoints.Datapoint): + if isinstance(input, vision_tensors.VisionTensor): # We can't test identity directly, since that checks for the identity of the Python object. Since all - # datapoints unwrap before a kernel and wrap again afterwards, the Python object changes. Thus, we check + # vision_tensors unwrap before a kernel and wrap again afterwards, the Python object changes. Thus, we check # that the underlying storage is the same assert output.data_ptr() == input.data_ptr() else: @@ -782,7 +782,7 @@ def _make_image(self, *args, batch_dims=(), memory_format=torch.contiguous_forma ) if emulate_channels_last: - image = datapoints.wrap(image.view(*batch_dims, *image.shape[-3:]), like=image) + image = vision_tensors.wrap(image.view(*batch_dims, *image.shape[-3:]), like=image) return image @@ -833,7 +833,7 @@ class TestHorizontalFlip: def test_kernel_image_tensor(self, dtype, device): check_kernel(F.horizontal_flip_image, make_image(dtype=dtype, device=device)) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, format, dtype, device): @@ -864,10 +864,10 @@ def test_functional(self, make_input): [ (F.horizontal_flip_image, torch.Tensor), (F._horizontal_flip_image_pil, PIL.Image.Image), - (F.horizontal_flip_image, datapoints.Image), - (F.horizontal_flip_bounding_boxes, datapoints.BoundingBoxes), - (F.horizontal_flip_mask, datapoints.Mask), - (F.horizontal_flip_video, datapoints.Video), + (F.horizontal_flip_image, vision_tensors.Image), + (F.horizontal_flip_bounding_boxes, vision_tensors.BoundingBoxes), + (F.horizontal_flip_mask, vision_tensors.Mask), + (F.horizontal_flip_video, vision_tensors.Video), ], ) def test_functional_signature(self, kernel, input_type): @@ -902,7 +902,7 @@ def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes): return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize( "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] ) @@ -999,7 +999,7 @@ def test_kernel_image_tensor(self, param, value, dtype, device): shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], ) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, param, value, format, dtype, device): @@ -1032,10 +1032,10 @@ def test_functional(self, make_input): [ (F.affine_image, torch.Tensor), (F._affine_image_pil, PIL.Image.Image), - (F.affine_image, datapoints.Image), - (F.affine_bounding_boxes, datapoints.BoundingBoxes), - (F.affine_mask, datapoints.Mask), - (F.affine_video, datapoints.Video), + (F.affine_image, 
vision_tensors.Image), + (F.affine_bounding_boxes, vision_tensors.BoundingBoxes), + (F.affine_mask, vision_tensors.Mask), + (F.affine_video, vision_tensors.Video), ], ) def test_functional_signature(self, kernel, input_type): @@ -1148,7 +1148,7 @@ def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate, ), ) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"]) @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"]) @@ -1176,7 +1176,7 @@ def test_functional_bounding_boxes_correctness(self, format, angle, translate, s torch.testing.assert_close(actual, expected) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @pytest.mark.parametrize("seed", list(range(5))) def test_transform_bounding_boxes_correctness(self, format, center, seed): @@ -1283,7 +1283,7 @@ class TestVerticalFlip: def test_kernel_image_tensor(self, dtype, device): check_kernel(F.vertical_flip_image, make_image(dtype=dtype, device=device)) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, format, dtype, device): @@ -1314,10 +1314,10 @@ def test_functional(self, make_input): [ (F.vertical_flip_image, torch.Tensor), (F._vertical_flip_image_pil, PIL.Image.Image), - (F.vertical_flip_image, datapoints.Image), - (F.vertical_flip_bounding_boxes, datapoints.BoundingBoxes), - (F.vertical_flip_mask, datapoints.Mask), - (F.vertical_flip_video, datapoints.Video), + (F.vertical_flip_image, vision_tensors.Image), + (F.vertical_flip_bounding_boxes, vision_tensors.BoundingBoxes), + (F.vertical_flip_mask, vision_tensors.Mask), + (F.vertical_flip_video, vision_tensors.Video), ], ) def test_functional_signature(self, kernel, input_type): @@ -1350,7 +1350,7 @@ def _reference_vertical_flip_bounding_boxes(self, bounding_boxes): return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) def test_bounding_boxes_correctness(self, format, fn): bounding_boxes = make_bounding_boxes(format=format) @@ -1419,7 +1419,7 @@ def test_kernel_image_tensor(self, param, value, dtype, device): expand=[False, True], center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], ) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, param, value, format, dtype, device): @@ -1456,10 +1456,10 @@ def test_functional(self, make_input): [ (F.rotate_image, torch.Tensor), (F._rotate_image_pil, PIL.Image.Image), - (F.rotate_image, datapoints.Image), - 
(F.rotate_bounding_boxes, datapoints.BoundingBoxes), - (F.rotate_mask, datapoints.Mask), - (F.rotate_video, datapoints.Video), + (F.rotate_image, vision_tensors.Image), + (F.rotate_bounding_boxes, vision_tensors.BoundingBoxes), + (F.rotate_mask, vision_tensors.Mask), + (F.rotate_video, vision_tensors.Video), ], ) def test_functional_signature(self, kernel, input_type): @@ -1553,11 +1553,11 @@ def _compute_output_canvas_size(self, *, expand, canvas_size, affine_matrix): def _recenter_bounding_boxes_after_expand(self, bounding_boxes, *, recenter_xy): x, y = recenter_xy - if bounding_boxes.format is datapoints.BoundingBoxFormat.XYXY: + if bounding_boxes.format is vision_tensors.BoundingBoxFormat.XYXY: translate = [x, y, x, y] else: translate = [x, y, 0.0, 0.0] - return datapoints.wrap( + return vision_tensors.wrap( (bounding_boxes.to(torch.float64) - torch.tensor(translate)).to(bounding_boxes.dtype), like=bounding_boxes ) @@ -1590,7 +1590,7 @@ def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, cen bounding_boxes ) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) @pytest.mark.parametrize("expand", [False, True]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @@ -1603,7 +1603,7 @@ def test_functional_bounding_boxes_correctness(self, format, angle, expand, cent torch.testing.assert_close(actual, expected) torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("expand", [False, True]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @pytest.mark.parametrize("seed", list(range(5))) @@ -1861,7 +1861,7 @@ def test_others_catch_all_and_none(self, make_input): # make sure "others" works as a catch-all and that None means no conversion sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) - out = transforms.ToDtype(dtype={datapoints.Mask: torch.int64, "others": None})(sample) + out = transforms.ToDtype(dtype={vision_tensors.Mask: torch.int64, "others": None})(sample) assert out["inpt"].dtype == inpt_dtype assert out["bbox"].dtype == bbox_dtype assert out["mask"].dtype != mask_dtype @@ -1874,7 +1874,7 @@ def test_typical_use_case(self, make_input): sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) out = transforms.ToDtype( - dtype={type(sample["inpt"]): torch.float32, datapoints.Mask: torch.int64, "others": None}, scale=True + dtype={type(sample["inpt"]): torch.float32, vision_tensors.Mask: torch.int64, "others": None}, scale=True )(sample) assert out["inpt"].dtype != inpt_dtype assert out["inpt"].dtype == torch.float32 @@ -1888,9 +1888,9 @@ def test_errors_warnings(self, make_input): sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) with pytest.raises(ValueError, match="No dtype was specified for"): - out = transforms.ToDtype(dtype={datapoints.Mask: torch.float32})(sample) + out = transforms.ToDtype(dtype={vision_tensors.Mask: torch.float32})(sample) with pytest.warns(UserWarning, match=re.escape("plain `torch.Tensor` will *not* be transformed")): - transforms.ToDtype(dtype={torch.Tensor: torch.float32, datapoints.Image: 
torch.float32}) + transforms.ToDtype(dtype={torch.Tensor: torch.float32, vision_tensors.Image: torch.float32}) with pytest.warns(UserWarning, match="no scaling will be done"): out = transforms.ToDtype(dtype={"others": None}, scale=True)(sample) assert out["inpt"].dtype == inpt_dtype @@ -1923,8 +1923,8 @@ def test_functional(self, make_input): [ (F.adjust_brightness_image, torch.Tensor), (F._adjust_brightness_image_pil, PIL.Image.Image), - (F.adjust_brightness_image, datapoints.Image), - (F.adjust_brightness_video, datapoints.Video), + (F.adjust_brightness_image, vision_tensors.Image), + (F.adjust_brightness_video, vision_tensors.Video), ], ) def test_functional_signature(self, kernel, input_type): @@ -2028,8 +2028,8 @@ def test_error(self, T): for input_with_bad_type in ( F.to_pil_image(imgs[0]), - datapoints.Mask(torch.rand(12, 12)), - datapoints.BoundingBoxes(torch.rand(2, 4), format="XYXY", canvas_size=12), + vision_tensors.Mask(torch.rand(12, 12)), + vision_tensors.BoundingBoxes(torch.rand(2, 4), format="XYXY", canvas_size=12), ): with pytest.raises(ValueError, match="does not support PIL images, "): cutmix_mixup(input_with_bad_type) @@ -2172,12 +2172,12 @@ def test_unsupported_types(self, functional, make_input): class TestRegisterKernel: @pytest.mark.parametrize("functional", (F.resize, "resize")) def test_register_kernel(self, functional): - class CustomDatapoint(datapoints.Datapoint): + class CustomVisionTensor(vision_tensors.VisionTensor): pass kernel_was_called = False - @F.register_kernel(functional, CustomDatapoint) + @F.register_kernel(functional, CustomVisionTensor) def new_resize(dp, *args, **kwargs): nonlocal kernel_was_called kernel_was_called = True @@ -2185,38 +2185,38 @@ def new_resize(dp, *args, **kwargs): t = transforms.Resize(size=(224, 224), antialias=True) - my_dp = CustomDatapoint(torch.rand(3, 10, 10)) + my_dp = CustomVisionTensor(torch.rand(3, 10, 10)) out = t(my_dp) assert out is my_dp assert kernel_was_called # Sanity check to make sure we didn't override the kernel of other types t(torch.rand(3, 10, 10)).shape == (3, 224, 224) - t(datapoints.Image(torch.rand(3, 10, 10))).shape == (3, 224, 224) + t(vision_tensors.Image(torch.rand(3, 10, 10))).shape == (3, 224, 224) def test_errors(self): with pytest.raises(ValueError, match="Could not find functional with name"): - F.register_kernel("bad_name", datapoints.Image) + F.register_kernel("bad_name", vision_tensors.Image) with pytest.raises(ValueError, match="Kernels can only be registered on functionals"): - F.register_kernel(datapoints.Image, F.resize) + F.register_kernel(vision_tensors.Image, F.resize) with pytest.raises(ValueError, match="Kernels can only be registered for subclasses"): F.register_kernel(F.resize, object) - with pytest.raises(ValueError, match="cannot be registered for the builtin datapoint classes"): - F.register_kernel(F.resize, datapoints.Image)(F.resize_image) + with pytest.raises(ValueError, match="cannot be registered for the builtin vision_tensor classes"): + F.register_kernel(F.resize, vision_tensors.Image)(F.resize_image) - class CustomDatapoint(datapoints.Datapoint): + class CustomVisionTensor(vision_tensors.VisionTensor): pass - def resize_custom_datapoint(): + def resize_custom_vision_tensor(): pass - F.register_kernel(F.resize, CustomDatapoint)(resize_custom_datapoint) + F.register_kernel(F.resize, CustomVisionTensor)(resize_custom_vision_tensor) with pytest.raises(ValueError, match="already has a kernel registered for type"): - F.register_kernel(F.resize, 
CustomDatapoint)(resize_custom_datapoint) + F.register_kernel(F.resize, CustomVisionTensor)(resize_custom_vision_tensor) class TestGetKernel: @@ -2225,10 +2225,10 @@ class TestGetKernel: KERNELS = { torch.Tensor: F.resize_image, PIL.Image.Image: F._resize_image_pil, - datapoints.Image: F.resize_image, - datapoints.BoundingBoxes: F.resize_bounding_boxes, - datapoints.Mask: F.resize_mask, - datapoints.Video: F.resize_video, + vision_tensors.Image: F.resize_image, + vision_tensors.BoundingBoxes: F.resize_bounding_boxes, + vision_tensors.Mask: F.resize_mask, + vision_tensors.Video: F.resize_video, } @pytest.mark.parametrize("input_type", [str, int, object]) @@ -2244,57 +2244,57 @@ def resize_with_pure_kernels(): pass for input_type, kernel in self.KERNELS.items(): - _register_kernel_internal(resize_with_pure_kernels, input_type, datapoint_wrapper=False)(kernel) + _register_kernel_internal(resize_with_pure_kernels, input_type, vision_tensor_wrapper=False)(kernel) assert _get_kernel(resize_with_pure_kernels, input_type) is kernel - def test_builtin_datapoint_subclass(self): + def test_builtin_vision_tensor_subclass(self): # We cannot use F.resize together with self.KERNELS mapping here directly here, since this is only the # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional - # here, register the kernels without wrapper, and check if subclasses of our builtin datapoints get dispatched + # here, register the kernels without wrapper, and check if subclasses of our builtin vision_tensors get dispatched # to the kernel of the corresponding superclass def resize_with_pure_kernels(): pass - class MyImage(datapoints.Image): + class MyImage(vision_tensors.Image): pass - class MyBoundingBoxes(datapoints.BoundingBoxes): + class MyBoundingBoxes(vision_tensors.BoundingBoxes): pass - class MyMask(datapoints.Mask): + class MyMask(vision_tensors.Mask): pass - class MyVideo(datapoints.Video): + class MyVideo(vision_tensors.Video): pass - for custom_datapoint_subclass in [ + for custom_vision_tensor_subclass in [ MyImage, MyBoundingBoxes, MyMask, MyVideo, ]: - builtin_datapoint_class = custom_datapoint_subclass.__mro__[1] - builtin_datapoint_kernel = self.KERNELS[builtin_datapoint_class] - _register_kernel_internal(resize_with_pure_kernels, builtin_datapoint_class, datapoint_wrapper=False)( - builtin_datapoint_kernel - ) + builtin_vision_tensor_class = custom_vision_tensor_subclass.__mro__[1] + builtin_vision_tensor_kernel = self.KERNELS[builtin_vision_tensor_class] + _register_kernel_internal( + resize_with_pure_kernels, builtin_vision_tensor_class, vision_tensor_wrapper=False + )(builtin_vision_tensor_kernel) - assert _get_kernel(resize_with_pure_kernels, custom_datapoint_subclass) is builtin_datapoint_kernel + assert _get_kernel(resize_with_pure_kernels, custom_vision_tensor_subclass) is builtin_vision_tensor_kernel - def test_datapoint_subclass(self): - class MyDatapoint(datapoints.Datapoint): + def test_vision_tensor_subclass(self): + class MyVisionTensor(vision_tensors.VisionTensor): pass with pytest.raises(TypeError, match="supports inputs of type"): - _get_kernel(F.resize, MyDatapoint) + _get_kernel(F.resize, MyVisionTensor) - def resize_my_datapoint(): + def resize_my_vision_tensor(): pass - _register_kernel_internal(F.resize, MyDatapoint, datapoint_wrapper=False)(resize_my_datapoint) + _register_kernel_internal(F.resize, MyVisionTensor, vision_tensor_wrapper=False)(resize_my_vision_tensor) - assert _get_kernel(F.resize, MyDatapoint) is 
resize_my_datapoint + assert _get_kernel(F.resize, MyVisionTensor) is resize_my_vision_tensor def test_pil_image_subclass(self): opened_image = PIL.Image.open(Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") @@ -2342,8 +2342,8 @@ def test_functional(self, make_input): [ (F.permute_channels_image, torch.Tensor), (F._permute_channels_image_pil, PIL.Image.Image), - (F.permute_channels_image, datapoints.Image), - (F.permute_channels_video, datapoints.Video), + (F.permute_channels_image, vision_tensors.Image), + (F.permute_channels_video, vision_tensors.Video), ], ) def test_functional_signature(self, kernel, input_type): @@ -2352,7 +2352,7 @@ def test_functional_signature(self, kernel, input_type): def reference_image_correctness(self, image, permutation): channel_images = image.split(1, dim=-3) permuted_channel_images = [channel_images[channel_idx] for channel_idx in permutation] - return datapoints.Image(torch.concat(permuted_channel_images, dim=-3)) + return vision_tensors.Image(torch.concat(permuted_channel_images, dim=-3)) @pytest.mark.parametrize("permutation", [[2, 0, 1], [1, 2, 0], [2, 0, 1], [0, 1, 2]]) @pytest.mark.parametrize("batch_dims", [(), (2,), (2, 1)]) @@ -2392,7 +2392,7 @@ def test_kernel_image_tensor(self, param, value, dtype, device): check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), ) - @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) + @pytest.mark.parametrize("format", list(vision_tensors.BoundingBoxFormat)) @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, format, dtype, device): @@ -2428,10 +2428,10 @@ def test_functional(self, make_input): [ (F.elastic_image, torch.Tensor), (F._elastic_image_pil, PIL.Image.Image), - (F.elastic_image, datapoints.Image), - (F.elastic_bounding_boxes, datapoints.BoundingBoxes), - (F.elastic_mask, datapoints.Mask), - (F.elastic_video, datapoints.Video), + (F.elastic_image, vision_tensors.Image), + (F.elastic_bounding_boxes, vision_tensors.BoundingBoxes), + (F.elastic_mask, vision_tensors.Mask), + (F.elastic_video, vision_tensors.Video), ], ) def test_functional_signature(self, kernel, input_type): @@ -2481,7 +2481,7 @@ def test_correctness(self): out = transforms.ToPureTensor()(input) for input_value, out_value in zip(input.values(), out.values()): - if isinstance(input_value, datapoints.Datapoint): - assert isinstance(out_value, torch.Tensor) and not isinstance(out_value, datapoints.Datapoint) + if isinstance(input_value, vision_tensors.VisionTensor): + assert isinstance(out_value, torch.Tensor) and not isinstance(out_value, vision_tensors.VisionTensor) else: assert isinstance(out_value, type(input_value)) diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py index 511b0c364aa..957cc748369 100644 --- a/test/test_transforms_v2_utils.py +++ b/test/test_transforms_v2_utils.py @@ -6,46 +6,46 @@ import torchvision.transforms.v2._utils from common_utils import DEFAULT_SIZE, make_bounding_boxes, make_detection_mask, make_image -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms.v2._utils import has_all, has_any from torchvision.transforms.v2.functional import to_pil_image IMAGE = make_image(DEFAULT_SIZE, color_space="RGB") -BOUNDING_BOX = make_bounding_boxes(DEFAULT_SIZE, format=datapoints.BoundingBoxFormat.XYXY) +BOUNDING_BOX = make_bounding_boxes(DEFAULT_SIZE, 
format=vision_tensors.BoundingBoxFormat.XYXY) MASK = make_detection_mask(DEFAULT_SIZE) @pytest.mark.parametrize( ("sample", "types", "expected"), [ - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image,), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.BoundingBoxes,), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Mask,), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image, datapoints.BoundingBoxes), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image, datapoints.Mask), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.BoundingBoxes, datapoints.Mask), True), - ((MASK,), (datapoints.Image, datapoints.BoundingBoxes), False), - ((BOUNDING_BOX,), (datapoints.Image, datapoints.Mask), False), - ((IMAGE,), (datapoints.BoundingBoxes, datapoints.Mask), False), + ((IMAGE, BOUNDING_BOX, MASK), (vision_tensors.Image,), True), + ((IMAGE, BOUNDING_BOX, MASK), (vision_tensors.BoundingBoxes,), True), + ((IMAGE, BOUNDING_BOX, MASK), (vision_tensors.Mask,), True), + ((IMAGE, BOUNDING_BOX, MASK), (vision_tensors.Image, vision_tensors.BoundingBoxes), True), + ((IMAGE, BOUNDING_BOX, MASK), (vision_tensors.Image, vision_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK), (vision_tensors.BoundingBoxes, vision_tensors.Mask), True), + ((MASK,), (vision_tensors.Image, vision_tensors.BoundingBoxes), False), + ((BOUNDING_BOX,), (vision_tensors.Image, vision_tensors.Mask), False), + ((IMAGE,), (vision_tensors.BoundingBoxes, vision_tensors.Mask), False), ( (IMAGE, BOUNDING_BOX, MASK), - (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), + (vision_tensors.Image, vision_tensors.BoundingBoxes, vision_tensors.Mask), True, ), - ((), (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), False), - ((IMAGE, BOUNDING_BOX, MASK), (lambda obj: isinstance(obj, datapoints.Image),), True), + ((), (vision_tensors.Image, vision_tensors.BoundingBoxes, vision_tensors.Mask), False), + ((IMAGE, BOUNDING_BOX, MASK), (lambda obj: isinstance(obj, vision_tensors.Image),), True), ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), ((IMAGE, BOUNDING_BOX, MASK), (lambda _: True,), True), - ((IMAGE,), (datapoints.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), True), + ((IMAGE,), (vision_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), True), ( (torch.Tensor(IMAGE),), - (datapoints.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), + (vision_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), True, ), ( (to_pil_image(IMAGE),), - (datapoints.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), + (vision_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), True, ), ], @@ -57,31 +57,31 @@ def test_has_any(sample, types, expected): @pytest.mark.parametrize( ("sample", "types", "expected"), [ - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image,), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.BoundingBoxes,), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Mask,), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image, datapoints.BoundingBoxes), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.Image, datapoints.Mask), True), - ((IMAGE, BOUNDING_BOX, MASK), (datapoints.BoundingBoxes, datapoints.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK), (vision_tensors.Image,), True), + ((IMAGE, BOUNDING_BOX, MASK), (vision_tensors.BoundingBoxes,), True), + ((IMAGE, BOUNDING_BOX, MASK), (vision_tensors.Mask,), True), + ((IMAGE, BOUNDING_BOX, MASK), 
(vision_tensors.Image, vision_tensors.BoundingBoxes), True), + ((IMAGE, BOUNDING_BOX, MASK), (vision_tensors.Image, vision_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK), (vision_tensors.BoundingBoxes, vision_tensors.Mask), True), ( (IMAGE, BOUNDING_BOX, MASK), - (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), + (vision_tensors.Image, vision_tensors.BoundingBoxes, vision_tensors.Mask), True, ), - ((BOUNDING_BOX, MASK), (datapoints.Image, datapoints.BoundingBoxes), False), - ((BOUNDING_BOX, MASK), (datapoints.Image, datapoints.Mask), False), - ((IMAGE, MASK), (datapoints.BoundingBoxes, datapoints.Mask), False), + ((BOUNDING_BOX, MASK), (vision_tensors.Image, vision_tensors.BoundingBoxes), False), + ((BOUNDING_BOX, MASK), (vision_tensors.Image, vision_tensors.Mask), False), + ((IMAGE, MASK), (vision_tensors.BoundingBoxes, vision_tensors.Mask), False), ( (IMAGE, BOUNDING_BOX, MASK), - (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), + (vision_tensors.Image, vision_tensors.BoundingBoxes, vision_tensors.Mask), True, ), - ((BOUNDING_BOX, MASK), (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), False), - ((IMAGE, MASK), (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), False), - ((IMAGE, BOUNDING_BOX), (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask), False), + ((BOUNDING_BOX, MASK), (vision_tensors.Image, vision_tensors.BoundingBoxes, vision_tensors.Mask), False), + ((IMAGE, MASK), (vision_tensors.Image, vision_tensors.BoundingBoxes, vision_tensors.Mask), False), + ((IMAGE, BOUNDING_BOX), (vision_tensors.Image, vision_tensors.BoundingBoxes, vision_tensors.Mask), False), ( (IMAGE, BOUNDING_BOX, MASK), - (lambda obj: isinstance(obj, (datapoints.Image, datapoints.BoundingBoxes, datapoints.Mask)),), + (lambda obj: isinstance(obj, (vision_tensors.Image, vision_tensors.BoundingBoxes, vision_tensors.Mask)),), True, ), ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), diff --git a/test/test_datapoints.py b/test/test_vision_tensors.py similarity index 71% rename from test/test_datapoints.py rename to test/test_vision_tensors.py index 1aeb2367752..f361df36cfa 100644 --- a/test/test_datapoints.py +++ b/test/test_vision_tensors.py @@ -5,7 +5,7 @@ from common_utils import assert_equal, make_bounding_boxes, make_image, make_segmentation_mask, make_video from PIL import Image -from torchvision import datapoints +from torchvision import vision_tensors @pytest.fixture(autouse=True) @@ -13,40 +13,40 @@ def restore_tensor_return_type(): # This is for security, as we should already be restoring the default manually in each test anyway # (at least at the time of writing...) 
yield - datapoints.set_return_type("Tensor") + vision_tensors.set_return_type("Tensor") @pytest.mark.parametrize("data", [torch.rand(3, 32, 32), Image.new("RGB", (32, 32), color=123)]) def test_image_instance(data): - image = datapoints.Image(data) + image = vision_tensors.Image(data) assert isinstance(image, torch.Tensor) assert image.ndim == 3 and image.shape[0] == 3 @pytest.mark.parametrize("data", [torch.randint(0, 10, size=(1, 32, 32)), Image.new("L", (32, 32), color=2)]) def test_mask_instance(data): - mask = datapoints.Mask(data) + mask = vision_tensors.Mask(data) assert isinstance(mask, torch.Tensor) assert mask.ndim == 3 and mask.shape[0] == 1 @pytest.mark.parametrize("data", [torch.randint(0, 32, size=(5, 4)), [[0, 0, 5, 5], [2, 2, 7, 7]], [1, 2, 3, 4]]) @pytest.mark.parametrize( - "format", ["XYXY", "CXCYWH", datapoints.BoundingBoxFormat.XYXY, datapoints.BoundingBoxFormat.XYWH] + "format", ["XYXY", "CXCYWH", vision_tensors.BoundingBoxFormat.XYXY, vision_tensors.BoundingBoxFormat.XYWH] ) def test_bbox_instance(data, format): - bboxes = datapoints.BoundingBoxes(data, format=format, canvas_size=(32, 32)) + bboxes = vision_tensors.BoundingBoxes(data, format=format, canvas_size=(32, 32)) assert isinstance(bboxes, torch.Tensor) assert bboxes.ndim == 2 and bboxes.shape[1] == 4 if isinstance(format, str): - format = datapoints.BoundingBoxFormat[(format.upper())] + format = vision_tensors.BoundingBoxFormat[(format.upper())] assert bboxes.format == format def test_bbox_dim_error(): data_3d = [[[1, 2, 3, 4]]] with pytest.raises(ValueError, match="Expected a 1D or 2D tensor, got 3D"): - datapoints.BoundingBoxes(data_3d, format="XYXY", canvas_size=(32, 32)) + vision_tensors.BoundingBoxes(data_3d, format="XYXY", canvas_size=(32, 32)) @pytest.mark.parametrize( @@ -64,8 +64,8 @@ def test_bbox_dim_error(): ], ) def test_new_requires_grad(data, input_requires_grad, expected_requires_grad): - datapoint = datapoints.Image(data, requires_grad=input_requires_grad) - assert datapoint.requires_grad is expected_requires_grad + vision_tensor = vision_tensors.Image(data, requires_grad=input_requires_grad) + assert vision_tensor.requires_grad is expected_requires_grad @pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @@ -75,7 +75,7 @@ def test_isinstance(make_input): def test_wrapping_no_copy(): tensor = torch.rand(3, 16, 16) - image = datapoints.Image(tensor) + image = vision_tensors.Image(tensor) assert image.data_ptr() == tensor.data_ptr() @@ -91,25 +91,25 @@ def test_to_wrapping(make_input): @pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) -@pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) -def test_to_datapoint_reference(make_input, return_type): +@pytest.mark.parametrize("return_type", ["Tensor", "vision_tensor"]) +def test_to_vision_tensor_reference(make_input, return_type): tensor = torch.rand((3, 16, 16), dtype=torch.float64) dp = make_input() - with datapoints.set_return_type(return_type): + with vision_tensors.set_return_type(return_type): tensor_to = tensor.to(dp) - assert type(tensor_to) is (type(dp) if return_type == "datapoint" else torch.Tensor) + assert type(tensor_to) is (type(dp) if return_type == "vision_tensor" else torch.Tensor) assert tensor_to.dtype is dp.dtype assert type(tensor) is torch.Tensor @pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) -@pytest.mark.parametrize("return_type", 
["Tensor", "datapoint"]) +@pytest.mark.parametrize("return_type", ["Tensor", "vision_tensor"]) def test_clone_wrapping(make_input, return_type): dp = make_input() - with datapoints.set_return_type(return_type): + with vision_tensors.set_return_type(return_type): dp_clone = dp.clone() assert type(dp_clone) is type(dp) @@ -117,13 +117,13 @@ def test_clone_wrapping(make_input, return_type): @pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) -@pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) +@pytest.mark.parametrize("return_type", ["Tensor", "vision_tensor"]) def test_requires_grad__wrapping(make_input, return_type): dp = make_input(dtype=torch.float) assert not dp.requires_grad - with datapoints.set_return_type(return_type): + with vision_tensors.set_return_type(return_type): dp_requires_grad = dp.requires_grad_(True) assert type(dp_requires_grad) is type(dp) @@ -132,54 +132,54 @@ def test_requires_grad__wrapping(make_input, return_type): @pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) -@pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) +@pytest.mark.parametrize("return_type", ["Tensor", "vision_tensor"]) def test_detach_wrapping(make_input, return_type): dp = make_input(dtype=torch.float).requires_grad_(True) - with datapoints.set_return_type(return_type): + with vision_tensors.set_return_type(return_type): dp_detached = dp.detach() assert type(dp_detached) is type(dp) -@pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) +@pytest.mark.parametrize("return_type", ["Tensor", "vision_tensor"]) def test_force_subclass_with_metadata(return_type): - # Sanity checks for the ops in _FORCE_TORCHFUNCTION_SUBCLASS and datapoints with metadata + # Sanity checks for the ops in _FORCE_TORCHFUNCTION_SUBCLASS and vision_tensors with metadata # Largely the same as above, we additionally check that the metadata is preserved format, canvas_size = "XYXY", (32, 32) - bbox = datapoints.BoundingBoxes([[0, 0, 5, 5], [2, 2, 7, 7]], format=format, canvas_size=canvas_size) + bbox = vision_tensors.BoundingBoxes([[0, 0, 5, 5], [2, 2, 7, 7]], format=format, canvas_size=canvas_size) - datapoints.set_return_type(return_type) + vision_tensors.set_return_type(return_type) bbox = bbox.clone() - if return_type == "datapoint": + if return_type == "vision_tensor": assert bbox.format, bbox.canvas_size == (format, canvas_size) bbox = bbox.to(torch.float64) - if return_type == "datapoint": + if return_type == "vision_tensor": assert bbox.format, bbox.canvas_size == (format, canvas_size) bbox = bbox.detach() - if return_type == "datapoint": + if return_type == "vision_tensor": assert bbox.format, bbox.canvas_size == (format, canvas_size) assert not bbox.requires_grad bbox.requires_grad_(True) - if return_type == "datapoint": + if return_type == "vision_tensor": assert bbox.format, bbox.canvas_size == (format, canvas_size) assert bbox.requires_grad - datapoints.set_return_type("tensor") + vision_tensors.set_return_type("tensor") @pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) -@pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) +@pytest.mark.parametrize("return_type", ["Tensor", "vision_tensor"]) def test_other_op_no_wrapping(make_input, return_type): dp = make_input() - with datapoints.set_return_type(return_type): + with vision_tensors.set_return_type(return_type): # any operation besides the ones 
listed in _FORCE_TORCHFUNCTION_SUBCLASS will do here output = dp * 2 - assert type(output) is (type(dp) if return_type == "datapoint" else torch.Tensor) + assert type(output) is (type(dp) if return_type == "vision_tensor" else torch.Tensor) @pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @@ -200,15 +200,15 @@ def test_no_tensor_output_op_no_wrapping(make_input, op): @pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) -@pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) +@pytest.mark.parametrize("return_type", ["Tensor", "vision_tensor"]) def test_inplace_op_no_wrapping(make_input, return_type): dp = make_input() original_type = type(dp) - with datapoints.set_return_type(return_type): + with vision_tensors.set_return_type(return_type): output = dp.add_(0) - assert type(output) is (type(dp) if return_type == "datapoint" else torch.Tensor) + assert type(output) is (type(dp) if return_type == "vision_tensor" else torch.Tensor) assert type(dp) is original_type @@ -219,7 +219,7 @@ def test_wrap(make_input): # any operation besides the ones listed in _FORCE_TORCHFUNCTION_SUBCLASS will do here output = dp * 2 - dp_new = datapoints.wrap(output, like=dp) + dp_new = vision_tensors.wrap(output, like=dp) assert type(dp_new) is type(dp) assert dp_new.data_ptr() == output.data_ptr() @@ -243,7 +243,7 @@ def test_deepcopy(make_input, requires_grad): @pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) -@pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) +@pytest.mark.parametrize("return_type", ["Tensor", "vision_tensor"]) @pytest.mark.parametrize( "op", ( @@ -265,10 +265,10 @@ def test_deepcopy(make_input, requires_grad): def test_usual_operations(make_input, return_type, op): dp = make_input() - with datapoints.set_return_type(return_type): + with vision_tensors.set_return_type(return_type): out = op(dp) - assert type(out) is (type(dp) if return_type == "datapoint" else torch.Tensor) - if isinstance(dp, datapoints.BoundingBoxes) and return_type == "datapoint": + assert type(out) is (type(dp) if return_type == "vision_tensor" else torch.Tensor) + if isinstance(dp, vision_tensors.BoundingBoxes) and return_type == "vision_tensor": assert hasattr(out, "format") assert hasattr(out, "canvas_size") @@ -286,22 +286,22 @@ def test_set_return_type(): assert type(img + 3) is torch.Tensor - with datapoints.set_return_type("datapoint"): - assert type(img + 3) is datapoints.Image + with vision_tensors.set_return_type("vision_tensor"): + assert type(img + 3) is vision_tensors.Image assert type(img + 3) is torch.Tensor - datapoints.set_return_type("datapoint") - assert type(img + 3) is datapoints.Image + vision_tensors.set_return_type("vision_tensor") + assert type(img + 3) is vision_tensors.Image - with datapoints.set_return_type("tensor"): + with vision_tensors.set_return_type("tensor"): assert type(img + 3) is torch.Tensor - with datapoints.set_return_type("datapoint"): - assert type(img + 3) is datapoints.Image - datapoints.set_return_type("tensor") + with vision_tensors.set_return_type("vision_tensor"): + assert type(img + 3) is vision_tensors.Image + vision_tensors.set_return_type("tensor") assert type(img + 3) is torch.Tensor assert type(img + 3) is torch.Tensor # Exiting a context manager will restore the return type as it was prior to entering it, - # regardless of whether the "global" 
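As a quick illustration of the wrap helper tested above, a small sketch (not part of the patch) that re-attaches BoundingBoxes metadata to a plain-tensor result; the box values are made up:

import torch
from torchvision import vision_tensors

boxes = vision_tensors.BoundingBoxes(
    [[0, 0, 5, 5], [2, 2, 7, 7]], format="XYXY", canvas_size=(32, 32)
)
shifted = boxes + 1  # plain torch.Tensor under the default return type
wrapped = vision_tensors.wrap(shifted, like=boxes)
assert isinstance(wrapped, vision_tensors.BoundingBoxes)
assert wrapped.format == boxes.format and wrapped.canvas_size == boxes.canvas_size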
datapoints.set_return_type() was called within the context manager. - assert type(img + 3) is datapoints.Image + # regardless of whether the "global" vision_tensors.set_return_type() was called within the context manager. + assert type(img + 3) is vision_tensors.Image - datapoints.set_return_type("tensor") + vision_tensors.set_return_type("tensor") diff --git a/test/transforms_v2_dispatcher_infos.py b/test/transforms_v2_dispatcher_infos.py index a20f38d8482..0deb09e331d 100644 --- a/test/transforms_v2_dispatcher_infos.py +++ b/test/transforms_v2_dispatcher_infos.py @@ -2,7 +2,7 @@ import pytest import torchvision.transforms.v2.functional as F -from torchvision import datapoints +from torchvision import vision_tensors from transforms_v2_kernel_infos import KERNEL_INFOS, pad_xfail_jit_fill_condition from transforms_v2_legacy_utils import InfoBase, TestMark @@ -44,19 +44,19 @@ def __init__( self.pil_kernel_info = pil_kernel_info kernel_infos = {} - for datapoint_type, kernel in self.kernels.items(): + for vision_tensor_type, kernel in self.kernels.items(): kernel_info = self._KERNEL_INFO_MAP.get(kernel) if not kernel_info: raise pytest.UsageError( - f"Can't register {kernel.__name__} for type {datapoint_type} since there is no `KernelInfo` for it. " + f"Can't register {kernel.__name__} for type {vision_tensor_type} since there is no `KernelInfo` for it. " f"Please add a `KernelInfo` for it in `transforms_v2_kernel_infos.py`." ) - kernel_infos[datapoint_type] = kernel_info + kernel_infos[vision_tensor_type] = kernel_info self.kernel_infos = kernel_infos - def sample_inputs(self, *datapoint_types, filter_metadata=True): - for datapoint_type in datapoint_types or self.kernel_infos.keys(): - kernel_info = self.kernel_infos.get(datapoint_type) + def sample_inputs(self, *vision_tensor_types, filter_metadata=True): + for vision_tensor_type in vision_tensor_types or self.kernel_infos.keys(): + kernel_info = self.kernel_infos.get(vision_tensor_type) if not kernel_info: raise pytest.UsageError(f"There is no kernel registered for type {type.__name__}") @@ -69,12 +69,12 @@ def sample_inputs(self, *datapoint_types, filter_metadata=True): import itertools for args_kwargs in sample_inputs: - if hasattr(datapoint_type, "__annotations__"): + if hasattr(vision_tensor_type, "__annotations__"): for name in itertools.chain( - datapoint_type.__annotations__.keys(), + vision_tensor_type.__annotations__.keys(), # FIXME: this seems ok for conversion dispatchers, but we should probably handle this on a # per-dispatcher level. However, so far there is no option for that. 
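The return-type switching that test_set_return_type checks can be summarized in a few lines; this is an illustrative sketch under the renamed API, not part of the patch:

import torch
from torchvision import vision_tensors

img = vision_tensors.Image(torch.rand(3, 8, 8))
assert type(img + 1) is torch.Tensor  # default: results of ordinary ops are unwrapped

with vision_tensors.set_return_type("vision_tensor"):
    assert type(img + 1) is vision_tensors.Image  # subclass is preserved inside the context

assert type(img + 1) is torch.Tensor  # restored on exit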
- (f"old_{name}" for name in datapoint_type.__annotations__.keys()), + (f"old_{name}" for name in vision_tensor_type.__annotations__.keys()), ): if name in args_kwargs.kwargs: del args_kwargs.kwargs[name] @@ -97,9 +97,9 @@ def xfail_jit_python_scalar_arg(name, *, reason=None): ) -skip_dispatch_datapoint = TestMark( - ("TestDispatchers", "test_dispatch_datapoint"), - pytest.mark.skip(reason="Dispatcher doesn't support arbitrary datapoint dispatch."), +skip_dispatch_vision_tensor = TestMark( + ("TestDispatchers", "test_dispatch_vision_tensor"), + pytest.mark.skip(reason="Dispatcher doesn't support arbitrary vision_tensor dispatch."), ) multi_crop_skips = [ @@ -107,9 +107,9 @@ def xfail_jit_python_scalar_arg(name, *, reason=None): ("TestDispatchers", test_name), pytest.mark.skip(reason="Multi-crop dispatchers return a sequence of items rather than a single one."), ) - for test_name in ["test_pure_tensor_output_type", "test_pil_output_type", "test_datapoint_output_type"] + for test_name in ["test_pure_tensor_output_type", "test_pil_output_type", "test_vision_tensor_output_type"] ] -multi_crop_skips.append(skip_dispatch_datapoint) +multi_crop_skips.append(skip_dispatch_vision_tensor) def xfails_pil(reason, *, condition=None): @@ -142,30 +142,30 @@ def fill_sequence_needs_broadcast(args_kwargs): DispatcherInfo( F.crop, kernels={ - datapoints.Image: F.crop_image, - datapoints.Video: F.crop_video, - datapoints.BoundingBoxes: F.crop_bounding_boxes, - datapoints.Mask: F.crop_mask, + vision_tensors.Image: F.crop_image, + vision_tensors.Video: F.crop_video, + vision_tensors.BoundingBoxes: F.crop_bounding_boxes, + vision_tensors.Mask: F.crop_mask, }, pil_kernel_info=PILKernelInfo(F._crop_image_pil, kernel_name="crop_image_pil"), ), DispatcherInfo( F.resized_crop, kernels={ - datapoints.Image: F.resized_crop_image, - datapoints.Video: F.resized_crop_video, - datapoints.BoundingBoxes: F.resized_crop_bounding_boxes, - datapoints.Mask: F.resized_crop_mask, + vision_tensors.Image: F.resized_crop_image, + vision_tensors.Video: F.resized_crop_video, + vision_tensors.BoundingBoxes: F.resized_crop_bounding_boxes, + vision_tensors.Mask: F.resized_crop_mask, }, pil_kernel_info=PILKernelInfo(F._resized_crop_image_pil), ), DispatcherInfo( F.pad, kernels={ - datapoints.Image: F.pad_image, - datapoints.Video: F.pad_video, - datapoints.BoundingBoxes: F.pad_bounding_boxes, - datapoints.Mask: F.pad_mask, + vision_tensors.Image: F.pad_image, + vision_tensors.Video: F.pad_video, + vision_tensors.BoundingBoxes: F.pad_bounding_boxes, + vision_tensors.Mask: F.pad_mask, }, pil_kernel_info=PILKernelInfo(F._pad_image_pil, kernel_name="pad_image_pil"), test_marks=[ @@ -184,10 +184,10 @@ def fill_sequence_needs_broadcast(args_kwargs): DispatcherInfo( F.perspective, kernels={ - datapoints.Image: F.perspective_image, - datapoints.Video: F.perspective_video, - datapoints.BoundingBoxes: F.perspective_bounding_boxes, - datapoints.Mask: F.perspective_mask, + vision_tensors.Image: F.perspective_image, + vision_tensors.Video: F.perspective_video, + vision_tensors.BoundingBoxes: F.perspective_bounding_boxes, + vision_tensors.Mask: F.perspective_mask, }, pil_kernel_info=PILKernelInfo(F._perspective_image_pil), test_marks=[ @@ -198,10 +198,10 @@ def fill_sequence_needs_broadcast(args_kwargs): DispatcherInfo( F.elastic, kernels={ - datapoints.Image: F.elastic_image, - datapoints.Video: F.elastic_video, - datapoints.BoundingBoxes: F.elastic_bounding_boxes, - datapoints.Mask: F.elastic_mask, + vision_tensors.Image: F.elastic_image, + 
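For context on what the kernels= tables below encode, a small sketch (not part of the patch): the same functional routes each vision_tensor type to its type-specific kernel. F.pad is used here only as a convenient example; shapes and values are made up:

import torch
from torchvision import vision_tensors
from torchvision.transforms.v2 import functional as F

image = vision_tensors.Image(torch.rand(3, 10, 10))
mask = vision_tensors.Mask(torch.zeros(1, 10, 10, dtype=torch.uint8))

padded_image = F.pad(image, padding=[1])  # dispatched to pad_image
padded_mask = F.pad(mask, padding=[1])    # dispatched to pad_mask
assert isinstance(padded_image, vision_tensors.Image) and padded_image.shape == (3, 12, 12)
assert isinstance(padded_mask, vision_tensors.Mask) and padded_mask.shape == (1, 12, 12)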
vision_tensors.Video: F.elastic_video, + vision_tensors.BoundingBoxes: F.elastic_bounding_boxes, + vision_tensors.Mask: F.elastic_mask, }, pil_kernel_info=PILKernelInfo(F._elastic_image_pil), test_marks=[xfail_jit_python_scalar_arg("fill")], @@ -209,10 +209,10 @@ def fill_sequence_needs_broadcast(args_kwargs): DispatcherInfo( F.center_crop, kernels={ - datapoints.Image: F.center_crop_image, - datapoints.Video: F.center_crop_video, - datapoints.BoundingBoxes: F.center_crop_bounding_boxes, - datapoints.Mask: F.center_crop_mask, + vision_tensors.Image: F.center_crop_image, + vision_tensors.Video: F.center_crop_video, + vision_tensors.BoundingBoxes: F.center_crop_bounding_boxes, + vision_tensors.Mask: F.center_crop_mask, }, pil_kernel_info=PILKernelInfo(F._center_crop_image_pil), test_marks=[ @@ -222,8 +222,8 @@ def fill_sequence_needs_broadcast(args_kwargs): DispatcherInfo( F.gaussian_blur, kernels={ - datapoints.Image: F.gaussian_blur_image, - datapoints.Video: F.gaussian_blur_video, + vision_tensors.Image: F.gaussian_blur_image, + vision_tensors.Video: F.gaussian_blur_video, }, pil_kernel_info=PILKernelInfo(F._gaussian_blur_image_pil), test_marks=[ @@ -234,99 +234,99 @@ def fill_sequence_needs_broadcast(args_kwargs): DispatcherInfo( F.equalize, kernels={ - datapoints.Image: F.equalize_image, - datapoints.Video: F.equalize_video, + vision_tensors.Image: F.equalize_image, + vision_tensors.Video: F.equalize_video, }, pil_kernel_info=PILKernelInfo(F._equalize_image_pil, kernel_name="equalize_image_pil"), ), DispatcherInfo( F.invert, kernels={ - datapoints.Image: F.invert_image, - datapoints.Video: F.invert_video, + vision_tensors.Image: F.invert_image, + vision_tensors.Video: F.invert_video, }, pil_kernel_info=PILKernelInfo(F._invert_image_pil, kernel_name="invert_image_pil"), ), DispatcherInfo( F.posterize, kernels={ - datapoints.Image: F.posterize_image, - datapoints.Video: F.posterize_video, + vision_tensors.Image: F.posterize_image, + vision_tensors.Video: F.posterize_video, }, pil_kernel_info=PILKernelInfo(F._posterize_image_pil, kernel_name="posterize_image_pil"), ), DispatcherInfo( F.solarize, kernels={ - datapoints.Image: F.solarize_image, - datapoints.Video: F.solarize_video, + vision_tensors.Image: F.solarize_image, + vision_tensors.Video: F.solarize_video, }, pil_kernel_info=PILKernelInfo(F._solarize_image_pil, kernel_name="solarize_image_pil"), ), DispatcherInfo( F.autocontrast, kernels={ - datapoints.Image: F.autocontrast_image, - datapoints.Video: F.autocontrast_video, + vision_tensors.Image: F.autocontrast_image, + vision_tensors.Video: F.autocontrast_video, }, pil_kernel_info=PILKernelInfo(F._autocontrast_image_pil, kernel_name="autocontrast_image_pil"), ), DispatcherInfo( F.adjust_sharpness, kernels={ - datapoints.Image: F.adjust_sharpness_image, - datapoints.Video: F.adjust_sharpness_video, + vision_tensors.Image: F.adjust_sharpness_image, + vision_tensors.Video: F.adjust_sharpness_video, }, pil_kernel_info=PILKernelInfo(F._adjust_sharpness_image_pil, kernel_name="adjust_sharpness_image_pil"), ), DispatcherInfo( F.erase, kernels={ - datapoints.Image: F.erase_image, - datapoints.Video: F.erase_video, + vision_tensors.Image: F.erase_image, + vision_tensors.Video: F.erase_video, }, pil_kernel_info=PILKernelInfo(F._erase_image_pil), test_marks=[ - skip_dispatch_datapoint, + skip_dispatch_vision_tensor, ], ), DispatcherInfo( F.adjust_contrast, kernels={ - datapoints.Image: F.adjust_contrast_image, - datapoints.Video: F.adjust_contrast_video, + vision_tensors.Image: 
F.adjust_contrast_image, + vision_tensors.Video: F.adjust_contrast_video, }, pil_kernel_info=PILKernelInfo(F._adjust_contrast_image_pil, kernel_name="adjust_contrast_image_pil"), ), DispatcherInfo( F.adjust_gamma, kernels={ - datapoints.Image: F.adjust_gamma_image, - datapoints.Video: F.adjust_gamma_video, + vision_tensors.Image: F.adjust_gamma_image, + vision_tensors.Video: F.adjust_gamma_video, }, pil_kernel_info=PILKernelInfo(F._adjust_gamma_image_pil, kernel_name="adjust_gamma_image_pil"), ), DispatcherInfo( F.adjust_hue, kernels={ - datapoints.Image: F.adjust_hue_image, - datapoints.Video: F.adjust_hue_video, + vision_tensors.Image: F.adjust_hue_image, + vision_tensors.Video: F.adjust_hue_video, }, pil_kernel_info=PILKernelInfo(F._adjust_hue_image_pil, kernel_name="adjust_hue_image_pil"), ), DispatcherInfo( F.adjust_saturation, kernels={ - datapoints.Image: F.adjust_saturation_image, - datapoints.Video: F.adjust_saturation_video, + vision_tensors.Image: F.adjust_saturation_image, + vision_tensors.Video: F.adjust_saturation_video, }, pil_kernel_info=PILKernelInfo(F._adjust_saturation_image_pil, kernel_name="adjust_saturation_image_pil"), ), DispatcherInfo( F.five_crop, kernels={ - datapoints.Image: F.five_crop_image, - datapoints.Video: F.five_crop_video, + vision_tensors.Image: F.five_crop_image, + vision_tensors.Video: F.five_crop_video, }, pil_kernel_info=PILKernelInfo(F._five_crop_image_pil), test_marks=[ @@ -337,8 +337,8 @@ def fill_sequence_needs_broadcast(args_kwargs): DispatcherInfo( F.ten_crop, kernels={ - datapoints.Image: F.ten_crop_image, - datapoints.Video: F.ten_crop_video, + vision_tensors.Image: F.ten_crop_image, + vision_tensors.Video: F.ten_crop_video, }, test_marks=[ xfail_jit_python_scalar_arg("size"), @@ -349,8 +349,8 @@ def fill_sequence_needs_broadcast(args_kwargs): DispatcherInfo( F.normalize, kernels={ - datapoints.Image: F.normalize_image, - datapoints.Video: F.normalize_video, + vision_tensors.Image: F.normalize_image, + vision_tensors.Video: F.normalize_video, }, test_marks=[ xfail_jit_python_scalar_arg("mean"), @@ -360,24 +360,24 @@ def fill_sequence_needs_broadcast(args_kwargs): DispatcherInfo( F.uniform_temporal_subsample, kernels={ - datapoints.Video: F.uniform_temporal_subsample_video, + vision_tensors.Video: F.uniform_temporal_subsample_video, }, test_marks=[ - skip_dispatch_datapoint, + skip_dispatch_vision_tensor, ], ), DispatcherInfo( F.clamp_bounding_boxes, - kernels={datapoints.BoundingBoxes: F.clamp_bounding_boxes}, + kernels={vision_tensors.BoundingBoxes: F.clamp_bounding_boxes}, test_marks=[ - skip_dispatch_datapoint, + skip_dispatch_vision_tensor, ], ), DispatcherInfo( F.convert_bounding_box_format, - kernels={datapoints.BoundingBoxes: F.convert_bounding_box_format}, + kernels={vision_tensors.BoundingBoxes: F.convert_bounding_box_format}, test_marks=[ - skip_dispatch_datapoint, + skip_dispatch_vision_tensor, ], ), ] diff --git a/test/transforms_v2_kernel_infos.py b/test/transforms_v2_kernel_infos.py index b10c58277bd..5c5868abfbe 100644 --- a/test/transforms_v2_kernel_infos.py +++ b/test/transforms_v2_kernel_infos.py @@ -7,7 +7,7 @@ import torch.testing import torchvision.ops import torchvision.transforms.v2.functional as F -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms._functional_tensor import _max_value as get_max_value, _parse_pad_padding from transforms_v2_legacy_utils import ( ArgsKwargs, @@ -193,7 +193,7 @@ def transform(bbox, affine_matrix_, format_, canvas_size_): bbox_xyxy = 
F.convert_bounding_box_format( bbox.as_subclass(torch.Tensor), old_format=format_, - new_format=datapoints.BoundingBoxFormat.XYXY, + new_format=vision_tensors.BoundingBoxFormat.XYXY, inplace=True, ) points = np.array( @@ -215,7 +215,7 @@ def transform(bbox, affine_matrix_, format_, canvas_size_): dtype=bbox_xyxy.dtype, ) out_bbox = F.convert_bounding_box_format( - out_bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format_, inplace=True + out_bbox, old_format=vision_tensors.BoundingBoxFormat.XYXY, new_format=format_, inplace=True ) # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 out_bbox = F.clamp_bounding_boxes(out_bbox, format=format_, canvas_size=canvas_size_) @@ -228,7 +228,7 @@ def transform(bbox, affine_matrix_, format_, canvas_size_): def sample_inputs_convert_bounding_box_format(): - formats = list(datapoints.BoundingBoxFormat) + formats = list(vision_tensors.BoundingBoxFormat) for bounding_boxes_loader, new_format in itertools.product(make_bounding_box_loaders(formats=formats), formats): yield ArgsKwargs(bounding_boxes_loader, old_format=bounding_boxes_loader.format, new_format=new_format) @@ -659,7 +659,7 @@ def sample_inputs_perspective_bounding_boxes(): coefficients=_PERSPECTIVE_COEFFS[0], ) - format = datapoints.BoundingBoxFormat.XYXY + format = vision_tensors.BoundingBoxFormat.XYXY loader = make_bounding_box_loader(format=format) yield ArgsKwargs( loader, format=format, canvas_size=loader.canvas_size, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS diff --git a/test/transforms_v2_legacy_utils.py b/test/transforms_v2_legacy_utils.py index bb8943a8889..d23b32d1219 100644 --- a/test/transforms_v2_legacy_utils.py +++ b/test/transforms_v2_legacy_utils.py @@ -27,7 +27,7 @@ import pytest import torch -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms._functional_tensor import _max_value as get_max_value from torchvision.transforms.v2.functional import to_dtype_image, to_image, to_pil_image @@ -82,7 +82,7 @@ def make_image( if color_space in {"GRAY_ALPHA", "RGBA"}: data[..., -1, :, :] = max_value - return datapoints.Image(data) + return vision_tensors.Image(data) def make_image_tensor(*args, **kwargs): @@ -96,7 +96,7 @@ def make_image_pil(*args, **kwargs): def make_bounding_boxes( canvas_size=DEFAULT_SIZE, *, - format=datapoints.BoundingBoxFormat.XYXY, + format=vision_tensors.BoundingBoxFormat.XYXY, batch_dims=(), dtype=None, device="cpu", @@ -107,12 +107,12 @@ def sample_position(values, max_value): return torch.stack([torch.randint(max_value - v, ()) for v in values.flatten().tolist()]).reshape(values.shape) if isinstance(format, str): - format = datapoints.BoundingBoxFormat[format] + format = vision_tensors.BoundingBoxFormat[format] dtype = dtype or torch.float32 if any(dim == 0 for dim in batch_dims): - return datapoints.BoundingBoxes( + return vision_tensors.BoundingBoxes( torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, canvas_size=canvas_size ) @@ -120,28 +120,28 @@ def sample_position(values, max_value): y = sample_position(h, canvas_size[0]) x = sample_position(w, canvas_size[1]) - if format is datapoints.BoundingBoxFormat.XYWH: + if format is vision_tensors.BoundingBoxFormat.XYWH: parts = (x, y, w, h) - elif format is datapoints.BoundingBoxFormat.XYXY: + elif format is vision_tensors.BoundingBoxFormat.XYXY: x1, y1 = x, y x2 = x1 + w y2 = y1 + h parts = (x1, y1, x2, y2) - elif format is datapoints.BoundingBoxFormat.CXCYWH: + elif format is 
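A tiny worked sketch (not part of the patch) of the format conversion these helpers sample, run on a plain tensor where old_format must be passed explicitly; the coordinates are made up:

import torch
from torchvision import vision_tensors
from torchvision.transforms.v2 import functional as F

xyxy = torch.tensor([[2.0, 2.0, 7.0, 7.0]])
cxcywh = F.convert_bounding_box_format(
    xyxy,
    old_format=vision_tensors.BoundingBoxFormat.XYXY,
    new_format=vision_tensors.BoundingBoxFormat.CXCYWH,
)
# center x/y = (2 + 7) / 2 = 4.5, width = height = 7 - 2 = 5
assert cxcywh.tolist() == [[4.5, 4.5, 5.0, 5.0]]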
vision_tensors.BoundingBoxFormat.CXCYWH: cx = x + w / 2 cy = y + h / 2 parts = (cx, cy, w, h) else: raise ValueError(f"Format {format} is not supported") - return datapoints.BoundingBoxes( + return vision_tensors.BoundingBoxes( torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size ) def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtype=None, device="cpu"): """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" - return datapoints.Mask( + return vision_tensors.Mask( torch.testing.make_tensor( (*batch_dims, num_objects, *size), low=0, @@ -154,7 +154,7 @@ def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtyp def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): """Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value""" - return datapoints.Mask( + return vision_tensors.Mask( torch.testing.make_tensor( (*batch_dims, *size), low=0, @@ -166,7 +166,7 @@ def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=( def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): - return datapoints.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) + return vision_tensors.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) def make_video_tensor(*args, **kwargs): @@ -335,7 +335,7 @@ def fn(shape, dtype, device, memory_format): image_tensor = image_tensor.to(device=device) image_tensor = to_dtype_image(image_tensor, dtype=dtype, scale=True) - return datapoints.Image(image_tensor) + return vision_tensors.Image(image_tensor) return ImageLoader(fn, shape=(num_channels, *size), dtype=dtype, memory_format=memory_format) @@ -352,7 +352,7 @@ def make_image_loaders_for_interpolation( @dataclasses.dataclass class BoundingBoxesLoader(TensorLoader): - format: datapoints.BoundingBoxFormat + format: vision_tensors.BoundingBoxFormat spatial_size: Tuple[int, int] canvas_size: Tuple[int, int] = dataclasses.field(init=False) @@ -362,7 +362,7 @@ def __post_init__(self): def make_bounding_box_loader(*, extra_dims=(), format, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): if isinstance(format, str): - format = datapoints.BoundingBoxFormat[format] + format = vision_tensors.BoundingBoxFormat[format] spatial_size = _parse_size(spatial_size, name="spatial_size") @@ -381,7 +381,7 @@ def fn(shape, dtype, device): def make_bounding_box_loaders( *, extra_dims=tuple(d for d in DEFAULT_EXTRA_DIMS if len(d) < 2), - formats=tuple(datapoints.BoundingBoxFormat), + formats=tuple(vision_tensors.BoundingBoxFormat), spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtypes=(torch.float32, torch.float64, torch.int64), ): diff --git a/torchvision/datasets/__init__.py b/torchvision/datasets/__init__.py index 43b0801d495..f091412a1b9 100644 --- a/torchvision/datasets/__init__.py +++ b/torchvision/datasets/__init__.py @@ -137,7 +137,7 @@ # Ref: https://peps.python.org/pep-0562/ def __getattr__(name): if name in ("wrap_dataset_for_transforms_v2",): - from torchvision.datapoints._dataset_wrapper import wrap_dataset_for_transforms_v2 + from torchvision.vision_tensors._dataset_wrapper import wrap_dataset_for_transforms_v2 return wrap_dataset_for_transforms_v2 diff --git a/torchvision/datasets/vision.py b/torchvision/datasets/vision.py index aba19369b64..72667958ae1 100644 --- a/torchvision/datasets/vision.py +++ 
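For reference, a hedged usage sketch (not part of the patch) of the lazily re-exported dataset wrapper; the dataset paths are placeholders and the exact target keys depend on the wrapped dataset:

import torchvision
from torchvision.datasets import wrap_dataset_for_transforms_v2

# Placeholder paths: point these at a real COCO-style image folder and annotation file.
dataset = torchvision.datasets.CocoDetection("path/to/images", "path/to/annotations.json")
dataset = wrap_dataset_for_transforms_v2(dataset)

img, target = dataset[0]
# The wrapped dataset returns vision_tensor targets, e.g. target["boxes"] as a
# BoundingBoxes, so the v2 transforms can dispatch on them directly.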
b/torchvision/datasets/vision.py @@ -67,7 +67,7 @@ def __len__(self) -> int: def __repr__(self) -> str: head = "Dataset " + self.__class__.__name__ - body = [f"Number of datapoints: {self.__len__()}"] + body = [f"Number of vision_tensors: {self.__len__()}"] if self.root is not None: body.append(f"Root location: {self.root}") body += self.extra_repr().splitlines() diff --git a/torchvision/prototype/__init__.py b/torchvision/prototype/__init__.py index 200f5cd9552..be67c1bb0c9 100644 --- a/torchvision/prototype/__init__.py +++ b/torchvision/prototype/__init__.py @@ -1 +1 @@ -from . import datapoints, models, transforms, utils +from . import models, transforms, utils, vision_tensors diff --git a/torchvision/prototype/datasets/_builtin/caltech.py b/torchvision/prototype/datasets/_builtin/caltech.py index 631de46b2b6..343397cb65f 100644 --- a/torchvision/prototype/datasets/_builtin/caltech.py +++ b/torchvision/prototype/datasets/_builtin/caltech.py @@ -6,8 +6,6 @@ import torch from torchdata.datapipes.iter import Filter, IterDataPipe, IterKeyZipper, Mapper -from torchvision.datapoints import BoundingBoxes -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, GDriveResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( hint_sharding, @@ -16,6 +14,8 @@ read_categories_file, read_mat, ) +from torchvision.prototype.vision_tensors import Label +from torchvision.vision_tensors import BoundingBoxes from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/celeba.py b/torchvision/prototype/datasets/_builtin/celeba.py index 9112a80357c..677544ecae3 100644 --- a/torchvision/prototype/datasets/_builtin/celeba.py +++ b/torchvision/prototype/datasets/_builtin/celeba.py @@ -4,8 +4,6 @@ import torch from torchdata.datapipes.iter import Filter, IterDataPipe, IterKeyZipper, Mapper, Zipper -from torchvision.datapoints import BoundingBoxes -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, GDriveResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( getitem, @@ -14,6 +12,8 @@ INFINITE_BUFFER_SIZE, path_accessor, ) +from torchvision.prototype.vision_tensors import Label +from torchvision.vision_tensors import BoundingBoxes from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/cifar.py b/torchvision/prototype/datasets/_builtin/cifar.py index 7d178291992..52985c6b708 100644 --- a/torchvision/prototype/datasets/_builtin/cifar.py +++ b/torchvision/prototype/datasets/_builtin/cifar.py @@ -6,8 +6,6 @@ import numpy as np from torchdata.datapipes.iter import Filter, IterDataPipe, Mapper -from torchvision.datapoints import Image -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( hint_sharding, @@ -15,6 +13,8 @@ path_comparator, read_categories_file, ) +from torchvision.prototype.vision_tensors import Label +from torchvision.vision_tensors import Image from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/clevr.py b/torchvision/prototype/datasets/_builtin/clevr.py index e282635684e..d686f72ecef 100644 --- a/torchvision/prototype/datasets/_builtin/clevr.py +++ b/torchvision/prototype/datasets/_builtin/clevr.py @@ -2,7 +2,6 @@ from 
typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union from torchdata.datapipes.iter import Demultiplexer, Filter, IterDataPipe, IterKeyZipper, JsonParser, Mapper, UnBatcher -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( getitem, @@ -12,6 +11,7 @@ path_accessor, path_comparator, ) +from torchvision.prototype.vision_tensors import Label from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/coco.py b/torchvision/prototype/datasets/_builtin/coco.py index abf19acec0d..743d78b412b 100644 --- a/torchvision/prototype/datasets/_builtin/coco.py +++ b/torchvision/prototype/datasets/_builtin/coco.py @@ -14,8 +14,6 @@ Mapper, UnBatcher, ) -from torchvision.datapoints import BoundingBoxes, Mask -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( getitem, @@ -26,6 +24,8 @@ path_accessor, read_categories_file, ) +from torchvision.prototype.vision_tensors import Label +from torchvision.vision_tensors import BoundingBoxes, Mask from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/country211.py b/torchvision/prototype/datasets/_builtin/country211.py index 0f4b3d769dc..9bc695d4844 100644 --- a/torchvision/prototype/datasets/_builtin/country211.py +++ b/torchvision/prototype/datasets/_builtin/country211.py @@ -2,7 +2,6 @@ from typing import Any, Dict, List, Tuple, Union from torchdata.datapipes.iter import Filter, IterDataPipe, Mapper -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( hint_sharding, @@ -10,6 +9,7 @@ path_comparator, read_categories_file, ) +from torchvision.prototype.vision_tensors import Label from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/cub200.py b/torchvision/prototype/datasets/_builtin/cub200.py index b301c6ba030..d5f193b2a5d 100644 --- a/torchvision/prototype/datasets/_builtin/cub200.py +++ b/torchvision/prototype/datasets/_builtin/cub200.py @@ -15,8 +15,6 @@ Mapper, ) from torchdata.datapipes.map import IterToMapConverter -from torchvision.datapoints import BoundingBoxes -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, GDriveResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( getitem, @@ -28,6 +26,8 @@ read_categories_file, read_mat, ) +from torchvision.prototype.vision_tensors import Label +from torchvision.vision_tensors import BoundingBoxes from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/dtd.py b/torchvision/prototype/datasets/_builtin/dtd.py index 6ddab2af79d..7e15e5000be 100644 --- a/torchvision/prototype/datasets/_builtin/dtd.py +++ b/torchvision/prototype/datasets/_builtin/dtd.py @@ -3,7 +3,6 @@ from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union from torchdata.datapipes.iter import CSVParser, Demultiplexer, Filter, IterDataPipe, IterKeyZipper, LineReader, Mapper -from torchvision.prototype.datapoints import Label from 
torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( getitem, @@ -13,6 +12,7 @@ path_comparator, read_categories_file, ) +from torchvision.prototype.vision_tensors import Label from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/eurosat.py b/torchvision/prototype/datasets/_builtin/eurosat.py index 463eed79d70..7edfcc9867b 100644 --- a/torchvision/prototype/datasets/_builtin/eurosat.py +++ b/torchvision/prototype/datasets/_builtin/eurosat.py @@ -2,9 +2,9 @@ from typing import Any, Dict, List, Tuple, Union from torchdata.datapipes.iter import IterDataPipe, Mapper -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling +from torchvision.prototype.vision_tensors import Label from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/fer2013.py b/torchvision/prototype/datasets/_builtin/fer2013.py index 17f092aa328..c9d65d611b3 100644 --- a/torchvision/prototype/datasets/_builtin/fer2013.py +++ b/torchvision/prototype/datasets/_builtin/fer2013.py @@ -3,10 +3,10 @@ import torch from torchdata.datapipes.iter import CSVDictParser, IterDataPipe, Mapper -from torchvision.datapoints import Image -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, KaggleDownloadResource, OnlineResource from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling +from torchvision.prototype.vision_tensors import Label +from torchvision.vision_tensors import Image from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/food101.py b/torchvision/prototype/datasets/_builtin/food101.py index f3054d8fb13..269b19ef494 100644 --- a/torchvision/prototype/datasets/_builtin/food101.py +++ b/torchvision/prototype/datasets/_builtin/food101.py @@ -2,7 +2,6 @@ from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union from torchdata.datapipes.iter import Demultiplexer, Filter, IterDataPipe, IterKeyZipper, LineReader, Mapper -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( getitem, @@ -12,6 +11,7 @@ path_comparator, read_categories_file, ) +from torchvision.prototype.vision_tensors import Label from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 34651fcfce3..f0442f797ba 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -2,8 +2,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union from torchdata.datapipes.iter import CSVDictParser, Demultiplexer, Filter, IterDataPipe, Mapper, Zipper -from torchvision.datapoints import BoundingBoxes -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( hint_sharding, @@ -11,6 +9,8 @@ INFINITE_BUFFER_SIZE, path_comparator, ) +from torchvision.prototype.vision_tensors 
import Label +from torchvision.vision_tensors import BoundingBoxes from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/imagenet.py b/torchvision/prototype/datasets/_builtin/imagenet.py index 5e2db41e1d0..68de3d54a4e 100644 --- a/torchvision/prototype/datasets/_builtin/imagenet.py +++ b/torchvision/prototype/datasets/_builtin/imagenet.py @@ -15,7 +15,6 @@ TarArchiveLoader, ) from torchdata.datapipes.map import IterToMapConverter -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, ManualDownloadResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( getitem, @@ -26,6 +25,7 @@ read_categories_file, read_mat, ) +from torchvision.prototype.vision_tensors import Label from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index 8f22a33ae01..cfd5e4c6b04 100644 --- a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -7,11 +7,11 @@ import torch from torchdata.datapipes.iter import Decompressor, Demultiplexer, IterDataPipe, Mapper, Zipper -from torchvision.datapoints import Image -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling, INFINITE_BUFFER_SIZE from torchvision.prototype.utils._internal import fromfile +from torchvision.prototype.vision_tensors import Label +from torchvision.vision_tensors import Image from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/oxford_iiit_pet.py b/torchvision/prototype/datasets/_builtin/oxford_iiit_pet.py index fbc7d30c292..54fadd8e848 100644 --- a/torchvision/prototype/datasets/_builtin/oxford_iiit_pet.py +++ b/torchvision/prototype/datasets/_builtin/oxford_iiit_pet.py @@ -3,7 +3,6 @@ from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union from torchdata.datapipes.iter import CSVDictParser, Demultiplexer, Filter, IterDataPipe, IterKeyZipper, Mapper -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( getitem, @@ -14,6 +13,7 @@ path_comparator, read_categories_file, ) +from torchvision.prototype.vision_tensors import Label from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/pcam.py b/torchvision/prototype/datasets/_builtin/pcam.py index 4de5ae2765b..90bbf925627 100644 --- a/torchvision/prototype/datasets/_builtin/pcam.py +++ b/torchvision/prototype/datasets/_builtin/pcam.py @@ -4,10 +4,10 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from torchdata.datapipes.iter import IterDataPipe, Mapper, Zipper -from torchvision.datapoints import Image -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, GDriveResource, OnlineResource from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling +from torchvision.prototype.vision_tensors import Label +from torchvision.vision_tensors import Image from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/semeion.py 
b/torchvision/prototype/datasets/_builtin/semeion.py index 92e1b93b410..e8d0c921459 100644 --- a/torchvision/prototype/datasets/_builtin/semeion.py +++ b/torchvision/prototype/datasets/_builtin/semeion.py @@ -3,10 +3,10 @@ import torch from torchdata.datapipes.iter import CSVParser, IterDataPipe, Mapper -from torchvision.datapoints import Image -from torchvision.prototype.datapoints import OneHotLabel from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling +from torchvision.prototype.vision_tensors import OneHotLabel +from torchvision.vision_tensors import Image from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/stanford_cars.py b/torchvision/prototype/datasets/_builtin/stanford_cars.py index aefbbede2e3..bdaaf4a7995 100644 --- a/torchvision/prototype/datasets/_builtin/stanford_cars.py +++ b/torchvision/prototype/datasets/_builtin/stanford_cars.py @@ -2,8 +2,6 @@ from typing import Any, BinaryIO, Dict, Iterator, List, Tuple, Union from torchdata.datapipes.iter import Filter, IterDataPipe, Mapper, Zipper -from torchvision.datapoints import BoundingBoxes -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( hint_sharding, @@ -12,6 +10,8 @@ read_categories_file, read_mat, ) +from torchvision.prototype.vision_tensors import Label +from torchvision.vision_tensors import BoundingBoxes from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/svhn.py b/torchvision/prototype/datasets/_builtin/svhn.py index 94de4cf42c3..51dc376795d 100644 --- a/torchvision/prototype/datasets/_builtin/svhn.py +++ b/torchvision/prototype/datasets/_builtin/svhn.py @@ -3,10 +3,10 @@ import numpy as np from torchdata.datapipes.iter import IterDataPipe, Mapper, UnBatcher -from torchvision.datapoints import Image -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling, read_mat +from torchvision.prototype.vision_tensors import Label +from torchvision.vision_tensors import Image from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/usps.py b/torchvision/prototype/datasets/_builtin/usps.py index b5486669e21..28251ce6db8 100644 --- a/torchvision/prototype/datasets/_builtin/usps.py +++ b/torchvision/prototype/datasets/_builtin/usps.py @@ -3,10 +3,10 @@ import torch from torchdata.datapipes.iter import Decompressor, IterDataPipe, LineReader, Mapper -from torchvision.datapoints import Image -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling +from torchvision.prototype.vision_tensors import Label +from torchvision.vision_tensors import Image from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_builtin/voc.py b/torchvision/prototype/datasets/_builtin/voc.py index 53dfbd185bc..e4a150a7f2c 100644 --- a/torchvision/prototype/datasets/_builtin/voc.py +++ b/torchvision/prototype/datasets/_builtin/voc.py @@ -5,9 +5,7 @@ from 
xml.etree import ElementTree from torchdata.datapipes.iter import Demultiplexer, Filter, IterDataPipe, IterKeyZipper, LineReader, Mapper -from torchvision.datapoints import BoundingBoxes from torchvision.datasets import VOCDetection -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource from torchvision.prototype.datasets.utils._internal import ( getitem, @@ -18,6 +16,8 @@ path_comparator, read_categories_file, ) +from torchvision.prototype.vision_tensors import Label +from torchvision.vision_tensors import BoundingBoxes from .._api import register_dataset, register_info diff --git a/torchvision/prototype/datasets/_folder.py b/torchvision/prototype/datasets/_folder.py index 0a37df03add..e1260f6c3b5 100644 --- a/torchvision/prototype/datasets/_folder.py +++ b/torchvision/prototype/datasets/_folder.py @@ -5,9 +5,9 @@ from typing import Any, BinaryIO, Collection, Dict, List, Optional, Tuple, Union from torchdata.datapipes.iter import FileLister, FileOpener, Filter, IterDataPipe, Mapper -from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import EncodedData, EncodedImage from torchvision.prototype.datasets.utils._internal import hint_sharding, hint_shuffling +from torchvision.prototype.vision_tensors import Label __all__ = ["from_data_folder", "from_image_folder"] diff --git a/torchvision/prototype/datasets/utils/_encoded.py b/torchvision/prototype/datasets/utils/_encoded.py index 8adc1e57acb..f6551fb5fc6 100644 --- a/torchvision/prototype/datasets/utils/_encoded.py +++ b/torchvision/prototype/datasets/utils/_encoded.py @@ -6,14 +6,14 @@ import PIL.Image import torch - -from torchvision.datapoints._datapoint import Datapoint from torchvision.prototype.utils._internal import fromfile, ReadOnlyTensorBuffer +from torchvision.vision_tensors._vision_tensor import VisionTensor + D = TypeVar("D", bound="EncodedData") -class EncodedData(Datapoint): +class EncodedData(VisionTensor): @classmethod def _wrap(cls: Type[D], tensor: torch.Tensor) -> D: return tensor.as_subclass(cls) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index f4013ffa718..1aead63c497 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -3,9 +3,9 @@ import PIL.Image import torch from torch.utils._pytree import tree_flatten, tree_unflatten -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.ops import masks_to_boxes -from torchvision.prototype import datapoints as proto_datapoints +from torchvision.prototype import vision_tensors as proto_vision_tensors from torchvision.transforms.v2 import functional as F, InterpolationMode, Transform from torchvision.transforms.v2._utils import is_pure_tensor @@ -26,9 +26,9 @@ def __init__( def _copy_paste( self, - image: Union[torch.Tensor, datapoints.Image], + image: Union[torch.Tensor, vision_tensors.Image], target: Dict[str, Any], - paste_image: Union[torch.Tensor, datapoints.Image], + paste_image: Union[torch.Tensor, vision_tensors.Image], paste_target: Dict[str, Any], random_selection: torch.Tensor, blending: bool, @@ -36,9 +36,9 @@ def _copy_paste( antialias: Optional[bool], ) -> Tuple[torch.Tensor, Dict[str, Any]]: - paste_masks = datapoints.wrap(paste_target["masks"][random_selection], like=paste_target["masks"]) - paste_boxes = datapoints.wrap(paste_target["boxes"][random_selection], 
like=paste_target["boxes"]) - paste_labels = datapoints.wrap(paste_target["labels"][random_selection], like=paste_target["labels"]) + paste_masks = vision_tensors.wrap(paste_target["masks"][random_selection], like=paste_target["masks"]) + paste_boxes = vision_tensors.wrap(paste_target["boxes"][random_selection], like=paste_target["boxes"]) + paste_labels = vision_tensors.wrap(paste_target["labels"][random_selection], like=paste_target["labels"]) masks = target["masks"] @@ -81,7 +81,7 @@ def _copy_paste( # https://github.com/pytorch/vision/blob/b6feccbc4387766b76a3e22b13815dbbbfa87c0f/torchvision/models/detection/roi_heads.py#L418-L422 xyxy_boxes[:, 2:] += 1 boxes = F.convert_bounding_box_format( - xyxy_boxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=bbox_format, inplace=True + xyxy_boxes, old_format=vision_tensors.BoundingBoxFormat.XYXY, new_format=bbox_format, inplace=True ) out_target["boxes"] = torch.cat([boxes, paste_boxes]) @@ -90,7 +90,7 @@ def _copy_paste( # Check for degenerated boxes and remove them boxes = F.convert_bounding_box_format( - out_target["boxes"], old_format=bbox_format, new_format=datapoints.BoundingBoxFormat.XYXY + out_target["boxes"], old_format=bbox_format, new_format=vision_tensors.BoundingBoxFormat.XYXY ) degenerate_boxes = boxes[:, 2:] <= boxes[:, :2] if degenerate_boxes.any(): @@ -104,20 +104,20 @@ def _copy_paste( def _extract_image_targets( self, flat_sample: List[Any] - ) -> Tuple[List[Union[torch.Tensor, datapoints.Image]], List[Dict[str, Any]]]: + ) -> Tuple[List[Union[torch.Tensor, vision_tensors.Image]], List[Dict[str, Any]]]: # fetch all images, bboxes, masks and labels from unstructured input # with List[image], List[BoundingBoxes], List[Mask], List[Label] images, bboxes, masks, labels = [], [], [], [] for obj in flat_sample: - if isinstance(obj, datapoints.Image) or is_pure_tensor(obj): + if isinstance(obj, vision_tensors.Image) or is_pure_tensor(obj): images.append(obj) elif isinstance(obj, PIL.Image.Image): images.append(F.to_image(obj)) - elif isinstance(obj, datapoints.BoundingBoxes): + elif isinstance(obj, vision_tensors.BoundingBoxes): bboxes.append(obj) - elif isinstance(obj, datapoints.Mask): + elif isinstance(obj, vision_tensors.Mask): masks.append(obj) - elif isinstance(obj, (proto_datapoints.Label, proto_datapoints.OneHotLabel)): + elif isinstance(obj, (proto_vision_tensors.Label, proto_vision_tensors.OneHotLabel)): labels.append(obj) if not (len(images) == len(bboxes) == len(masks) == len(labels)): @@ -140,8 +140,8 @@ def _insert_outputs( ) -> None: c0, c1, c2, c3 = 0, 0, 0, 0 for i, obj in enumerate(flat_sample): - if isinstance(obj, datapoints.Image): - flat_sample[i] = datapoints.wrap(output_images[c0], like=obj) + if isinstance(obj, vision_tensors.Image): + flat_sample[i] = vision_tensors.wrap(output_images[c0], like=obj) c0 += 1 elif isinstance(obj, PIL.Image.Image): flat_sample[i] = F.to_pil_image(output_images[c0]) @@ -149,14 +149,14 @@ def _insert_outputs( elif is_pure_tensor(obj): flat_sample[i] = output_images[c0] c0 += 1 - elif isinstance(obj, datapoints.BoundingBoxes): - flat_sample[i] = datapoints.wrap(output_targets[c1]["boxes"], like=obj) + elif isinstance(obj, vision_tensors.BoundingBoxes): + flat_sample[i] = vision_tensors.wrap(output_targets[c1]["boxes"], like=obj) c1 += 1 - elif isinstance(obj, datapoints.Mask): - flat_sample[i] = datapoints.wrap(output_targets[c2]["masks"], like=obj) + elif isinstance(obj, vision_tensors.Mask): + flat_sample[i] = vision_tensors.wrap(output_targets[c2]["masks"], 
like=obj) c2 += 1 - elif isinstance(obj, (proto_datapoints.Label, proto_datapoints.OneHotLabel)): - flat_sample[i] = datapoints.wrap(output_targets[c3]["labels"], like=obj) + elif isinstance(obj, (proto_vision_tensors.Label, proto_vision_tensors.OneHotLabel)): + flat_sample[i] = vision_tensors.wrap(output_targets[c3]["labels"], like=obj) c3 += 1 def forward(self, *inputs: Any) -> Any: diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 3b7e6878170..1b636ba3b1b 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -3,8 +3,8 @@ import PIL.Image import torch -from torchvision import datapoints -from torchvision.prototype.datapoints import Label, OneHotLabel +from torchvision import vision_tensors +from torchvision.prototype.vision_tensors import Label, OneHotLabel from torchvision.transforms.v2 import functional as F, Transform from torchvision.transforms.v2._utils import ( _FillType, @@ -39,15 +39,15 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: if not has_any( flat_inputs, PIL.Image.Image, - datapoints.Image, + vision_tensors.Image, is_pure_tensor, - datapoints.Video, + vision_tensors.Video, ): raise TypeError( f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video." ) - if has_any(flat_inputs, datapoints.BoundingBoxes) and not has_any(flat_inputs, Label, OneHotLabel): + if has_any(flat_inputs, vision_tensors.BoundingBoxes) and not has_any(flat_inputs, Label, OneHotLabel): raise TypeError( f"If a BoundingBoxes is contained in the input sample, " f"{type(self).__name__}() also requires it to contain a Label or OneHotLabel." @@ -85,7 +85,7 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: ) bounding_boxes = F.clamp_bounding_boxes(bounding_boxes, format=format, canvas_size=canvas_size) height_and_width = F.convert_bounding_box_format( - bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYWH + bounding_boxes, old_format=format, new_format=vision_tensors.BoundingBoxFormat.XYWH )[..., 2:] is_valid = torch.all(height_and_width > 0, dim=-1) else: @@ -119,10 +119,10 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: ) if params["is_valid"] is not None: - if isinstance(inpt, (Label, OneHotLabel, datapoints.Mask)): - inpt = datapoints.wrap(inpt[params["is_valid"]], like=inpt) - elif isinstance(inpt, datapoints.BoundingBoxes): - inpt = datapoints.wrap( + if isinstance(inpt, (Label, OneHotLabel, vision_tensors.Mask)): + inpt = vision_tensors.wrap(inpt[params["is_valid"]], like=inpt) + elif isinstance(inpt, vision_tensors.BoundingBoxes): + inpt = vision_tensors.wrap( F.clamp_bounding_boxes(inpt[params["is_valid"]], format=inpt.format, canvas_size=inpt.canvas_size), like=inpt, ) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index fa812bbbbe9..1d80420e667 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -5,7 +5,7 @@ import torch -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms.v2 import Transform from torchvision.transforms.v2._utils import is_pure_tensor @@ -25,17 +25,17 @@ def _get_defaultdict(default: T) -> Dict[Any, T]: class PermuteDimensions(Transform): - _transformed_types = (is_pure_tensor, datapoints.Image, datapoints.Video) + _transformed_types = (is_pure_tensor, vision_tensors.Image, vision_tensors.Video) 
def __init__(self, dims: Union[Sequence[int], Dict[Type, Optional[Sequence[int]]]]) -> None: super().__init__() if not isinstance(dims, dict): dims = _get_defaultdict(dims) - if torch.Tensor in dims and any(cls in dims for cls in [datapoints.Image, datapoints.Video]): + if torch.Tensor in dims and any(cls in dims for cls in [vision_tensors.Image, vision_tensors.Video]): warnings.warn( - "Got `dims` values for `torch.Tensor` and either `datapoints.Image` or `datapoints.Video`. " + "Got `dims` values for `torch.Tensor` and either `vision_tensors.Image` or `vision_tensors.Video`. " "Note that a plain `torch.Tensor` will *not* be transformed by this (or any other transformation) " - "in case a `datapoints.Image` or `datapoints.Video` is present in the input." + "in case a `vision_tensors.Image` or `vision_tensors.Video` is present in the input." ) self.dims = dims @@ -47,17 +47,17 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> torch.Tensor: class TransposeDimensions(Transform): - _transformed_types = (is_pure_tensor, datapoints.Image, datapoints.Video) + _transformed_types = (is_pure_tensor, vision_tensors.Image, vision_tensors.Video) def __init__(self, dims: Union[Tuple[int, int], Dict[Type, Optional[Tuple[int, int]]]]) -> None: super().__init__() if not isinstance(dims, dict): dims = _get_defaultdict(dims) - if torch.Tensor in dims and any(cls in dims for cls in [datapoints.Image, datapoints.Video]): + if torch.Tensor in dims and any(cls in dims for cls in [vision_tensors.Image, vision_tensors.Video]): warnings.warn( - "Got `dims` values for `torch.Tensor` and either `datapoints.Image` or `datapoints.Video`. " + "Got `dims` values for `torch.Tensor` and either `vision_tensors.Image` or `vision_tensors.Video`. " "Note that a plain `torch.Tensor` will *not* be transformed by this (or any other transformation) " - "in case a `datapoints.Image` or `datapoints.Video` is present in the input." + "in case a `vision_tensors.Image` or `vision_tensors.Video` is present in the input." 
) self.dims = dims diff --git a/torchvision/prototype/transforms/_type_conversion.py b/torchvision/prototype/transforms/_type_conversion.py index 4cd3cf46871..66f2ff6b6f3 100644 --- a/torchvision/prototype/transforms/_type_conversion.py +++ b/torchvision/prototype/transforms/_type_conversion.py @@ -4,23 +4,23 @@ from torch.nn.functional import one_hot -from torchvision.prototype import datapoints as proto_datapoints +from torchvision.prototype import vision_tensors as proto_vision_tensors from torchvision.transforms.v2 import Transform class LabelToOneHot(Transform): - _transformed_types = (proto_datapoints.Label,) + _transformed_types = (proto_vision_tensors.Label,) def __init__(self, num_categories: int = -1): super().__init__() self.num_categories = num_categories - def _transform(self, inpt: proto_datapoints.Label, params: Dict[str, Any]) -> proto_datapoints.OneHotLabel: + def _transform(self, inpt: proto_vision_tensors.Label, params: Dict[str, Any]) -> proto_vision_tensors.OneHotLabel: num_categories = self.num_categories if num_categories == -1 and inpt.categories is not None: num_categories = len(inpt.categories) output = one_hot(inpt.as_subclass(torch.Tensor), num_classes=num_categories) - return proto_datapoints.OneHotLabel(output, categories=inpt.categories) + return proto_vision_tensors.OneHotLabel(output, categories=inpt.categories) def extra_repr(self) -> str: if self.num_categories == -1: diff --git a/torchvision/prototype/datapoints/__init__.py b/torchvision/prototype/vision_tensors/__init__.py similarity index 100% rename from torchvision/prototype/datapoints/__init__.py rename to torchvision/prototype/vision_tensors/__init__.py diff --git a/torchvision/prototype/datapoints/_label.py b/torchvision/prototype/vision_tensors/_label.py similarity index 95% rename from torchvision/prototype/datapoints/_label.py rename to torchvision/prototype/vision_tensors/_label.py index 10ac1bf8295..dddedb61727 100644 --- a/torchvision/prototype/datapoints/_label.py +++ b/torchvision/prototype/vision_tensors/_label.py @@ -5,13 +5,13 @@ import torch from torch.utils._pytree import tree_map -from torchvision.datapoints._datapoint import Datapoint +from torchvision.vision_tensors._vision_tensor import VisionTensor L = TypeVar("L", bound="_LabelBase") -class _LabelBase(Datapoint): +class _LabelBase(VisionTensor): categories: Optional[Sequence[str]] @classmethod diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index a5c98382540..8c8d26b4c8c 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -7,7 +7,7 @@ import torch from torch.nn.functional import one_hot from torch.utils._pytree import tree_flatten, tree_unflatten -from torchvision import datapoints, transforms as _transforms +from torchvision import transforms as _transforms, vision_tensors from torchvision.transforms.v2 import functional as F from ._transform import _RandomApplyTransform, Transform @@ -91,10 +91,10 @@ def __init__( self._log_ratio = torch.log(torch.tensor(self.ratio)) def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any: - if isinstance(inpt, (datapoints.BoundingBoxes, datapoints.Mask)): + if isinstance(inpt, (vision_tensors.BoundingBoxes, vision_tensors.Mask)): warnings.warn( f"{type(self).__name__}() is currently passing through inputs of type " - f"datapoints.{type(inpt).__name__}. This will likely change in the future." + f"vision_tensors.{type(inpt).__name__}. This will likely change in the future." 
) return super()._call_kernel(functional, inpt, *args, **kwargs) @@ -158,7 +158,7 @@ def forward(self, *inputs): flat_inputs, spec = tree_flatten(inputs) needs_transform_list = self._needs_transform_list(flat_inputs) - if has_any(flat_inputs, PIL.Image.Image, datapoints.BoundingBoxes, datapoints.Mask): + if has_any(flat_inputs, PIL.Image.Image, vision_tensors.BoundingBoxes, vision_tensors.Mask): raise ValueError(f"{type(self).__name__}() does not support PIL images, bounding boxes and masks.") labels = self._labels_getter(inputs) @@ -188,7 +188,7 @@ def forward(self, *inputs): return tree_unflatten(flat_outputs, spec) def _check_image_or_video(self, inpt: torch.Tensor, *, batch_size: int): - expected_num_dims = 5 if isinstance(inpt, datapoints.Video) else 4 + expected_num_dims = 5 if isinstance(inpt, vision_tensors.Video) else 4 if inpt.ndim != expected_num_dims: raise ValueError( f"Expected a batched input with {expected_num_dims} dims, but got {inpt.ndim} dimensions instead." @@ -242,13 +242,13 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: if inpt is params["labels"]: return self._mixup_label(inpt, lam=lam) - elif isinstance(inpt, (datapoints.Image, datapoints.Video)) or is_pure_tensor(inpt): + elif isinstance(inpt, (vision_tensors.Image, vision_tensors.Video)) or is_pure_tensor(inpt): self._check_image_or_video(inpt, batch_size=params["batch_size"]) output = inpt.roll(1, 0).mul_(1.0 - lam).add_(inpt.mul(lam)) - if isinstance(inpt, (datapoints.Image, datapoints.Video)): - output = datapoints.wrap(output, like=inpt) + if isinstance(inpt, (vision_tensors.Image, vision_tensors.Video)): + output = vision_tensors.wrap(output, like=inpt) return output else: @@ -309,7 +309,7 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: if inpt is params["labels"]: return self._mixup_label(inpt, lam=params["lam_adjusted"]) - elif isinstance(inpt, (datapoints.Image, datapoints.Video)) or is_pure_tensor(inpt): + elif isinstance(inpt, (vision_tensors.Image, vision_tensors.Video)) or is_pure_tensor(inpt): self._check_image_or_video(inpt, batch_size=params["batch_size"]) x1, y1, x2, y2 = params["box"] @@ -317,8 +317,8 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: output = inpt.clone() output[..., y1:y2, x1:x2] = rolled[..., y1:y2, x1:x2] - if isinstance(inpt, (datapoints.Image, datapoints.Video)): - output = datapoints.wrap(output, like=inpt) + if isinstance(inpt, (vision_tensors.Image, vision_tensors.Video)): + output = vision_tensors.wrap(output, like=inpt) return output else: diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index 664210ff7e7..cd8325db1b5 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -5,7 +5,7 @@ import torch from torch.utils._pytree import tree_flatten, tree_unflatten, TreeSpec -from torchvision import datapoints, transforms as _transforms +from torchvision import transforms as _transforms, vision_tensors from torchvision.transforms import _functional_tensor as _FT from torchvision.transforms.v2 import AutoAugmentPolicy, functional as F, InterpolationMode, Transform from torchvision.transforms.v2.functional._geometry import _check_interpolation @@ -15,7 +15,7 @@ from ._utils import _get_fill, _setup_fill_arg, check_type, is_pure_tensor -ImageOrVideo = Union[torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.Video] +ImageOrVideo = Union[torch.Tensor, 
PIL.Image.Image, vision_tensors.Image, vision_tensors.Video] class _AutoAugmentBase(Transform): @@ -46,7 +46,7 @@ def _get_random_item(self, dct: Dict[str, Tuple[Callable, bool]]) -> Tuple[str, def _flatten_and_extract_image_or_video( self, inputs: Any, - unsupported_types: Tuple[Type, ...] = (datapoints.BoundingBoxes, datapoints.Mask), + unsupported_types: Tuple[Type, ...] = (vision_tensors.BoundingBoxes, vision_tensors.Mask), ) -> Tuple[Tuple[List[Any], TreeSpec, int], ImageOrVideo]: flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) needs_transform_list = self._needs_transform_list(flat_inputs) @@ -56,10 +56,10 @@ def _flatten_and_extract_image_or_video( if needs_transform and check_type( inpt, ( - datapoints.Image, + vision_tensors.Image, PIL.Image.Image, is_pure_tensor, - datapoints.Video, + vision_tensors.Video, ), ): image_or_videos.append((idx, inpt)) @@ -590,7 +590,7 @@ def forward(self, *inputs: Any) -> Any: augmentation_space = self._AUGMENTATION_SPACE if self.all_ops else self._PARTIAL_AUGMENTATION_SPACE orig_dims = list(image_or_video.shape) - expected_ndim = 5 if isinstance(orig_image_or_video, datapoints.Video) else 4 + expected_ndim = 5 if isinstance(orig_image_or_video, vision_tensors.Video) else 4 batch = image_or_video.reshape([1] * max(expected_ndim - image_or_video.ndim, 0) + orig_dims) batch_dims = [batch.size(0)] + [1] * (batch.ndim - 1) @@ -627,8 +627,8 @@ def forward(self, *inputs: Any) -> Any: mix.add_(combined_weights[:, i].reshape(batch_dims) * aug) mix = mix.reshape(orig_dims).to(dtype=image_or_video.dtype) - if isinstance(orig_image_or_video, (datapoints.Image, datapoints.Video)): - mix = datapoints.wrap(mix, like=orig_image_or_video) + if isinstance(orig_image_or_video, (vision_tensors.Image, vision_tensors.Video)): + mix = vision_tensors.wrap(mix, like=orig_image_or_video) elif isinstance(orig_image_or_video, PIL.Image.Image): mix = F.to_pil_image(mix) diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index 4f94b37aa31..dba7da2c7dc 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -6,7 +6,7 @@ import PIL.Image import torch -from torchvision import datapoints, transforms as _transforms +from torchvision import transforms as _transforms, vision_tensors from torchvision.ops.boxes import box_iou from torchvision.transforms.functional import _get_perspective_coeffs from torchvision.transforms.v2 import functional as F, InterpolationMode, Transform @@ -36,8 +36,8 @@ class RandomHorizontalFlip(_RandomApplyTransform): .. v2betastatus:: RandomHorizontalFlip transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -56,8 +56,8 @@ class RandomVerticalFlip(_RandomApplyTransform): .. v2betastatus:: RandomVerticalFlip transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) 
+ If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -76,8 +76,8 @@ class Resize(Transform): .. v2betastatus:: Resize transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -171,8 +171,8 @@ class CenterCrop(Transform): .. v2betastatus:: CenterCrop transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -199,8 +199,8 @@ class RandomResizedCrop(Transform): .. v2betastatus:: RandomResizedCrop transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -322,8 +322,8 @@ class FiveCrop(Transform): .. v2betastatus:: FiveCrop transform - If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a - :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. + If the input is a :class:`torch.Tensor` or a :class:`~torchvision.vision_tensors.Image` or a + :class:`~torchvision.vision_tensors.Video` it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. .. Note:: @@ -338,15 +338,15 @@ class FiveCrop(Transform): Example: >>> class BatchMultiCrop(transforms.Transform): - ... def forward(self, sample: Tuple[Tuple[Union[datapoints.Image, datapoints.Video], ...], int]): + ... def forward(self, sample: Tuple[Tuple[Union[vision_tensors.Image, vision_tensors.Video], ...], int]): ... images_or_videos, labels = sample ... batch_size = len(images_or_videos) ... image_or_video = images_or_videos[0] - ... images_or_videos = datapoints.wrap(torch.stack(images_or_videos), like=image_or_video) + ... images_or_videos = vision_tensors.wrap(torch.stack(images_or_videos), like=image_or_video) ... 
labels = torch.full((batch_size,), label, device=images_or_videos.device) ... return images_or_videos, labels ... - >>> image = datapoints.Image(torch.rand(3, 256, 256)) + >>> image = vision_tensors.Image(torch.rand(3, 256, 256)) >>> label = 3 >>> transform = transforms.Compose([transforms.FiveCrop(224), BatchMultiCrop()]) >>> images, labels = transform(image, label) @@ -363,10 +363,10 @@ def __init__(self, size: Union[int, Sequence[int]]) -> None: self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any: - if isinstance(inpt, (datapoints.BoundingBoxes, datapoints.Mask)): + if isinstance(inpt, (vision_tensors.BoundingBoxes, vision_tensors.Mask)): warnings.warn( f"{type(self).__name__}() is currently passing through inputs of type " - f"datapoints.{type(inpt).__name__}. This will likely change in the future." + f"vision_tensors.{type(inpt).__name__}. This will likely change in the future." ) return super()._call_kernel(functional, inpt, *args, **kwargs) @@ -374,7 +374,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: return self._call_kernel(F.five_crop, inpt, self.size) def _check_inputs(self, flat_inputs: List[Any]) -> None: - if has_any(flat_inputs, datapoints.BoundingBoxes, datapoints.Mask): + if has_any(flat_inputs, vision_tensors.BoundingBoxes, vision_tensors.Mask): raise TypeError(f"BoundingBoxes'es and Mask's are not supported by {type(self).__name__}()") @@ -384,8 +384,8 @@ class TenCrop(Transform): .. v2betastatus:: TenCrop transform - If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a - :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. + If the input is a :class:`torch.Tensor` or a :class:`~torchvision.vision_tensors.Image` or a + :class:`~torchvision.vision_tensors.Video` it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. See :class:`~torchvision.transforms.v2.FiveCrop` for an example. @@ -410,15 +410,15 @@ def __init__(self, size: Union[int, Sequence[int]], vertical_flip: bool = False) self.vertical_flip = vertical_flip def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any: - if isinstance(inpt, (datapoints.BoundingBoxes, datapoints.Mask)): + if isinstance(inpt, (vision_tensors.BoundingBoxes, vision_tensors.Mask)): warnings.warn( f"{type(self).__name__}() is currently passing through inputs of type " - f"datapoints.{type(inpt).__name__}. This will likely change in the future." + f"vision_tensors.{type(inpt).__name__}. This will likely change in the future." ) return super()._call_kernel(functional, inpt, *args, **kwargs) def _check_inputs(self, flat_inputs: List[Any]) -> None: - if has_any(flat_inputs, datapoints.BoundingBoxes, datapoints.Mask): + if has_any(flat_inputs, vision_tensors.BoundingBoxes, vision_tensors.Mask): raise TypeError(f"BoundingBoxes'es and Mask's are not supported by {type(self).__name__}()") def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -430,8 +430,8 @@ class Pad(Transform): .. v2betastatus:: Pad transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. 
:class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -447,7 +447,7 @@ class Pad(Transform): fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. Fill value can be also a dictionary mapping data type to the fill value, e.g. - ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``fill={vision_tensors.Image: 127, vision_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and ``Mask`` will be filled with 0. padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric. Default is "constant". @@ -515,8 +515,8 @@ class RandomZoomOut(_RandomApplyTransform): output_width = input_width * r output_height = input_height * r - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -524,7 +524,7 @@ class RandomZoomOut(_RandomApplyTransform): fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. Fill value can be also a dictionary mapping data type to the fill value, e.g. - ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``fill={vision_tensors.Image: 127, vision_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and ``Mask`` will be filled with 0. side_range (sequence of floats, optional): tuple of two floats defines minimum and maximum factors to scale the input size. @@ -574,8 +574,8 @@ class RandomRotation(Transform): .. v2betastatus:: RandomRotation transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -596,7 +596,7 @@ class RandomRotation(Transform): fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. Fill value can be also a dictionary mapping data type to the fill value, e.g. 
- ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``fill={vision_tensors.Image: 127, vision_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and ``Mask`` will be filled with 0. .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters @@ -648,8 +648,8 @@ class RandomAffine(Transform): .. v2betastatus:: RandomAffine transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -676,7 +676,7 @@ class RandomAffine(Transform): fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. Fill value can be also a dictionary mapping data type to the fill value, e.g. - ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``fill={vision_tensors.Image: 127, vision_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and ``Mask`` will be filled with 0. center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. Default is the center of the image. @@ -770,8 +770,8 @@ class RandomCrop(Transform): .. v2betastatus:: RandomCrop transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -794,7 +794,7 @@ class RandomCrop(Transform): fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. Fill value can be also a dictionary mapping data type to the fill value, e.g. - ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``fill={vision_tensors.Image: 127, vision_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and ``Mask`` will be filled with 0. padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant. @@ -927,8 +927,8 @@ class RandomPerspective(_RandomApplyTransform): .. v2betastatus:: RandomPerspective transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. 
:class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -943,7 +943,7 @@ class RandomPerspective(_RandomApplyTransform): fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. Fill value can be also a dictionary mapping data type to the fill value, e.g. - ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``fill={vision_tensors.Image: 127, vision_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and ``Mask`` will be filled with 0. """ @@ -1014,8 +1014,8 @@ class ElasticTransform(Transform): .. v2betastatus:: RandomPerspective transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -1046,7 +1046,7 @@ class ElasticTransform(Transform): fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. Fill value can be also a dictionary mapping data type to the fill value, e.g. - ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``fill={vision_tensors.Image: 127, vision_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and ``Mask`` will be filled with 0. """ @@ -1107,15 +1107,15 @@ class RandomIoUCrop(Transform): .. v2betastatus:: RandomIoUCrop transform - This transformation requires an image or video data and ``datapoints.BoundingBoxes`` in the input. + This transformation requires an image or video data and ``vision_tensors.BoundingBoxes`` in the input. .. warning:: In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop` must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`, either immediately after or later in the transforms pipeline. - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. 
@@ -1152,8 +1152,8 @@ def __init__( def _check_inputs(self, flat_inputs: List[Any]) -> None: if not ( - has_all(flat_inputs, datapoints.BoundingBoxes) - and has_any(flat_inputs, PIL.Image.Image, datapoints.Image, is_pure_tensor) + has_all(flat_inputs, vision_tensors.BoundingBoxes) + and has_any(flat_inputs, PIL.Image.Image, vision_tensors.Image, is_pure_tensor) ): raise TypeError( f"{type(self).__name__}() requires input sample to contain tensor or PIL images " @@ -1193,7 +1193,7 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: xyxy_bboxes = F.convert_bounding_box_format( bboxes.as_subclass(torch.Tensor), bboxes.format, - datapoints.BoundingBoxFormat.XYXY, + vision_tensors.BoundingBoxFormat.XYXY, ) cx = 0.5 * (xyxy_bboxes[..., 0] + xyxy_bboxes[..., 2]) cy = 0.5 * (xyxy_bboxes[..., 1] + xyxy_bboxes[..., 3]) @@ -1221,7 +1221,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: F.crop, inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"] ) - if isinstance(output, datapoints.BoundingBoxes): + if isinstance(output, vision_tensors.BoundingBoxes): # We "mark" the invalid boxes as degenreate, and they can be # removed by a later call to SanitizeBoundingBoxes() output[~params["is_within_crop_area"]] = 0 @@ -1235,8 +1235,8 @@ class ScaleJitter(Transform): .. v2betastatus:: ScaleJitter transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -1303,8 +1303,8 @@ class RandomShortestSize(Transform): .. v2betastatus:: RandomShortestSize transform - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. @@ -1384,8 +1384,8 @@ class RandomResize(Transform): output_width = size output_height = size - If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, - :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.) + If the input is a :class:`torch.Tensor` or a ``VisionTensor`` (e.g. :class:`~torchvision.vision_tensors.Image`, + :class:`~torchvision.vision_tensors.Video`, :class:`~torchvision.vision_tensors.BoundingBoxes` etc.) it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. 
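The _geometry.py docstrings above repeatedly describe ``fill`` as an optional per-type mapping (e.g. ``fill={vision_tensors.Image: 127, vision_tensors.Mask: 0}``). A minimal usage sketch of that convention with the renamed classes, assuming the post-rename ``torchvision.vision_tensors`` import path introduced by this change:

    import torch
    from torchvision import vision_tensors
    from torchvision.transforms import v2

    H, W = 256, 256
    img = vision_tensors.Image(torch.randint(0, 256, (3, H, W), dtype=torch.uint8))
    boxes = vision_tensors.BoundingBoxes(
        torch.tensor([[10, 10, 60, 60]]), format="XYXY", canvas_size=(H, W)
    )
    mask = vision_tensors.Mask(torch.zeros(H, W, dtype=torch.uint8))

    # Per-type fill values as documented above: images are padded with 127, masks with 0.
    transform = v2.RandomRotation(
        degrees=30, expand=True, fill={vision_tensors.Image: 127, vision_tensors.Mask: 0}
    )
    out_img, out_boxes, out_mask = transform(img, boxes, mask)

All three outputs keep their ``VisionTensor`` subclass, so the same sample can be passed on to further v2 transforms.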
diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index e81b6b138a2..1424621ef28 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -1,6 +1,6 @@ from typing import Any, Dict, Union -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms.v2 import functional as F, Transform @@ -10,20 +10,20 @@ class ConvertBoundingBoxFormat(Transform): .. v2betastatus:: ConvertBoundingBoxFormat transform Args: - format (str or datapoints.BoundingBoxFormat): output bounding box format. - Possible values are defined by :class:`~torchvision.datapoints.BoundingBoxFormat` and + format (str or vision_tensors.BoundingBoxFormat): output bounding box format. + Possible values are defined by :class:`~torchvision.vision_tensors.BoundingBoxFormat` and string values match the enums, e.g. "XYXY" or "XYWH" etc. """ - _transformed_types = (datapoints.BoundingBoxes,) + _transformed_types = (vision_tensors.BoundingBoxes,) - def __init__(self, format: Union[str, datapoints.BoundingBoxFormat]) -> None: + def __init__(self, format: Union[str, vision_tensors.BoundingBoxFormat]) -> None: super().__init__() if isinstance(format, str): - format = datapoints.BoundingBoxFormat[format] + format = vision_tensors.BoundingBoxFormat[format] self.format = format - def _transform(self, inpt: datapoints.BoundingBoxes, params: Dict[str, Any]) -> datapoints.BoundingBoxes: + def _transform(self, inpt: vision_tensors.BoundingBoxes, params: Dict[str, Any]) -> vision_tensors.BoundingBoxes: return F.convert_bounding_box_format(inpt, new_format=self.format) # type: ignore[return-value] @@ -36,7 +36,7 @@ class ClampBoundingBoxes(Transform): """ - _transformed_types = (datapoints.BoundingBoxes,) + _transformed_types = (vision_tensors.BoundingBoxes,) - def _transform(self, inpt: datapoints.BoundingBoxes, params: Dict[str, Any]) -> datapoints.BoundingBoxes: + def _transform(self, inpt: vision_tensors.BoundingBoxes, params: Dict[str, Any]) -> vision_tensors.BoundingBoxes: return F.clamp_bounding_boxes(inpt) # type: ignore[return-value] diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index c17530ecfb9..d5ce69b06e6 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -6,7 +6,7 @@ import torch from torch.utils._pytree import tree_flatten, tree_unflatten -from torchvision import datapoints, transforms as _transforms +from torchvision import transforms as _transforms, vision_tensors from torchvision.transforms.v2 import functional as F, Transform from ._utils import _parse_labels_getter, _setup_float_or_seq, _setup_size, get_bounding_boxes, has_any, is_pure_tensor @@ -74,7 +74,7 @@ class LinearTransformation(Transform): _v1_transform_cls = _transforms.LinearTransformation - _transformed_types = (is_pure_tensor, datapoints.Image, datapoints.Video) + _transformed_types = (is_pure_tensor, vision_tensors.Image, vision_tensors.Video) def __init__(self, transformation_matrix: torch.Tensor, mean_vector: torch.Tensor): super().__init__() @@ -129,8 +129,8 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: output = torch.mm(flat_inpt, transformation_matrix) output = output.reshape(shape) - if isinstance(inpt, (datapoints.Image, datapoints.Video)): - output = datapoints.wrap(output, like=inpt) + if isinstance(inpt, (vision_tensors.Image, vision_tensors.Video)): + output = vision_tensors.wrap(output, like=inpt) return output @@ -227,12 +227,12 @@ class 
ToDtype(Transform): ``ToDtype(dtype, scale=True)`` is the recommended replacement for ``ConvertImageDtype(dtype)``. Args: - dtype (``torch.dtype`` or dict of ``Datapoint`` -> ``torch.dtype``): The dtype to convert to. + dtype (``torch.dtype`` or dict of ``VisionTensor`` -> ``torch.dtype``): The dtype to convert to. If a ``torch.dtype`` is passed, e.g. ``torch.float32``, only images and videos will be converted to that dtype: this is for compatibility with :class:`~torchvision.transforms.v2.ConvertImageDtype`. - A dict can be passed to specify per-datapoint conversions, e.g. - ``dtype={datapoints.Image: torch.float32, datapoints.Mask: torch.int64, "others":None}``. The "others" - key can be used as a catch-all for any other datapoint type, and ``None`` means no conversion. + A dict can be passed to specify per-vision_tensor conversions, e.g. + ``dtype={vision_tensors.Image: torch.float32, vision_tensors.Mask: torch.int64, "others":None}``. The "others" + key can be used as a catch-all for any other vision_tensor type, and ``None`` means no conversion. scale (bool, optional): Whether to scale the values for images or videos. See :ref:`range_and_dtype`. Default: ``False``. """ @@ -250,12 +250,12 @@ def __init__( if ( isinstance(dtype, dict) and torch.Tensor in dtype - and any(cls in dtype for cls in [datapoints.Image, datapoints.Video]) + and any(cls in dtype for cls in [vision_tensors.Image, vision_tensors.Video]) ): warnings.warn( - "Got `dtype` values for `torch.Tensor` and either `datapoints.Image` or `datapoints.Video`. " + "Got `dtype` values for `torch.Tensor` and either `vision_tensors.Image` or `vision_tensors.Video`. " "Note that a plain `torch.Tensor` will *not* be transformed by this (or any other transformation) " - "in case a `datapoints.Image` or `datapoints.Video` is present in the input." + "in case a `vision_tensors.Image` or `vision_tensors.Video` is present in the input." ) self.dtype = dtype self.scale = scale @@ -264,7 +264,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: if isinstance(self.dtype, torch.dtype): # For consistency / BC with ConvertImageDtype, we only care about images or videos when dtype # is a simple torch.dtype - if not is_pure_tensor(inpt) and not isinstance(inpt, (datapoints.Image, datapoints.Video)): + if not is_pure_tensor(inpt) and not isinstance(inpt, (vision_tensors.Image, vision_tensors.Video)): return inpt dtype: Optional[torch.dtype] = self.dtype @@ -278,10 +278,10 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: "If you only need to convert the dtype of images or videos, you can just pass e.g. dtype=torch.float32. " "If you're passing a dict as dtype, " 'you can use "others" as a catch-all key ' - 'e.g. dtype={datapoints.Mask: torch.int64, "others": None} to pass-through the rest of the inputs.' + 'e.g. dtype={vision_tensors.Mask: torch.int64, "others": None} to pass-through the rest of the inputs.' 
) - supports_scaling = is_pure_tensor(inpt) or isinstance(inpt, (datapoints.Image, datapoints.Video)) + supports_scaling = is_pure_tensor(inpt) or isinstance(inpt, (vision_tensors.Image, vision_tensors.Video)) if dtype is None: if self.scale and supports_scaling: warnings.warn( @@ -389,10 +389,10 @@ def forward(self, *inputs: Any) -> Any: ) boxes = cast( - datapoints.BoundingBoxes, + vision_tensors.BoundingBoxes, F.convert_bounding_box_format( boxes, - new_format=datapoints.BoundingBoxFormat.XYXY, + new_format=vision_tensors.BoundingBoxFormat.XYXY, ), ) ws, hs = boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1] @@ -415,7 +415,7 @@ def forward(self, *inputs: Any) -> Any: def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: is_label = inpt is not None and inpt is params["labels"] - is_bounding_boxes_or_mask = isinstance(inpt, (datapoints.BoundingBoxes, datapoints.Mask)) + is_bounding_boxes_or_mask = isinstance(inpt, (vision_tensors.BoundingBoxes, vision_tensors.Mask)) if not (is_label or is_bounding_boxes_or_mask): return inpt @@ -425,4 +425,4 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: if is_label: return output - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) diff --git a/torchvision/transforms/v2/_transform.py b/torchvision/transforms/v2/_transform.py index f377c822a2d..357019a8d5b 100644 --- a/torchvision/transforms/v2/_transform.py +++ b/torchvision/transforms/v2/_transform.py @@ -7,7 +7,7 @@ import torch from torch import nn from torch.utils._pytree import tree_flatten, tree_unflatten -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms.v2._utils import check_type, has_any, is_pure_tensor from torchvision.utils import _log_api_usage_once @@ -56,8 +56,8 @@ def forward(self, *inputs: Any) -> Any: def _needs_transform_list(self, flat_inputs: List[Any]) -> List[bool]: # Below is a heuristic on how to deal with pure tensor inputs: - # 1. Pure tensors, i.e. tensors that are not a datapoint, are passed through if there is an explicit image - # (`datapoints.Image` or `PIL.Image.Image`) or video (`datapoints.Video`) in the sample. + # 1. Pure tensors, i.e. tensors that are not a vision_tensor, are passed through if there is an explicit image + # (`vision_tensors.Image` or `PIL.Image.Image`) or video (`vision_tensors.Video`) in the sample. # 2. If there is no explicit image or video in the sample, only the first encountered pure tensor is # transformed as image, while the rest is passed through. The order is defined by the returned `flat_inputs` # of `tree_flatten`, which recurses depth-first through the input. @@ -72,7 +72,7 @@ def _needs_transform_list(self, flat_inputs: List[Any]) -> List[bool]: # However, this case wasn't supported by transforms v1 either, so there is no BC concern. 
needs_transform_list = [] - transform_pure_tensor = not has_any(flat_inputs, datapoints.Image, datapoints.Video, PIL.Image.Image) + transform_pure_tensor = not has_any(flat_inputs, vision_tensors.Image, vision_tensors.Video, PIL.Image.Image) for inpt in flat_inputs: needs_transform = True diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py index e92c98e6cb3..f98a632c6b1 100644 --- a/torchvision/transforms/v2/_type_conversion.py +++ b/torchvision/transforms/v2/_type_conversion.py @@ -4,7 +4,7 @@ import PIL.Image import torch -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms.v2 import functional as F, Transform from torchvision.transforms.v2._utils import is_pure_tensor @@ -27,7 +27,7 @@ def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Ten class ToImage(Transform): - """[BETA] Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.datapoints.Image` + """[BETA] Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.vision_tensors.Image` ; this does not scale values. .. v2betastatus:: ToImage transform @@ -39,7 +39,7 @@ class ToImage(Transform): def _transform( self, inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray], params: Dict[str, Any] - ) -> datapoints.Image: + ) -> vision_tensors.Image: return F.to_image(inpt) @@ -66,7 +66,7 @@ class ToPILImage(Transform): .. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes """ - _transformed_types = (is_pure_tensor, datapoints.Image, np.ndarray) + _transformed_types = (is_pure_tensor, vision_tensors.Image, np.ndarray) def __init__(self, mode: Optional[str] = None) -> None: super().__init__() @@ -79,14 +79,14 @@ def _transform( class ToPureTensor(Transform): - """[BETA] Convert all datapoints to pure tensors, removing associated metadata (if any). + """[BETA] Convert all vision_tensors to pure tensors, removing associated metadata (if any). .. v2betastatus:: ToPureTensor transform This doesn't scale or change the values, only the type. 
""" - _transformed_types = (datapoints.Datapoint,) + _transformed_types = (vision_tensors.VisionTensor,) def _transform(self, inpt: Any, params: Dict[str, Any]) -> torch.Tensor: return inpt.as_subclass(torch.Tensor) diff --git a/torchvision/transforms/v2/_utils.py b/torchvision/transforms/v2/_utils.py index 6b327d45c0e..42f1d979d33 100644 --- a/torchvision/transforms/v2/_utils.py +++ b/torchvision/transforms/v2/_utils.py @@ -9,7 +9,7 @@ import PIL.Image import torch -from torchvision import datapoints +from torchvision import vision_tensors from torchvision._utils import sequence_to_str @@ -149,10 +149,10 @@ def _parse_labels_getter( raise ValueError(f"labels_getter should either be 'default', a callable, or None, but got {labels_getter}.") -def get_bounding_boxes(flat_inputs: List[Any]) -> datapoints.BoundingBoxes: +def get_bounding_boxes(flat_inputs: List[Any]) -> vision_tensors.BoundingBoxes: # This assumes there is only one bbox per sample as per the general convention try: - return next(inpt for inpt in flat_inputs if isinstance(inpt, datapoints.BoundingBoxes)) + return next(inpt for inpt in flat_inputs if isinstance(inpt, vision_tensors.BoundingBoxes)) except StopIteration: raise ValueError("No bounding boxes were found in the sample") @@ -161,7 +161,7 @@ def query_chw(flat_inputs: List[Any]) -> Tuple[int, int, int]: chws = { tuple(get_dimensions(inpt)) for inpt in flat_inputs - if check_type(inpt, (is_pure_tensor, datapoints.Image, PIL.Image.Image, datapoints.Video)) + if check_type(inpt, (is_pure_tensor, vision_tensors.Image, PIL.Image.Image, vision_tensors.Video)) } if not chws: raise TypeError("No image or video was found in the sample") @@ -179,11 +179,11 @@ def query_size(flat_inputs: List[Any]) -> Tuple[int, int]: inpt, ( is_pure_tensor, - datapoints.Image, + vision_tensors.Image, PIL.Image.Image, - datapoints.Video, - datapoints.Mask, - datapoints.BoundingBoxes, + vision_tensors.Video, + vision_tensors.Mask, + vision_tensors.BoundingBoxes, ), ) } diff --git a/torchvision/transforms/v2/functional/_augment.py b/torchvision/transforms/v2/functional/_augment.py index bc6c4030baf..620d91c0326 100644 --- a/torchvision/transforms/v2/functional/_augment.py +++ b/torchvision/transforms/v2/functional/_augment.py @@ -1,7 +1,7 @@ import PIL.Image import torch -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms.functional import pil_to_tensor, to_pil_image from torchvision.utils import _log_api_usage_once @@ -28,7 +28,7 @@ def erase( @_register_kernel_internal(erase, torch.Tensor) -@_register_kernel_internal(erase, datapoints.Image) +@_register_kernel_internal(erase, vision_tensors.Image) def erase_image( image: torch.Tensor, i: int, j: int, h: int, w: int, v: torch.Tensor, inplace: bool = False ) -> torch.Tensor: @@ -48,7 +48,7 @@ def _erase_image_pil( return to_pil_image(output, mode=image.mode) -@_register_kernel_internal(erase, datapoints.Video) +@_register_kernel_internal(erase, vision_tensors.Video) def erase_video( video: torch.Tensor, i: int, j: int, h: int, w: int, v: torch.Tensor, inplace: bool = False ) -> torch.Tensor: diff --git a/torchvision/transforms/v2/functional/_color.py b/torchvision/transforms/v2/functional/_color.py index f5c3fa69e55..d04d264bb57 100644 --- a/torchvision/transforms/v2/functional/_color.py +++ b/torchvision/transforms/v2/functional/_color.py @@ -3,7 +3,7 @@ import PIL.Image import torch from torch.nn.functional import conv2d -from torchvision import datapoints +from torchvision import vision_tensors 
from torchvision.transforms import _functional_pil as _FP from torchvision.transforms._functional_tensor import _max_value @@ -47,7 +47,7 @@ def _rgb_to_grayscale_image( @_register_kernel_internal(rgb_to_grayscale, torch.Tensor) -@_register_kernel_internal(rgb_to_grayscale, datapoints.Image) +@_register_kernel_internal(rgb_to_grayscale, vision_tensors.Image) def rgb_to_grayscale_image(image: torch.Tensor, num_output_channels: int = 1) -> torch.Tensor: if num_output_channels not in (1, 3): raise ValueError(f"num_output_channels must be 1 or 3, got {num_output_channels}.") @@ -82,7 +82,7 @@ def adjust_brightness(inpt: torch.Tensor, brightness_factor: float) -> torch.Ten @_register_kernel_internal(adjust_brightness, torch.Tensor) -@_register_kernel_internal(adjust_brightness, datapoints.Image) +@_register_kernel_internal(adjust_brightness, vision_tensors.Image) def adjust_brightness_image(image: torch.Tensor, brightness_factor: float) -> torch.Tensor: if brightness_factor < 0: raise ValueError(f"brightness_factor ({brightness_factor}) is not non-negative.") @@ -102,7 +102,7 @@ def _adjust_brightness_image_pil(image: PIL.Image.Image, brightness_factor: floa return _FP.adjust_brightness(image, brightness_factor=brightness_factor) -@_register_kernel_internal(adjust_brightness, datapoints.Video) +@_register_kernel_internal(adjust_brightness, vision_tensors.Video) def adjust_brightness_video(video: torch.Tensor, brightness_factor: float) -> torch.Tensor: return adjust_brightness_image(video, brightness_factor=brightness_factor) @@ -119,7 +119,7 @@ def adjust_saturation(inpt: torch.Tensor, saturation_factor: float) -> torch.Ten @_register_kernel_internal(adjust_saturation, torch.Tensor) -@_register_kernel_internal(adjust_saturation, datapoints.Image) +@_register_kernel_internal(adjust_saturation, vision_tensors.Image) def adjust_saturation_image(image: torch.Tensor, saturation_factor: float) -> torch.Tensor: if saturation_factor < 0: raise ValueError(f"saturation_factor ({saturation_factor}) is not non-negative.") @@ -141,7 +141,7 @@ def adjust_saturation_image(image: torch.Tensor, saturation_factor: float) -> to _adjust_saturation_image_pil = _register_kernel_internal(adjust_saturation, PIL.Image.Image)(_FP.adjust_saturation) -@_register_kernel_internal(adjust_saturation, datapoints.Video) +@_register_kernel_internal(adjust_saturation, vision_tensors.Video) def adjust_saturation_video(video: torch.Tensor, saturation_factor: float) -> torch.Tensor: return adjust_saturation_image(video, saturation_factor=saturation_factor) @@ -158,7 +158,7 @@ def adjust_contrast(inpt: torch.Tensor, contrast_factor: float) -> torch.Tensor: @_register_kernel_internal(adjust_contrast, torch.Tensor) -@_register_kernel_internal(adjust_contrast, datapoints.Image) +@_register_kernel_internal(adjust_contrast, vision_tensors.Image) def adjust_contrast_image(image: torch.Tensor, contrast_factor: float) -> torch.Tensor: if contrast_factor < 0: raise ValueError(f"contrast_factor ({contrast_factor}) is not non-negative.") @@ -180,7 +180,7 @@ def adjust_contrast_image(image: torch.Tensor, contrast_factor: float) -> torch. 
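The color hunks above only swap ``datapoints`` for ``vision_tensors`` in the registration decorators, but those decorators are what make a single functional work across input types: the registry picks the image or video kernel from the input's class and re-wraps the result. A minimal usage sketch, assuming the ``vision_tensors`` layout this patch introduces (tensor values are placeholders):

import torch
from torchvision import vision_tensors
from torchvision.transforms.v2 import functional as F

img = vision_tensors.Image(torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8))
video = vision_tensors.Video(torch.randint(0, 256, (4, 3, 32, 32), dtype=torch.uint8))

# adjust_brightness dispatches to adjust_brightness_image / adjust_brightness_video;
# both outputs keep their VisionTensor subclass thanks to the registered wrapper.
out_img = F.adjust_brightness(img, brightness_factor=1.5)
out_video = F.adjust_brightness(video, brightness_factor=1.5)
print(type(out_img).__name__, type(out_video).__name__)  # Image Video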
_adjust_contrast_image_pil = _register_kernel_internal(adjust_contrast, PIL.Image.Image)(_FP.adjust_contrast) -@_register_kernel_internal(adjust_contrast, datapoints.Video) +@_register_kernel_internal(adjust_contrast, vision_tensors.Video) def adjust_contrast_video(video: torch.Tensor, contrast_factor: float) -> torch.Tensor: return adjust_contrast_image(video, contrast_factor=contrast_factor) @@ -197,7 +197,7 @@ def adjust_sharpness(inpt: torch.Tensor, sharpness_factor: float) -> torch.Tenso @_register_kernel_internal(adjust_sharpness, torch.Tensor) -@_register_kernel_internal(adjust_sharpness, datapoints.Image) +@_register_kernel_internal(adjust_sharpness, vision_tensors.Image) def adjust_sharpness_image(image: torch.Tensor, sharpness_factor: float) -> torch.Tensor: num_channels, height, width = image.shape[-3:] if num_channels not in (1, 3): @@ -253,7 +253,7 @@ def adjust_sharpness_image(image: torch.Tensor, sharpness_factor: float) -> torc _adjust_sharpness_image_pil = _register_kernel_internal(adjust_sharpness, PIL.Image.Image)(_FP.adjust_sharpness) -@_register_kernel_internal(adjust_sharpness, datapoints.Video) +@_register_kernel_internal(adjust_sharpness, vision_tensors.Video) def adjust_sharpness_video(video: torch.Tensor, sharpness_factor: float) -> torch.Tensor: return adjust_sharpness_image(video, sharpness_factor=sharpness_factor) @@ -340,7 +340,7 @@ def _hsv_to_rgb(img: torch.Tensor) -> torch.Tensor: @_register_kernel_internal(adjust_hue, torch.Tensor) -@_register_kernel_internal(adjust_hue, datapoints.Image) +@_register_kernel_internal(adjust_hue, vision_tensors.Image) def adjust_hue_image(image: torch.Tensor, hue_factor: float) -> torch.Tensor: if not (-0.5 <= hue_factor <= 0.5): raise ValueError(f"hue_factor ({hue_factor}) is not in [-0.5, 0.5].") @@ -371,7 +371,7 @@ def adjust_hue_image(image: torch.Tensor, hue_factor: float) -> torch.Tensor: _adjust_hue_image_pil = _register_kernel_internal(adjust_hue, PIL.Image.Image)(_FP.adjust_hue) -@_register_kernel_internal(adjust_hue, datapoints.Video) +@_register_kernel_internal(adjust_hue, vision_tensors.Video) def adjust_hue_video(video: torch.Tensor, hue_factor: float) -> torch.Tensor: return adjust_hue_image(video, hue_factor=hue_factor) @@ -388,7 +388,7 @@ def adjust_gamma(inpt: torch.Tensor, gamma: float, gain: float = 1) -> torch.Ten @_register_kernel_internal(adjust_gamma, torch.Tensor) -@_register_kernel_internal(adjust_gamma, datapoints.Image) +@_register_kernel_internal(adjust_gamma, vision_tensors.Image) def adjust_gamma_image(image: torch.Tensor, gamma: float, gain: float = 1.0) -> torch.Tensor: if gamma < 0: raise ValueError("Gamma should be a non-negative real number") @@ -411,7 +411,7 @@ def adjust_gamma_image(image: torch.Tensor, gamma: float, gain: float = 1.0) -> _adjust_gamma_image_pil = _register_kernel_internal(adjust_gamma, PIL.Image.Image)(_FP.adjust_gamma) -@_register_kernel_internal(adjust_gamma, datapoints.Video) +@_register_kernel_internal(adjust_gamma, vision_tensors.Video) def adjust_gamma_video(video: torch.Tensor, gamma: float, gain: float = 1) -> torch.Tensor: return adjust_gamma_image(video, gamma=gamma, gain=gain) @@ -428,7 +428,7 @@ def posterize(inpt: torch.Tensor, bits: int) -> torch.Tensor: @_register_kernel_internal(posterize, torch.Tensor) -@_register_kernel_internal(posterize, datapoints.Image) +@_register_kernel_internal(posterize, vision_tensors.Image) def posterize_image(image: torch.Tensor, bits: int) -> torch.Tensor: if image.is_floating_point(): levels = 1 << bits @@ -445,7 +445,7 @@ 
def posterize_image(image: torch.Tensor, bits: int) -> torch.Tensor: _posterize_image_pil = _register_kernel_internal(posterize, PIL.Image.Image)(_FP.posterize) -@_register_kernel_internal(posterize, datapoints.Video) +@_register_kernel_internal(posterize, vision_tensors.Video) def posterize_video(video: torch.Tensor, bits: int) -> torch.Tensor: return posterize_image(video, bits=bits) @@ -462,7 +462,7 @@ def solarize(inpt: torch.Tensor, threshold: float) -> torch.Tensor: @_register_kernel_internal(solarize, torch.Tensor) -@_register_kernel_internal(solarize, datapoints.Image) +@_register_kernel_internal(solarize, vision_tensors.Image) def solarize_image(image: torch.Tensor, threshold: float) -> torch.Tensor: if threshold > _max_value(image.dtype): raise TypeError(f"Threshold should be less or equal the maximum value of the dtype, but got {threshold}") @@ -473,7 +473,7 @@ def solarize_image(image: torch.Tensor, threshold: float) -> torch.Tensor: _solarize_image_pil = _register_kernel_internal(solarize, PIL.Image.Image)(_FP.solarize) -@_register_kernel_internal(solarize, datapoints.Video) +@_register_kernel_internal(solarize, vision_tensors.Video) def solarize_video(video: torch.Tensor, threshold: float) -> torch.Tensor: return solarize_image(video, threshold=threshold) @@ -490,7 +490,7 @@ def autocontrast(inpt: torch.Tensor) -> torch.Tensor: @_register_kernel_internal(autocontrast, torch.Tensor) -@_register_kernel_internal(autocontrast, datapoints.Image) +@_register_kernel_internal(autocontrast, vision_tensors.Image) def autocontrast_image(image: torch.Tensor) -> torch.Tensor: c = image.shape[-3] if c not in [1, 3]: @@ -523,7 +523,7 @@ def autocontrast_image(image: torch.Tensor) -> torch.Tensor: _autocontrast_image_pil = _register_kernel_internal(autocontrast, PIL.Image.Image)(_FP.autocontrast) -@_register_kernel_internal(autocontrast, datapoints.Video) +@_register_kernel_internal(autocontrast, vision_tensors.Video) def autocontrast_video(video: torch.Tensor) -> torch.Tensor: return autocontrast_image(video) @@ -540,7 +540,7 @@ def equalize(inpt: torch.Tensor) -> torch.Tensor: @_register_kernel_internal(equalize, torch.Tensor) -@_register_kernel_internal(equalize, datapoints.Image) +@_register_kernel_internal(equalize, vision_tensors.Image) def equalize_image(image: torch.Tensor) -> torch.Tensor: if image.numel() == 0: return image @@ -613,7 +613,7 @@ def equalize_image(image: torch.Tensor) -> torch.Tensor: _equalize_image_pil = _register_kernel_internal(equalize, PIL.Image.Image)(_FP.equalize) -@_register_kernel_internal(equalize, datapoints.Video) +@_register_kernel_internal(equalize, vision_tensors.Video) def equalize_video(video: torch.Tensor) -> torch.Tensor: return equalize_image(video) @@ -630,7 +630,7 @@ def invert(inpt: torch.Tensor) -> torch.Tensor: @_register_kernel_internal(invert, torch.Tensor) -@_register_kernel_internal(invert, datapoints.Image) +@_register_kernel_internal(invert, vision_tensors.Image) def invert_image(image: torch.Tensor) -> torch.Tensor: if image.is_floating_point(): return 1.0 - image @@ -644,7 +644,7 @@ def invert_image(image: torch.Tensor) -> torch.Tensor: _invert_image_pil = _register_kernel_internal(invert, PIL.Image.Image)(_FP.invert) -@_register_kernel_internal(invert, datapoints.Video) +@_register_kernel_internal(invert, vision_tensors.Video) def invert_video(video: torch.Tensor) -> torch.Tensor: return invert_image(video) @@ -653,7 +653,7 @@ def permute_channels(inpt: torch.Tensor, permutation: List[int]) -> torch.Tensor """Permute the channels of 
the input according to the given permutation. This function supports plain :class:`~torch.Tensor`'s, :class:`PIL.Image.Image`'s, and - :class:`torchvision.datapoints.Image` and :class:`torchvision.datapoints.Video`. + :class:`torchvision.vision_tensors.Image` and :class:`torchvision.vision_tensors.Video`. Example: >>> rgb_image = torch.rand(3, 256, 256) @@ -681,7 +681,7 @@ def permute_channels(inpt: torch.Tensor, permutation: List[int]) -> torch.Tensor @_register_kernel_internal(permute_channels, torch.Tensor) -@_register_kernel_internal(permute_channels, datapoints.Image) +@_register_kernel_internal(permute_channels, vision_tensors.Image) def permute_channels_image(image: torch.Tensor, permutation: List[int]) -> torch.Tensor: shape = image.shape num_channels, height, width = shape[-3:] @@ -704,6 +704,6 @@ def _permute_channels_image_pil(image: PIL.Image.Image, permutation: List[int]) return to_pil_image(permute_channels_image(pil_to_tensor(image), permutation=permutation)) -@_register_kernel_internal(permute_channels, datapoints.Video) +@_register_kernel_internal(permute_channels, vision_tensors.Video) def permute_channels_video(video: torch.Tensor, permutation: List[int]) -> torch.Tensor: return permute_channels_image(video, permutation=permutation) diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py index abc3716cfa9..0ae2c160375 100644 --- a/torchvision/transforms/v2/functional/_geometry.py +++ b/torchvision/transforms/v2/functional/_geometry.py @@ -7,7 +7,7 @@ import torch from torch.nn.functional import grid_sample, interpolate, pad as torch_pad -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms import _functional_pil as _FP from torchvision.transforms._functional_tensor import _pad_symmetric from torchvision.transforms.functional import ( @@ -51,7 +51,7 @@ def horizontal_flip(inpt: torch.Tensor) -> torch.Tensor: @_register_kernel_internal(horizontal_flip, torch.Tensor) -@_register_kernel_internal(horizontal_flip, datapoints.Image) +@_register_kernel_internal(horizontal_flip, vision_tensors.Image) def horizontal_flip_image(image: torch.Tensor) -> torch.Tensor: return image.flip(-1) @@ -61,37 +61,37 @@ def _horizontal_flip_image_pil(image: PIL.Image.Image) -> PIL.Image.Image: return _FP.hflip(image) -@_register_kernel_internal(horizontal_flip, datapoints.Mask) +@_register_kernel_internal(horizontal_flip, vision_tensors.Mask) def horizontal_flip_mask(mask: torch.Tensor) -> torch.Tensor: return horizontal_flip_image(mask) def horizontal_flip_bounding_boxes( - bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, canvas_size: Tuple[int, int] + bounding_boxes: torch.Tensor, format: vision_tensors.BoundingBoxFormat, canvas_size: Tuple[int, int] ) -> torch.Tensor: shape = bounding_boxes.shape bounding_boxes = bounding_boxes.clone().reshape(-1, 4) - if format == datapoints.BoundingBoxFormat.XYXY: + if format == vision_tensors.BoundingBoxFormat.XYXY: bounding_boxes[:, [2, 0]] = bounding_boxes[:, [0, 2]].sub_(canvas_size[1]).neg_() - elif format == datapoints.BoundingBoxFormat.XYWH: + elif format == vision_tensors.BoundingBoxFormat.XYWH: bounding_boxes[:, 0].add_(bounding_boxes[:, 2]).sub_(canvas_size[1]).neg_() - else: # format == datapoints.BoundingBoxFormat.CXCYWH: + else: # format == vision_tensors.BoundingBoxFormat.CXCYWH: bounding_boxes[:, 0].sub_(canvas_size[1]).neg_() return bounding_boxes.reshape(shape) -@_register_kernel_internal(horizontal_flip, 
datapoints.BoundingBoxes, datapoint_wrapper=False) -def _horizontal_flip_bounding_boxes_dispatch(inpt: datapoints.BoundingBoxes) -> datapoints.BoundingBoxes: +@_register_kernel_internal(horizontal_flip, vision_tensors.BoundingBoxes, vision_tensor_wrapper=False) +def _horizontal_flip_bounding_boxes_dispatch(inpt: vision_tensors.BoundingBoxes) -> vision_tensors.BoundingBoxes: output = horizontal_flip_bounding_boxes( inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size ) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) -@_register_kernel_internal(horizontal_flip, datapoints.Video) +@_register_kernel_internal(horizontal_flip, vision_tensors.Video) def horizontal_flip_video(video: torch.Tensor) -> torch.Tensor: return horizontal_flip_image(video) @@ -108,7 +108,7 @@ def vertical_flip(inpt: torch.Tensor) -> torch.Tensor: @_register_kernel_internal(vertical_flip, torch.Tensor) -@_register_kernel_internal(vertical_flip, datapoints.Image) +@_register_kernel_internal(vertical_flip, vision_tensors.Image) def vertical_flip_image(image: torch.Tensor) -> torch.Tensor: return image.flip(-2) @@ -118,37 +118,37 @@ def _vertical_flip_image_pil(image: PIL.Image) -> PIL.Image: return _FP.vflip(image) -@_register_kernel_internal(vertical_flip, datapoints.Mask) +@_register_kernel_internal(vertical_flip, vision_tensors.Mask) def vertical_flip_mask(mask: torch.Tensor) -> torch.Tensor: return vertical_flip_image(mask) def vertical_flip_bounding_boxes( - bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, canvas_size: Tuple[int, int] + bounding_boxes: torch.Tensor, format: vision_tensors.BoundingBoxFormat, canvas_size: Tuple[int, int] ) -> torch.Tensor: shape = bounding_boxes.shape bounding_boxes = bounding_boxes.clone().reshape(-1, 4) - if format == datapoints.BoundingBoxFormat.XYXY: + if format == vision_tensors.BoundingBoxFormat.XYXY: bounding_boxes[:, [1, 3]] = bounding_boxes[:, [3, 1]].sub_(canvas_size[0]).neg_() - elif format == datapoints.BoundingBoxFormat.XYWH: + elif format == vision_tensors.BoundingBoxFormat.XYWH: bounding_boxes[:, 1].add_(bounding_boxes[:, 3]).sub_(canvas_size[0]).neg_() - else: # format == datapoints.BoundingBoxFormat.CXCYWH: + else: # format == vision_tensors.BoundingBoxFormat.CXCYWH: bounding_boxes[:, 1].sub_(canvas_size[0]).neg_() return bounding_boxes.reshape(shape) -@_register_kernel_internal(vertical_flip, datapoints.BoundingBoxes, datapoint_wrapper=False) -def _vertical_flip_bounding_boxes_dispatch(inpt: datapoints.BoundingBoxes) -> datapoints.BoundingBoxes: +@_register_kernel_internal(vertical_flip, vision_tensors.BoundingBoxes, vision_tensor_wrapper=False) +def _vertical_flip_bounding_boxes_dispatch(inpt: vision_tensors.BoundingBoxes) -> vision_tensors.BoundingBoxes: output = vertical_flip_bounding_boxes( inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size ) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) -@_register_kernel_internal(vertical_flip, datapoints.Video) +@_register_kernel_internal(vertical_flip, vision_tensors.Video) def vertical_flip_video(video: torch.Tensor) -> torch.Tensor: return vertical_flip_image(video) @@ -190,7 +190,7 @@ def resize( @_register_kernel_internal(resize, torch.Tensor) -@_register_kernel_internal(resize, datapoints.Image) +@_register_kernel_internal(resize, vision_tensors.Image) def resize_image( image: torch.Tensor, size: List[int], @@ -319,12 +319,12 @@ def resize_mask(mask: 
torch.Tensor, size: List[int], max_size: Optional[int] = N return output -@_register_kernel_internal(resize, datapoints.Mask, datapoint_wrapper=False) +@_register_kernel_internal(resize, vision_tensors.Mask, vision_tensor_wrapper=False) def _resize_mask_dispatch( - inpt: datapoints.Mask, size: List[int], max_size: Optional[int] = None, **kwargs: Any -) -> datapoints.Mask: + inpt: vision_tensors.Mask, size: List[int], max_size: Optional[int] = None, **kwargs: Any +) -> vision_tensors.Mask: output = resize_mask(inpt.as_subclass(torch.Tensor), size, max_size=max_size) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) def resize_bounding_boxes( @@ -345,17 +345,17 @@ def resize_bounding_boxes( ) -@_register_kernel_internal(resize, datapoints.BoundingBoxes, datapoint_wrapper=False) +@_register_kernel_internal(resize, vision_tensors.BoundingBoxes, vision_tensor_wrapper=False) def _resize_bounding_boxes_dispatch( - inpt: datapoints.BoundingBoxes, size: List[int], max_size: Optional[int] = None, **kwargs: Any -) -> datapoints.BoundingBoxes: + inpt: vision_tensors.BoundingBoxes, size: List[int], max_size: Optional[int] = None, **kwargs: Any +) -> vision_tensors.BoundingBoxes: output, canvas_size = resize_bounding_boxes( inpt.as_subclass(torch.Tensor), inpt.canvas_size, size, max_size=max_size ) - return datapoints.wrap(output, like=inpt, canvas_size=canvas_size) + return vision_tensors.wrap(output, like=inpt, canvas_size=canvas_size) -@_register_kernel_internal(resize, datapoints.Video) +@_register_kernel_internal(resize, vision_tensors.Video) def resize_video( video: torch.Tensor, size: List[int], @@ -651,7 +651,7 @@ def _affine_grid( @_register_kernel_internal(affine, torch.Tensor) -@_register_kernel_internal(affine, datapoints.Image) +@_register_kernel_internal(affine, vision_tensors.Image) def affine_image( image: torch.Tensor, angle: Union[int, float], @@ -730,7 +730,7 @@ def _affine_image_pil( def _affine_bounding_boxes_with_expand( bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, + format: vision_tensors.BoundingBoxFormat, canvas_size: Tuple[int, int], angle: Union[int, float], translate: List[float], @@ -749,7 +749,7 @@ def _affine_bounding_boxes_with_expand( device = bounding_boxes.device bounding_boxes = ( convert_bounding_box_format( - bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True + bounding_boxes, old_format=format, new_format=vision_tensors.BoundingBoxFormat.XYXY, inplace=True ) ).reshape(-1, 4) @@ -808,9 +808,9 @@ def _affine_bounding_boxes_with_expand( new_width, new_height = _compute_affine_output_size(affine_vector, width, height) canvas_size = (new_height, new_width) - out_bboxes = clamp_bounding_boxes(out_bboxes, format=datapoints.BoundingBoxFormat.XYXY, canvas_size=canvas_size) + out_bboxes = clamp_bounding_boxes(out_bboxes, format=vision_tensors.BoundingBoxFormat.XYXY, canvas_size=canvas_size) out_bboxes = convert_bounding_box_format( - out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True + out_bboxes, old_format=vision_tensors.BoundingBoxFormat.XYXY, new_format=format, inplace=True ).reshape(original_shape) out_bboxes = out_bboxes.to(original_dtype) @@ -819,7 +819,7 @@ def _affine_bounding_boxes_with_expand( def affine_bounding_boxes( bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, + format: vision_tensors.BoundingBoxFormat, canvas_size: Tuple[int, int], angle: Union[int, float], translate: List[float], @@ 
-841,16 +841,16 @@ def affine_bounding_boxes( return out_box -@_register_kernel_internal(affine, datapoints.BoundingBoxes, datapoint_wrapper=False) +@_register_kernel_internal(affine, vision_tensors.BoundingBoxes, vision_tensor_wrapper=False) def _affine_bounding_boxes_dispatch( - inpt: datapoints.BoundingBoxes, + inpt: vision_tensors.BoundingBoxes, angle: Union[int, float], translate: List[float], scale: float, shear: List[float], center: Optional[List[float]] = None, **kwargs, -) -> datapoints.BoundingBoxes: +) -> vision_tensors.BoundingBoxes: output = affine_bounding_boxes( inpt.as_subclass(torch.Tensor), format=inpt.format, @@ -861,7 +861,7 @@ def _affine_bounding_boxes_dispatch( shear=shear, center=center, ) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) def affine_mask( @@ -896,9 +896,9 @@ def affine_mask( return output -@_register_kernel_internal(affine, datapoints.Mask, datapoint_wrapper=False) +@_register_kernel_internal(affine, vision_tensors.Mask, vision_tensor_wrapper=False) def _affine_mask_dispatch( - inpt: datapoints.Mask, + inpt: vision_tensors.Mask, angle: Union[int, float], translate: List[float], scale: float, @@ -906,7 +906,7 @@ def _affine_mask_dispatch( fill: _FillTypeJIT = None, center: Optional[List[float]] = None, **kwargs, -) -> datapoints.Mask: +) -> vision_tensors.Mask: output = affine_mask( inpt.as_subclass(torch.Tensor), angle=angle, @@ -916,10 +916,10 @@ def _affine_mask_dispatch( fill=fill, center=center, ) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) -@_register_kernel_internal(affine, datapoints.Video) +@_register_kernel_internal(affine, vision_tensors.Video) def affine_video( video: torch.Tensor, angle: Union[int, float], @@ -961,7 +961,7 @@ def rotate( @_register_kernel_internal(rotate, torch.Tensor) -@_register_kernel_internal(rotate, datapoints.Image) +@_register_kernel_internal(rotate, vision_tensors.Image) def rotate_image( image: torch.Tensor, angle: float, @@ -1027,7 +1027,7 @@ def _rotate_image_pil( def rotate_bounding_boxes( bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, + format: vision_tensors.BoundingBoxFormat, canvas_size: Tuple[int, int], angle: float, expand: bool = False, @@ -1049,10 +1049,14 @@ def rotate_bounding_boxes( ) -@_register_kernel_internal(rotate, datapoints.BoundingBoxes, datapoint_wrapper=False) +@_register_kernel_internal(rotate, vision_tensors.BoundingBoxes, vision_tensor_wrapper=False) def _rotate_bounding_boxes_dispatch( - inpt: datapoints.BoundingBoxes, angle: float, expand: bool = False, center: Optional[List[float]] = None, **kwargs -) -> datapoints.BoundingBoxes: + inpt: vision_tensors.BoundingBoxes, + angle: float, + expand: bool = False, + center: Optional[List[float]] = None, + **kwargs, +) -> vision_tensors.BoundingBoxes: output, canvas_size = rotate_bounding_boxes( inpt.as_subclass(torch.Tensor), format=inpt.format, @@ -1061,7 +1065,7 @@ def _rotate_bounding_boxes_dispatch( expand=expand, center=center, ) - return datapoints.wrap(output, like=inpt, canvas_size=canvas_size) + return vision_tensors.wrap(output, like=inpt, canvas_size=canvas_size) def rotate_mask( @@ -1092,20 +1096,20 @@ def rotate_mask( return output -@_register_kernel_internal(rotate, datapoints.Mask, datapoint_wrapper=False) +@_register_kernel_internal(rotate, vision_tensors.Mask, vision_tensor_wrapper=False) def _rotate_mask_dispatch( - inpt: datapoints.Mask, + inpt: vision_tensors.Mask, angle: float, expand: bool = False, center: 
Optional[List[float]] = None, fill: _FillTypeJIT = None, **kwargs, -) -> datapoints.Mask: +) -> vision_tensors.Mask: output = rotate_mask(inpt.as_subclass(torch.Tensor), angle=angle, expand=expand, fill=fill, center=center) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) -@_register_kernel_internal(rotate, datapoints.Video) +@_register_kernel_internal(rotate, vision_tensors.Video) def rotate_video( video: torch.Tensor, angle: float, @@ -1158,7 +1162,7 @@ def _parse_pad_padding(padding: Union[int, List[int]]) -> List[int]: @_register_kernel_internal(pad, torch.Tensor) -@_register_kernel_internal(pad, datapoints.Image) +@_register_kernel_internal(pad, vision_tensors.Image) def pad_image( image: torch.Tensor, padding: List[int], @@ -1260,7 +1264,7 @@ def _pad_with_vector_fill( _pad_image_pil = _register_kernel_internal(pad, PIL.Image.Image)(_FP.pad) -@_register_kernel_internal(pad, datapoints.Mask) +@_register_kernel_internal(pad, vision_tensors.Mask) def pad_mask( mask: torch.Tensor, padding: List[int], @@ -1289,7 +1293,7 @@ def pad_mask( def pad_bounding_boxes( bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, + format: vision_tensors.BoundingBoxFormat, canvas_size: Tuple[int, int], padding: List[int], padding_mode: str = "constant", @@ -1300,7 +1304,7 @@ def pad_bounding_boxes( left, right, top, bottom = _parse_pad_padding(padding) - if format == datapoints.BoundingBoxFormat.XYXY: + if format == vision_tensors.BoundingBoxFormat.XYXY: pad = [left, top, left, top] else: pad = [left, top, 0, 0] @@ -1314,10 +1318,10 @@ def pad_bounding_boxes( return clamp_bounding_boxes(bounding_boxes, format=format, canvas_size=canvas_size), canvas_size -@_register_kernel_internal(pad, datapoints.BoundingBoxes, datapoint_wrapper=False) +@_register_kernel_internal(pad, vision_tensors.BoundingBoxes, vision_tensor_wrapper=False) def _pad_bounding_boxes_dispatch( - inpt: datapoints.BoundingBoxes, padding: List[int], padding_mode: str = "constant", **kwargs -) -> datapoints.BoundingBoxes: + inpt: vision_tensors.BoundingBoxes, padding: List[int], padding_mode: str = "constant", **kwargs +) -> vision_tensors.BoundingBoxes: output, canvas_size = pad_bounding_boxes( inpt.as_subclass(torch.Tensor), format=inpt.format, @@ -1325,10 +1329,10 @@ def _pad_bounding_boxes_dispatch( padding=padding, padding_mode=padding_mode, ) - return datapoints.wrap(output, like=inpt, canvas_size=canvas_size) + return vision_tensors.wrap(output, like=inpt, canvas_size=canvas_size) -@_register_kernel_internal(pad, datapoints.Video) +@_register_kernel_internal(pad, vision_tensors.Video) def pad_video( video: torch.Tensor, padding: List[int], @@ -1350,7 +1354,7 @@ def crop(inpt: torch.Tensor, top: int, left: int, height: int, width: int) -> to @_register_kernel_internal(crop, torch.Tensor) -@_register_kernel_internal(crop, datapoints.Image) +@_register_kernel_internal(crop, vision_tensors.Image) def crop_image(image: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor: h, w = image.shape[-2:] @@ -1375,7 +1379,7 @@ def crop_image(image: torch.Tensor, top: int, left: int, height: int, width: int def crop_bounding_boxes( bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, + format: vision_tensors.BoundingBoxFormat, top: int, left: int, height: int, @@ -1383,7 +1387,7 @@ def crop_bounding_boxes( ) -> Tuple[torch.Tensor, Tuple[int, int]]: # Crop or implicit pad if left and/or top have negative values: - if format == 
datapoints.BoundingBoxFormat.XYXY: + if format == vision_tensors.BoundingBoxFormat.XYXY: sub = [left, top, left, top] else: sub = [left, top, 0, 0] @@ -1394,17 +1398,17 @@ def crop_bounding_boxes( return clamp_bounding_boxes(bounding_boxes, format=format, canvas_size=canvas_size), canvas_size -@_register_kernel_internal(crop, datapoints.BoundingBoxes, datapoint_wrapper=False) +@_register_kernel_internal(crop, vision_tensors.BoundingBoxes, vision_tensor_wrapper=False) def _crop_bounding_boxes_dispatch( - inpt: datapoints.BoundingBoxes, top: int, left: int, height: int, width: int -) -> datapoints.BoundingBoxes: + inpt: vision_tensors.BoundingBoxes, top: int, left: int, height: int, width: int +) -> vision_tensors.BoundingBoxes: output, canvas_size = crop_bounding_boxes( inpt.as_subclass(torch.Tensor), format=inpt.format, top=top, left=left, height=height, width=width ) - return datapoints.wrap(output, like=inpt, canvas_size=canvas_size) + return vision_tensors.wrap(output, like=inpt, canvas_size=canvas_size) -@_register_kernel_internal(crop, datapoints.Mask) +@_register_kernel_internal(crop, vision_tensors.Mask) def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor: if mask.ndim < 3: mask = mask.unsqueeze(0) @@ -1420,7 +1424,7 @@ def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) return output -@_register_kernel_internal(crop, datapoints.Video) +@_register_kernel_internal(crop, vision_tensors.Video) def crop_video(video: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor: return crop_image(video, top, left, height, width) @@ -1505,7 +1509,7 @@ def _perspective_coefficients( @_register_kernel_internal(perspective, torch.Tensor) -@_register_kernel_internal(perspective, datapoints.Image) +@_register_kernel_internal(perspective, vision_tensors.Image) def perspective_image( image: torch.Tensor, startpoints: Optional[List[List[int]]], @@ -1568,7 +1572,7 @@ def _perspective_image_pil( def perspective_bounding_boxes( bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, + format: vision_tensors.BoundingBoxFormat, canvas_size: Tuple[int, int], startpoints: Optional[List[List[int]]], endpoints: Optional[List[List[int]]], @@ -1582,7 +1586,7 @@ def perspective_bounding_boxes( original_shape = bounding_boxes.shape # TODO: first cast to float if bbox is int64 before convert_bounding_box_format bounding_boxes = ( - convert_bounding_box_format(bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY) + convert_bounding_box_format(bounding_boxes, old_format=format, new_format=vision_tensors.BoundingBoxFormat.XYXY) ).reshape(-1, 4) dtype = bounding_boxes.dtype if torch.is_floating_point(bounding_boxes) else torch.float32 @@ -1649,25 +1653,25 @@ def perspective_bounding_boxes( out_bboxes = clamp_bounding_boxes( torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_boxes.dtype), - format=datapoints.BoundingBoxFormat.XYXY, + format=vision_tensors.BoundingBoxFormat.XYXY, canvas_size=canvas_size, ) # out_bboxes should be of shape [N boxes, 4] return convert_bounding_box_format( - out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True + out_bboxes, old_format=vision_tensors.BoundingBoxFormat.XYXY, new_format=format, inplace=True ).reshape(original_shape) -@_register_kernel_internal(perspective, datapoints.BoundingBoxes, datapoint_wrapper=False) +@_register_kernel_internal(perspective, vision_tensors.BoundingBoxes, 
vision_tensor_wrapper=False) def _perspective_bounding_boxes_dispatch( - inpt: datapoints.BoundingBoxes, + inpt: vision_tensors.BoundingBoxes, startpoints: Optional[List[List[int]]], endpoints: Optional[List[List[int]]], coefficients: Optional[List[float]] = None, **kwargs, -) -> datapoints.BoundingBoxes: +) -> vision_tensors.BoundingBoxes: output = perspective_bounding_boxes( inpt.as_subclass(torch.Tensor), format=inpt.format, @@ -1676,7 +1680,7 @@ def _perspective_bounding_boxes_dispatch( endpoints=endpoints, coefficients=coefficients, ) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) def perspective_mask( @@ -1702,15 +1706,15 @@ def perspective_mask( return output -@_register_kernel_internal(perspective, datapoints.Mask, datapoint_wrapper=False) +@_register_kernel_internal(perspective, vision_tensors.Mask, vision_tensor_wrapper=False) def _perspective_mask_dispatch( - inpt: datapoints.Mask, + inpt: vision_tensors.Mask, startpoints: Optional[List[List[int]]], endpoints: Optional[List[List[int]]], fill: _FillTypeJIT = None, coefficients: Optional[List[float]] = None, **kwargs, -) -> datapoints.Mask: +) -> vision_tensors.Mask: output = perspective_mask( inpt.as_subclass(torch.Tensor), startpoints=startpoints, @@ -1718,10 +1722,10 @@ def _perspective_mask_dispatch( fill=fill, coefficients=coefficients, ) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) -@_register_kernel_internal(perspective, datapoints.Video) +@_register_kernel_internal(perspective, vision_tensors.Video) def perspective_video( video: torch.Tensor, startpoints: Optional[List[List[int]]], @@ -1755,7 +1759,7 @@ def elastic( @_register_kernel_internal(elastic, torch.Tensor) -@_register_kernel_internal(elastic, datapoints.Image) +@_register_kernel_internal(elastic, vision_tensors.Image) def elastic_image( image: torch.Tensor, displacement: torch.Tensor, @@ -1841,7 +1845,7 @@ def _create_identity_grid(size: Tuple[int, int], device: torch.device, dtype: to def elastic_bounding_boxes( bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, + format: vision_tensors.BoundingBoxFormat, canvas_size: Tuple[int, int], displacement: torch.Tensor, ) -> torch.Tensor: @@ -1864,7 +1868,7 @@ def elastic_bounding_boxes( original_shape = bounding_boxes.shape # TODO: first cast to float if bbox is int64 before convert_bounding_box_format bounding_boxes = ( - convert_bounding_box_format(bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY) + convert_bounding_box_format(bounding_boxes, old_format=format, new_format=vision_tensors.BoundingBoxFormat.XYXY) ).reshape(-1, 4) id_grid = _create_identity_grid(canvas_size, device=device, dtype=dtype) @@ -1887,23 +1891,23 @@ def elastic_bounding_boxes( out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1) out_bboxes = clamp_bounding_boxes( torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_boxes.dtype), - format=datapoints.BoundingBoxFormat.XYXY, + format=vision_tensors.BoundingBoxFormat.XYXY, canvas_size=canvas_size, ) return convert_bounding_box_format( - out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True + out_bboxes, old_format=vision_tensors.BoundingBoxFormat.XYXY, new_format=format, inplace=True ).reshape(original_shape) -@_register_kernel_internal(elastic, datapoints.BoundingBoxes, datapoint_wrapper=False) +@_register_kernel_internal(elastic, vision_tensors.BoundingBoxes, vision_tensor_wrapper=False) def 
_elastic_bounding_boxes_dispatch( - inpt: datapoints.BoundingBoxes, displacement: torch.Tensor, **kwargs -) -> datapoints.BoundingBoxes: + inpt: vision_tensors.BoundingBoxes, displacement: torch.Tensor, **kwargs +) -> vision_tensors.BoundingBoxes: output = elastic_bounding_boxes( inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size, displacement=displacement ) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) def elastic_mask( @@ -1925,15 +1929,15 @@ def elastic_mask( return output -@_register_kernel_internal(elastic, datapoints.Mask, datapoint_wrapper=False) +@_register_kernel_internal(elastic, vision_tensors.Mask, vision_tensor_wrapper=False) def _elastic_mask_dispatch( - inpt: datapoints.Mask, displacement: torch.Tensor, fill: _FillTypeJIT = None, **kwargs -) -> datapoints.Mask: + inpt: vision_tensors.Mask, displacement: torch.Tensor, fill: _FillTypeJIT = None, **kwargs +) -> vision_tensors.Mask: output = elastic_mask(inpt.as_subclass(torch.Tensor), displacement=displacement, fill=fill) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) -@_register_kernel_internal(elastic, datapoints.Video) +@_register_kernel_internal(elastic, vision_tensors.Video) def elastic_video( video: torch.Tensor, displacement: torch.Tensor, @@ -1982,7 +1986,7 @@ def _center_crop_compute_crop_anchor( @_register_kernel_internal(center_crop, torch.Tensor) -@_register_kernel_internal(center_crop, datapoints.Image) +@_register_kernel_internal(center_crop, vision_tensors.Image) def center_crop_image(image: torch.Tensor, output_size: List[int]) -> torch.Tensor: crop_height, crop_width = _center_crop_parse_output_size(output_size) shape = image.shape @@ -2021,7 +2025,7 @@ def _center_crop_image_pil(image: PIL.Image.Image, output_size: List[int]) -> PI def center_crop_bounding_boxes( bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, + format: vision_tensors.BoundingBoxFormat, canvas_size: Tuple[int, int], output_size: List[int], ) -> Tuple[torch.Tensor, Tuple[int, int]]: @@ -2032,17 +2036,17 @@ def center_crop_bounding_boxes( ) -@_register_kernel_internal(center_crop, datapoints.BoundingBoxes, datapoint_wrapper=False) +@_register_kernel_internal(center_crop, vision_tensors.BoundingBoxes, vision_tensor_wrapper=False) def _center_crop_bounding_boxes_dispatch( - inpt: datapoints.BoundingBoxes, output_size: List[int] -) -> datapoints.BoundingBoxes: + inpt: vision_tensors.BoundingBoxes, output_size: List[int] +) -> vision_tensors.BoundingBoxes: output, canvas_size = center_crop_bounding_boxes( inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size, output_size=output_size ) - return datapoints.wrap(output, like=inpt, canvas_size=canvas_size) + return vision_tensors.wrap(output, like=inpt, canvas_size=canvas_size) -@_register_kernel_internal(center_crop, datapoints.Mask) +@_register_kernel_internal(center_crop, vision_tensors.Mask) def center_crop_mask(mask: torch.Tensor, output_size: List[int]) -> torch.Tensor: if mask.ndim < 3: mask = mask.unsqueeze(0) @@ -2058,7 +2062,7 @@ def center_crop_mask(mask: torch.Tensor, output_size: List[int]) -> torch.Tensor return output -@_register_kernel_internal(center_crop, datapoints.Video) +@_register_kernel_internal(center_crop, vision_tensors.Video) def center_crop_video(video: torch.Tensor, output_size: List[int]) -> torch.Tensor: return center_crop_image(video, output_size) @@ -2102,7 +2106,7 @@ def resized_crop( 
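Every ``*_bounding_boxes_dispatch`` helper in this file follows the same shape: unwrap to a plain tensor, call the coordinate kernel with the stored ``format`` and ``canvas_size``, then re-wrap, passing a new ``canvas_size`` when the operation changes it. A hedged sketch of the user-facing effect, under this patch's naming:

import torch
from torchvision import vision_tensors
from torchvision.transforms.v2 import functional as F

boxes = vision_tensors.BoundingBoxes(
    torch.tensor([[10, 10, 20, 20]]), format="XYXY", canvas_size=(100, 100)
)

# resize rescales the coordinates and re-wraps with the updated canvas_size
out = F.resize(boxes, size=[50])
print(out.tolist(), out.canvas_size)  # [[5, 5, 10, 10]] (50, 50)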
@_register_kernel_internal(resized_crop, torch.Tensor) -@_register_kernel_internal(resized_crop, datapoints.Image) +@_register_kernel_internal(resized_crop, vision_tensors.Image) def resized_crop_image( image: torch.Tensor, top: int, @@ -2156,7 +2160,7 @@ def _resized_crop_image_pil_dispatch( def resized_crop_bounding_boxes( bounding_boxes: torch.Tensor, - format: datapoints.BoundingBoxFormat, + format: vision_tensors.BoundingBoxFormat, top: int, left: int, height: int, @@ -2167,14 +2171,14 @@ def resized_crop_bounding_boxes( return resize_bounding_boxes(bounding_boxes, canvas_size=canvas_size, size=size) -@_register_kernel_internal(resized_crop, datapoints.BoundingBoxes, datapoint_wrapper=False) +@_register_kernel_internal(resized_crop, vision_tensors.BoundingBoxes, vision_tensor_wrapper=False) def _resized_crop_bounding_boxes_dispatch( - inpt: datapoints.BoundingBoxes, top: int, left: int, height: int, width: int, size: List[int], **kwargs -) -> datapoints.BoundingBoxes: + inpt: vision_tensors.BoundingBoxes, top: int, left: int, height: int, width: int, size: List[int], **kwargs +) -> vision_tensors.BoundingBoxes: output, canvas_size = resized_crop_bounding_boxes( inpt.as_subclass(torch.Tensor), format=inpt.format, top=top, left=left, height=height, width=width, size=size ) - return datapoints.wrap(output, like=inpt, canvas_size=canvas_size) + return vision_tensors.wrap(output, like=inpt, canvas_size=canvas_size) def resized_crop_mask( @@ -2189,17 +2193,17 @@ def resized_crop_mask( return resize_mask(mask, size) -@_register_kernel_internal(resized_crop, datapoints.Mask, datapoint_wrapper=False) +@_register_kernel_internal(resized_crop, vision_tensors.Mask, vision_tensor_wrapper=False) def _resized_crop_mask_dispatch( - inpt: datapoints.Mask, top: int, left: int, height: int, width: int, size: List[int], **kwargs -) -> datapoints.Mask: + inpt: vision_tensors.Mask, top: int, left: int, height: int, width: int, size: List[int], **kwargs +) -> vision_tensors.Mask: output = resized_crop_mask( inpt.as_subclass(torch.Tensor), top=top, left=left, height=height, width=width, size=size ) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) -@_register_kernel_internal(resized_crop, datapoints.Video) +@_register_kernel_internal(resized_crop, vision_tensors.Video) def resized_crop_video( video: torch.Tensor, top: int, @@ -2243,7 +2247,7 @@ def _parse_five_crop_size(size: List[int]) -> List[int]: @_register_five_ten_crop_kernel_internal(five_crop, torch.Tensor) -@_register_five_ten_crop_kernel_internal(five_crop, datapoints.Image) +@_register_five_ten_crop_kernel_internal(five_crop, vision_tensors.Image) def five_crop_image( image: torch.Tensor, size: List[int] ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: @@ -2281,7 +2285,7 @@ def _five_crop_image_pil( return tl, tr, bl, br, center -@_register_five_ten_crop_kernel_internal(five_crop, datapoints.Video) +@_register_five_ten_crop_kernel_internal(five_crop, vision_tensors.Video) def five_crop_video( video: torch.Tensor, size: List[int] ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: @@ -2313,7 +2317,7 @@ def ten_crop( @_register_five_ten_crop_kernel_internal(ten_crop, torch.Tensor) -@_register_five_ten_crop_kernel_internal(ten_crop, datapoints.Image) +@_register_five_ten_crop_kernel_internal(ten_crop, vision_tensors.Image) def ten_crop_image( image: torch.Tensor, size: List[int], vertical_flip: bool = False ) -> Tuple[ @@ -2367,7 +2371,7 @@ def 
_ten_crop_image_pil( return non_flipped + flipped -@_register_five_ten_crop_kernel_internal(ten_crop, datapoints.Video) +@_register_five_ten_crop_kernel_internal(ten_crop, vision_tensors.Video) def ten_crop_video( video: torch.Tensor, size: List[int], vertical_flip: bool = False ) -> Tuple[ diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py index be57f424b7f..497ce92e710 100644 --- a/torchvision/transforms/v2/functional/_meta.py +++ b/torchvision/transforms/v2/functional/_meta.py @@ -2,11 +2,11 @@ import PIL.Image import torch -from torchvision import datapoints -from torchvision.datapoints import BoundingBoxFormat +from torchvision import vision_tensors from torchvision.transforms import _functional_pil as _FP from torchvision.utils import _log_api_usage_once +from torchvision.vision_tensors import BoundingBoxFormat from ._utils import _get_kernel, _register_kernel_internal, is_pure_tensor @@ -22,7 +22,7 @@ def get_dimensions(inpt: torch.Tensor) -> List[int]: @_register_kernel_internal(get_dimensions, torch.Tensor) -@_register_kernel_internal(get_dimensions, datapoints.Image, datapoint_wrapper=False) +@_register_kernel_internal(get_dimensions, vision_tensors.Image, vision_tensor_wrapper=False) def get_dimensions_image(image: torch.Tensor) -> List[int]: chw = list(image.shape[-3:]) ndims = len(chw) @@ -38,7 +38,7 @@ def get_dimensions_image(image: torch.Tensor) -> List[int]: _get_dimensions_image_pil = _register_kernel_internal(get_dimensions, PIL.Image.Image)(_FP.get_dimensions) -@_register_kernel_internal(get_dimensions, datapoints.Video, datapoint_wrapper=False) +@_register_kernel_internal(get_dimensions, vision_tensors.Video, vision_tensor_wrapper=False) def get_dimensions_video(video: torch.Tensor) -> List[int]: return get_dimensions_image(video) @@ -54,7 +54,7 @@ def get_num_channels(inpt: torch.Tensor) -> int: @_register_kernel_internal(get_num_channels, torch.Tensor) -@_register_kernel_internal(get_num_channels, datapoints.Image, datapoint_wrapper=False) +@_register_kernel_internal(get_num_channels, vision_tensors.Image, vision_tensor_wrapper=False) def get_num_channels_image(image: torch.Tensor) -> int: chw = image.shape[-3:] ndims = len(chw) @@ -69,7 +69,7 @@ def get_num_channels_image(image: torch.Tensor) -> int: _get_num_channels_image_pil = _register_kernel_internal(get_num_channels, PIL.Image.Image)(_FP.get_image_num_channels) -@_register_kernel_internal(get_num_channels, datapoints.Video, datapoint_wrapper=False) +@_register_kernel_internal(get_num_channels, vision_tensors.Video, vision_tensor_wrapper=False) def get_num_channels_video(video: torch.Tensor) -> int: return get_num_channels_image(video) @@ -90,7 +90,7 @@ def get_size(inpt: torch.Tensor) -> List[int]: @_register_kernel_internal(get_size, torch.Tensor) -@_register_kernel_internal(get_size, datapoints.Image, datapoint_wrapper=False) +@_register_kernel_internal(get_size, vision_tensors.Image, vision_tensor_wrapper=False) def get_size_image(image: torch.Tensor) -> List[int]: hw = list(image.shape[-2:]) ndims = len(hw) @@ -106,18 +106,18 @@ def _get_size_image_pil(image: PIL.Image.Image) -> List[int]: return [height, width] -@_register_kernel_internal(get_size, datapoints.Video, datapoint_wrapper=False) +@_register_kernel_internal(get_size, vision_tensors.Video, vision_tensor_wrapper=False) def get_size_video(video: torch.Tensor) -> List[int]: return get_size_image(video) -@_register_kernel_internal(get_size, datapoints.Mask, datapoint_wrapper=False) 
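The ``get_size`` registrations around here are what let size-dependent transforms query any element of a sample uniformly: images, videos and masks report their trailing spatial dimensions, while bounding boxes fall back to their ``canvas_size``. A small sketch, assuming this patch's module names:

import torch
from torchvision import vision_tensors
from torchvision.transforms.v2 import functional as F

img = vision_tensors.Image(torch.rand(3, 24, 48))
boxes = vision_tensors.BoundingBoxes(
    torch.tensor([[1, 1, 10, 10]]), format="XYXY", canvas_size=(24, 48)
)

print(F.get_size(img))    # [24, 48]
print(F.get_size(boxes))  # [24, 48], read from canvas_size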
+@_register_kernel_internal(get_size, vision_tensors.Mask, vision_tensor_wrapper=False) def get_size_mask(mask: torch.Tensor) -> List[int]: return get_size_image(mask) -@_register_kernel_internal(get_size, datapoints.BoundingBoxes, datapoint_wrapper=False) -def get_size_bounding_boxes(bounding_box: datapoints.BoundingBoxes) -> List[int]: +@_register_kernel_internal(get_size, vision_tensors.BoundingBoxes, vision_tensor_wrapper=False) +def get_size_bounding_boxes(bounding_box: vision_tensors.BoundingBoxes) -> List[int]: return list(bounding_box.canvas_size) @@ -132,7 +132,7 @@ def get_num_frames(inpt: torch.Tensor) -> int: @_register_kernel_internal(get_num_frames, torch.Tensor) -@_register_kernel_internal(get_num_frames, datapoints.Video, datapoint_wrapper=False) +@_register_kernel_internal(get_num_frames, vision_tensors.Video, vision_tensor_wrapper=False) def get_num_frames_video(video: torch.Tensor) -> int: return video.shape[-4] @@ -205,7 +205,7 @@ def convert_bounding_box_format( ) -> torch.Tensor: """[BETA] See :func:`~torchvision.transforms.v2.ConvertBoundingBoxFormat` for details.""" # This being a kernel / functional hybrid, we need an option to pass `old_format` explicitly for pure tensor - # inputs as well as extract it from `datapoints.BoundingBoxes` inputs. However, putting a default value on + # inputs as well as extract it from `vision_tensors.BoundingBoxes` inputs. However, putting a default value on # `old_format` means we also need to put one on `new_format` to have syntactically correct Python. Here we mimic the # default error that would be thrown if `new_format` had no default value. if new_format is None: @@ -218,16 +218,16 @@ def convert_bounding_box_format( if old_format is None: raise ValueError("For pure tensor inputs, `old_format` has to be passed.") return _convert_bounding_box_format(inpt, old_format=old_format, new_format=new_format, inplace=inplace) - elif isinstance(inpt, datapoints.BoundingBoxes): + elif isinstance(inpt, vision_tensors.BoundingBoxes): if old_format is not None: - raise ValueError("For bounding box datapoint inputs, `old_format` must not be passed.") + raise ValueError("For bounding box vision_tensor inputs, `old_format` must not be passed.") output = _convert_bounding_box_format( inpt.as_subclass(torch.Tensor), old_format=inpt.format, new_format=new_format, inplace=inplace ) - return datapoints.wrap(output, like=inpt, format=new_format) + return vision_tensors.wrap(output, like=inpt, format=new_format) else: raise TypeError( - f"Input can either be a plain tensor or a bounding box datapoint, but got {type(inpt)} instead." + f"Input can either be a plain tensor or a bounding box vision_tensor, but got {type(inpt)} instead." 
) @@ -239,7 +239,7 @@ def _clamp_bounding_boxes( in_dtype = bounding_boxes.dtype bounding_boxes = bounding_boxes.clone() if bounding_boxes.is_floating_point() else bounding_boxes.float() xyxy_boxes = convert_bounding_box_format( - bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True + bounding_boxes, old_format=format, new_format=vision_tensors.BoundingBoxFormat.XYXY, inplace=True ) xyxy_boxes[..., 0::2].clamp_(min=0, max=canvas_size[1]) xyxy_boxes[..., 1::2].clamp_(min=0, max=canvas_size[0]) @@ -263,12 +263,12 @@ def clamp_bounding_boxes( if format is None or canvas_size is None: raise ValueError("For pure tensor inputs, `format` and `canvas_size` has to be passed.") return _clamp_bounding_boxes(inpt, format=format, canvas_size=canvas_size) - elif isinstance(inpt, datapoints.BoundingBoxes): + elif isinstance(inpt, vision_tensors.BoundingBoxes): if format is not None or canvas_size is not None: - raise ValueError("For bounding box datapoint inputs, `format` and `canvas_size` must not be passed.") + raise ValueError("For bounding box vision_tensor inputs, `format` and `canvas_size` must not be passed.") output = _clamp_bounding_boxes(inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) else: raise TypeError( - f"Input can either be a plain tensor or a bounding box datapoint, but got {type(inpt)} instead." + f"Input can either be a plain tensor or a bounding box vision_tensor, but got {type(inpt)} instead." ) diff --git a/torchvision/transforms/v2/functional/_misc.py b/torchvision/transforms/v2/functional/_misc.py index 1ed134b09b2..238e678d9a6 100644 --- a/torchvision/transforms/v2/functional/_misc.py +++ b/torchvision/transforms/v2/functional/_misc.py @@ -5,7 +5,7 @@ import torch from torch.nn.functional import conv2d, pad as torch_pad -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms._functional_tensor import _max_value from torchvision.transforms.functional import pil_to_tensor, to_pil_image @@ -31,7 +31,7 @@ def normalize( @_register_kernel_internal(normalize, torch.Tensor) -@_register_kernel_internal(normalize, datapoints.Image) +@_register_kernel_internal(normalize, vision_tensors.Image) def normalize_image(image: torch.Tensor, mean: List[float], std: List[float], inplace: bool = False) -> torch.Tensor: if not image.is_floating_point(): raise TypeError(f"Input tensor should be a float tensor. 
Got {image.dtype}.") @@ -65,7 +65,7 @@ def normalize_image(image: torch.Tensor, mean: List[float], std: List[float], in return image.div_(std) -@_register_kernel_internal(normalize, datapoints.Video) +@_register_kernel_internal(normalize, vision_tensors.Video) def normalize_video(video: torch.Tensor, mean: List[float], std: List[float], inplace: bool = False) -> torch.Tensor: return normalize_image(video, mean, std, inplace=inplace) @@ -98,7 +98,7 @@ def _get_gaussian_kernel2d( @_register_kernel_internal(gaussian_blur, torch.Tensor) -@_register_kernel_internal(gaussian_blur, datapoints.Image) +@_register_kernel_internal(gaussian_blur, vision_tensors.Image) def gaussian_blur_image( image: torch.Tensor, kernel_size: List[int], sigma: Optional[List[float]] = None ) -> torch.Tensor: @@ -172,7 +172,7 @@ def _gaussian_blur_image_pil( return to_pil_image(output, mode=image.mode) -@_register_kernel_internal(gaussian_blur, datapoints.Video) +@_register_kernel_internal(gaussian_blur, vision_tensors.Video) def gaussian_blur_video( video: torch.Tensor, kernel_size: List[int], sigma: Optional[List[float]] = None ) -> torch.Tensor: @@ -206,7 +206,7 @@ def _num_value_bits(dtype: torch.dtype) -> int: @_register_kernel_internal(to_dtype, torch.Tensor) -@_register_kernel_internal(to_dtype, datapoints.Image) +@_register_kernel_internal(to_dtype, vision_tensors.Image) def to_dtype_image(image: torch.Tensor, dtype: torch.dtype = torch.float, scale: bool = False) -> torch.Tensor: if image.dtype == dtype: @@ -265,13 +265,13 @@ def convert_image_dtype(image: torch.Tensor, dtype: torch.dtype = torch.float32) return to_dtype_image(image, dtype=dtype, scale=True) -@_register_kernel_internal(to_dtype, datapoints.Video) +@_register_kernel_internal(to_dtype, vision_tensors.Video) def to_dtype_video(video: torch.Tensor, dtype: torch.dtype = torch.float, scale: bool = False) -> torch.Tensor: return to_dtype_image(video, dtype, scale=scale) -@_register_kernel_internal(to_dtype, datapoints.BoundingBoxes, datapoint_wrapper=False) -@_register_kernel_internal(to_dtype, datapoints.Mask, datapoint_wrapper=False) +@_register_kernel_internal(to_dtype, vision_tensors.BoundingBoxes, vision_tensor_wrapper=False) +@_register_kernel_internal(to_dtype, vision_tensors.Mask, vision_tensor_wrapper=False) def _to_dtype_tensor_dispatch(inpt: torch.Tensor, dtype: torch.dtype, scale: bool = False) -> torch.Tensor: - # We don't need to unwrap and rewrap here, since Datapoint.to() preserves the type + # We don't need to unwrap and rewrap here, since VisionTensor.to() preserves the type return inpt.to(dtype) diff --git a/torchvision/transforms/v2/functional/_temporal.py b/torchvision/transforms/v2/functional/_temporal.py index 9464adf5fa2..207da5babfa 100644 --- a/torchvision/transforms/v2/functional/_temporal.py +++ b/torchvision/transforms/v2/functional/_temporal.py @@ -1,6 +1,6 @@ import torch -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.utils import _log_api_usage_once @@ -19,7 +19,7 @@ def uniform_temporal_subsample(inpt: torch.Tensor, num_samples: int) -> torch.Te @_register_kernel_internal(uniform_temporal_subsample, torch.Tensor) -@_register_kernel_internal(uniform_temporal_subsample, datapoints.Video) +@_register_kernel_internal(uniform_temporal_subsample, vision_tensors.Video) def uniform_temporal_subsample_video(video: torch.Tensor, num_samples: int) -> torch.Tensor: # Reference: https://github.com/facebookresearch/pytorchvideo/blob/a0a131e/pytorchvideo/transforms/functional.py#L19 t_max 
= video.shape[-4] - 1 diff --git a/torchvision/transforms/v2/functional/_type_conversion.py b/torchvision/transforms/v2/functional/_type_conversion.py index 4359e0e6686..36f103b5ad0 100644 --- a/torchvision/transforms/v2/functional/_type_conversion.py +++ b/torchvision/transforms/v2/functional/_type_conversion.py @@ -3,12 +3,12 @@ import numpy as np import PIL.Image import torch -from torchvision import datapoints +from torchvision import vision_tensors from torchvision.transforms import functional as _F @torch.jit.unused -def to_image(inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray]) -> datapoints.Image: +def to_image(inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray]) -> vision_tensors.Image: """[BETA] See :class:`~torchvision.transforms.v2.ToImage` for details.""" if isinstance(inpt, np.ndarray): output = torch.from_numpy(inpt).permute((2, 0, 1)).contiguous() @@ -18,7 +18,7 @@ def to_image(inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray]) -> datapoin output = inpt else: raise TypeError(f"Input can either be a numpy array or a PIL image, but got {type(inpt)} instead.") - return datapoints.Image(output) + return vision_tensors.Image(output) to_pil_image = _F.to_pil_image diff --git a/torchvision/transforms/v2/functional/_utils.py b/torchvision/transforms/v2/functional/_utils.py index 5a907121b92..d00ae1cb4cd 100644 --- a/torchvision/transforms/v2/functional/_utils.py +++ b/torchvision/transforms/v2/functional/_utils.py @@ -2,21 +2,21 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union import torch -from torchvision import datapoints +from torchvision import vision_tensors _FillType = Union[int, float, Sequence[int], Sequence[float], None] _FillTypeJIT = Optional[List[float]] def is_pure_tensor(inpt: Any) -> bool: - return isinstance(inpt, torch.Tensor) and not isinstance(inpt, datapoints.Datapoint) + return isinstance(inpt, torch.Tensor) and not isinstance(inpt, vision_tensors.VisionTensor) # {functional: {input_type: type_specific_kernel}} _KERNEL_REGISTRY: Dict[Callable, Dict[Type, Callable]] = {} -def _kernel_datapoint_wrapper(kernel): +def _kernel_vision_tensor_wrapper(kernel): @functools.wraps(kernel) def wrapper(inpt, *args, **kwargs): # If you're wondering whether we could / should get rid of this wrapper, @@ -25,24 +25,24 @@ def wrapper(inpt, *args, **kwargs): # regardless of whether we override __torch_function__ in our base class # or not. # Also, even if we didn't call `as_subclass` here, we would still need - # this wrapper to call wrap(), because the Datapoint type would be + # this wrapper to call wrap(), because the VisionTensor type would be # lost after the first operation due to our own __torch_function__ # logic. 
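The comment above is the heart of the wrapper: once a kernel has run on the unwrapped input, the result is a plain ``torch.Tensor``, and ``wrap()`` is what restores the input's subclass and metadata. The same round trip with the public helper, as a hedged sketch under this patch's naming:

import torch
from torchvision import vision_tensors

boxes = vision_tensors.BoundingBoxes(
    torch.tensor([[0, 0, 10, 10]]), format="XYXY", canvas_size=(32, 32)
)

plain = boxes.as_subclass(torch.Tensor) + 2        # plain tensor, metadata gone
restored = vision_tensors.wrap(plain, like=boxes)  # BoundingBoxes again
print(type(restored).__name__, restored.format, restored.canvas_size)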
output = kernel(inpt.as_subclass(torch.Tensor), *args, **kwargs) - return datapoints.wrap(output, like=inpt) + return vision_tensors.wrap(output, like=inpt) return wrapper -def _register_kernel_internal(functional, input_type, *, datapoint_wrapper=True): +def _register_kernel_internal(functional, input_type, *, vision_tensor_wrapper=True): registry = _KERNEL_REGISTRY.setdefault(functional, {}) if input_type in registry: raise ValueError(f"Functional {functional} already has a kernel registered for type {input_type}.") def decorator(kernel): registry[input_type] = ( - _kernel_datapoint_wrapper(kernel) - if issubclass(input_type, datapoints.Datapoint) and datapoint_wrapper + _kernel_vision_tensor_wrapper(kernel) + if issubclass(input_type, vision_tensors.VisionTensor) and vision_tensor_wrapper else kernel ) return kernel @@ -62,14 +62,16 @@ def _name_to_functional(name): _BUILTIN_DATAPOINT_TYPES = { - obj for obj in datapoints.__dict__.values() if isinstance(obj, type) and issubclass(obj, datapoints.Datapoint) + obj + for obj in vision_tensors.__dict__.values() + if isinstance(obj, type) and issubclass(obj, vision_tensors.VisionTensor) } -def register_kernel(functional, datapoint_cls): - """[BETA] Decorate a kernel to register it for a functional and a (custom) datapoint type. +def register_kernel(functional, vision_tensor_cls): + """[BETA] Decorate a kernel to register it for a functional and a (custom) vision_tensor type. - See :ref:`sphx_glr_auto_examples_transforms_plot_custom_datapoints.py` for usage + See :ref:`sphx_glr_auto_examples_transforms_plot_custom_vision_tensors.py` for usage details. """ if isinstance(functional, str): @@ -83,16 +85,18 @@ def register_kernel(functional, datapoint_cls): f"but got {functional}." ) - if not (isinstance(datapoint_cls, type) and issubclass(datapoint_cls, datapoints.Datapoint)): + if not (isinstance(vision_tensor_cls, type) and issubclass(vision_tensor_cls, vision_tensors.VisionTensor)): raise ValueError( - f"Kernels can only be registered for subclasses of torchvision.datapoints.Datapoint, " - f"but got {datapoint_cls}." + f"Kernels can only be registered for subclasses of torchvision.vision_tensors.VisionTensor, " + f"but got {vision_tensor_cls}." ) - if datapoint_cls in _BUILTIN_DATAPOINT_TYPES: - raise ValueError(f"Kernels cannot be registered for the builtin datapoint classes, but got {datapoint_cls}") + if vision_tensor_cls in _BUILTIN_DATAPOINT_TYPES: + raise ValueError( + f"Kernels cannot be registered for the builtin vision_tensor classes, but got {vision_tensor_cls}" + ) - return _register_kernel_internal(functional, datapoint_cls, datapoint_wrapper=False) + return _register_kernel_internal(functional, vision_tensor_cls, vision_tensor_wrapper=False) def _get_kernel(functional, input_type, *, allow_passthrough=False): @@ -103,10 +107,10 @@ def _get_kernel(functional, input_type, *, allow_passthrough=False): for cls in input_type.__mro__: if cls in registry: return registry[cls] - elif cls is datapoints.Datapoint: - # We don't want user-defined datapoints to dispatch to the pure Tensor kernels, so we explicit stop the - # MRO traversal before hitting torch.Tensor. We can even stop at datapoints.Datapoint, since we don't - # allow kernels to be registered for datapoints.Datapoint anyway. + elif cls is vision_tensors.VisionTensor: + # We don't want user-defined vision_tensors to dispatch to the pure Tensor kernels, so we explicit stop the + # MRO traversal before hitting torch.Tensor. 
We can even stop at vision_tensors.VisionTensor, since we don't + # allow kernels to be registered for vision_tensors.VisionTensor anyway. break if allow_passthrough: @@ -130,12 +134,12 @@ def wrap(kernel): def wrapper(inpt, *args, **kwargs): output = kernel(inpt, *args, **kwargs) container_type = type(output) - return container_type(datapoints.wrap(o, like=inpt) for o in output) + return container_type(vision_tensors.wrap(o, like=inpt) for o in output) return wrapper def decorator(kernel): - registry[input_type] = wrap(kernel) if issubclass(input_type, datapoints.Datapoint) else kernel + registry[input_type] = wrap(kernel) if issubclass(input_type, vision_tensors.VisionTensor) else kernel return kernel return decorator diff --git a/torchvision/datapoints/__init__.py b/torchvision/vision_tensors/__init__.py similarity index 71% rename from torchvision/datapoints/__init__.py rename to torchvision/vision_tensors/__init__.py index 512a8d606f2..216956080a2 100644 --- a/torchvision/datapoints/__init__.py +++ b/torchvision/vision_tensors/__init__.py @@ -1,24 +1,24 @@ import torch from ._bounding_box import BoundingBoxes, BoundingBoxFormat -from ._datapoint import Datapoint from ._image import Image from ._mask import Mask from ._torch_function_helpers import set_return_type from ._video import Video +from ._vision_tensor import VisionTensor def wrap(wrappee, *, like, **kwargs): - """[BETA] Convert a :class:`torch.Tensor` (``wrappee``) into the same :class:`~torchvision.datapoints.Datapoint` subclass as ``like``. + """[BETA] Convert a :class:`torch.Tensor` (``wrappee``) into the same :class:`~torchvision.vision_tensors.VisionTensor` subclass as ``like``. - If ``like`` is a :class:`~torchvision.datapoints.BoundingBoxes`, the ``format`` and ``canvas_size`` of + If ``like`` is a :class:`~torchvision.vision_tensors.BoundingBoxes`, the ``format`` and ``canvas_size`` of ``like`` are assigned to ``wrappee``, unless they are passed as ``kwargs``. Args: wrappee (Tensor): The tensor to convert. - like (:class:`~torchvision.datapoints.Datapoint`): The reference. + like (:class:`~torchvision.vision_tensors.VisionTensor`): The reference. ``wrappee`` will be converted into the same subclass as ``like``. - kwargs: Can contain "format" and "canvas_size" if ``like`` is a :class:`~torchvision.datapoint.BoundingBoxes`. + kwargs: Can contain "format" and "canvas_size" if ``like`` is a :class:`~torchvision.vision_tensors.BoundingBoxes`. Ignored otherwise. """ if isinstance(like, BoundingBoxes): diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/vision_tensors/_bounding_box.py similarity index 94% rename from torchvision/datapoints/_bounding_box.py rename to torchvision/vision_tensors/_bounding_box.py index ebed0628250..97fe9140fec 100644 --- a/torchvision/datapoints/_bounding_box.py +++ b/torchvision/vision_tensors/_bounding_box.py @@ -6,7 +6,7 @@ import torch from torch.utils._pytree import tree_flatten -from ._datapoint import Datapoint +from ._vision_tensor import VisionTensor class BoundingBoxFormat(Enum): @@ -24,13 +24,13 @@ class BoundingBoxFormat(Enum): CXCYWH = "CXCYWH" -class BoundingBoxes(Datapoint): +class BoundingBoxes(VisionTensor): """[BETA] :class:`torch.Tensor` subclass for bounding boxes. .. note:: - There should be only one :class:`~torchvision.datapoints.BoundingBoxes` + There should be only one :class:`~torchvision.vision_tensors.BoundingBoxes` instance per sample e.g.
``{"img": img, "bbox": BoundingBoxes(...)}``, - although one :class:`~torchvision.datapoints.BoundingBoxes` object can + although one :class:`~torchvision.vision_tensors.BoundingBoxes` object can contain multiple bounding boxes. Args: diff --git a/torchvision/datapoints/_dataset_wrapper.py b/torchvision/vision_tensors/_dataset_wrapper.py similarity index 89% rename from torchvision/datapoints/_dataset_wrapper.py rename to torchvision/vision_tensors/_dataset_wrapper.py index 07a3e0ff733..23890f66a93 100644 --- a/torchvision/datapoints/_dataset_wrapper.py +++ b/torchvision/vision_tensors/_dataset_wrapper.py @@ -9,7 +9,7 @@ import torch -from torchvision import datapoints, datasets +from torchvision import datasets, vision_tensors from torchvision.transforms.v2 import functional as F __all__ = ["wrap_dataset_for_transforms_v2"] @@ -36,26 +36,26 @@ def wrap_dataset_for_transforms_v2(dataset, target_keys=None): * :class:`~torchvision.datasets.CocoDetection`: Instead of returning the target as list of dicts, the wrapper returns a dict of lists. In addition, the key-value-pairs ``"boxes"`` (in ``XYXY`` coordinate format), - ``"masks"`` and ``"labels"`` are added and wrap the data in the corresponding ``torchvision.datapoints``. + ``"masks"`` and ``"labels"`` are added and wrap the data in the corresponding ``torchvision.vision_tensors``. The original keys are preserved. If ``target_keys`` is omitted, returns only the values for the ``"image_id"``, ``"boxes"``, and ``"labels"``. * :class:`~torchvision.datasets.VOCDetection`: The key-value-pairs ``"boxes"`` and ``"labels"`` are added to - the target and wrap the data in the corresponding ``torchvision.datapoints``. The original keys are + the target and wrap the data in the corresponding ``torchvision.vision_tensors``. The original keys are preserved. If ``target_keys`` is omitted, returns only the values for the ``"boxes"`` and ``"labels"``. * :class:`~torchvision.datasets.CelebA`: The target for ``target_type="bbox"`` is converted to the ``XYXY`` - coordinate format and wrapped into a :class:`~torchvision.datapoints.BoundingBoxes` datapoint. + coordinate format and wrapped into a :class:`~torchvision.vision_tensors.BoundingBoxes` vision_tensor. * :class:`~torchvision.datasets.Kitti`: Instead returning the target as list of dicts, the wrapper returns a dict of lists. In addition, the key-value-pairs ``"boxes"`` and ``"labels"`` are added and wrap the data - in the corresponding ``torchvision.datapoints``. The original keys are preserved. If ``target_keys`` is + in the corresponding ``torchvision.vision_tensors``. The original keys are preserved. If ``target_keys`` is omitted, returns only the values for the ``"boxes"`` and ``"labels"``. * :class:`~torchvision.datasets.OxfordIIITPet`: The target for ``target_type="segmentation"`` is wrapped into a - :class:`~torchvision.datapoints.Mask` datapoint. + :class:`~torchvision.vision_tensors.Mask` vision_tensor. * :class:`~torchvision.datasets.Cityscapes`: The target for ``target_type="semantic"`` is wrapped into a - :class:`~torchvision.datapoints.Mask` datapoint. The target for ``target_type="instance"`` is *replaced* by - a dictionary with the key-value-pairs ``"masks"`` (as :class:`~torchvision.datapoints.Mask` datapoint) and + :class:`~torchvision.vision_tensors.Mask` vision_tensor. The target for ``target_type="instance"`` is *replaced* by + a dictionary with the key-value-pairs ``"masks"`` (as :class:`~torchvision.vision_tensors.Mask` vision_tensor) and ``"labels"``. 
* :class:`~torchvision.datasets.WIDERFace`: The value for key ``"bbox"`` in the target is converted to ``XYXY`` - coordinate format and wrapped into a :class:`~torchvision.datapoints.BoundingBoxes` datapoint. + coordinate format and wrapped into a :class:`~torchvision.vision_tensors.BoundingBoxes` vision_tensor. Image classification datasets @@ -66,13 +66,13 @@ def wrap_dataset_for_transforms_v2(dataset, target_keys=None): Segmentation datasets, e.g. :class:`~torchvision.datasets.VOCSegmentation`, return a two-tuple of :class:`PIL.Image.Image`'s. This wrapper leaves the image as is (first item), while wrapping the - segmentation mask into a :class:`~torchvision.datapoints.Mask` (second item). + segmentation mask into a :class:`~torchvision.vision_tensors.Mask` (second item). Video classification datasets Video classification datasets, e.g. :class:`~torchvision.datasets.Kinetics`, return a three-tuple containing a :class:`torch.Tensor` for the video and audio and a :class:`int` as label. This wrapper wraps the video into a - :class:`~torchvision.datapoints.Video` while leaving the other items as is. + :class:`~torchvision.vision_tensors.Video` while leaving the other items as is. .. note:: @@ -98,12 +98,14 @@ def wrap_dataset_for_transforms_v2(dataset, target_keys=None): ) # Imagine we have isinstance(dataset, datasets.ImageNet). This will create a new class with the name - # "WrappedImageNet" at runtime that doubly inherits from VisionDatasetDatapointWrapper (see below) as well as the + # "WrappedImageNet" at runtime that doubly inherits from VisionDatasetVisionTensorWrapper (see below) as well as the # original ImageNet class. This allows the user to do regular isinstance(wrapped_dataset, datasets.ImageNet) checks, # while we can still inject everything that we need. - wrapped_dataset_cls = type(f"Wrapped{type(dataset).__name__}", (VisionDatasetDatapointWrapper, type(dataset)), {}) - # Since VisionDatasetDatapointWrapper comes before ImageNet in the MRO, calling the class hits - # VisionDatasetDatapointWrapper.__init__ first. Since we are never doing super().__init__(...), the constructor of + wrapped_dataset_cls = type( + f"Wrapped{type(dataset).__name__}", (VisionDatasetVisionTensorWrapper, type(dataset)), {} + ) + # Since VisionDatasetVisionTensorWrapper comes before ImageNet in the MRO, calling the class hits + # VisionDatasetVisionTensorWrapper.__init__ first. Since we are never doing super().__init__(...), the constructor of # ImageNet is never hit. That is by design, since we don't want to create the dataset instance again, but rather # have the existing instance as attribute on the new object. 
return wrapped_dataset_cls(dataset, target_keys) @@ -125,7 +127,7 @@ def decorator(wrapper_factory): WRAPPER_FACTORIES = WrapperFactories() -class VisionDatasetDatapointWrapper: +class VisionDatasetVisionTensorWrapper: def __init__(self, dataset, target_keys): dataset_cls = type(dataset) @@ -134,7 +136,7 @@ def __init__(self, dataset, target_keys): f"This wrapper is meant for subclasses of `torchvision.datasets.VisionDataset`, " f"but got a '{dataset_cls.__name__}' instead.\n" f"For an example of how to perform the wrapping for custom datasets, see\n\n" - "https://pytorch.org/vision/main/auto_examples/plot_datapoints.html#do-i-have-to-wrap-the-output-of-the-datasets-myself" + "https://pytorch.org/vision/main/auto_examples/plot_vision_tensors.html#do-i-have-to-wrap-the-output-of-the-datasets-myself" ) for cls in dataset_cls.mro(): @@ -221,7 +223,7 @@ def wrapper(idx, sample): def pil_image_to_mask(pil_image): - return datapoints.Mask(pil_image) + return vision_tensors.Mask(pil_image) def parse_target_keys(target_keys, *, available, default): @@ -302,7 +304,7 @@ def video_classification_wrapper_factory(dataset, target_keys): def wrapper(idx, sample): video, audio, label = sample - video = datapoints.Video(video) + video = vision_tensors.Video(video) return video, audio, label @@ -373,16 +375,16 @@ def wrapper(idx, sample): if "boxes" in target_keys: target["boxes"] = F.convert_bounding_box_format( - datapoints.BoundingBoxes( + vision_tensors.BoundingBoxes( batched_target["bbox"], - format=datapoints.BoundingBoxFormat.XYWH, + format=vision_tensors.BoundingBoxFormat.XYWH, canvas_size=canvas_size, ), - new_format=datapoints.BoundingBoxFormat.XYXY, + new_format=vision_tensors.BoundingBoxFormat.XYXY, ) if "masks" in target_keys: - target["masks"] = datapoints.Mask( + target["masks"] = vision_tensors.Mask( torch.stack( [ segmentation_to_mask(segmentation, canvas_size=canvas_size) @@ -454,12 +456,12 @@ def wrapper(idx, sample): target = {} if "boxes" in target_keys: - target["boxes"] = datapoints.BoundingBoxes( + target["boxes"] = vision_tensors.BoundingBoxes( [ [int(bndbox[part]) for part in ("xmin", "ymin", "xmax", "ymax")] for bndbox in batched_instances["bndbox"] ], - format=datapoints.BoundingBoxFormat.XYXY, + format=vision_tensors.BoundingBoxFormat.XYXY, canvas_size=(image.height, image.width), ) @@ -494,12 +496,12 @@ def wrapper(idx, sample): target_types=dataset.target_type, type_wrappers={ "bbox": lambda item: F.convert_bounding_box_format( - datapoints.BoundingBoxes( + vision_tensors.BoundingBoxes( item, - format=datapoints.BoundingBoxFormat.XYWH, + format=vision_tensors.BoundingBoxFormat.XYWH, canvas_size=(image.height, image.width), ), - new_format=datapoints.BoundingBoxFormat.XYXY, + new_format=vision_tensors.BoundingBoxFormat.XYXY, ), }, ) @@ -544,9 +546,9 @@ def wrapper(idx, sample): target = {} if "boxes" in target_keys: - target["boxes"] = datapoints.BoundingBoxes( + target["boxes"] = vision_tensors.BoundingBoxes( batched_target["bbox"], - format=datapoints.BoundingBoxFormat.XYXY, + format=vision_tensors.BoundingBoxFormat.XYXY, canvas_size=(image.height, image.width), ) @@ -596,7 +598,7 @@ def instance_segmentation_wrapper(mask): if label >= 1_000: label //= 1_000 labels.append(label) - return dict(masks=datapoints.Mask(torch.stack(masks)), labels=torch.stack(labels)) + return dict(masks=vision_tensors.Mask(torch.stack(masks)), labels=torch.stack(labels)) def wrapper(idx, sample): image, target = sample @@ -641,10 +643,12 @@ def wrapper(idx, sample): if "bbox" in target_keys: 
target["bbox"] = F.convert_bounding_box_format( - datapoints.BoundingBoxes( - target["bbox"], format=datapoints.BoundingBoxFormat.XYWH, canvas_size=(image.height, image.width) + vision_tensors.BoundingBoxes( + target["bbox"], + format=vision_tensors.BoundingBoxFormat.XYWH, + canvas_size=(image.height, image.width), ), - new_format=datapoints.BoundingBoxFormat.XYXY, + new_format=vision_tensors.BoundingBoxFormat.XYXY, ) return image, target diff --git a/torchvision/datapoints/_image.py b/torchvision/vision_tensors/_image.py similarity index 96% rename from torchvision/datapoints/_image.py rename to torchvision/vision_tensors/_image.py index c9cc10c8d3e..f61015e01e6 100644 --- a/torchvision/datapoints/_image.py +++ b/torchvision/vision_tensors/_image.py @@ -5,10 +5,10 @@ import PIL.Image import torch -from ._datapoint import Datapoint +from ._vision_tensor import VisionTensor -class Image(Datapoint): +class Image(VisionTensor): """[BETA] :class:`torch.Tensor` subclass for images. .. note:: diff --git a/torchvision/datapoints/_mask.py b/torchvision/vision_tensors/_mask.py similarity index 95% rename from torchvision/datapoints/_mask.py rename to torchvision/vision_tensors/_mask.py index 6725ac5fed2..f89c21f9a65 100644 --- a/torchvision/datapoints/_mask.py +++ b/torchvision/vision_tensors/_mask.py @@ -5,10 +5,10 @@ import PIL.Image import torch -from ._datapoint import Datapoint +from ._vision_tensor import VisionTensor -class Mask(Datapoint): +class Mask(VisionTensor): """[BETA] :class:`torch.Tensor` subclass for segmentation and detection masks. Args: diff --git a/torchvision/datapoints/_torch_function_helpers.py b/torchvision/vision_tensors/_torch_function_helpers.py similarity index 78% rename from torchvision/datapoints/_torch_function_helpers.py rename to torchvision/vision_tensors/_torch_function_helpers.py index b35b5a1ebdc..5f47181e9e4 100644 --- a/torchvision/datapoints/_torch_function_helpers.py +++ b/torchvision/vision_tensors/_torch_function_helpers.py @@ -16,7 +16,7 @@ def __exit__(self, *args): def set_return_type(return_type: str): - """[BETA] Set the return type of torch operations on datapoints. + """[BETA] Set the return type of torch operations on vision_tensors. This only affects the behaviour of torch operations. It has no effect on ``torchvision`` transforms or functionals, which will always return as @@ -33,28 +33,28 @@ def set_return_type(return_type: str): .. code:: python - img = datapoints.Image(torch.rand(3, 5, 5)) + img = vision_tensors.Image(torch.rand(3, 5, 5)) img + 2 # This is a pure Tensor (default behaviour) - set_return_type("datapoints") + set_return_type("vision_tensors") img + 2 # This is an Image or as a context manager to restrict the scope: .. code:: python - img = datapoints.Image(torch.rand(3, 5, 5)) + img = vision_tensors.Image(torch.rand(3, 5, 5)) img + 2 # This is a pure Tensor - with set_return_type("datapoints"): + with set_return_type("vision_tensors"): img + 2 # This is an Image img + 2 # This is a pure Tensor Args: - return_type (str): Can be "datapoint" or "tensor". Default is "tensor". + return_type (str): Can be "vision_tensor" or "tensor". Default is "tensor". 
""" global _TORCHFUNCTION_SUBCLASS to_restore = _TORCHFUNCTION_SUBCLASS - _TORCHFUNCTION_SUBCLASS = {"tensor": False, "datapoint": True}[return_type.lower()] + _TORCHFUNCTION_SUBCLASS = {"tensor": False, "vision_tensor": True}[return_type.lower()] return _ReturnTypeCM(to_restore) diff --git a/torchvision/datapoints/_video.py b/torchvision/vision_tensors/_video.py similarity index 95% rename from torchvision/datapoints/_video.py rename to torchvision/vision_tensors/_video.py index b54bfc54a06..9d5f5d51f63 100644 --- a/torchvision/datapoints/_video.py +++ b/torchvision/vision_tensors/_video.py @@ -4,10 +4,10 @@ import torch -from ._datapoint import Datapoint +from ._vision_tensor import VisionTensor -class Video(Datapoint): +class Video(VisionTensor): """[BETA] :class:`torch.Tensor` subclass for videos. Args: diff --git a/torchvision/datapoints/_datapoint.py b/torchvision/vision_tensors/_vision_tensor.py similarity index 89% rename from torchvision/datapoints/_datapoint.py rename to torchvision/vision_tensors/_vision_tensor.py index 64103f5834e..bfbfeff8f95 100644 --- a/torchvision/datapoints/_datapoint.py +++ b/torchvision/vision_tensors/_vision_tensor.py @@ -6,18 +6,18 @@ from torch._C import DisableTorchFunctionSubclass from torch.types import _device, _dtype, _size -from torchvision.datapoints._torch_function_helpers import _FORCE_TORCHFUNCTION_SUBCLASS, _must_return_subclass +from torchvision.vision_tensors._torch_function_helpers import _FORCE_TORCHFUNCTION_SUBCLASS, _must_return_subclass -D = TypeVar("D", bound="Datapoint") +D = TypeVar("D", bound="VisionTensor") -class Datapoint(torch.Tensor): - """[Beta] Base class for all datapoints. +class VisionTensor(torch.Tensor): + """[Beta] Base class for all vision_tensors. You probably don't want to use this class unless you're defining your own - custom Datapoints. See - :ref:`sphx_glr_auto_examples_transforms_plot_custom_datapoints.py` for details. + custom VisionTensors. See + :ref:`sphx_glr_auto_examples_transforms_plot_custom_vision_tensors.py` for details. """ @staticmethod @@ -62,9 +62,9 @@ def __torch_function__( ``__torch_function__`` method. If one is found, it is invoked with the operator as ``func`` as well as the ``args`` and ``kwargs`` of the original call. - Why do we override this? Because the base implementation in torch.Tensor would preserve the Datapoint type + Why do we override this? Because the base implementation in torch.Tensor would preserve the VisionTensor type of the output. In our case, we want to return pure tensors instead (with a few exceptions). Refer to the - "Datapoints FAQ" gallery example for a rationale of this behaviour (TL;DR: perf + no silver bullet). + "VisionTensors FAQ" gallery example for a rationale of this behaviour (TL;DR: perf + no silver bullet). Our implementation below is very similar to the base implementation in ``torch.Tensor`` - go check it out. """ @@ -79,7 +79,7 @@ def __torch_function__( must_return_subclass = _must_return_subclass() if must_return_subclass or (func in _FORCE_TORCHFUNCTION_SUBCLASS and isinstance(args[0], cls)): # If you're wondering why we need the `isinstance(args[0], cls)` check, remove it and see what fails - # in test_to_datapoint_reference(). + # in test_to_vision_tensor_reference(). # The __torch_function__ protocol will invoke the __torch_function__ method on *all* types involved in # the computation by walking the MRO upwards. 
For example, # `out = a_pure_tensor.to(an_image)` will invoke `Image.__torch_function__` with @@ -89,7 +89,7 @@ def __torch_function__( if not must_return_subclass and isinstance(output, cls): # DisableTorchFunctionSubclass is ignored by inplace ops like `.add_(...)`, - # so for those, the output is still a Datapoint. Thus, we need to manually unwrap. + # so for those, the output is still a VisionTensor. Thus, we need to manually unwrap. return output.as_subclass(torch.Tensor) return output
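To make the return-type behaviour above concrete, here is a minimal sketch under this PR's naming. It assumes ``set_return_type`` accepts the singular ``"vision_tensor"`` string, matching the ``Args`` section and the lookup table in ``_torch_function_helpers.py``.

.. code:: python

    import torch
    from torchvision import vision_tensors

    img = vision_tensors.Image(torch.rand(3, 5, 5))

    # By default, __torch_function__ unwraps the result to a plain tensor.
    assert type(img + 2) is torch.Tensor

    # Opting back in, here scoped via the context manager.
    with vision_tensors.set_return_type("vision_tensor"):
        assert type(img + 2) is vision_tensors.Image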
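A short sketch of the ``wrap`` helper from ``torchvision/vision_tensors/__init__.py``, assuming only the renamed module from this PR; the box coordinates are arbitrary.

.. code:: python

    import torch
    from torchvision import vision_tensors

    bboxes = vision_tensors.BoundingBoxes(
        torch.tensor([[0, 10, 10, 20]]), format="XYXY", canvas_size=(32, 32)
    )

    shifted = bboxes + 2  # plain torch.Tensor: torch ops unwrap by default

    # Re-wrap into the same subclass, re-using ``format`` and ``canvas_size`` from ``like``.
    shifted = vision_tensors.wrap(shifted, like=bboxes)
    assert isinstance(shifted, vision_tensors.BoundingBoxes)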
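For the ``register_kernel`` entry point in ``_utils.py``, a sketch of registering a kernel for a custom subclass under the new naming. ``MyVisionTensor`` and the kernel name are hypothetical, and the example assumes the positional ``(functional, vision_tensor_cls)`` signature shown above and that ``horizontal_flip`` remains a v2 functional.

.. code:: python

    import torch
    from torchvision import vision_tensors
    from torchvision.transforms.v2 import functional as F


    class MyVisionTensor(vision_tensors.VisionTensor):
        pass  # hypothetical subclass; built-in classes cannot be re-registered


    @F.register_kernel(F.horizontal_flip, MyVisionTensor)
    def horizontal_flip_my_vision_tensor(inpt, *args, **kwargs):
        # Custom kernels are registered with vision_tensor_wrapper=False, so the
        # subclass instance arrives as-is and the output is returned untouched.
        out = inpt.flip(-1)  # plain tensor due to the default unwrapping
        return vision_tensors.wrap(out, like=inpt)


    my_vt = MyVisionTensor(torch.rand(3, 5, 5))
    out = F.horizontal_flip(my_vt)  # dispatches to the kernel above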
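The ``vision_tensor_wrapper=False`` registrations for ``to_dtype`` rely on ``Tensor.to()`` preserving the subclass, as the comment in ``_misc.py`` notes. A small sketch of the resulting behaviour, assuming the same API surface:

.. code:: python

    import torch
    from torchvision import vision_tensors
    from torchvision.transforms.v2 import functional as F

    bboxes = vision_tensors.BoundingBoxes(
        torch.tensor([[0, 10, 10, 20]]), format="XYXY", canvas_size=(32, 32)
    )

    out = F.to_dtype(bboxes, dtype=torch.float32)
    # No unwrap/re-wrap was needed: .to() keeps the BoundingBoxes type and its metadata.
    assert isinstance(out, vision_tensors.BoundingBoxes)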
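To complement the ``wrap_dataset_for_transforms_v2`` docstring changes, a usage sketch. It assumes the helper stays exposed as ``torchvision.datasets.wrap_dataset_for_transforms_v2`` and uses placeholder paths for a local COCO copy.

.. code:: python

    from torchvision import datasets, vision_tensors
    from torchvision.datasets import wrap_dataset_for_transforms_v2

    IMAGES_DIR = "path/to/coco/images"  # placeholder
    ANNOTATIONS_FILE = "path/to/coco/annotations.json"  # placeholder

    dataset = datasets.CocoDetection(IMAGES_DIR, ANNOTATIONS_FILE)
    dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=("boxes", "labels", "masks"))

    img, target = dataset[0]
    assert isinstance(target["boxes"], vision_tensors.BoundingBoxes)
    assert isinstance(target["masks"], vision_tensors.Mask)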
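The ``to_image`` conversion in ``_type_conversion.py`` now returns the renamed ``Image`` class; a short sketch of the NumPy path (HWC to CHW), again assuming only the renamed module:

.. code:: python

    import numpy as np
    from torchvision import vision_tensors
    from torchvision.transforms.v2 import functional as F

    arr = np.zeros((4, 4, 3), dtype=np.uint8)  # HWC, as returned by most image loaders

    img = F.to_image(arr)
    assert isinstance(img, vision_tensors.Image)
    assert tuple(img.shape) == (3, 4, 4)  # permuted to CHW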