From f61a0b9026f355474916698efaa6a0a326ce88c9 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Mon, 8 Nov 2021 10:25:00 +0100
Subject: [PATCH 01/28] add FloReader datapipe

---
 torchvision/prototype/datasets/__init__.py  |  2 +-
 torchvision/prototype/datasets/datapipes.py | 29 +++++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 torchvision/prototype/datasets/datapipes.py

diff --git a/torchvision/prototype/datasets/__init__.py b/torchvision/prototype/datasets/__init__.py
index 1945b5a5d9e..ff7d9ca9784 100644
--- a/torchvision/prototype/datasets/__init__.py
+++ b/torchvision/prototype/datasets/__init__.py
@@ -7,7 +7,7 @@
         "Note that you cannot install it with `pip install torchdata`, since this is another package."
     ) from error
 
-from . import decoder, utils
+from . import decoder, utils, datapipes
 from ._home import home
 
 # Load this last, since some parts depend on the above being loaded first
diff --git a/torchvision/prototype/datasets/datapipes.py b/torchvision/prototype/datasets/datapipes.py
new file mode 100644
index 00000000000..5159e3f10aa
--- /dev/null
+++ b/torchvision/prototype/datasets/datapipes.py
@@ -0,0 +1,29 @@
+from typing import Tuple, IO, Iterator, Union, cast
+
+import torch
+from torchdata.datapipes.iter import IterDataPipe
+
+__all__ = ["FloReader"]
+
+
+class FloReader(IterDataPipe[torch.Tensor]):
+    def __init__(self, datapipe: IterDataPipe[Tuple[str, IO]]) -> None:
+        self.datapipe = datapipe
+
+    def _read_data(self, file: IO, *, dtype: torch.dtype, count: int) -> torch.Tensor:
+        num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8
+        chunk_size = count * num_bytes_per_value
+        return torch.frombuffer(bytearray(file.read(chunk_size)), dtype=dtype)
+
+    def _read_scalar(self, file: IO, *, dtype: torch.dtype) -> Union[int, float]:
+        return self._read_data(file, dtype=dtype, count=1).item()
+
+    def __iter__(self) -> Iterator[torch.Tensor]:
+        for _, file in self.datapipe:
+            if self._read_scalar(file, dtype=torch.float32) != 202021.25:
+                raise ValueError("Magic number incorrect. Invalid .flo file")
+
+            width = cast(int, self._read_scalar(file, dtype=torch.int32))
+            height = cast(int, self._read_scalar(file, dtype=torch.int32))
+
+            yield self._read_data(file, dtype=torch.float32, count=2 * height * width).reshape((2, height, width))

From 675eaa05c1910f35679c4aee41dbf6c8bc2a2c38 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Mon, 8 Nov 2021 14:38:23 +0100
Subject: [PATCH 02/28] add NumericBinaryReader

---
 references/detection/coco_eval.py            |  8 +--
 torchvision/prototype/datasets/__init__.py   |  2 +-
 .../prototype/datasets/_builtin/mnist.py     | 40 ++++----------
 torchvision/prototype/datasets/datapipes.py  | 29 -----------
 .../prototype/datasets/utils/_internal.py    | 52 +++++++++++++++++--
 5 files changed, 64 insertions(+), 67 deletions(-)
 delete mode 100644 torchvision/prototype/datasets/datapipes.py

diff --git a/references/detection/coco_eval.py b/references/detection/coco_eval.py
index ec0709c5d91..0582435e1b3 100644
--- a/references/detection/coco_eval.py
+++ b/references/detection/coco_eval.py
@@ -181,11 +181,13 @@ def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
     eval_imgs = list(eval_imgs.flatten())
 
     coco_eval.evalImgs = eval_imgs
-    coco_eval.params.imgIds = img_ids
-    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
+    coco_eval._compute_params.imgIds = img_ids
+    coco_eval._paramsEval = copy.deepcopy(coco_eval._compute_params)
 
 
 def evaluate(imgs):
     with redirect_stdout(io.StringIO()):
         imgs.evaluate()
-    return imgs.params.imgIds, np.asarray(imgs.evalImgs).reshape(-1, len(imgs.params.areaRng), len(imgs.params.imgIds))
+    return imgs._compute_params.imgIds, np.asarray(imgs.evalImgs).reshape(
+        -1, len(imgs._compute_params.areaRng), len(imgs._compute_params.imgIds)
+    )
diff --git a/torchvision/prototype/datasets/__init__.py b/torchvision/prototype/datasets/__init__.py
index ff7d9ca9784..1945b5a5d9e 100644
--- a/torchvision/prototype/datasets/__init__.py
+++ b/torchvision/prototype/datasets/__init__.py
@@ -7,7 +7,7 @@
         "Note that you cannot install it with `pip install torchdata`, since this is another package."
     ) from error
 
-from . import decoder, utils, datapipes
+from . import decoder, utils
 from ._home import home
 
 # Load this last, since some parts depend on the above being loaded first
diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py
index 5c22521612c..1f6eaee88c6 100644
--- a/torchvision/prototype/datasets/_builtin/mnist.py
+++ b/torchvision/prototype/datasets/_builtin/mnist.py
@@ -1,11 +1,9 @@
 import abc
-import codecs
 import functools
 import io
 import operator
 import pathlib
 import string
-import sys
 from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, cast
 
 import torch
@@ -30,6 +28,7 @@
     image_buffer_from_array,
     Decompressor,
     INFINITE_BUFFER_SIZE,
+    NumericBinaryReader,
 )
 from torchvision.prototype.features import Image, Label
 
@@ -56,44 +55,25 @@ def __init__(
         self.start = start
         self.stop = stop
 
-    @staticmethod
-    def _decode(input: bytes) -> int:
-        return int(codecs.encode(input, "hex"), 16)
-
-    @staticmethod
-    def _to_tensor(chunk: bytes, *, dtype: torch.dtype, shape: List[int], reverse_bytes: bool) -> torch.Tensor:
-        # As is, the chunk is not writeable, because it is read from a file and not from memory. Thus, we copy here to
-        # avoid the warning that torch.frombuffer would emit otherwise. This also enables inplace operations on the
-        # contents, which would otherwise fail.
-        chunk = bytearray(chunk)
-        if reverse_bytes:
-            chunk.reverse()
-            tensor = torch.frombuffer(chunk, dtype=dtype).flip(0)
-        else:
-            tensor = torch.frombuffer(chunk, dtype=dtype)
-        return tensor.reshape(shape)
-
     def __iter__(self) -> Iterator[torch.Tensor]:
         for _, file in self.datapipe:
-            magic = self._decode(file.read(4))
+            reader = NumericBinaryReader(file, byte_order="big")
+
+            magic = int(reader.read(torch.int32))
             dtype = self._DTYPE_MAP[magic // 256]
             ndim = magic % 256 - 1
 
-            num_samples = self._decode(file.read(4))
-            shape = [self._decode(file.read(4)) for _ in range(ndim)]
-
-            num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8
-            # The MNIST format uses the big endian byte order. If the system uses little endian byte order by default,
-            # we need to reverse the bytes before we can read them with torch.frombuffer().
-            reverse_bytes = sys.byteorder == "little" and num_bytes_per_value > 1
-            chunk_size = (cast(int, prod(shape)) if shape else 1) * num_bytes_per_value
+            num_samples = int(reader.read(torch.int32))
+            shape = cast(List[int], reader.read(torch.int32, shape=(ndim,)).tolist()) if ndim else []
 
             start = self.start or 0
             stop = min(self.stop, num_samples) if self.stop else num_samples
 
-            file.seek(start * chunk_size, 1)
+            if start:
+                reader.skip(dtype, shape=(start,))
+
             for _ in range(stop - start):
-                yield self._to_tensor(file.read(chunk_size), dtype=dtype, shape=shape, reverse_bytes=reverse_bytes)
+                yield reader.read(dtype, shape=shape)
 
 
 class _MNISTBase(Dataset):
diff --git a/torchvision/prototype/datasets/datapipes.py b/torchvision/prototype/datasets/datapipes.py
deleted file mode 100644
index 5159e3f10aa..00000000000
--- a/torchvision/prototype/datasets/datapipes.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from typing import Tuple, IO, Iterator, Union, cast
-
-import torch
-from torchdata.datapipes.iter import IterDataPipe
-
-__all__ = ["FloReader"]
-
-
-class FloReader(IterDataPipe[torch.Tensor]):
-    def __init__(self, datapipe: IterDataPipe[Tuple[str, IO]]) -> None:
-        self.datapipe = datapipe
-
-    def _read_data(self, file: IO, *, dtype: torch.dtype, count: int) -> torch.Tensor:
-        num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8
-        chunk_size = count * num_bytes_per_value
-        return torch.frombuffer(bytearray(file.read(chunk_size)), dtype=dtype)
-
-    def _read_scalar(self, file: IO, *, dtype: torch.dtype) -> Union[int, float]:
-        return self._read_data(file, dtype=dtype, count=1).item()
-
-    def __iter__(self) -> Iterator[torch.Tensor]:
-        for _, file in self.datapipe:
-            if self._read_scalar(file, dtype=torch.float32) != 202021.25:
-                raise ValueError("Magic number incorrect. Invalid .flo file")
-
-            width = cast(int, self._read_scalar(file, dtype=torch.int32))
-            height = cast(int, self._read_scalar(file, dtype=torch.int32))
-
-            yield self._read_data(file, dtype=torch.float32, count=2 * height * width).reshape((2, height, width))
diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py
index 2c48c4414e3..65f8adb835d 100644
--- a/torchvision/prototype/datasets/utils/_internal.py
+++ b/torchvision/prototype/datasets/utils/_internal.py
@@ -1,33 +1,34 @@
 import csv
 import enum
+import functools
 import gzip
 import io
 import lzma
+import operator
 import os
 import os.path
 import pathlib
 import pickle
+import sys
 import textwrap
 from typing import (
     Sequence,
     Callable,
-    Union,
     Any,
-    Tuple,
     TypeVar,
-    Iterator,
     Dict,
     Optional,
     NoReturn,
-    IO,
     Iterable,
     Mapping,
     Sized,
 )
+from typing import Tuple, IO, Iterator, Union
 from typing import cast
 
 import numpy as np
 import PIL.Image
+import torch
 import torch.distributed as dist
 import torch.utils.data
 from torch.utils.data import IterDataPipe
@@ -51,6 +52,7 @@
     "path_accessor",
     "path_comparator",
     "Decompressor",
+    "read_flo",
 ]
 
 K = TypeVar("K")
@@ -335,3 +337,45 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe:
     # dp = dp.cycle(2)
     dp = TakerDataPipe(dp, dataset_size)
     return dp
+
+
+prod = functools.partial(functools.reduce, operator.mul)
+
+
+class NumericBinaryReader:
+    def __init__(self, file: IO, *, byte_order: str = sys.byteorder) -> None:
+        self._file = file
+        self._reverse = byte_order != sys.byteorder
+
+    def _compute_params(self, dtype: torch.dtype, shape: Sequence[int]) -> Tuple[int, bool]:
+        num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8
+        num_values = prod(shape) if shape else 1
+        chunk_size = num_bytes_per_value * num_values
+        reverse = num_bytes_per_value > 1 and self._reverse
+        return chunk_size, reverse
+
+    def read(self, dtype: torch.dtype, *, shape: Sequence[int] = ()) -> torch.Tensor:
+        chunk_size, reverse = self._compute_params(dtype, shape)
+        # As is, the chunk we read is not writeable, because it is read from a file and not from memory. Thus, we copy
+        # here to a bytearray in order to avoid the warning that torch.frombuffer would emit otherwise. This also
+        # enables inplace operations on the contents, which would otherwise fail.
+        chunk = bytearray(self._file.read(chunk_size))
+        if reverse:
+            chunk.reverse()
+            tensor = torch.frombuffer(chunk, dtype=dtype).flip(0)
+        else:
+            tensor = torch.frombuffer(chunk, dtype=dtype)
+        return tensor.reshape(tuple(shape))
+
+    def skip(self, dtype: torch.dtype, *, shape: Sequence[int] = ()) -> None:
+        chunk_size, _ = self._compute_params(dtype, shape)
+        self._file.seek(chunk_size, 1)
+
+
+def read_flo(file: IO) -> torch.Tensor:
+    if file.read(4) != b"PIEH":
+        raise ValueError("Magic number incorrect.
Invalid .flo file") + + reader = NumericBinaryReader(file, byte_order="little") + width, height = reader.read(torch.int32, shape=(2,)).tolist() + return reader.read(torch.float32, shape=(height, width, 2)).permute((2, 0, 1)) From 05e934fa1f6e366e66b94447c8a16750e5155d80 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 8 Nov 2021 14:42:51 +0100 Subject: [PATCH 03/28] revert unrelated change --- references/detection/coco_eval.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/references/detection/coco_eval.py b/references/detection/coco_eval.py index 0582435e1b3..ec0709c5d91 100644 --- a/references/detection/coco_eval.py +++ b/references/detection/coco_eval.py @@ -181,13 +181,11 @@ def create_common_coco_eval(coco_eval, img_ids, eval_imgs): eval_imgs = list(eval_imgs.flatten()) coco_eval.evalImgs = eval_imgs - coco_eval._compute_params.imgIds = img_ids - coco_eval._paramsEval = copy.deepcopy(coco_eval._compute_params) + coco_eval.params.imgIds = img_ids + coco_eval._paramsEval = copy.deepcopy(coco_eval.params) def evaluate(imgs): with redirect_stdout(io.StringIO()): imgs.evaluate() - return imgs._compute_params.imgIds, np.asarray(imgs.evalImgs).reshape( - -1, len(imgs._compute_params.areaRng), len(imgs._compute_params.imgIds) - ) + return imgs.params.imgIds, np.asarray(imgs.evalImgs).reshape(-1, len(imgs.params.areaRng), len(imgs.params.imgIds)) From 3a2d8126dfd493eb2ce3583ba2c389253f1f4ef5 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 8 Nov 2021 14:45:23 +0100 Subject: [PATCH 04/28] cleanup --- torchvision/prototype/datasets/_builtin/mnist.py | 3 --- torchvision/prototype/datasets/utils/_internal.py | 7 +++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index 1f6eaee88c6..fd7b9f2f168 100644 --- a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -1,7 +1,6 @@ import abc import functools import io -import operator import pathlib import string from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, cast @@ -35,8 +34,6 @@ __all__ = ["MNIST", "FashionMNIST", "KMNIST", "EMNIST", "QMNIST"] -prod = functools.partial(functools.reduce, operator.mul) - class MNISTFileReader(IterDataPipe[torch.Tensor]): _DTYPE_MAP = { diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 65f8adb835d..786eb530bd2 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -14,17 +14,20 @@ from typing import ( Sequence, Callable, + Union, Any, + Tuple, TypeVar, + Iterator, Dict, Optional, NoReturn, + IO, Iterable, Mapping, Sized, + cast, ) -from typing import Tuple, IO, Iterator, Union -from typing import cast import numpy as np import PIL.Image From 2d7111dc19fb5bb04cf2f5776fd53c8221d0a899 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 8 Nov 2021 14:46:56 +0100 Subject: [PATCH 05/28] cleanup --- torchvision/prototype/datasets/utils/_internal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 786eb530bd2..f38b1d20581 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -26,8 +26,8 @@ Iterable, Mapping, Sized, - cast, ) +from typing import cast import numpy as np 
import PIL.Image From f984983ac75f88de2e77b68b589679cc92d35a9c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 8 Nov 2021 15:16:45 +0100 Subject: [PATCH 06/28] add comment for byte reversal --- torchvision/prototype/datasets/utils/_internal.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index f38b1d20581..220d1675f89 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -348,6 +348,8 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe: class NumericBinaryReader: def __init__(self, file: IO, *, byte_order: str = sys.byteorder) -> None: self._file = file + # torch.frombuffer interprets the bytes in the same byte order as the system. Thus, if the data is stored in + # the opposite byte order, we need to reverse the bytes before feeding them to torch.frombuffer(). self._reverse = byte_order != sys.byteorder def _compute_params(self, dtype: torch.dtype, shape: Sequence[int]) -> Tuple[int, bool]: From c4b46b79f1168c9242d311d488e182b58ce60278 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 8 Nov 2021 20:54:54 +0100 Subject: [PATCH 07/28] use numpy after all --- .../prototype/datasets/_builtin/mnist.py | 24 +++++---- .../prototype/datasets/utils/_internal.py | 53 ++++++++----------- 2 files changed, 34 insertions(+), 43 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index fd7b9f2f168..95fcee5fa1d 100644 --- a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -27,13 +27,15 @@ image_buffer_from_array, Decompressor, INFINITE_BUFFER_SIZE, - NumericBinaryReader, + binary_to_tensor, ) from torchvision.prototype.features import Image, Label __all__ = ["MNIST", "FashionMNIST", "KMNIST", "EMNIST", "QMNIST"] +big_endian_binary_to_tensor = functools.partial(binary_to_tensor, byte_order="big") + class MNISTFileReader(IterDataPipe[torch.Tensor]): _DTYPE_MAP = { @@ -54,23 +56,23 @@ def __init__( def __iter__(self) -> Iterator[torch.Tensor]: for _, file in self.datapipe: - reader = NumericBinaryReader(file, byte_order="big") - - magic = int(reader.read(torch.int32)) + magic = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) dtype = self._DTYPE_MAP[magic // 256] ndim = magic % 256 - 1 - num_samples = int(reader.read(torch.int32)) - shape = cast(List[int], reader.read(torch.int32, shape=(ndim,)).tolist()) if ndim else [] + num_samples = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) + shape = ( + cast(List[int], big_endian_binary_to_tensor(file, dtype=torch.int32, shape=(ndim,)).tolist()) + if ndim + else [] + ) start = self.start or 0 stop = min(self.stop, num_samples) if self.stop else num_samples - if start: - reader.skip(dtype, shape=(start,)) - - for _ in range(stop - start): - yield reader.read(dtype, shape=shape) + yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape, skip=start) + for _ in range(stop - start - 1): + yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape) class _MNISTBase(Dataset): diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 220d1675f89..5c55607df0a 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -55,6 +55,7 @@ "path_accessor", "path_comparator", 
"Decompressor", + "binary_to_tensor", "read_flo", ] @@ -345,42 +346,30 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe: prod = functools.partial(functools.reduce, operator.mul) -class NumericBinaryReader: - def __init__(self, file: IO, *, byte_order: str = sys.byteorder) -> None: - self._file = file - # torch.frombuffer interprets the bytes in the same byte order as the system. Thus, if the data is stored in - # the opposite byte order, we need to reverse the bytes before feeding them to torch.frombuffer(). - self._reverse = byte_order != sys.byteorder - - def _compute_params(self, dtype: torch.dtype, shape: Sequence[int]) -> Tuple[int, bool]: - num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 - num_values = prod(shape) if shape else 1 - chunk_size = num_bytes_per_value * num_values - reverse = num_bytes_per_value > 1 and self._reverse - return chunk_size, reverse - - def read(self, dtype: torch.dtype, *, shape: Sequence[int] = ()) -> torch.Tensor: - chunk_size, reverse = self._compute_params(dtype, shape) - # As is, the chunk we read is not writeable, because it is read from a file and not from memory. Thus, we copy - # here to a bytearray in order to avoid the warning that torch.frombuffer would emit otherwise. This also - # enables inplace operations on the contents, which would otherwise fail. - chunk = bytearray(self._file.read(chunk_size)) - if reverse: - chunk.reverse() - tensor = torch.frombuffer(chunk, dtype=dtype).flip(0) - else: - tensor = torch.frombuffer(chunk, dtype=dtype) - return tensor.reshape(tuple(shape)) +def binary_to_tensor( + file: IO, *, dtype: torch.dtype, shape: Sequence[int] = (), byte_order: str = sys.byteorder, skip: int = 0 +) -> torch.Tensor: + byteorder = "<" if byte_order == "little" else ">" + char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u") + itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 + np_dtype = byteorder + char + str(itemsize) + + if skip: + file.seek(skip * itemsize, 1) + buffer = file.read((prod(shape) if shape else 1) * itemsize) + array = np.frombuffer(buffer, dtype=np_dtype) + + # PyTorch can only deal with with the native byte order, + # so we need to convert in case the file uses a different byte order + if byteorder != sys.byteorder: + array = array.astype(np_dtype[1:]) - def skip(self, dtype: torch.dtype, *, shape: Sequence[int] = ()) -> None: - chunk_size, _ = self._compute_params(dtype, shape) - self._file.seek(chunk_size, 1) + return torch.from_numpy(array).reshape(shape) def read_flo(file: IO) -> torch.Tensor: if file.read(4) != b"PIEH": raise ValueError("Magic number incorrect. 
Invalid .flo file") - reader = NumericBinaryReader(file, byte_order="little") - width, height = reader.read(torch.int32, shape=(2,)).tolist() - return reader.read(torch.float32, shape=(height, width, 2)).permute((2, 0, 1)) + width, height = binary_to_tensor(file, dtype=torch.int32, shape=(2,)).tolist() + return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2)).permute((2, 0, 1)) From ba362a745dc159ae9549bfa296d6838eff4b57fb Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 8 Nov 2021 21:04:48 +0100 Subject: [PATCH 08/28] appease mypy --- torchvision/prototype/datasets/utils/_internal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 5c55607df0a..025a272d9bc 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -364,7 +364,7 @@ def binary_to_tensor( if byteorder != sys.byteorder: array = array.astype(np_dtype[1:]) - return torch.from_numpy(array).reshape(shape) + return torch.from_numpy(array).reshape(tuple(shape)) def read_flo(file: IO) -> torch.Tensor: From 3bb9256d886ac5d3f93deb68ae81db7a3fdc7802 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 9 Nov 2021 14:28:07 +0100 Subject: [PATCH 09/28] use .astype() with copy=False --- torchvision/prototype/datasets/utils/_internal.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 025a272d9bc..ff31c33f714 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -357,13 +357,9 @@ def binary_to_tensor( if skip: file.seek(skip * itemsize, 1) buffer = file.read((prod(shape) if shape else 1) * itemsize) - array = np.frombuffer(buffer, dtype=np_dtype) - - # PyTorch can only deal with with the native byte order, - # so we need to convert in case the file uses a different byte order - if byteorder != sys.byteorder: - array = array.astype(np_dtype[1:]) - + # PyTorch can only deal with with the native byte order, so we need to convert to it in case the file uses a + # different one. + array = np.frombuffer(buffer, dtype=np_dtype).astype(np.dtype[1:], copy=False) return torch.from_numpy(array).reshape(tuple(shape)) @@ -371,5 +367,5 @@ def read_flo(file: IO) -> torch.Tensor: if file.read(4) != b"PIEH": raise ValueError("Magic number incorrect. 
Invalid .flo file") - width, height = binary_to_tensor(file, dtype=torch.int32, shape=(2,)).tolist() - return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2)).permute((2, 0, 1)) + width, height = binary_to_tensor(file, dtype=torch.int32, shape=(2,), byte_order="little").tolist() + return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2), byte_order="little").permute((2, 0, 1)) From 5e029a95db3316fb14c0d6d16a1d6bb25b85c72c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 9 Nov 2021 15:01:35 +0100 Subject: [PATCH 10/28] add docstring and cleanuo --- .../prototype/datasets/utils/_internal.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index ff31c33f714..fcdbde34f59 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -347,15 +347,33 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe: def binary_to_tensor( - file: IO, *, dtype: torch.dtype, shape: Sequence[int] = (), byte_order: str = sys.byteorder, skip: int = 0 + file: IO, + *, + dtype: torch.dtype, + shape: Union[int, Sequence[int]] = (), + byte_order: str = sys.byteorder, + skip: int = 0, ) -> torch.Tensor: + """Construct a tensor from a binary file. + + Args: + file (IO): Open file. + dtype (torch.dtype): Data type of the returned tensor. + shape (Union[Sequence[int], int]): Shape of the returned tensor. If `int`, the tensor will return a 1D tensor + with as many elements. Defaults to reading a single value and returns it as 0D tensor. + byte_order (str): Byte order of the data. Can be ``"little"`` or ``"big"`` endian. Defaults to the native byte + order of the system. + skip (int): Number of values to skip before values are read. + """ + if isinstance(shape, int): + shape = (shape,) + byteorder = "<" if byte_order == "little" else ">" char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u") itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 np_dtype = byteorder + char + str(itemsize) - if skip: - file.seek(skip * itemsize, 1) + file.seek(skip * itemsize, 1) buffer = file.read((prod(shape) if shape else 1) * itemsize) # PyTorch can only deal with with the native byte order, so we need to convert to it in case the file uses a # different one. @@ -367,5 +385,5 @@ def read_flo(file: IO) -> torch.Tensor: if file.read(4) != b"PIEH": raise ValueError("Magic number incorrect. 
Invalid .flo file") - width, height = binary_to_tensor(file, dtype=torch.int32, shape=(2,), byte_order="little").tolist() + width, height = binary_to_tensor(file, dtype=torch.int32, shape=2, byte_order="little").tolist() return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2), byte_order="little").permute((2, 0, 1)) From e9c5584f3afdd67e76fa8fe41985c2de5b66353c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 10 Nov 2021 10:32:21 +0100 Subject: [PATCH 11/28] reuse current _read_flo and revert MNIST changes --- torchvision/datasets/_optical_flow.py | 19 +++++-- .../prototype/datasets/_builtin/mnist.py | 45 +++++++++++----- .../prototype/datasets/utils/_internal.py | 53 ++----------------- 3 files changed, 50 insertions(+), 67 deletions(-) diff --git a/torchvision/datasets/_optical_flow.py b/torchvision/datasets/_optical_flow.py index 7c728a5af8f..cf9f77bef92 100644 --- a/torchvision/datasets/_optical_flow.py +++ b/torchvision/datasets/_optical_flow.py @@ -362,13 +362,19 @@ def _read_flow(self, file_name): return _read_pfm(file_name) -def _read_flo(file_name): +def _read_flo(f): """Read .flo file in Middlebury format""" - # Code adapted from: - # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy + # Code adapted from:flow-files-with-python-bytes-array-numpy # Everything needs to be in little Endian according to # https://vision.middlebury.edu/flow/code/flow-code/README.txt - with open(file_name, "rb") as f: + if isinstance(f, (str, Path)): + f = open(f, "rb") + close = True + else: + close = False + # http://stackoverflow.com/questions/28013200/reading-middlebury- + + try: magic = np.fromfile(f, "c", count=4).tobytes() if magic != b"PIEH": raise ValueError("Magic number incorrect. Invalid .flo file") @@ -376,7 +382,10 @@ def _read_flo(file_name): w = int(np.fromfile(f, " int: + return int(codecs.encode(input, "hex"), 16) + + @staticmethod + def _to_tensor(chunk: bytes, *, dtype: torch.dtype, shape: List[int], reverse_bytes: bool) -> torch.Tensor: + # As is, the chunk is not writeable, because it is read from a file and not from memory. Thus, we copy here to + # avoid the warning that torch.frombuffer would emit otherwise. This also enables inplace operations on the + # contents, which would otherwise fail. + chunk = bytearray(chunk) + if reverse_bytes: + chunk.reverse() + tensor = torch.frombuffer(chunk, dtype=dtype).flip(0) + else: + tensor = torch.frombuffer(chunk, dtype=dtype) + return tensor.reshape(shape) + def __iter__(self) -> Iterator[torch.Tensor]: for _, file in self.datapipe: - magic = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) + magic = self._decode(file.read(4)) dtype = self._DTYPE_MAP[magic // 256] ndim = magic % 256 - 1 - num_samples = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) - shape = ( - cast(List[int], big_endian_binary_to_tensor(file, dtype=torch.int32, shape=(ndim,)).tolist()) - if ndim - else [] - ) + num_samples = self._decode(file.read(4)) + shape = [self._decode(file.read(4)) for _ in range(ndim)] + + num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 + # The MNIST format uses the big endian byte order. If the system uses little endian byte order by default, + # we need to reverse the bytes before we can read them with torch.frombuffer(). 
+ reverse_bytes = sys.byteorder == "little" and num_bytes_per_value > 1 + chunk_size = (cast(int, prod(shape)) if shape else 1) * num_bytes_per_value start = self.start or 0 stop = min(self.stop, num_samples) if self.stop else num_samples - yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape, skip=start) - for _ in range(stop - start - 1): - yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape) + file.seek(start * chunk_size, 1) + for _ in range(stop - start): + yield self._to_tensor(file.read(chunk_size), dtype=dtype, shape=shape, reverse_bytes=reverse_bytes) class _MNISTBase(Dataset): diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index fcdbde34f59..aee1952af12 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -1,15 +1,12 @@ import csv import enum -import functools import gzip import io import lzma -import operator import os import os.path import pathlib import pickle -import sys import textwrap from typing import ( Sequence, @@ -31,12 +28,12 @@ import numpy as np import PIL.Image -import torch import torch.distributed as dist import torch.utils.data from torch.utils.data import IterDataPipe from torchdata.datapipes.iter import IoPathFileLister, IoPathFileLoader from torchdata.datapipes.utils import StreamWrapper +from torchvision.datasets._optical_flow import _read_flo __all__ = [ @@ -55,8 +52,6 @@ "path_accessor", "path_comparator", "Decompressor", - "binary_to_tensor", - "read_flo", ] K = TypeVar("K") @@ -343,47 +338,5 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe: return dp -prod = functools.partial(functools.reduce, operator.mul) - - -def binary_to_tensor( - file: IO, - *, - dtype: torch.dtype, - shape: Union[int, Sequence[int]] = (), - byte_order: str = sys.byteorder, - skip: int = 0, -) -> torch.Tensor: - """Construct a tensor from a binary file. - - Args: - file (IO): Open file. - dtype (torch.dtype): Data type of the returned tensor. - shape (Union[Sequence[int], int]): Shape of the returned tensor. If `int`, the tensor will return a 1D tensor - with as many elements. Defaults to reading a single value and returns it as 0D tensor. - byte_order (str): Byte order of the data. Can be ``"little"`` or ``"big"`` endian. Defaults to the native byte - order of the system. - skip (int): Number of values to skip before values are read. - """ - if isinstance(shape, int): - shape = (shape,) - - byteorder = "<" if byte_order == "little" else ">" - char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u") - itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 - np_dtype = byteorder + char + str(itemsize) - - file.seek(skip * itemsize, 1) - buffer = file.read((prod(shape) if shape else 1) * itemsize) - # PyTorch can only deal with with the native byte order, so we need to convert to it in case the file uses a - # different one. - array = np.frombuffer(buffer, dtype=np_dtype).astype(np.dtype[1:], copy=False) - return torch.from_numpy(array).reshape(tuple(shape)) - - -def read_flo(file: IO) -> torch.Tensor: - if file.read(4) != b"PIEH": - raise ValueError("Magic number incorrect. 
Invalid .flo file") - - width, height = binary_to_tensor(file, dtype=torch.int32, shape=2, byte_order="little").tolist() - return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2), byte_order="little").permute((2, 0, 1)) +def read_flo(file: IO): + return torch.from_numpy(_read_flo(file)) From fa4fafb2596b997204b544e00c88413aa29ab272 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 10 Nov 2021 10:33:51 +0100 Subject: [PATCH 12/28] cleanup --- torchvision/datasets/_optical_flow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/datasets/_optical_flow.py b/torchvision/datasets/_optical_flow.py index cf9f77bef92..34be9f5061e 100644 --- a/torchvision/datasets/_optical_flow.py +++ b/torchvision/datasets/_optical_flow.py @@ -364,7 +364,8 @@ def _read_flow(self, file_name): def _read_flo(f): """Read .flo file in Middlebury format""" - # Code adapted from:flow-files-with-python-bytes-array-numpy + # Code adapted from: + # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy # Everything needs to be in little Endian according to # https://vision.middlebury.edu/flow/code/flow-code/README.txt if isinstance(f, (str, Path)): @@ -372,7 +373,6 @@ def _read_flo(f): close = True else: close = False - # http://stackoverflow.com/questions/28013200/reading-middlebury- try: magic = np.fromfile(f, "c", count=4).tobytes() From 61a71a12dfc8d37a5dbdb559d4e4379095ced5ee Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Nov 2021 11:47:03 +0100 Subject: [PATCH 13/28] revert demonstration --- torchvision/datasets/_optical_flow.py | 15 ++---- .../prototype/datasets/_builtin/mnist.py | 45 +++++----------- .../prototype/datasets/utils/_internal.py | 53 +++++++++++++++++-- 3 files changed, 65 insertions(+), 48 deletions(-) diff --git a/torchvision/datasets/_optical_flow.py b/torchvision/datasets/_optical_flow.py index 34be9f5061e..7c728a5af8f 100644 --- a/torchvision/datasets/_optical_flow.py +++ b/torchvision/datasets/_optical_flow.py @@ -362,19 +362,13 @@ def _read_flow(self, file_name): return _read_pfm(file_name) -def _read_flo(f): +def _read_flo(file_name): """Read .flo file in Middlebury format""" # Code adapted from: # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy # Everything needs to be in little Endian according to # https://vision.middlebury.edu/flow/code/flow-code/README.txt - if isinstance(f, (str, Path)): - f = open(f, "rb") - close = True - else: - close = False - - try: + with open(file_name, "rb") as f: magic = np.fromfile(f, "c", count=4).tobytes() if magic != b"PIEH": raise ValueError("Magic number incorrect. Invalid .flo file") @@ -382,10 +376,7 @@ def _read_flo(f): w = int(np.fromfile(f, " int: - return int(codecs.encode(input, "hex"), 16) - - @staticmethod - def _to_tensor(chunk: bytes, *, dtype: torch.dtype, shape: List[int], reverse_bytes: bool) -> torch.Tensor: - # As is, the chunk is not writeable, because it is read from a file and not from memory. Thus, we copy here to - # avoid the warning that torch.frombuffer would emit otherwise. This also enables inplace operations on the - # contents, which would otherwise fail. 
- chunk = bytearray(chunk) - if reverse_bytes: - chunk.reverse() - tensor = torch.frombuffer(chunk, dtype=dtype).flip(0) - else: - tensor = torch.frombuffer(chunk, dtype=dtype) - return tensor.reshape(shape) - def __iter__(self) -> Iterator[torch.Tensor]: for _, file in self.datapipe: - magic = self._decode(file.read(4)) + magic = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) dtype = self._DTYPE_MAP[magic // 256] ndim = magic % 256 - 1 - num_samples = self._decode(file.read(4)) - shape = [self._decode(file.read(4)) for _ in range(ndim)] - - num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 - # The MNIST format uses the big endian byte order. If the system uses little endian byte order by default, - # we need to reverse the bytes before we can read them with torch.frombuffer(). - reverse_bytes = sys.byteorder == "little" and num_bytes_per_value > 1 - chunk_size = (cast(int, prod(shape)) if shape else 1) * num_bytes_per_value + num_samples = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) + shape = ( + cast(List[int], big_endian_binary_to_tensor(file, dtype=torch.int32, shape=(ndim,)).tolist()) + if ndim + else [] + ) start = self.start or 0 stop = min(self.stop, num_samples) if self.stop else num_samples - file.seek(start * chunk_size, 1) - for _ in range(stop - start): - yield self._to_tensor(file.read(chunk_size), dtype=dtype, shape=shape, reverse_bytes=reverse_bytes) + yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape, skip=start) + for _ in range(stop - start - 1): + yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape) class _MNISTBase(Dataset): diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index aee1952af12..fcdbde34f59 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -1,12 +1,15 @@ import csv import enum +import functools import gzip import io import lzma +import operator import os import os.path import pathlib import pickle +import sys import textwrap from typing import ( Sequence, @@ -28,12 +31,12 @@ import numpy as np import PIL.Image +import torch import torch.distributed as dist import torch.utils.data from torch.utils.data import IterDataPipe from torchdata.datapipes.iter import IoPathFileLister, IoPathFileLoader from torchdata.datapipes.utils import StreamWrapper -from torchvision.datasets._optical_flow import _read_flo __all__ = [ @@ -52,6 +55,8 @@ "path_accessor", "path_comparator", "Decompressor", + "binary_to_tensor", + "read_flo", ] K = TypeVar("K") @@ -338,5 +343,47 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe: return dp -def read_flo(file: IO): - return torch.from_numpy(_read_flo(file)) +prod = functools.partial(functools.reduce, operator.mul) + + +def binary_to_tensor( + file: IO, + *, + dtype: torch.dtype, + shape: Union[int, Sequence[int]] = (), + byte_order: str = sys.byteorder, + skip: int = 0, +) -> torch.Tensor: + """Construct a tensor from a binary file. + + Args: + file (IO): Open file. + dtype (torch.dtype): Data type of the returned tensor. + shape (Union[Sequence[int], int]): Shape of the returned tensor. If `int`, the tensor will return a 1D tensor + with as many elements. Defaults to reading a single value and returns it as 0D tensor. + byte_order (str): Byte order of the data. Can be ``"little"`` or ``"big"`` endian. Defaults to the native byte + order of the system. 
+ skip (int): Number of values to skip before values are read. + """ + if isinstance(shape, int): + shape = (shape,) + + byteorder = "<" if byte_order == "little" else ">" + char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u") + itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 + np_dtype = byteorder + char + str(itemsize) + + file.seek(skip * itemsize, 1) + buffer = file.read((prod(shape) if shape else 1) * itemsize) + # PyTorch can only deal with with the native byte order, so we need to convert to it in case the file uses a + # different one. + array = np.frombuffer(buffer, dtype=np_dtype).astype(np.dtype[1:], copy=False) + return torch.from_numpy(array).reshape(tuple(shape)) + + +def read_flo(file: IO) -> torch.Tensor: + if file.read(4) != b"PIEH": + raise ValueError("Magic number incorrect. Invalid .flo file") + + width, height = binary_to_tensor(file, dtype=torch.int32, shape=2, byte_order="little").tolist() + return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2), byte_order="little").permute((2, 0, 1)) From 68f2d953f073763eb4dacafe8ffe5be50092a89a Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Nov 2021 14:50:06 +0100 Subject: [PATCH 14/28] refactor --- .../prototype/datasets/_builtin/mnist.py | 33 ++++++----- .../prototype/datasets/utils/_internal.py | 56 +++++++++---------- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index 95fcee5fa1d..dbcc7e21e12 100644 --- a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -1,9 +1,10 @@ import abc import functools import io +import operator import pathlib import string -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, cast +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, cast, BinaryIO import torch from torchdata.datapipes.iter import ( @@ -27,14 +28,14 @@ image_buffer_from_array, Decompressor, INFINITE_BUFFER_SIZE, - binary_to_tensor, + fromfile, ) from torchvision.prototype.features import Image, Label - __all__ = ["MNIST", "FashionMNIST", "KMNIST", "EMNIST", "QMNIST"] -big_endian_binary_to_tensor = functools.partial(binary_to_tensor, byte_order="big") + +prod = functools.partial(functools.reduce, operator.mul) class MNISTFileReader(IterDataPipe[torch.Tensor]): @@ -48,7 +49,7 @@ class MNISTFileReader(IterDataPipe[torch.Tensor]): } def __init__( - self, datapipe: IterDataPipe[Tuple[Any, io.IOBase]], *, start: Optional[int], stop: Optional[int] + self, datapipe: IterDataPipe[Tuple[Any, BinaryIO]], *, start: Optional[int], stop: Optional[int] ) -> None: self.datapipe = datapipe self.start = start @@ -56,23 +57,25 @@ def __init__( def __iter__(self) -> Iterator[torch.Tensor]: for _, file in self.datapipe: - magic = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) + read = functools.partial(fromfile, file, byte_order="big", count=1) + + magic = int(read(dtype=torch.int32)) dtype = self._DTYPE_MAP[magic // 256] ndim = magic % 256 - 1 - num_samples = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) - shape = ( - cast(List[int], big_endian_binary_to_tensor(file, dtype=torch.int32, shape=(ndim,)).tolist()) - if ndim - else [] - ) + num_samples = int(read(dtype=torch.int32)) + shape = cast(List[int], read(dtype=torch.int32, count=ndim).tolist()) if ndim else [] + count = prod(shape) if shape else 1 start = self.start or 0 
stop = min(self.stop, num_samples) if self.stop else num_samples - yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape, skip=start) - for _ in range(stop - start - 1): - yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape) + if start: + num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 + file.seek(num_bytes_per_value * count * start, 1) + + for _ in range(stop - start): + yield read(dtype=dtype, count=count).reshape(shape) class _MNISTBase(Dataset): diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index da291be3d1c..0b797726b43 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -1,14 +1,12 @@ import enum -import functools import gzip import io import lzma -import operator import os import os.path import pathlib import pickle -import sys +from typing import BinaryIO from typing import ( Sequence, Callable, @@ -46,7 +44,7 @@ "path_accessor", "path_comparator", "Decompressor", - "binary_to_tensor", + "fromfile", "read_flo", ] @@ -65,6 +63,7 @@ def read_mat(buffer: io.IOBase, **kwargs: Any) -> Any: except ImportError as error: raise ModuleNotFoundError("Package `scipy` is required to be installed to read .mat files.") from error + # TODO: This can be removed as soon as https://github.com/pytorch/pytorch/pull/67718 is merged if isinstance(buffer, StreamWrapper): buffer = buffer.file_obj @@ -258,47 +257,44 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe: return dp -prod = functools.partial(functools.reduce, operator.mul) - - -def binary_to_tensor( - file: IO, +def fromfile( + file: BinaryIO, *, dtype: torch.dtype, - shape: Union[int, Sequence[int]] = (), - byte_order: str = sys.byteorder, - skip: int = 0, + byte_order: str, + count: int = -1, ) -> torch.Tensor: """Construct a tensor from a binary file. + .. note:: + + This function is similar to :func:`numpy.fromfile` with two notable differences: + + 1. This function only accepts an open binary file, but not a path to it. + 2. This function has an additional ``byte_order`` parameter, since PyTorch's ``dtype``'s do not support that + concept. + Args: - file (IO): Open file. + file (IO): Open binary file. dtype (torch.dtype): Data type of the returned tensor. - shape (Union[Sequence[int], int]): Shape of the returned tensor. If `int`, the tensor will return a 1D tensor - with as many elements. Defaults to reading a single value and returns it as 0D tensor. - byte_order (str): Byte order of the data. Can be ``"little"`` or ``"big"`` endian. Defaults to the native byte - order of the system. - skip (int): Number of values to skip before values are read. + byte_order (str): Byte order of the data. Can be ``"little"`` or ``"big"`` endian. + count (int): Number of values of the returned tensor. If ``-1`` (default), will read the complete file. """ - if isinstance(shape, int): - shape = (shape,) - byteorder = "<" if byte_order == "little" else ">" char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u") itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 np_dtype = byteorder + char + str(itemsize) - file.seek(skip * itemsize, 1) - buffer = file.read((prod(shape) if shape else 1) * itemsize) - # PyTorch can only deal with with the native byte order, so we need to convert to it in case the file uses a - # different one. 
- array = np.frombuffer(buffer, dtype=np_dtype).astype(np.dtype[1:], copy=False) - return torch.from_numpy(array).reshape(tuple(shape)) + buffer = file.read(-1 if count == -1 else count * itemsize) + # torch.frombuffer can only deal with with the native byte order, + # so we use numpy for the I/O and convert to a tensor. + return torch.from_numpy(np.frombuffer(buffer, dtype=np_dtype).astype(np_dtype[1:])) -def read_flo(file: IO) -> torch.Tensor: +def read_flo(file: BinaryIO) -> torch.Tensor: if file.read(4) != b"PIEH": raise ValueError("Magic number incorrect. Invalid .flo file") - width, height = binary_to_tensor(file, dtype=torch.int32, shape=2, byte_order="little").tolist() - return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2), byte_order="little").permute((2, 0, 1)) + width, height = fromfile(file, dtype=torch.int32, byte_order="little", count=2) + flow = fromfile(file, dtype=torch.float32, byte_order="little", count=height * width * 2) + return flow.reshape((height, width, 2)).permute((2, 0, 1)) From a3823ba8cc6c4511b739617ecec06fe1871c385d Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Nov 2021 15:39:11 +0100 Subject: [PATCH 15/28] cleanup --- torchvision/prototype/datasets/_builtin/mnist.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index dbcc7e21e12..9ff84b447e6 100644 --- a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -57,13 +57,13 @@ def __init__( def __iter__(self) -> Iterator[torch.Tensor]: for _, file in self.datapipe: - read = functools.partial(fromfile, file, byte_order="big", count=1) + read = functools.partial(fromfile, file, byte_order="big") - magic = int(read(dtype=torch.int32)) + magic = int(read(dtype=torch.int32, count=1)) dtype = self._DTYPE_MAP[magic // 256] ndim = magic % 256 - 1 - num_samples = int(read(dtype=torch.int32)) + num_samples = int(read(dtype=torch.int32, count=1)) shape = cast(List[int], read(dtype=torch.int32, count=ndim).tolist()) if ndim else [] count = prod(shape) if shape else 1 From de865cf02108fd3e34014e9602063eba004b3abc Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 08:15:10 +0100 Subject: [PATCH 16/28] add support for mutable memory --- torchvision/prototype/datasets/utils/_internal.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 0b797726b43..050056201e9 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -2,6 +2,7 @@ import gzip import io import lzma +import mmap import os import os.path import pathlib @@ -285,10 +286,14 @@ def fromfile( itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 np_dtype = byteorder + char + str(itemsize) - buffer = file.read(-1 if count == -1 else count * itemsize) - # torch.frombuffer can only deal with with the native byte order, - # so we use numpy for the I/O and convert to a tensor. 
- return torch.from_numpy(np.frombuffer(buffer, dtype=np_dtype).astype(np_dtype[1:])) + chunk_size = count * itemsize + try: + buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :] + file.seek(*(0, io.SEEK_END) if count == -1 else (chunk_size, io.SEEK_CUR)) + except PermissionError: + buffer = bytearray(file.read(-1 if count == -1 else chunk_size)) + + return torch.from_numpy(np.frombuffer(buffer, dtype=np_dtype, count=count).astype(np_dtype[1:], copy=False)) def read_flo(file: BinaryIO) -> torch.Tensor: From c3fd44513cbe16babdb2934e82ca29f017f32cd2 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 08:15:39 +0100 Subject: [PATCH 17/28] add test --- test/test_prototype_datasets_utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 test/test_prototype_datasets_utils.py diff --git a/test/test_prototype_datasets_utils.py b/test/test_prototype_datasets_utils.py new file mode 100644 index 00000000000..8ffffb21878 --- /dev/null +++ b/test/test_prototype_datasets_utils.py @@ -0,0 +1,19 @@ +import pytest +import torch +from datasets_utils import make_fake_flo_file +from torchvision.datasets._optical_flow import _read_flo as read_flo_ref +from torchvision.prototype.datasets.utils._internal import read_flo + + +@pytest.mark.parametrize("mode", ("rb", "r+b")) +def test_read_flo(tmpdir, mode): + path = tmpdir / "test.flo" + height, width = torch.randint(3, 10, (2,)) + make_fake_flo_file(height, width, path) + + with open(path, mode) as file: + actual = read_flo(file) + + expected = torch.from_numpy(read_flo_ref(path).astype("f4", copy=False)) + + torch.testing.assert_close(actual, expected) From 7c3a33f94d6c1a51820e29f963fcd7f899186a61 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 08:34:26 +0100 Subject: [PATCH 18/28] add comments --- .../prototype/datasets/utils/_internal.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 050056201e9..571879569d2 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -277,8 +277,8 @@ def fromfile( Args: file (IO): Open binary file. - dtype (torch.dtype): Data type of the returned tensor. - byte_order (str): Byte order of the data. Can be ``"little"`` or ``"big"`` endian. + dtype (torch.dtype): Data type of the underlying data. + byte_order (str): Byte order of the data. Can be "little" or "big" endian. count (int): Number of values of the returned tensor. If ``-1`` (default), will read the complete file. """ byteorder = "<" if byte_order == "little" else ">" @@ -286,13 +286,19 @@ def fromfile( itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 np_dtype = byteorder + char + str(itemsize) - chunk_size = count * itemsize + # PyTorch does not support tensors with underlying read-only memory. If the file was opened for updating, i.e. + # 'r+b' or 'w+b', the memory is already writable. Otherwise we need to copy it to a mutable location after reading. try: buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :] - file.seek(*(0, io.SEEK_END) if count == -1 else (chunk_size, io.SEEK_CUR)) + # Reading from the memoryview does not advance the file cursor, so we have to do it manually. 
+ file.seek(*(0, io.SEEK_END) if count == -1 else (count * itemsize, io.SEEK_CUR)) except PermissionError: - buffer = bytearray(file.read(-1 if count == -1 else chunk_size)) + # A plain file.read() will give a read-only bytes, so we convert it to bytearray to make it mutable + buffer = bytearray(file.read(-1 if count == -1 else count * itemsize)) + # We cannot use torch.frombuffer() directly, since it only supports the native byte order of the system. Thus, we + # read the data with np.frombuffer() with the correct byte order and convert it to the native one with the + # successive .astype() call. return torch.from_numpy(np.frombuffer(buffer, dtype=np_dtype, count=count).astype(np_dtype[1:], copy=False)) From aa780fd472ef7eb818d1f7bb7bf719db9f065b95 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 08:45:45 +0100 Subject: [PATCH 19/28] catch more exceptions --- torchvision/prototype/datasets/utils/_internal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 571879569d2..a5a908ef182 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -292,7 +292,7 @@ def fromfile( buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :] # Reading from the memoryview does not advance the file cursor, so we have to do it manually. file.seek(*(0, io.SEEK_END) if count == -1 else (count * itemsize, io.SEEK_CUR)) - except PermissionError: + except (PermissionError, io.UnsupportedOperation): # A plain file.read() will give a read-only bytes, so we convert it to bytearray to make it mutable buffer = bytearray(file.read(-1 if count == -1 else count * itemsize)) From e9031af69bd61c794718afc0d6548217e8d23180 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 08:47:40 +0100 Subject: [PATCH 20/28] fix mypy --- torchvision/prototype/datasets/utils/_internal.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index a5a908ef182..1714ab5ae8c 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -288,6 +288,7 @@ def fromfile( # PyTorch does not support tensors with underlying read-only memory. If the file was opened for updating, i.e. # 'r+b' or 'w+b', the memory is already writable. Otherwise we need to copy it to a mutable location after reading. + buffer: Union[memoryview, bytearray] try: buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :] # Reading from the memoryview does not advance the file cursor, so we have to do it manually. From 1d55fc05133d360e652555f1804730505a5383bf Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 11:46:50 +0100 Subject: [PATCH 21/28] fix variable names --- torchvision/prototype/datasets/utils/_internal.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 1714ab5ae8c..ac97e235346 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -281,10 +281,10 @@ def fromfile( byte_order (str): Byte order of the data. Can be "little" or "big" endian. count (int): Number of values of the returned tensor. If ``-1`` (default), will read the complete file. 
""" - byteorder = "<" if byte_order == "little" else ">" + byte_order = "<" if byte_order == "little" else ">" char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u") - itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 - np_dtype = byteorder + char + str(itemsize) + item_size = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 + np_dtype = byte_order + char + str(item_size) # PyTorch does not support tensors with underlying read-only memory. If the file was opened for updating, i.e. # 'r+b' or 'w+b', the memory is already writable. Otherwise we need to copy it to a mutable location after reading. @@ -292,10 +292,10 @@ def fromfile( try: buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :] # Reading from the memoryview does not advance the file cursor, so we have to do it manually. - file.seek(*(0, io.SEEK_END) if count == -1 else (count * itemsize, io.SEEK_CUR)) + file.seek(*(0, io.SEEK_END) if count == -1 else (count * item_size, io.SEEK_CUR)) except (PermissionError, io.UnsupportedOperation): # A plain file.read() will give a read-only bytes, so we convert it to bytearray to make it mutable - buffer = bytearray(file.read(-1 if count == -1 else count * itemsize)) + buffer = bytearray(file.read(-1 if count == -1 else count * item_size)) # We cannot use torch.frombuffer() directly, since it only supports the native byte order of the system. Thus, we # read the data with np.frombuffer() with the correct byte order and convert it to the native one with the From 5ebb5ae28d9590204b9b244bc0b14de42c2fdd64 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 11:48:08 +0100 Subject: [PATCH 22/28] hardcode flow sizes in test --- test/test_prototype_datasets_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_prototype_datasets_utils.py b/test/test_prototype_datasets_utils.py index 8ffffb21878..3c16e14d435 100644 --- a/test/test_prototype_datasets_utils.py +++ b/test/test_prototype_datasets_utils.py @@ -8,8 +8,7 @@ @pytest.mark.parametrize("mode", ("rb", "r+b")) def test_read_flo(tmpdir, mode): path = tmpdir / "test.flo" - height, width = torch.randint(3, 10, (2,)) - make_fake_flo_file(height, width, path) + make_fake_flo_file(3, 4, path) with open(path, mode) as file: actual = read_flo(file) From ac3e4c2321bfac496c7a46067704dfdb5e64da3b Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 11:48:31 +0100 Subject: [PATCH 23/28] add fix dtype docstring --- torchvision/prototype/datasets/utils/_internal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index ac97e235346..13b63f579dd 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -277,7 +277,7 @@ def fromfile( Args: file (IO): Open binary file. - dtype (torch.dtype): Data type of the underlying data. + dtype (torch.dtype): Data type of the underlying data as well as of the returned tensor. byte_order (str): Byte order of the data. Can be "little" or "big" endian. count (int): Number of values of the returned tensor. If ``-1`` (default), will read the complete file. 
""" From 507681a06244080b492a660e603cb3a13021ec0a Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 11:57:49 +0100 Subject: [PATCH 24/28] expand comment on different reading modes --- torchvision/prototype/datasets/utils/_internal.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 13b63f579dd..56365fd704f 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -286,8 +286,12 @@ def fromfile( item_size = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 np_dtype = byte_order + char + str(item_size) - # PyTorch does not support tensors with underlying read-only memory. If the file was opened for updating, i.e. - # 'r+b' or 'w+b', the memory is already writable. Otherwise we need to copy it to a mutable location after reading. + # PyTorch does not support tensors with underlying read-only memory. In case + # - the file has a .fileno(), + # - the file was opened for updating, i.e. 'r+b' or 'w+b', + # - the file is seekable + # we can avoid copying the data for performance. Otherwise we fall back to simply .read() the data and copy it to + # a mutable location afterwards. buffer: Union[memoryview, bytearray] try: buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :] From c52c547875833aa9f6e430fff5387e9541f618e0 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 13:38:35 +0100 Subject: [PATCH 25/28] add comment about files in update mode --- torchvision/prototype/datasets/utils/_internal.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 56365fd704f..703c11dd672 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -275,6 +275,11 @@ def fromfile( 2. This function has an additional ``byte_order`` parameter, since PyTorch's ``dtype``'s do not support that concept. + .. note:: + + If the ``file`` was opened in update mode, i.e. "r+b" or "w+b", reading data is much faster. Be aware that as + long as the file is still open, inplace operations on the returned tensor will reflect back to the file. + Args: file (IO): Open binary file. dtype (torch.dtype): Data type of the underlying data as well as of the returned tensor. 
From 80e8f2596d12882468bb4b4a8cfaa51af5f49332 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 14:15:12 +0100 Subject: [PATCH 26/28] add tests for fromfile --- test/test_prototype_datasets_utils.py | 34 ++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/test/test_prototype_datasets_utils.py b/test/test_prototype_datasets_utils.py index 3c16e14d435..50cc5aceb6d 100644 --- a/test/test_prototype_datasets_utils.py +++ b/test/test_prototype_datasets_utils.py @@ -1,16 +1,44 @@ +import sys + +import numpy as np import pytest import torch from datasets_utils import make_fake_flo_file from torchvision.datasets._optical_flow import _read_flo as read_flo_ref -from torchvision.prototype.datasets.utils._internal import read_flo +from torchvision.prototype.datasets.utils._internal import read_flo, fromfile +@pytest.mark.filterwarnings("error:The given NumPy array is not writeable:UserWarning") @pytest.mark.parametrize("mode", ("rb", "r+b")) -def test_read_flo(tmpdir, mode): +@pytest.mark.parametrize( + ("np_dtype", "torch_dtype", "byte_order"), + [ + (">f4", torch.float32, "big"), + ("i8", torch.int64, "big"), + ("|u1", torch.uint8, sys.byteorder), + ], +) +def test_fromfile(tmpdir, np_dtype, torch_dtype, byte_order, mode): + path = tmpdir / "data.bin" + count = 5 + np.random.randn(count).astype(np_dtype).tofile(path) + + for count_ in (-1, count // 2): + expected = torch.from_numpy(np.fromfile(path, dtype=np_dtype, count=count_).astype(np_dtype[1:])) + + with open(path, mode) as file: + actual = fromfile(file, dtype=torch_dtype, byte_order=byte_order, count=count_) + + torch.testing.assert_close(actual, expected) + + +def test_read_flo(tmpdir): path = tmpdir / "test.flo" make_fake_flo_file(3, 4, path) - with open(path, mode) as file: + with open(path, "rb") as file: actual = read_flo(file) expected = torch.from_numpy(read_flo_ref(path).astype("f4", copy=False)) From 2bb491bb5d78ae815adc96f889ced5b5e26c8dc4 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 19 Nov 2021 09:41:03 +0100 Subject: [PATCH 27/28] cleanup --- torchvision/prototype/datasets/_builtin/mnist.py | 2 +- torchvision/prototype/datasets/utils/_internal.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index 9ff84b447e6..c242207b7d7 100644 --- a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -32,8 +32,8 @@ ) from torchvision.prototype.features import Image, Label -__all__ = ["MNIST", "FashionMNIST", "KMNIST", "EMNIST", "QMNIST"] +__all__ = ["MNIST", "FashionMNIST", "KMNIST", "EMNIST", "QMNIST"] prod = functools.partial(functools.reduce, operator.mul) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 7285cf05aa5..3db10183f68 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -65,7 +65,6 @@ def read_mat(buffer: io.IOBase, **kwargs: Any) -> Any: except ImportError as error: raise ModuleNotFoundError("Package `scipy` is required to be installed to read .mat files.") from error - # TODO: This can be removed as soon as https://github.com/pytorch/pytorch/pull/67718 is merged if isinstance(buffer, StreamWrapper): buffer = buffer.file_obj From 0969cf9a555ce1872c09f540855c20e2ae9effb3 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 19 Nov 2021 11:06:59 +0100 
Subject: [PATCH 28/28] cleanup --- test/test_prototype_datasets_utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_prototype_datasets_utils.py b/test/test_prototype_datasets_utils.py index 50cc5aceb6d..7207299c9d4 100644 --- a/test/test_prototype_datasets_utils.py +++ b/test/test_prototype_datasets_utils.py @@ -9,7 +9,6 @@ @pytest.mark.filterwarnings("error:The given NumPy array is not writeable:UserWarning") -@pytest.mark.parametrize("mode", ("rb", "r+b")) @pytest.mark.parametrize( ("np_dtype", "torch_dtype", "byte_order"), [ @@ -20,10 +19,12 @@ ("|u1", torch.uint8, sys.byteorder), ], ) -def test_fromfile(tmpdir, np_dtype, torch_dtype, byte_order, mode): +@pytest.mark.parametrize("count", (-1, 2)) +@pytest.mark.parametrize("mode", ("rb", "r+b")) +def test_fromfile(tmpdir, np_dtype, torch_dtype, byte_order, count, mode): path = tmpdir / "data.bin" - count = 5 - np.random.randn(count).astype(np_dtype).tofile(path) + rng = np.random.RandomState(0) + rng.randn(5 if count == -1 else count + 1).astype(np_dtype).tofile(path) for count_ in (-1, count // 2): expected = torch.from_numpy(np.fromfile(path, dtype=np_dtype, count=count_).astype(np_dtype[1:]))
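For reference, the round trip exercised by the parametrized test can be reproduced outside the test harness. The snippet below is a minimal sketch that assumes only numpy and torch: it writes big-endian float32 values, decodes them with an explicit byte order, and checks that converting to the native order preserves the data. When reading from an open file, the `fromfile` helper added in this series (or `fromfile_sketch` above) would take the place of the `np.frombuffer` call.

import numpy as np
import torch

# Five big-endian float32 values, the same kind of fixture the test writes with .tofile().
data = np.random.RandomState(0).randn(5).astype(">f4")
raw = data.tobytes()

# Decode with the explicit ">f4" dtype, then drop the byte-order prefix via astype
# so torch only ever sees native-order, writable memory.
decoded = torch.from_numpy(np.frombuffer(bytearray(raw), dtype=">f4").astype("f4", copy=False))

expected = torch.from_numpy(data.astype("f4"))
torch.testing.assert_close(decoded, expected)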