From f61a0b9026f355474916698efaa6a0a326ce88c9 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Mon, 8 Nov 2021 10:25:00 +0100
Subject: [PATCH 01/28] add FloReader datapipe

---
 torchvision/prototype/datasets/__init__.py  |  2 +-
 torchvision/prototype/datasets/datapipes.py | 29 +++++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 torchvision/prototype/datasets/datapipes.py

diff --git a/torchvision/prototype/datasets/__init__.py b/torchvision/prototype/datasets/__init__.py
index 1945b5a5d9e..ff7d9ca9784 100644
--- a/torchvision/prototype/datasets/__init__.py
+++ b/torchvision/prototype/datasets/__init__.py
@@ -7,7 +7,7 @@
         "Note that you cannot install it with `pip install torchdata`, since this is another package."
     ) from error
 
-from . import decoder, utils
+from . import decoder, utils, datapipes
 from ._home import home
 
 # Load this last, since some parts depend on the above being loaded first
diff --git a/torchvision/prototype/datasets/datapipes.py b/torchvision/prototype/datasets/datapipes.py
new file mode 100644
index 00000000000..5159e3f10aa
--- /dev/null
+++ b/torchvision/prototype/datasets/datapipes.py
@@ -0,0 +1,29 @@
+from typing import Tuple, IO, Iterator, Union, cast
+
+import torch
+from torchdata.datapipes.iter import IterDataPipe
+
+__all__ = ["FloReader"]
+
+
+class FloReader(IterDataPipe[torch.Tensor]):
+    def __init__(self, datapipe: IterDataPipe[Tuple[str, IO]]) -> None:
+        self.datapipe = datapipe
+
+    def _read_data(self, file: IO, *, dtype: torch.dtype, count: int) -> torch.Tensor:
+        num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8
+        chunk_size = count * num_bytes_per_value
+        return torch.frombuffer(bytearray(file.read(chunk_size)), dtype=dtype)
+
+    def _read_scalar(self, file: IO, *, dtype: torch.dtype) -> Union[int, float]:
+        return self._read_data(file, dtype=dtype, count=1).item()
+
+    def __iter__(self) -> Iterator[torch.Tensor]:
+        for _, file in self.datapipe:
+            if self._read_scalar(file, dtype=torch.float32) != 202021.25:
+                raise ValueError("Magic number incorrect. Invalid .flo file")
+
+            width = cast(int, self._read_scalar(file, dtype=torch.int32))
+            height = cast(int, self._read_scalar(file, dtype=torch.int32))
+
+            yield self._read_data(file, dtype=torch.float32, count=2 * height * width).reshape((2, height, width))

From 675eaa05c1910f35679c4aee41dbf6c8bc2a2c38 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Mon, 8 Nov 2021 14:38:23 +0100
Subject: [PATCH 02/28] add NumericBinaryReader

---
 references/detection/coco_eval.py            |  8 +--
 torchvision/prototype/datasets/__init__.py   |  2 +-
 .../prototype/datasets/_builtin/mnist.py     | 40 ++++----------
 torchvision/prototype/datasets/datapipes.py  | 29 -----------
 .../prototype/datasets/utils/_internal.py    | 52 +++++++++++++++++--
 5 files changed, 64 insertions(+), 67 deletions(-)
 delete mode 100644 torchvision/prototype/datasets/datapipes.py

diff --git a/references/detection/coco_eval.py b/references/detection/coco_eval.py
index ec0709c5d91..0582435e1b3 100644
--- a/references/detection/coco_eval.py
+++ b/references/detection/coco_eval.py
@@ -181,11 +181,13 @@ def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
     eval_imgs = list(eval_imgs.flatten())
 
     coco_eval.evalImgs = eval_imgs
-    coco_eval.params.imgIds = img_ids
-    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
+    coco_eval._compute_params.imgIds = img_ids
+    coco_eval._paramsEval = copy.deepcopy(coco_eval._compute_params)
 
 
 def evaluate(imgs):
     with redirect_stdout(io.StringIO()):
         imgs.evaluate()
-    return imgs.params.imgIds, np.asarray(imgs.evalImgs).reshape(-1, len(imgs.params.areaRng), len(imgs.params.imgIds))
+    return imgs._compute_params.imgIds, np.asarray(imgs.evalImgs).reshape(
+        -1, len(imgs._compute_params.areaRng), len(imgs._compute_params.imgIds)
+    )
diff --git a/torchvision/prototype/datasets/__init__.py b/torchvision/prototype/datasets/__init__.py
index ff7d9ca9784..1945b5a5d9e 100644
--- a/torchvision/prototype/datasets/__init__.py
+++ b/torchvision/prototype/datasets/__init__.py
@@ -7,7 +7,7 @@
         "Note that you cannot install it with `pip install torchdata`, since this is another package."
     ) from error
 
-from . import decoder, utils, datapipes
+from . import decoder, utils
 from ._home import home
 
 # Load this last, since some parts depend on the above being loaded first
diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py
index 5c22521612c..1f6eaee88c6 100644
--- a/torchvision/prototype/datasets/_builtin/mnist.py
+++ b/torchvision/prototype/datasets/_builtin/mnist.py
@@ -1,11 +1,9 @@
 import abc
-import codecs
 import functools
 import io
 import operator
 import pathlib
 import string
-import sys
 from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, cast
 
 import torch
@@ -30,6 +28,7 @@
     image_buffer_from_array,
     Decompressor,
     INFINITE_BUFFER_SIZE,
+    NumericBinaryReader,
 )
 from torchvision.prototype.features import Image, Label
 
@@ -56,44 +55,25 @@ def __init__(
         self.start = start
         self.stop = stop
 
-    @staticmethod
-    def _decode(input: bytes) -> int:
-        return int(codecs.encode(input, "hex"), 16)
-
-    @staticmethod
-    def _to_tensor(chunk: bytes, *, dtype: torch.dtype, shape: List[int], reverse_bytes: bool) -> torch.Tensor:
-        # As is, the chunk is not writeable, because it is read from a file and not from memory. Thus, we copy here to
-        # avoid the warning that torch.frombuffer would emit otherwise. This also enables inplace operations on the
-        # contents, which would otherwise fail.
-        chunk = bytearray(chunk)
-        if reverse_bytes:
-            chunk.reverse()
-            tensor = torch.frombuffer(chunk, dtype=dtype).flip(0)
-        else:
-            tensor = torch.frombuffer(chunk, dtype=dtype)
-        return tensor.reshape(shape)
-
     def __iter__(self) -> Iterator[torch.Tensor]:
         for _, file in self.datapipe:
-            magic = self._decode(file.read(4))
+            reader = NumericBinaryReader(file, byte_order="big")
+
+            magic = int(reader.read(torch.int32))
             dtype = self._DTYPE_MAP[magic // 256]
             ndim = magic % 256 - 1
 
-            num_samples = self._decode(file.read(4))
-            shape = [self._decode(file.read(4)) for _ in range(ndim)]
-
-            num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8
-            # The MNIST format uses the big endian byte order. If the system uses little endian byte order by default,
-            # we need to reverse the bytes before we can read them with torch.frombuffer().
-            reverse_bytes = sys.byteorder == "little" and num_bytes_per_value > 1
-            chunk_size = (cast(int, prod(shape)) if shape else 1) * num_bytes_per_value
+            num_samples = int(reader.read(torch.int32))
+            shape = cast(List[int], reader.read(torch.int32, shape=(ndim,)).tolist()) if ndim else []
 
             start = self.start or 0
             stop = min(self.stop, num_samples) if self.stop else num_samples
 
-            file.seek(start * chunk_size, 1)
+            if start:
+                reader.skip(dtype, shape=(start,))
+
             for _ in range(stop - start):
-                yield self._to_tensor(file.read(chunk_size), dtype=dtype, shape=shape, reverse_bytes=reverse_bytes)
+                yield reader.read(dtype, shape=shape)
 
 
 class _MNISTBase(Dataset):
diff --git a/torchvision/prototype/datasets/datapipes.py b/torchvision/prototype/datasets/datapipes.py
deleted file mode 100644
index 5159e3f10aa..00000000000
--- a/torchvision/prototype/datasets/datapipes.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from typing import Tuple, IO, Iterator, Union, cast
-
-import torch
-from torchdata.datapipes.iter import IterDataPipe
-
-__all__ = ["FloReader"]
-
-
-class FloReader(IterDataPipe[torch.Tensor]):
-    def __init__(self, datapipe: IterDataPipe[Tuple[str, IO]]) -> None:
-        self.datapipe = datapipe
-
-    def _read_data(self, file: IO, *, dtype: torch.dtype, count: int) -> torch.Tensor:
-        num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8
-        chunk_size = count * num_bytes_per_value
-        return torch.frombuffer(bytearray(file.read(chunk_size)), dtype=dtype)
-
-    def _read_scalar(self, file: IO, *, dtype: torch.dtype) -> Union[int, float]:
-        return self._read_data(file, dtype=dtype, count=1).item()
-
-    def __iter__(self) -> Iterator[torch.Tensor]:
-        for _, file in self.datapipe:
-            if self._read_scalar(file, dtype=torch.float32) != 202021.25:
-                raise ValueError("Magic number incorrect. Invalid .flo file")
-
-            width = cast(int, self._read_scalar(file, dtype=torch.int32))
-            height = cast(int, self._read_scalar(file, dtype=torch.int32))
-
-            yield self._read_data(file, dtype=torch.float32, count=2 * height * width).reshape((2, height, width))
diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py
index 2c48c4414e3..65f8adb835d 100644
--- a/torchvision/prototype/datasets/utils/_internal.py
+++ b/torchvision/prototype/datasets/utils/_internal.py
@@ -1,33 +1,34 @@
 import csv
 import enum
+import functools
 import gzip
 import io
 import lzma
+import operator
 import os
 import os.path
 import pathlib
 import pickle
+import sys
 import textwrap
 from typing import (
     Sequence,
     Callable,
-    Union,
     Any,
-    Tuple,
     TypeVar,
-    Iterator,
     Dict,
     Optional,
     NoReturn,
-    IO,
     Iterable,
     Mapping,
     Sized,
 )
+from typing import Tuple, IO, Iterator, Union
 from typing import cast
 
 import numpy as np
 import PIL.Image
+import torch
 import torch.distributed as dist
 import torch.utils.data
 from torch.utils.data import IterDataPipe
@@ -51,6 +52,7 @@
     "path_accessor",
     "path_comparator",
     "Decompressor",
+    "read_flo",
 ]
 
 K = TypeVar("K")
@@ -335,3 +337,45 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe:
     # dp = dp.cycle(2)
     dp = TakerDataPipe(dp, dataset_size)
     return dp
+
+
+prod = functools.partial(functools.reduce, operator.mul)
+
+
+class NumericBinaryReader:
+    def __init__(self, file: IO, *, byte_order: str = sys.byteorder) -> None:
+        self._file = file
+        self._reverse = byte_order != sys.byteorder
+
+    def _compute_params(self, dtype: torch.dtype, shape: Sequence[int]) -> Tuple[int, bool]:
+        num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8
+        num_values = prod(shape) if shape else 1
+        chunk_size = num_bytes_per_value * num_values
+        reverse = num_bytes_per_value > 1 and self._reverse
+        return chunk_size, reverse
+
+    def read(self, dtype: torch.dtype, *, shape: Sequence[int] = ()) -> torch.Tensor:
+        chunk_size, reverse = self._compute_params(dtype, shape)
+        # As is, the chunk we read is not writeable, because it is read from a file and not from memory. Thus, we copy
+        # here to a bytearray in order to avoid the warning that torch.frombuffer would emit otherwise. This also
+        # enables inplace operations on the contents, which would otherwise fail.
+        chunk = bytearray(self._file.read(chunk_size))
+        if reverse:
+            chunk.reverse()
+            tensor = torch.frombuffer(chunk, dtype=dtype).flip(0)
+        else:
+            tensor = torch.frombuffer(chunk, dtype=dtype)
+        return tensor.reshape(tuple(shape))
+
+    def skip(self, dtype: torch.dtype, *, shape: Sequence[int] = ()) -> None:
+        chunk_size, _ = self._compute_params(dtype, shape)
+        self._file.seek(chunk_size, 1)
+
+
+def read_flo(file: IO) -> torch.Tensor:
+    if file.read(4) != b"PIEH":
+        raise ValueError("Magic number incorrect.
Invalid .flo file") + + reader = NumericBinaryReader(file, byte_order="little") + width, height = reader.read(torch.int32, shape=(2,)).tolist() + return reader.read(torch.float32, shape=(height, width, 2)).permute((2, 0, 1)) From 05e934fa1f6e366e66b94447c8a16750e5155d80 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 8 Nov 2021 14:42:51 +0100 Subject: [PATCH 03/28] revert unrelated change --- references/detection/coco_eval.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/references/detection/coco_eval.py b/references/detection/coco_eval.py index 0582435e1b3..ec0709c5d91 100644 --- a/references/detection/coco_eval.py +++ b/references/detection/coco_eval.py @@ -181,13 +181,11 @@ def create_common_coco_eval(coco_eval, img_ids, eval_imgs): eval_imgs = list(eval_imgs.flatten()) coco_eval.evalImgs = eval_imgs - coco_eval._compute_params.imgIds = img_ids - coco_eval._paramsEval = copy.deepcopy(coco_eval._compute_params) + coco_eval.params.imgIds = img_ids + coco_eval._paramsEval = copy.deepcopy(coco_eval.params) def evaluate(imgs): with redirect_stdout(io.StringIO()): imgs.evaluate() - return imgs._compute_params.imgIds, np.asarray(imgs.evalImgs).reshape( - -1, len(imgs._compute_params.areaRng), len(imgs._compute_params.imgIds) - ) + return imgs.params.imgIds, np.asarray(imgs.evalImgs).reshape(-1, len(imgs.params.areaRng), len(imgs.params.imgIds)) From 3a2d8126dfd493eb2ce3583ba2c389253f1f4ef5 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 8 Nov 2021 14:45:23 +0100 Subject: [PATCH 04/28] cleanup --- torchvision/prototype/datasets/_builtin/mnist.py | 3 --- torchvision/prototype/datasets/utils/_internal.py | 7 +++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index 1f6eaee88c6..fd7b9f2f168 100644 --- a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -1,7 +1,6 @@ import abc import functools import io -import operator import pathlib import string from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, cast @@ -35,8 +34,6 @@ __all__ = ["MNIST", "FashionMNIST", "KMNIST", "EMNIST", "QMNIST"] -prod = functools.partial(functools.reduce, operator.mul) - class MNISTFileReader(IterDataPipe[torch.Tensor]): _DTYPE_MAP = { diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 65f8adb835d..786eb530bd2 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -14,17 +14,20 @@ from typing import ( Sequence, Callable, + Union, Any, + Tuple, TypeVar, + Iterator, Dict, Optional, NoReturn, + IO, Iterable, Mapping, Sized, + cast, ) -from typing import Tuple, IO, Iterator, Union -from typing import cast import numpy as np import PIL.Image From 2d7111dc19fb5bb04cf2f5776fd53c8221d0a899 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 8 Nov 2021 14:46:56 +0100 Subject: [PATCH 05/28] cleanup --- torchvision/prototype/datasets/utils/_internal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 786eb530bd2..f38b1d20581 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -26,8 +26,8 @@ Iterable, Mapping, Sized, - cast, ) +from typing import cast import numpy as np 
import PIL.Image From f984983ac75f88de2e77b68b589679cc92d35a9c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 8 Nov 2021 15:16:45 +0100 Subject: [PATCH 06/28] add comment for byte reversal --- torchvision/prototype/datasets/utils/_internal.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index f38b1d20581..220d1675f89 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -348,6 +348,8 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe: class NumericBinaryReader: def __init__(self, file: IO, *, byte_order: str = sys.byteorder) -> None: self._file = file + # torch.frombuffer interprets the bytes in the same byte order as the system. Thus, if the data is stored in + # the opposite byte order, we need to reverse the bytes before feeding them to torch.frombuffer(). self._reverse = byte_order != sys.byteorder def _compute_params(self, dtype: torch.dtype, shape: Sequence[int]) -> Tuple[int, bool]: From c4b46b79f1168c9242d311d488e182b58ce60278 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 8 Nov 2021 20:54:54 +0100 Subject: [PATCH 07/28] use numpy after all --- .../prototype/datasets/_builtin/mnist.py | 24 +++++---- .../prototype/datasets/utils/_internal.py | 53 ++++++++----------- 2 files changed, 34 insertions(+), 43 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index fd7b9f2f168..95fcee5fa1d 100644 --- a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -27,13 +27,15 @@ image_buffer_from_array, Decompressor, INFINITE_BUFFER_SIZE, - NumericBinaryReader, + binary_to_tensor, ) from torchvision.prototype.features import Image, Label __all__ = ["MNIST", "FashionMNIST", "KMNIST", "EMNIST", "QMNIST"] +big_endian_binary_to_tensor = functools.partial(binary_to_tensor, byte_order="big") + class MNISTFileReader(IterDataPipe[torch.Tensor]): _DTYPE_MAP = { @@ -54,23 +56,23 @@ def __init__( def __iter__(self) -> Iterator[torch.Tensor]: for _, file in self.datapipe: - reader = NumericBinaryReader(file, byte_order="big") - - magic = int(reader.read(torch.int32)) + magic = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) dtype = self._DTYPE_MAP[magic // 256] ndim = magic % 256 - 1 - num_samples = int(reader.read(torch.int32)) - shape = cast(List[int], reader.read(torch.int32, shape=(ndim,)).tolist()) if ndim else [] + num_samples = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) + shape = ( + cast(List[int], big_endian_binary_to_tensor(file, dtype=torch.int32, shape=(ndim,)).tolist()) + if ndim + else [] + ) start = self.start or 0 stop = min(self.stop, num_samples) if self.stop else num_samples - if start: - reader.skip(dtype, shape=(start,)) - - for _ in range(stop - start): - yield reader.read(dtype, shape=shape) + yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape, skip=start) + for _ in range(stop - start - 1): + yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape) class _MNISTBase(Dataset): diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 220d1675f89..5c55607df0a 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -55,6 +55,7 @@ "path_accessor", "path_comparator", 
"Decompressor", + "binary_to_tensor", "read_flo", ] @@ -345,42 +346,30 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe: prod = functools.partial(functools.reduce, operator.mul) -class NumericBinaryReader: - def __init__(self, file: IO, *, byte_order: str = sys.byteorder) -> None: - self._file = file - # torch.frombuffer interprets the bytes in the same byte order as the system. Thus, if the data is stored in - # the opposite byte order, we need to reverse the bytes before feeding them to torch.frombuffer(). - self._reverse = byte_order != sys.byteorder - - def _compute_params(self, dtype: torch.dtype, shape: Sequence[int]) -> Tuple[int, bool]: - num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 - num_values = prod(shape) if shape else 1 - chunk_size = num_bytes_per_value * num_values - reverse = num_bytes_per_value > 1 and self._reverse - return chunk_size, reverse - - def read(self, dtype: torch.dtype, *, shape: Sequence[int] = ()) -> torch.Tensor: - chunk_size, reverse = self._compute_params(dtype, shape) - # As is, the chunk we read is not writeable, because it is read from a file and not from memory. Thus, we copy - # here to a bytearray in order to avoid the warning that torch.frombuffer would emit otherwise. This also - # enables inplace operations on the contents, which would otherwise fail. - chunk = bytearray(self._file.read(chunk_size)) - if reverse: - chunk.reverse() - tensor = torch.frombuffer(chunk, dtype=dtype).flip(0) - else: - tensor = torch.frombuffer(chunk, dtype=dtype) - return tensor.reshape(tuple(shape)) +def binary_to_tensor( + file: IO, *, dtype: torch.dtype, shape: Sequence[int] = (), byte_order: str = sys.byteorder, skip: int = 0 +) -> torch.Tensor: + byteorder = "<" if byte_order == "little" else ">" + char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u") + itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 + np_dtype = byteorder + char + str(itemsize) + + if skip: + file.seek(skip * itemsize, 1) + buffer = file.read((prod(shape) if shape else 1) * itemsize) + array = np.frombuffer(buffer, dtype=np_dtype) + + # PyTorch can only deal with with the native byte order, + # so we need to convert in case the file uses a different byte order + if byteorder != sys.byteorder: + array = array.astype(np_dtype[1:]) - def skip(self, dtype: torch.dtype, *, shape: Sequence[int] = ()) -> None: - chunk_size, _ = self._compute_params(dtype, shape) - self._file.seek(chunk_size, 1) + return torch.from_numpy(array).reshape(shape) def read_flo(file: IO) -> torch.Tensor: if file.read(4) != b"PIEH": raise ValueError("Magic number incorrect. 
Invalid .flo file") - reader = NumericBinaryReader(file, byte_order="little") - width, height = reader.read(torch.int32, shape=(2,)).tolist() - return reader.read(torch.float32, shape=(height, width, 2)).permute((2, 0, 1)) + width, height = binary_to_tensor(file, dtype=torch.int32, shape=(2,)).tolist() + return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2)).permute((2, 0, 1)) From ba362a745dc159ae9549bfa296d6838eff4b57fb Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 8 Nov 2021 21:04:48 +0100 Subject: [PATCH 08/28] appease mypy --- torchvision/prototype/datasets/utils/_internal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 5c55607df0a..025a272d9bc 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -364,7 +364,7 @@ def binary_to_tensor( if byteorder != sys.byteorder: array = array.astype(np_dtype[1:]) - return torch.from_numpy(array).reshape(shape) + return torch.from_numpy(array).reshape(tuple(shape)) def read_flo(file: IO) -> torch.Tensor: From 3bb9256d886ac5d3f93deb68ae81db7a3fdc7802 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 9 Nov 2021 14:28:07 +0100 Subject: [PATCH 09/28] use .astype() with copy=False --- torchvision/prototype/datasets/utils/_internal.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 025a272d9bc..ff31c33f714 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -357,13 +357,9 @@ def binary_to_tensor( if skip: file.seek(skip * itemsize, 1) buffer = file.read((prod(shape) if shape else 1) * itemsize) - array = np.frombuffer(buffer, dtype=np_dtype) - - # PyTorch can only deal with with the native byte order, - # so we need to convert in case the file uses a different byte order - if byteorder != sys.byteorder: - array = array.astype(np_dtype[1:]) - + # PyTorch can only deal with with the native byte order, so we need to convert to it in case the file uses a + # different one. + array = np.frombuffer(buffer, dtype=np_dtype).astype(np.dtype[1:], copy=False) return torch.from_numpy(array).reshape(tuple(shape)) @@ -371,5 +367,5 @@ def read_flo(file: IO) -> torch.Tensor: if file.read(4) != b"PIEH": raise ValueError("Magic number incorrect. 
Invalid .flo file") - width, height = binary_to_tensor(file, dtype=torch.int32, shape=(2,)).tolist() - return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2)).permute((2, 0, 1)) + width, height = binary_to_tensor(file, dtype=torch.int32, shape=(2,), byte_order="little").tolist() + return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2), byte_order="little").permute((2, 0, 1)) From 5e029a95db3316fb14c0d6d16a1d6bb25b85c72c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 9 Nov 2021 15:01:35 +0100 Subject: [PATCH 10/28] add docstring and cleanuo --- .../prototype/datasets/utils/_internal.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index ff31c33f714..fcdbde34f59 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -347,15 +347,33 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe: def binary_to_tensor( - file: IO, *, dtype: torch.dtype, shape: Sequence[int] = (), byte_order: str = sys.byteorder, skip: int = 0 + file: IO, + *, + dtype: torch.dtype, + shape: Union[int, Sequence[int]] = (), + byte_order: str = sys.byteorder, + skip: int = 0, ) -> torch.Tensor: + """Construct a tensor from a binary file. + + Args: + file (IO): Open file. + dtype (torch.dtype): Data type of the returned tensor. + shape (Union[Sequence[int], int]): Shape of the returned tensor. If `int`, the tensor will return a 1D tensor + with as many elements. Defaults to reading a single value and returns it as 0D tensor. + byte_order (str): Byte order of the data. Can be ``"little"`` or ``"big"`` endian. Defaults to the native byte + order of the system. + skip (int): Number of values to skip before values are read. + """ + if isinstance(shape, int): + shape = (shape,) + byteorder = "<" if byte_order == "little" else ">" char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u") itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 np_dtype = byteorder + char + str(itemsize) - if skip: - file.seek(skip * itemsize, 1) + file.seek(skip * itemsize, 1) buffer = file.read((prod(shape) if shape else 1) * itemsize) # PyTorch can only deal with with the native byte order, so we need to convert to it in case the file uses a # different one. @@ -367,5 +385,5 @@ def read_flo(file: IO) -> torch.Tensor: if file.read(4) != b"PIEH": raise ValueError("Magic number incorrect. 
Invalid .flo file") - width, height = binary_to_tensor(file, dtype=torch.int32, shape=(2,), byte_order="little").tolist() + width, height = binary_to_tensor(file, dtype=torch.int32, shape=2, byte_order="little").tolist() return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2), byte_order="little").permute((2, 0, 1)) From e9c5584f3afdd67e76fa8fe41985c2de5b66353c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 10 Nov 2021 10:32:21 +0100 Subject: [PATCH 11/28] reuse current _read_flo and revert MNIST changes --- torchvision/datasets/_optical_flow.py | 19 +++++-- .../prototype/datasets/_builtin/mnist.py | 45 +++++++++++----- .../prototype/datasets/utils/_internal.py | 53 ++----------------- 3 files changed, 50 insertions(+), 67 deletions(-) diff --git a/torchvision/datasets/_optical_flow.py b/torchvision/datasets/_optical_flow.py index 7c728a5af8f..cf9f77bef92 100644 --- a/torchvision/datasets/_optical_flow.py +++ b/torchvision/datasets/_optical_flow.py @@ -362,13 +362,19 @@ def _read_flow(self, file_name): return _read_pfm(file_name) -def _read_flo(file_name): +def _read_flo(f): """Read .flo file in Middlebury format""" - # Code adapted from: - # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy + # Code adapted from:flow-files-with-python-bytes-array-numpy # Everything needs to be in little Endian according to # https://vision.middlebury.edu/flow/code/flow-code/README.txt - with open(file_name, "rb") as f: + if isinstance(f, (str, Path)): + f = open(f, "rb") + close = True + else: + close = False + # http://stackoverflow.com/questions/28013200/reading-middlebury- + + try: magic = np.fromfile(f, "c", count=4).tobytes() if magic != b"PIEH": raise ValueError("Magic number incorrect. Invalid .flo file") @@ -376,7 +382,10 @@ def _read_flo(file_name): w = int(np.fromfile(f, " int: + return int(codecs.encode(input, "hex"), 16) + + @staticmethod + def _to_tensor(chunk: bytes, *, dtype: torch.dtype, shape: List[int], reverse_bytes: bool) -> torch.Tensor: + # As is, the chunk is not writeable, because it is read from a file and not from memory. Thus, we copy here to + # avoid the warning that torch.frombuffer would emit otherwise. This also enables inplace operations on the + # contents, which would otherwise fail. + chunk = bytearray(chunk) + if reverse_bytes: + chunk.reverse() + tensor = torch.frombuffer(chunk, dtype=dtype).flip(0) + else: + tensor = torch.frombuffer(chunk, dtype=dtype) + return tensor.reshape(shape) + def __iter__(self) -> Iterator[torch.Tensor]: for _, file in self.datapipe: - magic = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) + magic = self._decode(file.read(4)) dtype = self._DTYPE_MAP[magic // 256] ndim = magic % 256 - 1 - num_samples = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) - shape = ( - cast(List[int], big_endian_binary_to_tensor(file, dtype=torch.int32, shape=(ndim,)).tolist()) - if ndim - else [] - ) + num_samples = self._decode(file.read(4)) + shape = [self._decode(file.read(4)) for _ in range(ndim)] + + num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 + # The MNIST format uses the big endian byte order. If the system uses little endian byte order by default, + # we need to reverse the bytes before we can read them with torch.frombuffer(). 
+ reverse_bytes = sys.byteorder == "little" and num_bytes_per_value > 1 + chunk_size = (cast(int, prod(shape)) if shape else 1) * num_bytes_per_value start = self.start or 0 stop = min(self.stop, num_samples) if self.stop else num_samples - yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape, skip=start) - for _ in range(stop - start - 1): - yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape) + file.seek(start * chunk_size, 1) + for _ in range(stop - start): + yield self._to_tensor(file.read(chunk_size), dtype=dtype, shape=shape, reverse_bytes=reverse_bytes) class _MNISTBase(Dataset): diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index fcdbde34f59..aee1952af12 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -1,15 +1,12 @@ import csv import enum -import functools import gzip import io import lzma -import operator import os import os.path import pathlib import pickle -import sys import textwrap from typing import ( Sequence, @@ -31,12 +28,12 @@ import numpy as np import PIL.Image -import torch import torch.distributed as dist import torch.utils.data from torch.utils.data import IterDataPipe from torchdata.datapipes.iter import IoPathFileLister, IoPathFileLoader from torchdata.datapipes.utils import StreamWrapper +from torchvision.datasets._optical_flow import _read_flo __all__ = [ @@ -55,8 +52,6 @@ "path_accessor", "path_comparator", "Decompressor", - "binary_to_tensor", - "read_flo", ] K = TypeVar("K") @@ -343,47 +338,5 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe: return dp -prod = functools.partial(functools.reduce, operator.mul) - - -def binary_to_tensor( - file: IO, - *, - dtype: torch.dtype, - shape: Union[int, Sequence[int]] = (), - byte_order: str = sys.byteorder, - skip: int = 0, -) -> torch.Tensor: - """Construct a tensor from a binary file. - - Args: - file (IO): Open file. - dtype (torch.dtype): Data type of the returned tensor. - shape (Union[Sequence[int], int]): Shape of the returned tensor. If `int`, the tensor will return a 1D tensor - with as many elements. Defaults to reading a single value and returns it as 0D tensor. - byte_order (str): Byte order of the data. Can be ``"little"`` or ``"big"`` endian. Defaults to the native byte - order of the system. - skip (int): Number of values to skip before values are read. - """ - if isinstance(shape, int): - shape = (shape,) - - byteorder = "<" if byte_order == "little" else ">" - char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u") - itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 - np_dtype = byteorder + char + str(itemsize) - - file.seek(skip * itemsize, 1) - buffer = file.read((prod(shape) if shape else 1) * itemsize) - # PyTorch can only deal with with the native byte order, so we need to convert to it in case the file uses a - # different one. - array = np.frombuffer(buffer, dtype=np_dtype).astype(np.dtype[1:], copy=False) - return torch.from_numpy(array).reshape(tuple(shape)) - - -def read_flo(file: IO) -> torch.Tensor: - if file.read(4) != b"PIEH": - raise ValueError("Magic number incorrect. 
Invalid .flo file") - - width, height = binary_to_tensor(file, dtype=torch.int32, shape=2, byte_order="little").tolist() - return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2), byte_order="little").permute((2, 0, 1)) +def read_flo(file: IO): + return torch.from_numpy(_read_flo(file)) From fa4fafb2596b997204b544e00c88413aa29ab272 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 10 Nov 2021 10:33:51 +0100 Subject: [PATCH 12/28] cleanup --- torchvision/datasets/_optical_flow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/datasets/_optical_flow.py b/torchvision/datasets/_optical_flow.py index cf9f77bef92..34be9f5061e 100644 --- a/torchvision/datasets/_optical_flow.py +++ b/torchvision/datasets/_optical_flow.py @@ -364,7 +364,8 @@ def _read_flow(self, file_name): def _read_flo(f): """Read .flo file in Middlebury format""" - # Code adapted from:flow-files-with-python-bytes-array-numpy + # Code adapted from: + # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy # Everything needs to be in little Endian according to # https://vision.middlebury.edu/flow/code/flow-code/README.txt if isinstance(f, (str, Path)): @@ -372,7 +373,6 @@ def _read_flo(f): close = True else: close = False - # http://stackoverflow.com/questions/28013200/reading-middlebury- try: magic = np.fromfile(f, "c", count=4).tobytes() From 61a71a12dfc8d37a5dbdb559d4e4379095ced5ee Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Nov 2021 11:47:03 +0100 Subject: [PATCH 13/28] revert demonstration --- torchvision/datasets/_optical_flow.py | 15 ++---- .../prototype/datasets/_builtin/mnist.py | 45 +++++----------- .../prototype/datasets/utils/_internal.py | 53 +++++++++++++++++-- 3 files changed, 65 insertions(+), 48 deletions(-) diff --git a/torchvision/datasets/_optical_flow.py b/torchvision/datasets/_optical_flow.py index 34be9f5061e..7c728a5af8f 100644 --- a/torchvision/datasets/_optical_flow.py +++ b/torchvision/datasets/_optical_flow.py @@ -362,19 +362,13 @@ def _read_flow(self, file_name): return _read_pfm(file_name) -def _read_flo(f): +def _read_flo(file_name): """Read .flo file in Middlebury format""" # Code adapted from: # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy # Everything needs to be in little Endian according to # https://vision.middlebury.edu/flow/code/flow-code/README.txt - if isinstance(f, (str, Path)): - f = open(f, "rb") - close = True - else: - close = False - - try: + with open(file_name, "rb") as f: magic = np.fromfile(f, "c", count=4).tobytes() if magic != b"PIEH": raise ValueError("Magic number incorrect. Invalid .flo file") @@ -382,10 +376,7 @@ def _read_flo(f): w = int(np.fromfile(f, " int: - return int(codecs.encode(input, "hex"), 16) - - @staticmethod - def _to_tensor(chunk: bytes, *, dtype: torch.dtype, shape: List[int], reverse_bytes: bool) -> torch.Tensor: - # As is, the chunk is not writeable, because it is read from a file and not from memory. Thus, we copy here to - # avoid the warning that torch.frombuffer would emit otherwise. This also enables inplace operations on the - # contents, which would otherwise fail. 
- chunk = bytearray(chunk) - if reverse_bytes: - chunk.reverse() - tensor = torch.frombuffer(chunk, dtype=dtype).flip(0) - else: - tensor = torch.frombuffer(chunk, dtype=dtype) - return tensor.reshape(shape) - def __iter__(self) -> Iterator[torch.Tensor]: for _, file in self.datapipe: - magic = self._decode(file.read(4)) + magic = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) dtype = self._DTYPE_MAP[magic // 256] ndim = magic % 256 - 1 - num_samples = self._decode(file.read(4)) - shape = [self._decode(file.read(4)) for _ in range(ndim)] - - num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 - # The MNIST format uses the big endian byte order. If the system uses little endian byte order by default, - # we need to reverse the bytes before we can read them with torch.frombuffer(). - reverse_bytes = sys.byteorder == "little" and num_bytes_per_value > 1 - chunk_size = (cast(int, prod(shape)) if shape else 1) * num_bytes_per_value + num_samples = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) + shape = ( + cast(List[int], big_endian_binary_to_tensor(file, dtype=torch.int32, shape=(ndim,)).tolist()) + if ndim + else [] + ) start = self.start or 0 stop = min(self.stop, num_samples) if self.stop else num_samples - file.seek(start * chunk_size, 1) - for _ in range(stop - start): - yield self._to_tensor(file.read(chunk_size), dtype=dtype, shape=shape, reverse_bytes=reverse_bytes) + yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape, skip=start) + for _ in range(stop - start - 1): + yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape) class _MNISTBase(Dataset): diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index aee1952af12..fcdbde34f59 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -1,12 +1,15 @@ import csv import enum +import functools import gzip import io import lzma +import operator import os import os.path import pathlib import pickle +import sys import textwrap from typing import ( Sequence, @@ -28,12 +31,12 @@ import numpy as np import PIL.Image +import torch import torch.distributed as dist import torch.utils.data from torch.utils.data import IterDataPipe from torchdata.datapipes.iter import IoPathFileLister, IoPathFileLoader from torchdata.datapipes.utils import StreamWrapper -from torchvision.datasets._optical_flow import _read_flo __all__ = [ @@ -52,6 +55,8 @@ "path_accessor", "path_comparator", "Decompressor", + "binary_to_tensor", + "read_flo", ] K = TypeVar("K") @@ -338,5 +343,47 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe: return dp -def read_flo(file: IO): - return torch.from_numpy(_read_flo(file)) +prod = functools.partial(functools.reduce, operator.mul) + + +def binary_to_tensor( + file: IO, + *, + dtype: torch.dtype, + shape: Union[int, Sequence[int]] = (), + byte_order: str = sys.byteorder, + skip: int = 0, +) -> torch.Tensor: + """Construct a tensor from a binary file. + + Args: + file (IO): Open file. + dtype (torch.dtype): Data type of the returned tensor. + shape (Union[Sequence[int], int]): Shape of the returned tensor. If `int`, the tensor will return a 1D tensor + with as many elements. Defaults to reading a single value and returns it as 0D tensor. + byte_order (str): Byte order of the data. Can be ``"little"`` or ``"big"`` endian. Defaults to the native byte + order of the system. 
+ skip (int): Number of values to skip before values are read. + """ + if isinstance(shape, int): + shape = (shape,) + + byteorder = "<" if byte_order == "little" else ">" + char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u") + itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 + np_dtype = byteorder + char + str(itemsize) + + file.seek(skip * itemsize, 1) + buffer = file.read((prod(shape) if shape else 1) * itemsize) + # PyTorch can only deal with with the native byte order, so we need to convert to it in case the file uses a + # different one. + array = np.frombuffer(buffer, dtype=np_dtype).astype(np.dtype[1:], copy=False) + return torch.from_numpy(array).reshape(tuple(shape)) + + +def read_flo(file: IO) -> torch.Tensor: + if file.read(4) != b"PIEH": + raise ValueError("Magic number incorrect. Invalid .flo file") + + width, height = binary_to_tensor(file, dtype=torch.int32, shape=2, byte_order="little").tolist() + return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2), byte_order="little").permute((2, 0, 1)) From 68f2d953f073763eb4dacafe8ffe5be50092a89a Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Nov 2021 14:50:06 +0100 Subject: [PATCH 14/28] refactor --- .../prototype/datasets/_builtin/mnist.py | 33 ++++++----- .../prototype/datasets/utils/_internal.py | 56 +++++++++---------- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index 95fcee5fa1d..dbcc7e21e12 100644 --- a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -1,9 +1,10 @@ import abc import functools import io +import operator import pathlib import string -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, cast +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, cast, BinaryIO import torch from torchdata.datapipes.iter import ( @@ -27,14 +28,14 @@ image_buffer_from_array, Decompressor, INFINITE_BUFFER_SIZE, - binary_to_tensor, + fromfile, ) from torchvision.prototype.features import Image, Label - __all__ = ["MNIST", "FashionMNIST", "KMNIST", "EMNIST", "QMNIST"] -big_endian_binary_to_tensor = functools.partial(binary_to_tensor, byte_order="big") + +prod = functools.partial(functools.reduce, operator.mul) class MNISTFileReader(IterDataPipe[torch.Tensor]): @@ -48,7 +49,7 @@ class MNISTFileReader(IterDataPipe[torch.Tensor]): } def __init__( - self, datapipe: IterDataPipe[Tuple[Any, io.IOBase]], *, start: Optional[int], stop: Optional[int] + self, datapipe: IterDataPipe[Tuple[Any, BinaryIO]], *, start: Optional[int], stop: Optional[int] ) -> None: self.datapipe = datapipe self.start = start @@ -56,23 +57,25 @@ def __init__( def __iter__(self) -> Iterator[torch.Tensor]: for _, file in self.datapipe: - magic = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) + read = functools.partial(fromfile, file, byte_order="big", count=1) + + magic = int(read(dtype=torch.int32)) dtype = self._DTYPE_MAP[magic // 256] ndim = magic % 256 - 1 - num_samples = int(big_endian_binary_to_tensor(file, dtype=torch.int32)) - shape = ( - cast(List[int], big_endian_binary_to_tensor(file, dtype=torch.int32, shape=(ndim,)).tolist()) - if ndim - else [] - ) + num_samples = int(read(dtype=torch.int32)) + shape = cast(List[int], read(dtype=torch.int32, count=ndim).tolist()) if ndim else [] + count = prod(shape) if shape else 1 start = self.start or 0 
stop = min(self.stop, num_samples) if self.stop else num_samples - yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape, skip=start) - for _ in range(stop - start - 1): - yield big_endian_binary_to_tensor(file, dtype=dtype, shape=shape) + if start: + num_bytes_per_value = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 + file.seek(num_bytes_per_value * count * start, 1) + + for _ in range(stop - start): + yield read(dtype=dtype, count=count).reshape(shape) class _MNISTBase(Dataset): diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index da291be3d1c..0b797726b43 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -1,14 +1,12 @@ import enum -import functools import gzip import io import lzma -import operator import os import os.path import pathlib import pickle -import sys +from typing import BinaryIO from typing import ( Sequence, Callable, @@ -46,7 +44,7 @@ "path_accessor", "path_comparator", "Decompressor", - "binary_to_tensor", + "fromfile", "read_flo", ] @@ -65,6 +63,7 @@ def read_mat(buffer: io.IOBase, **kwargs: Any) -> Any: except ImportError as error: raise ModuleNotFoundError("Package `scipy` is required to be installed to read .mat files.") from error + # TODO: This can be removed as soon as https://github.com/pytorch/pytorch/pull/67718 is merged if isinstance(buffer, StreamWrapper): buffer = buffer.file_obj @@ -258,47 +257,44 @@ def _make_sharded_datapipe(root: str, dataset_size: int) -> IterDataPipe: return dp -prod = functools.partial(functools.reduce, operator.mul) - - -def binary_to_tensor( - file: IO, +def fromfile( + file: BinaryIO, *, dtype: torch.dtype, - shape: Union[int, Sequence[int]] = (), - byte_order: str = sys.byteorder, - skip: int = 0, + byte_order: str, + count: int = -1, ) -> torch.Tensor: """Construct a tensor from a binary file. + .. note:: + + This function is similar to :func:`numpy.fromfile` with two notable differences: + + 1. This function only accepts an open binary file, but not a path to it. + 2. This function has an additional ``byte_order`` parameter, since PyTorch's ``dtype``'s do not support that + concept. + Args: - file (IO): Open file. + file (IO): Open binary file. dtype (torch.dtype): Data type of the returned tensor. - shape (Union[Sequence[int], int]): Shape of the returned tensor. If `int`, the tensor will return a 1D tensor - with as many elements. Defaults to reading a single value and returns it as 0D tensor. - byte_order (str): Byte order of the data. Can be ``"little"`` or ``"big"`` endian. Defaults to the native byte - order of the system. - skip (int): Number of values to skip before values are read. + byte_order (str): Byte order of the data. Can be ``"little"`` or ``"big"`` endian. + count (int): Number of values of the returned tensor. If ``-1`` (default), will read the complete file. """ - if isinstance(shape, int): - shape = (shape,) - byteorder = "<" if byte_order == "little" else ">" char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u") itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 np_dtype = byteorder + char + str(itemsize) - file.seek(skip * itemsize, 1) - buffer = file.read((prod(shape) if shape else 1) * itemsize) - # PyTorch can only deal with with the native byte order, so we need to convert to it in case the file uses a - # different one. 
- array = np.frombuffer(buffer, dtype=np_dtype).astype(np.dtype[1:], copy=False) - return torch.from_numpy(array).reshape(tuple(shape)) + buffer = file.read(-1 if count == -1 else count * itemsize) + # torch.frombuffer can only deal with with the native byte order, + # so we use numpy for the I/O and convert to a tensor. + return torch.from_numpy(np.frombuffer(buffer, dtype=np_dtype).astype(np_dtype[1:])) -def read_flo(file: IO) -> torch.Tensor: +def read_flo(file: BinaryIO) -> torch.Tensor: if file.read(4) != b"PIEH": raise ValueError("Magic number incorrect. Invalid .flo file") - width, height = binary_to_tensor(file, dtype=torch.int32, shape=2, byte_order="little").tolist() - return binary_to_tensor(file, dtype=torch.float32, shape=(height, width, 2), byte_order="little").permute((2, 0, 1)) + width, height = fromfile(file, dtype=torch.int32, byte_order="little", count=2) + flow = fromfile(file, dtype=torch.float32, byte_order="little", count=height * width * 2) + return flow.reshape((height, width, 2)).permute((2, 0, 1)) From a3823ba8cc6c4511b739617ecec06fe1871c385d Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 16 Nov 2021 15:39:11 +0100 Subject: [PATCH 15/28] cleanup --- torchvision/prototype/datasets/_builtin/mnist.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index dbcc7e21e12..9ff84b447e6 100644 --- a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -57,13 +57,13 @@ def __init__( def __iter__(self) -> Iterator[torch.Tensor]: for _, file in self.datapipe: - read = functools.partial(fromfile, file, byte_order="big", count=1) + read = functools.partial(fromfile, file, byte_order="big") - magic = int(read(dtype=torch.int32)) + magic = int(read(dtype=torch.int32, count=1)) dtype = self._DTYPE_MAP[magic // 256] ndim = magic % 256 - 1 - num_samples = int(read(dtype=torch.int32)) + num_samples = int(read(dtype=torch.int32, count=1)) shape = cast(List[int], read(dtype=torch.int32, count=ndim).tolist()) if ndim else [] count = prod(shape) if shape else 1 From de865cf02108fd3e34014e9602063eba004b3abc Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 08:15:10 +0100 Subject: [PATCH 16/28] add support for mutable memory --- torchvision/prototype/datasets/utils/_internal.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 0b797726b43..050056201e9 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -2,6 +2,7 @@ import gzip import io import lzma +import mmap import os import os.path import pathlib @@ -285,10 +286,14 @@ def fromfile( itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 np_dtype = byteorder + char + str(itemsize) - buffer = file.read(-1 if count == -1 else count * itemsize) - # torch.frombuffer can only deal with with the native byte order, - # so we use numpy for the I/O and convert to a tensor. 
- return torch.from_numpy(np.frombuffer(buffer, dtype=np_dtype).astype(np_dtype[1:])) + chunk_size = count * itemsize + try: + buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :] + file.seek(*(0, io.SEEK_END) if count == -1 else (chunk_size, io.SEEK_CUR)) + except PermissionError: + buffer = bytearray(file.read(-1 if count == -1 else chunk_size)) + + return torch.from_numpy(np.frombuffer(buffer, dtype=np_dtype, count=count).astype(np_dtype[1:], copy=False)) def read_flo(file: BinaryIO) -> torch.Tensor: From c3fd44513cbe16babdb2934e82ca29f017f32cd2 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 08:15:39 +0100 Subject: [PATCH 17/28] add test --- test/test_prototype_datasets_utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 test/test_prototype_datasets_utils.py diff --git a/test/test_prototype_datasets_utils.py b/test/test_prototype_datasets_utils.py new file mode 100644 index 00000000000..8ffffb21878 --- /dev/null +++ b/test/test_prototype_datasets_utils.py @@ -0,0 +1,19 @@ +import pytest +import torch +from datasets_utils import make_fake_flo_file +from torchvision.datasets._optical_flow import _read_flo as read_flo_ref +from torchvision.prototype.datasets.utils._internal import read_flo + + +@pytest.mark.parametrize("mode", ("rb", "r+b")) +def test_read_flo(tmpdir, mode): + path = tmpdir / "test.flo" + height, width = torch.randint(3, 10, (2,)) + make_fake_flo_file(height, width, path) + + with open(path, mode) as file: + actual = read_flo(file) + + expected = torch.from_numpy(read_flo_ref(path).astype("f4", copy=False)) + + torch.testing.assert_close(actual, expected) From 7c3a33f94d6c1a51820e29f963fcd7f899186a61 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 08:34:26 +0100 Subject: [PATCH 18/28] add comments --- .../prototype/datasets/utils/_internal.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 050056201e9..571879569d2 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -277,8 +277,8 @@ def fromfile( Args: file (IO): Open binary file. - dtype (torch.dtype): Data type of the returned tensor. - byte_order (str): Byte order of the data. Can be ``"little"`` or ``"big"`` endian. + dtype (torch.dtype): Data type of the underlying data. + byte_order (str): Byte order of the data. Can be "little" or "big" endian. count (int): Number of values of the returned tensor. If ``-1`` (default), will read the complete file. """ byteorder = "<" if byte_order == "little" else ">" @@ -286,13 +286,19 @@ def fromfile( itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 np_dtype = byteorder + char + str(itemsize) - chunk_size = count * itemsize + # PyTorch does not support tensors with underlying read-only memory. If the file was opened for updating, i.e. + # 'r+b' or 'w+b', the memory is already writable. Otherwise we need to copy it to a mutable location after reading. try: buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :] - file.seek(*(0, io.SEEK_END) if count == -1 else (chunk_size, io.SEEK_CUR)) + # Reading from the memoryview does not advance the file cursor, so we have to do it manually. 
+ file.seek(*(0, io.SEEK_END) if count == -1 else (count * itemsize, io.SEEK_CUR)) except PermissionError: - buffer = bytearray(file.read(-1 if count == -1 else chunk_size)) + # A plain file.read() will give a read-only bytes, so we convert it to bytearray to make it mutable + buffer = bytearray(file.read(-1 if count == -1 else count * itemsize)) + # We cannot use torch.frombuffer() directly, since it only supports the native byte order of the system. Thus, we + # read the data with np.frombuffer() with the correct byte order and convert it to the native one with the + # successive .astype() call. return torch.from_numpy(np.frombuffer(buffer, dtype=np_dtype, count=count).astype(np_dtype[1:], copy=False)) From aa780fd472ef7eb818d1f7bb7bf719db9f065b95 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 08:45:45 +0100 Subject: [PATCH 19/28] catch more exceptions --- torchvision/prototype/datasets/utils/_internal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 571879569d2..a5a908ef182 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -292,7 +292,7 @@ def fromfile( buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :] # Reading from the memoryview does not advance the file cursor, so we have to do it manually. file.seek(*(0, io.SEEK_END) if count == -1 else (count * itemsize, io.SEEK_CUR)) - except PermissionError: + except (PermissionError, io.UnsupportedOperation): # A plain file.read() will give a read-only bytes, so we convert it to bytearray to make it mutable buffer = bytearray(file.read(-1 if count == -1 else count * itemsize)) From e9031af69bd61c794718afc0d6548217e8d23180 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 08:47:40 +0100 Subject: [PATCH 20/28] fix mypy --- torchvision/prototype/datasets/utils/_internal.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index a5a908ef182..1714ab5ae8c 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -288,6 +288,7 @@ def fromfile( # PyTorch does not support tensors with underlying read-only memory. If the file was opened for updating, i.e. # 'r+b' or 'w+b', the memory is already writable. Otherwise we need to copy it to a mutable location after reading. + buffer: Union[memoryview, bytearray] try: buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :] # Reading from the memoryview does not advance the file cursor, so we have to do it manually. From 1d55fc05133d360e652555f1804730505a5383bf Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 11:46:50 +0100 Subject: [PATCH 21/28] fix variable names --- torchvision/prototype/datasets/utils/_internal.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 1714ab5ae8c..ac97e235346 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -281,10 +281,10 @@ def fromfile( byte_order (str): Byte order of the data. Can be "little" or "big" endian. count (int): Number of values of the returned tensor. If ``-1`` (default), will read the complete file. 
""" - byteorder = "<" if byte_order == "little" else ">" + byte_order = "<" if byte_order == "little" else ">" char = "f" if dtype.is_floating_point else ("i" if dtype.is_signed else "u") - itemsize = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 - np_dtype = byteorder + char + str(itemsize) + item_size = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 + np_dtype = byte_order + char + str(item_size) # PyTorch does not support tensors with underlying read-only memory. If the file was opened for updating, i.e. # 'r+b' or 'w+b', the memory is already writable. Otherwise we need to copy it to a mutable location after reading. @@ -292,10 +292,10 @@ def fromfile( try: buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :] # Reading from the memoryview does not advance the file cursor, so we have to do it manually. - file.seek(*(0, io.SEEK_END) if count == -1 else (count * itemsize, io.SEEK_CUR)) + file.seek(*(0, io.SEEK_END) if count == -1 else (count * item_size, io.SEEK_CUR)) except (PermissionError, io.UnsupportedOperation): # A plain file.read() will give a read-only bytes, so we convert it to bytearray to make it mutable - buffer = bytearray(file.read(-1 if count == -1 else count * itemsize)) + buffer = bytearray(file.read(-1 if count == -1 else count * item_size)) # We cannot use torch.frombuffer() directly, since it only supports the native byte order of the system. Thus, we # read the data with np.frombuffer() with the correct byte order and convert it to the native one with the From 5ebb5ae28d9590204b9b244bc0b14de42c2fdd64 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 11:48:08 +0100 Subject: [PATCH 22/28] hardcode flow sizes in test --- test/test_prototype_datasets_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_prototype_datasets_utils.py b/test/test_prototype_datasets_utils.py index 8ffffb21878..3c16e14d435 100644 --- a/test/test_prototype_datasets_utils.py +++ b/test/test_prototype_datasets_utils.py @@ -8,8 +8,7 @@ @pytest.mark.parametrize("mode", ("rb", "r+b")) def test_read_flo(tmpdir, mode): path = tmpdir / "test.flo" - height, width = torch.randint(3, 10, (2,)) - make_fake_flo_file(height, width, path) + make_fake_flo_file(3, 4, path) with open(path, mode) as file: actual = read_flo(file) From ac3e4c2321bfac496c7a46067704dfdb5e64da3b Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 11:48:31 +0100 Subject: [PATCH 23/28] add fix dtype docstring --- torchvision/prototype/datasets/utils/_internal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index ac97e235346..13b63f579dd 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -277,7 +277,7 @@ def fromfile( Args: file (IO): Open binary file. - dtype (torch.dtype): Data type of the underlying data. + dtype (torch.dtype): Data type of the underlying data as well as of the returned tensor. byte_order (str): Byte order of the data. Can be "little" or "big" endian. count (int): Number of values of the returned tensor. If ``-1`` (default), will read the complete file. 
""" From 507681a06244080b492a660e603cb3a13021ec0a Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 11:57:49 +0100 Subject: [PATCH 24/28] expand comment on different reading modes --- torchvision/prototype/datasets/utils/_internal.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 13b63f579dd..56365fd704f 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -286,8 +286,12 @@ def fromfile( item_size = (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits // 8 np_dtype = byte_order + char + str(item_size) - # PyTorch does not support tensors with underlying read-only memory. If the file was opened for updating, i.e. - # 'r+b' or 'w+b', the memory is already writable. Otherwise we need to copy it to a mutable location after reading. + # PyTorch does not support tensors with underlying read-only memory. In case + # - the file has a .fileno(), + # - the file was opened for updating, i.e. 'r+b' or 'w+b', + # - the file is seekable + # we can avoid copying the data for performance. Otherwise we fall back to simply .read() the data and copy it to + # a mutable location afterwards. buffer: Union[memoryview, bytearray] try: buffer = memoryview(mmap.mmap(file.fileno(), 0))[file.tell() :] From c52c547875833aa9f6e430fff5387e9541f618e0 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 13:38:35 +0100 Subject: [PATCH 25/28] add comment about files in update mode --- torchvision/prototype/datasets/utils/_internal.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 56365fd704f..703c11dd672 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -275,6 +275,11 @@ def fromfile( 2. This function has an additional ``byte_order`` parameter, since PyTorch's ``dtype``'s do not support that concept. + .. note:: + + If the ``file`` was opened in update mode, i.e. "r+b" or "w+b", reading data is much faster. Be aware that as + long as the file is still open, inplace operations on the returned tensor will reflect back to the file. + Args: file (IO): Open binary file. dtype (torch.dtype): Data type of the underlying data as well as of the returned tensor. 
From 80e8f2596d12882468bb4b4a8cfaa51af5f49332 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 14:15:12 +0100 Subject: [PATCH 26/28] add tests for fromfile --- test/test_prototype_datasets_utils.py | 34 ++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/test/test_prototype_datasets_utils.py b/test/test_prototype_datasets_utils.py index 3c16e14d435..50cc5aceb6d 100644 --- a/test/test_prototype_datasets_utils.py +++ b/test/test_prototype_datasets_utils.py @@ -1,16 +1,44 @@ +import sys + +import numpy as np import pytest import torch from datasets_utils import make_fake_flo_file from torchvision.datasets._optical_flow import _read_flo as read_flo_ref -from torchvision.prototype.datasets.utils._internal import read_flo +from torchvision.prototype.datasets.utils._internal import read_flo, fromfile +@pytest.mark.filterwarnings("error:The given NumPy array is not writeable:UserWarning") @pytest.mark.parametrize("mode", ("rb", "r+b")) -def test_read_flo(tmpdir, mode): +@pytest.mark.parametrize( + ("np_dtype", "torch_dtype", "byte_order"), + [ + (">f4", torch.float32, "big"), + ("i8", torch.int64, "big"), + ("|u1", torch.uint8, sys.byteorder), + ], +) +def test_fromfile(tmpdir, np_dtype, torch_dtype, byte_order, mode): + path = tmpdir / "data.bin" + count = 5 + np.random.randn(count).astype(np_dtype).tofile(path) + + for count_ in (-1, count // 2): + expected = torch.from_numpy(np.fromfile(path, dtype=np_dtype, count=count_).astype(np_dtype[1:])) + + with open(path, mode) as file: + actual = fromfile(file, dtype=torch_dtype, byte_order=byte_order, count=count_) + + torch.testing.assert_close(actual, expected) + + +def test_read_flo(tmpdir): path = tmpdir / "test.flo" make_fake_flo_file(3, 4, path) - with open(path, mode) as file: + with open(path, "rb") as file: actual = read_flo(file) expected = torch.from_numpy(read_flo_ref(path).astype("f4", copy=False)) From 2bb491bb5d78ae815adc96f889ced5b5e26c8dc4 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 19 Nov 2021 09:41:03 +0100 Subject: [PATCH 27/28] cleanup --- torchvision/prototype/datasets/_builtin/mnist.py | 2 +- torchvision/prototype/datasets/utils/_internal.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/mnist.py b/torchvision/prototype/datasets/_builtin/mnist.py index 9ff84b447e6..c242207b7d7 100644 --- a/torchvision/prototype/datasets/_builtin/mnist.py +++ b/torchvision/prototype/datasets/_builtin/mnist.py @@ -32,8 +32,8 @@ ) from torchvision.prototype.features import Image, Label -__all__ = ["MNIST", "FashionMNIST", "KMNIST", "EMNIST", "QMNIST"] +__all__ = ["MNIST", "FashionMNIST", "KMNIST", "EMNIST", "QMNIST"] prod = functools.partial(functools.reduce, operator.mul) diff --git a/torchvision/prototype/datasets/utils/_internal.py b/torchvision/prototype/datasets/utils/_internal.py index 7285cf05aa5..3db10183f68 100644 --- a/torchvision/prototype/datasets/utils/_internal.py +++ b/torchvision/prototype/datasets/utils/_internal.py @@ -65,7 +65,6 @@ def read_mat(buffer: io.IOBase, **kwargs: Any) -> Any: except ImportError as error: raise ModuleNotFoundError("Package `scipy` is required to be installed to read .mat files.") from error - # TODO: This can be removed as soon as https://github.com/pytorch/pytorch/pull/67718 is merged if isinstance(buffer, StreamWrapper): buffer = buffer.file_obj From 0969cf9a555ce1872c09f540855c20e2ae9effb3 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 19 Nov 2021 11:06:59 +0100 
Subject: [PATCH 28/28] cleanup --- test/test_prototype_datasets_utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_prototype_datasets_utils.py b/test/test_prototype_datasets_utils.py index 50cc5aceb6d..7207299c9d4 100644 --- a/test/test_prototype_datasets_utils.py +++ b/test/test_prototype_datasets_utils.py @@ -9,7 +9,6 @@ @pytest.mark.filterwarnings("error:The given NumPy array is not writeable:UserWarning") -@pytest.mark.parametrize("mode", ("rb", "r+b")) @pytest.mark.parametrize( ("np_dtype", "torch_dtype", "byte_order"), [ @@ -20,10 +19,12 @@ ("|u1", torch.uint8, sys.byteorder), ], ) -def test_fromfile(tmpdir, np_dtype, torch_dtype, byte_order, mode): +@pytest.mark.parametrize("count", (-1, 2)) +@pytest.mark.parametrize("mode", ("rb", "r+b")) +def test_fromfile(tmpdir, np_dtype, torch_dtype, byte_order, count, mode): path = tmpdir / "data.bin" - count = 5 - np.random.randn(count).astype(np_dtype).tofile(path) + rng = np.random.RandomState(0) + rng.randn(5 if count == -1 else count + 1).astype(np_dtype).tofile(path) for count_ in (-1, count // 2): expected = torch.from_numpy(np.fromfile(path, dtype=np_dtype, count=count_).astype(np_dtype[1:]))
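For reference, the round trip exercised by the parametrized test can be reproduced outside the test harness. The snippet below is a minimal sketch that assumes only numpy and torch: it writes big-endian float32 values, decodes them with an explicit byte order, and checks that converting to the native order preserves the data. When reading from an open file, the `fromfile` helper added in this series (or `fromfile_sketch` above) would take the place of the `np.frombuffer` call.

import numpy as np
import torch

# Five big-endian float32 values, the same kind of fixture the test writes with .tofile().
data = np.random.RandomState(0).randn(5).astype(">f4")
raw = data.tobytes()

# Decode with the explicit ">f4" dtype, then drop the byte-order prefix via astype
# so torch only ever sees native-order, writable memory.
decoded = torch.from_numpy(np.frombuffer(bytearray(raw), dtype=">f4").astype("f4", copy=False))

expected = torch.from_numpy(data.astype("f4"))
torch.testing.assert_close(decoded, expected)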