[Data] Add performant way to read large tfrecord datasets #42277

Merged: 40 commits, Feb 29, 2024

Commits (the diff below shows changes from 6 of the 40 commits):
af7b677
feat: add performant way to read large tfrecord datasets
martinbomio Jan 9, 2024
f25fa2e
add tfx-bsl as a test dependency
martinbomio Jan 9, 2024
14ed874
address PR comments
martinbomio Jan 11, 2024
3b8bf91
properly enable/disable fast read on tests
martinbomio Jan 16, 2024
b303cac
resolve absolute path from relative
martinbomio Jan 24, 2024
ddfca72
add tensorflow-io for s3 fs impl
martinbomio Jan 24, 2024
c3beb21
try adding tfx-bsl, cython in data-test-requirements
scottjlee Jan 24, 2024
4ac5221
skip tfx-bsl install in data
scottjlee Jan 25, 2024
552c2e0
Apply suggestions from code review
martinbomio Jan 25, 2024
4a007c6
new tfx-bsl build
scottjlee Jan 25, 2024
3c20737
Merge branch 'martinbomio/fast-tfrecord-read' of https://github.com/m…
scottjlee Jan 25, 2024
bf061db
lint
scottjlee Jan 25, 2024
3d7bd33
fix missing build dependency
scottjlee Jan 25, 2024
b8519a1
add datatfxbsl build
scottjlee Jan 25, 2024
b6982a3
remove workers arg
scottjlee Jan 25, 2024
7934adc
worker config
scottjlee Jan 25, 2024
386be26
data target
scottjlee Jan 26, 2024
1d83dce
try pinning pandas<2
scottjlee Jan 26, 2024
2c49d51
pin pandas to pandas==1.5.3
scottjlee Jan 26, 2024
953af46
comment
scottjlee Jan 26, 2024
c373724
update tag
scottjlee Jan 26, 2024
e19c1fc
add tfxbsl dockerfile
scottjlee Jan 29, 2024
fb6b290
add crc32c
scottjlee Jan 30, 2024
7d7a08f
Merge branch 'master' into martinbomio/fast-tfrecord-read
scottjlee Jan 30, 2024
ca27c49
rewrite unwrap single value function to use pyarrow
martinbomio Feb 5, 2024
aac5ec6
Merge branch 'master' into martinbomio/fast-tfrecord-read
martinbomio Feb 6, 2024
7408c10
Merge branch 'martinbomio/fast-tfrecord-read' of https://github.com/m…
scottjlee Feb 7, 2024
030556c
cast large_list to list always on fast read
martinbomio Feb 13, 2024
681f753
move casting to datasource
martinbomio Feb 13, 2024
bca5bff
Merge branch 'master' into martinbomio/fast-tfrecord-read
martinbomio Feb 14, 2024
8cf1c2d
clean up docstrings
martinbomio Feb 20, 2024
9aa260b
rename fast_* variables to tfx_
martinbomio Feb 26, 2024
befc187
fix failing tests
martinbomio Feb 26, 2024
413c1f0
add flag in data context to disable using tfx read
martinbomio Feb 26, 2024
1e2c627
disable tfx_read by default
martinbomio Feb 27, 2024
9894178
add TFXReadOptions
martinbomio Feb 28, 2024
bf06a37
Merge branch 'master' into martinbomio/fast-tfrecord-read
martinbomio Feb 28, 2024
bf64415
fix build
martinbomio Feb 28, 2024
663c39c
Merge branch 'master' into martinbomio/fast-tfrecord-read
martinbomio Feb 28, 2024
550392e
Merge branch 'master' into martinbomio/fast-tfrecord-read
c21 Feb 29, 2024
5 changes: 5 additions & 0 deletions ci/docker/data.build.Dockerfile

@@ -28,6 +28,11 @@
sudo apt-get purge -y mongodb*
sudo apt-get install -y mongodb
sudo rm -rf /var/lib/mongodb/mongod.lock

# Dependency used for the read_tfrecords function.
# Given that we only use the ExamplesToRecordBatchDecoder,
# which is purely C++, we can install it with --no-dependencies.
pip install tfx-bsl==1.14.0 --no-dependencies
Collaborator:
This feels like a rather weird way to do it. Is this the intended way to use tfx-bsl? Are there other, more direct ways to import and use the logic in ExamplesToRecordBatchDecoder?

Contributor Author:
Other, more direct ways as in adding it like any other dependency, or are you thinking about something else?

I tried adding it as a direct dependency in the data test dependencies, but it conflicted with other existing dependencies.

We do not really need to bring in transitive dependencies, since the only thing we use is the ExamplesToRecordBatchDecoder, which is self-contained and doesn't need any extra dependency to work. Another approach we could take is to add this class to the Ray repo itself; I am not really familiar with that part of the Ray codebase, but we would have to add the C++ class and a Python binding.
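
For reference, a minimal sketch of using ExamplesToRecordBatchDecoder on its own, mirroring how this PR calls it internally. The sample records, helper name, and printed output are illustrative only, and exact decoder behavior may vary across tfx-bsl versions:

# Illustrative sketch, not code from this PR.
import pyarrow as pa
import tensorflow as tf
from tfx_bsl.cc.tfx_bsl_extension.coders import ExamplesToRecordBatchDecoder


def _make_example(label: int) -> bytes:
    # Build and serialize a single-feature tf.train.Example for this demo.
    return tf.train.Example(
        features=tf.train.Features(
            feature={
                "label": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label])
                )
            }
        )
    ).SerializeToString()


# Passing None for the schema (as this PR does when tf_schema is absent) makes
# the decoder infer one; columns then come back as list-valued arrays.
decoder = ExamplesToRecordBatchDecoder(None)
batch = decoder.DecodeBatch([_make_example(1), _make_example(0)])
table = pa.Table.from_batches([batch])
print(table.to_pydict())  # expected: {'label': [[1], [0]]}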

Collaborator:
> conflicted with other existing dependencies.

could you elaborate on the specifics?

cc @can-anyscale // example that we might need multiple constraint files.

I am worried that this is a hack and might not be sustainable. Is there any guarantee that this will still work when tfx-bsl publishes a new version in the future? Do we expect all users to install tfx-bsl with the --no-dependencies flag?

This is not a test-only dependency, but a data dependency. At the end of the day, for it to be useful, it needs to work with other dependencies in key workflows / workloads. Tests and CI are proxies for "Ray works for users".

It seems that this is only used as part of the internal implementation, not as part of the Ray Data interface? If that is the case, can we fork https://github.com/tensorflow/tfx-bsl, give it another package name, remove the parts we do not need, build it from source, and publish a tfx-bsl-ray package just for this?

Collaborator:
Or maybe we should try to resolve the dependency conflicts.

Contributor Author:
> Do we expect users to all install tfx-bsl with the --no-dependencies flag?

Yes, we do.

Collaborator:
> Do we expect users to all install tfx-bsl with the --no-dependencies flag?
>
> Yes, we do.

:) This is more a question for the Ray code owners. I am asking @scottjlee to help here.

Contributor:
I don't think we can always assume that users will (or force them to) install with the --no-dependencies flag, since they might be using the library's other features. That would also put the dependency resolution on the user, which we should try to avoid.


if [[ $RAY_CI_JAVA_BUILD == 1 ]]; then
# These packages increase the image size quite a bit, so we only install them
# as needed.
133 changes: 127 additions & 6 deletions python/ray/data/datasource/tfrecords_datasource.py

@@ -2,16 +2,26 @@
from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Union

import numpy as np
import pyarrow

from ray.data._internal.dataset_logger import DatasetLogger
from ray.data.aggregate import AggregateFn
from ray.data.block import Block
from ray.data.datasource.file_based_datasource import FileBasedDatasource
from ray.util.annotations import PublicAPI

if TYPE_CHECKING:
import pyarrow
import pandas as pd
import tensorflow as tf
from tensorflow_metadata.proto.v0 import schema_pb2

from ray.data.dataset import Dataset


DEFAULT_BATCH_SIZE = 2048

logger = DatasetLogger(__name__)


@PublicAPI(stability="alpha")
class TFRecordDatasource(FileBasedDatasource):
@@ -23,13 +33,23 @@ def __init__(
self,
paths: Union[str, List[str]],
tf_schema: Optional["schema_pb2.Schema"] = None,
fast_read: bool = False,
Contributor:
Can we change the name fast_read to read_with_tfx?
The name fast_read looks a bit vague to me.

batch_size: Optional[int] = None,
Contributor:
Let's also document these two parameters in the docstring.

**file_based_datasource_kwargs,
):
super().__init__(paths, **file_based_datasource_kwargs)

self.tf_schema = tf_schema
self._tf_schema = tf_schema
self._fast_read = fast_read
self._batch_size = batch_size or DEFAULT_BATCH_SIZE

def _read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]:
if self._fast_read:
yield from self._fast_read_stream(f, path)
else:
yield from self._slow_read_stream(f, path)

def _slow_read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]:
Contributor:
_slow_read_stream -> _default_read_stream

import pyarrow as pa
import tensorflow as tf
from google.protobuf.message import DecodeError
@@ -46,14 +66,64 @@ def _read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]:
)

yield pa.Table.from_pydict(
_convert_example_to_dict(example, self.tf_schema)
_convert_example_to_dict(example, self._tf_schema)
)

def _fast_read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]:
Contributor:
_fast_read_stream -> tfx_read_stream

import tensorflow as tf
from tfx_bsl.cc.tfx_bsl_extension.coders import ExamplesToRecordBatchDecoder

full_path = self._resolve_full_path(path)

compression = (self._open_stream_args or {}).get("compression", None)

if compression:
compression = compression.upper()

tf_schema_string = (
self._tf_schema.SerializeToString() if self._tf_schema else None
)

decoder = ExamplesToRecordBatchDecoder(tf_schema_string)
exception_thrown = None
try:
for record in tf.data.TFRecordDataset(
full_path, compression_type=compression
).batch(self._batch_size):
yield pyarrow.Table.from_batches([decoder.DecodeBatch(record.numpy())])
Review comment:
DecodeBatch will create the large_list data type, and creating a TensorFlow dataset via the dataset.to_tf method will then fail with the message: NotImplementedError: large_list
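
A later commit in this PR ("cast large_list to list always on fast read") addresses this. As an illustration only (not the PR's exact code), such a cast could look roughly like this with pyarrow:

import pyarrow as pa


def _cast_large_lists_to_lists(table: pa.Table) -> pa.Table:
    # Illustrative sketch: downcast large_list<T> columns to list<T> so that
    # consumers that don't support large_list (e.g. Dataset.to_tf) keep working.
    fields = [
        pa.field(f.name, pa.list_(f.type.value_type))
        if pa.types.is_large_list(f.type)
        else f
        for f in table.schema
    ]
    return table.cast(pa.schema(fields))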

except Exception as error:
logger.get_logger().exception(f"Failed to read TFRecord file {full_path}")
exception_thrown = error

# we need this hack where we raise an exception outside of the
# except block because tensorflow's DataLossError is unpicklable, and
# even if we raise a runtime error, Ray keeps information about the
# original error, which makes it unpicklable still.
if exception_thrown:
raise RuntimeError(f"Failed to read TFRecord file {full_path}.")

def _resolve_full_path(self, relative_path):
if isinstance(self._filesystem, pyarrow.fs.S3FileSystem):
return f"s3://{relative_path}"
if isinstance(self._filesystem, pyarrow.fs.GcsFileSystem):
return f"gs://{relative_path}"
if isinstance(self._filesystem, pyarrow.fs.HadoopFileSystem):
return f"hdfs:///{relative_path}"
if isinstance(self._filesystem, pyarrow.fs.PyFileSystem):
protocol = self._filesystem.handler.fs.protocol
if isinstance(protocol, list) or isinstance(protocol, tuple):
protocol = protocol[0]
if protocol == "gcs":
protocol = "gs"
return f"{protocol}://{relative_path}"

return relative_path


def _convert_example_to_dict(
example: "tf.train.Example",
tf_schema: Optional["schema_pb2.Schema"],
) -> Dict[str, "pyarrow.Array"]:
) -> Dict[str, pyarrow.Array]:
record = {}
schema_dict = {}
# Convert user-specified schema into dict for convenient mapping
Expand All @@ -73,7 +143,7 @@ def _convert_example_to_dict(


def _convert_arrow_table_to_examples(
arrow_table: "pyarrow.Table",
arrow_table: pyarrow.Table,
tf_schema: Optional["schema_pb2.Schema"] = None,
) -> Iterable["tf.train.Example"]:
import tensorflow as tf
@@ -118,7 +188,7 @@ def _get_single_true_type(dct) -> str:
def _get_feature_value(
feature: "tf.train.Feature",
schema_feature_type: Optional["schema_pb2.FeatureType"] = None,
) -> "pyarrow.Array":
) -> pyarrow.Array:
import pyarrow as pa

underlying_feature_type = {
@@ -361,6 +431,57 @@ def _read_records(
raise RuntimeError(error_message) from e


def _infer_schema_and_transform(dataset: "Dataset"):
list_sizes = dataset.aggregate(_MaxListSize(dataset.schema().names))
Contributor:
Can you add a comment on why this function is needed?
It seems that we'll read the datasource twice, once for the aggregate and once for map_batches. Won't that be less efficient?

Contributor Author:
@raulchen it will indeed mean an extra pass over the data. The reason it is needed is that the tfx-bsl ExamplesToRecordBatchDecoder always returns lists of lists when no schema is provided, and what this function does is infer which of those fields are single-value fields.

Performance-wise, our benchmarks of this implementation (we have had it running internally for a while) show more than a 15x improvement over the current implementation. Some of our datasets take ~30 minutes to load with the Ray-native implementation, compared to less than 2 minutes with this tfx-bsl implementation. Let me know if you need more benchmark numbers; happy to provide more.
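
As a small illustration of what that inference amounts to (column names and values below are made up): a column whose maximum list length is 1 gets its list wrapper dropped, while genuinely list-valued columns are left alone.

import numpy as np

# Illustrative only: mirrors what _unwrap_single_value_lists does in this PR.
batch = {
    "label": [np.array([1]), np.array([0]), np.array([1])],          # max list size 1
    "tokens": [np.array([3, 4]), np.array([5]), np.array([6, 7])],   # max list size 2
}
# _MaxListSize over the whole dataset would report {"label": 1, "tokens": 2},
# so only "label" is unwrapped.
col_lengths = {"label": 1, "tokens": 2}

for col, max_len in col_lengths.items():
    if max_len == 1:
        batch[col] = np.array(
            [x[0] if isinstance(x, np.ndarray) else x for x in batch[col]]
        )

print(batch["label"])   # -> [1 0 1]
print(batch["tokens"])  # unchanged list-valued column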

Contributor:
Thanks for the explanation. It's okay to proceed without more benchmarks for now.


return dataset.map_batches(
_unwrap_single_value_lists,
fn_kwargs={"col_lengths": list_sizes["max_list_size"]},
)


def _unwrap_single_value_lists(
batch: Dict[str, np.ndarray], col_lengths: Dict[str, int]
):
for col in col_lengths:
if col_lengths[col] == 1:
batch[col] = np.array(
[x[0] if isinstance(x, np.ndarray) else x for x in batch[col]]
)

return batch


class _MaxListSize(AggregateFn):
def __init__(self, columns: List[str]):
self._columns = columns
super().__init__(
init=self._init,
merge=self._merge,
accumulate_row=self._accumulate_row,
finalize=lambda a: a,
name="max_list_size",
)

def _init(self, k: str):
return {col: 0 for col in self._columns}

def _merge(self, acc1: Dict[str, int], acc2: Dict[str, int]):
merged = {}
for col in self._columns:
merged[col] = max(acc1[col], acc2[col])

return merged

def _accumulate_row(self, acc: Dict[str, int], row: "pd.Series"):
for k in row:
value = row[k]
if value:
acc[k] = max(len(value), acc[k])

return acc


# Adapted from https://github.com/vahidk/tfrecord/blob/74b2d24a838081356d993ec0e147eaf59ccd4c84/tfrecord/writer.py#L57-L72 # noqa: E501
#
# MIT License
45 changes: 44 additions & 1 deletion python/ray/data/read_api.py

@@ -1476,6 +1476,8 @@ def read_tfrecords(
tf_schema: Optional["schema_pb2.Schema"] = None,
shuffle: Union[Literal["files"], None] = None,
file_extensions: Optional[List[str]] = None,
fast_read_batch_size: Optional[int] = None,
Contributor:
fast_read_batch_size -> tfx_read_batch_size

fast_read_auto_infer_schema: bool = True,
Contributor:
fast_read_auto_infer_schema -> tfx_read_auto_infer_schema

) -> Dataset:
"""Create a :class:`~ray.data.Dataset` from TFRecord files that contain
`tf.train.Example <https://www.tensorflow.org/api_docs/python/tf/train/Example>`_
@@ -1550,13 +1552,45 @@
shuffle: If setting to "files", randomly shuffle input files order before read.
Defaults to not shuffle with ``None``.
file_extensions: A list of file extensions to filter files by.
batch_size: An int representing the number of consecutive elements of this
Contributor:
Suggested change:
- batch_size: An int representing the number of consecutive elements of this
+ fast_read_batch_size: An int representing the number of consecutive elements of this

dataset to combine in a single batch when fast_read is used.
fast_read_auto_infer_schema: Toggles the schema inference applied; applicable
only if fast_read is used and tf_schema argument is missing.
Defaults to True.

Returns:
A :class:`~ray.data.Dataset` that contains the example features.

Raises:
ValueError: If a file contains a message that isn't a ``tf.train.Example``.
"""
import platform

fast_read = False
Contributor:
fast_read -> read_with_tfx


try:
from tfx_bsl.cc.tfx_bsl_extension.coders import ( # noqa: F401
ExamplesToRecordBatchDecoder,
)

fast_read = True
except ModuleNotFoundError:
if platform.processor() == "arm":
logger.warning(
"This function depends on tfx-bsl which is currently not supported"
" on devices with Apple silicon (e.g. M1) and requires an"
" environment with x86 CPU architecture."
)
Contributor:
When the user has specified tfx_read_options but tfx_bsl isn't installed, it'd be better to just throw an exception.
And I guess it's okay not to check the platform here, as tfx_bsl may support ARM in the future.
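
A minimal sketch of the fail-fast behavior being suggested here (tfx_read_options is the options object introduced in a later revision of this PR; the helper name and error message are illustrative):

# Illustrative sketch of the suggestion, not code from this PR.
def _require_tfx_bsl(tfx_read_options) -> None:
    if tfx_read_options is None:
        return
    try:
        from tfx_bsl.cc.tfx_bsl_extension.coders import (  # noqa: F401
            ExamplesToRecordBatchDecoder,
        )
    except ModuleNotFoundError as e:
        raise ModuleNotFoundError(
            "tfx_read_options was specified, but tfx-bsl is not installed. "
            "Install it with `pip install tfx_bsl --no-dependencies`."
        ) from e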

Contributor Author:
It does not support it right now; I find it a lot safer not to assume it will.

Contributor Author:
> when user has specified tfx_read_options but tfx_bsl isn't installed, it'd be better to just throw an exception.

So you are suggesting I just remove the whole block that tries the import and logs a warning, which would also remove the fallback?

else:
logger.warning(
"To use TFRecordDatasource with large datasets, please install"
" tfx-bsl package with pip install tfx_bsl --no-dependencies`."
)
logger.info(
"Falling back to slower strategy for reading tf.records. This"
martinbomio marked this conversation as resolved.
Show resolved Hide resolved
"reading strategy should be avoided when reading large datasets."
)

if meta_provider is None:
meta_provider = get_generic_metadata_provider(
TFRecordDatasource._FILE_EXTENSIONS
Expand All @@ -1573,8 +1607,17 @@ def read_tfrecords(
shuffle=shuffle,
include_paths=include_paths,
file_extensions=file_extensions,
fast_read=fast_read,
batch_size=fast_read_batch_size,
)
return read_datasource(datasource, parallelism=parallelism)
ds = read_datasource(datasource, parallelism=parallelism)

if fast_read_auto_infer_schema and fast_read and not tf_schema:
from ray.data.datasource.tfrecords_datasource import _infer_schema_and_transform

return _infer_schema_and_transform(ds)

return ds


@PublicAPI(stability="alpha")
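
For completeness, a usage sketch against this revision of the API (the path is a placeholder, and these parameter names — fast_read_batch_size, fast_read_auto_infer_schema — were renamed to tfx_*-style options in later commits of this PR):

import ray

# With tfx-bsl installed (pip install tfx_bsl --no-dependencies), read_tfrecords
# takes the faster tfx-bsl decoding path; otherwise it logs a warning and falls
# back to the existing pure-Python reader.
ds = ray.data.read_tfrecords(
    "s3://example-bucket/path/to/tfrecords/",  # placeholder path
    fast_read_batch_size=2048,         # records decoded per batch on the fast path
    fast_read_auto_infer_schema=True,  # unwrap single-value list columns when no tf_schema is given
)
print(ds.schema())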