ray-project · c21 · Feb 29, 2024 · Jan 9, 2024 · Jan 9, 2024 · Jan 11, 2024
diff --git a/.buildkite/data.rayci.yml b/.buildkite/data.rayci.yml
@@ -24,6 +24,9 @@ steps:
   - name: datamongobuild
     wanda: ci/docker/datamongo.build.wanda.yaml
 
+  # Image consumed by the "TFRecords (tfx-bsl) tests" step via --build-name.
+  - name: datatfxbslbuild
+    wanda: ci/docker/datatfxbsl.build.wanda.yaml
+
   # tests
   - label: ":database: data: arrow 6 tests"
     tags: 
@@ -88,6 +91,19 @@ steps:
         --test-env RAY_DATA_USE_STREAMING_EXECUTOR=1
         --except-tags data_integration,doctest
     depends_on: datanbuild
+
+  # Runs only the tests tagged `tfxbsl`, inside the datatfxbslbuild image.
+  - label: ":database: data: TFRecords (tfx-bsl) tests"
+    tags: 
+      - python
+      - data
+    instance_type: medium
+    commands:
+      - bazel run //ci/ray_ci:test_in_docker -- //python/ray/data/... data
+        --parallelism-per-worker 3
+        --build-name datatfxbslbuild
+        --test-env RAY_DATA_USE_STREAMING_EXECUTOR=1
+        --only-tags tfxbsl
+    depends_on: datatfxbslbuild
 
   - label: ":database: data: doc tests"
     tags: 

@@ -0,0 +1,15 @@
+# Image for the Ray Data TFRecords tests that exercise the tfx-bsl read path.
+name: "datatfxbslbuild"
+froms: ["cr.ray.io/rayproject/oss-ci-base_ml"]
+dockerfile: ci/docker/data.build.Dockerfile
+srcs:
+  - ci/env/install-dependencies.sh
+  - python/requirements.txt
+  - python/requirements_compiled.txt
+  - python/requirements/test-requirements.txt
+  - python/requirements/ml/dl-cpu-requirements.txt
+  - python/requirements/ml/data-requirements.txt
+  - python/requirements/ml/data-test-tfrecords-requirements.txt
+build_args:
+  - ARROW_VERSION=14.*
+tags:
+  # The tag follows the image name; reusing the Arrow 14 image's tag
+  # (data14build) would make the two builds overwrite each other.
+  - cr.ray.io/rayproject/datatfxbslbuild
@@ -309,7 +309,7 @@ py_test(
     name = "test_tfrecords",
     size = "small",
     srcs = ["tests/test_tfrecords.py"],
-    tags = ["team:data", "exclusive"],
+    tags = ["team:data", "exclusive", "tfxbsl"],
     deps = ["//:ray_lib", ":conftest"],
 )
 

@@ -2,16 +2,26 @@
 from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Union
 
 import numpy as np
+import pyarrow
 
+from ray.data._internal.dataset_logger import DatasetLogger
+from ray.data.aggregate import AggregateFn
 from ray.data.block import Block
 from ray.data.datasource.file_based_datasource import FileBasedDatasource
 from ray.util.annotations import PublicAPI
 
 if TYPE_CHECKING:
-    import pyarrow
+    import pandas as pd
     import tensorflow as tf
     from tensorflow_metadata.proto.v0 import schema_pb2
 
+    from ray.data.dataset import Dataset
+
+
+DEFAULT_BATCH_SIZE = 2048
+
+logger = DatasetLogger(__name__)
+
 
 @PublicAPI(stability="alpha")
 class TFRecordDatasource(FileBasedDatasource):
@@ -23,13 +33,23 @@ def __init__(
         self,
         paths: Union[str, List[str]],
         tf_schema: Optional["schema_pb2.Schema"] = None,
+        fast_read: bool = False,
+        batch_size: Optional[int] = None,
         **file_based_datasource_kwargs,
     ):
         super().__init__(paths, **file_based_datasource_kwargs)
 
-        self.tf_schema = tf_schema
+        self._tf_schema = tf_schema
+        self._fast_read = fast_read
+        self._batch_size = batch_size or DEFAULT_BATCH_SIZE
 
     def _read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]:
+        """Yield blocks from one file using the reader selected by ``fast_read``."""
+        if self._fast_read:
+            reader = self._fast_read_stream
+        else:
+            reader = self._slow_read_stream
+        yield from reader(f, path)
+
+    def _slow_read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]:
         import pyarrow as pa
         import tensorflow as tf
         from google.protobuf.message import DecodeError
@@ -46,14 +66,64 @@ def _read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]:
                 )
 
             yield pa.Table.from_pydict(
-                _convert_example_to_dict(example, self.tf_schema)
+                _convert_example_to_dict(example, self._tf_schema)
             )
 
+    def _fast_read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]:
+        """Read a TFRecord file in batches, decoding each batch with tfx-bsl.
+
+        The file is opened by ``tf.data.TFRecordDataset`` from the resolved
+        ``path``; the already-open handle ``f`` is not read by this method.
+
+        Args:
+            f: Open file handle supplied by the caller (unused here).
+            path: Path of the file, resolved to a full URI before reading.
+
+        Yields:
+            One ``pyarrow.Table`` per batch of up to ``self._batch_size`` records.
+
+        Raises:
+            RuntimeError: If reading or decoding the file fails.
+        """
+        import tensorflow as tf
+        from tfx_bsl.cc.tfx_bsl_extension.coders import ExamplesToRecordBatchDecoder
+
+        full_path = self._resolve_full_path(path)
+
+        compression = (self._open_stream_args or {}).get("compression", None)
+
+        # NOTE(review): presumably upper-cased because tf.data expects names
+        # such as "GZIP" for ``compression_type`` -- confirm.
+        if compression:
+            compression = compression.upper()
+
+        # The decoder receives the serialized schema, or None if none was given.
+        tf_schema_string = (
+            self._tf_schema.SerializeToString() if self._tf_schema else None
+        )
+
+        decoder = ExamplesToRecordBatchDecoder(tf_schema_string)
+        exception_thrown = None
+        try:
+            for record in tf.data.TFRecordDataset(
+                full_path, compression_type=compression
+            ).batch(self._batch_size):
+                yield pyarrow.Table.from_batches([decoder.DecodeBatch(record.numpy())])
+        except Exception as error:
+            logger.get_logger().exception(f"Failed to read TFRecord file {full_path}")
+            exception_thrown = error
+
+        # We need this hack, where we raise the exception outside of the
+        # except block, because TensorFlow's DataLossError is unpicklable, and
+        # even if we raise a RuntimeError inside the block, Ray keeps
+        # information about the original error, which makes it unpicklable
+        # still.
+        if exception_thrown:
+            raise RuntimeError(f"Failed to read TFRecord file {full_path}.")
+
+    def _resolve_full_path(self, relative_path):
+        """Return ``relative_path`` prefixed with the scheme of ``self._filesystem``.
+
+        S3, GCS and HDFS filesystems map to ``s3://``, ``gs://`` and
+        ``hdfs:///``. A ``PyFileSystem`` uses the protocol of the filesystem
+        it wraps (``gcs`` is rewritten to ``gs``). Any other filesystem leaves
+        the path unchanged.
+
+        Args:
+            relative_path: Path without a URI scheme.
+
+        Returns:
+            The path, with a scheme prefix when the filesystem is recognized.
+        """
+        # A bare ``import pyarrow`` does not guarantee the ``fs`` submodule is
+        # loaded, so import it explicitly.
+        import pyarrow.fs as pafs
+
+        def _filesystem_is(class_name):
+            # Some filesystem classes are absent from a given pyarrow
+            # version/build, and looking them up raises. The filesystem in
+            # use cannot be an instance of a class that does not exist.
+            try:
+                fs_class = getattr(pafs, class_name)
+            except (ImportError, AttributeError):
+                return False
+            return isinstance(self._filesystem, fs_class)
+
+        if _filesystem_is("S3FileSystem"):
+            return f"s3://{relative_path}"
+        if _filesystem_is("GcsFileSystem"):
+            return f"gs://{relative_path}"
+        if _filesystem_is("HadoopFileSystem"):
+            return f"hdfs:///{relative_path}"
+        if _filesystem_is("PyFileSystem"):
+            protocol = self._filesystem.handler.fs.protocol
+            # The protocol may be a list/tuple of aliases; use the first one.
+            if isinstance(protocol, (list, tuple)):
+                protocol = protocol[0]
+            if protocol == "gcs":
+                protocol = "gs"
+            return f"{protocol}://{relative_path}"
+
+        return relative_path
+
 
 def _convert_example_to_dict(
     example: "tf.train.Example",
     tf_schema: Optional["schema_pb2.Schema"],
-) -> Dict[str, "pyarrow.Array"]:
+) -> Dict[str, pyarrow.Array]:
     record = {}
     schema_dict = {}
     # Convert user-specified schema into dict for convenient mapping
@@ -73,7 +143,7 @@ def _convert_example_to_dict(
 
 
 def _convert_arrow_table_to_examples(
-    arrow_table: "pyarrow.Table",
+    arrow_table: pyarrow.Table,
     tf_schema: Optional["schema_pb2.Schema"] = None,
 ) -> Iterable["tf.train.Example"]:
     import tensorflow as tf
@@ -118,7 +188,7 @@ def _get_single_true_type(dct) -> str:
 def _get_feature_value(
     feature: "tf.train.Feature",
     schema_feature_type: Optional["schema_pb2.FeatureType"] = None,
-) -> "pyarrow.Array":
+) -> pyarrow.Array:
     import pyarrow as pa
 
     underlying_feature_type = {
@@ -361,6 +431,57 @@ def _read_records(
             raise RuntimeError(error_message) from e
 
 
+def _infer_schema_and_transform(dataset: "Dataset"):
+    """Unwrap list columns whose longest value holds a single element.
+
+    Aggregates the maximum list size of every column in the dataset schema,
+    then maps ``_unwrap_single_value_lists`` over the batches with those
+    sizes.
+    """
+    column_names = dataset.schema().names
+    max_sizes = dataset.aggregate(_MaxListSize(column_names))["max_list_size"]
+
+    return dataset.map_batches(
+        _unwrap_single_value_lists, fn_kwargs={"col_lengths": max_sizes}
+    )
+
+
+def _unwrap_single_value_lists(
+    batch: Dict[str, np.ndarray], col_lengths: Dict[str, int]
+):
+    """Replace single-element array values with their only element.
+
+    Only columns whose entry in ``col_lengths`` equals 1 are rewritten. The
+    batch is modified in place and also returned.
+
+    Args:
+        batch: Mapping of column name to an array of per-row values.
+        col_lengths: Mapping of column name to the maximum list size observed
+            for that column.
+
+    Returns:
+        The same ``batch`` mapping, with the qualifying columns unwrapped.
+    """
+
+    def _first_or_none(value):
+        # Values that are not arrays (e.g. None for a missing feature) are
+        # kept unchanged.
+        if not isinstance(value, np.ndarray):
+            return value
+        # A row may hold no value even when the column maximum is 1;
+        # indexing an empty array would raise IndexError.
+        return value[0] if value.size else None
+
+    for col, max_length in col_lengths.items():
+        if max_length == 1:
+            batch[col] = np.array([_first_or_none(x) for x in batch[col]])
+
+    return batch
+
+
+class _MaxListSize(AggregateFn):
+    """Aggregation that records, per column, the largest ``len()`` of any value.
+
+    The accumulator is a dict mapping each name in ``columns`` to the maximum
+    size seen so far (0 when no sized value was seen). The result is reported
+    under the name ``"max_list_size"``.
+    """
+
+    def __init__(self, columns: List[str]):
+        self._columns = columns
+        super().__init__(
+            init=self._init,
+            merge=self._merge,
+            accumulate_row=self._accumulate_row,
+            finalize=lambda a: a,
+            name="max_list_size",
+        )
+
+    def _init(self, k: str):
+        """Return a fresh accumulator with every column at size 0."""
+        return {col: 0 for col in self._columns}
+
+    def _merge(self, acc1: Dict[str, int], acc2: Dict[str, int]):
+        """Combine two accumulators by taking the per-column maximum."""
+        return {col: max(acc1[col], acc2[col]) for col in self._columns}
+
+    def _accumulate_row(self, acc: Dict[str, int], row: "pd.Series"):
+        """Fold the sizes of one row's values into ``acc`` and return it."""
+        # NOTE(review): ``row`` is iterated as a mapping of column name to
+        # value; a real pd.Series iterates values -- confirm the row type.
+        for k in row:
+            value = row[k]
+            # Probe with len() instead of truthiness: ``if value`` raises for
+            # a multi-element ndarray. None and other unsized values are
+            # skipped.
+            try:
+                size = len(value)
+            except TypeError:
+                continue
+            acc[k] = max(size, acc[k])
+
+        return acc
+
+
 # Adapted from https://github.com/vahidk/tfrecord/blob/74b2d24a838081356d993ec0e147eaf59ccd4c84/tfrecord/writer.py#L57-L72  # noqa: E501
 #
 # MIT License

@@ -1476,6 +1476,8 @@ def read_tfrecords(
     tf_schema: Optional["schema_pb2.Schema"] = None,
     shuffle: Union[Literal["files"], None] = None,
     file_extensions: Optional[List[str]] = None,
+    fast_read_batch_size: Optional[int] = None,
+    fast_read_auto_infer_schema: bool = True,
 ) -> Dataset:
     """Create a :class:`~ray.data.Dataset` from TFRecord files that contain
     `tf.train.Example <https://www.tensorflow.org/api_docs/python/tf/train/Example>`_
@@ -1550,13 +1552,45 @@ def read_tfrecords(
         shuffle: If setting to "files", randomly shuffle input files order before read.
             Defaults to not shuffle with ``None``.
         file_extensions: A list of file extensions to filter files by.
+        fast_read_batch_size: An int representing the number of consecutive
+            elements of this dataset to combine in a single batch when fast_read
+            is used.
+        fast_read_auto_infer_schema: Toggles the schema inference applied; applicable
+            only if fast_read is used and tf_schema argument is missing.
+            Defaults to True.
 
     Returns:
         A :class:`~ray.data.Dataset` that contains the example features.
 
     Raises:
         ValueError: If a file contains a message that isn't a ``tf.train.Example``.
     """
+    import platform
+
+    fast_read = False
+
+    try:
+        from tfx_bsl.cc.tfx_bsl_extension.coders import (  # noqa: F401
+            ExamplesToRecordBatchDecoder,
+        )
+
+        fast_read = True
+    except ModuleNotFoundError:
+        if platform.processor() == "arm":
+            logger.warning(
+                "The fast strategy of this function depends on tfx-bsl, which is "
+                "currently not supported on devices with Apple silicon "
+                "(e.g. M1) and requires an environment with x86 CPU architecture."
+            )
+        else:
+            logger.warning(
+                "To use TFRecordDatasource with large datasets, please install"
+                " tfx-bsl package with pip install tfx_bsl --no-dependencies`."
+            )
+        logger.info(
+            "Falling back to slower strategy for reading tf.records. This "
+            "reading strategy should be avoided when reading large datasets."
+        )
+
     if meta_provider is None:
         meta_provider = get_generic_metadata_provider(
             TFRecordDatasource._FILE_EXTENSIONS
@@ -1573,8 +1607,17 @@ def read_tfrecords(
         shuffle=shuffle,
         include_paths=include_paths,
         file_extensions=file_extensions,
+        fast_read=fast_read,
+        batch_size=fast_read_batch_size,
     )
-    return read_datasource(datasource, parallelism=parallelism)
+    ds = read_datasource(datasource, parallelism=parallelism)
+
+    if fast_read_auto_infer_schema and fast_read and not tf_schema:
+        from ray.data.datasource.tfrecords_datasource import _infer_schema_and_transform
+
+        return _infer_schema_and_transform(ds)
+
+    return ds
 
 
 @PublicAPI(stability="alpha")