From b0357fd44090956d79335e9f05a447d60d650603 Mon Sep 17 00:00:00 2001
From: Cheng Su <scnju13@gmail.com>
Date: Fri, 6 Jan 2023 12:54:54 -0800
Subject: [PATCH] [Datasets] Enable lazy execution by default (#31286)

This PR is to enable lazy execution by default. See https://github.com/ray-project/enhancements/pull/19 for motivation. The change includes:
* Change `Dataset` constructor: `Dataset.__init__(lazy: bool = True)`. Also remove `defer_execution` field, as it's no longer needed.
* `read_api.py:read_datasource()` returns a lazy `Dataset` with computing the first input block.
* Add `ds.fully_executed()` calls to required unit tests, to make sure they are passing.

TODO:
- [x] Fix all unit tests
- [x] https://github.com/ray-project/ray/pull/31459
- [x] https://github.com/ray-project/ray/pull/31460
- [ ] Remove the behavior to eagerly compute first block for read
- [ ] https://github.com/ray-project/ray/issues/31417
- [ ] Update documentation
---
 .../datasets_train/datasets_train.py          |  2 +-
 python/ray/data/_internal/plan.py             |  5 +++
 python/ray/data/dataset.py                    | 30 ++++++++---------
 python/ray/data/read_api.py                   |  6 ++--
 python/ray/data/tests/test_dataset.py         | 32 +++++++++++++------
 python/ray/data/tests/test_dataset_parquet.py |  5 ++-
 .../data/tests/test_dynamic_block_split.py    |  4 +++
 python/ray/data/tests/test_optimize.py        | 15 ++-------
 python/ray/data/tests/test_stats.py           |  6 ++--
 python/ray/train/batch_predictor.py           |  6 +++-
 10 files changed, 64 insertions(+), 47 deletions(-)
diff --git a/doc/source/ray-core/_examples/datasets_train/datasets_train.py b/doc/source/ray-core/_examples/datasets_train/datasets_train.py
index 111e4d81a8dd9..990e77c09527c 100644
--- a/doc/source/ray-core/_examples/datasets_train/datasets_train.py
+++ b/doc/source/ray-core/_examples/datasets_train/datasets_train.py
@@ -580,7 +580,7 @@ def train_func(config):
         read_dataset(data_path)
     )
 
-    num_columns = len(train_dataset.schema().names)
+    num_columns = len(train_dataset.schema(fetch_if_missing=True).names)
     # remove label column.
     num_features = num_columns - 1
 
diff --git a/python/ray/data/_internal/plan.py b/python/ray/data/_internal/plan.py
index 7d12acccfb5ca..76a744db78a58 100644
--- a/python/ray/data/_internal/plan.py
+++ b/python/ray/data/_internal/plan.py
@@ -251,6 +251,11 @@ def schema(
                 self.execute()
             else:
                 return None
+        elif self._in_blocks is not None and self._snapshot_blocks is None:
+            # If the plan only has input blocks, we execute it, so snapshot has output.
+            # This applies to newly created dataset. For example, initial dataset from
+            # read, and output datasets of Dataset.split().
+            self.execute()
         # Snapshot is now guaranteed to be the output of the final stage or None.
         blocks = self._snapshot_blocks
         if not blocks:
diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py
index 9798c51ee51e1..e39fea4fe2041 100644
--- a/python/ray/data/dataset.py
+++ b/python/ray/data/dataset.py
@@ -177,19 +177,19 @@ class Dataset(Generic[T]):
         >>> ds = ray.data.range(1000)
         >>> # Transform in parallel with map_batches().
         >>> ds.map_batches(lambda batch: [v * 2 for v in batch])
-        Dataset(num_blocks=..., num_rows=1000, schema=<class 'int'>)
+        Dataset(num_blocks=..., num_rows=..., schema=...)
         >>> # Compute max.
         >>> ds.max()
         999
         >>> # Group the data.
         >>> ds.groupby(lambda x: x % 3).count()
-        Dataset(num_blocks=..., num_rows=3, schema=<class 'tuple'>)
+        Dataset(num_blocks=..., num_rows=..., schema=...)
         >>> # Shuffle this dataset randomly.
         >>> ds.random_shuffle()
-        Dataset(num_blocks=..., num_rows=1000, schema=<class 'int'>)
+        Dataset(num_blocks=..., num_rows=..., schema=...)
         >>> # Sort it back in order.
         >>> ds.sort()
-        Dataset(num_blocks=..., num_rows=1000, schema=<class 'int'>)
+        Dataset(num_blocks=..., num_rows=..., schema=...)
 
     Since Datasets are just lists of Ray object refs, they can be passed
     between Ray tasks and actors without incurring a copy. Datasets support
@@ -202,9 +202,7 @@ def __init__(
         self,
         plan: ExecutionPlan,
         epoch: int,
-        lazy: bool,
-        *,
-        defer_execution: bool = False,
+        lazy: bool = True,
     ):
         """Construct a Dataset (internal API).
 
@@ -219,7 +217,7 @@ def __init__(
         self._epoch = epoch
         self._lazy = lazy
 
-        if not lazy and not defer_execution:
+        if not lazy:
             self._plan.execute(allow_clear_input_blocks=False)
 
     @staticmethod
@@ -243,7 +241,7 @@ def map(
             >>> # Transform python objects.
             >>> ds = ray.data.range(1000)
             >>> ds.map(lambda x: x * 2)
-            Dataset(num_blocks=..., num_rows=1000, schema=<class 'int'>)
+            Dataset(num_blocks=..., num_rows=..., schema=...)
             >>> # Transform Arrow records.
             >>> ds = ray.data.from_items(
             ...     [{"value": i} for i in range(1000)])
@@ -804,7 +802,7 @@ def flat_map(
             >>> import ray
             >>> ds = ray.data.range(1000)
             >>> ds.flat_map(lambda x: [x, x ** 2, x ** 3])
-            Dataset(num_blocks=..., num_rows=3000, schema=<class 'int'>)
+            Dataset(num_blocks=..., num_rows=..., schema=...)
 
         Time complexity: O(dataset size / parallelism)
 
@@ -872,7 +870,7 @@ def filter(
             >>> import ray
             >>> ds = ray.data.range(100)
             >>> ds.filter(lambda x: x % 2 == 0)
-            Dataset(num_blocks=..., num_rows=50, schema=<class 'int'>)
+            Dataset(num_blocks=..., num_rows=..., schema=...)
 
         Time complexity: O(dataset size / parallelism)
 
@@ -966,10 +964,10 @@ def random_shuffle(
             >>> ds = ray.data.range(100)
             >>> # Shuffle this dataset randomly.
             >>> ds.random_shuffle()
-            Dataset(num_blocks=..., num_rows=100, schema=<class 'int'>)
+            Dataset(num_blocks=..., num_rows=..., schema=...)
             >>> # Shuffle this dataset with a fixed random seed.
             >>> ds.random_shuffle(seed=12345)
-            Dataset(num_blocks=..., num_rows=100, schema=<class 'int'>)
+            Dataset(num_blocks=..., num_rows=..., schema=...)
 
         Time complexity: O(dataset size / parallelism)
 
@@ -1012,7 +1010,7 @@ def randomize_block_order(
         """
 
         plan = self._plan.with_stage(RandomizeBlocksStage(seed))
-        return Dataset(plan, self._epoch, self._lazy, defer_execution=True)
+        return Dataset(plan, self._epoch, self._lazy)
 
     def random_sample(
         self, fraction: float, *, seed: Optional[int] = None
@@ -1533,7 +1531,7 @@ def groupby(self, key: Optional[KeyFn]) -> "GroupedDataset[T]":
             >>> import ray
             >>> # Group by a key function and aggregate.
             >>> ray.data.range(100).groupby(lambda x: x % 3).count()
-            Dataset(num_blocks=..., num_rows=3, schema=<class 'tuple'>)
+            Dataset(num_blocks=..., num_rows=..., schema=...)
             >>> # Group by an Arrow table column and aggregate.
             >>> ray.data.from_items([
             ...     {"A": x % 3, "B": x} for x in range(100)]).groupby(
@@ -1933,7 +1931,7 @@ def sort(
             >>> # Sort using the entire record as the key.
             >>> ds = ray.data.range(100)
             >>> ds.sort()
-            Dataset(num_blocks=..., num_rows=100, schema=<class 'int'>)
+            Dataset(num_blocks=..., num_rows=..., schema=...)
             >>> # Sort by a single column in descending order.
             >>> ds = ray.data.from_items(
             ...     [{"value": i} for i in range(1000)])
diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py
index 65a75914b2bde..1bcf7264a58af 100644
--- a/python/ray/data/read_api.py
+++ b/python/ray/data/read_api.py
@@ -339,9 +339,9 @@ def read_datasource(
     block_list.ensure_metadata_for_first_block()
 
     return Dataset(
-        ExecutionPlan(block_list, block_list.stats(), run_by_consumer=False),
-        0,
-        False,
+        plan=ExecutionPlan(block_list, block_list.stats(), run_by_consumer=False),
+        epoch=0,
+        lazy=True,
     )
 
 
diff --git a/python/ray/data/tests/test_dataset.py b/python/ray/data/tests/test_dataset.py
index 3aac96fdaf2f4..d9b991b5f8a4e 100644
--- a/python/ray/data/tests/test_dataset.py
+++ b/python/ray/data/tests/test_dataset.py
@@ -337,10 +337,10 @@ def test_zip(ray_start_regular_shared):
     ds1 = ray.data.range(5, parallelism=5)
     ds2 = ray.data.range(5, parallelism=5).map(lambda x: x + 1)
     ds = ds1.zip(ds2)
-    assert ds.schema() == tuple
+    assert ds.schema(fetch_if_missing=True) == tuple
     assert ds.take() == [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)]
     with pytest.raises(ValueError):
-        ds.zip(ray.data.range(3))
+        ds.zip(ray.data.range(3)).fully_executed()
 
 
 def test_zip_pandas(ray_start_regular_shared):
@@ -366,8 +366,8 @@ def test_zip_arrow(ray_start_regular_shared):
         lambda r: {"a": r["value"] + 1, "b": r["value"] + 2}
     )
     ds = ds1.zip(ds2)
-    assert "{id: int64, a: int64, b: int64}" in str(ds)
     assert ds.count() == 5
+    assert "{id: int64, a: int64, b: int64}" in str(ds)
     result = [r.as_pydict() for r in ds.take()]
     assert result[0] == {"id": 0, "a": 1, "b": 2}
 
@@ -749,6 +749,7 @@ def test_tensors_sort(ray_start_regular_shared):
 def test_tensors_inferred_from_map(ray_start_regular_shared):
     # Test map.
     ds = ray.data.range(10, parallelism=10).map(lambda _: np.ones((4, 4)))
+    ds.fully_executed()
     assert str(ds) == (
         "Dataset(num_blocks=10, num_rows=10, "
         "schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})"
@@ -758,6 +759,7 @@ def test_tensors_inferred_from_map(ray_start_regular_shared):
     ds = ray.data.range(16, parallelism=4).map_batches(
         lambda _: np.ones((3, 4, 4)), batch_size=2
     )
+    ds.fully_executed()
     assert str(ds) == (
         "Dataset(num_blocks=4, num_rows=24, "
         "schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})"
@@ -767,6 +769,7 @@ def test_tensors_inferred_from_map(ray_start_regular_shared):
     ds = ray.data.range(10, parallelism=10).flat_map(
         lambda _: [np.ones((4, 4)), np.ones((4, 4))]
     )
+    ds.fully_executed()
     assert str(ds) == (
         "Dataset(num_blocks=10, num_rows=20, "
         "schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})"
@@ -776,6 +779,7 @@ def test_tensors_inferred_from_map(ray_start_regular_shared):
     ds = ray.data.range(16, parallelism=4).map_batches(
         lambda _: pd.DataFrame({"a": [np.ones((4, 4))] * 3}), batch_size=2
     )
+    ds.fully_executed()
     assert str(ds) == (
         "Dataset(num_blocks=4, num_rows=24, "
         "schema={a: TensorDtype(shape=(4, 4), dtype=float64)})"
@@ -785,6 +789,7 @@ def test_tensors_inferred_from_map(ray_start_regular_shared):
         lambda _: pd.DataFrame({"a": [np.ones((2, 2)), np.ones((3, 3))]}),
         batch_size=2,
     )
+    ds.fully_executed()
     assert str(ds) == (
         "Dataset(num_blocks=4, num_rows=16, "
         "schema={a: TensorDtype(shape=(None, None), dtype=float64)})"
@@ -1456,16 +1461,19 @@ def test_empty_dataset(ray_start_regular_shared):
 
     ds = ray.data.range(1)
     ds = ds.filter(lambda x: x > 1)
+    ds.fully_executed()
     assert str(ds) == "Dataset(num_blocks=1, num_rows=0, schema=Unknown schema)"
 
     # Test map on empty dataset.
     ds = ray.data.from_items([])
     ds = ds.map(lambda x: x)
+    ds.fully_executed()
     assert ds.count() == 0
 
     # Test filter on empty dataset.
     ds = ray.data.from_items([])
     ds = ds.filter(lambda: True)
+    ds.fully_executed()
     assert ds.count() == 0
 
 
@@ -1473,7 +1481,9 @@ def test_schema(ray_start_regular_shared):
     ds = ray.data.range(10, parallelism=10)
     ds2 = ray.data.range_table(10, parallelism=10)
     ds3 = ds2.repartition(5)
+    ds3.fully_executed()
     ds4 = ds3.map(lambda x: {"a": "hi", "b": 1.0}).limit(5).repartition(1)
+    ds4.fully_executed()
     assert str(ds) == "Dataset(num_blocks=10, num_rows=10, schema=<class 'int'>)"
     assert str(ds2) == "Dataset(num_blocks=10, num_rows=10, schema={value: int64})"
     assert str(ds3) == "Dataset(num_blocks=5, num_rows=10, schema={value: int64})"
@@ -2284,7 +2294,7 @@ def test_drop_columns(ray_start_regular_shared, tmp_path):
         ]
         # Test dropping non-existent column
         with pytest.raises(KeyError):
-            ds.drop_columns(["dummy_col", "col1", "col2"])
+            ds.drop_columns(["dummy_col", "col1", "col2"]).fully_executed()
 
 
 def test_select_columns(ray_start_regular_shared):
@@ -2315,13 +2325,13 @@ def test_select_columns(ray_start_regular_shared):
         ]
         # Test selecting a column that is not in the dataset schema
         with pytest.raises(KeyError):
-            each_ds.select_columns(cols=["col1", "col2", "dummy_col"])
+            each_ds.select_columns(cols=["col1", "col2", "dummy_col"]).fully_executed()
 
     # Test simple
     ds3 = ray.data.range(10)
     assert ds3.dataset_format() == "simple"
     with pytest.raises(ValueError):
-        ds3.select_columns(cols=[])
+        ds3.select_columns(cols=[]).fully_executed()
 
 
 def test_map_batches_basic(ray_start_regular_shared, tmp_path):
@@ -2684,11 +2694,13 @@ def mutate(df):
     ds = ray.data.range_table(num_rows, parallelism=num_blocks).repartition(num_blocks)
     # Convert to Pandas blocks.
     ds = ds.map_batches(lambda df: df, batch_format="pandas", batch_size=None)
+    ds.fully_executed()
 
     # Apply UDF that mutates the batches, which should fail since the batch is
     # read-only.
     with pytest.raises(ValueError, match="tried to mutate a zero-copy read-only batch"):
-        ds.map_batches(mutate, batch_size=batch_size, zero_copy_batch=True)
+        ds = ds.map_batches(mutate, batch_size=batch_size, zero_copy_batch=True)
+        ds.fully_executed()
 
 
 BLOCK_BUNDLING_TEST_CASES = [
@@ -2710,10 +2722,12 @@ def test_map_batches_block_bundling_auto(
 
     # Blocks should be bundled up to the batch size.
     ds1 = ds.map_batches(lambda x: x, batch_size=batch_size)
+    ds1.fully_executed()
     assert ds1.num_blocks() == math.ceil(num_blocks / max(batch_size // block_size, 1))
 
     # Blocks should not be bundled up when batch_size is not specified.
     ds2 = ds.map_batches(lambda x: x)
+    ds2.fully_executed()
     assert ds2.num_blocks() == num_blocks
 
 
@@ -2796,7 +2810,7 @@ def good_fn(row):
     ds = ray.data.range(10, parallelism=1)
     error_message = "Current row has different columns compared to previous rows."
     with pytest.raises(ValueError) as e:
-        ds.map(bad_fn)
+        ds.map(bad_fn).fully_executed()
     assert error_message in str(e.value)
     ds_map = ds.map(good_fn)
     assert ds_map.take() == [{"a": "hello1", "b": "hello2"} for _ in range(10)]
@@ -5364,7 +5378,7 @@ def f(x):
     compute_strategy = ray.data.ActorPoolStrategy()
     ray.data.range(10, parallelism=10).map_batches(
         f, batch_size=1, compute=compute_strategy
-    )
+    ).fully_executed()
     expected_max_num_workers = math.ceil(
         num_cpus * (1 / compute_strategy.ready_to_total_workers_ratio)
     )
diff --git a/python/ray/data/tests/test_dataset_parquet.py b/python/ray/data/tests/test_dataset_parquet.py
index 09bffcf5a9086..6bf861e416edc 100644
--- a/python/ray/data/tests/test_dataset_parquet.py
+++ b/python/ray/data/tests/test_dataset_parquet.py
@@ -194,7 +194,10 @@ def prefetch_file_metadata(self, pieces):
 
     # Expect precomputed row counts and block sizes to be missing.
     assert ds._meta_count() is None
-    assert ds._plan._snapshot_blocks.size_bytes() == -1
+    assert (
+        ds._plan._snapshot_blocks is None
+        or ds._plan._snapshot_blocks.size_bytes() == -1
+    )
 
     # Expect to lazily compute all metadata correctly.
     assert ds._plan.execute()._num_computed() == 1
diff --git a/python/ray/data/tests/test_dynamic_block_split.py b/python/ray/data/tests/test_dynamic_block_split.py
index ad5aa16468500..f290807df4353 100644
--- a/python/ray/data/tests/test_dynamic_block_split.py
+++ b/python/ray/data/tests/test_dynamic_block_split.py
@@ -82,10 +82,13 @@ def test_dataset(
     assert ds.size_bytes() >= 0.7 * block_size * num_blocks * num_tasks
 
     map_ds = ds.map_batches(lambda x: x)
+    map_ds.fully_executed()
     assert map_ds.num_blocks() == num_tasks
     map_ds = ds.map_batches(lambda x: x, batch_size=num_blocks * num_tasks)
+    map_ds.fully_executed()
     assert map_ds.num_blocks() == 1
     map_ds = ds.map(lambda x: x)
+    map_ds.fully_executed()
     assert map_ds.num_blocks() == num_blocks * num_tasks
 
     ds_list = ds.split(5)
@@ -109,6 +112,7 @@ def test_dataset(
     assert ds.groupby("one").count().count() == num_blocks * num_tasks
 
     new_ds = ds.zip(ds)
+    new_ds.fully_executed()
     assert new_ds.num_blocks() == num_blocks * num_tasks
 
     assert len(ds.take(5)) == 5
diff --git a/python/ray/data/tests/test_optimize.py b/python/ray/data/tests/test_optimize.py
index a374ee564e31b..fee16b1bcc12b 100644
--- a/python/ray/data/tests/test_optimize.py
+++ b/python/ray/data/tests/test_optimize.py
@@ -63,6 +63,7 @@ def test_memory_sanity(shutdown_only):
     info = ray.init(num_cpus=1, object_store_memory=500e6)
     ds = ray.data.range(10)
     ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
+    ds.fully_executed()
     meminfo = memory_summary(info.address_info["address"], stats_only=True)
 
     # Sanity check spilling is happening as expected.
@@ -291,23 +292,11 @@ def _assert_has_stages(stages, stage_names):
 
 
 def test_stage_linking(ray_start_regular_shared):
-    # NOTE: This tests the internals of `ExecutionPlan`, which is bad practice. Remove
-    # this test once we have proper unit testing of `ExecutionPlan`.
-    # Test eager dataset.
-    ds = ray.data.range(10)
-    assert len(ds._plan._stages_before_snapshot) == 0
-    assert len(ds._plan._stages_after_snapshot) == 0
-    assert len(ds._plan._last_optimized_stages) == 0
-    ds = ds.map(lambda x: x + 1)
-    _assert_has_stages(ds._plan._stages_before_snapshot, ["map"])
-    assert len(ds._plan._stages_after_snapshot) == 0
-    _assert_has_stages(ds._plan._last_optimized_stages, ["read->map"])
-
     # Test lazy dataset.
     ds = ray.data.range(10).lazy()
     assert len(ds._plan._stages_before_snapshot) == 0
     assert len(ds._plan._stages_after_snapshot) == 0
-    assert len(ds._plan._last_optimized_stages) == 0
+    assert ds._plan._last_optimized_stages is None
     ds = ds.map(lambda x: x + 1)
     assert len(ds._plan._stages_before_snapshot) == 0
     _assert_has_stages(ds._plan._stages_after_snapshot, ["map"])
diff --git a/python/ray/data/tests/test_stats.py b/python/ray/data/tests/test_stats.py
index 1af089a25c751..6f3ed68b1d209 100644
--- a/python/ray/data/tests/test_stats.py
+++ b/python/ray/data/tests/test_stats.py
@@ -31,7 +31,7 @@ def test_dataset_stats_basic(ray_start_regular_shared, enable_auto_log_stats):
     )
     with patch.object(logger, "info") as mock_logger:
         ds = ray.data.range(1000, parallelism=10)
-        ds = ds.map_batches(lambda x: x)
+        ds = ds.map_batches(lambda x: x).fully_executed()
 
         if enable_auto_log_stats:
             logger_args, logger_kwargs = mock_logger.call_args
@@ -47,7 +47,7 @@ def test_dataset_stats_basic(ray_start_regular_shared, enable_auto_log_stats):
 """
             )
 
-        ds = ds.map(lambda x: x)
+        ds = ds.map(lambda x: x).fully_executed()
         if enable_auto_log_stats:
             logger_args, logger_kwargs = mock_logger.call_args
             assert (
@@ -239,7 +239,7 @@ def test_dataset_pipeline_stats_basic(ray_start_regular_shared, enable_auto_log_
     )
     with patch.object(logger, "info") as mock_logger:
         ds = ray.data.range(1000, parallelism=10)
-        ds = ds.map_batches(lambda x: x)
+        ds = ds.map_batches(lambda x: x).fully_executed()
 
         if enable_auto_log_stats:
             logger_args, logger_kwargs = mock_logger.call_args
diff --git a/python/ray/train/batch_predictor.py b/python/ray/train/batch_predictor.py
index 56672e46b238c..302adc67e5fc9 100644
--- a/python/ray/train/batch_predictor.py
+++ b/python/ray/train/batch_predictor.py
@@ -99,7 +99,7 @@ def predict(
         separate_gpu_stage: bool = True,
         ray_remote_args: Optional[Dict[str, Any]] = None,
         **predict_kwargs,
-    ) -> ray.data.Dataset:
+    ) -> Union[ray.data.Dataset, ray.data.DatasetPipeline]:
         """Run batch scoring on a Dataset.
 
         Args:
@@ -309,6 +309,10 @@ def __call__(self, input_batch: DataBatchType) -> DataBatchType:
             **ray_remote_args,
         )
 
+        if isinstance(prediction_results, ray.data.Dataset):
+            # Force execution because Dataset uses lazy execution by default.
+            prediction_results.fully_executed()
+
         return prediction_results
 
     def predict_pipelined(