[Datasets] Enable lazy execution by default (#31286)
This PR enables lazy execution by default. See ray-project/enhancements#19 for the motivation. The changes include:
* Change the `Dataset` constructor to `Dataset.__init__(lazy: bool = True)`, and remove the `defer_execution` parameter, as it is no longer needed.
* `read_api.py:read_datasource()` now returns a lazy `Dataset`, while still eagerly computing the first input block (removing this is tracked in the TODO list below).
* Add `ds.fully_executed()` calls to the unit tests that require materialized results, so they keep passing.
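
For illustration, a minimal sketch of the new default behavior (not part of the diff; it only uses `ray.data` APIs referenced elsewhere in this PR):

```python
import ray

# Lazy execution is now the default: transformations only extend the plan,
# and no transformation tasks run until the dataset is consumed.
ds = ray.data.range(1000)
ds = ds.map_batches(lambda batch: [v * 2 for v in batch])

# Consumption (take(), count(), iteration, ...) triggers execution; it can
# also be forced explicitly, as the updated unit tests below do.
ds = ds.fully_executed()
print(ds.take(5))
```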

TODO:
- [x] Fix all unit tests
- [x] #31459
- [x] #31460 
- [ ] Remove the behavior of eagerly computing the first block on read
- [ ] #31417
- [ ] Update documentation
c21 authored and AmeerHajAli committed Jan 12, 2023
1 parent 58f349d commit b0357fd
Showing 10 changed files with 64 additions and 47 deletions.
@@ -580,7 +580,7 @@ def train_func(config):
read_dataset(data_path)
)

num_columns = len(train_dataset.schema().names)
num_columns = len(train_dataset.schema(fetch_if_missing=True).names)
# remove label column.
num_features = num_columns - 1

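A hedged aside on the hunk above: under lazy execution, a transformed dataset may not have a known schema yet, so plain `schema()` can return `None`; `fetch_if_missing=True` executes enough of the plan to compute it. A minimal sketch (the `range_table`/`map` calls are illustrative, not from this diff):

```python
import ray

# Lazy by default: the map() below is only planned, not executed yet.
ds = ray.data.range_table(1000).map(lambda r: {"value": r["value"] * 2})

# On an unexecuted dataset, plain schema() may return None, so the training
# example above now passes fetch_if_missing=True, which executes enough of
# the plan to compute the schema.
num_columns = len(ds.schema(fetch_if_missing=True).names)
print(num_columns)
```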
5 changes: 5 additions & 0 deletions python/ray/data/_internal/plan.py
@@ -251,6 +251,11 @@ def schema(
self.execute()
else:
return None
elif self._in_blocks is not None and self._snapshot_blocks is None:
# If the plan only has input blocks, we execute it, so snapshot has output.
# This applies to newly created dataset. For example, initial dataset from
# read, and output datasets of Dataset.split().
self.execute()
# Snapshot is now guaranteed to be the output of the final stage or None.
blocks = self._snapshot_blocks
if not blocks:
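A hedged sketch of the behavior the new branch above provides for newly created datasets (for example, the outputs of `Dataset.split()`); the concrete calls below are illustrative, not from this diff:

```python
import ray

# Newly created datasets -- e.g. fresh from a read, or the outputs of
# Dataset.split() -- only carry input blocks in their plan. With the change
# above, schema() executes such a plan so that the snapshot has output
# blocks and a schema can be reported instead of None.
left, right = ray.data.range_table(10).split(2)
print(left.schema())  # executes the split output's plan and returns its schema
```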
30 changes: 14 additions & 16 deletions python/ray/data/dataset.py
@@ -177,19 +177,19 @@ class Dataset(Generic[T]):
>>> ds = ray.data.range(1000)
>>> # Transform in parallel with map_batches().
>>> ds.map_batches(lambda batch: [v * 2 for v in batch])
Dataset(num_blocks=..., num_rows=1000, schema=<class 'int'>)
Dataset(num_blocks=..., num_rows=..., schema=...)
>>> # Compute max.
>>> ds.max()
999
>>> # Group the data.
>>> ds.groupby(lambda x: x % 3).count()
Dataset(num_blocks=..., num_rows=3, schema=<class 'tuple'>)
Dataset(num_blocks=..., num_rows=..., schema=...)
>>> # Shuffle this dataset randomly.
>>> ds.random_shuffle()
Dataset(num_blocks=..., num_rows=1000, schema=<class 'int'>)
Dataset(num_blocks=..., num_rows=..., schema=...)
>>> # Sort it back in order.
>>> ds.sort()
Dataset(num_blocks=..., num_rows=1000, schema=<class 'int'>)
Dataset(num_blocks=..., num_rows=..., schema=...)
Since Datasets are just lists of Ray object refs, they can be passed
between Ray tasks and actors without incurring a copy. Datasets support
@@ -202,9 +202,7 @@ def __init__(
self,
plan: ExecutionPlan,
epoch: int,
lazy: bool,
*,
defer_execution: bool = False,
lazy: bool = True,
):
"""Construct a Dataset (internal API).
@@ -219,7 +217,7 @@ def __init__(
self._epoch = epoch
self._lazy = lazy

if not lazy and not defer_execution:
if not lazy:
self._plan.execute(allow_clear_input_blocks=False)

@staticmethod
@@ -243,7 +241,7 @@ def map(
>>> # Transform python objects.
>>> ds = ray.data.range(1000)
>>> ds.map(lambda x: x * 2)
Dataset(num_blocks=..., num_rows=1000, schema=<class 'int'>)
Dataset(num_blocks=..., num_rows=..., schema=...)
>>> # Transform Arrow records.
>>> ds = ray.data.from_items(
... [{"value": i} for i in range(1000)])
@@ -804,7 +802,7 @@ def flat_map(
>>> import ray
>>> ds = ray.data.range(1000)
>>> ds.flat_map(lambda x: [x, x ** 2, x ** 3])
Dataset(num_blocks=..., num_rows=3000, schema=<class 'int'>)
Dataset(num_blocks=..., num_rows=..., schema=...)
Time complexity: O(dataset size / parallelism)
@@ -872,7 +870,7 @@ def filter(
>>> import ray
>>> ds = ray.data.range(100)
>>> ds.filter(lambda x: x % 2 == 0)
Dataset(num_blocks=..., num_rows=50, schema=<class 'int'>)
Dataset(num_blocks=..., num_rows=..., schema=...)
Time complexity: O(dataset size / parallelism)
@@ -966,10 +964,10 @@ def random_shuffle(
>>> ds = ray.data.range(100)
>>> # Shuffle this dataset randomly.
>>> ds.random_shuffle()
Dataset(num_blocks=..., num_rows=100, schema=<class 'int'>)
Dataset(num_blocks=..., num_rows=..., schema=...)
>>> # Shuffle this dataset with a fixed random seed.
>>> ds.random_shuffle(seed=12345)
Dataset(num_blocks=..., num_rows=100, schema=<class 'int'>)
Dataset(num_blocks=..., num_rows=..., schema=...)
Time complexity: O(dataset size / parallelism)
@@ -1012,7 +1010,7 @@ def randomize_block_order(
"""

plan = self._plan.with_stage(RandomizeBlocksStage(seed))
return Dataset(plan, self._epoch, self._lazy, defer_execution=True)
return Dataset(plan, self._epoch, self._lazy)

def random_sample(
self, fraction: float, *, seed: Optional[int] = None
@@ -1533,7 +1531,7 @@ def groupby(self, key: Optional[KeyFn]) -> "GroupedDataset[T]":
>>> import ray
>>> # Group by a key function and aggregate.
>>> ray.data.range(100).groupby(lambda x: x % 3).count()
Dataset(num_blocks=..., num_rows=3, schema=<class 'tuple'>)
Dataset(num_blocks=..., num_rows=..., schema=...)
>>> # Group by an Arrow table column and aggregate.
>>> ray.data.from_items([
... {"A": x % 3, "B": x} for x in range(100)]).groupby(
@@ -1933,7 +1931,7 @@ def sort(
>>> # Sort using the entire record as the key.
>>> ds = ray.data.range(100)
>>> ds.sort()
Dataset(num_blocks=..., num_rows=100, schema=<class 'int'>)
Dataset(num_blocks=..., num_rows=..., schema=...)
>>> # Sort by a single column in descending order.
>>> ds = ray.data.from_items(
... [{"value": i} for i in range(1000)])
6 changes: 3 additions & 3 deletions python/ray/data/read_api.py
@@ -339,9 +339,9 @@ def read_datasource(
block_list.ensure_metadata_for_first_block()

return Dataset(
ExecutionPlan(block_list, block_list.stats(), run_by_consumer=False),
0,
False,
plan=ExecutionPlan(block_list, block_list.stats(), run_by_consumer=False),
epoch=0,
lazy=True,
)


32 changes: 23 additions & 9 deletions python/ray/data/tests/test_dataset.py
@@ -337,10 +337,10 @@ def test_zip(ray_start_regular_shared):
ds1 = ray.data.range(5, parallelism=5)
ds2 = ray.data.range(5, parallelism=5).map(lambda x: x + 1)
ds = ds1.zip(ds2)
assert ds.schema() == tuple
assert ds.schema(fetch_if_missing=True) == tuple
assert ds.take() == [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)]
with pytest.raises(ValueError):
ds.zip(ray.data.range(3))
ds.zip(ray.data.range(3)).fully_executed()


def test_zip_pandas(ray_start_regular_shared):
@@ -366,8 +366,8 @@ def test_zip_arrow(ray_start_regular_shared):
lambda r: {"a": r["value"] + 1, "b": r["value"] + 2}
)
ds = ds1.zip(ds2)
assert "{id: int64, a: int64, b: int64}" in str(ds)
assert ds.count() == 5
assert "{id: int64, a: int64, b: int64}" in str(ds)
result = [r.as_pydict() for r in ds.take()]
assert result[0] == {"id": 0, "a": 1, "b": 2}

@@ -749,6 +749,7 @@ def test_tensors_sort(ray_start_regular_shared):
def test_tensors_inferred_from_map(ray_start_regular_shared):
# Test map.
ds = ray.data.range(10, parallelism=10).map(lambda _: np.ones((4, 4)))
ds.fully_executed()
assert str(ds) == (
"Dataset(num_blocks=10, num_rows=10, "
"schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})"
@@ -758,6 +759,7 @@ def test_tensors_inferred_from_map(ray_start_regular_shared):
ds = ray.data.range(16, parallelism=4).map_batches(
lambda _: np.ones((3, 4, 4)), batch_size=2
)
ds.fully_executed()
assert str(ds) == (
"Dataset(num_blocks=4, num_rows=24, "
"schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})"
@@ -767,6 +769,7 @@ def test_tensors_inferred_from_map(ray_start_regular_shared):
ds = ray.data.range(10, parallelism=10).flat_map(
lambda _: [np.ones((4, 4)), np.ones((4, 4))]
)
ds.fully_executed()
assert str(ds) == (
"Dataset(num_blocks=10, num_rows=20, "
"schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})"
@@ -776,6 +779,7 @@ def test_tensors_inferred_from_map(ray_start_regular_shared):
ds = ray.data.range(16, parallelism=4).map_batches(
lambda _: pd.DataFrame({"a": [np.ones((4, 4))] * 3}), batch_size=2
)
ds.fully_executed()
assert str(ds) == (
"Dataset(num_blocks=4, num_rows=24, "
"schema={a: TensorDtype(shape=(4, 4), dtype=float64)})"
@@ -785,6 +789,7 @@ def test_tensors_inferred_from_map(ray_start_regular_shared):
lambda _: pd.DataFrame({"a": [np.ones((2, 2)), np.ones((3, 3))]}),
batch_size=2,
)
ds.fully_executed()
assert str(ds) == (
"Dataset(num_blocks=4, num_rows=16, "
"schema={a: TensorDtype(shape=(None, None), dtype=float64)})"
@@ -1456,24 +1461,29 @@ def test_empty_dataset(ray_start_regular_shared):

ds = ray.data.range(1)
ds = ds.filter(lambda x: x > 1)
ds.fully_executed()
assert str(ds) == "Dataset(num_blocks=1, num_rows=0, schema=Unknown schema)"

# Test map on empty dataset.
ds = ray.data.from_items([])
ds = ds.map(lambda x: x)
ds.fully_executed()
assert ds.count() == 0

# Test filter on empty dataset.
ds = ray.data.from_items([])
ds = ds.filter(lambda: True)
ds.fully_executed()
assert ds.count() == 0


def test_schema(ray_start_regular_shared):
ds = ray.data.range(10, parallelism=10)
ds2 = ray.data.range_table(10, parallelism=10)
ds3 = ds2.repartition(5)
ds3.fully_executed()
ds4 = ds3.map(lambda x: {"a": "hi", "b": 1.0}).limit(5).repartition(1)
ds4.fully_executed()
assert str(ds) == "Dataset(num_blocks=10, num_rows=10, schema=<class 'int'>)"
assert str(ds2) == "Dataset(num_blocks=10, num_rows=10, schema={value: int64})"
assert str(ds3) == "Dataset(num_blocks=5, num_rows=10, schema={value: int64})"
@@ -2284,7 +2294,7 @@ def test_drop_columns(ray_start_regular_shared, tmp_path):
]
# Test dropping non-existent column
with pytest.raises(KeyError):
ds.drop_columns(["dummy_col", "col1", "col2"])
ds.drop_columns(["dummy_col", "col1", "col2"]).fully_executed()


def test_select_columns(ray_start_regular_shared):
@@ -2315,13 +2325,13 @@ def test_select_columns(ray_start_regular_shared):
]
# Test selecting a column that is not in the dataset schema
with pytest.raises(KeyError):
each_ds.select_columns(cols=["col1", "col2", "dummy_col"])
each_ds.select_columns(cols=["col1", "col2", "dummy_col"]).fully_executed()

# Test simple
ds3 = ray.data.range(10)
assert ds3.dataset_format() == "simple"
with pytest.raises(ValueError):
ds3.select_columns(cols=[])
ds3.select_columns(cols=[]).fully_executed()


def test_map_batches_basic(ray_start_regular_shared, tmp_path):
@@ -2684,11 +2694,13 @@ def mutate(df):
ds = ray.data.range_table(num_rows, parallelism=num_blocks).repartition(num_blocks)
# Convert to Pandas blocks.
ds = ds.map_batches(lambda df: df, batch_format="pandas", batch_size=None)
ds.fully_executed()

# Apply UDF that mutates the batches, which should fail since the batch is
# read-only.
with pytest.raises(ValueError, match="tried to mutate a zero-copy read-only batch"):
ds.map_batches(mutate, batch_size=batch_size, zero_copy_batch=True)
ds = ds.map_batches(mutate, batch_size=batch_size, zero_copy_batch=True)
ds.fully_executed()


BLOCK_BUNDLING_TEST_CASES = [
@@ -2710,10 +2722,12 @@ def test_map_batches_block_bundling_auto(

# Blocks should be bundled up to the batch size.
ds1 = ds.map_batches(lambda x: x, batch_size=batch_size)
ds1.fully_executed()
assert ds1.num_blocks() == math.ceil(num_blocks / max(batch_size // block_size, 1))

# Blocks should not be bundled up when batch_size is not specified.
ds2 = ds.map_batches(lambda x: x)
ds2.fully_executed()
assert ds2.num_blocks() == num_blocks


@@ -2796,7 +2810,7 @@ def good_fn(row):
ds = ray.data.range(10, parallelism=1)
error_message = "Current row has different columns compared to previous rows."
with pytest.raises(ValueError) as e:
ds.map(bad_fn)
ds.map(bad_fn).fully_executed()
assert error_message in str(e.value)
ds_map = ds.map(good_fn)
assert ds_map.take() == [{"a": "hello1", "b": "hello2"} for _ in range(10)]
@@ -5364,7 +5378,7 @@ def f(x):
compute_strategy = ray.data.ActorPoolStrategy()
ray.data.range(10, parallelism=10).map_batches(
f, batch_size=1, compute=compute_strategy
)
).fully_executed()
expected_max_num_workers = math.ceil(
num_cpus * (1 / compute_strategy.ready_to_total_workers_ratio)
)
5 changes: 4 additions & 1 deletion python/ray/data/tests/test_dataset_parquet.py
@@ -194,7 +194,10 @@ def prefetch_file_metadata(self, pieces):

# Expect precomputed row counts and block sizes to be missing.
assert ds._meta_count() is None
assert ds._plan._snapshot_blocks.size_bytes() == -1
assert (
ds._plan._snapshot_blocks is None
or ds._plan._snapshot_blocks.size_bytes() == -1
)

# Expect to lazily compute all metadata correctly.
assert ds._plan.execute()._num_computed() == 1
4 changes: 4 additions & 0 deletions python/ray/data/tests/test_dynamic_block_split.py
@@ -82,10 +82,13 @@ def test_dataset(
assert ds.size_bytes() >= 0.7 * block_size * num_blocks * num_tasks

map_ds = ds.map_batches(lambda x: x)
map_ds.fully_executed()
assert map_ds.num_blocks() == num_tasks
map_ds = ds.map_batches(lambda x: x, batch_size=num_blocks * num_tasks)
map_ds.fully_executed()
assert map_ds.num_blocks() == 1
map_ds = ds.map(lambda x: x)
map_ds.fully_executed()
assert map_ds.num_blocks() == num_blocks * num_tasks

ds_list = ds.split(5)
@@ -109,6 +112,7 @@ def test_dataset(
assert ds.groupby("one").count().count() == num_blocks * num_tasks

new_ds = ds.zip(ds)
new_ds.fully_executed()
assert new_ds.num_blocks() == num_blocks * num_tasks

assert len(ds.take(5)) == 5
15 changes: 2 additions & 13 deletions python/ray/data/tests/test_optimize.py
@@ -63,6 +63,7 @@ def test_memory_sanity(shutdown_only):
info = ray.init(num_cpus=1, object_store_memory=500e6)
ds = ray.data.range(10)
ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
ds.fully_executed()
meminfo = memory_summary(info.address_info["address"], stats_only=True)

# Sanity check spilling is happening as expected.
@@ -291,23 +292,11 @@ def _assert_has_stages(stages, stage_names):


def test_stage_linking(ray_start_regular_shared):
# NOTE: This tests the internals of `ExecutionPlan`, which is bad practice. Remove
# this test once we have proper unit testing of `ExecutionPlan`.
# Test eager dataset.
ds = ray.data.range(10)
assert len(ds._plan._stages_before_snapshot) == 0
assert len(ds._plan._stages_after_snapshot) == 0
assert len(ds._plan._last_optimized_stages) == 0
ds = ds.map(lambda x: x + 1)
_assert_has_stages(ds._plan._stages_before_snapshot, ["map"])
assert len(ds._plan._stages_after_snapshot) == 0
_assert_has_stages(ds._plan._last_optimized_stages, ["read->map"])

# Test lazy dataset.
ds = ray.data.range(10).lazy()
assert len(ds._plan._stages_before_snapshot) == 0
assert len(ds._plan._stages_after_snapshot) == 0
assert len(ds._plan._last_optimized_stages) == 0
assert ds._plan._last_optimized_stages is None
ds = ds.map(lambda x: x + 1)
assert len(ds._plan._stages_before_snapshot) == 0
_assert_has_stages(ds._plan._stages_after_snapshot, ["map"])
