[data] Disable block slicing for shuffle ops #40538

Merged 5 commits on Oct 23, 2023
5 changes: 3 additions & 2 deletions python/ray/data/_internal/output_buffer.py
@@ -78,8 +78,9 @@ def next(self) -> Block:
             target_num_rows = max(1, target_num_rows)
 
             num_rows = min(target_num_rows, block.num_rows())
-            block_to_yield = block.slice(0, num_rows)
-            block_remainder = block.slice(num_rows, block.num_rows())
+            # Use copy=True to avoid holding the entire block in memory.
+            block_to_yield = block.slice(0, num_rows, copy=True)
+            block_remainder = block.slice(num_rows, block.num_rows(), copy=True)
 
         self._buffer = DelegatingBlockBuilder()
         if block_remainder is not None:
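
The new copy=True arguments address the note in the added comment: slicing a block without a copy yields a zero-copy view that keeps the entire parent block's buffers alive, so emitting many small views can pin the full, undivided block in memory. A minimal NumPy sketch of the same view-vs-copy retention behavior (Ray blocks are Arrow tables or pandas DataFrames, but the semantics are analogous; this is an illustration, not Ray code):

    import numpy as np

    big = np.arange(100_000_000)   # ~800 MB parent buffer
    view = big[:10]                # zero-copy view; shares the parent buffer
    copied = big[:10].copy()       # owns its own tiny buffer
    del big
    # `view.base` still references the full 800 MB buffer, so it cannot be
    # freed until `view` is dropped; `copied` lets the parent be collected.
    assert view.base is not None and copied.base is None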
44 changes: 34 additions & 10 deletions python/ray/data/_internal/planner/exchange/shuffle_task_spec.py
@@ -3,10 +3,13 @@
 
 import numpy as np
 
+from ray.data._internal.dataset_logger import DatasetLogger
 from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder
 from ray.data._internal.planner.exchange.interfaces import ExchangeTaskSpec
 from ray.data.block import Block, BlockAccessor, BlockExecStats, BlockMetadata
 
+logger = DatasetLogger(__name__)
+
 
 class ShuffleTaskSpec(ExchangeTaskSpec):
     """
@@ -19,12 +22,18 @@ class ShuffleTaskSpec(ExchangeTaskSpec):
 
     def __init__(
         self,
+        target_max_block_size: int,
         random_shuffle: bool = False,
         random_seed: Optional[int] = None,
         upstream_map_fn: Optional[Callable[[Iterable[Block]], Iterable[Block]]] = None,
     ):
         super().__init__(
-            map_args=[upstream_map_fn, random_shuffle, random_seed],
+            map_args=[
+                target_max_block_size,
+                upstream_map_fn,
+                random_shuffle,
+                random_seed,
+            ],
             reduce_args=[random_shuffle, random_seed],
         )
 
@@ -33,22 +42,37 @@ def map(
         idx: int,
         block: Block,
         output_num_blocks: int,
+        target_max_block_size: int,

Review comment (Contributor): Suggested change: rename the new parameter `target_max_block_size: int,` to `target_shuffle_max_block_size: int,`.

         upstream_map_fn: Optional[Callable[[Iterable[Block]], Iterable[Block]]],
         random_shuffle: bool,
         random_seed: Optional[int],
     ) -> List[Union[BlockMetadata, Block]]:
         # TODO: Support fusion with other upstream operators.
         stats = BlockExecStats.builder()
         if upstream_map_fn:
-            mapped_blocks = list(upstream_map_fn([block]))
-            if len(mapped_blocks) > 1:
-                builder = BlockAccessor.for_block(mapped_blocks[0]).builder()
-                for b in mapped_blocks:
-                    builder.add_block(b)
-                block = builder.build()
-            else:
-                block = mapped_blocks[0]
+            # TODO: Support dynamic block splitting in
+            # all-to-all ops, to avoid having to re-fuse
+            # upstream blocks together.
+            upstream_map_iter = upstream_map_fn([block])
+            mapped_block = next(upstream_map_iter)
+            builder = BlockAccessor.for_block(mapped_block).builder()
+            builder.add_block(mapped_block)
+            for mapped_block in upstream_map_iter:
+                builder.add_block(mapped_block)
+            # Drop the upstream inputs to reduce memory usage.
+            del mapped_block
+            block = builder.build()
         block = BlockAccessor.for_block(block)
+        if block.size_bytes() > target_max_block_size:
+            logger.get_logger().warn(

Review comment (Contributor): I'm worried that this warning could confuse users. Can we log it at the debug level instead?

Review comment (Contributor): I think printing a warning makes sense, but we may want to make it less verbose by, for example, only warning when size_bytes > 2 * target_max_block_size. Also, suggesting an increase in parallelism seems to make more sense than suggesting that fusion be disabled.

Review comment (Author): I'll change the threshold to 2 * target_max_block_size. But materialize() is the best solution here (see the other comment); it's hard to recommend an exact fix via increasing parallelism.

+                "Input block to map task has size "
+                f"{block.size_bytes() // (1024 * 1024)}MiB, which exceeds "
+                "DataContext.get_current().target_shuffle_max_block_size="
+                f"{target_max_block_size // (1024 * 1024)}MiB. "
+                "This can lead to out-of-memory errors and can happen "
+                "when map tasks are fused to the shuffle operation. "
+                "To prevent fusion, call Dataset.materialize() on the "
+                "dataset before shuffling."

Review comment (Contributor): Setting different resources can also prevent fusion, without having to materialize all the data. Another workaround is to increase the parallelism to make each map op finer-grained.

Review comment (Author): I think materializing is fine here, because you have to materialize all the data for an all-to-all op anyway.

Review comment (Contributor): Makes sense.

+            )
 
         # Randomize the distribution of records to blocks.
         if random_shuffle:
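
The rewritten map() above consumes the upstream iterator one block at a time, adding each block to the builder before pulling the next, instead of materializing the whole list of upstream outputs first. A self-contained sketch of that pattern, with plain Python lists standing in for Ray blocks (the function names are illustrative, not Ray internals):

    from typing import Iterator, List

    def fuse_streaming(blocks: Iterator[List[int]]) -> List[int]:
        # Pull one upstream block at a time; at any moment only the fused
        # output plus a single input block is resident in memory.
        fused: List[int] = []
        for block in blocks:
            fused.extend(block)
        return fused

    def fuse_eager(blocks: Iterator[List[int]]) -> List[int]:
        # The previous pattern: list() forces every upstream block to coexist
        # with the fused result, roughly doubling peak memory for large inputs.
        materialized = list(blocks)
        fused: List[int] = []
        for block in materialized:
            fused.extend(block)
        return fused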
10 changes: 9 additions & 1 deletion python/ray/data/_internal/planner/random_shuffle.py
@@ -37,7 +37,14 @@ def fn(
         upstream_map_fn = None
         nonlocal ray_remote_args
         if map_transformer:
-            map_transformer.set_target_max_block_size(ctx.target_max_block_size)
+            # NOTE(swang): We override the target block size with infinity, to
+            # prevent the upstream map from slicing its output into smaller
+            # blocks. Since the shuffle task will just fuse these back
+            # together, the extra slicing and re-fusing can add high memory
+            # overhead. This can be removed once dynamic block splitting is
+            # supported for all-to-all ops.
+            # See https://github.com/ray-project/ray/issues/40518.
+            map_transformer.set_target_max_block_size(float("inf"))
 
             def upstream_map_fn(blocks):
                 return map_transformer.apply_transform(blocks, ctx)
@@ -47,6 +54,7 @@ def upstream_map_fn(blocks):
             ray_remote_args = ctx.upstream_map_ray_remote_args
 
         shuffle_spec = ShuffleTaskSpec(
+            ctx.target_max_block_size,
             random_shuffle=True,
             random_seed=seed,
             upstream_map_fn=upstream_map_fn,
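
The warning added in shuffle_task_spec.py recommends calling Dataset.materialize() before shuffling to prevent map tasks from being fused into the shuffle. A hedged usage sketch of that workaround (expensive_fn is a placeholder, and the exact memory benefit depends on the workload):

    import ray

    def expensive_fn(batch):  # placeholder for a real user transform
        return batch

    ds = ray.data.range(100_000_000).map_batches(expensive_fn)
    # Materializing first prevents the map from being fused into the shuffle
    # tasks, so the shuffle does not receive oversized, re-fused input blocks.
    ds = ds.materialize()
    ds = ds.random_shuffle()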
10 changes: 9 additions & 1 deletion python/ray/data/_internal/planner/repartition.py
@@ -36,12 +36,20 @@ def shuffle_repartition_fn(
         map_transformer: Optional["MapTransformer"] = ctx.upstream_map_transformer
         upstream_map_fn = None
         if map_transformer:
-            map_transformer.set_target_max_block_size(ctx.target_max_block_size)
+            # NOTE(swang): We override the target block size with infinity, to
+            # prevent the upstream map from slicing its output into smaller
+            # blocks. Since the shuffle task will just fuse these back
+            # together, the extra slicing and re-fusing can add high memory
+            # overhead. This can be removed once dynamic block splitting is
+            # supported for all-to-all ops.
+            # See https://github.com/ray-project/ray/issues/40518.
+            map_transformer.set_target_max_block_size(float("inf"))

Review comment (Contributor, on lines +39 to +46): nit: better to put this override into a separate method, to avoid repeating the comment in both planners (see the sketch after this file's diff).


             def upstream_map_fn(blocks):
                 return map_transformer.apply_transform(blocks, ctx)
 
         shuffle_spec = ShuffleTaskSpec(
+            ctx.target_max_block_size,
             random_shuffle=False,
             upstream_map_fn=upstream_map_fn,
         )
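
A possible shape for the reviewer's nit above: the override and its explanatory comment could live in one shared helper that both random_shuffle.py and repartition.py call. The helper below is hypothetical (this PR keeps the override inline in both planners):

    def _disable_upstream_block_slicing(map_transformer: "MapTransformer") -> None:
        # Hypothetical helper. Override the target block size with infinity so
        # the fused upstream map does not slice its output into small blocks
        # that the shuffle task would immediately re-fuse.
        # See https://github.com/ray-project/ray/issues/40518.
        map_transformer.set_target_max_block_size(float("inf"))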
2 changes: 1 addition & 1 deletion release/nightly_tests/dataset/sort.py
@@ -116,7 +116,7 @@ def run_benchmark(args):
         ds = ds.random_shuffle()
     else:
         ds = ds.sort(key="c_0")
-    ds.materialize()
+    ds = ds.materialize()
     ds_stats = ds.stats()
 
     print("==== Driver memory summary ====")
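
On the sort.py fix: Dataset.materialize() executes the lazy plan and returns a MaterializedDataset rather than materializing in place, so the benchmark now captures the return value, presumably so that the following stats() call reports the executed, materialized plan instead of the original lazy dataset. A minimal hedged sketch of the corrected usage:

    import ray

    ds = ray.data.range(1000)
    # materialize() returns a new MaterializedDataset; keep the returned handle
    # so stats() (and any reuse) refer to the executed plan.
    ds = ds.materialize()
    print(ds.stats())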