-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Data] Cap op concurrency with exponential ramp-up #40275
Changes from 16 commits
2d580a7
ed288f3
78d2b10
b699382
c08f1b6
647128a
a517c30
37913b0
fc14b94
600f197
061faf8
1a35d2d
527606d
681b589
ec46dd1
26e59b4
04e4e5a
9443aeb
ee0dd39
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
import logging | ||
from abc import ABC, abstractmethod | ||
from typing import TYPE_CHECKING | ||
|
||
import ray | ||
|
||
if TYPE_CHECKING: | ||
from ray.data._internal.execution.interfaces.physical_operator import ( | ||
PhysicalOperator, | ||
) | ||
from ray.data._internal.execution.streaming_executor_state import Topology | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
# Default enabled backpressure policies and their config key. | ||
# Use `DataContext.set_plugin_config` to configure them. | ||
# TODO(hchen): Enable ConcurrencyCapBackpressurePolicy by default. | ||
ENABLED_BACKPRESSURE_POLICIES = [] | ||
ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY = "backpressure_policies.enabled" | ||
|
||
|
||
def get_backpressure_policies(topology: "Topology"):
    """Instantiate every enabled backpressure policy for ``topology``.

    The policy classes are looked up in the current ``DataContext`` plugin
    config under ``ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY``; when that key
    is not set, ``ENABLED_BACKPRESSURE_POLICIES`` is used instead.

    Args:
        topology: Passed as the only argument to each policy class.

    Returns:
        A list with one policy instance per configured policy class, in the
        configured order.
    """
    policy_classes = ray.data.DataContext.get_current().get_plugin_config(
        ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY, ENABLED_BACKPRESSURE_POLICIES
    )

    instances = []
    for policy_cls in policy_classes:
        instances.append(policy_cls(topology))
    return instances
|
||
|
||
class BackpressurePolicy(ABC):
    """Abstract interface that every backpressure policy implements."""

    @abstractmethod
    def __init__(self, topology: "Topology"):
        """Set up the policy for the given execution topology."""

    @abstractmethod
    def can_run(self, op: "PhysicalOperator") -> bool:
        """Decide whether ``op`` is allowed to run.

        Called when StreamingExecutor selects an operator to run in
        `streaming_executor_state.select_operator_to_run()`.

        Returns: True if the operator can run, False otherwise.
        """
|
||
|
||
class ConcurrencyCapBackpressurePolicy(BackpressurePolicy):
    """A backpressure policy that caps the concurrency of each operator.

    The concurrency cap limits the number of concurrently running tasks.
    It will be set to an initial value, and will ramp up exponentially.

    The concrete strategy is as follows:
    - Each PhysicalOperator is assigned an initial concurrency cap.
    - A PhysicalOperator can run new tasks if the number of running tasks is
      less than the cap.
    - When the number of finished tasks reaches
      `cap * CAP_MULTIPLY_THRESHOLD`, the cap is multiplied by
      `CAP_MULTIPLIER`. This repeats until the finished-task count is below
      the new threshold.
    """

    # Following are the default values followed by the config keys of the
    # available configs.
    # Use `DataContext.set_plugin_config` to config them.

    # The initial concurrency cap for each operator.
    INIT_CAP = 4
    INIT_CAP_CONFIG_KEY = "backpressure_policies.concurrency_cap.init_cap"
    # A ratio in (0, 1]. When the number of finished tasks reaches the current
    # cap times this ratio, the cap is multiplied by the multiplier.
    CAP_MULTIPLY_THRESHOLD = 0.5
    CAP_MULTIPLY_THRESHOLD_CONFIG_KEY = (
        "backpressure_policies.concurrency_cap.cap_multiply_threshold"
    )
    # The multiplier to multiply the concurrency cap by. Must be > 1.
    CAP_MULTIPLIER = 2.0
    CAP_MULTIPLIER_CONFIG_KEY = "backpressure_policies.concurrency_cap.cap_multiplier"

    def __init__(self, topology: "Topology"):
        """Load the config and give every operator the initial cap.

        The three settings are read from the current ``DataContext`` plugin
        config, falling back to the class-level defaults.

        Args:
            topology: Each key yielded by ``topology.items()`` is registered
                as an operator with the initial concurrency cap.

        Raises:
            AssertionError: If a configured value is out of range.
        """
        self._concurrency_caps: dict["PhysicalOperator", float] = {}

        data_context = ray.data.DataContext.get_current()
        self._init_cap = data_context.get_plugin_config(
            self.INIT_CAP_CONFIG_KEY, self.INIT_CAP
        )
        self._cap_multiplier = data_context.get_plugin_config(
            self.CAP_MULTIPLIER_CONFIG_KEY, self.CAP_MULTIPLIER
        )
        self._cap_multiply_threshold = data_context.get_plugin_config(
            self.CAP_MULTIPLY_THRESHOLD_CONFIG_KEY, self.CAP_MULTIPLY_THRESHOLD
        )

        # These bounds also guarantee that the ramp-up loop in `can_run`
        # terminates: the cap is positive and strictly grows on every step.
        assert self._init_cap > 0, "init_cap must be > 0"
        assert (
            0 < self._cap_multiply_threshold <= 1
        ), "cap_multiply_threshold must be in (0, 1]"
        assert self._cap_multiplier > 1, "cap_multiplier must be > 1"

        # Lazy %-style arguments: the message is only formatted when debug
        # logging is actually enabled.
        logger.debug(
            "ConcurrencyCapBackpressurePolicy initialized with config: %s, %s, %s",
            self._init_cap,
            self._cap_multiply_threshold,
            self._cap_multiplier,
        )

        for op, _ in topology.items():
            self._concurrency_caps[op] = self._init_cap

    def can_run(self, op: "PhysicalOperator") -> bool:
        """Ramp up the cap of ``op`` if due, then check it against the cap.

        Args:
            op: An operator that was registered from the topology at
                construction time.

        Returns:
            True if the number of running tasks of ``op`` is strictly below
            its (possibly just increased) concurrency cap, False otherwise.
        """
        metrics = op.metrics
        cap = self._concurrency_caps[op]
        # Several ramp-up steps may be due at once if many tasks finished
        # since the last call, hence a loop rather than a single check.
        while metrics.num_tasks_finished >= cap * self._cap_multiply_threshold:
            cap *= self._cap_multiplier
            logger.debug("Concurrency cap for %s increased to %s", op, cap)
        self._concurrency_caps[op] = cap
        return metrics.num_tasks_running < cap
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
import os | ||
import threading | ||
from typing import TYPE_CHECKING, Optional | ||
from typing import TYPE_CHECKING, Any, Dict, Optional | ||
|
||
import ray | ||
from ray._private.ray_constants import env_integer | ||
|
@@ -219,6 +219,7 @@ def __init__( | |
self.enable_get_object_locations_for_metrics = ( | ||
enable_get_object_locations_for_metrics | ||
) | ||
self._plugin_configs: Dict[str, Any] = {} | ||
|
||
@staticmethod | ||
def get_current() -> "DataContext": | ||
|
@@ -283,6 +284,15 @@ def _set_current(context: "DataContext") -> None: | |
global _default_context | ||
_default_context = context | ||
|
||
def get_plugin_config(self, key: str, default: Any = None) -> Any:
    """Look up a plugin config value.

    Args:
        key: The config key to look up.
        default: Value returned when ``key`` has not been set.

    Returns:
        The value stored under ``key``, or ``default`` if there is none.
    """
    configs = self._plugin_configs
    if key in configs:
        return configs[key]
    return default
|
||
def set_plugin_config(self, key: str, value: Any) -> None:
    """Store a plugin config value, replacing any existing one.

    Args:
        key: The config key to set.
        value: The value to store under ``key``.
    """
    configs = self._plugin_configs
    configs[key] = value
|
||
def remove_plugin_config(self, key: str) -> None:
    """Delete a plugin config value; do nothing if ``key`` is not set.

    Args:
        key: The config key to remove.
    """
    configs = self._plugin_configs
    if key in configs:
        del configs[key]
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's declare these methods with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Discussed offline, we'll make this API reusable for other components. |
||
|
||
|
||
# Backwards compatibility alias. | ||
DatasetContext = DataContext |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we rename it to `_backpressure_plugin_configs`? In the future, we may introduce other plugin components.