-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Core] Fixing segfaults in Async Streaming Generator (#43775)
This change ensures that no side effects outlive the invocation of the execute_task method. Previously, after tasks were scheduled onto the Core Worker's ThreadPoolExecutor, they could continue executing even after the request had been cancelled (cancelling the future does not cancel an already-running task), leading to SIGSEGV when a task still running in the executor tried to access data structures that had already been cleaned up after this method returned. With this change, upon encountering any failure we set an interrupt_signal_event that interrupts tasks that are scheduled but not yet executed, preventing them from modifying externally passed-in data structures. Signed-off-by: Alexey Kudinkin <ak@anyscale.com>
- Loading branch information
1 parent
d180d5c
commit cfebe14
Showing
5 changed files
with
221 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
137 changes: 137 additions & 0 deletions
137
python/ray/tests/test_streaming_generator_regression.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
import sys | ||
import time | ||
from concurrent.futures import ThreadPoolExecutor | ||
|
||
import pytest | ||
|
||
import ray | ||
from ray.actor import ActorHandle | ||
from ray.exceptions import RayTaskError, TaskCancelledError | ||
from ray.util.state import list_workers | ||
|
||
|
||
@ray.remote(num_cpus=1)
class EndpointActor:
    """Actor producing an async stream of integer tokens.

    Optionally replaces the core worker's event-loop executor with one that
    delays every submitted callable, widening the timing window in which a
    cancelled streaming-generator task could touch already-freed state.
    """

    def __init__(self, *, injected_executor_delay_s: float, tokens_per_request: int):
        self._tokens_per_request = tokens_per_request
        # Simulate the conditions leading to use-after-free by injecting
        # delays into the worker's thread-pool executor.
        self._inject_delay_in_core_worker_executor(
            target_delay_s=injected_executor_delay_s,
            max_workers=1,
        )

    async def aio_stream(self):
        """Yield `tokens_per_request` consecutive integers starting from 0."""
        token = 0
        while token < self._tokens_per_request:
            yield token
            token += 1

    @classmethod
    def _inject_delay_in_core_worker_executor(
        cls, target_delay_s: float, max_workers: int
    ):
        """Swap in an executor that sleeps `target_delay_s` seconds before
        running each submitted callable. No-op when the delay is not positive.
        """
        if target_delay_s <= 0:
            return

        class DelayedThreadPoolExecutor(ThreadPoolExecutor):
            def submit(self, fn, /, *args, **kwargs):
                # Wrap the callable so that its *execution* (not its
                # submission) is delayed.
                def _delayed_call():
                    print(
                        f">>> [DelayedThreadPoolExecutor] Starting executing "
                        f"function with delay {target_delay_s}s"
                    )

                    time.sleep(target_delay_s)
                    fn(*args, **kwargs)

                return super().submit(_delayed_call)

        delayed_executor = DelayedThreadPoolExecutor(max_workers=max_workers)
        ray._private.worker.global_worker.core_worker.reset_event_loop_executor(
            delayed_executor
        )
|
||
|
||
@ray.remote(num_cpus=1)
class CallerActor:
    """Actor consuming the downstream endpoint's stream and cancelling it
    mid-flight to provoke the regression under test."""

    def __init__(
        self,
        downstream: ActorHandle,
    ):
        self._h = downstream

    async def run(self):
        """Consume the downstream stream, cancelling it on the second token."""
        print(">>> [Caller] Starting consuming stream")

        stream_gen = self._h.aio_stream.options(num_returns="streaming").remote()
        async for object_ref in stream_gen:
            token = await object_ref
            if token != 1:
                print(f">>> [Caller] Received {token}")
                continue

            print(">>> [Caller] Cancelling generator")
            ray.cancel(stream_gen, recursive=False)

            # NOTE: This delay is crucial to let already scheduled task to report
            #       generated item (report_streaming_generator_output) before we
            #       will tear down this stream
            delay_after_cancellation_s = 2

            print(f">>> [Caller] **Sleeping** {delay_after_cancellation_s}s")
            time.sleep(delay_after_cancellation_s)

        print(">>> [Caller] Completed consuming stream")
|
||
|
||
@pytest.mark.parametrize("injected_executor_delay_s", [0, 2])
@pytest.mark.parametrize(
    "ray_start_cluster",
    [
        {
            "num_nodes": 2,
            "num_cpus": 1,
        }
    ],
    indirect=True,
)
def test_segfault_report_streaming_generator_output(
    ray_start_cluster, injected_executor_delay_s: float
):
    """
    Smoke test emulating the conditions under which Ray's async streaming
    generator could crash a worker with SIGSEGV.
    For a detailed summary of these conditions, see
    https://github.com/ray-project/ray/issues/43771#issuecomment-1982301654
    """
    # Endpoint runs with an (optionally) delayed executor; caller will
    # cancel its stream mid-consumption.
    endpoint = EndpointActor.remote(
        injected_executor_delay_s=injected_executor_delay_s,
        tokens_per_request=100,
    )
    caller = CallerActor.remote(endpoint)

    worker_state_before = [(w.worker_id, w.exit_type) for w in list_workers()]
    print(">>> Workers state before: ", worker_state_before)

    # The caller cancels the stream, so the task is expected to fail with
    # a TaskCancelledError wrapped in RayTaskError.
    with pytest.raises(RayTaskError) as exc_info:
        ray.get(caller.run.remote())

    assert isinstance(exc_info.value.cause, TaskCancelledError)

    worker_state_after = [(w.worker_id, w.exit_type) for w in list_workers()]
    print(">>> Workers state after: ", worker_state_after)

    worker_ids, worker_exit_types = zip(*worker_state_after)
    # Make sure no workers crashed
    assert (
        "SYSTEM_ERROR" not in worker_exit_types
    ), f"Unexpected crashed worker(s) in {worker_ids}"
|
||
|
||
if __name__ == "__main__":
    import os

    # Under PARALLEL_CI, run boxed tests in parallel; otherwise run serially.
    pytest_args = (
        ["-n", "auto", "--boxed", "-vs", __file__]
        if os.environ.get("PARALLEL_CI")
        else ["-sv", __file__]
    )
    sys.exit(pytest.main(pytest_args))