From eb4b22985869feac211ad193c7290ebe2b77a677 Mon Sep 17 00:00:00 2001 From: Peter Zhu Date: Mon, 17 Nov 2025 21:28:51 -0500 Subject: [PATCH 1/2] Add VM barrier in rb_gc_impl_before_fork We need the VM barrier in rb_gc_impl_before_fork to stop the other Ractors because otherwise they could be allocating objects in the fast path which could be calling mmtk_add_obj_free_candidate. Since mmtk_add_obj_free_candidate acquires a lock on obj_free_candidates in weak_proc.rs, a Ractor that is holding this lock at the time of the fork does not exist in the child process, so the lock is never released there. For example, the following script demonstrates the issue: puts "Hello #{Process.pid}" 100.times do |i| puts "i = #{i}" Ractor.new(i) do |j| puts "Ractor #{j} hello" 1000.times do |i| s = "#{j}-#{i}" end Ractor.receive puts "Ractor #{j} goodbye" end pid = fork { } puts "Child pid is #{pid}" _, status = Process.waitpid2 pid puts status.success? end puts "Goodbye" In the child process, we can see that it is stuck trying to acquire the lock on obj_free_candidates: #5 0x00007192bfb53f10 in mmtk_ruby::weak_proc::WeakProcessor::get_all_obj_free_candidates (self=0x7192c0657498 ) at src/weak_proc.rs:52 #6 0x00007192bfa634c3 in mmtk_ruby::api::mmtk_get_all_obj_free_candidates () at src/api.rs:295 #7 0x00007192bfa61d50 in rb_gc_impl_shutdown_call_finalizer (objspace_ptr=0x578c17abfc50) at gc/mmtk/mmtk.c:1032 #8 0x0000578c1601e48e in rb_ec_finalize (ec=0x578c17ac06d0) at eval.c:166 #9 rb_ec_cleanup (ec=<optimized out>, ex=<optimized out>) at eval.c:257 #10 0x0000578c1601ebf6 in ruby_cleanup (ex=<optimized out>) at eval.c:180 #11 ruby_stop (ex=<optimized out>) at eval.c:292 #12 0x0000578c16127124 in rb_f_fork (obj=<optimized out>) at process.c:4291 #13 rb_f_fork (obj=<optimized out>) at process.c:4281 --- gc/mmtk/mmtk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/gc/mmtk/mmtk.c b/gc/mmtk/mmtk.c index 0d78d61..4288334 100644 --- a/gc/mmtk/mmtk.c +++ b/gc/mmtk/mmtk.c @@ -1050,6 +1050,7 @@ rb_gc_impl_before_fork(void *objspace_ptr) struct objspace *objspace = objspace_ptr; objspace->fork_hook_vm_lock_lev = RB_GC_VM_LOCK(); + 
rb_gc_vm_barrier(); mmtk_before_fork(); } From 1a629504a708a8b3b4a3a71b23c962e071c4c252 Mon Sep 17 00:00:00 2001 From: Peter Zhu Date: Mon, 17 Nov 2025 21:39:02 -0500 Subject: [PATCH 2/2] Ensure not blocking for GC in rb_gc_impl_before_fork rb_gc_impl_before_fork locks the VM and barriers all the Ractors before calling mmtk_before_fork. However, since rb_mmtk_block_for_gc is a barrier point, one or more Ractors could be paused there. mmtk_before_fork is not compatible with that because it assumes that the MMTk workers are idle, but in that situation the workers are not idle because they are busy working on a GC. This commit essentially implements a trylock. It will optimistically lock but will release the lock and retry if it detects that any other Ractors are waiting in rb_mmtk_block_for_gc. For example, the following script demonstrates the issue: puts "Hello #{Process.pid}" 100.times do |i| puts "i = #{i}" Ractor.new(i) do |j| puts "Ractor #{j} hello" 1000.times do |i| s = "#{j}-#{i}" end Ractor.receive puts "Ractor #{j} goodbye" end pid = fork { } puts "Child pid is #{pid}" _, status = Process.waitpid2 pid puts status.success? end puts "Goodbye" We can see the MMTk worker thread is waiting to start the GC: #4 0x00007ffff66538b1 in rb_mmtk_stop_the_world () at gc/mmtk/mmtk.c:101 #5 0x00007ffff6d04caf in mmtk_ruby::collection::{impl#0}::stop_all_mutators>> (_tls=..., mutator_visitor=...) at src/collection.rs:23 However, the mutator thread is stuck in mmtk_before_fork trying to stop that worker thread: #4 0x00007ffff6c0b621 in std::sys::thread::unix::Thread::join () at library/std/src/sys/thread/unix.rs:134 #5 0x00007ffff6658b6e in std::thread::JoinInner<()>::join<()> (self=...) #6 0x00007ffff6658d4c in std::thread::JoinHandle<()>::join<()> (self=...) 
#7 0x00007ffff665795e in mmtk_ruby::binding::RubyBinding::join_all_gc_threads (self=0x7ffff72462d0 ) at src/binding.rs:115 #8 0x00007ffff66561a8 in mmtk_ruby::api::mmtk_before_fork () at src/api.rs:309 #9 0x00007ffff66556ff in rb_gc_impl_before_fork (objspace_ptr=0x555555d17980) at gc/mmtk/mmtk.c:1054 #10 0x00005555556bbc3e in rb_gc_before_fork () at gc.c:5429 --- gc/mmtk/mmtk.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/gc/mmtk/mmtk.c b/gc/mmtk/mmtk.c index 4288334..e1678dc 100644 --- a/gc/mmtk/mmtk.c +++ b/gc/mmtk/mmtk.c @@ -32,6 +32,7 @@ struct objspace { unsigned long live_ractor_cache_count; pthread_mutex_t mutex; + rb_atomic_t mutator_blocking_count; bool world_stopped; pthread_cond_t cond_world_stopped; pthread_cond_t cond_world_started; @@ -131,7 +132,9 @@ rb_mmtk_block_for_gc(MMTk_VMMutatorThread mutator) struct objspace *objspace = rb_gc_get_objspace(); size_t starting_gc_count = objspace->gc_count; + RUBY_ATOMIC_INC(objspace->mutator_blocking_count); int lock_lev = RB_GC_VM_LOCK(); + RUBY_ATOMIC_DEC(objspace->mutator_blocking_count); int err; if ((err = pthread_mutex_lock(&objspace->mutex)) != 0) { rb_bug("ERROR: cannot lock objspace->mutex: %s", strerror(err)); @@ -1049,9 +1052,26 @@ rb_gc_impl_before_fork(void *objspace_ptr) { struct objspace *objspace = objspace_ptr; + retry: objspace->fork_hook_vm_lock_lev = RB_GC_VM_LOCK(); rb_gc_vm_barrier(); + /* At this point, we know that all the Ractors are paused because of the + * rb_gc_vm_barrier above. Since rb_mmtk_block_for_gc is a barrier point, + * one or more Ractors could be paused there. However, mmtk_before_fork is + * not compatible with that because it assumes that the MMTk workers are idle, + * but the workers are not idle because they are busy working on a GC. + * + * This essentially implements a trylock. It will optimistically lock but will + * release the lock if it detects that any other Ractors are waiting in + * rb_mmtk_block_for_gc. 
+ */ + rb_atomic_t mutator_blocking_count = RUBY_ATOMIC_LOAD(objspace->mutator_blocking_count); + if (mutator_blocking_count != 0) { + RB_GC_VM_UNLOCK(objspace->fork_hook_vm_lock_lev); + goto retry; + } + mmtk_before_fork(); }