From 2cd7686564014dde232851e88c3f9370cf5d4e7a Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Wed, 17 Dec 2025 02:51:17 +0000 Subject: [PATCH] gh-138122: Validate base frame before caching in remote debugging frame cache The frame cache in the remote debugging module was storing frame chains without validating that they reached the base frame. This could happen when a frame chain was interrupted or when the process state changed during reading, resulting in incomplete stacks being cached. Subsequent samples that hit the cache would then produce flamegraphs that didn't reach the bottom of the call stack. The fix passes base_frame_addr through to process_frame_chain() which already has validation logic to ensure the frame walk terminates at the expected sentinel frame. By enabling this validation in the caching code path and tracking whether we've confirmed reaching the base frame, we now only store complete frame chains in the cache. When extending from cached data, we trust that the cached frames were already validated at storage time, maintaining the invariant that cached stacks are always complete. An integration test using deeply nested generators that oscillate the stack depth is added to verify that all sampled stacks contain the entry point function. This catches regressions where incomplete stacks might be cached and returned. 
--- .../test_integration.py | 95 +++++++++++++++++++ ...-12-17-03-03-12.gh-issue-138122.m3EF9E.rst | 4 + Modules/_remote_debugging/_remote_debugging.h | 9 +- Modules/_remote_debugging/frame_cache.c | 16 +++- Modules/_remote_debugging/frames.c | 41 +++++--- Modules/_remote_debugging/threads.c | 4 +- 6 files changed, 152 insertions(+), 17 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-12-17-03-03-12.gh-issue-138122.m3EF9E.rst diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_integration.py b/Lib/test/test_profiling/test_sampling_profiler/test_integration.py index 08a96d7eb459f0..866a056eda9e03 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_integration.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_integration.py @@ -863,3 +863,98 @@ def test_async_aware_running_sees_only_cpu_task(self): self.assertGreater(cpu_percentage, 90.0, f"cpu_leaf should dominate samples in 'running' mode, " f"got {cpu_percentage:.1f}% ({cpu_leaf_samples}/{total})") + + +def _generate_deep_generators_script(chain_depth=20, recurse_depth=150): + """Generate a script with deep nested generators for stress testing.""" + lines = [ + 'import sys', + 'sys.setrecursionlimit(5000)', + '', + ] + # Generate chain of yield-from functions + for i in range(chain_depth - 1): + lines.extend([ + f'def deep_yield_chain_{i}(n):', + f' yield ("L{i}", n)', + f' yield from deep_yield_chain_{i + 1}(n)', + '', + ]) + # Last chain function calls recursive_diver + lines.extend([ + f'def deep_yield_chain_{chain_depth - 1}(n):', + f' yield ("L{chain_depth - 1}", n)', + f' yield from recursive_diver(n, {chain_depth})', + '', + 'def recursive_diver(n, depth):', + ' yield (f"DIVE_{depth}", n)', + f' if depth < {recurse_depth}:', + ' yield from recursive_diver(n, depth + 1)', + ' else:', + ' for i in range(5):', + ' yield (f"BOTTOM_{depth}", i)', + '', + 'def oscillating_generator(iterations=1000):', + ' for i in range(iterations):', + ' yield ("OSCILLATE", i)', 
+ '        yield from deep_yield_chain_0(i)', + '', + 'def run_forever():', + '    while True:', + '        for _ in oscillating_generator(10):', + '            pass', + '', + '_test_sock.sendall(b"working")', + 'run_forever()', + ]) + return '\n'.join(lines) + + +@requires_remote_subprocess_debugging() +class TestDeepGeneratorFrameCache(unittest.TestCase): + """Test frame cache consistency with deep oscillating generator stacks.""" + + def test_all_stacks_share_same_base_frame(self): + """Verify all sampled stacks reach the entry point function. + + When profiling deep generators that oscillate up and down the call + stack, every sample should include the entry point function + (run_forever) in its call chain. If the frame cache stores + incomplete stacks, some samples will be missing this base function, + causing broken flamegraphs. + """ + script = _generate_deep_generators_script() + with test_subprocess(script, wait_for_working=True) as subproc: + collector = CollapsedStackCollector(sample_interval_usec=1, skip_idle=False) + + with ( + io.StringIO() as captured_output, + mock.patch("sys.stdout", captured_output), + ): + profiling.sampling.sample.sample( + subproc.process.pid, + collector, + duration_sec=2, + ) + + samples_with_entry_point = 0 + samples_without_entry_point = 0 + total_samples = 0 + + for (call_tree, _thread_id), count in collector.stack_counter.items(): + total_samples += count + if call_tree: + has_entry_point = call_tree and call_tree[0][2] == "run_forever" + if has_entry_point: + samples_with_entry_point += count + else: + samples_without_entry_point += count + + self.assertGreater(total_samples, 100, + f"Expected at least 100 samples, got {total_samples}") + + self.assertEqual(samples_without_entry_point, 0, + f"Found {samples_without_entry_point}/{total_samples} samples " + f"missing the entry point function 'run_forever'. 
This indicates " + f"incomplete stacks are being returned, likely due to frame cache " + f"storing partial stack traces.") diff --git a/Misc/NEWS.d/next/Library/2025-12-17-03-03-12.gh-issue-138122.m3EF9E.rst b/Misc/NEWS.d/next/Library/2025-12-17-03-03-12.gh-issue-138122.m3EF9E.rst new file mode 100644 index 00000000000000..e33a761aa61825 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-17-03-03-12.gh-issue-138122.m3EF9E.rst @@ -0,0 +1,4 @@ +Fix incomplete stack traces in the Tachyon profiler's frame cache when +profiling code with deeply nested generators. The frame cache now validates +that stack traces reach the base frame before caching, preventing broken +flamegraphs. Patch by Pablo Galindo. diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 2f3efedd1e0ed5..663479f235af09 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -429,7 +429,8 @@ extern int process_frame_chain( int *stopped_at_cached_frame, uintptr_t *frame_addrs, Py_ssize_t *num_addrs, - Py_ssize_t max_addrs + Py_ssize_t max_addrs, + uintptr_t *out_last_frame_addr ); /* Frame cache functions */ @@ -447,18 +448,22 @@ extern int frame_cache_lookup_and_extend( Py_ssize_t *num_addrs, Py_ssize_t max_addrs); // Returns: 1 = stored, 0 = not stored (graceful), -1 = error +// Only stores complete stacks that reach base_frame_addr extern int frame_cache_store( RemoteUnwinderObject *unwinder, uint64_t thread_id, PyObject *frame_list, const uintptr_t *addrs, - Py_ssize_t num_addrs); + Py_ssize_t num_addrs, + uintptr_t base_frame_addr, + uintptr_t last_frame_visited); extern int collect_frames_with_cache( RemoteUnwinderObject *unwinder, uintptr_t frame_addr, StackChunkList *chunks, PyObject *frame_info, + uintptr_t base_frame_addr, uintptr_t gc_frame, uintptr_t last_profiled_frame, uint64_t thread_id); diff --git a/Modules/_remote_debugging/frame_cache.c 
b/Modules/_remote_debugging/frame_cache.c index 4598b9dc353278..ab7891445e07a3 100644 --- a/Modules/_remote_debugging/frame_cache.c +++ b/Modules/_remote_debugging/frame_cache.c @@ -194,6 +194,7 @@ frame_cache_lookup_and_extend( } // Store frame list with addresses in cache +// Only stores complete stacks that reach base_frame_addr (validation done internally) // Returns: 1 = stored successfully, 0 = not stored (graceful degradation), -1 = error int frame_cache_store( @@ -201,12 +202,25 @@ frame_cache_store( uint64_t thread_id, PyObject *frame_list, const uintptr_t *addrs, - Py_ssize_t num_addrs) + Py_ssize_t num_addrs, + uintptr_t base_frame_addr, + uintptr_t last_frame_visited) { if (!unwinder->frame_cache || thread_id == 0) { return 0; } + // Validate we have a complete stack before caching. + // Only cache if last_frame_visited matches base_frame_addr (the sentinel + // at the bottom of the stack). Note: we use last_frame_visited rather than + // addrs[num_addrs-1] because the base frame is visited but not added to the + // addrs array (it returns frame==NULL from is_frame_valid due to + // owner==FRAME_OWNED_BY_INTERPRETER). 
+ if (base_frame_addr != 0 && last_frame_visited != base_frame_addr) { + // Incomplete stack - don't cache (graceful degradation) + return 0; + } + // Clamp to max frames if (num_addrs > FRAME_CACHE_MAX_FRAMES) { num_addrs = FRAME_CACHE_MAX_FRAMES; diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index abde60c45766a5..47e34e9f945cbd 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -265,7 +265,8 @@ process_frame_chain( int *stopped_at_cached_frame, uintptr_t *frame_addrs, // optional: C array to receive frame addresses Py_ssize_t *num_addrs, // in/out: current count / updated count - Py_ssize_t max_addrs) // max capacity of frame_addrs array + Py_ssize_t max_addrs, // max capacity of frame_addrs array + uintptr_t *out_last_frame_addr) // optional: receives last frame address visited { uintptr_t frame_addr = initial_frame_addr; uintptr_t prev_frame_addr = 0; @@ -273,10 +274,13 @@ process_frame_chain( const size_t MAX_FRAMES = 1024 + 512; size_t frame_count = 0; - // Initialize output flag + // Initialize output parameters if (stopped_at_cached_frame) { *stopped_at_cached_frame = 0; } + if (out_last_frame_addr) { + *out_last_frame_addr = 0; + } // Quick check: if current_frame == last_profiled_frame, entire stack is unchanged if (last_profiled_frame != 0 && initial_frame_addr == last_profiled_frame) { @@ -390,6 +394,11 @@ process_frame_chain( return -1; } + // Set output parameter for caller (needed for cache validation) + if (out_last_frame_addr) { + *out_last_frame_addr = last_frame_addr; + } + return 0; } @@ -537,6 +546,7 @@ collect_frames_with_cache( uintptr_t frame_addr, StackChunkList *chunks, PyObject *frame_info, + uintptr_t base_frame_addr, uintptr_t gc_frame, uintptr_t last_profiled_frame, uint64_t thread_id) @@ -551,11 +561,13 @@ collect_frames_with_cache( uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; Py_ssize_t num_addrs = 0; Py_ssize_t frames_before = PyList_GET_SIZE(frame_info); + 
uintptr_t last_frame_visited = 0; int stopped_at_cached = 0; - if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, 0, gc_frame, + if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, base_frame_addr, gc_frame, last_profiled_frame, &stopped_at_cached, - addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) { + addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES, + &last_frame_visited) < 0) { return -1; } @@ -575,23 +587,28 @@ collect_frames_with_cache( // Cache miss - continue walking from last_profiled_frame to get the rest STATS_INC(unwinder, frame_cache_misses); Py_ssize_t frames_before_walk = PyList_GET_SIZE(frame_info); - if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, 0, gc_frame, - 0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) { + if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, base_frame_addr, gc_frame, + 0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES, + &last_frame_visited) < 0) { return -1; } STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before_walk); } else { - // Partial cache hit + // Partial cache hit - cache was validated when stored, so we trust it STATS_INC(unwinder, frame_cache_partial_hits); STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(frame_info) - frames_before_cache); } - } else if (last_profiled_frame == 0) { - // No cache involvement (no last_profiled_frame or cache disabled) - STATS_INC(unwinder, frame_cache_misses); + } else { + if (last_profiled_frame == 0) { + // No cache involvement (no last_profiled_frame or cache disabled) + STATS_INC(unwinder, frame_cache_misses); + } } - // Store in cache (frame_cache_store handles truncation if num_addrs > FRAME_CACHE_MAX_FRAMES) - if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs) < 0) { + // Store in cache - frame_cache_store validates internally that we have a + // complete stack (reached base_frame_addr) before actually storing + if 
(frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs, + base_frame_addr, last_frame_visited) < 0) { return -1; } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 81c13ea48e3c49..6db774ecfc269e 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -430,7 +430,7 @@ unwind_stack_for_thread( uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.last_profiled_frame); if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info, - gc_frame, last_profiled_frame, tid) < 0) { + base_frame_addr, gc_frame, last_profiled_frame, tid) < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames"); goto error; } @@ -444,7 +444,7 @@ unwind_stack_for_thread( } else { // No caching - process entire frame chain with base_frame validation if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, - base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0) < 0) { + base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0, NULL) < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain"); goto error; }