diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_integration.py b/Lib/test/test_profiling/test_sampling_profiler/test_integration.py index 08a96d7eb459f0..866a056eda9e03 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_integration.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_integration.py @@ -863,3 +863,98 @@ def test_async_aware_running_sees_only_cpu_task(self): self.assertGreater(cpu_percentage, 90.0, f"cpu_leaf should dominate samples in 'running' mode, " f"got {cpu_percentage:.1f}% ({cpu_leaf_samples}/{total})") + + +def _generate_deep_generators_script(chain_depth=20, recurse_depth=150): + """Generate a script with deep nested generators for stress testing.""" + lines = [ + 'import sys', + 'sys.setrecursionlimit(5000)', + '', + ] + # Generate chain of yield-from functions + for i in range(chain_depth - 1): + lines.extend([ + f'def deep_yield_chain_{i}(n):', + f' yield ("L{i}", n)', + f' yield from deep_yield_chain_{i + 1}(n)', + '', + ]) + # Last chain function calls recursive_diver + lines.extend([ + f'def deep_yield_chain_{chain_depth - 1}(n):', + f' yield ("L{chain_depth - 1}", n)', + f' yield from recursive_diver(n, {chain_depth})', + '', + 'def recursive_diver(n, depth):', + ' yield (f"DIVE_{depth}", n)', + f' if depth < {recurse_depth}:', + ' yield from recursive_diver(n, depth + 1)', + ' else:', + ' for i in range(5):', + ' yield (f"BOTTOM_{depth}", i)', + '', + 'def oscillating_generator(iterations=1000):', + ' for i in range(iterations):', + ' yield ("OSCILLATE", i)', + ' yield from deep_yield_chain_0(i)', + '', + 'def run_forever():', + ' while True:', + ' for _ in oscillating_generator(10):', + ' pass', + '', + '_test_sock.sendall(b"working")', + 'run_forever()', + ]) + return '\n'.join(lines) + + +@requires_remote_subprocess_debugging() +class TestDeepGeneratorFrameCache(unittest.TestCase): + """Test frame cache consistency with deep oscillating generator stacks.""" + + def test_all_stacks_share_same_base_frame(self): + """Verify all sampled stacks reach the entry point function. + + When profiling deep generators that oscillate up and down the call + stack, every sample should include the entry point function + (run_forever) in its call chain. If the frame cache stores + incomplete stacks, some samples will be missing this base function, + causing broken flamegraphs. + """ + script = _generate_deep_generators_script() + with test_subprocess(script, wait_for_working=True) as subproc: + collector = CollapsedStackCollector(sample_interval_usec=1, skip_idle=False) + + with ( + io.StringIO() as captured_output, + mock.patch("sys.stdout", captured_output), + ): + profiling.sampling.sample.sample( + subproc.process.pid, + collector, + duration_sec=2, + ) + + samples_with_entry_point = 0 + samples_without_entry_point = 0 + total_samples = 0 + + for (call_tree, _thread_id), count in collector.stack_counter.items(): + total_samples += count + if call_tree: + has_entry_point = call_tree and call_tree[0][2] == "" + if has_entry_point: + samples_with_entry_point += count + else: + samples_without_entry_point += count + + self.assertGreater(total_samples, 100, + f"Expected at least 100 samples, got {total_samples}") + + self.assertEqual(samples_without_entry_point, 0, + f"Found {samples_without_entry_point}/{total_samples} samples " + f"missing the entry point function 'run_forever'. This indicates " + f"incomplete stacks are being returned, likely due to frame cache " + f"storing partial stack traces.") diff --git a/Misc/NEWS.d/next/Library/2025-12-17-03-03-12.gh-issue-138122.m3EF9E.rst b/Misc/NEWS.d/next/Library/2025-12-17-03-03-12.gh-issue-138122.m3EF9E.rst new file mode 100644 index 00000000000000..e33a761aa61825 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-17-03-03-12.gh-issue-138122.m3EF9E.rst @@ -0,0 +1,4 @@ +Fix incomplete stack traces in the Tachyon profiler's frame cache when +profiling code with deeply nested generators. The frame cache now validates +that stack traces reach the base frame before caching, preventing broken +flamegraphs. Patch by Pablo Galindo. diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 2f3efedd1e0ed5..663479f235af09 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -429,7 +429,8 @@ extern int process_frame_chain( int *stopped_at_cached_frame, uintptr_t *frame_addrs, Py_ssize_t *num_addrs, - Py_ssize_t max_addrs + Py_ssize_t max_addrs, + uintptr_t *out_last_frame_addr ); /* Frame cache functions */ @@ -447,18 +448,22 @@ extern int frame_cache_lookup_and_extend( Py_ssize_t *num_addrs, Py_ssize_t max_addrs); // Returns: 1 = stored, 0 = not stored (graceful), -1 = error +// Only stores complete stacks that reach base_frame_addr extern int frame_cache_store( RemoteUnwinderObject *unwinder, uint64_t thread_id, PyObject *frame_list, const uintptr_t *addrs, - Py_ssize_t num_addrs); + Py_ssize_t num_addrs, + uintptr_t base_frame_addr, + uintptr_t last_frame_visited); extern int collect_frames_with_cache( RemoteUnwinderObject *unwinder, uintptr_t frame_addr, StackChunkList *chunks, PyObject *frame_info, + uintptr_t base_frame_addr, uintptr_t gc_frame, uintptr_t last_profiled_frame, uint64_t thread_id); diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c index 4598b9dc353278..ab7891445e07a3 100644 --- a/Modules/_remote_debugging/frame_cache.c +++ b/Modules/_remote_debugging/frame_cache.c @@ -194,6 +194,7 @@ frame_cache_lookup_and_extend( } // Store frame list with addresses in cache +// Only stores complete stacks that reach base_frame_addr (validation done internally) // Returns: 1 = stored successfully, 0 = not stored (graceful degradation), -1 = error int frame_cache_store( @@ -201,12 +202,25 @@ frame_cache_store( uint64_t thread_id, PyObject *frame_list, const uintptr_t *addrs, - Py_ssize_t num_addrs) + Py_ssize_t num_addrs, + uintptr_t base_frame_addr, + uintptr_t last_frame_visited) { if (!unwinder->frame_cache || thread_id == 0) { return 0; } + // Validate we have a complete stack before caching. + // Only cache if last_frame_visited matches base_frame_addr (the sentinel + // at the bottom of the stack). Note: we use last_frame_visited rather than + // addrs[num_addrs-1] because the base frame is visited but not added to the + // addrs array (it returns frame==NULL from is_frame_valid due to + // owner==FRAME_OWNED_BY_INTERPRETER). + if (base_frame_addr != 0 && last_frame_visited != base_frame_addr) { + // Incomplete stack - don't cache (graceful degradation) + return 0; + } + // Clamp to max frames if (num_addrs > FRAME_CACHE_MAX_FRAMES) { num_addrs = FRAME_CACHE_MAX_FRAMES; diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index abde60c45766a5..47e34e9f945cbd 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -265,7 +265,8 @@ process_frame_chain( int *stopped_at_cached_frame, uintptr_t *frame_addrs, // optional: C array to receive frame addresses Py_ssize_t *num_addrs, // in/out: current count / updated count - Py_ssize_t max_addrs) // max capacity of frame_addrs array + Py_ssize_t max_addrs, // max capacity of frame_addrs array + uintptr_t *out_last_frame_addr) // optional: receives last frame address visited { uintptr_t frame_addr = initial_frame_addr; uintptr_t prev_frame_addr = 0; @@ -273,10 +274,13 @@ process_frame_chain( const size_t MAX_FRAMES = 1024 + 512; size_t frame_count = 0; - // Initialize output flag + // Initialize output parameters if (stopped_at_cached_frame) { *stopped_at_cached_frame = 0; } + if (out_last_frame_addr) { + *out_last_frame_addr = 0; + } // Quick check: if current_frame == last_profiled_frame, entire stack is unchanged if (last_profiled_frame != 0 && initial_frame_addr == last_profiled_frame) { @@ -390,6 +394,11 @@ process_frame_chain( return -1; } + // Set output parameter for caller (needed for cache validation) + if (out_last_frame_addr) { + *out_last_frame_addr = last_frame_addr; + } + return 0; } @@ -537,6 +546,7 @@ collect_frames_with_cache( uintptr_t frame_addr, StackChunkList *chunks, PyObject *frame_info, + uintptr_t base_frame_addr, uintptr_t gc_frame, uintptr_t last_profiled_frame, uint64_t thread_id) @@ -551,11 +561,13 @@ collect_frames_with_cache( uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; Py_ssize_t num_addrs = 0; Py_ssize_t frames_before = PyList_GET_SIZE(frame_info); + uintptr_t last_frame_visited = 0; int stopped_at_cached = 0; - if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, 0, gc_frame, + if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, base_frame_addr, gc_frame, last_profiled_frame, &stopped_at_cached, - addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) { + addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES, + &last_frame_visited) < 0) { return -1; } @@ -575,23 +587,28 @@ collect_frames_with_cache( // Cache miss - continue walking from last_profiled_frame to get the rest STATS_INC(unwinder, frame_cache_misses); Py_ssize_t frames_before_walk = PyList_GET_SIZE(frame_info); - if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, 0, gc_frame, - 0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) { + if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, base_frame_addr, gc_frame, + 0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES, + &last_frame_visited) < 0) { return -1; } STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before_walk); } else { - // Partial cache hit + // Partial cache hit - cache was validated when stored, so we trust it STATS_INC(unwinder, frame_cache_partial_hits); STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(frame_info) - frames_before_cache); } - } else if (last_profiled_frame == 0) { - // No cache involvement (no last_profiled_frame or cache disabled) - STATS_INC(unwinder, frame_cache_misses); + } else { + if (last_profiled_frame == 0) { + // No cache involvement (no last_profiled_frame or cache disabled) + STATS_INC(unwinder, frame_cache_misses); + } } - // Store in cache (frame_cache_store handles truncation if num_addrs > FRAME_CACHE_MAX_FRAMES) - if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs) < 0) { + // Store in cache - frame_cache_store validates internally that we have a + // complete stack (reached base_frame_addr) before actually storing + if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs, + base_frame_addr, last_frame_visited) < 0) { return -1; } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 81c13ea48e3c49..6db774ecfc269e 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -430,7 +430,7 @@ unwind_stack_for_thread( uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.last_profiled_frame); if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info, - gc_frame, last_profiled_frame, tid) < 0) { + base_frame_addr, gc_frame, last_profiled_frame, tid) < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames"); goto error; } @@ -444,7 +444,7 @@ unwind_stack_for_thread( } else { // No caching - process entire frame chain with base_frame validation if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, - base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0) < 0) { + base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0, NULL) < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain"); goto error; }