Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -863,3 +863,98 @@ def test_async_aware_running_sees_only_cpu_task(self):
self.assertGreater(cpu_percentage, 90.0,
f"cpu_leaf should dominate samples in 'running' mode, "
f"got {cpu_percentage:.1f}% ({cpu_leaf_samples}/{total})")


def _generate_deep_generators_script(chain_depth=20, recurse_depth=150):
"""Generate a script with deep nested generators for stress testing."""
lines = [
'import sys',
'sys.setrecursionlimit(5000)',
'',
]
# Generate chain of yield-from functions
for i in range(chain_depth - 1):
lines.extend([
f'def deep_yield_chain_{i}(n):',
f' yield ("L{i}", n)',
f' yield from deep_yield_chain_{i + 1}(n)',
'',
])
# Last chain function calls recursive_diver
lines.extend([
f'def deep_yield_chain_{chain_depth - 1}(n):',
f' yield ("L{chain_depth - 1}", n)',
f' yield from recursive_diver(n, {chain_depth})',
'',
'def recursive_diver(n, depth):',
' yield (f"DIVE_{depth}", n)',
f' if depth < {recurse_depth}:',
' yield from recursive_diver(n, depth + 1)',
' else:',
' for i in range(5):',
' yield (f"BOTTOM_{depth}", i)',
'',
'def oscillating_generator(iterations=1000):',
' for i in range(iterations):',
' yield ("OSCILLATE", i)',
' yield from deep_yield_chain_0(i)',
'',
'def run_forever():',
' while True:',
' for _ in oscillating_generator(10):',
' pass',
'',
'_test_sock.sendall(b"working")',
'run_forever()',
])
return '\n'.join(lines)


@requires_remote_subprocess_debugging()
class TestDeepGeneratorFrameCache(unittest.TestCase):
"""Test frame cache consistency with deep oscillating generator stacks."""

def test_all_stacks_share_same_base_frame(self):
"""Verify all sampled stacks reach the entry point function.

When profiling deep generators that oscillate up and down the call
stack, every sample should include the entry point function
(run_forever) in its call chain. If the frame cache stores
incomplete stacks, some samples will be missing this base function,
causing broken flamegraphs.
"""
script = _generate_deep_generators_script()
with test_subprocess(script, wait_for_working=True) as subproc:
collector = CollapsedStackCollector(sample_interval_usec=1, skip_idle=False)

with (
io.StringIO() as captured_output,
mock.patch("sys.stdout", captured_output),
):
profiling.sampling.sample.sample(
subproc.process.pid,
collector,
duration_sec=2,
)

samples_with_entry_point = 0
samples_without_entry_point = 0
total_samples = 0

for (call_tree, _thread_id), count in collector.stack_counter.items():
total_samples += count
if call_tree:
has_entry_point = call_tree and call_tree[0][2] == "<module>"
if has_entry_point:
samples_with_entry_point += count
else:
samples_without_entry_point += count

self.assertGreater(total_samples, 100,
f"Expected at least 100 samples, got {total_samples}")

self.assertEqual(samples_without_entry_point, 0,
f"Found {samples_without_entry_point}/{total_samples} samples "
f"missing the entry point function 'run_forever'. This indicates "
f"incomplete stacks are being returned, likely due to frame cache "
f"storing partial stack traces.")
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix incomplete stack traces in the Tachyon profiler's frame cache when
profiling code with deeply nested generators. The frame cache now validates
that stack traces reach the base frame before caching, preventing broken
flamegraphs. Patch by Pablo Galindo.
9 changes: 7 additions & 2 deletions Modules/_remote_debugging/_remote_debugging.h
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,8 @@ extern int process_frame_chain(
int *stopped_at_cached_frame,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs
Py_ssize_t max_addrs,
uintptr_t *out_last_frame_addr
);

/* Frame cache functions */
Expand All @@ -447,18 +448,22 @@ extern int frame_cache_lookup_and_extend(
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs);
// Returns: 1 = stored, 0 = not stored (graceful), -1 = error
// Only stores complete stacks that reach base_frame_addr
extern int frame_cache_store(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs);
Py_ssize_t num_addrs,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited);

extern int collect_frames_with_cache(
RemoteUnwinderObject *unwinder,
uintptr_t frame_addr,
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t base_frame_addr,
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
uint64_t thread_id);
Expand Down
16 changes: 15 additions & 1 deletion Modules/_remote_debugging/frame_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -194,19 +194,33 @@ frame_cache_lookup_and_extend(
}

// Store frame list with addresses in cache
// Only stores complete stacks that reach base_frame_addr (validation done internally)
// Returns: 1 = stored successfully, 0 = not stored (graceful degradation), -1 = error
int
frame_cache_store(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs)
Py_ssize_t num_addrs,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited)
{
if (!unwinder->frame_cache || thread_id == 0) {
return 0;
}

// Validate we have a complete stack before caching.
// Only cache if last_frame_visited matches base_frame_addr (the sentinel
// at the bottom of the stack). Note: we use last_frame_visited rather than
// addrs[num_addrs-1] because the base frame is visited but not added to the
// addrs array (it returns frame==NULL from is_frame_valid due to
// owner==FRAME_OWNED_BY_INTERPRETER).
if (base_frame_addr != 0 && last_frame_visited != base_frame_addr) {
// Incomplete stack - don't cache (graceful degradation)
return 0;
}

// Clamp to max frames
if (num_addrs > FRAME_CACHE_MAX_FRAMES) {
num_addrs = FRAME_CACHE_MAX_FRAMES;
Expand Down
41 changes: 29 additions & 12 deletions Modules/_remote_debugging/frames.c
Original file line number Diff line number Diff line change
Expand Up @@ -265,18 +265,22 @@ process_frame_chain(
int *stopped_at_cached_frame,
uintptr_t *frame_addrs, // optional: C array to receive frame addresses
Py_ssize_t *num_addrs, // in/out: current count / updated count
Py_ssize_t max_addrs) // max capacity of frame_addrs array
Py_ssize_t max_addrs, // max capacity of frame_addrs array
uintptr_t *out_last_frame_addr) // optional: receives last frame address visited
{
uintptr_t frame_addr = initial_frame_addr;
uintptr_t prev_frame_addr = 0;
uintptr_t last_frame_addr = 0; // Track last frame visited for validation
const size_t MAX_FRAMES = 1024 + 512;
size_t frame_count = 0;

// Initialize output flag
// Initialize output parameters
if (stopped_at_cached_frame) {
*stopped_at_cached_frame = 0;
}
if (out_last_frame_addr) {
*out_last_frame_addr = 0;
}

// Quick check: if current_frame == last_profiled_frame, entire stack is unchanged
if (last_profiled_frame != 0 && initial_frame_addr == last_profiled_frame) {
Expand Down Expand Up @@ -390,6 +394,11 @@ process_frame_chain(
return -1;
}

// Set output parameter for caller (needed for cache validation)
if (out_last_frame_addr) {
*out_last_frame_addr = last_frame_addr;
}

return 0;
}

Expand Down Expand Up @@ -537,6 +546,7 @@ collect_frames_with_cache(
uintptr_t frame_addr,
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t base_frame_addr,
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
uint64_t thread_id)
Expand All @@ -551,11 +561,13 @@ collect_frames_with_cache(
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
Py_ssize_t num_addrs = 0;
Py_ssize_t frames_before = PyList_GET_SIZE(frame_info);
uintptr_t last_frame_visited = 0;

int stopped_at_cached = 0;
if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, 0, gc_frame,
if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, base_frame_addr, gc_frame,
last_profiled_frame, &stopped_at_cached,
addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES,
&last_frame_visited) < 0) {
return -1;
}

Expand All @@ -575,23 +587,28 @@ collect_frames_with_cache(
// Cache miss - continue walking from last_profiled_frame to get the rest
STATS_INC(unwinder, frame_cache_misses);
Py_ssize_t frames_before_walk = PyList_GET_SIZE(frame_info);
if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, 0, gc_frame,
0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, base_frame_addr, gc_frame,
0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES,
&last_frame_visited) < 0) {
return -1;
}
STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before_walk);
} else {
// Partial cache hit
// Partial cache hit - cache was validated when stored, so we trust it
STATS_INC(unwinder, frame_cache_partial_hits);
STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(frame_info) - frames_before_cache);
}
} else if (last_profiled_frame == 0) {
// No cache involvement (no last_profiled_frame or cache disabled)
STATS_INC(unwinder, frame_cache_misses);
} else {
if (last_profiled_frame == 0) {
// No cache involvement (no last_profiled_frame or cache disabled)
STATS_INC(unwinder, frame_cache_misses);
}
}

// Store in cache (frame_cache_store handles truncation if num_addrs > FRAME_CACHE_MAX_FRAMES)
if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs) < 0) {
// Store in cache - frame_cache_store validates internally that we have a
// complete stack (reached base_frame_addr) before actually storing
if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs,
base_frame_addr, last_frame_visited) < 0) {
return -1;
}

Expand Down
4 changes: 2 additions & 2 deletions Modules/_remote_debugging/threads.c
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ unwind_stack_for_thread(
uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts,
unwinder->debug_offsets.thread_state.last_profiled_frame);
if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info,
gc_frame, last_profiled_frame, tid) < 0) {
base_frame_addr, gc_frame, last_profiled_frame, tid) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames");
goto error;
}
Expand All @@ -444,7 +444,7 @@ unwind_stack_for_thread(
} else {
// No caching - process entire frame chain with base_frame validation
if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info,
base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0) < 0) {
base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0, NULL) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain");
goto error;
}
Expand Down
Loading