From 7dd836b4a18b84ae22f5c282f9301830d1fff7c2 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:47:18 +0000 Subject: [PATCH 01/17] Add binary I/O header for sampling profiler Defines the API and data structures for a high-performance binary format for profiling data. The format uses string/frame deduplication, varint encoding, and delta compression to achieve 10-50x size reduction compared to text formats. Optional zstd compression provides additional savings. The header includes inline varint encode/decode functions since these are called in tight loops during both writing and reading. Structures for both writer (BinaryWriter) and reader (BinaryReader) are defined here to allow the module.c bindings to allocate them. --- Modules/_remote_debugging/binary_io.h | 630 ++++++++++++++++++++++++++ 1 file changed, 630 insertions(+) create mode 100644 Modules/_remote_debugging/binary_io.h diff --git a/Modules/_remote_debugging/binary_io.h b/Modules/_remote_debugging/binary_io.h new file mode 100644 index 00000000000000..3bc40b5f54fd56 --- /dev/null +++ b/Modules/_remote_debugging/binary_io.h @@ -0,0 +1,630 @@ +/****************************************************************************** + * Python Remote Debugging Module - Binary I/O Header + * + * This header provides declarations for high-performance binary file I/O + * for profiling data with optional zstd streaming compression. 
+ ******************************************************************************/ + +#ifndef Py_BINARY_IO_H +#define Py_BINARY_IO_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "Python.h" +#include "pycore_hashtable.h" +#include +#include + +/* ============================================================================ + * BINARY FORMAT CONSTANTS + * ============================================================================ */ + +#define BINARY_FORMAT_MAGIC 0x54414348 /* "TACH" (Tachyon) */ +#define BINARY_FORMAT_VERSION 2 + +/* Buffer sizes: 512KB balances syscall amortization against memory use, + * and aligns well with filesystem block sizes and zstd dictionary windows */ +#define WRITE_BUFFER_SIZE (512 * 1024) +#define COMPRESSED_BUFFER_SIZE (512 * 1024) + +/* Compression types */ +#define COMPRESSION_NONE 0 +#define COMPRESSION_ZSTD 1 + +/* Stack encoding types for delta compression */ +#define STACK_REPEAT 0x00 /* RLE: identical to previous, with count */ +#define STACK_FULL 0x01 /* Full stack (first sample or no match) */ +#define STACK_SUFFIX 0x02 /* Shares N frames from bottom */ +#define STACK_POP_PUSH 0x03 /* Remove M frames, add N frames */ + +/* Maximum stack depth we'll buffer for delta encoding */ +#define MAX_STACK_DEPTH 256 + +/* Initial capacity for RLE pending buffer */ +#define INITIAL_RLE_CAPACITY 64 + +/* Initial capacities for dynamic arrays - sized to reduce reallocations */ +#define INITIAL_STRING_CAPACITY 4096 +#define INITIAL_FRAME_CAPACITY 4096 +#define INITIAL_THREAD_CAPACITY 256 + +/* ============================================================================ + * STATISTICS STRUCTURES + * ============================================================================ */ + +/* Writer statistics - tracks encoding efficiency */ +typedef struct { + uint64_t repeat_records; /* Number of RLE repeat records written */ + uint64_t repeat_samples; /* Total samples encoded via RLE */ + uint64_t full_records; /* Number of full stack 
records */ + uint64_t suffix_records; /* Number of suffix match records */ + uint64_t pop_push_records; /* Number of pop-push records */ + uint64_t total_frames_written;/* Total frame indices written */ + uint64_t frames_saved; /* Frames avoided due to delta encoding */ + uint64_t bytes_written; /* Total bytes written (before compression) */ +} BinaryWriterStats; + +/* Reader statistics - tracks reconstruction performance */ +typedef struct { + uint64_t repeat_records; /* RLE records decoded */ + uint64_t repeat_samples; /* Samples decoded from RLE */ + uint64_t full_records; /* Full stack records decoded */ + uint64_t suffix_records; /* Suffix match records decoded */ + uint64_t pop_push_records; /* Pop-push records decoded */ + uint64_t total_samples; /* Total samples reconstructed */ + uint64_t stack_reconstructions; /* Number of stack array reconstructions */ +} BinaryReaderStats; + +/* ============================================================================ + * PLATFORM ABSTRACTION + * ============================================================================ */ + +#if defined(__linux__) || defined(__APPLE__) + #include + #include + #include + #include + #define USE_MMAP 1 +#else + #define USE_MMAP 0 +#endif + +/* 64-bit file position support for files larger than 2GB. 
+ * On POSIX: use ftello/fseeko with off_t (already 64-bit on 64-bit systems) + * On Windows: use _ftelli64/_fseeki64 with __int64 */ +#if defined(_WIN32) || defined(_WIN64) + #include + typedef __int64 file_offset_t; + #define FTELL64(fp) _ftelli64(fp) + #define FSEEK64(fp, offset, whence) _fseeki64(fp, offset, whence) +#else + /* POSIX - off_t is 64-bit on 64-bit systems, ftello/fseeko handle large files */ + typedef off_t file_offset_t; + #define FTELL64(fp) ftello(fp) + #define FSEEK64(fp, offset, whence) fseeko(fp, offset, whence) +#endif + +/* Forward declare zstd types if available */ +#ifdef HAVE_ZSTD +#include +#endif + +/* Branch prediction hints - same as Objects/obmalloc.c */ +#if (defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 2))) && defined(__OPTIMIZE__) +# define UNLIKELY(value) __builtin_expect((value), 0) +# define LIKELY(value) __builtin_expect((value), 1) +#else +# define UNLIKELY(value) (value) +# define LIKELY(value) (value) +#endif + +/* ============================================================================ + * BINARY WRITER STRUCTURES + * ============================================================================ */ + +/* zstd compression state (only used if HAVE_ZSTD defined) */ +typedef struct { +#ifdef HAVE_ZSTD + ZSTD_CCtx *cctx; /* Modern API: CCtx and CStream are the same since v1.3.0 */ +#else + void *cctx; /* Placeholder */ +#endif + uint8_t *compressed_buffer; + size_t compressed_buffer_size; +} ZstdCompressor; + +/* Frame entry - combines all frame data for better cache locality */ +typedef struct { + uint32_t filename_idx; + uint32_t funcname_idx; + int32_t lineno; +} FrameEntry; + +/* Frame key for hash table lookup */ +typedef struct { + uint32_t filename_idx; + uint32_t funcname_idx; + int32_t lineno; +} FrameKey; + +/* Pending RLE sample - buffered for run-length encoding */ +typedef struct { + uint64_t timestamp_delta; + uint8_t status; +} PendingRLESample; + +/* Thread entry - tracks per-thread state for 
delta encoding */ +typedef struct { + uint64_t thread_id; + uint64_t prev_timestamp; + uint32_t interpreter_id; + + /* Previous stack for delta encoding (frame indices, innermost first) */ + uint32_t *prev_stack; + size_t prev_stack_depth; + size_t prev_stack_capacity; + + /* RLE pending buffer - samples waiting to be written as a repeat group */ + PendingRLESample *pending_rle; + size_t pending_rle_count; + size_t pending_rle_capacity; + int has_pending_rle; /* Flag: do we have buffered repeats? */ +} ThreadEntry; + +/* Main binary writer structure */ +typedef struct { + FILE *fp; + char *filename; + + /* Write buffer for batched I/O */ + uint8_t *write_buffer; + size_t buffer_pos; + size_t buffer_size; + + /* Compression */ + int compression_type; + ZstdCompressor zstd; + + /* Metadata */ + uint64_t start_time_us; + uint64_t sample_interval_us; + uint32_t total_samples; + + /* String hash table: PyObject* -> uint32_t index */ + _Py_hashtable_t *string_hash; + /* String storage: array of UTF-8 encoded strings */ + char **strings; + size_t *string_lengths; + size_t string_count; + size_t string_capacity; + + /* Frame hash table: FrameKey* -> uint32_t index */ + _Py_hashtable_t *frame_hash; + /* Frame storage: combined struct for better cache locality */ + FrameEntry *frame_entries; + size_t frame_count; + size_t frame_capacity; + + /* Thread timestamp tracking for delta encoding - combined for cache locality */ + ThreadEntry *thread_entries; + size_t thread_count; + size_t thread_capacity; + + /* Statistics */ + BinaryWriterStats stats; +} BinaryWriter; + +/* ============================================================================ + * BINARY READER STRUCTURES + * ============================================================================ */ + +/* Per-thread state for stack reconstruction during replay */ +typedef struct { + uint64_t thread_id; + uint32_t interpreter_id; + uint64_t prev_timestamp; + + /* Reconstructed stack buffer (frame indices, innermost 
first) */ + uint32_t *current_stack; + size_t current_stack_depth; + size_t current_stack_capacity; +} ReaderThreadState; + +/* Main binary reader structure */ +typedef struct { + char *filename; + +#if USE_MMAP + int fd; + uint8_t *mapped_data; + size_t mapped_size; +#else + FILE *fp; + uint8_t *file_data; + size_t file_size; +#endif + + /* Decompression state */ + int compression_type; + /* Note: ZSTD_DCtx is not stored - created/freed during decompression */ + uint8_t *decompressed_data; + size_t decompressed_size; + + /* Header metadata */ + uint64_t start_time_us; + uint64_t sample_interval_us; + uint32_t sample_count; + uint32_t thread_count; + uint64_t string_table_offset; + uint64_t frame_table_offset; + + /* Parsed string table: array of Python string objects */ + PyObject **strings; + uint32_t strings_count; + + /* Parsed frame table: packed as [filename_idx, funcname_idx, lineno] */ + uint32_t *frame_data; + uint32_t frames_count; + + /* Sample data region */ + uint8_t *sample_data; + size_t sample_data_size; + + /* Per-thread state for stack reconstruction (used during replay) */ + ReaderThreadState *thread_states; + size_t thread_state_count; + size_t thread_state_capacity; + + /* Statistics */ + BinaryReaderStats stats; +} BinaryReader; + +/* ============================================================================ + * VARINT ENCODING/DECODING (INLINE FOR PERFORMANCE) + * ============================================================================ */ + +/* Encode unsigned 64-bit varint (LEB128). Returns bytes written. */ +static inline size_t +encode_varint_u64(uint8_t *buf, uint64_t value) +{ + /* Fast path for single-byte values (0-127) - very common case */ + if (value < 0x80) { + buf[0] = (uint8_t)value; + return 1; + } + + size_t i = 0; + while (value >= 0x80) { + buf[i++] = (uint8_t)((value & 0x7F) | 0x80); + value >>= 7; + } + buf[i++] = (uint8_t)(value & 0x7F); + return i; +} + +/* Encode unsigned 32-bit varint. Returns bytes written. 
*/ +static inline size_t +encode_varint_u32(uint8_t *buf, uint32_t value) +{ + return encode_varint_u64(buf, value); +} + +/* Encode signed 32-bit varint (zigzag encoding). Returns bytes written. */ +static inline size_t +encode_varint_i32(uint8_t *buf, int32_t value) +{ + /* Zigzag encode: map signed to unsigned */ + uint32_t zigzag = ((uint32_t)value << 1) ^ (uint32_t)(value >> 31); + return encode_varint_u32(buf, zigzag); +} + +/* Decode unsigned 64-bit varint. Updates offset only on success. Returns value. + * On error (overflow or incomplete), offset is NOT updated, allowing callers + * to detect errors via (offset == prev_offset) check. + * On success, sets *error to 0 if error is non-NULL. + * On error, sets *error to 1 if error is non-NULL. */ +static inline uint64_t +decode_varint_u64_ex(const uint8_t *data, size_t *offset, size_t max_size, int *error) +{ + size_t pos = *offset; + uint64_t result = 0; + int shift = 0; + + /* Fast path for single-byte varints (0-127) - most common case */ + if (LIKELY(pos < max_size && (data[pos] & 0x80) == 0)) { + *offset = pos + 1; + if (error) *error = 0; + return data[pos]; + } + + while (pos < max_size) { + uint8_t byte = data[pos++]; + result |= (uint64_t)(byte & 0x7F) << shift; + if ((byte & 0x80) == 0) { + *offset = pos; + if (error) *error = 0; + return result; + } + shift += 7; + if (UNLIKELY(shift >= 64)) { + /* Overflow - do NOT update offset so caller can detect error */ + if (error) *error = 1; + return 0; + } + } + + /* Incomplete varint - do NOT update offset so caller can detect error */ + if (error) *error = 1; + return 0; +} + +/* Backward-compatible wrapper that sets PyErr on error. + * Callers should check PyErr_Occurred() after batch operations. 
*/ +static inline uint64_t +decode_varint_u64(const uint8_t *data, size_t *offset, size_t max_size) +{ + int error = 0; + uint64_t result = decode_varint_u64_ex(data, offset, max_size, &error); + if (UNLIKELY(error)) { + PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data"); + } + return result; +} + +/* Decode unsigned 32-bit varint with explicit error handling. + * If value exceeds UINT32_MAX, treats as error: offset is NOT updated, + * *error is set to 1, allowing callers to detect via (offset == prev_offset). */ +static inline uint32_t +decode_varint_u32_ex(const uint8_t *data, size_t *offset, size_t max_size, int *error) +{ + size_t saved_offset = *offset; + uint64_t value = decode_varint_u64_ex(data, offset, max_size, error); + if (error && *error) { + /* decode_varint_u64_ex already handled the error, offset unchanged */ + return 0; + } + if (UNLIKELY(value > UINT32_MAX)) { + /* Value overflow - restore offset so caller can detect error */ + *offset = saved_offset; + if (error) *error = 1; + return 0; + } + return (uint32_t)value; +} + +/* Backward-compatible wrapper that sets PyErr on error. */ +static inline uint32_t +decode_varint_u32(const uint8_t *data, size_t *offset, size_t max_size) +{ + int error = 0; + uint32_t result = decode_varint_u32_ex(data, offset, max_size, &error); + if (UNLIKELY(error)) { + PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data"); + } + return result; +} + +/* Decode signed 32-bit varint (zigzag) with explicit error handling. */ +static inline int32_t +decode_varint_i32_ex(const uint8_t *data, size_t *offset, size_t max_size, int *error) +{ + uint32_t zigzag = decode_varint_u32_ex(data, offset, max_size, error); + /* Zigzag decode */ + return (int32_t)((zigzag >> 1) ^ -(int32_t)(zigzag & 1)); +} + +/* Backward-compatible wrapper that sets PyErr on error. 
*/ +static inline int32_t +decode_varint_i32(const uint8_t *data, size_t *offset, size_t max_size) +{ + int error = 0; + int32_t result = decode_varint_i32_ex(data, offset, max_size, &error); + if (UNLIKELY(error)) { + PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data"); + } + return result; +} + +/* ============================================================================ + * SHARED UTILITY FUNCTIONS + * ============================================================================ */ + +/* Generic array growth - returns new pointer or NULL (sets PyErr_NoMemory) + * Includes overflow checking for capacity doubling and allocation size. */ +static inline void * +grow_array(void *ptr, size_t *capacity, size_t elem_size) +{ + size_t old_cap = *capacity; + + /* Check for overflow when doubling capacity */ + if (old_cap > SIZE_MAX / 2) { + PyErr_SetString(PyExc_OverflowError, "Array capacity overflow"); + return NULL; + } + size_t new_cap = old_cap * 2; + + /* Check for overflow when calculating allocation size */ + if (new_cap > SIZE_MAX / elem_size) { + PyErr_SetString(PyExc_OverflowError, "Array allocation size overflow"); + return NULL; + } + + void *new_ptr = PyMem_Realloc(ptr, new_cap * elem_size); + if (new_ptr) { + *capacity = new_cap; + } else { + PyErr_NoMemory(); + } + return new_ptr; +} + +/* Macro wrapper for type safety with grow_array */ +#define GROW_ARRAY(ptr, count, capacity, type) \ + ((count) < (capacity) ? 0 : \ + ((ptr) = grow_array((ptr), &(capacity), sizeof(type))) ? 0 : -1) + +/* ============================================================================ + * BINARY WRITER API + * ============================================================================ */ + +/* + * Create a new binary writer. 
+ * + * Arguments: + * filename: Path to output file + * sample_interval_us: Sampling interval in microseconds + * compression_type: COMPRESSION_NONE or COMPRESSION_ZSTD + * start_time_us: Start timestamp in microseconds (from time.monotonic() * 1e6) + * + * Returns: + * New BinaryWriter* on success, NULL on failure (PyErr set) + */ +BinaryWriter *binary_writer_create( + const char *filename, + uint64_t sample_interval_us, + int compression_type, + uint64_t start_time_us +); + +/* + * Write a sample to the binary file. + * + * Arguments: + * writer: Writer from binary_writer_create + * stack_frames: List of InterpreterInfo struct sequences + * timestamp_us: Current timestamp in microseconds (from time.monotonic() * 1e6) + * + * Returns: + * 0 on success, -1 on failure (PyErr set) + */ +int binary_writer_write_sample( + BinaryWriter *writer, + PyObject *stack_frames, + uint64_t timestamp_us +); + +/* + * Finalize and close the binary file. + * Writes string/frame tables, footer, and updates header. + * + * Arguments: + * writer: Writer to finalize + * + * Returns: + * 0 on success, -1 on failure (PyErr set) + */ +int binary_writer_finalize(BinaryWriter *writer); + +/* + * Destroy a binary writer and free all resources. + * Safe to call even if writer is partially initialized. + * + * Arguments: + * writer: Writer to destroy (may be NULL) + */ +void binary_writer_destroy(BinaryWriter *writer); + +/* ============================================================================ + * BINARY READER API + * ============================================================================ */ + +/* + * Open a binary file for reading. + * + * Arguments: + * filename: Path to input file + * + * Returns: + * New BinaryReader* on success, NULL on failure (PyErr set) + */ +BinaryReader *binary_reader_open(const char *filename); + +/* + * Replay samples from binary file through a collector. 
+ * + * Arguments: + * reader: Reader from binary_reader_open + * collector: Python collector with collect() method + * progress_callback: Optional callable(current, total) or NULL + * + * Returns: + * Number of samples replayed on success, -1 on failure (PyErr set) + */ +Py_ssize_t binary_reader_replay( + BinaryReader *reader, + PyObject *collector, + PyObject *progress_callback +); + +/* + * Get metadata about the binary file. + * + * Arguments: + * reader: Reader from binary_reader_open + * + * Returns: + * Dict with file metadata on success, NULL on failure (PyErr set) + */ +PyObject *binary_reader_get_info(BinaryReader *reader); + +/* + * Close a binary reader and free all resources. + * + * Arguments: + * reader: Reader to close (may be NULL) + */ +void binary_reader_close(BinaryReader *reader); + +/* ============================================================================ + * STATISTICS FUNCTIONS + * ============================================================================ */ + +/* + * Get writer statistics as a Python dict. + * + * Arguments: + * writer: Writer to get stats from + * + * Returns: + * Dict with statistics on success, NULL on failure (PyErr set) + */ +PyObject *binary_writer_get_stats(BinaryWriter *writer); + +/* + * Get reader statistics as a Python dict. + * + * Arguments: + * reader: Reader to get stats from + * + * Returns: + * Dict with statistics on success, NULL on failure (PyErr set) + */ +PyObject *binary_reader_get_stats(BinaryReader *reader); + +/* ============================================================================ + * UTILITY FUNCTIONS + * ============================================================================ */ + +/* + * Check if zstd compression is available. + * + * Returns: + * 1 if zstd available, 0 otherwise + */ +int binary_io_zstd_available(void); + +/* + * Get the best available compression type. 
+ * + * Returns: + * COMPRESSION_ZSTD if available, COMPRESSION_NONE otherwise + */ +int binary_io_get_best_compression(void); + +#ifdef __cplusplus +} +#endif + +#endif /* Py_BINARY_IO_H */ From 83603c7d5c433979b7ce1866cab64d10ceca0f88 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:47:33 +0000 Subject: [PATCH 02/17] Add binary writer for sampling profiler Implements streaming binary output with delta compression. The writer tracks per-thread state to encode stack changes efficiently: identical stacks use RLE, similar stacks encode only the differing frames. String and frame deduplication uses Python's hashtable implementation for O(1) lookup during interning. The 512KB write buffer amortizes syscall overhead. When zstd is available, data streams through compression before hitting disk. Finalization writes the string/frame tables and footer, then seeks back to update the header with final counts and offsets. --- Modules/_remote_debugging/binary_io_writer.c | 1206 ++++++++++++++++++ 1 file changed, 1206 insertions(+) create mode 100644 Modules/_remote_debugging/binary_io_writer.c diff --git a/Modules/_remote_debugging/binary_io_writer.c b/Modules/_remote_debugging/binary_io_writer.c new file mode 100644 index 00000000000000..5e5abca3d9a967 --- /dev/null +++ b/Modules/_remote_debugging/binary_io_writer.c @@ -0,0 +1,1206 @@ +/****************************************************************************** + * Python Remote Debugging Module - Binary Writer Implementation + * + * High-performance binary file writer for profiling data with optional zstd + * streaming compression. 
+ ******************************************************************************/ + +#ifndef Py_BUILD_CORE_MODULE +# define Py_BUILD_CORE_MODULE +#endif + +#include "binary_io.h" +#include "_remote_debugging.h" +#include + +#ifdef HAVE_ZSTD +#include +#endif + +/* ============================================================================ + * CONSTANTS FOR BINARY FORMAT SIZES + * ============================================================================ */ + +/* Sample header sizes */ +#define SAMPLE_HEADER_FIXED_SIZE 13 /* thread_id(8) + interpreter_id(4) + encoding(1) */ +#define SAMPLE_HEADER_MAX_SIZE 26 /* fixed + max_varint(10) + status(1) + margin */ +#define MAX_VARINT_SIZE 10 /* Maximum bytes for a varint64 */ +#define MAX_VARINT_SIZE_U32 5 /* Maximum bytes for a varint32 */ +/* Frame buffer: depth varint (max 2 bytes for 256) + 256 frames * 5 bytes/varint + margin */ +#define MAX_FRAME_BUFFER_SIZE ((MAX_STACK_DEPTH * MAX_VARINT_SIZE_U32) + MAX_VARINT_SIZE_U32 + 16) + +/* File structure sizes */ +#define FILE_HEADER_PLACEHOLDER_SIZE 64 /* Placeholder written at file start */ +#define FILE_HEADER_SIZE 52 /* Actual header content size */ +#define FILE_FOOTER_SIZE 32 /* Footer size */ + +/* ============================================================================ + * WRITER-SPECIFIC UTILITY HELPERS + * ============================================================================ */ + +/* Grow two parallel arrays together (e.g., strings and string_lengths). + * Returns 0 on success, -1 on error (sets PyErr). + * On error, original arrays are preserved (truly atomic update). 
*/ +static inline int +grow_parallel_arrays(void **array1, void **array2, size_t *capacity, + size_t elem_size1, size_t elem_size2) +{ + size_t old_cap = *capacity; + + /* Check for overflow when doubling capacity */ + if (old_cap > SIZE_MAX / 2) { + PyErr_SetString(PyExc_OverflowError, "Array capacity overflow"); + return -1; + } + size_t new_cap = old_cap * 2; + + /* Check for overflow when calculating allocation sizes */ + if (new_cap > SIZE_MAX / elem_size1 || new_cap > SIZE_MAX / elem_size2) { + PyErr_SetString(PyExc_OverflowError, "Array allocation size overflow"); + return -1; + } + + size_t new_size1 = new_cap * elem_size1; + size_t new_size2 = new_cap * elem_size2; + size_t old_size1 = old_cap * elem_size1; + size_t old_size2 = old_cap * elem_size2; + + /* Allocate fresh memory blocks (not realloc) to ensure atomicity. + * If either allocation fails, original arrays are completely unchanged. */ + void *new_array1 = PyMem_Malloc(new_size1); + if (!new_array1) { + PyErr_NoMemory(); + return -1; + } + + void *new_array2 = PyMem_Malloc(new_size2); + if (!new_array2) { + /* Second allocation failed - free first and return with no state change */ + PyMem_Free(new_array1); + PyErr_NoMemory(); + return -1; + } + + /* Both allocations succeeded - copy data and update pointers atomically */ + memcpy(new_array1, *array1, old_size1); + memcpy(new_array2, *array2, old_size2); + + /* Free old arrays */ + PyMem_Free(*array1); + PyMem_Free(*array2); + + /* Update all pointers */ + *array1 = new_array1; + *array2 = new_array2; + *capacity = new_cap; + return 0; +} + +/* Checked fwrite with GIL release - returns 0 on success, -1 on error (sets PyErr). + * This version releases the GIL during the write operation to allow other Python + * threads to run during potentially blocking I/O. 
*/ +static inline int +fwrite_checked_allow_threads(const void *data, size_t size, FILE *fp) +{ + size_t written; + Py_BEGIN_ALLOW_THREADS + written = fwrite(data, 1, size, fp); + Py_END_ALLOW_THREADS + if (written != size) { + PyErr_SetFromErrno(PyExc_IOError); + return -1; + } + return 0; +} + +/* Forward declaration for writer_write_bytes */ +static inline int writer_write_bytes(BinaryWriter *writer, const void *data, size_t size); + +/* Encode and write a varint u32 - returns 0 on success, -1 on error */ +static inline int +writer_write_varint_u32(BinaryWriter *writer, uint32_t value) +{ + uint8_t buf[MAX_VARINT_SIZE]; + size_t len = encode_varint_u32(buf, value); + return writer_write_bytes(writer, buf, len); +} + +/* Encode and write a varint u64 - returns 0 on success, -1 on error */ +static inline int +writer_write_varint_u64(BinaryWriter *writer, uint64_t value) +{ + uint8_t buf[MAX_VARINT_SIZE]; + size_t len = encode_varint_u64(buf, value); + return writer_write_bytes(writer, buf, len); +} + + +/* ============================================================================ + * UTILITY FUNCTIONS + * ============================================================================ */ + +int +binary_io_zstd_available(void) +{ +#ifdef HAVE_ZSTD + return 1; +#else + return 0; +#endif +} + +int +binary_io_get_best_compression(void) +{ +#ifdef HAVE_ZSTD + return COMPRESSION_ZSTD; +#else + return COMPRESSION_NONE; +#endif +} + +/* ============================================================================ + * BINARY WRITER IMPLEMENTATION + * ============================================================================ */ + +/* Initialize zstd compression */ +static int +writer_init_zstd(BinaryWriter *writer) +{ +#ifdef HAVE_ZSTD + writer->zstd.cctx = ZSTD_createCCtx(); + if (!writer->zstd.cctx) { + PyErr_SetString(PyExc_MemoryError, "Failed to create zstd compression context"); + return -1; + } + + /* Compression level 5: better ratio for repetitive profiling data */ 
+ size_t result = ZSTD_CCtx_setParameter(writer->zstd.cctx, + ZSTD_c_compressionLevel, 5); + if (ZSTD_isError(result)) { + PyErr_Format(PyExc_RuntimeError, "Failed to set zstd compression level: %s", + ZSTD_getErrorName(result)); + ZSTD_freeCCtx(writer->zstd.cctx); + writer->zstd.cctx = NULL; + return -1; + } + + /* Use large buffer (512KB) for fewer I/O syscalls */ + writer->zstd.compressed_buffer = PyMem_Malloc(COMPRESSED_BUFFER_SIZE); + if (!writer->zstd.compressed_buffer) { + ZSTD_freeCCtx(writer->zstd.cctx); + writer->zstd.cctx = NULL; + PyErr_NoMemory(); + return -1; + } + writer->zstd.compressed_buffer_size = COMPRESSED_BUFFER_SIZE; + + return 0; +#else + PyErr_SetString(PyExc_RuntimeError, + "zstd compression requested but not available (HAVE_ZSTD not defined)"); + return -1; +#endif +} + +/* Flush write buffer to disk (with compression if enabled) */ +static int +writer_flush_buffer(BinaryWriter *writer) +{ + if (writer->buffer_pos == 0) { + return 0; + } + +#ifdef HAVE_ZSTD + if (writer->compression_type == COMPRESSION_ZSTD) { + ZSTD_inBuffer input = { writer->write_buffer, writer->buffer_pos, 0 }; + + while (input.pos < input.size) { + ZSTD_outBuffer output = { + writer->zstd.compressed_buffer, + writer->zstd.compressed_buffer_size, + 0 + }; + + size_t result = ZSTD_compressStream2( + writer->zstd.cctx, &output, &input, ZSTD_e_continue + ); + + if (ZSTD_isError(result)) { + PyErr_Format(PyExc_IOError, "zstd compression error: %s", + ZSTD_getErrorName(result)); + return -1; + } + + if (output.pos > 0) { + if (fwrite_checked_allow_threads(writer->zstd.compressed_buffer, output.pos, writer->fp) < 0) { + return -1; + } + } + } + } else +#endif + { + /* Uncompressed write */ + if (fwrite_checked_allow_threads(writer->write_buffer, writer->buffer_pos, writer->fp) < 0) { + return -1; + } + } + + writer->buffer_pos = 0; + return 0; +} + +/* Write bytes to buffer (flushing if needed) */ +static inline int +writer_write_bytes(BinaryWriter *writer, const void 
*data, size_t size) +{ + const uint8_t *src = (const uint8_t *)data; + size_t original_size = size; + + while (size > 0) { + size_t space = writer->buffer_size - writer->buffer_pos; + size_t to_copy = (size < space) ? size : space; + + memcpy(writer->write_buffer + writer->buffer_pos, src, to_copy); + writer->buffer_pos += to_copy; + src += to_copy; + size -= to_copy; + + if (writer->buffer_pos == writer->buffer_size) { + if (writer_flush_buffer(writer) < 0) { + return -1; + } + } + } + + writer->stats.bytes_written += original_size; + return 0; +} + +/* ============================================================================ + * HASH TABLE SUPPORT FUNCTIONS (using _Py_hashtable) + * ============================================================================ */ + +/* Hash function for Python strings - uses Python's cached hash */ +static Py_uhash_t +string_hash_func(const void *key) +{ + PyObject *str = (PyObject *)key; + Py_hash_t hash = PyObject_Hash(str); + if (hash == -1) { + PyErr_Clear(); + return 0; + } + return (Py_uhash_t)hash; +} + +/* Compare function for Python strings */ +static int +string_compare_func(const void *key1, const void *key2) +{ + PyObject *str1 = (PyObject *)key1; + PyObject *str2 = (PyObject *)key2; + if (str1 == str2) { + return 1; + } + int result = PyObject_RichCompareBool(str1, str2, Py_EQ); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} + +/* Destroy function for string keys - decref the Python string */ +static void +string_key_destroy(void *key) +{ + Py_XDECREF((PyObject *)key); +} + +/* Hash function for frame keys */ +static Py_uhash_t +frame_key_hash_func(const void *key) +{ + const FrameKey *fk = (const FrameKey *)key; + /* FNV-1a style hash combining all three values */ + Py_uhash_t hash = 2166136261u; + hash ^= fk->filename_idx; + hash *= 16777619u; + hash ^= fk->funcname_idx; + hash *= 16777619u; + hash ^= (uint32_t)fk->lineno; + hash *= 16777619u; + return hash; +} + +/* Compare function 
 for frame keys */
static int
frame_key_compare_func(const void *key1, const void *key2)
{
    const FrameKey *fk1 = (const FrameKey *)key1;
    const FrameKey *fk2 = (const FrameKey *)key2;
    /* Frames are equal only when all three identity fields match. */
    return (fk1->filename_idx == fk2->filename_idx &&
            fk1->funcname_idx == fk2->funcname_idx &&
            fk1->lineno == fk2->lineno);
}

/* Destroy function for frame keys - free the allocated FrameKey.
 * Keys are heap-allocated in writer_intern_frame, so the hash table owns them. */
static void
frame_key_destroy(void *key)
{
    PyMem_Free(key);
}

/* Intern a string and return its index.
 *
 * On success, *index is the position of the (possibly pre-existing) string in
 * writer->strings and 0 is returned.  On failure -1 is returned with a Python
 * exception set.  The hash table stores index+1 so that 0 can never collide
 * with NULL ("not found"). */
static inline int
writer_intern_string(BinaryWriter *writer, PyObject *string, uint32_t *index)
{
    /* Check if string already exists in hash table */
    void *existing = _Py_hashtable_get(writer->string_hash, string);
    if (existing != NULL) {
        *index = (uint32_t)(uintptr_t)existing - 1; /* Subtract 1 since we store index+1 */
        return 0;
    }

    /* New string - grow storage if needed */
    if (writer->string_count >= writer->string_capacity) {
        if (grow_parallel_arrays((void **)&writer->strings,
                                 (void **)&writer->string_lengths,
                                 &writer->string_capacity,
                                 sizeof(char *), sizeof(size_t)) < 0) {
            return -1;
        }
    }

    Py_ssize_t str_len;
    const char *str_data = PyUnicode_AsUTF8AndSize(string, &str_len);
    if (!str_data) {
        return -1;
    }

    /* Store copy of string data (str_len + 1 copies the NUL terminator too) */
    char *str_copy = PyMem_Malloc(str_len + 1);
    if (!str_copy) {
        PyErr_NoMemory();
        return -1;
    }
    memcpy(str_copy, str_data, str_len + 1);

    /* The index we'll use (current count before incrementing) */
    *index = (uint32_t)writer->string_count;

    /* Add to hash table FIRST (before modifying arrays/count) to ensure atomicity.
     * If hash table insert fails, we can simply free str_copy without rolling back.
     * Store index+1 to distinguish from NULL (0 would be ambiguous).
     * The INCREF pairs with string_key_destroy when the table is destroyed. */
    Py_INCREF(string);
    if (_Py_hashtable_set(writer->string_hash, string, (void *)(uintptr_t)(*index + 1)) < 0) {
        Py_DECREF(string);
        PyMem_Free(str_copy);
        PyErr_NoMemory();
        return -1;
    }

    /* Hash table insert succeeded - now safely update arrays and count.
     * These operations cannot fail, so the data structures stay consistent. */
    writer->strings[writer->string_count] = str_copy;
    writer->string_lengths[writer->string_count] = str_len;
    writer->string_count++;

    return 0;
}

/* Intern a frame (filename_idx, funcname_idx, lineno) and return its index.
 * Returns 0 on success with *index set, -1 on failure with an exception set. */
static inline int
writer_intern_frame(BinaryWriter *writer, uint32_t filename_idx, uint32_t funcname_idx,
                    int32_t lineno, uint32_t *index)
{
    /* Create a temporary key for lookup */
    FrameKey lookup_key = {filename_idx, funcname_idx, lineno};

    /* Check if frame already exists in hash table */
    void *existing = _Py_hashtable_get(writer->frame_hash, &lookup_key);
    if (existing != NULL) {
        *index = (uint32_t)(uintptr_t)existing - 1; /* Subtract 1 since we store index+1 */
        return 0;
    }

    /* New frame - grow storage if needed */
    if (GROW_ARRAY(writer->frame_entries, writer->frame_count,
                   writer->frame_capacity, FrameEntry) < 0) {
        return -1;
    }

    /* Allocate key for hash table first (before modifying frame_count)
     * to ensure atomic rollback on failure */
    FrameKey *key = PyMem_Malloc(sizeof(FrameKey));
    if (!key) {
        PyErr_NoMemory();
        return -1;
    }
    *key = lookup_key;

    /* Now add the frame entry (writing into the slot is harmless even if the
     * hash insert below fails, because frame_count is only bumped on success) */
    *index = (uint32_t)writer->frame_count;
    FrameEntry *fe = &writer->frame_entries[writer->frame_count];
    fe->filename_idx = filename_idx;
    fe->funcname_idx = funcname_idx;
    fe->lineno = lineno;

    /* Add to hash table (store index+1 to distinguish from NULL) */
    if (_Py_hashtable_set(writer->frame_hash, key, (void *)(uintptr_t)(*index + 1)) < 0) {
        PyMem_Free(key);
        /* Don't increment frame_count - rollback the frame entry */
        PyErr_NoMemory();
        return -1;
    }

    /* Success - now increment frame_count */
    writer->frame_count++;
    return 0;
}

/* Get or create a thread entry for the given thread_id.
 * Returns pointer to ThreadEntry, or NULL on allocation failure.
 * If is_new is non-NULL, sets it to 1 if this is a new thread, 0 otherwise. */
static ThreadEntry *
writer_get_or_create_thread_entry(BinaryWriter *writer, uint64_t thread_id,
                                  uint32_t interpreter_id, int *is_new)
{
    /* Linear search (OK for small number of threads) */
    /* Key is (thread_id, interpreter_id) since same thread_id can exist in different interpreters */
    for (size_t i = 0; i < writer->thread_count; i++) {
        if (writer->thread_entries[i].thread_id == thread_id &&
            writer->thread_entries[i].interpreter_id == interpreter_id) {
            if (is_new) {
                *is_new = 0;
            }
            return &writer->thread_entries[i];
        }
    }

    /* Add new thread - grow array if needed.
     * NOTE(review): if grow_array follows realloc semantics (returns NULL on
     * failure without freeing), assigning the result directly here would leak
     * the old array - confirm grow_array's contract. */
    if (writer->thread_count >= writer->thread_capacity) {
        writer->thread_entries = grow_array(writer->thread_entries,
                                            &writer->thread_capacity,
                                            sizeof(ThreadEntry));
        if (!writer->thread_entries) {
            return NULL;
        }
    }

    ThreadEntry *entry = &writer->thread_entries[writer->thread_count];
    memset(entry, 0, sizeof(ThreadEntry));
    entry->thread_id = thread_id;
    entry->interpreter_id = interpreter_id;
    /* First delta is measured from profiling start, not from 0 */
    entry->prev_timestamp = writer->start_time_us;
    entry->prev_stack_capacity = MAX_STACK_DEPTH;
    entry->pending_rle_capacity = INITIAL_RLE_CAPACITY;

    entry->prev_stack = PyMem_Malloc(entry->prev_stack_capacity * sizeof(uint32_t));
    if (!entry->prev_stack) {
        PyErr_NoMemory();
        return NULL;
    }

    entry->pending_rle = PyMem_Malloc(entry->pending_rle_capacity * sizeof(PendingRLESample));
    if (!entry->pending_rle) {
        PyMem_Free(entry->prev_stack);
        PyErr_NoMemory();
        return NULL;
    }

    /* thread_count is only bumped once both buffers exist, so a failed entry
     * is never visible to binary_writer_destroy */
    writer->thread_count++;
    if (is_new) {
        *is_new = 1;
    }
    return entry;
}

/* Compare two stacks and return the encoding type and parameters.
 * Sets:
 *   - shared_count: number of frames matching from bottom of stack
 *   - pop_count: frames to remove from prev stack
 *   - push_count: new frames to add
 *
 * Returns the best encoding type to use. */
static int
compare_stacks(const uint32_t *prev_stack, size_t prev_depth,
               const uint32_t *curr_stack, size_t curr_depth,
               size_t *shared_count, size_t *pop_count, size_t *push_count)
{
    /* Check for identical stacks */
    if (prev_depth == curr_depth) {
        int identical = 1;
        for (size_t i = 0; i < prev_depth; i++) {
            if (prev_stack[i] != curr_stack[i]) {
                identical = 0;
                break;
            }
        }
        if (identical) {
            *shared_count = prev_depth;
            *pop_count = 0;
            *push_count = 0;
            return STACK_REPEAT;
        }
    }

    /* Find longest common suffix (frames at the bottom/outer part of stack).
     * Stacks are stored innermost-first, so suffix is at the end. */
    size_t suffix_len = 0;
    size_t min_depth = (prev_depth < curr_depth) ? prev_depth : curr_depth;

    for (size_t i = 0; i < min_depth; i++) {
        size_t prev_idx = prev_depth - 1 - i;
        size_t curr_idx = curr_depth - 1 - i;
        if (prev_stack[prev_idx] == curr_stack[curr_idx]) {
            suffix_len++;
        } else {
            break;
        }
    }

    *shared_count = suffix_len;
    *pop_count = prev_depth - suffix_len;
    *push_count = curr_depth - suffix_len;

    /* Choose best encoding based on byte cost */
    /* STACK_FULL: 1 (type) + 1-2 (depth) + sum(frame varints) */
    /* STACK_SUFFIX: 1 (type) + 1-2 (shared) + 1-2 (new_count) + sum(new frame varints) */
    /* STACK_POP_PUSH: 1 (type) + 1-2 (pop) + 1-2 (push) + sum(new frame varints) */

    /* If no common suffix, use full stack */
    if (suffix_len == 0) {
        return STACK_FULL;
    }

    /* If only adding frames (suffix == prev_depth), use SUFFIX */
    if (*pop_count == 0 && *push_count > 0) {
        return STACK_SUFFIX;
    }

    /* If popping and/or pushing, use POP_PUSH if it saves bytes */
    /* Heuristic: POP_PUSH is better when we're modifying top frames */
    if (*pop_count > 0 || *push_count > 0) {
        /* Use full stack if sharing less than half the frames */
        if (suffix_len < curr_depth / 2) {
            return STACK_FULL;
        }
        return STACK_POP_PUSH;
    }

    return STACK_FULL;
}

/* Write common sample header: thread_id(8) + interpreter_id(4) + encoding(1).
 * Byte layout is part of the on-disk format - do not reorder.
 * Returns 0 on success, -1 on failure. */
static inline int
write_sample_header(BinaryWriter *writer, ThreadEntry *entry, uint8_t encoding)
{
    uint8_t header[SAMPLE_HEADER_FIXED_SIZE];
    memcpy(header, &entry->thread_id, 8);
    memcpy(header + 8, &entry->interpreter_id, 4);
    header[12] = encoding;
    return writer_write_bytes(writer, header, SAMPLE_HEADER_FIXED_SIZE);
}

/* Flush pending RLE samples for a thread.
 * Writes the RLE record to the output buffer and resets the thread's
 * pending-RLE state.  Returns 0 on success, -1 on failure. */
static int
flush_pending_rle(BinaryWriter *writer, ThreadEntry *entry)
{
    if (!entry->has_pending_rle || entry->pending_rle_count == 0) {
        return 0;
    }

    /* Write RLE record:
     * [thread_id: 8] [interpreter_id: 4] [STACK_REPEAT: 1] [count: varint]
     * [timestamp_delta_1: varint] [status_1: 1] ... [timestamp_delta_N: varint] [status_N: 1]
     */

    /* Write fixed header */
    if (write_sample_header(writer, entry, STACK_REPEAT) < 0) {
        return -1;
    }

    /* Write count */
    if (writer_write_varint_u32(writer, (uint32_t)entry->pending_rle_count) < 0) {
        return -1;
    }

    /* Write timestamp deltas and status bytes */
    for (size_t i = 0; i < entry->pending_rle_count; i++) {
        if (writer_write_varint_u64(writer, entry->pending_rle[i].timestamp_delta) < 0) {
            return -1;
        }
        if (writer_write_bytes(writer, &entry->pending_rle[i].status, 1) < 0) {
            return -1;
        }
        writer->total_samples++;
    }

    /* Update stats: RLE saved writing full stacks for each repeat sample */
    writer->stats.repeat_records++;
    writer->stats.repeat_samples += entry->pending_rle_count;
    /* Each RLE sample saves writing the entire stack (prev_stack_depth frames) */
    writer->stats.frames_saved += entry->pending_rle_count * entry->prev_stack_depth;

    /* Clear pending state */
    entry->pending_rle_count = 0;
    entry->has_pending_rle = 0;

    return 0;
}

/* Write a single sample with the specified encoding.
 * Returns 0 on success, -1 on failure.
 */
static int
write_sample_with_encoding(BinaryWriter *writer, ThreadEntry *entry,
                           uint64_t timestamp_delta, uint8_t status,
                           int encoding_type,
                           const uint32_t *frame_indices, size_t stack_depth,
                           size_t shared_count, size_t pop_count, size_t push_count)
{
    /* Write header: thread_id (8) + interpreter_id (4) + encoding (1) + delta (varint) + status (1) */
    uint8_t header_buf[SAMPLE_HEADER_MAX_SIZE];
    memcpy(header_buf, &entry->thread_id, 8);
    memcpy(header_buf + 8, &entry->interpreter_id, 4);
    header_buf[12] = (uint8_t)encoding_type;
    size_t varint_len = encode_varint_u64(header_buf + 13, timestamp_delta);
    header_buf[13 + varint_len] = status;

    /* 14 = 13 fixed bytes + 1 status byte; varint sits between them */
    if (writer_write_bytes(writer, header_buf, 14 + varint_len) < 0) {
        return -1;
    }

    /* Write encoding-specific data.
     * NOTE(review): assumes MAX_FRAME_BUFFER_SIZE is sized for MAX_STACK_DEPTH
     * worst-case varints (depth prefix + 5 bytes per frame) - confirm in header. */
    uint8_t frame_buf[MAX_FRAME_BUFFER_SIZE];
    size_t frame_buf_pos = 0;
    size_t frames_written = 0;

    switch (encoding_type) {
    case STACK_FULL:
        /* [depth: varint] [frame_idx: varint]... */
        frame_buf_pos += encode_varint_u32(frame_buf, (uint32_t)stack_depth);
        for (size_t i = 0; i < stack_depth; i++) {
            frame_buf_pos += encode_varint_u32(frame_buf + frame_buf_pos, frame_indices[i]);
        }
        frames_written = stack_depth;
        writer->stats.full_records++;
        break;

    case STACK_SUFFIX:
        /* [shared_count: varint] [new_count: varint] [new_frame_idx: varint]... */
        frame_buf_pos += encode_varint_u32(frame_buf, (uint32_t)shared_count);
        frame_buf_pos += encode_varint_u32(frame_buf + frame_buf_pos, (uint32_t)push_count);
        /* New frames are at the top (beginning) of current stack */
        for (size_t i = 0; i < push_count; i++) {
            frame_buf_pos += encode_varint_u32(frame_buf + frame_buf_pos, frame_indices[i]);
        }
        frames_written = push_count;
        writer->stats.suffix_records++;
        /* Saved writing shared_count frames */
        writer->stats.frames_saved += shared_count;
        break;

    case STACK_POP_PUSH:
        /* [pop_count: varint] [push_count: varint] [new_frame_idx: varint]... */
        frame_buf_pos += encode_varint_u32(frame_buf, (uint32_t)pop_count);
        frame_buf_pos += encode_varint_u32(frame_buf + frame_buf_pos, (uint32_t)push_count);
        /* New frames are at the top (beginning) of current stack */
        for (size_t i = 0; i < push_count; i++) {
            frame_buf_pos += encode_varint_u32(frame_buf + frame_buf_pos, frame_indices[i]);
        }
        frames_written = push_count;
        writer->stats.pop_push_records++;
        /* Saved writing shared_count frames (stack_depth - push_count if we had written full) */
        writer->stats.frames_saved += shared_count;
        break;

    default:
        PyErr_SetString(PyExc_RuntimeError, "Invalid stack encoding type");
        return -1;
    }

    if (writer_write_bytes(writer, frame_buf, frame_buf_pos) < 0) {
        return -1;
    }

    writer->stats.total_frames_written += frames_written;
    writer->total_samples++;
    return 0;
}

/* Create a new BinaryWriter that streams samples to `filename`.
 * Returns an owned writer on success; on failure returns NULL with a Python
 * exception set (partial allocations are released via binary_writer_destroy,
 * which tolerates the Calloc-zeroed partially-initialized struct). */
BinaryWriter *
binary_writer_create(const char *filename, uint64_t sample_interval_us, int compression_type,
                     uint64_t start_time_us)
{
    BinaryWriter *writer = PyMem_Calloc(1, sizeof(BinaryWriter));
    if (!writer) {
        PyErr_NoMemory();
        return NULL;
    }

    writer->filename = PyMem_Malloc(strlen(filename) + 1);
    if (!writer->filename) {
        PyMem_Free(writer);
        PyErr_NoMemory();
        return NULL;
    }
    strcpy(writer->filename, filename);

    writer->start_time_us = start_time_us;
    writer->sample_interval_us = sample_interval_us;
    writer->compression_type = compression_type;

    writer->write_buffer = PyMem_Malloc(WRITE_BUFFER_SIZE);
    if (!writer->write_buffer) {
        goto error;
    }
    writer->buffer_size = WRITE_BUFFER_SIZE;

    writer->string_hash = _Py_hashtable_new_full(
        string_hash_func,
        string_compare_func,
        string_key_destroy,  /* Key destroy: decref the Python string */
        NULL,                /* Value destroy: values are just indices, not pointers */
        NULL                 /* Use default allocator */
    );
    if (!writer->string_hash) {
        goto error;
    }
    writer->strings = PyMem_Malloc(INITIAL_STRING_CAPACITY * sizeof(char *));
    if (!writer->strings) {
        goto error;
    }
    writer->string_lengths = PyMem_Malloc(INITIAL_STRING_CAPACITY * sizeof(size_t));
    if (!writer->string_lengths) {
        goto error;
    }
    writer->string_capacity = INITIAL_STRING_CAPACITY;

    writer->frame_hash = _Py_hashtable_new_full(
        frame_key_hash_func,
        frame_key_compare_func,
        frame_key_destroy,   /* Key destroy: free the FrameKey */
        NULL,                /* Value destroy: values are just indices, not pointers */
        NULL                 /* Use default allocator */
    );
    if (!writer->frame_hash) {
        goto error;
    }
    writer->frame_entries = PyMem_Malloc(INITIAL_FRAME_CAPACITY * sizeof(FrameEntry));
    if (!writer->frame_entries) {
        goto error;
    }
    writer->frame_capacity = INITIAL_FRAME_CAPACITY;

    writer->thread_entries = PyMem_Malloc(INITIAL_THREAD_CAPACITY * sizeof(ThreadEntry));
    if (!writer->thread_entries) {
        goto error;
    }
    writer->thread_capacity = INITIAL_THREAD_CAPACITY;

    /* Initialize compression if requested */
    if (compression_type == COMPRESSION_ZSTD) {
        if (writer_init_zstd(writer) < 0) {
            goto error;
        }
    }

    /* Open file */
    writer->fp = fopen(filename, "wb");
    if (!writer->fp) {
        PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename);
        goto error;
    }

    /* Hint sequential write pattern to kernel for better I/O scheduling */
#if defined(__linux__) && defined(POSIX_FADV_SEQUENTIAL)
    {
        int fd = fileno(writer->fp);
        if (fd >= 0) {
            (void)posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
        }
    }
#endif

    /* Write placeholder header (backpatched by binary_writer_finalize) -
     * release GIL during I/O */
    uint8_t header[FILE_HEADER_PLACEHOLDER_SIZE] = {0};
    if (fwrite_checked_allow_threads(header, FILE_HEADER_PLACEHOLDER_SIZE, writer->fp) < 0) {
        goto error;
    }

    return writer;

error:
    binary_writer_destroy(writer);
    return NULL;
}

/* Build a frame stack from Python frame list by interning all strings and frames.
 * Returns 0 on success, -1 on error.
*/ +static int +build_frame_stack(BinaryWriter *writer, PyObject *frame_list, + uint32_t *curr_stack, size_t *curr_depth) +{ + Py_ssize_t stack_depth = PyList_Size(frame_list); + *curr_depth = (stack_depth < MAX_STACK_DEPTH) ? stack_depth : MAX_STACK_DEPTH; + + for (Py_ssize_t k = 0; k < (Py_ssize_t)*curr_depth; k++) { + /* Use unchecked accessors since we control the data structures */ + PyObject *frame_info = PyList_GET_ITEM(frame_list, k); + + /* Get filename, location, funcname from FrameInfo using unchecked access */ + PyObject *filename = PyStructSequence_GET_ITEM(frame_info, 0); + PyObject *location = PyStructSequence_GET_ITEM(frame_info, 1); + PyObject *funcname = PyStructSequence_GET_ITEM(frame_info, 2); + + /* Extract lineno from location (can be None for synthetic frames) */ + int32_t lineno = 0; + if (location != Py_None) { + /* Use unchecked access - first element is lineno */ + PyObject *lineno_obj = PyTuple_Check(location) ? + PyTuple_GET_ITEM(location, 0) : + PyStructSequence_GET_ITEM(location, 0); + lineno = (int32_t)PyLong_AsLong(lineno_obj); + if (UNLIKELY(PyErr_Occurred() != NULL)) { + PyErr_Clear(); + lineno = 0; + } + } + + /* Intern filename */ + uint32_t filename_idx; + if (writer_intern_string(writer, filename, &filename_idx) < 0) { + return -1; + } + + /* Intern funcname */ + uint32_t funcname_idx; + if (writer_intern_string(writer, funcname, &funcname_idx) < 0) { + return -1; + } + + /* Intern frame */ + uint32_t frame_idx; + if (writer_intern_frame(writer, filename_idx, funcname_idx, lineno, &frame_idx) < 0) { + return -1; + } + + curr_stack[k] = frame_idx; + } + return 0; +} + +/* Process a single thread's sample. + * Returns 0 on success, -1 on error. 
*/ +static int +process_thread_sample(BinaryWriter *writer, PyObject *thread_info, + uint32_t interpreter_id, uint64_t timestamp_us) +{ + /* Get thread_id, status, frame_list from ThreadInfo using unchecked access */ + PyObject *thread_id_obj = PyStructSequence_GET_ITEM(thread_info, 0); + PyObject *status_obj = PyStructSequence_GET_ITEM(thread_info, 1); + PyObject *frame_list = PyStructSequence_GET_ITEM(thread_info, 2); + + uint64_t thread_id = PyLong_AsUnsignedLongLong(thread_id_obj); + if (thread_id == (uint64_t)-1 && PyErr_Occurred()) { + return -1; + } + long status_long = PyLong_AsLong(status_obj); + if (status_long == -1 && PyErr_Occurred()) { + return -1; + } + uint8_t status = (uint8_t)status_long; + + /* Get or create thread entry */ + int is_new_thread = 0; + ThreadEntry *entry = writer_get_or_create_thread_entry( + writer, thread_id, interpreter_id, &is_new_thread); + if (!entry) { + return -1; + } + + /* Calculate timestamp delta */ + uint64_t delta = timestamp_us - entry->prev_timestamp; + entry->prev_timestamp = timestamp_us; + + /* Process frames and build current stack */ + uint32_t curr_stack[MAX_STACK_DEPTH]; + size_t curr_depth; + if (build_frame_stack(writer, frame_list, curr_stack, &curr_depth) < 0) { + return -1; + } + + /* Compare with previous stack to determine encoding */ + size_t shared_count, pop_count, push_count; + int encoding = compare_stacks( + entry->prev_stack, entry->prev_stack_depth, + curr_stack, curr_depth, + &shared_count, &pop_count, &push_count); + + if (encoding == STACK_REPEAT && !is_new_thread) { + /* Buffer this sample for RLE */ + if (GROW_ARRAY(entry->pending_rle, entry->pending_rle_count, + entry->pending_rle_capacity, PendingRLESample) < 0) { + return -1; + } + entry->pending_rle[entry->pending_rle_count].timestamp_delta = delta; + entry->pending_rle[entry->pending_rle_count].status = status; + entry->pending_rle_count++; + entry->has_pending_rle = 1; + } else { + /* Stack changed - flush any pending RLE first */ + 
if (entry->has_pending_rle) { + if (flush_pending_rle(writer, entry) < 0) { + return -1; + } + } + + /* Write this sample with the appropriate encoding */ + if (write_sample_with_encoding(writer, entry, delta, status, encoding, + curr_stack, curr_depth, + shared_count, pop_count, push_count) < 0) { + return -1; + } + + /* Update previous stack */ + memcpy(entry->prev_stack, curr_stack, curr_depth * sizeof(uint32_t)); + entry->prev_stack_depth = curr_depth; + } + + return 0; +} + +int +binary_writer_write_sample(BinaryWriter *writer, PyObject *stack_frames, uint64_t timestamp_us) +{ + if (!PyList_Check(stack_frames)) { + PyErr_SetString(PyExc_TypeError, "stack_frames must be a list"); + return -1; + } + + Py_ssize_t num_interpreters = PyList_GET_SIZE(stack_frames); + for (Py_ssize_t i = 0; i < num_interpreters; i++) { + PyObject *interp_info = PyList_GET_ITEM(stack_frames, i); + + /* Get interpreter_id and threads from InterpreterInfo using unchecked access */ + PyObject *interp_id_obj = PyStructSequence_GET_ITEM(interp_info, 0); + PyObject *threads = PyStructSequence_GET_ITEM(interp_info, 1); + + unsigned long interp_id_long = PyLong_AsUnsignedLong(interp_id_obj); + if (interp_id_long == (unsigned long)-1 && PyErr_Occurred()) { + return -1; + } + /* Bounds check: interpreter_id is stored as uint32_t in binary format */ + if (interp_id_long > UINT32_MAX) { + PyErr_Format(PyExc_OverflowError, + "interpreter_id %lu exceeds maximum value %lu", + interp_id_long, (unsigned long)UINT32_MAX); + return -1; + } + uint32_t interpreter_id = (uint32_t)interp_id_long; + + Py_ssize_t num_threads = PyList_GET_SIZE(threads); + for (Py_ssize_t j = 0; j < num_threads; j++) { + PyObject *thread_info = PyList_GET_ITEM(threads, j); + if (process_thread_sample(writer, thread_info, interpreter_id, timestamp_us) < 0) { + return -1; + } + } + } + + return 0; +} + +int +binary_writer_finalize(BinaryWriter *writer) +{ + /* Flush any pending RLE for all threads */ + for (size_t i = 0; i < 
writer->thread_count; i++) { + if (writer->thread_entries[i].has_pending_rle) { + if (flush_pending_rle(writer, &writer->thread_entries[i]) < 0) { + return -1; + } + } + } + + /* Flush remaining buffer */ + if (writer_flush_buffer(writer) < 0) { + return -1; + } + +#ifdef HAVE_ZSTD + /* Finalize compression stream */ + if (writer->compression_type == COMPRESSION_ZSTD && writer->zstd.cctx) { + ZSTD_inBuffer input = { NULL, 0, 0 }; + size_t remaining; + + do { + ZSTD_outBuffer output = { + writer->zstd.compressed_buffer, + writer->zstd.compressed_buffer_size, + 0 + }; + + remaining = ZSTD_compressStream2(writer->zstd.cctx, &output, &input, ZSTD_e_end); + + if (ZSTD_isError(remaining)) { + PyErr_Format(PyExc_IOError, "zstd finalization error: %s", + ZSTD_getErrorName(remaining)); + return -1; + } + + if (output.pos > 0) { + if (fwrite_checked_allow_threads(writer->zstd.compressed_buffer, output.pos, writer->fp) < 0) { + return -1; + } + } + } while (remaining > 0); + } +#endif + + /* Get offset for string table (use 64-bit file position for >2GB files) */ + file_offset_t string_table_offset = FTELL64(writer->fp); + if (string_table_offset < 0) { + PyErr_SetFromErrno(PyExc_IOError); + return -1; + } + + /* Write string table - release GIL during potentially large writes */ + for (size_t i = 0; i < writer->string_count; i++) { + uint8_t len_buf[10]; + size_t len_size = encode_varint_u32(len_buf, (uint32_t)writer->string_lengths[i]); + if (fwrite_checked_allow_threads(len_buf, len_size, writer->fp) < 0 || + fwrite_checked_allow_threads(writer->strings[i], writer->string_lengths[i], writer->fp) < 0) { + return -1; + } + } + + /* Get offset for frame table */ + file_offset_t frame_table_offset = FTELL64(writer->fp); + if (frame_table_offset < 0) { + PyErr_SetFromErrno(PyExc_IOError); + return -1; + } + + /* Write frame table - release GIL during writes */ + for (size_t i = 0; i < writer->frame_count; i++) { + FrameEntry *entry = &writer->frame_entries[i]; + uint8_t 
buf[30]; + size_t pos = encode_varint_u32(buf, entry->filename_idx); + pos += encode_varint_u32(buf + pos, entry->funcname_idx); + pos += encode_varint_i32(buf + pos, entry->lineno); + if (fwrite_checked_allow_threads(buf, pos, writer->fp) < 0) { + return -1; + } + } + + /* Write footer (32 bytes): string_count(4) + frame_count(4) + file_size(8) + checksum(16) */ + file_offset_t footer_offset = FTELL64(writer->fp); + if (footer_offset < 0) { + PyErr_SetFromErrno(PyExc_IOError); + return -1; + } + uint64_t file_size = (uint64_t)footer_offset + 32; + uint8_t footer[32] = {0}; + memcpy(footer + 0, &writer->string_count, 4); + memcpy(footer + 4, &writer->frame_count, 4); + memcpy(footer + 8, &file_size, 8); + /* bytes 16-31: checksum placeholder (zeros) */ + if (fwrite_checked_allow_threads(footer, 32, writer->fp) < 0) { + return -1; + } + + /* Write header at file start */ + if (FSEEK64(writer->fp, 0, SEEK_SET) < 0) { + PyErr_SetFromErrno(PyExc_IOError); + return -1; + } + + /* Convert file offsets to uint64_t for portable header format */ + uint64_t string_table_offset_u64 = (uint64_t)string_table_offset; + uint64_t frame_table_offset_u64 = (uint64_t)frame_table_offset; + + uint8_t header[52] = {0}; + uint32_t magic = BINARY_FORMAT_MAGIC; + uint32_t version = BINARY_FORMAT_VERSION; + memcpy(header + 0, &magic, 4); + memcpy(header + 4, &version, 4); + memcpy(header + 8, &writer->start_time_us, 8); + memcpy(header + 16, &writer->sample_interval_us, 8); + memcpy(header + 24, &writer->total_samples, 4); + memcpy(header + 28, &writer->thread_count, 4); + memcpy(header + 32, &string_table_offset_u64, 8); + memcpy(header + 40, &frame_table_offset_u64, 8); + memcpy(header + 48, &writer->compression_type, 4); + if (fwrite_checked_allow_threads(header, 52, writer->fp) < 0) { + return -1; + } + + /* Close file */ + if (fclose(writer->fp) != 0) { + writer->fp = NULL; + PyErr_SetFromErrno(PyExc_IOError); + return -1; + } + writer->fp = NULL; + + return 0; +} + +void 
binary_writer_destroy(BinaryWriter *writer)
{
    /* Release every resource owned by the writer; safe to call on a
     * partially-initialized (Calloc-zeroed) writer or NULL. */
    if (!writer) {
        return;
    }

    if (writer->fp) {
        fclose(writer->fp);
    }

    PyMem_Free(writer->filename);
    PyMem_Free(writer->write_buffer);

#ifdef HAVE_ZSTD
    if (writer->zstd.cctx) {
        ZSTD_freeCCtx(writer->zstd.cctx);
    }
    PyMem_Free(writer->zstd.compressed_buffer);
#endif

    /* Free string hash table (destroys keys which decrefs Python strings) */
    if (writer->string_hash) {
        _Py_hashtable_destroy(writer->string_hash);
    }
    if (writer->strings) {
        /* Only string_count slots were ever populated */
        for (size_t i = 0; i < writer->string_count; i++) {
            PyMem_Free(writer->strings[i]);
        }
        PyMem_Free(writer->strings);
    }
    PyMem_Free(writer->string_lengths);

    /* Free frame hash table (destroys keys which frees FrameKey structs) */
    if (writer->frame_hash) {
        _Py_hashtable_destroy(writer->frame_hash);
    }
    PyMem_Free(writer->frame_entries);

    /* Free per-thread buffers */
    if (writer->thread_entries) {
        for (size_t i = 0; i < writer->thread_count; i++) {
            PyMem_Free(writer->thread_entries[i].prev_stack);
            PyMem_Free(writer->thread_entries[i].pending_rle);
        }
        PyMem_Free(writer->thread_entries);
    }

    PyMem_Free(writer);
}

From 7a58811e54878c44e46f0becd09a360d0ca1a0b3 Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado
Date: Mon, 15 Dec 2025 01:47:47 +0000
Subject: [PATCH 03/17] Add binary reader for sampling profiler

Implements binary file parsing with stack reconstruction. On Unix, uses
mmap with MADV_SEQUENTIAL for efficient sequential access. Falls back to
buffered I/O on Windows.

The reader reconstructs full stacks from delta-encoded records by
maintaining per-thread state. Each sample's stack is rebuilt by applying
the encoded operation (repeat/suffix/pop-push) to the previous stack for
that thread.

Replay feeds reconstructed samples to any collector, enabling conversion
between formats without re-profiling.
--- Modules/_remote_debugging/binary_io_reader.c | 1075 ++++++++++++++++++ 1 file changed, 1075 insertions(+) create mode 100644 Modules/_remote_debugging/binary_io_reader.c diff --git a/Modules/_remote_debugging/binary_io_reader.c b/Modules/_remote_debugging/binary_io_reader.c new file mode 100644 index 00000000000000..6890bc864ec9aa --- /dev/null +++ b/Modules/_remote_debugging/binary_io_reader.c @@ -0,0 +1,1075 @@ +/****************************************************************************** + * Python Remote Debugging Module - Binary Reader Implementation + * + * High-performance binary file reader for profiling data with optional zstd + * decompression. + ******************************************************************************/ + +#ifndef Py_BUILD_CORE_MODULE +# define Py_BUILD_CORE_MODULE +#endif + +#include "binary_io.h" +#include "_remote_debugging.h" +#include + +#ifdef HAVE_ZSTD +#include +#endif + +/* ============================================================================ + * CONSTANTS FOR BINARY FORMAT SIZES + * ============================================================================ */ + +/* File structure sizes */ +#define FILE_HEADER_PLACEHOLDER_SIZE 64 /* Placeholder written at file start */ +#define FILE_HEADER_SIZE 52 /* Actual header content size */ +#define FILE_FOOTER_SIZE 32 /* Footer size */ +#define MIN_DECOMPRESS_BUFFER_SIZE (64 * 1024) /* Minimum decompression buffer */ + +/* Progress callback frequency */ +#define PROGRESS_CALLBACK_INTERVAL 1000 + +/* Maximum decompression size limit (1GB) */ +#define MAX_DECOMPRESS_SIZE (1ULL << 30) + +/* ============================================================================ + * BINARY READER IMPLEMENTATION + * ============================================================================ */ + +/* Parse the file header and populate reader fields */ +static inline int +reader_parse_header(BinaryReader *reader, const uint8_t *data, size_t file_size) +{ + if (file_size < 
FILE_HEADER_PLACEHOLDER_SIZE) { + PyErr_SetString(PyExc_ValueError, "File too small for header"); + return -1; + } + + /* Use memcpy to avoid strict aliasing violations and unaligned access */ + uint32_t magic; + uint32_t version; + memcpy(&magic, &data[0], sizeof(magic)); + memcpy(&version, &data[4], sizeof(version)); + + if (magic != BINARY_FORMAT_MAGIC) { + PyErr_Format(PyExc_ValueError, "Invalid magic number: 0x%08x", magic); + return -1; + } + + if (version != BINARY_FORMAT_VERSION) { + PyErr_Format(PyExc_ValueError, "Unsupported version: %u", version); + return -1; + } + + memcpy(&reader->start_time_us, &data[8], sizeof(reader->start_time_us)); + memcpy(&reader->sample_interval_us, &data[16], sizeof(reader->sample_interval_us)); + memcpy(&reader->sample_count, &data[24], sizeof(reader->sample_count)); + memcpy(&reader->thread_count, &data[28], sizeof(reader->thread_count)); + memcpy(&reader->string_table_offset, &data[32], sizeof(reader->string_table_offset)); + memcpy(&reader->frame_table_offset, &data[40], sizeof(reader->frame_table_offset)); + memcpy(&reader->compression_type, &data[48], sizeof(reader->compression_type)); + + return 0; +} + +/* Parse the file footer */ +static inline int +reader_parse_footer(BinaryReader *reader, const uint8_t *data, size_t file_size) +{ + if (file_size < FILE_FOOTER_SIZE) { + PyErr_SetString(PyExc_ValueError, "File too small for footer"); + return -1; + } + + const uint8_t *footer = data + file_size - FILE_FOOTER_SIZE; + /* Use memcpy to avoid strict aliasing violations */ + memcpy(&reader->strings_count, &footer[0], sizeof(reader->strings_count)); + memcpy(&reader->frames_count, &footer[4], sizeof(reader->frames_count)); + + return 0; +} + +#ifdef HAVE_ZSTD +/* Maximum decompression buffer size to prevent memory exhaustion (1GB) */ +#define MAX_DECOMPRESS_SIZE (1ULL << 30) + +/* Decompress zstd-compressed sample data */ +static inline int +reader_decompress_samples(BinaryReader *reader, const uint8_t *data) +{ + size_t 
compressed_size = reader->string_table_offset - FILE_HEADER_PLACEHOLDER_SIZE; + const uint8_t *compressed_data = data + FILE_HEADER_PLACEHOLDER_SIZE; + + /* Validate compressed data region */ + if (reader->string_table_offset < FILE_HEADER_PLACEHOLDER_SIZE) { + PyErr_SetString(PyExc_ValueError, "Invalid string table offset"); + return -1; + } + + ZSTD_DCtx *dctx = ZSTD_createDCtx(); + if (!dctx) { + PyErr_SetString(PyExc_MemoryError, "Failed to create zstd decompression context"); + return -1; + } + + /* Try to get exact decompressed size from frame header for optimal allocation */ + unsigned long long frame_content_size = ZSTD_getFrameContentSize(compressed_data, compressed_size); + size_t alloc_size; + + if (frame_content_size == ZSTD_CONTENTSIZE_ERROR) { + /* Corrupted frame header - fail early */ + ZSTD_freeDCtx(dctx); + PyErr_SetString(PyExc_ValueError, "Corrupted zstd frame header"); + return -1; + } else if (frame_content_size != ZSTD_CONTENTSIZE_UNKNOWN && + frame_content_size <= SIZE_MAX && + frame_content_size <= MAX_DECOMPRESS_SIZE) { + alloc_size = (size_t)frame_content_size; + } else { + alloc_size = ZSTD_DStreamOutSize() * 4; + if (alloc_size < MIN_DECOMPRESS_BUFFER_SIZE) { + alloc_size = MIN_DECOMPRESS_BUFFER_SIZE; + } + } + + reader->decompressed_data = PyMem_Malloc(alloc_size); + if (!reader->decompressed_data) { + ZSTD_freeDCtx(dctx); + PyErr_NoMemory(); + return -1; + } + + ZSTD_inBuffer input = { compressed_data, compressed_size, 0 }; + size_t total_output = 0; + size_t last_result = 0; + + while (input.pos < input.size) { + if (total_output >= alloc_size) { + /* Check for overflow before doubling */ + if (alloc_size > SIZE_MAX / 2 || alloc_size * 2 > MAX_DECOMPRESS_SIZE) { + PyMem_Free(reader->decompressed_data); + reader->decompressed_data = NULL; + ZSTD_freeDCtx(dctx); + PyErr_SetString(PyExc_MemoryError, "Decompressed data exceeds maximum size"); + return -1; + } + size_t new_size = alloc_size * 2; + uint8_t *new_buf = 
PyMem_Realloc(reader->decompressed_data, new_size); + if (!new_buf) { + PyMem_Free(reader->decompressed_data); + reader->decompressed_data = NULL; + ZSTD_freeDCtx(dctx); + PyErr_NoMemory(); + return -1; + } + reader->decompressed_data = new_buf; + alloc_size = new_size; + } + + ZSTD_outBuffer output = { + reader->decompressed_data + total_output, + alloc_size - total_output, + 0 + }; + + last_result = ZSTD_decompressStream(dctx, &output, &input); + if (ZSTD_isError(last_result)) { + PyMem_Free(reader->decompressed_data); + reader->decompressed_data = NULL; + ZSTD_freeDCtx(dctx); + PyErr_Format(PyExc_ValueError, "zstd decompression error: %s", + ZSTD_getErrorName(last_result)); + return -1; + } + + total_output += output.pos; + } + + /* Verify decompression is complete (last_result == 0 means frame is complete) */ + if (last_result != 0) { + PyMem_Free(reader->decompressed_data); + reader->decompressed_data = NULL; + ZSTD_freeDCtx(dctx); + PyErr_SetString(PyExc_ValueError, "Incomplete zstd frame: data may be truncated"); + return -1; + } + + ZSTD_freeDCtx(dctx); + reader->decompressed_size = total_output; + reader->sample_data = reader->decompressed_data; + reader->sample_data_size = reader->decompressed_size; + + return 0; +} +#endif + +/* Parse the string table into Python unicode objects */ +static inline int +reader_parse_string_table(BinaryReader *reader, const uint8_t *data, size_t file_size) +{ + reader->strings = PyMem_Calloc(reader->strings_count, sizeof(PyObject *)); + if (!reader->strings && reader->strings_count > 0) { + PyErr_NoMemory(); + return -1; + } + + size_t offset = reader->string_table_offset; + for (uint32_t i = 0; i < reader->strings_count; i++) { + size_t prev_offset = offset; + uint32_t str_len = decode_varint_u32(data, &offset, file_size); + if (offset == prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint in string table"); + return -1; + } + if (offset + str_len > file_size) { + PyErr_SetString(PyExc_ValueError, "String 
table overflow"); + return -1; + } + + reader->strings[i] = PyUnicode_DecodeUTF8((char *)&data[offset], str_len, "replace"); + if (!reader->strings[i]) { + return -1; + } + offset += str_len; + } + + return 0; +} + +/* Parse the frame table (function_id, filename_id, lineno for each frame) */ +static inline int +reader_parse_frame_table(BinaryReader *reader, const uint8_t *data, size_t file_size) +{ + /* Check for integer overflow in allocation size calculation. + Only needed on 32-bit where SIZE_MAX can be exceeded by uint32_t * 12. */ +#if SIZEOF_SIZE_T < 8 + if (reader->frames_count > SIZE_MAX / (3 * sizeof(uint32_t))) { + PyErr_SetString(PyExc_OverflowError, "Frame count too large for allocation"); + return -1; + } +#endif + + size_t alloc_size = (size_t)reader->frames_count * 3 * sizeof(uint32_t); + reader->frame_data = PyMem_Malloc(alloc_size); + if (!reader->frame_data && reader->frames_count > 0) { + PyErr_NoMemory(); + return -1; + } + + size_t offset = reader->frame_table_offset; + for (uint32_t i = 0; i < reader->frames_count; i++) { + size_t base = (size_t)i * 3; + size_t prev_offset; + + prev_offset = offset; + reader->frame_data[base] = decode_varint_u32(data, &offset, file_size); + if (offset == prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint in frame table (filename)"); + return -1; + } + + prev_offset = offset; + reader->frame_data[base + 1] = decode_varint_u32(data, &offset, file_size); + if (offset == prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint in frame table (funcname)"); + return -1; + } + + prev_offset = offset; + reader->frame_data[base + 2] = (uint32_t)decode_varint_i32(data, &offset, file_size); + if (offset == prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint in frame table (lineno)"); + return -1; + } + } + + return 0; +} + +BinaryReader * +binary_reader_open(const char *filename) +{ + BinaryReader *reader = PyMem_Calloc(1, sizeof(BinaryReader)); + if (!reader) { + 
PyErr_NoMemory(); + return NULL; + } + +#if USE_MMAP + reader->fd = -1; /* Explicit initialization for cleanup safety */ +#endif + + reader->filename = PyMem_Malloc(strlen(filename) + 1); + if (!reader->filename) { + PyMem_Free(reader); + PyErr_NoMemory(); + return NULL; + } + strcpy(reader->filename, filename); + +#if USE_MMAP + /* Open with mmap on Unix */ + reader->fd = open(filename, O_RDONLY); + if (reader->fd < 0) { + PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename); + goto error; + } + + struct stat st; + if (fstat(reader->fd, &st) < 0) { + PyErr_SetFromErrno(PyExc_IOError); + goto error; + } + reader->mapped_size = st.st_size; + + /* Map the file into memory. + * MAP_POPULATE (Linux-only) pre-faults all pages at mmap time, which: + * - Catches issues (e.g., file truncation) immediately rather than as SIGBUS during reads + * - Eliminates page faults during subsequent reads for better performance + */ +#ifdef __linux__ + reader->mapped_data = mmap(NULL, reader->mapped_size, PROT_READ, + MAP_PRIVATE | MAP_POPULATE, reader->fd, 0); +#else + reader->mapped_data = mmap(NULL, reader->mapped_size, PROT_READ, + MAP_PRIVATE, reader->fd, 0); +#endif + if (reader->mapped_data == MAP_FAILED) { + reader->mapped_data = NULL; + PyErr_SetFromErrno(PyExc_IOError); + goto error; + } + + /* Hint sequential access pattern - failures are non-fatal */ + (void)madvise(reader->mapped_data, reader->mapped_size, MADV_SEQUENTIAL); + + /* Pre-fetch pages into memory - failures are non-fatal. + * Complements MAP_POPULATE on Linux, provides benefit on macOS. */ + (void)madvise(reader->mapped_data, reader->mapped_size, MADV_WILLNEED); + + /* Use transparent huge pages for large files to reduce TLB misses. + * Only beneficial for files >= 32MB where TLB pressure matters. 
*/ +#ifdef MADV_HUGEPAGE + if (reader->mapped_size >= (32 * 1024 * 1024)) { + (void)madvise(reader->mapped_data, reader->mapped_size, MADV_HUGEPAGE); + } +#endif + + /* Add file descriptor-level hints for better kernel I/O scheduling */ +#if defined(__linux__) && defined(POSIX_FADV_SEQUENTIAL) + (void)posix_fadvise(reader->fd, 0, 0, POSIX_FADV_SEQUENTIAL); + if (reader->mapped_size > (64 * 1024 * 1024)) { + (void)posix_fadvise(reader->fd, 0, 0, POSIX_FADV_WILLNEED); + } +#endif + + uint8_t *data = reader->mapped_data; + size_t file_size = reader->mapped_size; +#else + /* Use stdio on Windows */ + reader->fp = fopen(filename, "rb"); + if (!reader->fp) { + PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename); + goto error; + } + + if (FSEEK64(reader->fp, 0, SEEK_END) != 0) { + PyErr_SetFromErrno(PyExc_IOError); + goto error; + } + file_offset_t file_size_off = FTELL64(reader->fp); + if (file_size_off < 0) { + PyErr_SetFromErrno(PyExc_IOError); + goto error; + } + reader->file_size = (size_t)file_size_off; + if (FSEEK64(reader->fp, 0, SEEK_SET) != 0) { + PyErr_SetFromErrno(PyExc_IOError); + goto error; + } + + reader->file_data = PyMem_Malloc(reader->file_size); + if (!reader->file_data) { + PyErr_NoMemory(); + goto error; + } + + if (fread(reader->file_data, 1, reader->file_size, reader->fp) != reader->file_size) { + PyErr_SetFromErrno(PyExc_IOError); + goto error; + } + + uint8_t *data = reader->file_data; + size_t file_size = reader->file_size; +#endif + + /* Parse header and footer */ + if (reader_parse_header(reader, data, file_size) < 0) { + goto error; + } + if (reader_parse_footer(reader, data, file_size) < 0) { + goto error; + } + + /* Validate table offsets are within file bounds */ + if (reader->string_table_offset > file_size) { + PyErr_Format(PyExc_ValueError, + "Invalid string table offset: %llu exceeds file size %zu", + (unsigned long long)reader->string_table_offset, file_size); + goto error; + } + if (reader->frame_table_offset > file_size) { + 
PyErr_Format(PyExc_ValueError, + "Invalid frame table offset: %llu exceeds file size %zu", + (unsigned long long)reader->frame_table_offset, file_size); + goto error; + } + if (reader->string_table_offset < FILE_HEADER_PLACEHOLDER_SIZE) { + PyErr_Format(PyExc_ValueError, + "Invalid string table offset: %llu is before data section", + (unsigned long long)reader->string_table_offset); + goto error; + } + if (reader->frame_table_offset < FILE_HEADER_PLACEHOLDER_SIZE) { + PyErr_Format(PyExc_ValueError, + "Invalid frame table offset: %llu is before data section", + (unsigned long long)reader->frame_table_offset); + goto error; + } + if (reader->string_table_offset > reader->frame_table_offset) { + PyErr_Format(PyExc_ValueError, + "Invalid table offsets: string table (%llu) is after frame table (%llu)", + (unsigned long long)reader->string_table_offset, + (unsigned long long)reader->frame_table_offset); + goto error; + } + + /* Handle compressed data */ + if (reader->compression_type == COMPRESSION_ZSTD) { +#ifdef HAVE_ZSTD + if (reader_decompress_samples(reader, data) < 0) { + goto error; + } +#else + PyErr_SetString(PyExc_RuntimeError, + "File uses zstd compression but zstd support not compiled in"); + goto error; +#endif + } else { + /* Uncompressed data */ + reader->sample_data = data + FILE_HEADER_PLACEHOLDER_SIZE; + reader->sample_data_size = reader->string_table_offset - FILE_HEADER_PLACEHOLDER_SIZE; + } + + /* Parse string and frame tables */ + if (reader_parse_string_table(reader, data, file_size) < 0) { + goto error; + } + if (reader_parse_frame_table(reader, data, file_size) < 0) { + goto error; + } + + return reader; + +error: + binary_reader_close(reader); + return NULL; +} + +/* Get or create reader thread state for stack reconstruction */ +static ReaderThreadState * +reader_get_or_create_thread_state(BinaryReader *reader, uint64_t thread_id, + uint32_t interpreter_id) +{ + /* Search existing threads (key is thread_id + interpreter_id) */ + for (size_t i = 
0; i < reader->thread_state_count; i++) { + if (reader->thread_states[i].thread_id == thread_id && + reader->thread_states[i].interpreter_id == interpreter_id) { + return &reader->thread_states[i]; + } + } + + /* Initial allocation or growth */ + if (!reader->thread_states) { + reader->thread_state_capacity = 16; + reader->thread_states = PyMem_Calloc(reader->thread_state_capacity, sizeof(ReaderThreadState)); + if (!reader->thread_states) { + PyErr_NoMemory(); + return NULL; + } + } else if (reader->thread_state_count >= reader->thread_state_capacity) { + reader->thread_states = grow_array(reader->thread_states, + &reader->thread_state_capacity, + sizeof(ReaderThreadState)); + if (!reader->thread_states) { + return NULL; + } + } + + /* Initialize new thread state */ + ReaderThreadState *ts = &reader->thread_states[reader->thread_state_count++]; + memset(ts, 0, sizeof(ReaderThreadState)); + ts->thread_id = thread_id; + ts->interpreter_id = interpreter_id; + ts->prev_timestamp = reader->start_time_us; + ts->current_stack_capacity = MAX_STACK_DEPTH; + ts->current_stack = PyMem_Malloc(ts->current_stack_capacity * sizeof(uint32_t)); + if (!ts->current_stack) { + PyErr_NoMemory(); + return NULL; + } + return ts; +} + +/* ============================================================================ + * STACK DECODING HELPERS + * ============================================================================ */ + +/* Decode a full stack from sample data. + * Updates ts->current_stack and ts->current_stack_depth. + * Returns 0 on success, -1 on error (bounds violation). 
*/ +static inline int +decode_stack_full(ReaderThreadState *ts, const uint8_t *data, + size_t *offset, size_t max_size) +{ + uint32_t depth = decode_varint_u32(data, offset, max_size); + + /* Validate depth against capacity to prevent buffer overflow */ + if (depth > ts->current_stack_capacity) { + PyErr_Format(PyExc_ValueError, + "Stack depth %u exceeds capacity %zu", depth, ts->current_stack_capacity); + return -1; + } + + ts->current_stack_depth = depth; + for (uint32_t i = 0; i < depth; i++) { + ts->current_stack[i] = decode_varint_u32(data, offset, max_size); + } + return 0; +} + +/* Decode a suffix-encoded stack from sample data. + * The suffix encoding shares frames from the bottom of the previous stack. + * Returns 0 on success, -1 on error (bounds violation). */ +static inline int +decode_stack_suffix(ReaderThreadState *ts, const uint8_t *data, + size_t *offset, size_t max_size) +{ + uint32_t shared = decode_varint_u32(data, offset, max_size); + uint32_t new_count = decode_varint_u32(data, offset, max_size); + + /* Validate shared doesn't exceed current stack depth */ + if (shared > ts->current_stack_depth) { + PyErr_Format(PyExc_ValueError, + "Shared count %u exceeds current stack depth %zu", + shared, ts->current_stack_depth); + return -1; + } + + /* Validate final depth doesn't exceed capacity */ + size_t final_depth = (size_t)shared + new_count; + if (final_depth > ts->current_stack_capacity) { + PyErr_Format(PyExc_ValueError, + "Final stack depth %zu exceeds capacity %zu", + final_depth, ts->current_stack_capacity); + return -1; + } + + /* Move shared frames to make room for new frames at the top */ + if (new_count > 0 && shared > 0) { + size_t prev_shared_start = ts->current_stack_depth - shared; + memmove(&ts->current_stack[new_count], + &ts->current_stack[prev_shared_start], + shared * sizeof(uint32_t)); + } + + /* Read new frames (at top of stack) */ + for (uint32_t i = 0; i < new_count; i++) { + ts->current_stack[i] = decode_varint_u32(data, 
offset, max_size); + } + ts->current_stack_depth = final_depth; + return 0; +} + +/* Decode a pop-push encoded stack from sample data. + * Pops frames from the top and pushes new frames. + * Returns 0 on success, -1 on error (bounds violation). */ +static inline int +decode_stack_pop_push(ReaderThreadState *ts, const uint8_t *data, + size_t *offset, size_t max_size) +{ + uint32_t pop = decode_varint_u32(data, offset, max_size); + uint32_t push = decode_varint_u32(data, offset, max_size); + size_t keep = (ts->current_stack_depth > pop) ? ts->current_stack_depth - pop : 0; + + /* Validate final depth doesn't exceed capacity */ + size_t final_depth = keep + push; + if (final_depth > ts->current_stack_capacity) { + PyErr_Format(PyExc_ValueError, + "Final stack depth %zu exceeds capacity %zu", + final_depth, ts->current_stack_capacity); + return -1; + } + + /* Move kept frames (from bottom of stack) to make room for new frames. + * Even when push == 0, we need to move kept frames to index 0 if pop > 0. 
*/ + if (keep > 0) { + memmove(&ts->current_stack[push], + &ts->current_stack[pop], + keep * sizeof(uint32_t)); + } + + /* Read new frames (at top of stack) */ + for (uint32_t i = 0; i < push; i++) { + ts->current_stack[i] = decode_varint_u32(data, offset, max_size); + } + ts->current_stack_depth = final_depth; + return 0; +} + +/* Build a Python list of FrameInfo objects from frame indices */ +static PyObject * +build_frame_list(RemoteDebuggingState *state, BinaryReader *reader, + const uint32_t *frame_indices, size_t stack_depth) +{ + PyObject *frame_list = PyList_New(stack_depth); + if (!frame_list) { + return NULL; + } + + for (size_t k = 0; k < stack_depth; k++) { + uint32_t frame_idx = frame_indices[k]; + if (frame_idx >= reader->frames_count) { + PyErr_Format(PyExc_ValueError, "Invalid frame index: %u", frame_idx); + goto error; + } + + size_t base = frame_idx * 3; + uint32_t filename_idx = reader->frame_data[base]; + uint32_t funcname_idx = reader->frame_data[base + 1]; + int32_t lineno = (int32_t)reader->frame_data[base + 2]; + + if (filename_idx >= reader->strings_count || + funcname_idx >= reader->strings_count) { + PyErr_SetString(PyExc_ValueError, "Invalid string index in frame"); + goto error; + } + + PyObject *frame_info = PyStructSequence_New(state->FrameInfo_Type); + if (!frame_info) { + goto error; + } + + PyObject *location; + if (lineno > 0) { + location = Py_BuildValue("(iiii)", lineno, lineno, 0, 0); + if (!location) { + Py_DECREF(frame_info); + goto error; + } + } + else { + location = Py_NewRef(Py_None); + } + + PyStructSequence_SetItem(frame_info, 0, Py_NewRef(reader->strings[filename_idx])); + PyStructSequence_SetItem(frame_info, 1, location); + PyStructSequence_SetItem(frame_info, 2, Py_NewRef(reader->strings[funcname_idx])); + PyStructSequence_SetItem(frame_info, 3, Py_NewRef(Py_None)); + PyList_SET_ITEM(frame_list, k, frame_info); + } + + return frame_list; + +error: + Py_DECREF(frame_list); + return NULL; +} + +/* Helper to build and 
emit a sample to the collector */ +static int +emit_sample(RemoteDebuggingState *state, PyObject *collector, + uint64_t thread_id, uint32_t interpreter_id, uint8_t status, + const uint32_t *frame_indices, size_t stack_depth, + BinaryReader *reader, uint64_t timestamp_us) +{ + PyObject *frame_list = NULL, *thread_info = NULL, *thread_list = NULL; + PyObject *interp_info = NULL, *sample_list = NULL, *result = NULL; + int ret = -1; + + frame_list = build_frame_list(state, reader, frame_indices, stack_depth); + if (!frame_list) { + goto error; + } + + thread_info = PyStructSequence_New(state->ThreadInfo_Type); + if (!thread_info) { + goto error; + } + PyObject *tid = PyLong_FromUnsignedLongLong(thread_id); + if (!tid) { + goto error; + } + PyObject *st = PyLong_FromLong(status); + if (!st) { + Py_DECREF(tid); + goto error; + } + PyStructSequence_SetItem(thread_info, 0, tid); + PyStructSequence_SetItem(thread_info, 1, st); + PyStructSequence_SetItem(thread_info, 2, frame_list); + frame_list = NULL; /* ownership transferred */ + + thread_list = PyList_New(1); + if (!thread_list) { + goto error; + } + PyList_SET_ITEM(thread_list, 0, thread_info); + thread_info = NULL; + + interp_info = PyStructSequence_New(state->InterpreterInfo_Type); + if (!interp_info) { + goto error; + } + PyObject *iid = PyLong_FromUnsignedLong(interpreter_id); + if (!iid) { + goto error; + } + PyStructSequence_SetItem(interp_info, 0, iid); + PyStructSequence_SetItem(interp_info, 1, thread_list); + thread_list = NULL; + + sample_list = PyList_New(1); + if (!sample_list) { + goto error; + } + PyList_SET_ITEM(sample_list, 0, interp_info); + interp_info = NULL; + + /* Pass timestamp_us to collector - collectors use it if provided */ + PyObject *timestamp_obj = PyLong_FromUnsignedLongLong(timestamp_us); + if (!timestamp_obj) { + goto error; + } + result = PyObject_CallMethod(collector, "collect", "OO", sample_list, timestamp_obj); + Py_DECREF(timestamp_obj); + if (result) { + ret = 0; + } + +error: + 
Py_XDECREF(result); + Py_XDECREF(sample_list); + Py_XDECREF(interp_info); + Py_XDECREF(thread_list); + Py_XDECREF(thread_info); + Py_XDECREF(frame_list); + return ret; +} + +/* Helper to invoke progress callback, clearing any errors */ +static inline void +invoke_progress_callback(PyObject *callback, Py_ssize_t current, uint32_t total) +{ + if (callback && callback != Py_None) { + PyObject *result = PyObject_CallFunction(callback, "nI", current, total); + if (result) { + Py_DECREF(result); + } else { + PyErr_Clear(); + } + } +} + +Py_ssize_t +binary_reader_replay(BinaryReader *reader, PyObject *collector, PyObject *progress_callback) +{ + if (!PyObject_HasAttrString(collector, "collect")) { + PyErr_SetString(PyExc_TypeError, "Collector must have a collect() method"); + return -1; + } + + /* Get module state for struct sequence types */ + PyObject *module = PyImport_ImportModule("_remote_debugging"); + if (!module) { + return -1; + } + RemoteDebuggingState *state = RemoteDebugging_GetState(module); + Py_DECREF(module); + + if (!state) { + PyErr_SetString(PyExc_RuntimeError, "Failed to get module state"); + return -1; + } + + size_t offset = 0; + Py_ssize_t replayed = 0; + + /* Initial progress callback at 0% */ + invoke_progress_callback(progress_callback, 0, reader->sample_count); + + while (offset < reader->sample_data_size) { + /* Read thread_id (8 bytes) + interpreter_id (4 bytes) */ + if (offset + 13 > reader->sample_data_size) { + break; /* End of data */ + } + + /* Use memcpy to avoid strict aliasing violations */ + uint64_t thread_id; + uint32_t interpreter_id; + memcpy(&thread_id, &reader->sample_data[offset], sizeof(thread_id)); + offset += 8; + + memcpy(&interpreter_id, &reader->sample_data[offset], sizeof(interpreter_id)); + offset += 4; + + /* Get or create thread state for reconstruction */ + ReaderThreadState *ts = reader_get_or_create_thread_state(reader, thread_id, interpreter_id); + if (!ts) { + return -1; + } + + /* Read encoding byte */ + uint8_t 
encoding = reader->sample_data[offset++]; + + switch (encoding) { + case STACK_REPEAT: { + /* RLE repeat: [count: varint] [delta: varint, status: 1]... */ + size_t prev_offset = offset; + uint32_t count = decode_varint_u32(reader->sample_data, &offset, reader->sample_data_size); + /* Detect varint decode failure */ + if (offset == prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint for RLE count"); + return -1; + } + + /* Validate RLE count to prevent DoS from malicious files. + * Each RLE sample needs at least 2 bytes (1 byte min varint + 1 status byte). + * Also reject absurdly large counts that would exhaust memory. */ + size_t remaining_data = reader->sample_data_size - offset; + size_t max_possible_samples = remaining_data / 2; + if (count > max_possible_samples) { + PyErr_Format(PyExc_ValueError, + "Invalid RLE count %u exceeds maximum possible %zu for remaining data", + count, max_possible_samples); + return -1; + } + + reader->stats.repeat_records++; + reader->stats.repeat_samples += count; + + for (uint32_t i = 0; i < count; i++) { + size_t delta_prev_offset = offset; + uint64_t delta = decode_varint_u64(reader->sample_data, &offset, reader->sample_data_size); + /* Detect varint decode failure: offset unchanged means error (overflow or truncated) */ + if (offset == delta_prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint in RLE sample data"); + return -1; + } + if (offset >= reader->sample_data_size) { + PyErr_SetString(PyExc_ValueError, "Unexpected end of sample data in RLE"); + return -1; + } + uint8_t status = reader->sample_data[offset++]; + + ts->prev_timestamp += delta; + + /* Emit sample using cached stack */ + if (emit_sample(state, collector, thread_id, interpreter_id, status, + ts->current_stack, ts->current_stack_depth, reader, + ts->prev_timestamp) < 0) { + return -1; + } + replayed++; + reader->stats.total_samples++; + + /* Progress callback inside RLE loop for smooth updates */ + if (replayed % 
PROGRESS_CALLBACK_INTERVAL == 0) { + invoke_progress_callback(progress_callback, replayed, reader->sample_count); + } + } + break; + } + + case STACK_FULL: + case STACK_SUFFIX: + case STACK_POP_PUSH: { + /* All three encodings share: [delta: varint] [status: 1] ... */ + size_t prev_offset = offset; + uint64_t delta = decode_varint_u64(reader->sample_data, &offset, reader->sample_data_size); + /* Detect varint decode failure: offset unchanged means error */ + if (offset == prev_offset) { + PyErr_SetString(PyExc_ValueError, "Malformed varint in sample data"); + return -1; + } + if (offset >= reader->sample_data_size) { + PyErr_SetString(PyExc_ValueError, "Unexpected end of sample data"); + return -1; + } + uint8_t status = reader->sample_data[offset++]; + ts->prev_timestamp += delta; + + if (encoding == STACK_FULL) { + if (decode_stack_full(ts, reader->sample_data, &offset, reader->sample_data_size) < 0) { + return -1; + } + reader->stats.full_records++; + } else if (encoding == STACK_SUFFIX) { + if (decode_stack_suffix(ts, reader->sample_data, &offset, reader->sample_data_size) < 0) { + return -1; + } + reader->stats.suffix_records++; + } else { /* STACK_POP_PUSH */ + if (decode_stack_pop_push(ts, reader->sample_data, &offset, reader->sample_data_size) < 0) { + return -1; + } + reader->stats.pop_push_records++; + } + reader->stats.stack_reconstructions++; + + if (emit_sample(state, collector, thread_id, interpreter_id, status, + ts->current_stack, ts->current_stack_depth, reader, + ts->prev_timestamp) < 0) { + return -1; + } + replayed++; + reader->stats.total_samples++; + break; + } + + default: + PyErr_Format(PyExc_ValueError, "Unknown stack encoding: %u", encoding); + return -1; + } + + /* Progress callback */ + if (replayed % PROGRESS_CALLBACK_INTERVAL == 0) { + invoke_progress_callback(progress_callback, replayed, reader->sample_count); + } + } + + /* Final progress callback at 100% */ + invoke_progress_callback(progress_callback, replayed, 
reader->sample_count); + + return replayed; +} + +PyObject * +binary_reader_get_info(BinaryReader *reader) +{ + return Py_BuildValue( + "{s:I, s:K, s:K, s:I, s:I, s:I, s:I, s:i}", + "version", BINARY_FORMAT_VERSION, + "start_time_us", reader->start_time_us, + "sample_interval_us", reader->sample_interval_us, + "sample_count", reader->sample_count, + "thread_count", reader->thread_count, + "string_count", reader->strings_count, + "frame_count", reader->frames_count, + "compression_type", reader->compression_type + ); +} + +PyObject * +binary_writer_get_stats(BinaryWriter *writer) +{ + BinaryWriterStats *s = &writer->stats; + + /* Calculate derived stats */ + uint64_t total_records = s->repeat_records + s->full_records + + s->suffix_records + s->pop_push_records; + uint64_t total_samples = writer->total_samples; + uint64_t potential_frames = s->total_frames_written + s->frames_saved; + double compression_ratio = (potential_frames > 0) ? + (double)s->frames_saved / potential_frames * 100.0 : 0.0; + + return Py_BuildValue( + "{s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:d}", + "repeat_records", s->repeat_records, + "repeat_samples", s->repeat_samples, + "full_records", s->full_records, + "suffix_records", s->suffix_records, + "pop_push_records", s->pop_push_records, + "total_records", total_records, + "total_samples", total_samples, + "total_frames_written", s->total_frames_written, + "frames_saved", s->frames_saved, + "bytes_written", s->bytes_written, + "frame_compression_pct", compression_ratio + ); +} + +PyObject * +binary_reader_get_stats(BinaryReader *reader) +{ + BinaryReaderStats *s = &reader->stats; + + uint64_t total_records = s->repeat_records + s->full_records + + s->suffix_records + s->pop_push_records; + + return Py_BuildValue( + "{s:K, s:K, s:K, s:K, s:K, s:K, s:K, s:K}", + "repeat_records", s->repeat_records, + "repeat_samples", s->repeat_samples, + "full_records", s->full_records, + "suffix_records", s->suffix_records, + "pop_push_records", 
s->pop_push_records, + "total_records", total_records, + "total_samples", s->total_samples, + "stack_reconstructions", s->stack_reconstructions + ); +} + +void +binary_reader_close(BinaryReader *reader) +{ + if (!reader) { + return; + } + + PyMem_Free(reader->filename); + +#if USE_MMAP + if (reader->mapped_data) { + munmap(reader->mapped_data, reader->mapped_size); + reader->mapped_data = NULL; /* Prevent use-after-free */ + reader->mapped_size = 0; + } + if (reader->fd >= 0) { + close(reader->fd); + reader->fd = -1; /* Mark as closed */ + } +#else + if (reader->fp) { + fclose(reader->fp); + reader->fp = NULL; + } + if (reader->file_data) { + PyMem_Free(reader->file_data); + reader->file_data = NULL; + reader->file_size = 0; + } +#endif + + PyMem_Free(reader->decompressed_data); + + if (reader->strings) { + for (uint32_t i = 0; i < reader->strings_count; i++) { + Py_XDECREF(reader->strings[i]); + } + PyMem_Free(reader->strings); + } + + PyMem_Free(reader->frame_data); + + /* Free per-thread reconstruction state */ + if (reader->thread_states) { + for (size_t i = 0; i < reader->thread_state_count; i++) { + PyMem_Free(reader->thread_states[i].current_stack); + } + PyMem_Free(reader->thread_states); + } + + PyMem_Free(reader); +} From e3ea7a4e517516c0ceec6060dacd9f59b5426a39 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:48:06 +0000 Subject: [PATCH 04/17] Build: add binary I/O files to Unix build Adds binary_io_writer.c and binary_io_reader.c to the _remote_debugging module compilation. Also hooks up optional zstd support: when libzstd is found by pkg-config, the module compiles with HAVE_ZSTD defined and links against libzstd. Without zstd, the module still builds but compression is unavailable. 
--- Modules/Setup.stdlib.in | 2 +- configure | 22 ++++++++++++++++++++-- configure.ac | 14 +++++++++++++- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in index 1be83b455261ea..f450cad6fb33ad 100644 --- a/Modules/Setup.stdlib.in +++ b/Modules/Setup.stdlib.in @@ -41,7 +41,7 @@ @MODULE__PICKLE_TRUE@_pickle _pickle.c @MODULE__QUEUE_TRUE@_queue _queuemodule.c @MODULE__RANDOM_TRUE@_random _randommodule.c -@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c +@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c _remote_debugging/binary_io_writer.c _remote_debugging/binary_io_reader.c @MODULE__STRUCT_TRUE@_struct _struct.c # build supports subinterpreters diff --git a/configure b/configure index a1bc7991aa8dc2..b1faeaf806a9c6 100755 --- a/configure +++ b/configure @@ -858,6 +858,8 @@ HAVE_GETHOSTBYNAME_R_3_ARG HAVE_GETHOSTBYNAME_R_5_ARG HAVE_GETHOSTBYNAME_R_6_ARG LIBOBJS +REMOTE_DEBUGGING_LIBS +REMOTE_DEBUGGING_CFLAGS LIBZSTD_LIBS LIBZSTD_CFLAGS LIBLZMA_LIBS @@ -23023,6 +23025,22 @@ printf "%s\n" "yes" >&6; } have_libzstd=yes fi +if test "x$have_libzstd" = xyes +then : + + REMOTE_DEBUGGING_CFLAGS="-DHAVE_ZSTD $LIBZSTD_CFLAGS" + REMOTE_DEBUGGING_LIBS="$LIBZSTD_LIBS" + +else case e in #( + e) + REMOTE_DEBUGGING_CFLAGS="" + REMOTE_DEBUGGING_LIBS="" + ;; +esac +fi + + + @@ -31644,8 +31662,8 @@ fi if test "x$py_cv_module__remote_debugging" = xyes then : - - + as_fn_append MODULE_BLOCK "MODULE__REMOTE_DEBUGGING_CFLAGS=$REMOTE_DEBUGGING_CFLAGS$as_nl" + as_fn_append MODULE_BLOCK 
"MODULE__REMOTE_DEBUGGING_LDFLAGS=$REMOTE_DEBUGGING_LIBS$as_nl" fi diff --git a/configure.ac b/configure.ac index a284a118f0296f..043ec957f40894 100644 --- a/configure.ac +++ b/configure.ac @@ -5529,6 +5529,18 @@ PKG_CHECK_MODULES([LIBZSTD], [libzstd >= 1.4.5], [have_libzstd=yes], [ ]) ]) +dnl _remote_debugging module: optional zstd compression support +dnl The module always builds, but zstd compression is only available when libzstd is found +AS_VAR_IF([have_libzstd], [yes], [ + REMOTE_DEBUGGING_CFLAGS="-DHAVE_ZSTD $LIBZSTD_CFLAGS" + REMOTE_DEBUGGING_LIBS="$LIBZSTD_LIBS" +], [ + REMOTE_DEBUGGING_CFLAGS="" + REMOTE_DEBUGGING_LIBS="" +]) +AC_SUBST([REMOTE_DEBUGGING_CFLAGS]) +AC_SUBST([REMOTE_DEBUGGING_LIBS]) + dnl PY_CHECK_NETDB_FUNC(FUNCTION) AC_DEFUN([PY_CHECK_NETDB_FUNC], [PY_CHECK_FUNC([$1], [@%:@include ])]) @@ -7911,7 +7923,7 @@ PY_STDLIB_MOD_SIMPLE([_pickle]) PY_STDLIB_MOD_SIMPLE([_posixsubprocess]) PY_STDLIB_MOD_SIMPLE([_queue]) PY_STDLIB_MOD_SIMPLE([_random]) -PY_STDLIB_MOD_SIMPLE([_remote_debugging]) +PY_STDLIB_MOD_SIMPLE([_remote_debugging], [$REMOTE_DEBUGGING_CFLAGS], [$REMOTE_DEBUGGING_LIBS]) PY_STDLIB_MOD_SIMPLE([select]) PY_STDLIB_MOD_SIMPLE([_struct]) PY_STDLIB_MOD_SIMPLE([_types]) From 8931b4ae5591cb62306423019542abc7c1f68b10 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:48:18 +0000 Subject: [PATCH 05/17] Build: add binary I/O files to Windows build Adds binary_io_writer.c, binary_io_reader.c, and binary_io.h to the Visual Studio project for _remote_debugging. 
--- PCbuild/_remote_debugging.vcxproj | 3 +++ PCbuild/_remote_debugging.vcxproj.filters | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/PCbuild/_remote_debugging.vcxproj b/PCbuild/_remote_debugging.vcxproj index c91c9cf3652363..a7a89e9eac5104 100644 --- a/PCbuild/_remote_debugging.vcxproj +++ b/PCbuild/_remote_debugging.vcxproj @@ -105,9 +105,12 @@ + + + diff --git a/PCbuild/_remote_debugging.vcxproj.filters b/PCbuild/_remote_debugging.vcxproj.filters index b37a2c5575c9f5..a282e5d1275f45 100644 --- a/PCbuild/_remote_debugging.vcxproj.filters +++ b/PCbuild/_remote_debugging.vcxproj.filters @@ -33,11 +33,20 @@ Source Files + + Source Files + + + Source Files + Header Files + + Header Files + From 18287f4506c35db0e903d7d992137e448b913d29 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:48:40 +0000 Subject: [PATCH 06/17] Add Python bindings for binary writer and reader Exposes BinaryWriter and BinaryReader as Python types in _remote_debugging module. BinaryWriter wraps the C writer with write_sample() and finalize() methods. BinaryReader provides replay() to feed samples through any collector. Also adds zstd_available() function to let Python code check whether compression support was compiled in. 
--- Modules/_remote_debugging/_remote_debugging.h | 6 + Modules/_remote_debugging/clinic/module.c.h | 656 +++++++++++++++++- Modules/_remote_debugging/module.c | 541 ++++++++++++++- 3 files changed, 1198 insertions(+), 5 deletions(-) diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index fcb75b841b742e..1564c98a8a0717 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -12,10 +12,14 @@ extern "C" { #endif +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif #ifndef Py_BUILD_CORE_BUILTIN +# ifndef Py_BUILD_CORE_MODULE # define Py_BUILD_CORE_MODULE 1 +# endif #endif #include "Python.h" @@ -197,6 +201,8 @@ typedef struct { PyTypeObject *ThreadInfo_Type; PyTypeObject *InterpreterInfo_Type; PyTypeObject *AwaitedInfo_Type; + PyTypeObject *BinaryWriter_Type; + PyTypeObject *BinaryReader_Type; } RemoteDebuggingState; enum _ThreadState { diff --git a/Modules/_remote_debugging/clinic/module.c.h b/Modules/_remote_debugging/clinic/module.c.h index 353929c4643dbd..50481817bef466 100644 --- a/Modules/_remote_debugging/clinic/module.c.h +++ b/Modules/_remote_debugging/clinic/module.c.h @@ -7,6 +7,7 @@ preserve # include "pycore_runtime.h" // _Py_ID() #endif #include "pycore_critical_section.h"// Py_BEGIN_CRITICAL_SECTION() +#include "pycore_long.h" // _PyLong_UnsignedLongLong_Converter() #include "pycore_modsupport.h" // _PyArg_UnpackKeywords() PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__, @@ -433,4 +434,657 @@ _remote_debugging_RemoteUnwinder_get_stats(PyObject *self, PyObject *Py_UNUSED(i return return_value; } -/*[clinic end generated code: output=1943fb7a56197e39 input=a9049054013a1b77]*/ + +PyDoc_STRVAR(_remote_debugging_BinaryWriter___init____doc__, +"BinaryWriter(filename, sample_interval_us, start_time_us, *,\n" +" compression=0)\n" +"--\n" +"\n" +"High-performance binary writer for profiling data.\n" +"\n" +"Arguments:\n" +" filename: Path 
to output file\n" +" sample_interval_us: Sampling interval in microseconds\n" +" start_time_us: Start timestamp in microseconds (from time.monotonic() * 1e6)\n" +" compression: 0=none, 1=zstd (default: 0)\n" +"\n" +"Use as a context manager or call finalize() when done."); + +static int +_remote_debugging_BinaryWriter___init___impl(BinaryWriterObject *self, + const char *filename, + unsigned long long sample_interval_us, + unsigned long long start_time_us, + int compression); + +static int +_remote_debugging_BinaryWriter___init__(PyObject *self, PyObject *args, PyObject *kwargs) +{ + int return_value = -1; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 4 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(filename), &_Py_ID(sample_interval_us), &_Py_ID(start_time_us), &_Py_ID(compression), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"filename", "sample_interval_us", "start_time_us", "compression", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "BinaryWriter", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[4]; + PyObject * const *fastargs; + Py_ssize_t nargs = PyTuple_GET_SIZE(args); + Py_ssize_t noptargs = nargs + (kwargs ? 
PyDict_GET_SIZE(kwargs) : 0) - 3; + const char *filename; + unsigned long long sample_interval_us; + unsigned long long start_time_us; + int compression = 0; + + fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, + /*minpos*/ 3, /*maxpos*/ 3, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!fastargs) { + goto exit; + } + if (!PyUnicode_Check(fastargs[0])) { + _PyArg_BadArgument("BinaryWriter", "argument 'filename'", "str", fastargs[0]); + goto exit; + } + Py_ssize_t filename_length; + filename = PyUnicode_AsUTF8AndSize(fastargs[0], &filename_length); + if (filename == NULL) { + goto exit; + } + if (strlen(filename) != (size_t)filename_length) { + PyErr_SetString(PyExc_ValueError, "embedded null character"); + goto exit; + } + if (!_PyLong_UnsignedLongLong_Converter(fastargs[1], &sample_interval_us)) { + goto exit; + } + if (!_PyLong_UnsignedLongLong_Converter(fastargs[2], &start_time_us)) { + goto exit; + } + if (!noptargs) { + goto skip_optional_kwonly; + } + compression = PyLong_AsInt(fastargs[3]); + if (compression == -1 && PyErr_Occurred()) { + goto exit; + } +skip_optional_kwonly: + return_value = _remote_debugging_BinaryWriter___init___impl((BinaryWriterObject *)self, filename, sample_interval_us, start_time_us, compression); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_BinaryWriter_write_sample__doc__, +"write_sample($self, /, stack_frames, timestamp_us)\n" +"--\n" +"\n" +"Write a sample to the binary file.\n" +"\n" +"Arguments:\n" +" stack_frames: List of InterpreterInfo objects\n" +" timestamp_us: Current timestamp in microseconds (from time.monotonic() * 1e6)"); + +#define _REMOTE_DEBUGGING_BINARYWRITER_WRITE_SAMPLE_METHODDEF \ + {"write_sample", _PyCFunction_CAST(_remote_debugging_BinaryWriter_write_sample), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryWriter_write_sample__doc__}, + +static PyObject * +_remote_debugging_BinaryWriter_write_sample_impl(BinaryWriterObject *self, + 
 PyObject *stack_frames, + unsigned long long timestamp_us); + +static PyObject * +_remote_debugging_BinaryWriter_write_sample(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(stack_frames), &_Py_ID(timestamp_us), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"stack_frames", "timestamp_us", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "write_sample", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + PyObject *stack_frames; + unsigned long long timestamp_us; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 2, /*maxpos*/ 2, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + stack_frames = args[0]; + if (!_PyLong_UnsignedLongLong_Converter(args[1], &timestamp_us)) { + goto exit; + } + return_value = _remote_debugging_BinaryWriter_write_sample_impl((BinaryWriterObject *)self, stack_frames, timestamp_us); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_BinaryWriter_finalize__doc__, +"finalize($self, /)\n" +"--\n" +"\n" +"Finalize and close the binary file.\n" +"\n" +"Writes string/frame tables, footer, and updates header."); + +#define _REMOTE_DEBUGGING_BINARYWRITER_FINALIZE_METHODDEF \ + {"finalize", (PyCFunction)_remote_debugging_BinaryWriter_finalize, METH_NOARGS, _remote_debugging_BinaryWriter_finalize__doc__}, + +static PyObject * +_remote_debugging_BinaryWriter_finalize_impl(BinaryWriterObject *self); 
+ +static PyObject * +_remote_debugging_BinaryWriter_finalize(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryWriter_finalize_impl((BinaryWriterObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryWriter_close__doc__, +"close($self, /)\n" +"--\n" +"\n" +"Close the writer without finalizing (discards data)."); + +#define _REMOTE_DEBUGGING_BINARYWRITER_CLOSE_METHODDEF \ + {"close", (PyCFunction)_remote_debugging_BinaryWriter_close, METH_NOARGS, _remote_debugging_BinaryWriter_close__doc__}, + +static PyObject * +_remote_debugging_BinaryWriter_close_impl(BinaryWriterObject *self); + +static PyObject * +_remote_debugging_BinaryWriter_close(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryWriter_close_impl((BinaryWriterObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryWriter___enter____doc__, +"__enter__($self, /)\n" +"--\n" +"\n" +"Enter context manager."); + +#define _REMOTE_DEBUGGING_BINARYWRITER___ENTER___METHODDEF \ + {"__enter__", (PyCFunction)_remote_debugging_BinaryWriter___enter__, METH_NOARGS, _remote_debugging_BinaryWriter___enter____doc__}, + +static PyObject * +_remote_debugging_BinaryWriter___enter___impl(BinaryWriterObject *self); + +static PyObject * +_remote_debugging_BinaryWriter___enter__(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryWriter___enter___impl((BinaryWriterObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryWriter___exit____doc__, +"__exit__($self, /, exc_type=None, exc_val=None, exc_tb=None)\n" +"--\n" +"\n" +"Exit context manager, finalizing the file."); + +#define _REMOTE_DEBUGGING_BINARYWRITER___EXIT___METHODDEF \ + {"__exit__", _PyCFunction_CAST(_remote_debugging_BinaryWriter___exit__), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryWriter___exit____doc__}, + +static PyObject * +_remote_debugging_BinaryWriter___exit___impl(BinaryWriterObject *self, + PyObject *exc_type, + PyObject *exc_val, + PyObject 
*exc_tb); + +static PyObject * +_remote_debugging_BinaryWriter___exit__(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 3 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(exc_type), &_Py_ID(exc_val), &_Py_ID(exc_tb), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"exc_type", "exc_val", "exc_tb", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "__exit__", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[3]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; + PyObject *exc_type = Py_None; + PyObject *exc_val = Py_None; + PyObject *exc_tb = Py_None; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 0, /*maxpos*/ 3, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!noptargs) { + goto skip_optional_pos; + } + if (args[0]) { + exc_type = args[0]; + if (!--noptargs) { + goto skip_optional_pos; + } + } + if (args[1]) { + exc_val = args[1]; + if (!--noptargs) { + goto skip_optional_pos; + } + } + exc_tb = args[2]; +skip_optional_pos: + return_value = _remote_debugging_BinaryWriter___exit___impl((BinaryWriterObject *)self, exc_type, exc_val, exc_tb); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_BinaryWriter_get_stats__doc__, +"get_stats($self, /)\n" +"--\n" +"\n" +"Get encoding statistics for the writer.\n" +"\n" +"Returns a dict with encoding statistics including repeat/full/suffix/pop-push\n" +"record counts, frames 
written/saved, and compression ratio."); + +#define _REMOTE_DEBUGGING_BINARYWRITER_GET_STATS_METHODDEF \ + {"get_stats", (PyCFunction)_remote_debugging_BinaryWriter_get_stats, METH_NOARGS, _remote_debugging_BinaryWriter_get_stats__doc__}, + +static PyObject * +_remote_debugging_BinaryWriter_get_stats_impl(BinaryWriterObject *self); + +static PyObject * +_remote_debugging_BinaryWriter_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryWriter_get_stats_impl((BinaryWriterObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader___init____doc__, +"BinaryReader(filename)\n" +"--\n" +"\n" +"High-performance binary reader for profiling data.\n" +"\n" +"Arguments:\n" +" filename: Path to input file\n" +"\n" +"Use as a context manager or call close() when done."); + +static int +_remote_debugging_BinaryReader___init___impl(BinaryReaderObject *self, + const char *filename); + +static int +_remote_debugging_BinaryReader___init__(PyObject *self, PyObject *args, PyObject *kwargs) +{ + int return_value = -1; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(filename), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"filename", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "BinaryReader", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[1]; + PyObject * const *fastargs; + Py_ssize_t nargs = PyTuple_GET_SIZE(args); + const char *filename; + + fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, + /*minpos*/ 1, 
/*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!fastargs) { + goto exit; + } + if (!PyUnicode_Check(fastargs[0])) { + _PyArg_BadArgument("BinaryReader", "argument 'filename'", "str", fastargs[0]); + goto exit; + } + Py_ssize_t filename_length; + filename = PyUnicode_AsUTF8AndSize(fastargs[0], &filename_length); + if (filename == NULL) { + goto exit; + } + if (strlen(filename) != (size_t)filename_length) { + PyErr_SetString(PyExc_ValueError, "embedded null character"); + goto exit; + } + return_value = _remote_debugging_BinaryReader___init___impl((BinaryReaderObject *)self, filename); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader_replay__doc__, +"replay($self, /, collector, progress_callback=None)\n" +"--\n" +"\n" +"Replay samples through a collector.\n" +"\n" +"Arguments:\n" +" collector: Collector object with collect() method\n" +" progress_callback: Optional callable(current, total)\n" +"\n" +"Returns:\n" +" Number of samples replayed"); + +#define _REMOTE_DEBUGGING_BINARYREADER_REPLAY_METHODDEF \ + {"replay", _PyCFunction_CAST(_remote_debugging_BinaryReader_replay), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryReader_replay__doc__}, + +static PyObject * +_remote_debugging_BinaryReader_replay_impl(BinaryReaderObject *self, + PyObject *collector, + PyObject *progress_callback); + +static PyObject * +_remote_debugging_BinaryReader_replay(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(collector), &_Py_ID(progress_callback), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // 
!Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"collector", "progress_callback", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "replay", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1; + PyObject *collector; + PyObject *progress_callback = Py_None; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 2, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + collector = args[0]; + if (!noptargs) { + goto skip_optional_pos; + } + progress_callback = args[1]; +skip_optional_pos: + return_value = _remote_debugging_BinaryReader_replay_impl((BinaryReaderObject *)self, collector, progress_callback); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader_get_info__doc__, +"get_info($self, /)\n" +"--\n" +"\n" +"Get metadata about the binary file.\n" +"\n" +"Returns:\n" +" Dict with file metadata"); + +#define _REMOTE_DEBUGGING_BINARYREADER_GET_INFO_METHODDEF \ + {"get_info", (PyCFunction)_remote_debugging_BinaryReader_get_info, METH_NOARGS, _remote_debugging_BinaryReader_get_info__doc__}, + +static PyObject * +_remote_debugging_BinaryReader_get_info_impl(BinaryReaderObject *self); + +static PyObject * +_remote_debugging_BinaryReader_get_info(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryReader_get_info_impl((BinaryReaderObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader_get_stats__doc__, +"get_stats($self, /)\n" +"--\n" +"\n" +"Get reconstruction statistics from replay.\n" +"\n" +"Returns a dict with statistics about record types decoded and samples\n" +"reconstructed during replay."); + +#define _REMOTE_DEBUGGING_BINARYREADER_GET_STATS_METHODDEF \ + {"get_stats", (PyCFunction)_remote_debugging_BinaryReader_get_stats, METH_NOARGS, 
_remote_debugging_BinaryReader_get_stats__doc__}, + +static PyObject * +_remote_debugging_BinaryReader_get_stats_impl(BinaryReaderObject *self); + +static PyObject * +_remote_debugging_BinaryReader_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryReader_get_stats_impl((BinaryReaderObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader_close__doc__, +"close($self, /)\n" +"--\n" +"\n" +"Close the reader and free resources."); + +#define _REMOTE_DEBUGGING_BINARYREADER_CLOSE_METHODDEF \ + {"close", (PyCFunction)_remote_debugging_BinaryReader_close, METH_NOARGS, _remote_debugging_BinaryReader_close__doc__}, + +static PyObject * +_remote_debugging_BinaryReader_close_impl(BinaryReaderObject *self); + +static PyObject * +_remote_debugging_BinaryReader_close(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryReader_close_impl((BinaryReaderObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader___enter____doc__, +"__enter__($self, /)\n" +"--\n" +"\n" +"Enter context manager."); + +#define _REMOTE_DEBUGGING_BINARYREADER___ENTER___METHODDEF \ + {"__enter__", (PyCFunction)_remote_debugging_BinaryReader___enter__, METH_NOARGS, _remote_debugging_BinaryReader___enter____doc__}, + +static PyObject * +_remote_debugging_BinaryReader___enter___impl(BinaryReaderObject *self); + +static PyObject * +_remote_debugging_BinaryReader___enter__(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_BinaryReader___enter___impl((BinaryReaderObject *)self); +} + +PyDoc_STRVAR(_remote_debugging_BinaryReader___exit____doc__, +"__exit__($self, /, exc_type=None, exc_val=None, exc_tb=None)\n" +"--\n" +"\n" +"Exit context manager, closing the file."); + +#define _REMOTE_DEBUGGING_BINARYREADER___EXIT___METHODDEF \ + {"__exit__", _PyCFunction_CAST(_remote_debugging_BinaryReader___exit__), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryReader___exit____doc__}, + +static PyObject * 
+_remote_debugging_BinaryReader___exit___impl(BinaryReaderObject *self, + PyObject *exc_type, + PyObject *exc_val, + PyObject *exc_tb); + +static PyObject * +_remote_debugging_BinaryReader___exit__(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 3 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(exc_type), &_Py_ID(exc_val), &_Py_ID(exc_tb), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"exc_type", "exc_val", "exc_tb", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "__exit__", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[3]; + Py_ssize_t noptargs = nargs + (kwnames ? 
PyTuple_GET_SIZE(kwnames) : 0) - 0; + PyObject *exc_type = Py_None; + PyObject *exc_val = Py_None; + PyObject *exc_tb = Py_None; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 0, /*maxpos*/ 3, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!noptargs) { + goto skip_optional_pos; + } + if (args[0]) { + exc_type = args[0]; + if (!--noptargs) { + goto skip_optional_pos; + } + } + if (args[1]) { + exc_val = args[1]; + if (!--noptargs) { + goto skip_optional_pos; + } + } + exc_tb = args[2]; +skip_optional_pos: + return_value = _remote_debugging_BinaryReader___exit___impl((BinaryReaderObject *)self, exc_type, exc_val, exc_tb); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_zstd_available__doc__, +"zstd_available($module, /)\n" +"--\n" +"\n" +"Check if zstd compression is available.\n" +"\n" +"Returns:\n" +" True if zstd available, False otherwise"); + +#define _REMOTE_DEBUGGING_ZSTD_AVAILABLE_METHODDEF \ + {"zstd_available", (PyCFunction)_remote_debugging_zstd_available, METH_NOARGS, _remote_debugging_zstd_available__doc__}, + +static PyObject * +_remote_debugging_zstd_available_impl(PyObject *module); + +static PyObject * +_remote_debugging_zstd_available(PyObject *module, PyObject *Py_UNUSED(ignored)) +{ + return _remote_debugging_zstd_available_impl(module); +} +/*[clinic end generated code: output=a02fee60448b86e2 input=a9049054013a1b77]*/ diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index a194d88c3c3ca0..b8af019a68077b 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -6,6 +6,20 @@ ******************************************************************************/ #include "_remote_debugging.h" +#include "binary_io.h" + +/* Forward declarations for clinic-generated code */ +typedef struct { + PyObject_HEAD + BinaryWriter *writer; + uint32_t cached_total_samples; /* Preserved after finalize */ +} 
BinaryWriterObject; + +typedef struct { + PyObject_HEAD + BinaryReader *reader; +} BinaryReaderObject; + #include "clinic/module.c.h" /* ============================================================================ @@ -970,6 +984,10 @@ static PyType_Spec RemoteUnwinder_spec = { .slots = RemoteUnwinder_slots, }; +/* Forward declarations for type specs defined later */ +static PyType_Spec BinaryWriter_spec; +static PyType_Spec BinaryReader_spec; + /* ============================================================================ * MODULE INITIALIZATION * ============================================================================ */ @@ -1048,6 +1066,18 @@ _remote_debugging_exec(PyObject *m) if (PyModule_AddType(m, st->AwaitedInfo_Type) < 0) { return -1; } + + // Create BinaryWriter and BinaryReader types + CREATE_TYPE(m, st->BinaryWriter_Type, &BinaryWriter_spec); + if (PyModule_AddType(m, st->BinaryWriter_Type) < 0) { + return -1; + } + + CREATE_TYPE(m, st->BinaryReader_Type, &BinaryReader_spec); + if (PyModule_AddType(m, st->BinaryReader_Type) < 0) { + return -1; + } + #ifdef Py_GIL_DISABLED PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); #endif @@ -1091,6 +1121,8 @@ remote_debugging_traverse(PyObject *mod, visitproc visit, void *arg) Py_VISIT(state->ThreadInfo_Type); Py_VISIT(state->InterpreterInfo_Type); Py_VISIT(state->AwaitedInfo_Type); + Py_VISIT(state->BinaryWriter_Type); + Py_VISIT(state->BinaryReader_Type); return 0; } @@ -1106,6 +1138,8 @@ remote_debugging_clear(PyObject *mod) Py_CLEAR(state->ThreadInfo_Type); Py_CLEAR(state->InterpreterInfo_Type); Py_CLEAR(state->AwaitedInfo_Type); + Py_CLEAR(state->BinaryWriter_Type); + Py_CLEAR(state->BinaryReader_Type); return 0; } @@ -1115,6 +1149,509 @@ remote_debugging_free(void *mod) (void)remote_debugging_clear((PyObject *)mod); } +/* ============================================================================ + * BINARY WRITER CLASS + * ============================================================================ */ 
+ +#define BinaryWriter_CAST(op) ((BinaryWriterObject *)(op)) + +/*[clinic input] +class _remote_debugging.BinaryWriter "BinaryWriterObject *" "&PyBinaryWriter_Type" +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e948838b90a2003c]*/ + +/*[clinic input] +_remote_debugging.BinaryWriter.__init__ + filename: str + sample_interval_us: unsigned_long_long + start_time_us: unsigned_long_long + * + compression: int = 0 + +High-performance binary writer for profiling data. + +Arguments: + filename: Path to output file + sample_interval_us: Sampling interval in microseconds + start_time_us: Start timestamp in microseconds (from time.monotonic() * 1e6) + compression: 0=none, 1=zstd (default: 0) + +Use as a context manager or call finalize() when done. +[clinic start generated code]*/ + +static int +_remote_debugging_BinaryWriter___init___impl(BinaryWriterObject *self, + const char *filename, + unsigned long long sample_interval_us, + unsigned long long start_time_us, + int compression) +/*[clinic end generated code: output=014c0306f1bacf4b input=57497fe3cb9214a6]*/ +{ + if (self->writer) { + binary_writer_destroy(self->writer); + } + + self->writer = binary_writer_create(filename, sample_interval_us, compression, start_time_us); + if (!self->writer) { + return -1; + } + + return 0; +} + +/*[clinic input] +_remote_debugging.BinaryWriter.write_sample + stack_frames: object + timestamp_us: unsigned_long_long + +Write a sample to the binary file. 
+ +Arguments: + stack_frames: List of InterpreterInfo objects + timestamp_us: Current timestamp in microseconds (from time.monotonic() * 1e6) +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryWriter_write_sample_impl(BinaryWriterObject *self, + PyObject *stack_frames, + unsigned long long timestamp_us) +/*[clinic end generated code: output=24d5b86679b4128f input=dce3148417482624]*/ +{ + if (!self->writer) { + PyErr_SetString(PyExc_ValueError, "Writer is closed"); + return NULL; + } + + if (binary_writer_write_sample(self->writer, stack_frames, timestamp_us) < 0) { + return NULL; + } + + Py_RETURN_NONE; +} + +/*[clinic input] +_remote_debugging.BinaryWriter.finalize + +Finalize and close the binary file. + +Writes string/frame tables, footer, and updates header. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryWriter_finalize_impl(BinaryWriterObject *self) +/*[clinic end generated code: output=3534b88c6628de88 input=c02191750682f6a2]*/ +{ + if (!self->writer) { + PyErr_SetString(PyExc_ValueError, "Writer is already closed"); + return NULL; + } + + /* Save total_samples before finalizing */ + self->cached_total_samples = self->writer->total_samples; + + if (binary_writer_finalize(self->writer) < 0) { + return NULL; + } + + binary_writer_destroy(self->writer); + self->writer = NULL; + + Py_RETURN_NONE; +} + +/*[clinic input] +_remote_debugging.BinaryWriter.close + +Close the writer without finalizing (discards data). +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryWriter_close_impl(BinaryWriterObject *self) +/*[clinic end generated code: output=9571bb2256fd1fd2 input=6e0da206e60daf16]*/ +{ + if (self->writer) { + binary_writer_destroy(self->writer); + self->writer = NULL; + } + Py_RETURN_NONE; +} + +/*[clinic input] +_remote_debugging.BinaryWriter.__enter__ + +Enter context manager. 
+[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryWriter___enter___impl(BinaryWriterObject *self) +/*[clinic end generated code: output=8eb95f61daf2d120 input=8ef14ee18da561d2]*/ +{ + Py_INCREF(self); + return (PyObject *)self; +} + +/*[clinic input] +_remote_debugging.BinaryWriter.__exit__ + exc_type: object = None + exc_val: object = None + exc_tb: object = None + +Exit context manager, finalizing the file. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryWriter___exit___impl(BinaryWriterObject *self, + PyObject *exc_type, + PyObject *exc_val, + PyObject *exc_tb) +/*[clinic end generated code: output=61831f47c72a53c6 input=12334ce1009af37f]*/ +{ + if (self->writer) { + /* Finalize on normal exit */ + if (binary_writer_finalize(self->writer) < 0) { + binary_writer_destroy(self->writer); + self->writer = NULL; + return NULL; + } + binary_writer_destroy(self->writer); + self->writer = NULL; + } + Py_RETURN_FALSE; +} + +/*[clinic input] +_remote_debugging.BinaryWriter.get_stats + +Get encoding statistics for the writer. + +Returns a dict with encoding statistics including repeat/full/suffix/pop-push +record counts, frames written/saved, and compression ratio. 
+[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryWriter_get_stats_impl(BinaryWriterObject *self) +/*[clinic end generated code: output=06522cd52544df89 input=82968491b53ad277]*/ +{ + if (!self->writer) { + PyErr_SetString(PyExc_ValueError, "Writer is closed"); + return NULL; + } + return binary_writer_get_stats(self->writer); +} + +static PyObject * +BinaryWriter_get_total_samples(BinaryWriterObject *self, void *closure) +{ + if (!self->writer) { + /* Use cached value after finalize/close */ + return PyLong_FromUnsignedLong(self->cached_total_samples); + } + return PyLong_FromUnsignedLong(self->writer->total_samples); +} + +static PyGetSetDef BinaryWriter_getset[] = { + {"total_samples", (getter)BinaryWriter_get_total_samples, NULL, "Total samples written", NULL}, + {NULL} +}; + +static PyMethodDef BinaryWriter_methods[] = { + _REMOTE_DEBUGGING_BINARYWRITER_WRITE_SAMPLE_METHODDEF + _REMOTE_DEBUGGING_BINARYWRITER_FINALIZE_METHODDEF + _REMOTE_DEBUGGING_BINARYWRITER_CLOSE_METHODDEF + _REMOTE_DEBUGGING_BINARYWRITER___ENTER___METHODDEF + _REMOTE_DEBUGGING_BINARYWRITER___EXIT___METHODDEF + _REMOTE_DEBUGGING_BINARYWRITER_GET_STATS_METHODDEF + {NULL, NULL, 0, NULL} +}; + +static void +BinaryWriter_dealloc(PyObject *op) +{ + BinaryWriterObject *self = BinaryWriter_CAST(op); + PyTypeObject *tp = Py_TYPE(self); + if (self->writer) { + binary_writer_destroy(self->writer); + } + tp->tp_free(self); + Py_DECREF(tp); +} + +static PyType_Slot BinaryWriter_slots[] = { + {Py_tp_getset, BinaryWriter_getset}, + {Py_tp_methods, BinaryWriter_methods}, + {Py_tp_init, _remote_debugging_BinaryWriter___init__}, + {Py_tp_dealloc, BinaryWriter_dealloc}, + {0, NULL} +}; + +static PyType_Spec BinaryWriter_spec = { + .name = "_remote_debugging.BinaryWriter", + .basicsize = sizeof(BinaryWriterObject), + .flags = ( + Py_TPFLAGS_DEFAULT + | Py_TPFLAGS_IMMUTABLETYPE + ), + .slots = BinaryWriter_slots, +}; + +/* 
============================================================================ + * BINARY READER CLASS + * ============================================================================ */ + +#define BinaryReader_CAST(op) ((BinaryReaderObject *)(op)) + +/*[clinic input] +class _remote_debugging.BinaryReader "BinaryReaderObject *" "&PyBinaryReader_Type" +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=36400aaf6f53216d]*/ + +/*[clinic input] +_remote_debugging.BinaryReader.__init__ + filename: str + +High-performance binary reader for profiling data. + +Arguments: + filename: Path to input file + +Use as a context manager or call close() when done. +[clinic start generated code]*/ + +static int +_remote_debugging_BinaryReader___init___impl(BinaryReaderObject *self, + const char *filename) +/*[clinic end generated code: output=9699226f7ae052bb input=4201f9cc500ef2f6]*/ +{ + if (self->reader) { + binary_reader_close(self->reader); + } + + self->reader = binary_reader_open(filename); + if (!self->reader) { + return -1; + } + + return 0; +} + +/*[clinic input] +_remote_debugging.BinaryReader.replay + collector: object + progress_callback: object = None + +Replay samples through a collector. 
+ +Arguments: + collector: Collector object with collect() method + progress_callback: Optional callable(current, total) + +Returns: + Number of samples replayed +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryReader_replay_impl(BinaryReaderObject *self, + PyObject *collector, + PyObject *progress_callback) +/*[clinic end generated code: output=442345562574b61c input=ebb687aed3e0f4f1]*/ +{ + if (!self->reader) { + PyErr_SetString(PyExc_ValueError, "Reader is closed"); + return NULL; + } + + Py_ssize_t replayed = binary_reader_replay(self->reader, collector, progress_callback); + if (replayed < 0) { + return NULL; + } + + return PyLong_FromSsize_t(replayed); +} + +/*[clinic input] +_remote_debugging.BinaryReader.get_info + +Get metadata about the binary file. + +Returns: + Dict with file metadata +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryReader_get_info_impl(BinaryReaderObject *self) +/*[clinic end generated code: output=7f641fbd39147391 input=02e75e39c8a6cd1f]*/ +{ + if (!self->reader) { + PyErr_SetString(PyExc_ValueError, "Reader is closed"); + return NULL; + } + + return binary_reader_get_info(self->reader); +} + +/*[clinic input] +_remote_debugging.BinaryReader.get_stats + +Get reconstruction statistics from replay. + +Returns a dict with statistics about record types decoded and samples +reconstructed during replay. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryReader_get_stats_impl(BinaryReaderObject *self) +/*[clinic end generated code: output=628b9ab5e4c4fd36 input=d8dd6654abd6c3c0]*/ +{ + if (!self->reader) { + PyErr_SetString(PyExc_ValueError, "Reader is closed"); + return NULL; + } + return binary_reader_get_stats(self->reader); +} + +/*[clinic input] +_remote_debugging.BinaryReader.close + +Close the reader and free resources. 
+[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryReader_close_impl(BinaryReaderObject *self) +/*[clinic end generated code: output=ad0238cf5240b4f8 input=b919a66c737712d5]*/ +{ + if (self->reader) { + binary_reader_close(self->reader); + self->reader = NULL; + } + Py_RETURN_NONE; +} + +/*[clinic input] +_remote_debugging.BinaryReader.__enter__ + +Enter context manager. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryReader___enter___impl(BinaryReaderObject *self) +/*[clinic end generated code: output=fade133538e93817 input=4794844c9efdc4f6]*/ +{ + Py_INCREF(self); + return (PyObject *)self; +} + +/*[clinic input] +_remote_debugging.BinaryReader.__exit__ + exc_type: object = None + exc_val: object = None + exc_tb: object = None + +Exit context manager, closing the file. +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_BinaryReader___exit___impl(BinaryReaderObject *self, + PyObject *exc_type, + PyObject *exc_val, + PyObject *exc_tb) +/*[clinic end generated code: output=2acdd36cfdc14e4a input=87284243d7935835]*/ +{ + if (self->reader) { + binary_reader_close(self->reader); + self->reader = NULL; + } + Py_RETURN_FALSE; +} + +static PyObject * +BinaryReader_get_sample_count(BinaryReaderObject *self, void *closure) +{ + if (!self->reader) { + return PyLong_FromLong(0); + } + return PyLong_FromUnsignedLong(self->reader->sample_count); +} + +static PyObject * +BinaryReader_get_sample_interval_us(BinaryReaderObject *self, void *closure) +{ + if (!self->reader) { + return PyLong_FromLong(0); + } + return PyLong_FromUnsignedLongLong(self->reader->sample_interval_us); +} + +static PyGetSetDef BinaryReader_getset[] = { + {"sample_count", (getter)BinaryReader_get_sample_count, NULL, "Number of samples in file", NULL}, + {"sample_interval_us", (getter)BinaryReader_get_sample_interval_us, NULL, "Sample interval in microseconds", NULL}, + {NULL} +}; + +static PyMethodDef BinaryReader_methods[] = { 
+ _REMOTE_DEBUGGING_BINARYREADER_REPLAY_METHODDEF + _REMOTE_DEBUGGING_BINARYREADER_GET_INFO_METHODDEF + _REMOTE_DEBUGGING_BINARYREADER_GET_STATS_METHODDEF + _REMOTE_DEBUGGING_BINARYREADER_CLOSE_METHODDEF + _REMOTE_DEBUGGING_BINARYREADER___ENTER___METHODDEF + _REMOTE_DEBUGGING_BINARYREADER___EXIT___METHODDEF + {NULL, NULL, 0, NULL} +}; + +static void +BinaryReader_dealloc(PyObject *op) +{ + BinaryReaderObject *self = BinaryReader_CAST(op); + PyTypeObject *tp = Py_TYPE(self); + if (self->reader) { + binary_reader_close(self->reader); + } + tp->tp_free(self); + Py_DECREF(tp); +} + +static PyType_Slot BinaryReader_slots[] = { + {Py_tp_getset, BinaryReader_getset}, + {Py_tp_methods, BinaryReader_methods}, + {Py_tp_init, _remote_debugging_BinaryReader___init__}, + {Py_tp_dealloc, BinaryReader_dealloc}, + {0, NULL} +}; + +static PyType_Spec BinaryReader_spec = { + .name = "_remote_debugging.BinaryReader", + .basicsize = sizeof(BinaryReaderObject), + .flags = ( + Py_TPFLAGS_DEFAULT + | Py_TPFLAGS_IMMUTABLETYPE + ), + .slots = BinaryReader_slots, +}; + +/* ============================================================================ + * MODULE METHODS + * ============================================================================ */ + +/*[clinic input] +_remote_debugging.zstd_available + +Check if zstd compression is available. 
+ +Returns: + True if zstd available, False otherwise +[clinic start generated code]*/ + +static PyObject * +_remote_debugging_zstd_available_impl(PyObject *module) +/*[clinic end generated code: output=55e35a70ef280cdd input=a1b4d41bc09c7cf9]*/ +{ + return PyBool_FromLong(binary_io_zstd_available()); +} + +static PyMethodDef remote_debugging_methods[] = { + _REMOTE_DEBUGGING_ZSTD_AVAILABLE_METHODDEF + {NULL, NULL, 0, NULL}, +}; + static PyModuleDef_Slot remote_debugging_slots[] = { {Py_mod_exec, _remote_debugging_exec}, {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, @@ -1122,10 +1659,6 @@ static PyModuleDef_Slot remote_debugging_slots[] = { {0, NULL}, }; -static PyMethodDef remote_debugging_methods[] = { - {NULL, NULL, 0, NULL}, -}; - static struct PyModuleDef remote_debugging_module = { PyModuleDef_HEAD_INIT, .m_name = "_remote_debugging", From 1f7737e6f009d9b8cb10031866d46772ee0b7a26 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:48:54 +0000 Subject: [PATCH 07/17] Add BinaryCollector for sampling profiler Thin wrapper around the C BinaryWriter. Implements the Collector interface so it can be used interchangeably with other collectors like FlamegraphCollector or GeckoCollector. Compression is configurable: 'auto' uses zstd when available, 'zstd' requires it, 'none' disables compression. The collector passes samples directly to C for encoding without building Python data structures. 
--- Lib/profiling/sampling/binary_collector.py | 123 +++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 Lib/profiling/sampling/binary_collector.py diff --git a/Lib/profiling/sampling/binary_collector.py b/Lib/profiling/sampling/binary_collector.py new file mode 100644 index 00000000000000..293d4b6debcca3 --- /dev/null +++ b/Lib/profiling/sampling/binary_collector.py @@ -0,0 +1,123 @@ +"""Thin Python wrapper around C binary writer for profiling data.""" + +import time + +from .collector import Collector + +# Compression type constants (must match binary_io.h) +COMPRESSION_NONE = 0 +COMPRESSION_ZSTD = 1 + + +def _resolve_compression(compression): + """Resolve compression type from string or int. + + Args: + compression: 'auto', 'zstd', 'none', or int (0/1) + + Returns: + int: Compression type constant + """ + if isinstance(compression, int): + return compression + + compression = compression.lower() + if compression == 'none': + return COMPRESSION_NONE + elif compression == 'zstd': + return COMPRESSION_ZSTD + elif compression == 'auto': + # Auto: use zstd if available, otherwise none + import _remote_debugging + if _remote_debugging.zstd_available(): + return COMPRESSION_ZSTD + return COMPRESSION_NONE + else: + raise ValueError(f"Unknown compression type: {compression}") + + +class BinaryCollector(Collector): + """High-performance binary collector using C implementation. + + This collector writes profiling data directly to a binary file format + with optional zstd compression. All I/O is performed in C for maximum + throughput. + + The binary format uses string/frame deduplication and varint encoding + for efficient storage. + """ + + def __init__(self, filename, sample_interval_usec, *, skip_idle=False, + compression='auto'): + """Create a new binary collector. 
+ + Args: + filename: Path to output binary file + sample_interval_usec: Sampling interval in microseconds + skip_idle: If True, skip idle threads (not used in binary format) + compression: 'auto', 'zstd', 'none', or int (0=none, 1=zstd) + """ + import _remote_debugging + + self.filename = filename + self.sample_interval_usec = sample_interval_usec + self.skip_idle = skip_idle + + compression_type = _resolve_compression(compression) + start_time_us = int(time.monotonic() * 1_000_000) + self._writer = _remote_debugging.BinaryWriter( + filename, sample_interval_usec, start_time_us, compression=compression_type + ) + + def collect(self, stack_frames, timestamp_us=None): + """Collect profiling data from stack frames. + + This passes stack_frames directly to the C writer which handles + all encoding and buffering. + + Args: + stack_frames: List of InterpreterInfo objects from _remote_debugging + timestamp_us: Optional timestamp in microseconds. If not provided, + uses time.monotonic() to generate one. + """ + if timestamp_us is None: + timestamp_us = int(time.monotonic() * 1_000_000) + self._writer.write_sample(stack_frames, timestamp_us) + + def collect_failed_sample(self): + """Record a failed sample attempt (no-op for binary format).""" + pass + + def export(self, filename=None): + """Finalize and close the binary file. + + Args: + filename: Ignored (binary files are written incrementally) + """ + self._writer.finalize() + + @property + def total_samples(self): + """Total number of samples written.""" + return self._writer.total_samples + + def get_stats(self): + """Get encoding statistics. + + Returns: + Dict with encoding statistics including repeat/full/suffix/pop-push + record counts, frames written/saved, and compression ratio. 
+ """ + return self._writer.get_stats() + + def __enter__(self): + """Context manager entry.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit - finalize unless there was an error.""" + if exc_type is None: + self._writer.finalize() + else: + self._writer.close() + return False From 2965effb6adc66300199d85513e405ddfc600cd9 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:49:11 +0000 Subject: [PATCH 08/17] Add BinaryReader for sampling profiler replay Wrapper around the C BinaryReader providing file info access and replay functionality. The replay() method reconstructs samples from the binary file and feeds them to any collector, enabling format conversion without re-profiling. Includes get_info() for metadata access (sample count, thread count, compression type) and get_stats() for decoding statistics. --- Lib/profiling/sampling/binary_reader.py | 131 ++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 Lib/profiling/sampling/binary_reader.py diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py new file mode 100644 index 00000000000000..3d7fbf981ea63b --- /dev/null +++ b/Lib/profiling/sampling/binary_reader.py @@ -0,0 +1,131 @@ +"""Thin Python wrapper around C binary reader for profiling data.""" + + +class BinaryReader: + """High-performance binary reader using C implementation. + + This reader uses memory-mapped I/O (on Unix) for fast replay of + profiling data from binary files. + + Use as a context manager: + with BinaryReader('profile.bin') as reader: + info = reader.get_info() + reader.replay_samples(collector, progress_callback) + """ + + def __init__(self, filename): + """Create a new binary reader. 
+ + Args: + filename: Path to input binary file + """ + self.filename = filename + self._reader = None + + def __enter__(self): + """Open the binary file for reading.""" + import _remote_debugging + self._reader = _remote_debugging.BinaryReader(self.filename) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Close the binary file.""" + if self._reader is not None: + self._reader.close() + self._reader = None + return False + + def get_info(self): + """Get metadata about the binary file. + + Returns: + dict: File metadata including: + - sample_count: Number of samples in the file + - sample_interval_us: Sampling interval in microseconds + - start_time_us: Start timestamp in microseconds + - string_count: Number of unique strings + - frame_count: Number of unique frames + - compression: Compression type used + """ + if self._reader is None: + raise RuntimeError("Reader not open. Use as context manager.") + return self._reader.get_info() + + def replay_samples(self, collector, progress_callback=None): + """Replay samples from binary file through a collector. + + This allows converting binary profiling data to other formats + (e.g., flamegraph, pstats) by replaying through the appropriate + collector. + + Args: + collector: A Collector instance with a collect() method + progress_callback: Optional callable(current, total) for progress + + Returns: + int: Number of samples replayed + """ + if self._reader is None: + raise RuntimeError("Reader not open. Use as context manager.") + return self._reader.replay(collector, progress_callback) + + @property + def sample_count(self): + """Number of samples in the file.""" + if self._reader is None: + raise RuntimeError("Reader not open. Use as context manager.") + return self._reader.get_info()['sample_count'] + + def get_stats(self): + """Get reconstruction statistics from replay. + + Returns: + dict: Statistics about record types decoded and samples + reconstructed during replay. 
+ """ + if self._reader is None: + raise RuntimeError("Reader not open. Use as context manager.") + return self._reader.get_stats() + + +def convert_binary_to_format(input_file, output_file, output_format, + sample_interval_usec=None, progress_callback=None): + """Convert a binary profiling file to another format. + + Args: + input_file: Path to input binary file + output_file: Path to output file + output_format: Target format ('flamegraph', 'collapsed', 'pstats', etc.) + sample_interval_usec: Override sample interval (uses file's if None) + progress_callback: Optional callable(current, total) for progress + + Returns: + int: Number of samples converted + """ + from .gecko_collector import GeckoCollector + from .stack_collector import FlamegraphCollector, CollapsedStackCollector + from .pstats_collector import PStatsCollector + + with BinaryReader(input_file) as reader: + info = reader.get_info() + interval = sample_interval_usec or info['sample_interval_us'] + + # Create appropriate collector based on format + if output_format == 'flamegraph': + collector = FlamegraphCollector(interval) + elif output_format == 'collapsed': + collector = CollapsedStackCollector(interval) + elif output_format == 'pstats': + collector = PStatsCollector(interval) + elif output_format == 'gecko': + collector = GeckoCollector(interval) + else: + raise ValueError(f"Unknown output format: {output_format}") + + # Replay samples through collector + count = reader.replay_samples(collector, progress_callback) + + # Export to target format + collector.export(output_file) + + return count From 427f84615214f069f64d6302ce2841780798c1d2 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:49:32 +0000 Subject: [PATCH 09/17] Add --binary and replay command to sampling profiler CLI Adds --binary output format and --compression option to run/attach commands. 
The replay command converts binary profiles to other formats: python -m profiling.sampling replay profile.bin python -m profiling.sampling replay --flamegraph -o out.html profile.bin This enables a record-and-replay workflow: capture in binary format during profiling (faster, smaller files), then convert to visualization formats later without re-profiling. --- Lib/profiling/sampling/cli.py | 157 +++++++++++++++++++++++++++++++--- 1 file changed, 147 insertions(+), 10 deletions(-) diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index ffc80edc1f6e74..35c39bdcada0a3 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -14,6 +14,8 @@ from .stack_collector import CollapsedStackCollector, FlamegraphCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .binary_collector import BinaryCollector +from .binary_reader import BinaryReader, convert_binary_to_format from .constants import ( PROFILING_MODE_ALL, PROFILING_MODE_WALL, @@ -73,6 +75,7 @@ class CustomFormatter( "flamegraph": "html", "gecko": "json", "heatmap": "html", + "binary": "bin", } COLLECTOR_MAP = { @@ -81,6 +84,7 @@ class CustomFormatter( "flamegraph": FlamegraphCollector, "gecko": GeckoCollector, "heatmap": HeatmapCollector, + "binary": BinaryCollector, } @@ -278,7 +282,7 @@ def _add_mode_options(parser): ) -def _add_format_options(parser): +def _add_format_options(parser, include_compression=True): """Add output format options to a parser.""" output_group = parser.add_argument_group("Output options") format_group = output_group.add_mutually_exclusive_group() @@ -317,8 +321,23 @@ def _add_format_options(parser): dest="format", help="Generate interactive HTML heatmap visualization with line-level sample counts", ) + format_group.add_argument( + "--binary", + action="store_const", + const="binary", + dest="format", + help="Generate high-performance binary format (use 'replay' command to convert)", + ) 
parser.set_defaults(format="pstats") + if include_compression: + output_group.add_argument( + "--compression", + choices=["auto", "zstd", "none"], + default="auto", + help="Compression for binary format: auto (use zstd if available), zstd, none", + ) + output_group.add_argument( "-o", "--output", @@ -373,15 +392,18 @@ def _sort_to_mode(sort_choice): return sort_map.get(sort_choice, SORT_MODE_NSAMPLES) -def _create_collector(format_type, interval, skip_idle, opcodes=False): +def _create_collector(format_type, interval, skip_idle, opcodes=False, + output_file=None, compression='auto'): """Create the appropriate collector based on format type. Args: - format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap') + format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap', 'binary') interval: Sampling interval in microseconds skip_idle: Whether to skip idle samples opcodes: Whether to collect opcode information (only used by gecko format for creating interval markers in Firefox Profiler) + output_file: Output file path (required for binary format) + compression: Compression type for binary format ('auto', 'zstd', 'none') Returns: A collector instance of the appropriate type @@ -390,6 +412,13 @@ def _create_collector(format_type, interval, skip_idle, opcodes=False): if collector_class is None: raise ValueError(f"Unknown format: {format_type}") + # Binary format requires output file and compression + if format_type == "binary": + if output_file is None: + raise ValueError("Binary format requires an output file") + return collector_class(output_file, interval, skip_idle=skip_idle, + compression=compression) + # Gecko format never skips idle (it needs both GIL and CPU data) # and is the only format that uses opcodes for interval markers if format_type == "gecko": @@ -425,7 +454,12 @@ def _handle_output(collector, args, pid, mode): pid: Process ID (for generating filenames) mode: Profiling mode used """ - if 
args.format == "pstats": + if args.format == "binary": + # Binary format already wrote to file incrementally, just finalize + collector.export(None) + filename = collector.filename + print(f"Binary profile written to {filename} ({collector.total_samples} samples)") + elif args.format == "pstats": if args.outfile: collector.export(args.outfile) else: @@ -449,6 +483,13 @@ def _validate_args(args, parser): args: Parsed command-line arguments parser: ArgumentParser instance for error reporting """ + # Replay command has minimal validation + if args.command == "replay": + # Can't replay to binary format + if args.format == "binary": + parser.error("Cannot replay to binary format. Use a different output format.") + return + # Check if live mode is available if hasattr(args, 'live') and args.live and LiveStatsCollector is None: parser.error( @@ -456,7 +497,7 @@ def _validate_args(args, parser): ) # Async-aware mode is incompatible with --native, --no-gc, --mode, and --all-threads - if args.async_aware: + if getattr(args, 'async_aware', False): issues = [] if args.native: issues.append("--native") @@ -473,7 +514,7 @@ def _validate_args(args, parser): ) # --async-mode requires --async-aware - if hasattr(args, 'async_mode') and args.async_mode != "running" and not args.async_aware: + if hasattr(args, 'async_mode') and args.async_mode != "running" and not getattr(args, 'async_aware', False): parser.error("--async-mode requires --async-aware to be enabled.") # Live mode is incompatible with format options @@ -501,7 +542,7 @@ def _validate_args(args, parser): return # Validate gecko mode doesn't use non-wall mode - if args.format == "gecko" and args.mode != "wall": + if args.format == "gecko" and getattr(args, 'mode', 'wall') != "wall": parser.error( "--mode option is incompatible with --gecko. " "Gecko format automatically includes both GIL-holding and CPU status analysis." 
@@ -509,7 +550,7 @@ def _validate_args(args, parser): # Validate --opcodes is only used with compatible formats opcodes_compatible_formats = ("live", "gecko", "flamegraph", "heatmap") - if args.opcodes and args.format not in opcodes_compatible_formats: + if getattr(args, 'opcodes', False) and args.format not in opcodes_compatible_formats: parser.error( f"--opcodes is only compatible with {', '.join('--' + f for f in opcodes_compatible_formats)}." ) @@ -621,6 +662,30 @@ def main(): _add_format_options(attach_parser) _add_pstats_options(attach_parser) + # === REPLAY COMMAND === + replay_parser = subparsers.add_parser( + "replay", + help="Replay a binary profile and convert to another format", + formatter_class=CustomFormatter, + description="""Replay a binary profile file and convert to another format + +Examples: + # Convert binary to flamegraph + `python -m profiling.sampling replay --flamegraph -o output.html profile.bin` + + # Convert binary to pstats and print to stdout + `python -m profiling.sampling replay profile.bin` + + # Convert binary to gecko format + `python -m profiling.sampling replay --gecko -o profile.json profile.bin`""", + ) + replay_parser.add_argument( + "input_file", + help="Binary profile file to replay", + ) + _add_format_options(replay_parser, include_compression=False) + _add_pstats_options(replay_parser) + # Parse arguments args = parser.parse_args() @@ -631,6 +696,7 @@ def main(): command_handlers = { "run": _handle_run, "attach": _handle_attach, + "replay": _handle_replay, } # Execute the appropriate command @@ -660,8 +726,17 @@ def _handle_attach(args): mode != PROFILING_MODE_WALL if mode != PROFILING_MODE_ALL else False ) + # For binary format, determine output file before creating collector + output_file = None + if args.format == "binary": + output_file = args.outfile or _generate_output_filename(args.format, args.pid) + # Create the appropriate collector - collector = _create_collector(args.format, args.interval, skip_idle, 
args.opcodes) + collector = _create_collector( + args.format, args.interval, skip_idle, args.opcodes, + output_file=output_file, + compression=getattr(args, 'compression', 'auto') + ) # Sample the process collector = sample( @@ -731,8 +806,17 @@ def _handle_run(args): mode != PROFILING_MODE_WALL if mode != PROFILING_MODE_ALL else False ) + # For binary format, determine output file before creating collector + output_file = None + if args.format == "binary": + output_file = args.outfile or _generate_output_filename(args.format, process.pid) + # Create the appropriate collector - collector = _create_collector(args.format, args.interval, skip_idle, args.opcodes) + collector = _create_collector( + args.format, args.interval, skip_idle, args.opcodes, + output_file=output_file, + compression=getattr(args, 'compression', 'auto') + ) # Profile the subprocess try: @@ -852,5 +936,58 @@ def _handle_live_run(args): process.wait() +def _handle_replay(args): + """Handle the 'replay' command - convert binary profile to another format.""" + import os + + # Check input file exists + if not os.path.exists(args.input_file): + sys.exit(f"Error: Input file not found: {args.input_file}") + + # Can't replay to binary format + if args.format == "binary": + sys.exit("Error: Cannot replay to binary format. 
Use a different output format.") + + with BinaryReader(args.input_file) as reader: + info = reader.get_info() + interval = info['sample_interval_us'] + + print(f"Replaying {info['sample_count']} samples from {args.input_file}") + print(f" Sample interval: {interval} us") + print(f" Compression: {'zstd' if info.get('compression_type', 0) == 1 else 'none'}") + + # Create appropriate collector + collector = _create_collector(args.format, interval, skip_idle=False) + + # Replay with progress bar + def progress_callback(current, total): + if total > 0: + pct = current / total + bar_width = 40 + filled = int(bar_width * pct) + bar = '█' * filled + '░' * (bar_width - filled) + print(f"\r [{bar}] {pct*100:5.1f}% ({current:,}/{total:,})", end="", flush=True) + + count = reader.replay_samples(collector, progress_callback) + print() # Newline after progress bar + + # Handle output similar to other formats + if args.format == "pstats": + if args.outfile: + collector.export(args.outfile) + else: + # Print to stdout with defaults applied + sort_choice = args.sort if args.sort is not None else "nsamples" + limit = args.limit if args.limit is not None else 15 + sort_mode = _sort_to_mode(sort_choice) + collector.print_stats(sort_mode, limit, not args.no_summary, PROFILING_MODE_WALL) + else: + # Export to file + filename = args.outfile or _generate_output_filename(args.format, os.getpid()) + collector.export(filename) + + print(f"Replayed {count} samples") + + if __name__ == "__main__": main() From 1437629bef4db4a5ee1f9db9dcec95ab5464634e Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:49:48 +0000 Subject: [PATCH 10/17] Update collector interface for binary replay support Adds optional timestamp_us parameter to Collector.collect() method. During live profiling this is None and collectors use their own timing. During binary replay the stored timestamp is passed through, allowing collectors to reconstruct the original timing. 
Also fixes gecko_collector to use time.monotonic() instead of time.time() for consistency with other collectors. --- Lib/profiling/sampling/collector.py | 11 +++- Lib/profiling/sampling/gecko_collector.py | 18 +++++- .../sampling/live_collector/collector.py | 2 +- Lib/profiling/sampling/pstats_collector.py | 2 +- Lib/profiling/sampling/sample.py | 55 +++++++++++++++++++ Lib/profiling/sampling/stack_collector.py | 6 +- 6 files changed, 84 insertions(+), 10 deletions(-) diff --git a/Lib/profiling/sampling/collector.py b/Lib/profiling/sampling/collector.py index a1f6ec190f6556..6996bf99aef48a 100644 --- a/Lib/profiling/sampling/collector.py +++ b/Lib/profiling/sampling/collector.py @@ -44,8 +44,15 @@ def extract_lineno(location): class Collector(ABC): @abstractmethod - def collect(self, stack_frames): - """Collect profiling data from stack frames.""" + def collect(self, stack_frames, timestamp_us=None): + """Collect profiling data from stack frames. + + Args: + stack_frames: List of InterpreterInfo objects + timestamp_us: Optional timestamp in microseconds. If provided (from + binary replay), use this instead of current time. If None, + collectors should use time.monotonic() or similar. 
+ """ def collect_failed_sample(self): """Collect data about a failed sample attempt.""" diff --git a/Lib/profiling/sampling/gecko_collector.py b/Lib/profiling/sampling/gecko_collector.py index 608a15da483729..356d4609f4436b 100644 --- a/Lib/profiling/sampling/gecko_collector.py +++ b/Lib/profiling/sampling/gecko_collector.py @@ -66,7 +66,7 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, opcodes=False): self.sample_interval_usec = sample_interval_usec self.skip_idle = skip_idle self.opcodes_enabled = opcodes - self.start_time = time.time() * 1000 # milliseconds since epoch + self.start_time = time.monotonic() * 1000 # milliseconds since start # Global string table (shared across all threads) self.global_strings = ["(root)"] # Start with root @@ -103,6 +103,9 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, opcodes=False): # Opcode state tracking per thread: tid -> (opcode, lineno, col_offset, funcname, filename, start_time) self.opcode_state = {} + # For binary replay: track base timestamp (first sample's timestamp) + self._replay_base_timestamp_us = None + def _track_state_transition(self, tid, condition, active_dict, inactive_dict, active_name, inactive_name, category, current_time): """Track binary state transitions and emit markers. 
@@ -138,9 +141,18 @@ def _track_state_transition(self, tid, condition, active_dict, inactive_dict, self._add_marker(tid, active_name, active_dict.pop(tid), current_time, category) - def collect(self, stack_frames): + def collect(self, stack_frames, timestamp_us=None): """Collect a sample from stack frames.""" - current_time = (time.time() * 1000) - self.start_time + if timestamp_us is not None: + # Use provided timestamp (from binary replay) + # Track first timestamp as base for relative time calculation + if self._replay_base_timestamp_us is None: + self._replay_base_timestamp_us = timestamp_us + # Convert to milliseconds relative to first sample + current_time = (timestamp_us - self._replay_base_timestamp_us) / 1000 + else: + # Live sampling - use monotonic clock + current_time = (time.monotonic() * 1000) - self.start_time # Update interval calculation if self.sample_count > 0 and self.last_sample_time > 0: diff --git a/Lib/profiling/sampling/live_collector/collector.py b/Lib/profiling/sampling/live_collector/collector.py index 28af2e9744545a..dcb9fcabe32779 100644 --- a/Lib/profiling/sampling/live_collector/collector.py +++ b/Lib/profiling/sampling/live_collector/collector.py @@ -348,7 +348,7 @@ def collect_failed_sample(self): self.failed_samples += 1 self.total_samples += 1 - def collect(self, stack_frames): + def collect(self, stack_frames, timestamp_us=None): """Collect and display profiling data.""" if self.start_time is None: self.start_time = time.perf_counter() diff --git a/Lib/profiling/sampling/pstats_collector.py b/Lib/profiling/sampling/pstats_collector.py index 7c154e25828a8f..eb79df1dc93dba 100644 --- a/Lib/profiling/sampling/pstats_collector.py +++ b/Lib/profiling/sampling/pstats_collector.py @@ -51,7 +51,7 @@ def _process_frames(self, frames): self.callers[callee][caller] += 1 - def collect(self, stack_frames): + def collect(self, stack_frames, timestamp_us=None): if stack_frames and hasattr(stack_frames[0], "awaited_by"): # Async frame 
processing for frames, thread_id, task_id in self._iter_async_frames(stack_frames): diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py index 294ec3003fc6bc..b18017c6bb95b3 100644 --- a/Lib/profiling/sampling/sample.py +++ b/Lib/profiling/sampling/sample.py @@ -12,6 +12,7 @@ from .stack_collector import CollapsedStackCollector, FlamegraphCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .binary_collector import BinaryCollector from .constants import ( PROFILING_MODE_WALL, PROFILING_MODE_CPU, @@ -137,6 +138,10 @@ def sample(self, collector, duration_sec=10, *, async_aware=False): if self.collect_stats: self._print_unwinder_stats() + # Print binary I/O stats if using binary collector + if isinstance(collector, BinaryCollector): + self._print_binary_stats(collector) + # Pass stats to flamegraph collector if it's the right type if hasattr(collector, 'set_stats'): collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode) @@ -278,6 +283,56 @@ def _print_unwinder_stats(self): if stale_invalidations > 0: print(f" {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}") + def _print_binary_stats(self, collector): + """Print binary I/O encoding statistics.""" + try: + stats = collector.get_stats() + except (ValueError, RuntimeError): + return # Collector closed or stats unavailable + + print(f" {ANSIColors.CYAN}Binary Encoding:{ANSIColors.RESET}") + + # Record type counts + repeat_records = stats.get('repeat_records', 0) + repeat_samples = stats.get('repeat_samples', 0) + full_records = stats.get('full_records', 0) + suffix_records = stats.get('suffix_records', 0) + pop_push_records = stats.get('pop_push_records', 0) + total_records = stats.get('total_records', 0) + + if total_records > 0: + repeat_pct = repeat_records / total_records * 100 + full_pct = full_records / total_records * 100 + suffix_pct 
= suffix_records / total_records * 100 + pop_push_pct = pop_push_records / total_records * 100 + else: + repeat_pct = full_pct = suffix_pct = pop_push_pct = 0 + + print(f" Records: {total_records:,}") + print(f" RLE repeat: {repeat_records:,} ({ANSIColors.GREEN}{repeat_pct:.1f}%{ANSIColors.RESET}) [{repeat_samples:,} samples]") + print(f" Full stack: {full_records:,} ({full_pct:.1f}%)") + print(f" Suffix match: {suffix_records:,} ({suffix_pct:.1f}%)") + print(f" Pop-push: {pop_push_records:,} ({pop_push_pct:.1f}%)") + + # Frame efficiency + frames_written = stats.get('total_frames_written', 0) + frames_saved = stats.get('frames_saved', 0) + compression_pct = stats.get('frame_compression_pct', 0) + + print(f" {ANSIColors.CYAN}Frame Efficiency:{ANSIColors.RESET}") + print(f" Frames written: {frames_written:,}") + print(f" Frames saved: {frames_saved:,} ({ANSIColors.GREEN}{compression_pct:.1f}%{ANSIColors.RESET})") + + # Bytes written + bytes_written = stats.get('bytes_written', 0) + if bytes_written >= 1024 * 1024: + bytes_str = f"{bytes_written / (1024 * 1024):.1f} MB" + elif bytes_written >= 1024: + bytes_str = f"{bytes_written / 1024:.1f} KB" + else: + bytes_str = f"{bytes_written} B" + print(f" Bytes (pre-zstd): {bytes_str}") + def sample( pid, diff --git a/Lib/profiling/sampling/stack_collector.py b/Lib/profiling/sampling/stack_collector.py index e437facd8bb94b..8e75234ed5251e 100644 --- a/Lib/profiling/sampling/stack_collector.py +++ b/Lib/profiling/sampling/stack_collector.py @@ -18,7 +18,7 @@ def __init__(self, sample_interval_usec, *, skip_idle=False): self.sample_interval_usec = sample_interval_usec self.skip_idle = skip_idle - def collect(self, stack_frames, skip_idle=False): + def collect(self, stack_frames, timestamp_us=None, skip_idle=False): if stack_frames and hasattr(stack_frames[0], "awaited_by"): # Async-aware mode: process async task frames for frames, thread_id, task_id in self._iter_async_frames(stack_frames): @@ -96,7 +96,7 @@ def 
__init__(self, *args, **kwargs): # Per-thread statistics self.per_thread_stats = {} # {thread_id: {has_gil, on_cpu, gil_requested, unknown, has_exception, total, gc_samples}} - def collect(self, stack_frames, skip_idle=False): + def collect(self, stack_frames, timestamp_us=None, skip_idle=False): """Override to track thread status statistics before processing frames.""" # Increment sample count once per sample self._sample_count += 1 @@ -128,7 +128,7 @@ def collect(self, stack_frames, skip_idle=False): self.per_thread_stats[thread_id][key] += value # Call parent collect to process frames - super().collect(stack_frames, skip_idle=skip_idle) + super().collect(stack_frames, timestamp_us=timestamp_us, skip_idle=skip_idle) def set_stats(self, sample_interval_usec, duration_sec, sample_rate, error_rate=None, missed_samples=None, mode=None): From e75513b6b982eeb37b6298bd3cc6dbcc52c28e2b Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:50:04 +0000 Subject: [PATCH 11/17] Add tests for binary format Tests cover the full write/read cycle, delta encoding (RLE, suffix, pop-push), compression modes, edge cases (empty files, deep stacks, many threads), and replay through different collectors. The mock-based tests verify encoding behavior without needing actual profiling, while integration tests exercise the complete pipeline. 
--- .../test_binary_format.py | 1021 +++++++++++++++++ 1 file changed, 1021 insertions(+) create mode 100644 Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py new file mode 100644 index 00000000000000..43b63f0bf4cef4 --- /dev/null +++ b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py @@ -0,0 +1,1021 @@ +"""Tests for binary format round-trip functionality.""" + +import os +import random +import tempfile +import unittest +from collections import defaultdict + +try: + import _remote_debugging + from _remote_debugging import ( + InterpreterInfo, + ThreadInfo, + FrameInfo, + LocationInfo, + THREAD_STATUS_HAS_GIL, + THREAD_STATUS_ON_CPU, + THREAD_STATUS_UNKNOWN, + THREAD_STATUS_GIL_REQUESTED, + THREAD_STATUS_HAS_EXCEPTION, + ) + from profiling.sampling.binary_collector import ( + BinaryCollector, + COMPRESSION_NONE, + COMPRESSION_ZSTD, + ) + from profiling.sampling.binary_reader import BinaryReader + + ZSTD_AVAILABLE = _remote_debugging.zstd_available() +except ImportError: + raise unittest.SkipTest( + "Test only runs when _remote_debugging is available" + ) + + +def make_frame(filename, lineno, funcname): + """Create a FrameInfo struct sequence.""" + location = LocationInfo((lineno, lineno, -1, -1)) + return FrameInfo((filename, location, funcname, None)) + + +def make_thread(thread_id, frames, status=0): + """Create a ThreadInfo struct sequence.""" + return ThreadInfo((thread_id, status, frames)) + + +def make_interpreter(interp_id, threads): + """Create an InterpreterInfo struct sequence.""" + return InterpreterInfo((interp_id, threads)) + + +def extract_lineno(location): + """Extract line number from location (tuple or int or None).""" + if location is None: + return 0 # Treat None as 0 + if isinstance(location, tuple): + return location[0] if location[0] is not None else 0 + 
return location + + +class RawCollector: + """Collector that captures all raw data grouped by thread.""" + + def __init__(self): + # Key: (interpreter_id, thread_id) -> list of samples for that thread + self.by_thread = defaultdict(list) + self.total_count = 0 + + def collect(self, stack_frames, timestamp_us=None): + """Capture the raw sample data.""" + for interp in stack_frames: + for thread in interp.threads: + frames = [] + for frame in thread.frame_info: + frames.append( + { + "filename": frame.filename, + "funcname": frame.funcname, + "lineno": extract_lineno(frame.location), + } + ) + key = (interp.interpreter_id, thread.thread_id) + self.by_thread[key].append( + { + "status": thread.status, + "frames": frames, + } + ) + self.total_count += 1 + + def export(self, filename): + pass + + +def samples_to_by_thread(samples): + """Convert input samples to by-thread format for comparison.""" + by_thread = defaultdict(list) + for sample in samples: + for interp in sample: + for thread in interp.threads: + frames = [] + for frame in thread.frame_info: + frames.append( + { + "filename": frame.filename, + "funcname": frame.funcname, + "lineno": extract_lineno(frame.location), + } + ) + key = (interp.interpreter_id, thread.thread_id) + by_thread[key].append( + { + "status": thread.status, + "frames": frames, + } + ) + return by_thread + + +class BinaryFormatTestBase(unittest.TestCase): + """Base class with common setup/teardown for binary format tests.""" + + def setUp(self): + self.temp_files = [] + + def tearDown(self): + for f in self.temp_files: + if os.path.exists(f): + os.unlink(f) + + def create_binary_file(self, samples, interval=1000, compression="none"): + """Create a test binary file and track it for cleanup.""" + with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f: + filename = f.name + self.temp_files.append(filename) + + collector = BinaryCollector( + filename, interval, compression=compression + ) + for sample in samples: + 
collector.collect(sample) + collector.export(None) + return filename + + def roundtrip(self, samples, interval=1000, compression="none"): + """Write samples to binary and read back.""" + filename = self.create_binary_file(samples, interval, compression) + collector = RawCollector() + with BinaryReader(filename) as reader: + count = reader.replay_samples(collector) + return collector, count + + def assert_samples_equal(self, expected_samples, collector): + """Assert that roundtripped samples match input exactly, per-thread.""" + expected = samples_to_by_thread(expected_samples) + + # Same threads present + self.assertEqual( + set(expected.keys()), + set(collector.by_thread.keys()), + "Thread set mismatch", + ) + + # For each thread, samples match in order + for key in expected: + exp_samples = expected[key] + act_samples = collector.by_thread[key] + interp_id, thread_id = key + + self.assertEqual( + len(exp_samples), + len(act_samples), + f"Thread ({interp_id}, {thread_id}): sample count mismatch " + f"(expected {len(exp_samples)}, got {len(act_samples)})", + ) + + for i, (exp, act) in enumerate(zip(exp_samples, act_samples)): + self.assertEqual( + exp["status"], + act["status"], + f"Thread ({interp_id}, {thread_id}), sample {i}: " + f"status mismatch (expected {exp['status']}, got {act['status']})", + ) + + self.assertEqual( + len(exp["frames"]), + len(act["frames"]), + f"Thread ({interp_id}, {thread_id}), sample {i}: " + f"frame count mismatch", + ) + + for j, (exp_frame, act_frame) in enumerate( + zip(exp["frames"], act["frames"]) + ): + self.assertEqual( + exp_frame["filename"], + act_frame["filename"], + f"Thread ({interp_id}, {thread_id}), sample {i}, " + f"frame {j}: filename mismatch", + ) + self.assertEqual( + exp_frame["funcname"], + act_frame["funcname"], + f"Thread ({interp_id}, {thread_id}), sample {i}, " + f"frame {j}: funcname mismatch", + ) + self.assertEqual( + exp_frame["lineno"], + act_frame["lineno"], + f"Thread ({interp_id}, {thread_id}), sample 
{i}, " + f"frame {j}: lineno mismatch " + f"(expected {exp_frame['lineno']}, got {act_frame['lineno']})", + ) + + +class TestBinaryRoundTrip(BinaryFormatTestBase): + """Tests for exact binary format round-trip.""" + + def test_single_sample_single_frame(self): + """Single sample with one frame roundtrips exactly.""" + samples = [ + [ + make_interpreter( + 0, + [ + make_thread( + 12345, [make_frame("test.py", 42, "myfunc")] + ) + ], + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_single_sample_multi_frame(self): + """Single sample with call stack roundtrips exactly.""" + frames = [ + make_frame("inner.py", 10, "inner"), + make_frame("middle.py", 20, "middle"), + make_frame("outer.py", 30, "outer"), + ] + samples = [[make_interpreter(0, [make_thread(100, frames)])]] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_multiple_samples_same_stack(self): + """Multiple identical samples roundtrip exactly (tests RLE).""" + frame = make_frame("hot.py", 99, "hot_func") + samples = [ + [make_interpreter(0, [make_thread(1, [frame])])] + for _ in range(100) + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 100) + self.assert_samples_equal(samples, collector) + + def test_multiple_samples_varying_stacks(self): + """Multiple samples with varying stacks roundtrip exactly.""" + samples = [] + for i in range(20): + depth = i % 5 + 1 + frames = [ + make_frame(f"f{j}.py", j * 10 + i, f"func{j}") + for j in range(depth) + ] + samples.append([make_interpreter(0, [make_thread(1, frames)])]) + collector, count = self.roundtrip(samples) + self.assertEqual(count, 20) + self.assert_samples_equal(samples, collector) + + def test_thread_ids_preserved(self): + """Thread IDs are preserved exactly.""" + thread_ids = [1, 12345, 0x7FFF12345678, 999999] + samples = [] + for tid in thread_ids: + 
samples.append( + [ + make_interpreter( + 0, [make_thread(tid, [make_frame("t.py", 10, "f")])] + ) + ] + ) + collector, count = self.roundtrip(samples) + self.assertEqual(count, len(thread_ids)) + self.assert_samples_equal(samples, collector) + + def test_interpreter_ids_preserved(self): + """Interpreter IDs are preserved exactly.""" + interp_ids = [0, 1, 5, 100] + samples = [] + for iid in interp_ids: + samples.append( + [ + make_interpreter( + iid, [make_thread(1, [make_frame("i.py", 10, "f")])] + ) + ] + ) + collector, count = self.roundtrip(samples) + self.assertEqual(count, len(interp_ids)) + self.assert_samples_equal(samples, collector) + + def test_status_flags_preserved(self): + """All thread status flags are preserved exactly.""" + statuses = [ + 0, + THREAD_STATUS_HAS_GIL, + THREAD_STATUS_ON_CPU, + THREAD_STATUS_UNKNOWN, + THREAD_STATUS_GIL_REQUESTED, + THREAD_STATUS_HAS_EXCEPTION, + THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU, + THREAD_STATUS_HAS_GIL | THREAD_STATUS_HAS_EXCEPTION, + THREAD_STATUS_HAS_GIL + | THREAD_STATUS_ON_CPU + | THREAD_STATUS_GIL_REQUESTED, + ] + samples = [] + for i, status in enumerate(statuses): + samples.append( + [ + make_interpreter( + 0, + [ + make_thread( + 1, [make_frame("s.py", 10 + i, "f")], status + ) + ], + ) + ] + ) + collector, count = self.roundtrip(samples) + self.assertEqual(count, len(statuses)) + self.assert_samples_equal(samples, collector) + + def test_multiple_threads_per_sample(self): + """Multiple threads in one sample roundtrip exactly.""" + threads = [ + make_thread( + 1, [make_frame("t1.py", 10, "t1")], THREAD_STATUS_HAS_GIL + ), + make_thread( + 2, [make_frame("t2.py", 20, "t2")], THREAD_STATUS_ON_CPU + ), + make_thread(3, [make_frame("t3.py", 30, "t3")], 0), + ] + samples = [[make_interpreter(0, threads)] for _ in range(10)] + collector, count = self.roundtrip(samples) + # 10 samples × 3 threads = 30 thread-samples + self.assertEqual(count, 30) + self.assert_samples_equal(samples, collector) + + def 
test_multiple_interpreters_per_sample(self): + """Multiple interpreters in one sample roundtrip exactly.""" + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame("i0.py", 10, "i0")])] + ), + make_interpreter( + 1, [make_thread(2, [make_frame("i1.py", 20, "i1")])] + ), + ] + for _ in range(5) + ] + collector, count = self.roundtrip(samples) + # 5 samples × 2 interpreters × 1 thread = 10 thread-samples + self.assertEqual(count, 10) + self.assert_samples_equal(samples, collector) + + def test_same_thread_id_different_interpreters(self): + """Same thread_id in different interpreters must be tracked separately.""" + # This test catches bugs where thread state is keyed only by thread_id + # without considering interpreter_id + samples = [] + # Interleave samples from interpreter 0 and 1, both using thread_id=1 + for i in range(20): + interp_id = i % 2 # Alternate between interpreter 0 and 1 + frame = make_frame( + f"interp{interp_id}.py", 10 + i, f"func{interp_id}" + ) + samples.append( + [make_interpreter(interp_id, [make_thread(1, [frame])])] + ) + + collector, count = self.roundtrip(samples) + self.assertEqual(count, 20) + self.assert_samples_equal(samples, collector) + + # Verify both interpreters are present + keys = set(collector.by_thread.keys()) + self.assertIn((0, 1), keys) # interpreter 0, thread 1 + self.assertIn((1, 1), keys) # interpreter 1, thread 1 + + # Verify each interpreter got 10 samples + self.assertEqual(len(collector.by_thread[(0, 1)]), 10) + self.assertEqual(len(collector.by_thread[(1, 1)]), 10) + + # Verify the samples are in the right order for each interpreter + for i, sample in enumerate(collector.by_thread[(0, 1)]): + expected_lineno = 10 + i * 2 # 10, 12, 14, ... + self.assertEqual(sample["frames"][0]["lineno"], expected_lineno) + self.assertEqual(sample["frames"][0]["filename"], "interp0.py") + + for i, sample in enumerate(collector.by_thread[(1, 1)]): + expected_lineno = 11 + i * 2 # 11, 13, 15, ... 
+ self.assertEqual(sample["frames"][0]["lineno"], expected_lineno) + self.assertEqual(sample["frames"][0]["filename"], "interp1.py") + + def test_deep_call_stack(self): + """Deep call stack roundtrips exactly.""" + depth = 100 + frames = [ + make_frame(f"f{i}.py", i + 1, f"func{i}") for i in range(depth) + ] + samples = [[make_interpreter(0, [make_thread(1, frames)])]] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_line_numbers_preserved(self): + """Various line numbers are preserved exactly.""" + linenos = [1, 100, 1000, 65535, 100000] + samples = [] + for lineno in linenos: + samples.append( + [ + make_interpreter( + 0, [make_thread(1, [make_frame("l.py", lineno, "f")])] + ) + ] + ) + collector, count = self.roundtrip(samples) + self.assertEqual(count, len(linenos)) + self.assert_samples_equal(samples, collector) + + @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available") + def test_zstd_compression_roundtrip(self): + """Zstd compressed data roundtrips exactly.""" + samples = [] + for i in range(200): + frames = [ + make_frame(f"z{j}.py", j * 10 + i + 1, f"zfunc{j}") + for j in range(3) + ] + samples.append([make_interpreter(0, [make_thread(1, frames)])]) + collector, count = self.roundtrip(samples, compression="zstd") + self.assertEqual(count, 200) + self.assert_samples_equal(samples, collector) + + def test_sample_interval_preserved(self): + """Sample interval is preserved in file metadata.""" + intervals = [100, 500, 1000, 5000, 10000] + for interval in intervals: + with self.subTest(interval=interval): + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame("i.py", 1, "f")])] + ) + ] + ] + filename = self.create_binary_file(samples, interval=interval) + with BinaryReader(filename) as reader: + info = reader.get_info() + self.assertEqual(info["sample_interval_us"], interval) + + def test_threads_interleaved_samples(self): + """Multiple threads 
with interleaved varying samples.""" + samples = [] + for i in range(30): + threads = [ + make_thread( + 1, + [make_frame("t1.py", 10 + i, "t1")], + THREAD_STATUS_HAS_GIL if i % 2 == 0 else 0, + ), + make_thread( + 2, + [make_frame("t2.py", 20 + i, "t2")], + THREAD_STATUS_ON_CPU if i % 3 == 0 else 0, + ), + ] + samples.append([make_interpreter(0, threads)]) + collector, count = self.roundtrip(samples) + self.assertEqual(count, 60) + self.assert_samples_equal(samples, collector) + + +class TestBinaryEdgeCases(BinaryFormatTestBase): + """Tests for edge cases in binary format.""" + + def test_unicode_filenames(self): + """Unicode filenames roundtrip exactly.""" + filenames = [ + "/путь/файл.py", + "/路径/文件.py", + "/パス/ファイル.py", + "/chemin/café.py", + ] + for fname in filenames: + with self.subTest(filename=fname): + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame(fname, 1, "func")])] + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_unicode_funcnames(self): + """Unicode function names roundtrip exactly.""" + funcnames = [ + "функция", + "函数", + "関数", + "función", + ] + for funcname in funcnames: + with self.subTest(funcname=funcname): + samples = [ + [ + make_interpreter( + 0, + [ + make_thread( + 1, [make_frame("test.py", 1, funcname)] + ) + ], + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_special_char_filenames(self): + """Filenames with special characters roundtrip exactly.""" + filenames = [ + "/path/with spaces/file.py", + "/path/with\ttab/file.py", + "/path/with'quote/file.py", + '/path/with"double/file.py', + "/path/with\\backslash/file.py", + ] + for fname in filenames: + with self.subTest(filename=fname): + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame(fname, 1, "func")])] + ) + ] + ] + collector, count = 
self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_special_funcnames(self): + """Function names with special characters roundtrip exactly.""" + funcnames = [ + "", + "", + "", + "", + "__init__", + "func.inner", + ] + for funcname in funcnames: + with self.subTest(funcname=funcname): + samples = [ + [ + make_interpreter( + 0, + [ + make_thread( + 1, [make_frame("test.py", 1, funcname)] + ) + ], + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_long_filename(self): + """Long filename roundtrips exactly.""" + long_file = "/very/long/path/" + "sub/" * 50 + "file.py" + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame(long_file, 1, "func")])] + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_long_funcname(self): + """Long function name roundtrips exactly.""" + long_func = "very_long_function_name_" + "x" * 200 + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame("test.py", 1, long_func)])] + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_empty_funcname(self): + """Empty function name roundtrips exactly.""" + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame("test.py", 1, "")])] + ) + ] + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available") + def test_large_sample_count(self): + """Large number of samples roundtrips exactly.""" + num = 5000 + samples = [ + [ + make_interpreter( + 0, + [ + make_thread( + 1, [make_frame("test.py", (i % 100) + 1, "func")] + ) + ], + ) + ] + for i in range(num) + ] + collector, count = 
self.roundtrip(samples, compression="zstd") + self.assertEqual(count, num) + self.assert_samples_equal(samples, collector) + + def test_context_manager_cleanup(self): + """Reader cleans up on context exit.""" + samples = [ + [ + make_interpreter( + 0, [make_thread(1, [make_frame("t.py", 1, "f")])] + ) + ] + ] + filename = self.create_binary_file(samples) + reader = BinaryReader(filename) + with reader: + collector = RawCollector() + count = reader.replay_samples(collector) + self.assertEqual(count, 1) + with self.assertRaises(RuntimeError): + reader.replay_samples(collector) + + def test_invalid_file_path(self): + """Invalid file path raises appropriate error.""" + with self.assertRaises((FileNotFoundError, OSError, ValueError)): + with BinaryReader("/nonexistent/path/file.bin") as reader: + reader.replay_samples(RawCollector()) + + +class TestBinaryEncodings(BinaryFormatTestBase): + """Tests specifically targeting different stack encodings.""" + + def test_stack_full_encoding(self): + """First sample uses STACK_FULL encoding and roundtrips.""" + frames = [make_frame(f"f{i}.py", i + 1, f"func{i}") for i in range(5)] + samples = [[make_interpreter(0, [make_thread(1, frames)])]] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1) + self.assert_samples_equal(samples, collector) + + def test_stack_repeat_encoding(self): + """Identical consecutive samples use RLE and roundtrip.""" + frame = make_frame("repeat.py", 42, "repeat_func") + samples = [ + [make_interpreter(0, [make_thread(1, [frame])])] + for _ in range(1000) + ] + collector, count = self.roundtrip(samples) + self.assertEqual(count, 1000) + self.assert_samples_equal(samples, collector) + + def test_stack_suffix_encoding(self): + """Samples sharing suffix use STACK_SUFFIX and roundtrip.""" + samples = [] + for i in range(10): + frames = [make_frame(f"new{i}.py", i + 1, f"new{i}")] + frames.extend( + [ + make_frame(f"shared{j}.py", j + 1, f"shared{j}") + for j in range(5) + ] + ) + 
samples.append([make_interpreter(0, [make_thread(1, frames)])]) + collector, count = self.roundtrip(samples) + self.assertEqual(count, 10) + self.assert_samples_equal(samples, collector) + + def test_stack_pop_push_encoding(self): + """Samples with pop+push pattern roundtrip.""" + samples = [] + base_frames = [make_frame("base.py", 10, "base")] + + # Call deeper + samples.append([make_interpreter(0, [make_thread(1, base_frames)])]) + samples.append( + [ + make_interpreter( + 0, + [ + make_thread( + 1, + [make_frame("call1.py", 20, "call1")] + + base_frames, + ) + ], + ) + ] + ) + samples.append( + [ + make_interpreter( + 0, + [ + make_thread( + 1, + [ + make_frame("call2.py", 30, "call2"), + make_frame("call1.py", 20, "call1"), + ] + + base_frames, + ) + ], + ) + ] + ) + # Return + samples.append( + [ + make_interpreter( + 0, + [ + make_thread( + 1, + [make_frame("call1.py", 25, "call1")] + + base_frames, + ) + ], + ) + ] + ) + samples.append([make_interpreter(0, [make_thread(1, base_frames)])]) + + collector, count = self.roundtrip(samples) + self.assertEqual(count, 5) + self.assert_samples_equal(samples, collector) + + def test_mixed_encodings(self): + """Mix of different encoding patterns roundtrips.""" + samples = [] + # Some repeated samples (RLE) + frame1 = make_frame("hot.py", 1, "hot") + for _ in range(20): + samples.append([make_interpreter(0, [make_thread(1, [frame1])])]) + # Some varying samples + for i in range(20): + frames = [make_frame(f"vary{i}.py", i + 1, f"vary{i}")] + samples.append([make_interpreter(0, [make_thread(1, frames)])]) + # More repeated + for _ in range(20): + samples.append([make_interpreter(0, [make_thread(1, [frame1])])]) + + collector, count = self.roundtrip(samples) + self.assertEqual(count, 60) + self.assert_samples_equal(samples, collector) + + def test_alternating_threads_status_changes(self): + """Alternating thread status changes roundtrip correctly.""" + samples = [] + for i in range(50): + status1 = THREAD_STATUS_HAS_GIL 
if i % 2 == 0 else 0 + status2 = ( + THREAD_STATUS_ON_CPU if i % 3 == 0 else THREAD_STATUS_HAS_GIL + ) + threads = [ + make_thread(1, [make_frame("t1.py", 10, "t1")], status1), + make_thread(2, [make_frame("t2.py", 20, "t2")], status2), + ] + samples.append([make_interpreter(0, threads)]) + collector, count = self.roundtrip(samples) + self.assertEqual(count, 100) + self.assert_samples_equal(samples, collector) + + +class TestBinaryStress(BinaryFormatTestBase): + """Randomized stress tests for binary format.""" + + @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available") + def test_random_samples_stress(self): + """Stress test with random samples - exercises hash table resizing.""" + random.seed(42) # Reproducible + + # Large pools to force hash table resizing (exceeds initial 8192/4096 sizes) + filenames = [f"file{i}.py" for i in range(200)] + funcnames = [f"func{i}" for i in range(300)] + thread_ids = list(range(1, 50)) + interp_ids = list(range(10)) + statuses = [ + 0, + THREAD_STATUS_HAS_GIL, + THREAD_STATUS_ON_CPU, + THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU, + THREAD_STATUS_HAS_EXCEPTION, + ] + + samples = [] + for _ in range(1000): + num_interps = random.randint(1, 3) + interps = [] + for _ in range(num_interps): + iid = random.choice(interp_ids) + num_threads = random.randint(1, 5) + threads = [] + for _ in range(num_threads): + tid = random.choice(thread_ids) + status = random.choice(statuses) + depth = random.randint(1, 15) + frames = [] + for _ in range(depth): + fname = random.choice(filenames) + func = random.choice(funcnames) + # Wide line number range to create many unique frames + lineno = random.randint(1, 5000) + frames.append(make_frame(fname, lineno, func)) + threads.append(make_thread(tid, frames, status)) + interps.append(make_interpreter(iid, threads)) + samples.append(interps) + + collector, count = self.roundtrip(samples, compression="zstd") + self.assertGreater(count, 0) + self.assert_samples_equal(samples, collector) + + 
def test_rle_stress(self): + """Stress test RLE encoding with identical samples.""" + random.seed(123) + + # Create a few distinct stacks + stacks = [] + for i in range(5): + depth = random.randint(1, 8) + frames = [ + make_frame(f"rle{j}.py", j * 10, f"rle{j}") + for j in range(depth) + ] + stacks.append(frames) + + # Generate samples with repeated stacks (should trigger RLE) + samples = [] + for _ in range(100): + stack = random.choice(stacks) + repeat = random.randint(1, 50) + for _ in range(repeat): + samples.append([make_interpreter(0, [make_thread(1, stack)])]) + + collector, count = self.roundtrip(samples) + self.assertEqual(count, len(samples)) + self.assert_samples_equal(samples, collector) + + @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available") + def test_multi_thread_stress(self): + """Stress test with many threads and interleaved samples.""" + random.seed(456) + + thread_ids = list(range(1, 20)) + samples = [] + + for i in range(300): + # Randomly select 1-5 threads for this sample + num_threads = random.randint(1, 5) + selected = random.sample(thread_ids, num_threads) + threads = [] + for tid in selected: + status = random.choice( + [0, THREAD_STATUS_HAS_GIL, THREAD_STATUS_ON_CPU] + ) + depth = random.randint(1, 5) + frames = [ + make_frame(f"mt{tid}_{j}.py", i + j, f"f{j}") + for j in range(depth) + ] + threads.append(make_thread(tid, frames, status)) + samples.append([make_interpreter(0, threads)]) + + collector, count = self.roundtrip(samples, compression="zstd") + self.assertGreater(count, 0) + self.assert_samples_equal(samples, collector) + + def test_encoding_transitions_stress(self): + """Stress test stack encoding transitions.""" + random.seed(789) + + base_frames = [ + make_frame(f"base{i}.py", i, f"base{i}") for i in range(5) + ] + samples = [] + + for i in range(200): + choice = random.randint(0, 4) + if choice == 0: + # Full new stack + depth = random.randint(1, 8) + frames = [ + make_frame(f"new{i}_{j}.py", j, f"new{j}") 
+ for j in range(depth) + ] + elif choice == 1: + # Repeat previous (will use RLE if identical) + frames = base_frames[: random.randint(1, 5)] + elif choice == 2: + # Add frames on top (suffix encoding) + extra = random.randint(1, 3) + frames = [ + make_frame(f"top{i}_{j}.py", j, f"top{j}") + for j in range(extra) + ] + frames.extend(base_frames[: random.randint(2, 4)]) + else: + # Pop and push (pop-push encoding) + keep = random.randint(1, 3) + push = random.randint(0, 2) + frames = [ + make_frame(f"push{i}_{j}.py", j, f"push{j}") + for j in range(push) + ] + frames.extend(base_frames[:keep]) + + samples.append([make_interpreter(0, [make_thread(1, frames)])]) + + collector, count = self.roundtrip(samples) + self.assertEqual(count, len(samples)) + self.assert_samples_equal(samples, collector) + + @unittest.skipUnless(ZSTD_AVAILABLE, "zstd compression not available") + def test_same_thread_id_multiple_interpreters_stress(self): + """Stress test: same thread_id across multiple interpreters with interleaved samples. + + This test catches bugs where thread state is keyed only by thread_id + without considering interpreter_id (both in writer and reader). 
+ """ + random.seed(999) + + # Multiple interpreters, each with overlapping thread_ids + interp_ids = [0, 1, 2, 3] + # Same thread_ids used across all interpreters + shared_thread_ids = [1, 2, 3] + + filenames = [f"file{i}.py" for i in range(10)] + funcnames = [f"func{i}" for i in range(15)] + statuses = [0, THREAD_STATUS_HAS_GIL, THREAD_STATUS_ON_CPU] + + samples = [] + for i in range(1000): + # Randomly pick an interpreter + iid = random.choice(interp_ids) + # Randomly pick 1-3 threads (from shared pool) + num_threads = random.randint(1, 3) + selected_tids = random.sample(shared_thread_ids, num_threads) + + threads = [] + for tid in selected_tids: + status = random.choice(statuses) + depth = random.randint(1, 6) + frames = [] + for d in range(depth): + # Include interpreter and thread info in frame data for verification + fname = f"i{iid}_t{tid}_{random.choice(filenames)}" + func = random.choice(funcnames) + lineno = i * 10 + d + 1 # Unique per sample + frames.append(make_frame(fname, lineno, func)) + threads.append(make_thread(tid, frames, status)) + + samples.append([make_interpreter(iid, threads)]) + + collector, count = self.roundtrip(samples, compression="zstd") + self.assertGreater(count, 0) + self.assert_samples_equal(samples, collector) + + # Verify that we have samples from multiple (interpreter, thread) combinations + # with the same thread_id + keys = set(collector.by_thread.keys()) + # Should have samples for same thread_id in different interpreters + for tid in shared_thread_ids: + interps_with_tid = [iid for (iid, t) in keys if t == tid] + self.assertGreater( + len(interps_with_tid), + 1, + f"Thread {tid} should appear in multiple interpreters", + ) + + +if __name__ == "__main__": + unittest.main() From a520ac124f0799008175b49e51f6e636ccbd5ed2 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:50:16 +0000 Subject: [PATCH 12/17] Add internal documentation for binary format Documents the file layout, encoding schemes, and 
design rationale. Covers header/footer structure, delta encoding types (repeat, suffix, pop-push), string/frame deduplication, and compression integration. Intended for developers working on the profiler implementation. --- InternalDocs/profiling_binary_format.md | 456 ++++++++++++++++++++++++ 1 file changed, 456 insertions(+) create mode 100644 InternalDocs/profiling_binary_format.md diff --git a/InternalDocs/profiling_binary_format.md b/InternalDocs/profiling_binary_format.md new file mode 100644 index 00000000000000..ff3daca2b57cb4 --- /dev/null +++ b/InternalDocs/profiling_binary_format.md @@ -0,0 +1,456 @@ +# Profiling Binary Format + +The profiling module includes a binary file format for storing sampling +profiler data. This document describes the format's structure and the +design decisions behind it. + +The implementation is in +[`Modules/_remote_debugging/binary_io.c`](../Modules/_remote_debugging/binary_io.c) +with declarations in +[`Modules/_remote_debugging/binary_io.h`](../Modules/_remote_debugging/binary_io.h). + +## Overview + +The sampling profiler can generate enormous amounts of data. A typical +profiling session sampling at 1000 Hz for 60 seconds produces 60,000 samples. +Each sample contains a full call stack, often 20-50 frames deep, and each +frame includes a filename, function name, and line number. In a text-based +format like collapsed stacks, this would mean repeating the same long file +paths and function names thousands of times. + +The binary format addresses this through two key strategies: + +1. **Deduplication**: Strings and frames are stored once in lookup tables, + then referenced by small integer indices. A 100-character file path that + appears in 50,000 samples is stored once, not 50,000 times. + +2. **Compact encoding**: Variable-length integers (varints) encode small + values in fewer bytes. Since most indices are small (under 128), they + typically need only one byte instead of four. 
+ +Together with optional zstd compression, these techniques reduce file sizes +by 10-50x compared to text formats while also enabling faster I/O. + +## File Layout + +The file consists of five sections: + +``` ++------------------+ Offset 0 +| Header | 64 bytes (fixed) ++------------------+ Offset 64 +| | +| Sample Data | Variable size (optionally compressed) +| | ++------------------+ string_table_offset +| String Table | Variable size ++------------------+ frame_table_offset +| Frame Table | Variable size ++------------------+ file_size - 32 +| Footer | 32 bytes (fixed) ++------------------+ file_size +``` + +The layout is designed for streaming writes during profiling. The profiler +cannot know in advance how many unique strings or frames will be encountered, +so these tables must be built incrementally and written at the end. + +The header comes first so readers can quickly validate the file and locate +the metadata tables. The sample data follows immediately, allowing the writer +to stream samples directly to disk (or through a compression stream) without +buffering the entire dataset in memory. + +The string and frame tables are placed after sample data because they grow +as new unique entries are discovered during profiling. By deferring their +output until finalization, the writer avoids the complexity of reserving +space or rewriting portions of the file. + +The footer at the end contains counts needed to allocate arrays before +parsing the tables. Placing it at a fixed offset from the end (rather than +at a variable offset recorded in the header) means readers can locate it +with a single seek to `file_size - 32`, without first reading the header. 
+ +## Header + +``` + Offset Size Type Description ++--------+------+---------+----------------------------------------+ +| 0 | 4 | uint32 | Magic number (0x54414348 = "TACH") | +| 4 | 4 | uint32 | Format version (currently 2) | +| 8 | 8 | uint64 | Start timestamp (microseconds) | +| 16 | 8 | uint64 | Sample interval (microseconds) | +| 24 | 4 | uint32 | Total sample count | +| 28 | 4 | uint32 | Thread count | +| 32 | 8 | uint64 | String table offset | +| 40 | 8 | uint64 | Frame table offset | +| 48 | 4 | uint32 | Compression type (0=none, 1=zstd) | +| 52 | 12 | bytes | Reserved (zero-filled) | ++--------+------+---------+----------------------------------------+ +``` + +The header is written as zeros initially, then overwritten with actual values +during finalization. This requires the output stream to be seekable, which +is acceptable since the format targets regular files rather than pipes or +network streams. + +## Sample Data + +Sample data begins at offset 64 and extends to `string_table_offset`. Samples +use delta compression to minimize redundancy when consecutive samples from the +same thread have identical or similar call stacks. 
+ +### Stack Encoding Types + +Each sample record begins with thread identification, then an encoding byte: + +| Code | Name | Description | +|------|------|-------------| +| 0x00 | REPEAT | RLE: identical stack repeated N times | +| 0x01 | FULL | Complete stack (first sample or no match) | +| 0x02 | SUFFIX | Shares N frames from bottom of previous stack | +| 0x03 | POP_PUSH | Remove M frames from top, add N new frames | + +### Record Formats + +**REPEAT (0x00) - Run-Length Encoded Identical Stacks:** +``` ++-----------------+-----------+----------------------------------------+ +| thread_id | 8 bytes | Thread identifier (uint64, fixed) | +| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) | +| encoding | 1 byte | 0x00 (REPEAT) | +| count | varint | Number of samples in this RLE group | +| samples | varies | Interleaved: [delta: varint, status: 1]| +| | | repeated count times | ++-----------------+-----------+----------------------------------------+ +``` +The stack is inherited from this thread's previous sample. Each sample in the +group gets its own timestamp delta and status byte, stored as interleaved pairs +(delta1, status1, delta2, status2, ...) rather than separate arrays. + +**FULL (0x01) - Complete Stack:** +``` ++-----------------+-----------+----------------------------------------+ +| thread_id | 8 bytes | Thread identifier (uint64, fixed) | +| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) | +| encoding | 1 byte | 0x01 (FULL) | +| timestamp_delta | varint | Microseconds since thread's last sample| +| status | 1 byte | Thread state flags | +| stack_depth | varint | Number of frames in call stack | +| frame_indices | varint[] | Array of frame table indices | ++-----------------+-----------+----------------------------------------+ +``` +Used for the first sample from a thread, or when delta encoding would not +provide savings. 
+ +**SUFFIX (0x02) - Shared Suffix Match:** +``` ++-----------------+-----------+----------------------------------------+ +| thread_id | 8 bytes | Thread identifier (uint64, fixed) | +| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) | +| encoding | 1 byte | 0x02 (SUFFIX) | +| timestamp_delta | varint | Microseconds since thread's last sample| +| status | 1 byte | Thread state flags | +| shared_count | varint | Frames shared from bottom of prev stack| +| new_count | varint | New frames at top of stack | +| new_frames | varint[] | Array of new_count frame indices | ++-----------------+-----------+----------------------------------------+ +``` +Used when a function call added frames to the top of the stack. The shared +frames from the previous stack are kept, and new frames are prepended. + +**POP_PUSH (0x03) - Pop and Push:** +``` ++-----------------+-----------+----------------------------------------+ +| thread_id | 8 bytes | Thread identifier (uint64, fixed) | +| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) | +| encoding | 1 byte | 0x03 (POP_PUSH) | +| timestamp_delta | varint | Microseconds since thread's last sample| +| status | 1 byte | Thread state flags | +| pop_count | varint | Frames to remove from top of prev stack| +| push_count | varint | New frames to add at top | +| new_frames | varint[] | Array of push_count frame indices | ++-----------------+-----------+----------------------------------------+ +``` +Used when the code path changed: some frames were popped (function returns) +and new frames were pushed (different function calls). + +### Compression Benefits + +This delta encoding provides massive savings for typical profiling workloads: + +- **CPU-bound code**: Hot loops produce many identical samples. RLE encoding + compresses 100 identical samples to just 2-3 bytes of overhead plus the + timestamp/status data. + +- **I/O-bound code**: Alternating between wait and work produces similar + stacks with small variations. 
SUFFIX encoding captures this efficiently. + +- **Call-heavy code**: Functions calling other functions share common stack + prefixes. POP_PUSH encoding only stores the changed frames. + +### Thread and Interpreter Identification + +Thread IDs are 64-bit values that can be large (memory addresses on some +platforms) and vary unpredictably. Using a fixed 8-byte encoding avoids +the overhead of varint encoding for large values and simplifies parsing +since the reader knows exactly where each field begins. + +The interpreter ID identifies which Python sub-interpreter the thread +belongs to, allowing analysis tools to separate activity across interpreters +in processes using multiple sub-interpreters. + +### Status Byte + +The status byte is a bitfield encoding thread state at sample time: + +| Bit | Flag | Meaning | +|-----|-----------------------|--------------------------------------------| +| 0 | THREAD_STATUS_HAS_GIL | Thread holds the GIL (Global Interpreter Lock) | +| 1 | THREAD_STATUS_ON_CPU | Thread is actively running on a CPU core | +| 2 | THREAD_STATUS_UNKNOWN | Thread state could not be determined | +| 3 | THREAD_STATUS_GIL_REQUESTED | Thread is waiting to acquire the GIL | +| 4 | THREAD_STATUS_HAS_EXCEPTION | Thread has a pending exception | + +Multiple flags can be set simultaneously (e.g., a thread can hold the GIL +while also running on CPU). Analysis tools use these to filter samples or +visualize thread states over time. + +### Timestamp Delta Encoding + +Timestamps use delta encoding rather than absolute values. Absolute +timestamps in microseconds require 8 bytes each, but consecutive samples +from the same thread are typically separated by the sampling interval +(e.g., 1000 microseconds), so the delta between them is small and fits +in 1-2 varint bytes. The writer tracks the previous timestamp for each +thread separately. 
The first sample from a thread encodes its delta from +the profiling start time; subsequent samples encode the delta from that +thread's previous sample. This per-thread tracking is necessary because +samples are interleaved across threads in arrival order, not grouped by +thread. + +For REPEAT (RLE) records, timestamp deltas and status bytes are stored as +interleaved pairs (delta, status, delta, status, ...) - one pair per +repeated sample - allowing efficient batching while preserving the exact +timing and state of each sample. + +### Frame Indexing + +Each frame in a call stack is represented by an index into the frame table +rather than inline data. This provides massive space savings because call +stacks are highly repetitive: the same function appears in many samples +(hot functions), call stacks often share common prefixes (main -> app -> +handler -> ...), and recursive functions create repeated frame sequences. +A frame index is typically 1-2 varint bytes. Inline frame data would be +20-200+ bytes (two strings plus a line number). For a profile with 100,000 +samples averaging 30 frames each, this reduces frame data from potentially +gigabytes to tens of megabytes. + +Frame indices are written innermost-first (the currently executing frame +has index 0 in the array). This ordering works well with delta compression: +function calls typically add frames at the top (index 0), while shared +frames remain at the bottom. + +## String Table + +The string table stores deduplicated UTF-8 strings (filenames and function +names). It begins at `string_table_offset` and contains entries in order of +their assignment during writing: + +``` ++----------------+ +| length: varint | +| data: bytes | ++----------------+ (repeated for each string) +``` + +Strings are stored in the order they were first encountered during writing. +The first unique filename gets index 0, the second gets index 1, and so on. 
+Length-prefixing (rather than null-termination) allows strings containing +null bytes and enables readers to allocate exact-sized buffers. The varint +length encoding means short strings (under 128 bytes) need only one length +byte. + +## Frame Table + +The frame table stores deduplicated frame entries: + +``` ++----------------------+ +| filename_idx: varint | +| funcname_idx: varint | +| lineno: svarint | ++----------------------+ (repeated for each frame) +``` + +Each unique (filename, funcname, lineno) combination gets one entry. Two +calls to the same function at different line numbers produce different +frame entries; two calls at the same line number share one entry. + +Strings and frames are deduplicated separately because they have different +cardinalities and reference patterns. A codebase might have hundreds of +unique source files but thousands of unique functions. Many functions share +the same filename, so storing the filename index in each frame entry (rather +than the full string) provides an additional layer of deduplication. A frame +entry is just three varints (typically 3-6 bytes) rather than two full +strings plus a line number. + +Line numbers use signed varint (zigzag encoding) rather than unsigned to +handle edge cases. Synthetic frames—generated frames that don't correspond +directly to Python source code, such as C extension boundaries or internal +interpreter frames—use line number 0 or -1 to indicate the absence of a +source location. Zigzag encoding ensures these small negative values encode +efficiently (−1 becomes 1, which is one byte) rather than requiring the +maximum varint length. 
+ +## Footer + +``` + Offset Size Type Description ++--------+------+---------+----------------------------------------+ +| 0 | 4 | uint32 | String count | +| 4 | 4 | uint32 | Frame count | +| 8 | 8 | uint64 | Total file size | +| 16 | 16 | bytes | Checksum (reserved, currently zeros) | ++--------+------+---------+----------------------------------------+ +``` + +The string and frame counts allow readers to pre-allocate arrays of the +correct size before parsing the tables. Without these counts, readers would +need to either scan the tables twice (once to count, once to parse) or use +dynamically-growing arrays. + +The file size field provides a consistency check: if the actual file size +does not match, the file may be truncated or corrupted. + +The checksum field is reserved for future use. A checksum would allow +detection of corruption but adds complexity and computation cost. The +current implementation leaves this as zeros. + +## Variable-Length Integer Encoding + +The format uses LEB128 (Little Endian Base 128) for unsigned integers and +zigzag + LEB128 for signed integers. These encodings are widely used +(Protocol Buffers, DWARF debug info, WebAssembly) and well-understood. + +### Unsigned Varint (LEB128) + +Each byte stores 7 bits of data. The high bit indicates whether more bytes +follow: + +``` +Value Encoded bytes +0-127 [0xxxxxxx] (1 byte) +128-16383 [1xxxxxxx] [0xxxxxxx] (2 bytes) +16384+ [1xxxxxxx] [1xxxxxxx] ... (3+ bytes) +``` + +Most indices in profiling data are small. A profile with 1000 unique frames +needs at most 2 bytes per frame index. The common case (indices under 128) +needs only 1 byte. + +### Signed Varint (Zigzag) + +Standard LEB128 encodes −1 as a very large unsigned value, requiring many +bytes. Zigzag encoding interleaves positive and negative values: + +``` + 0 -> 0 -1 -> 1 1 -> 2 -2 -> 3 2 -> 4 +``` + +This ensures small-magnitude values (whether positive or negative) encode +in few bytes. 
+ +## Compression + +When compression is enabled, the sample data region contains a zstd stream. +The string table, frame table, and footer remain uncompressed so readers can +access metadata without decompressing the entire file. A tool that only needs +to report "this file contains 50,000 samples of 3 threads" can read the header +and footer without touching the compressed sample data. This also simplifies +the format: the header's offset fields point directly to the tables rather +than to positions within a decompressed stream. + +Zstd provides an excellent balance of compression ratio and speed. Profiling +data compresses very well (often 5-10x) due to repetitive patterns: the same +small set of frame indices appears repeatedly, and delta-encoded timestamps +cluster around the sampling interval. Zstd's streaming API allows compression +without buffering the entire dataset. The writer feeds sample data through +the compressor incrementally, flushing compressed chunks to disk as they +become available. + +Level 5 compression is used as a default. Lower levels (1-3) are faster but +compress less; higher levels (6+) compress more but slow down writing. Level +5 provides good compression with minimal impact on profiling overhead. + +## Reading and Writing + +### Writing + +1. Open the output file and write 64 zero bytes as a placeholder header +2. Initialize empty string and frame dictionaries for deduplication +3. For each sample: + - Intern any new strings, assigning sequential indices + - Intern any new frames, assigning sequential indices + - Encode the sample record and write to the buffer + - Flush the buffer through compression (if enabled) when full +4. Flush remaining buffered data and finalize compression +5. Write the string table (length-prefixed strings in index order) +6. Write the frame table (varint-encoded entries in index order) +7. Write the footer with final counts +8. 
Seek to offset 0 and write the header with actual values + +The writer maintains two dictionaries: one mapping strings to indices, one +mapping (filename_idx, funcname_idx, lineno) tuples to frame indices. These +enable O(1) lookup during interning. + +### Reading + +1. Read the header and validate magic/version +2. Seek to end − 32 and read the footer +3. Allocate string array of `string_count` elements +4. Parse the string table, populating the array +5. Allocate frame array of `frame_count * 3` uint32 elements +6. Parse the frame table, populating the array +7. If compressed, decompress the sample data region +8. Iterate through samples, resolving indices to strings/frames + +The reader builds lookup arrays rather than dictionaries since it only needs +index-to-value mapping, not value-to-index. + +## Platform Considerations + +On Unix systems (Linux, macOS), the reader uses `mmap()` to map the file +into the process address space. The kernel handles paging data in and out +as needed, no explicit read() calls or buffer management are required, +multiple readers can share the same physical pages, and sequential access +patterns benefit from kernel read-ahead. + +The implementation uses `madvise()` to hint the access pattern to the kernel: +`MADV_SEQUENTIAL` indicates the file will be read linearly, enabling +aggressive read-ahead. `MADV_WILLNEED` requests pre-faulting of pages. +On Linux, `MAP_POPULATE` pre-faults all pages at mmap time rather than on +first access, moving page fault overhead from the parsing loop to the +initial mapping for more predictable performance. For large files (over +32 MB), `MADV_HUGEPAGE` requests transparent huge pages (2 MB instead of +4 KB) to reduce TLB pressure when accessing large amounts of data. + +On Windows, the implementation falls back to standard file I/O with full +file buffering. Profiling data files are typically small enough (tens to +hundreds of megabytes) that this is acceptable. 
+ +The writer uses a 512 KB buffer to batch small writes. Each sample record +is typically tens of bytes; writing these individually would incur excessive +syscall overhead. The buffer accumulates data until full, then flushes in +one write() call (or feeds through the compression stream). + +## Future Considerations + +The format reserves space for future extensions. The 12 reserved bytes in +the header could hold additional metadata. The 16-byte checksum field in +the footer is currently unused. The version field allows incompatible +changes with graceful rejection. New compression types could be added +(compression_type > 1). + +Any changes that alter the meaning of existing fields or the parsing logic +should increment the version number to prevent older readers from +misinterpreting new files. From 3ad7a3d2bb4c6f1d47f519abaf94ebb1fee25264 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:50:30 +0000 Subject: [PATCH 13/17] Document binary format and replay command Adds user documentation for --binary output format and the replay command. Covers compression options, the record-and-replay workflow, and examples of converting between formats. --- Doc/library/profiling.sampling.rst | 101 ++++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 3 deletions(-) diff --git a/Doc/library/profiling.sampling.rst b/Doc/library/profiling.sampling.rst index e0e583d00500f7..a9721443a20ab5 100644 --- a/Doc/library/profiling.sampling.rst +++ b/Doc/library/profiling.sampling.rst @@ -200,6 +200,36 @@ On most systems, attaching to another process requires appropriate permissions. See :ref:`profiling-permissions` for platform-specific requirements. +.. 
_replay-command: + +The ``replay`` command +---------------------- + +The ``replay`` command converts binary profile files to other output formats:: + + python -m profiling.sampling replay profile.bin + python -m profiling.sampling replay --flamegraph -o profile.html profile.bin + +This command is useful when you have captured profiling data in binary format +and want to analyze it later or convert it to a visualization format. Binary +profiles can be replayed multiple times to different formats without +re-profiling. + +:: + + # Convert binary to pstats (default, prints to stdout) + python -m profiling.sampling replay profile.bin + + # Convert binary to flame graph + python -m profiling.sampling replay --flamegraph -o output.html profile.bin + + # Convert binary to gecko format for Firefox Profiler + python -m profiling.sampling replay --gecko -o profile.json profile.bin + + # Convert binary to heatmap + python -m profiling.sampling replay --heatmap -o my_heatmap profile.bin + + Profiling in production ----------------------- @@ -967,6 +997,57 @@ intuitive view that shows exactly where time is spent without requiring interpretation of hierarchical visualizations. +Binary format +------------- + +Binary format (:option:`--binary`) produces a compact binary file for efficient +storage of profiling data:: + + python -m profiling.sampling run --binary -o profile.bin script.py + python -m profiling.sampling attach --binary -o profile.bin 12345 + +The :option:`--compression` option controls data compression: + +- ``auto`` (default): Use zstd compression if available, otherwise no + compression +- ``zstd``: Force zstd compression (requires zstd support) +- ``none``: Disable compression + +:: + + python -m profiling.sampling run --binary --compression=zstd -o profile.bin script.py + +To analyze binary profiles, use the :ref:`replay-command` to convert them to +other formats like flame graphs or pstats output. 
+ + +Record and replay workflow +========================== + +The binary format combined with the replay command enables a record-and-replay +workflow that separates data capture from analysis. Rather than generating +visualizations during profiling, you capture raw data to a compact binary file +and convert it to different formats later. + +This approach has three main benefits. First, sampling runs faster because the +work of building data structures for visualization is deferred until replay. +Second, a single binary capture can be converted to multiple output formats +without re-profiling---pstats for a quick overview, flame graph for visual +exploration, heatmap for line-level detail. Third, binary files are compact +and easy to share with colleagues who can convert them to their preferred +format. + +A typical workflow:: + + # Capture profile in production or during tests + python -m profiling.sampling attach --binary -o profile.bin 12345 + + # Later, analyze with different formats + python -m profiling.sampling replay profile.bin + python -m profiling.sampling replay --flamegraph -o profile.html profile.bin + python -m profiling.sampling replay --heatmap -o heatmap profile.bin + + Live mode ========= @@ -1178,6 +1259,10 @@ Global options Attach to and profile a running process by PID. +.. option:: replay + + Convert a binary profile file to another output format. + Sampling options ---------------- @@ -1256,12 +1341,22 @@ Output options Generate HTML heatmap with line-level sample counts. +.. option:: --binary + + Generate high-performance binary format for later conversion with the + ``replay`` command. + +.. option:: --compression + + Compression for binary format: ``auto`` (use zstd if available, default), + ``zstd``, or ``none``. + .. option:: -o , --output Output file or directory path. 
Default behavior varies by format: - ``--pstats`` writes to stdout, ``--flamegraph`` and ``--gecko`` generate - files like ``flamegraph.PID.html``, and ``--heatmap`` creates a directory - named ``heatmap_PID``. + ``--pstats`` writes to stdout, ``--flamegraph``, ``--gecko``, and + ``--binary`` generate files like ``flamegraph.PID.html``, and ``--heatmap`` + creates a directory named ``heatmap_PID``. pstats display options From cd4f4129ce4d5554116f9d038cfd53b042ae15a0 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 01:59:53 +0000 Subject: [PATCH 14/17] Fix CI --- .../test_sampling_profiler/test_binary_format.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py index 43b63f0bf4cef4..ca37eec01f43d8 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py @@ -19,11 +19,7 @@ THREAD_STATUS_GIL_REQUESTED, THREAD_STATUS_HAS_EXCEPTION, ) - from profiling.sampling.binary_collector import ( - BinaryCollector, - COMPRESSION_NONE, - COMPRESSION_ZSTD, - ) + from profiling.sampling.binary_collector import BinaryCollector from profiling.sampling.binary_reader import BinaryReader ZSTD_AVAILABLE = _remote_debugging.zstd_available() From 596af7faa7cb75e06ee16b8b4b9c1947b8975f7e Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 02:01:18 +0000 Subject: [PATCH 15/17] Add NEWS entry --- .../Library/2025-12-15-02-00-31.gh-issue-138122.m3EF9E.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2025-12-15-02-00-31.gh-issue-138122.m3EF9E.rst diff --git a/Misc/NEWS.d/next/Library/2025-12-15-02-00-31.gh-issue-138122.m3EF9E.rst b/Misc/NEWS.d/next/Library/2025-12-15-02-00-31.gh-issue-138122.m3EF9E.rst new file mode 100644 index 
00000000000000..f9c2cee51d1dcd --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-12-15-02-00-31.gh-issue-138122.m3EF9E.rst @@ -0,0 +1,4 @@ +Add binary output format to :mod:`profiling.sampling` for compact storage of +profiling data. The new ``--binary`` option captures samples to a file that +can be converted to other formats using the ``replay`` command. Patch by +Pablo Galindo From 1e2400b53f694fb7757d3138c552d451f4268288 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 02:23:11 +0000 Subject: [PATCH 16/17] Regen and simplify --- .../pycore_global_objects_fini_generated.h | 9 +++ Include/internal/pycore_global_strings.h | 9 +++ .../internal/pycore_runtime_init_generated.h | 9 +++ .../internal/pycore_unicodeobject_generated.h | 36 +++++++++ InternalDocs/profiling_binary_format.md | 14 ---- Lib/profiling/sampling/binary_collector.py | 2 - Lib/profiling/sampling/binary_reader.py | 3 - Lib/profiling/sampling/cli.py | 13 +--- Lib/profiling/sampling/sample.py | 4 - Modules/_remote_debugging/binary_io.h | 75 ++++-------------- Modules/_remote_debugging/binary_io_reader.c | 15 +--- Modules/_remote_debugging/binary_io_writer.c | 77 +++---------------- 12 files changed, 92 insertions(+), 174 deletions(-) diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 6473a3c64a6c23..2190528f342857 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -1653,9 +1653,11 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_varnames)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(code)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(col_offset)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(collector)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(command)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(comment_factory)); 
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compile_mode)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compression)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(config)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(consts)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(context)); @@ -1718,7 +1720,9 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(event)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(eventmask)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_tb)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_type)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_val)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_value)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(excepthook)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exception)); @@ -1974,6 +1978,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(print_file_and_line)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(priority)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_callback)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_routine)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(proto)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(protocol)); @@ -2013,6 +2018,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reversed)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(rounding)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(salt)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sample_interval_us)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sched_priority)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(scheduler)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(script)); @@ -2052,8 +2058,10 @@ 
_PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(spam)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src_dir_fd)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stack_frames)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stacklevel)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start_time_us)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(statement)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stats)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(status)); @@ -2094,6 +2102,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(times)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timespec)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timestamp)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timestamp_us)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timetuple)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timeunit)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(top)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index ec720de2524e6e..7a94854ab49d84 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -376,9 +376,11 @@ struct _Py_global_strings { STRUCT_FOR_ID(co_varnames) STRUCT_FOR_ID(code) STRUCT_FOR_ID(col_offset) + STRUCT_FOR_ID(collector) STRUCT_FOR_ID(command) STRUCT_FOR_ID(comment_factory) STRUCT_FOR_ID(compile_mode) + STRUCT_FOR_ID(compression) STRUCT_FOR_ID(config) STRUCT_FOR_ID(consts) STRUCT_FOR_ID(context) @@ -441,7 +443,9 @@ struct _Py_global_strings { STRUCT_FOR_ID(event) STRUCT_FOR_ID(eventmask) STRUCT_FOR_ID(exc) + STRUCT_FOR_ID(exc_tb) STRUCT_FOR_ID(exc_type) + STRUCT_FOR_ID(exc_val) STRUCT_FOR_ID(exc_value) STRUCT_FOR_ID(excepthook) STRUCT_FOR_ID(exception) @@ -697,6 
+701,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(print_file_and_line) STRUCT_FOR_ID(priority) STRUCT_FOR_ID(progress) + STRUCT_FOR_ID(progress_callback) STRUCT_FOR_ID(progress_routine) STRUCT_FOR_ID(proto) STRUCT_FOR_ID(protocol) @@ -736,6 +741,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(reversed) STRUCT_FOR_ID(rounding) STRUCT_FOR_ID(salt) + STRUCT_FOR_ID(sample_interval_us) STRUCT_FOR_ID(sched_priority) STRUCT_FOR_ID(scheduler) STRUCT_FOR_ID(script) @@ -775,8 +781,10 @@ struct _Py_global_strings { STRUCT_FOR_ID(spam) STRUCT_FOR_ID(src) STRUCT_FOR_ID(src_dir_fd) + STRUCT_FOR_ID(stack_frames) STRUCT_FOR_ID(stacklevel) STRUCT_FOR_ID(start) + STRUCT_FOR_ID(start_time_us) STRUCT_FOR_ID(statement) STRUCT_FOR_ID(stats) STRUCT_FOR_ID(status) @@ -817,6 +825,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(times) STRUCT_FOR_ID(timespec) STRUCT_FOR_ID(timestamp) + STRUCT_FOR_ID(timestamp_us) STRUCT_FOR_ID(timetuple) STRUCT_FOR_ID(timeunit) STRUCT_FOR_ID(top) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index b32083db98e29e..d3a06e206174a0 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -1651,9 +1651,11 @@ extern "C" { INIT_ID(co_varnames), \ INIT_ID(code), \ INIT_ID(col_offset), \ + INIT_ID(collector), \ INIT_ID(command), \ INIT_ID(comment_factory), \ INIT_ID(compile_mode), \ + INIT_ID(compression), \ INIT_ID(config), \ INIT_ID(consts), \ INIT_ID(context), \ @@ -1716,7 +1718,9 @@ extern "C" { INIT_ID(event), \ INIT_ID(eventmask), \ INIT_ID(exc), \ + INIT_ID(exc_tb), \ INIT_ID(exc_type), \ + INIT_ID(exc_val), \ INIT_ID(exc_value), \ INIT_ID(excepthook), \ INIT_ID(exception), \ @@ -1972,6 +1976,7 @@ extern "C" { INIT_ID(print_file_and_line), \ INIT_ID(priority), \ INIT_ID(progress), \ + INIT_ID(progress_callback), \ INIT_ID(progress_routine), \ INIT_ID(proto), \ INIT_ID(protocol), \ @@ -2011,6 +2016,7 @@ extern "C" { 
INIT_ID(reversed), \ INIT_ID(rounding), \ INIT_ID(salt), \ + INIT_ID(sample_interval_us), \ INIT_ID(sched_priority), \ INIT_ID(scheduler), \ INIT_ID(script), \ @@ -2050,8 +2056,10 @@ extern "C" { INIT_ID(spam), \ INIT_ID(src), \ INIT_ID(src_dir_fd), \ + INIT_ID(stack_frames), \ INIT_ID(stacklevel), \ INIT_ID(start), \ + INIT_ID(start_time_us), \ INIT_ID(statement), \ INIT_ID(stats), \ INIT_ID(status), \ @@ -2092,6 +2100,7 @@ extern "C" { INIT_ID(times), \ INIT_ID(timespec), \ INIT_ID(timestamp), \ + INIT_ID(timestamp_us), \ INIT_ID(timetuple), \ INIT_ID(timeunit), \ INIT_ID(top), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index f3756fde2c4073..99e02fc85dc917 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1284,6 +1284,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(collector); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(command); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -1296,6 +1300,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(compression); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(config); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -1544,10 +1552,18 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); 
assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(exc_tb); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(exc_type); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(exc_val); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(exc_value); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2568,6 +2584,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(progress_callback); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(progress_routine); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2724,6 +2744,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(sample_interval_us); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(sched_priority); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2880,6 +2904,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(stack_frames); + 
_PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(stacklevel); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2888,6 +2916,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(start_time_us); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(statement); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -3048,6 +3080,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(timestamp_us); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(timetuple); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); diff --git a/InternalDocs/profiling_binary_format.md b/InternalDocs/profiling_binary_format.md index ff3daca2b57cb4..b4ec2b39323d32 100644 --- a/InternalDocs/profiling_binary_format.md +++ b/InternalDocs/profiling_binary_format.md @@ -174,20 +174,6 @@ frames from the previous stack are kept, and new frames are prepended. Used when the code path changed: some frames were popped (function returns) and new frames were pushed (different function calls). -### Compression Benefits - -This delta encoding provides massive savings for typical profiling workloads: - -- **CPU-bound code**: Hot loops produce many identical samples. 
RLE encoding - compresses 100 identical samples to just 2-3 bytes of overhead plus the - timestamp/status data. - -- **I/O-bound code**: Alternating between wait and work produces similar - stacks with small variations. SUFFIX encoding captures this efficiently. - -- **Call-heavy code**: Functions calling other functions share common stack - prefixes. POP_PUSH encoding only stores the changed frames. - ### Thread and Interpreter Identification Thread IDs are 64-bit values that can be large (memory addresses on some diff --git a/Lib/profiling/sampling/binary_collector.py b/Lib/profiling/sampling/binary_collector.py index 293d4b6debcca3..d8d38f4c078927 100644 --- a/Lib/profiling/sampling/binary_collector.py +++ b/Lib/profiling/sampling/binary_collector.py @@ -98,7 +98,6 @@ def export(self, filename=None): @property def total_samples(self): - """Total number of samples written.""" return self._writer.total_samples def get_stats(self): @@ -111,7 +110,6 @@ def get_stats(self): return self._writer.get_stats() def __enter__(self): - """Context manager entry.""" return self def __exit__(self, exc_type, exc_val, exc_tb): diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py index 3d7fbf981ea63b..50c96668cc585b 100644 --- a/Lib/profiling/sampling/binary_reader.py +++ b/Lib/profiling/sampling/binary_reader.py @@ -23,13 +23,11 @@ def __init__(self, filename): self._reader = None def __enter__(self): - """Open the binary file for reading.""" import _remote_debugging self._reader = _remote_debugging.BinaryReader(self.filename) return self def __exit__(self, exc_type, exc_val, exc_tb): - """Close the binary file.""" if self._reader is not None: self._reader.close() self._reader = None @@ -71,7 +69,6 @@ def replay_samples(self, collector, progress_callback=None): @property def sample_count(self): - """Number of samples in the file.""" if self._reader is None: raise RuntimeError("Reader not open. 
Use as context manager.") return self._reader.get_info()['sample_count'] diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index 35c39bdcada0a3..63befea4f91de0 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -102,7 +102,7 @@ def _parse_mode(mode_string): def _check_process_died(process): """Check if process died and raise an error with stderr if available.""" if process.poll() is None: - return # Process still running + return # Process died - try to get stderr for error message stderr_msg = "" @@ -471,7 +471,6 @@ def _handle_output(collector, args, pid, mode): sort_mode, limit, not args.no_summary, mode ) else: - # Export to file filename = args.outfile or _generate_output_filename(args.format, pid) collector.export(filename) @@ -726,7 +725,6 @@ def _handle_attach(args): mode != PROFILING_MODE_WALL if mode != PROFILING_MODE_ALL else False ) - # For binary format, determine output file before creating collector output_file = None if args.format == "binary": output_file = args.outfile or _generate_output_filename(args.format, args.pid) @@ -806,7 +804,6 @@ def _handle_run(args): mode != PROFILING_MODE_WALL if mode != PROFILING_MODE_ALL else False ) - # For binary format, determine output file before creating collector output_file = None if args.format == "binary": output_file = args.outfile or _generate_output_filename(args.format, process.pid) @@ -940,7 +937,6 @@ def _handle_replay(args): """Handle the 'replay' command - convert binary profile to another format.""" import os - # Check input file exists if not os.path.exists(args.input_file): sys.exit(f"Error: Input file not found: {args.input_file}") @@ -956,10 +952,8 @@ def _handle_replay(args): print(f" Sample interval: {interval} us") print(f" Compression: {'zstd' if info.get('compression_type', 0) == 1 else 'none'}") - # Create appropriate collector collector = _create_collector(args.format, interval, skip_idle=False) - # Replay with progress bar def 
progress_callback(current, total): if total > 0: pct = current / total @@ -969,20 +963,17 @@ def progress_callback(current, total): print(f"\r [{bar}] {pct*100:5.1f}% ({current:,}/{total:,})", end="", flush=True) count = reader.replay_samples(collector, progress_callback) - print() # Newline after progress bar + print() - # Handle output similar to other formats if args.format == "pstats": if args.outfile: collector.export(args.outfile) else: - # Print to stdout with defaults applied sort_choice = args.sort if args.sort is not None else "nsamples" limit = args.limit if args.limit is not None else 15 sort_mode = _sort_to_mode(sort_choice) collector.print_stats(sort_mode, limit, not args.no_summary, PROFILING_MODE_WALL) else: - # Export to file filename = args.outfile or _generate_output_filename(args.format, os.getpid()) collector.export(filename) diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py index b18017c6bb95b3..9c0cdce93c403e 100644 --- a/Lib/profiling/sampling/sample.py +++ b/Lib/profiling/sampling/sample.py @@ -138,7 +138,6 @@ def sample(self, collector, duration_sec=10, *, async_aware=False): if self.collect_stats: self._print_unwinder_stats() - # Print binary I/O stats if using binary collector if isinstance(collector, BinaryCollector): self._print_binary_stats(collector) @@ -292,7 +291,6 @@ def _print_binary_stats(self, collector): print(f" {ANSIColors.CYAN}Binary Encoding:{ANSIColors.RESET}") - # Record type counts repeat_records = stats.get('repeat_records', 0) repeat_samples = stats.get('repeat_samples', 0) full_records = stats.get('full_records', 0) @@ -314,7 +312,6 @@ def _print_binary_stats(self, collector): print(f" Suffix match: {suffix_records:,} ({suffix_pct:.1f}%)") print(f" Pop-push: {pop_push_records:,} ({pop_push_pct:.1f}%)") - # Frame efficiency frames_written = stats.get('total_frames_written', 0) frames_saved = stats.get('frames_saved', 0) compression_pct = stats.get('frame_compression_pct', 0) @@ -323,7 
+320,6 @@ def _print_binary_stats(self, collector): print(f" Frames written: {frames_written:,}") print(f" Frames saved: {frames_saved:,} ({ANSIColors.GREEN}{compression_pct:.1f}%{ANSIColors.RESET})") - # Bytes written bytes_written = stats.get('bytes_written', 0) if bytes_written >= 1024 * 1024: bytes_str = f"{bytes_written / (1024 * 1024):.1f} MB" diff --git a/Modules/_remote_debugging/binary_io.h b/Modules/_remote_debugging/binary_io.h index 3bc40b5f54fd56..e730fa8d9ace5c 100644 --- a/Modules/_remote_debugging/binary_io.h +++ b/Modules/_remote_debugging/binary_io.h @@ -320,13 +320,11 @@ encode_varint_i32(uint8_t *buf, int32_t value) return encode_varint_u32(buf, zigzag); } -/* Decode unsigned 64-bit varint. Updates offset only on success. Returns value. +/* Decode unsigned 64-bit varint (LEB128). Updates offset only on success. * On error (overflow or incomplete), offset is NOT updated, allowing callers - * to detect errors via (offset == prev_offset) check. - * On success, sets *error to 0 if error is non-NULL. - * On error, sets *error to 1 if error is non-NULL. */ + * to detect errors via (offset == prev_offset) check. Sets PyErr on error. 
*/ static inline uint64_t -decode_varint_u64_ex(const uint8_t *data, size_t *offset, size_t max_size, int *error) +decode_varint_u64(const uint8_t *data, size_t *offset, size_t max_size) { size_t pos = *offset; uint64_t result = 0; @@ -335,7 +333,6 @@ decode_varint_u64_ex(const uint8_t *data, size_t *offset, size_t max_size, int * /* Fast path for single-byte varints (0-127) - most common case */ if (LIKELY(pos < max_size && (data[pos] & 0x80) == 0)) { *offset = pos + 1; - if (error) *error = 0; return data[pos]; } @@ -344,87 +341,45 @@ decode_varint_u64_ex(const uint8_t *data, size_t *offset, size_t max_size, int * result |= (uint64_t)(byte & 0x7F) << shift; if ((byte & 0x80) == 0) { *offset = pos; - if (error) *error = 0; return result; } shift += 7; if (UNLIKELY(shift >= 64)) { - /* Overflow - do NOT update offset so caller can detect error */ - if (error) *error = 1; + PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data"); return 0; } } - /* Incomplete varint - do NOT update offset so caller can detect error */ - if (error) *error = 1; + PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data"); return 0; } -/* Backward-compatible wrapper that sets PyErr on error. - * Callers should check PyErr_Occurred() after batch operations. */ -static inline uint64_t -decode_varint_u64(const uint8_t *data, size_t *offset, size_t max_size) -{ - int error = 0; - uint64_t result = decode_varint_u64_ex(data, offset, max_size, &error); - if (UNLIKELY(error)) { - PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data"); - } - return result; -} - -/* Decode unsigned 32-bit varint with explicit error handling. - * If value exceeds UINT32_MAX, treats as error: offset is NOT updated, - * *error is set to 1, allowing callers to detect via (offset == prev_offset). */ +/* Decode unsigned 32-bit varint. If value exceeds UINT32_MAX, treats as error. 
*/ static inline uint32_t -decode_varint_u32_ex(const uint8_t *data, size_t *offset, size_t max_size, int *error) +decode_varint_u32(const uint8_t *data, size_t *offset, size_t max_size) { size_t saved_offset = *offset; - uint64_t value = decode_varint_u64_ex(data, offset, max_size, error); - if (error && *error) { - /* decode_varint_u64_ex already handled the error, offset unchanged */ + uint64_t value = decode_varint_u64(data, offset, max_size); + if (PyErr_Occurred()) { return 0; } if (UNLIKELY(value > UINT32_MAX)) { - /* Value overflow - restore offset so caller can detect error */ *offset = saved_offset; - if (error) *error = 1; + PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data"); return 0; } return (uint32_t)value; } -/* Backward-compatible wrapper that sets PyErr on error. */ -static inline uint32_t -decode_varint_u32(const uint8_t *data, size_t *offset, size_t max_size) -{ - int error = 0; - uint32_t result = decode_varint_u32_ex(data, offset, max_size, &error); - if (UNLIKELY(error)) { - PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data"); - } - return result; -} - -/* Decode signed 32-bit varint (zigzag) with explicit error handling. */ -static inline int32_t -decode_varint_i32_ex(const uint8_t *data, size_t *offset, size_t max_size, int *error) -{ - uint32_t zigzag = decode_varint_u32_ex(data, offset, max_size, error); - /* Zigzag decode */ - return (int32_t)((zigzag >> 1) ^ -(int32_t)(zigzag & 1)); -} - -/* Backward-compatible wrapper that sets PyErr on error. */ +/* Decode signed 32-bit varint (zigzag encoding). 
*/ static inline int32_t decode_varint_i32(const uint8_t *data, size_t *offset, size_t max_size) { - int error = 0; - int32_t result = decode_varint_i32_ex(data, offset, max_size, &error); - if (UNLIKELY(error)) { - PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data"); + uint32_t zigzag = decode_varint_u32(data, offset, max_size); + if (PyErr_Occurred()) { + return 0; } - return result; + return (int32_t)((zigzag >> 1) ^ -(int32_t)(zigzag & 1)); } /* ============================================================================ diff --git a/Modules/_remote_debugging/binary_io_reader.c b/Modules/_remote_debugging/binary_io_reader.c index 6890bc864ec9aa..ef3d26b31cffdf 100644 --- a/Modules/_remote_debugging/binary_io_reader.c +++ b/Modules/_remote_debugging/binary_io_reader.c @@ -37,7 +37,6 @@ * BINARY READER IMPLEMENTATION * ============================================================================ */ -/* Parse the file header and populate reader fields */ static inline int reader_parse_header(BinaryReader *reader, const uint8_t *data, size_t file_size) { @@ -73,7 +72,6 @@ reader_parse_header(BinaryReader *reader, const uint8_t *data, size_t file_size) return 0; } -/* Parse the file footer */ static inline int reader_parse_footer(BinaryReader *reader, const uint8_t *data, size_t file_size) { @@ -94,7 +92,6 @@ reader_parse_footer(BinaryReader *reader, const uint8_t *data, size_t file_size) /* Maximum decompression buffer size to prevent memory exhaustion (1GB) */ #define MAX_DECOMPRESS_SIZE (1ULL << 30) -/* Decompress zstd-compressed sample data */ static inline int reader_decompress_samples(BinaryReader *reader, const uint8_t *data) { @@ -204,7 +201,6 @@ reader_decompress_samples(BinaryReader *reader, const uint8_t *data) } #endif -/* Parse the string table into Python unicode objects */ static inline int reader_parse_string_table(BinaryReader *reader, const uint8_t *data, size_t file_size) { @@ -237,7 +233,6 @@ 
reader_parse_string_table(BinaryReader *reader, const uint8_t *data, size_t file return 0; } -/* Parse the frame table (function_id, filename_id, lineno for each frame) */ static inline int reader_parse_frame_table(BinaryReader *reader, const uint8_t *data, size_t file_size) { @@ -457,7 +452,6 @@ binary_reader_open(const char *filename) goto error; #endif } else { - /* Uncompressed data */ reader->sample_data = data + FILE_HEADER_PLACEHOLDER_SIZE; reader->sample_data_size = reader->string_table_offset - FILE_HEADER_PLACEHOLDER_SIZE; } @@ -490,7 +484,6 @@ reader_get_or_create_thread_state(BinaryReader *reader, uint64_t thread_id, } } - /* Initial allocation or growth */ if (!reader->thread_states) { reader->thread_state_capacity = 16; reader->thread_states = PyMem_Calloc(reader->thread_state_capacity, sizeof(ReaderThreadState)); @@ -507,7 +500,6 @@ reader_get_or_create_thread_state(BinaryReader *reader, uint64_t thread_id, } } - /* Initialize new thread state */ ReaderThreadState *ts = &reader->thread_states[reader->thread_state_count++]; memset(ts, 0, sizeof(ReaderThreadState)); ts->thread_id = thread_id; @@ -576,7 +568,7 @@ decode_stack_suffix(ReaderThreadState *ts, const uint8_t *data, return -1; } - /* Move shared frames to make room for new frames at the top */ + /* Move shared frames (from bottom of stack) to make room for new frames at the top */ if (new_count > 0 && shared > 0) { size_t prev_shared_start = ts->current_stack_depth - shared; memmove(&ts->current_stack[new_count], @@ -584,7 +576,6 @@ decode_stack_suffix(ReaderThreadState *ts, const uint8_t *data, shared * sizeof(uint32_t)); } - /* Read new frames (at top of stack) */ for (uint32_t i = 0; i < new_count; i++) { ts->current_stack[i] = decode_varint_u32(data, offset, max_size); } @@ -612,7 +603,7 @@ decode_stack_pop_push(ReaderThreadState *ts, const uint8_t *data, return -1; } - /* Move kept frames (from bottom of stack) to make room for new frames. 
+ /* Move kept frames (from bottom of stack) to make room for new frames at the top. * Even when push == 0, we need to move kept frames to index 0 if pop > 0. */ if (keep > 0) { memmove(&ts->current_stack[push], @@ -620,7 +611,6 @@ decode_stack_pop_push(ReaderThreadState *ts, const uint8_t *data, keep * sizeof(uint32_t)); } - /* Read new frames (at top of stack) */ for (uint32_t i = 0; i < push; i++) { ts->current_stack[i] = decode_varint_u32(data, offset, max_size); } @@ -1063,7 +1053,6 @@ binary_reader_close(BinaryReader *reader) PyMem_Free(reader->frame_data); - /* Free per-thread reconstruction state */ if (reader->thread_states) { for (size_t i = 0; i < reader->thread_state_count; i++) { PyMem_Free(reader->thread_states[i].current_stack); diff --git a/Modules/_remote_debugging/binary_io_writer.c b/Modules/_remote_debugging/binary_io_writer.c index 5e5abca3d9a967..fbcdea5cbe526b 100644 --- a/Modules/_remote_debugging/binary_io_writer.c +++ b/Modules/_remote_debugging/binary_io_writer.c @@ -47,14 +47,12 @@ grow_parallel_arrays(void **array1, void **array2, size_t *capacity, { size_t old_cap = *capacity; - /* Check for overflow when doubling capacity */ if (old_cap > SIZE_MAX / 2) { PyErr_SetString(PyExc_OverflowError, "Array capacity overflow"); return -1; } size_t new_cap = old_cap * 2; - /* Check for overflow when calculating allocation sizes */ if (new_cap > SIZE_MAX / elem_size1 || new_cap > SIZE_MAX / elem_size2) { PyErr_SetString(PyExc_OverflowError, "Array allocation size overflow"); return -1; @@ -85,11 +83,9 @@ grow_parallel_arrays(void **array1, void **array2, size_t *capacity, memcpy(new_array1, *array1, old_size1); memcpy(new_array2, *array2, old_size2); - /* Free old arrays */ PyMem_Free(*array1); PyMem_Free(*array2); - /* Update all pointers */ *array1 = new_array1; *array2 = new_array2; *capacity = new_cap; @@ -163,7 +159,6 @@ binary_io_get_best_compression(void) * BINARY WRITER IMPLEMENTATION * 
============================================================================ */ -/* Initialize zstd compression */ static int writer_init_zstd(BinaryWriter *writer) { @@ -203,7 +198,6 @@ writer_init_zstd(BinaryWriter *writer) #endif } -/* Flush write buffer to disk (with compression if enabled) */ static int writer_flush_buffer(BinaryWriter *writer) { @@ -241,7 +235,6 @@ writer_flush_buffer(BinaryWriter *writer) } else #endif { - /* Uncompressed write */ if (fwrite_checked_allow_threads(writer->write_buffer, writer->buffer_pos, writer->fp) < 0) { return -1; } @@ -251,7 +244,6 @@ writer_flush_buffer(BinaryWriter *writer) return 0; } -/* Write bytes to buffer (flushing if needed) */ static inline int writer_write_bytes(BinaryWriter *writer, const void *data, size_t size) { @@ -295,7 +287,6 @@ string_hash_func(const void *key) return (Py_uhash_t)hash; } -/* Compare function for Python strings */ static int string_compare_func(const void *key1, const void *key2) { @@ -312,14 +303,12 @@ string_compare_func(const void *key1, const void *key2) return result; } -/* Destroy function for string keys - decref the Python string */ static void string_key_destroy(void *key) { Py_XDECREF((PyObject *)key); } -/* Hash function for frame keys */ static Py_uhash_t frame_key_hash_func(const void *key) { @@ -335,7 +324,6 @@ frame_key_hash_func(const void *key) return hash; } -/* Compare function for frame keys */ static int frame_key_compare_func(const void *key1, const void *key2) { @@ -346,25 +334,21 @@ frame_key_compare_func(const void *key1, const void *key2) fk1->lineno == fk2->lineno); } -/* Destroy function for frame keys - free the allocated FrameKey */ static void frame_key_destroy(void *key) { PyMem_Free(key); } -/* Intern a string and return its index */ static inline int writer_intern_string(BinaryWriter *writer, PyObject *string, uint32_t *index) { - /* Check if string already exists in hash table */ void *existing = _Py_hashtable_get(writer->string_hash, string); if 
(existing != NULL) { - *index = (uint32_t)(uintptr_t)existing - 1; /* Subtract 1 since we store index+1 */ + *index = (uint32_t)(uintptr_t)existing - 1; /* index+1 stored to distinguish from NULL */ return 0; } - /* New string - grow storage if needed */ if (writer->string_count >= writer->string_capacity) { if (grow_parallel_arrays((void **)&writer->strings, (void **)&writer->string_lengths, @@ -380,7 +364,6 @@ writer_intern_string(BinaryWriter *writer, PyObject *string, uint32_t *index) return -1; } - /* Store copy of string data */ char *str_copy = PyMem_Malloc(str_len + 1); if (!str_copy) { PyErr_NoMemory(); @@ -388,12 +371,9 @@ writer_intern_string(BinaryWriter *writer, PyObject *string, uint32_t *index) } memcpy(str_copy, str_data, str_len + 1); - /* The index we'll use (current count before incrementing) */ *index = (uint32_t)writer->string_count; - /* Add to hash table FIRST (before modifying arrays/count) to ensure atomicity. - * If hash table insert fails, we can simply free str_copy without rolling back. - * Store index+1 to distinguish from NULL (0 would be ambiguous). */ + /* Add to hash table FIRST to ensure atomic rollback on failure */ Py_INCREF(string); if (_Py_hashtable_set(writer->string_hash, string, (void *)(uintptr_t)(*index + 1)) < 0) { Py_DECREF(string); @@ -402,8 +382,6 @@ writer_intern_string(BinaryWriter *writer, PyObject *string, uint32_t *index) return -1; } - /* Hash table insert succeeded - now safely update arrays and count. - * These operations cannot fail, so the data structures stay consistent. 
*/ writer->strings[writer->string_count] = str_copy; writer->string_lengths[writer->string_count] = str_len; writer->string_count++; @@ -411,29 +389,23 @@ writer_intern_string(BinaryWriter *writer, PyObject *string, uint32_t *index) return 0; } -/* Intern a frame and return its index */ static inline int writer_intern_frame(BinaryWriter *writer, uint32_t filename_idx, uint32_t funcname_idx, int32_t lineno, uint32_t *index) { - /* Create a temporary key for lookup */ FrameKey lookup_key = {filename_idx, funcname_idx, lineno}; - /* Check if frame already exists in hash table */ void *existing = _Py_hashtable_get(writer->frame_hash, &lookup_key); if (existing != NULL) { - *index = (uint32_t)(uintptr_t)existing - 1; /* Subtract 1 since we store index+1 */ + *index = (uint32_t)(uintptr_t)existing - 1; /* index+1 stored to distinguish from NULL */ return 0; } - /* New frame - grow storage if needed */ if (GROW_ARRAY(writer->frame_entries, writer->frame_count, writer->frame_capacity, FrameEntry) < 0) { return -1; } - /* Allocate key for hash table first (before modifying frame_count) - * to ensure atomic rollback on failure */ FrameKey *key = PyMem_Malloc(sizeof(FrameKey)); if (!key) { PyErr_NoMemory(); @@ -441,22 +413,18 @@ writer_intern_frame(BinaryWriter *writer, uint32_t filename_idx, uint32_t funcna } *key = lookup_key; - /* Now add the frame entry */ *index = (uint32_t)writer->frame_count; FrameEntry *fe = &writer->frame_entries[writer->frame_count]; fe->filename_idx = filename_idx; fe->funcname_idx = funcname_idx; fe->lineno = lineno; - /* Add to hash table (store index+1 to distinguish from NULL) */ if (_Py_hashtable_set(writer->frame_hash, key, (void *)(uintptr_t)(*index + 1)) < 0) { PyMem_Free(key); - /* Don't increment frame_count - rollback the frame entry */ PyErr_NoMemory(); return -1; } - /* Success - now increment frame_count */ writer->frame_count++; return 0; } @@ -468,8 +436,8 @@ static ThreadEntry * writer_get_or_create_thread_entry(BinaryWriter 
*writer, uint64_t thread_id, uint32_t interpreter_id, int *is_new) { - /* Linear search (OK for small number of threads) */ - /* Key is (thread_id, interpreter_id) since same thread_id can exist in different interpreters */ + /* Linear search is OK for small number of threads. + * Key is (thread_id, interpreter_id) since same thread_id can exist in different interpreters. */ for (size_t i = 0; i < writer->thread_count; i++) { if (writer->thread_entries[i].thread_id == thread_id && writer->thread_entries[i].interpreter_id == interpreter_id) { @@ -480,7 +448,6 @@ writer_get_or_create_thread_entry(BinaryWriter *writer, uint64_t thread_id, } } - /* Add new thread - grow array if needed */ if (writer->thread_count >= writer->thread_capacity) { writer->thread_entries = grow_array(writer->thread_entries, &writer->thread_capacity, @@ -530,7 +497,6 @@ compare_stacks(const uint32_t *prev_stack, size_t prev_depth, const uint32_t *curr_stack, size_t curr_depth, size_t *shared_count, size_t *pop_count, size_t *push_count) { - /* Check for identical stacks */ if (prev_depth == curr_depth) { int identical = 1; for (size_t i = 0; i < prev_depth; i++) { @@ -621,17 +587,14 @@ flush_pending_rle(BinaryWriter *writer, ThreadEntry *entry) * [timestamp_delta_1: varint] [status_1: 1] ... 
[timestamp_delta_N: varint] [status_N: 1] */ - /* Write fixed header */ if (write_sample_header(writer, entry, STACK_REPEAT) < 0) { return -1; } - /* Write count */ if (writer_write_varint_u32(writer, (uint32_t)entry->pending_rle_count) < 0) { return -1; } - /* Write timestamp deltas and status bytes */ for (size_t i = 0; i < entry->pending_rle_count; i++) { if (writer_write_varint_u64(writer, entry->pending_rle[i].timestamp_delta) < 0) { return -1; @@ -642,13 +605,11 @@ flush_pending_rle(BinaryWriter *writer, ThreadEntry *entry) writer->total_samples++; } - /* Update stats: RLE saved writing full stacks for each repeat sample */ writer->stats.repeat_records++; writer->stats.repeat_samples += entry->pending_rle_count; - /* Each RLE sample saves writing the entire stack (prev_stack_depth frames) */ + /* Each RLE sample saves writing the entire stack */ writer->stats.frames_saved += entry->pending_rle_count * entry->prev_stack_depth; - /* Clear pending state */ entry->pending_rle_count = 0; entry->has_pending_rle = 0; @@ -664,7 +625,7 @@ write_sample_with_encoding(BinaryWriter *writer, ThreadEntry *entry, const uint32_t *frame_indices, size_t stack_depth, size_t shared_count, size_t pop_count, size_t push_count) { - /* Write header: thread_id (8) + interpreter_id (4) + encoding (1) + delta (varint) + status (1) */ + /* Header: thread_id(8) + interpreter_id(4) + encoding(1) + delta(varint) + status(1) */ uint8_t header_buf[SAMPLE_HEADER_MAX_SIZE]; memcpy(header_buf, &entry->thread_id, 8); memcpy(header_buf + 8, &entry->interpreter_id, 4); @@ -676,7 +637,6 @@ write_sample_with_encoding(BinaryWriter *writer, ThreadEntry *entry, return -1; } - /* Write encoding-specific data */ uint8_t frame_buf[MAX_FRAME_BUFFER_SIZE]; size_t frame_buf_pos = 0; size_t frames_written = 0; @@ -804,14 +764,12 @@ binary_writer_create(const char *filename, uint64_t sample_interval_us, int comp } writer->thread_capacity = INITIAL_THREAD_CAPACITY; - /* Initialize compression if requested */ if 
(compression_type == COMPRESSION_ZSTD) { if (writer_init_zstd(writer) < 0) { goto error; } } - /* Open file */ writer->fp = fopen(filename, "wb"); if (!writer->fp) { PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename); @@ -828,7 +786,6 @@ binary_writer_create(const char *filename, uint64_t sample_interval_us, int comp } #endif - /* Write placeholder header - release GIL during I/O */ uint8_t header[FILE_HEADER_PLACEHOLDER_SIZE] = {0}; if (fwrite_checked_allow_threads(header, FILE_HEADER_PLACEHOLDER_SIZE, writer->fp) < 0) { goto error; @@ -902,7 +859,6 @@ static int process_thread_sample(BinaryWriter *writer, PyObject *thread_info, uint32_t interpreter_id, uint64_t timestamp_us) { - /* Get thread_id, status, frame_list from ThreadInfo using unchecked access */ PyObject *thread_id_obj = PyStructSequence_GET_ITEM(thread_info, 0); PyObject *status_obj = PyStructSequence_GET_ITEM(thread_info, 1); PyObject *frame_list = PyStructSequence_GET_ITEM(thread_info, 2); @@ -917,7 +873,6 @@ process_thread_sample(BinaryWriter *writer, PyObject *thread_info, } uint8_t status = (uint8_t)status_long; - /* Get or create thread entry */ int is_new_thread = 0; ThreadEntry *entry = writer_get_or_create_thread_entry( writer, thread_id, interpreter_id, &is_new_thread); @@ -961,14 +916,12 @@ process_thread_sample(BinaryWriter *writer, PyObject *thread_info, } } - /* Write this sample with the appropriate encoding */ if (write_sample_with_encoding(writer, entry, delta, status, encoding, curr_stack, curr_depth, shared_count, pop_count, push_count) < 0) { return -1; } - /* Update previous stack */ memcpy(entry->prev_stack, curr_stack, curr_depth * sizeof(uint32_t)); entry->prev_stack_depth = curr_depth; } @@ -988,7 +941,6 @@ binary_writer_write_sample(BinaryWriter *writer, PyObject *stack_frames, uint64_ for (Py_ssize_t i = 0; i < num_interpreters; i++) { PyObject *interp_info = PyList_GET_ITEM(stack_frames, i); - /* Get interpreter_id and threads from InterpreterInfo using unchecked 
access */ PyObject *interp_id_obj = PyStructSequence_GET_ITEM(interp_info, 0); PyObject *threads = PyStructSequence_GET_ITEM(interp_info, 1); @@ -1020,7 +972,6 @@ binary_writer_write_sample(BinaryWriter *writer, PyObject *stack_frames, uint64_ int binary_writer_finalize(BinaryWriter *writer) { - /* Flush any pending RLE for all threads */ for (size_t i = 0; i < writer->thread_count; i++) { if (writer->thread_entries[i].has_pending_rle) { if (flush_pending_rle(writer, &writer->thread_entries[i]) < 0) { @@ -1029,7 +980,6 @@ binary_writer_finalize(BinaryWriter *writer) } } - /* Flush remaining buffer */ if (writer_flush_buffer(writer) < 0) { return -1; } @@ -1064,14 +1014,14 @@ binary_writer_finalize(BinaryWriter *writer) } #endif - /* Get offset for string table (use 64-bit file position for >2GB files) */ + /* Use 64-bit file position for >2GB files */ file_offset_t string_table_offset = FTELL64(writer->fp); if (string_table_offset < 0) { PyErr_SetFromErrno(PyExc_IOError); return -1; } - /* Write string table - release GIL during potentially large writes */ + /* Release GIL during potentially large writes */ for (size_t i = 0; i < writer->string_count; i++) { uint8_t len_buf[10]; size_t len_size = encode_varint_u32(len_buf, (uint32_t)writer->string_lengths[i]); @@ -1081,14 +1031,12 @@ binary_writer_finalize(BinaryWriter *writer) } } - /* Get offset for frame table */ file_offset_t frame_table_offset = FTELL64(writer->fp); if (frame_table_offset < 0) { PyErr_SetFromErrno(PyExc_IOError); return -1; } - /* Write frame table - release GIL during writes */ for (size_t i = 0; i < writer->frame_count; i++) { FrameEntry *entry = &writer->frame_entries[i]; uint8_t buf[30]; @@ -1100,7 +1048,7 @@ binary_writer_finalize(BinaryWriter *writer) } } - /* Write footer (32 bytes): string_count(4) + frame_count(4) + file_size(8) + checksum(16) */ + /* Footer: string_count(4) + frame_count(4) + file_size(8) + checksum(16) */ file_offset_t footer_offset = FTELL64(writer->fp); if 
(footer_offset < 0) { PyErr_SetFromErrno(PyExc_IOError); @@ -1116,7 +1064,6 @@ binary_writer_finalize(BinaryWriter *writer) return -1; } - /* Write header at file start */ if (FSEEK64(writer->fp, 0, SEEK_SET) < 0) { PyErr_SetFromErrno(PyExc_IOError); return -1; @@ -1142,7 +1089,6 @@ binary_writer_finalize(BinaryWriter *writer) return -1; } - /* Close file */ if (fclose(writer->fp) != 0) { writer->fp = NULL; PyErr_SetFromErrno(PyExc_IOError); @@ -1174,7 +1120,6 @@ binary_writer_destroy(BinaryWriter *writer) PyMem_Free(writer->zstd.compressed_buffer); #endif - /* Free string hash table (destroys keys which decrefs Python strings) */ if (writer->string_hash) { _Py_hashtable_destroy(writer->string_hash); } @@ -1186,13 +1131,11 @@ binary_writer_destroy(BinaryWriter *writer) } PyMem_Free(writer->string_lengths); - /* Free frame hash table (destroys keys which frees FrameKey structs) */ if (writer->frame_hash) { _Py_hashtable_destroy(writer->frame_hash); } PyMem_Free(writer->frame_entries); - /* Free per-thread buffers */ if (writer->thread_entries) { for (size_t i = 0; i < writer->thread_count; i++) { PyMem_Free(writer->thread_entries[i].prev_stack); From 788c56509972af80b8b3274ed00d05dc0323c594 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 15 Dec 2025 20:22:25 +0000 Subject: [PATCH 17/17] Speed up general case --- Lib/profiling/sampling/collector.py | 11 ++ Lib/profiling/sampling/gecko_collector.py | 117 +++++++------- Lib/profiling/sampling/heatmap_collector.py | 28 ++-- Lib/profiling/sampling/pstats_collector.py | 21 +-- Lib/profiling/sampling/stack_collector.py | 55 +++---- .../test_binary_format.py | 15 +- Modules/_remote_debugging/binary_io_reader.c | 143 +++++++++++++----- 7 files changed, 234 insertions(+), 156 deletions(-) diff --git a/Lib/profiling/sampling/collector.py b/Lib/profiling/sampling/collector.py index 6996bf99aef48a..0b485bbbb4c240 100644 --- a/Lib/profiling/sampling/collector.py +++ b/Lib/profiling/sampling/collector.py @@ -86,6 
+86,17 @@ def _iter_async_frames(self, awaited_info_list): # Phase 3: Build linear stacks from each leaf to root (optimized - no sorting!) yield from self._build_linear_stacks(leaf_task_ids, task_map, child_to_parent) + def _iter_stacks(self, stack_frames, skip_idle=False): + """Yield (frames, thread_id) for all stacks, handling both sync and async modes.""" + if stack_frames and hasattr(stack_frames[0], "awaited_by"): + for frames, thread_id, _ in self._iter_async_frames(stack_frames): + if frames: + yield frames, thread_id + else: + for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=skip_idle): + if frames: + yield frames, thread_id + def _build_task_graph(self, awaited_info_list): task_map = {} child_to_parent = {} # Maps child_id -> (selected_parent_id, parent_count) diff --git a/Lib/profiling/sampling/gecko_collector.py b/Lib/profiling/sampling/gecko_collector.py index 356d4609f4436b..c1c9cfcf3b93a9 100644 --- a/Lib/profiling/sampling/gecko_collector.py +++ b/Lib/profiling/sampling/gecko_collector.py @@ -141,27 +141,35 @@ def _track_state_transition(self, tid, condition, active_dict, inactive_dict, self._add_marker(tid, active_name, active_dict.pop(tid), current_time, category) - def collect(self, stack_frames, timestamp_us=None): - """Collect a sample from stack frames.""" - if timestamp_us is not None: - # Use provided timestamp (from binary replay) - # Track first timestamp as base for relative time calculation - if self._replay_base_timestamp_us is None: - self._replay_base_timestamp_us = timestamp_us - # Convert to milliseconds relative to first sample - current_time = (timestamp_us - self._replay_base_timestamp_us) / 1000 - else: - # Live sampling - use monotonic clock + def collect(self, stack_frames, timestamps_us=None): + """Collect samples from stack frames. 
+ + Args: + stack_frames: List of interpreter/thread frame info + timestamps_us: List of timestamps in microseconds (None for live sampling) + """ + # Handle live sampling (no timestamps provided) + if timestamps_us is None: current_time = (time.monotonic() * 1000) - self.start_time + times = [current_time] + else: + if not timestamps_us: + return + # Initialize base timestamp if needed + if self._replay_base_timestamp_us is None: + self._replay_base_timestamp_us = timestamps_us[0] + # Convert all timestamps to times (ms relative to first sample) + base = self._replay_base_timestamp_us + times = [(ts - base) / 1000 for ts in timestamps_us] + + first_time = times[0] # Update interval calculation if self.sample_count > 0 and self.last_sample_time > 0: - self.interval = ( - current_time - self.last_sample_time - ) / self.sample_count - self.last_sample_time = current_time + self.interval = (times[-1] - self.last_sample_time) / self.sample_count + self.last_sample_time = times[-1] - # Process threads and track GC per thread + # Process threads for interpreter_info in stack_frames: for thread_info in interpreter_info.threads: frames = thread_info.frame_info @@ -179,92 +187,86 @@ def collect(self, stack_frames, timestamp_us=None): on_cpu = bool(status_flags & THREAD_STATUS_ON_CPU) gil_requested = bool(status_flags & THREAD_STATUS_GIL_REQUESTED) - # Track GIL possession (Has GIL / No GIL) + # Track state transitions using first timestamp self._track_state_transition( tid, has_gil, self.has_gil_start, self.no_gil_start, - "Has GIL", "No GIL", CATEGORY_GIL, current_time + "Has GIL", "No GIL", CATEGORY_GIL, first_time ) - - # Track CPU state (On CPU / Off CPU) self._track_state_transition( tid, on_cpu, self.on_cpu_start, self.off_cpu_start, - "On CPU", "Off CPU", CATEGORY_CPU, current_time + "On CPU", "Off CPU", CATEGORY_CPU, first_time ) - # Track code type (Python Code / Native Code) - # This is tri-state: Python (has_gil), Native (on_cpu without gil), or Neither + # Track 
code type if has_gil: self._track_state_transition( tid, True, self.python_code_start, self.native_code_start, - "Python Code", "Native Code", CATEGORY_CODE_TYPE, current_time + "Python Code", "Native Code", CATEGORY_CODE_TYPE, first_time ) elif on_cpu: self._track_state_transition( tid, True, self.native_code_start, self.python_code_start, - "Native Code", "Python Code", CATEGORY_CODE_TYPE, current_time + "Native Code", "Python Code", CATEGORY_CODE_TYPE, first_time ) else: - # Thread is idle (neither has GIL nor on CPU) - close any open code markers - # This handles the third state that _track_state_transition doesn't cover if tid in self.initialized_threads: if tid in self.python_code_start: self._add_marker(tid, "Python Code", self.python_code_start.pop(tid), - current_time, CATEGORY_CODE_TYPE) + first_time, CATEGORY_CODE_TYPE) if tid in self.native_code_start: self._add_marker(tid, "Native Code", self.native_code_start.pop(tid), - current_time, CATEGORY_CODE_TYPE) + first_time, CATEGORY_CODE_TYPE) - # Track "Waiting for GIL" intervals (one-sided tracking) + # Track GIL wait if gil_requested: - self.gil_wait_start.setdefault(tid, current_time) + self.gil_wait_start.setdefault(tid, first_time) elif tid in self.gil_wait_start: self._add_marker(tid, "Waiting for GIL", self.gil_wait_start.pop(tid), - current_time, CATEGORY_GIL) + first_time, CATEGORY_GIL) - # Track exception state (Has Exception / No Exception) + # Track exception state has_exception = bool(status_flags & THREAD_STATUS_HAS_EXCEPTION) self._track_state_transition( tid, has_exception, self.exception_start, self.no_exception_start, - "Has Exception", "No Exception", CATEGORY_EXCEPTION, current_time + "Has Exception", "No Exception", CATEGORY_EXCEPTION, first_time ) - # Track GC events by detecting frames in the stack trace - # This leverages the improved GC frame tracking from commit 336366fd7ca - # which precisely identifies the thread that initiated GC collection + # Track GC events has_gc_frame = 
any(frame[2] == "" for frame in frames) if has_gc_frame: - # This thread initiated GC collection if tid not in self.gc_start_per_thread: - self.gc_start_per_thread[tid] = current_time + self.gc_start_per_thread[tid] = first_time elif tid in self.gc_start_per_thread: - # End GC marker when no more GC frames are detected self._add_marker(tid, "GC Collecting", self.gc_start_per_thread.pop(tid), - current_time, CATEGORY_GC) + first_time, CATEGORY_GC) - # Mark thread as initialized after processing all state transitions + # Mark thread as initialized self.initialized_threads.add(tid) - # Categorize: idle if neither has GIL nor on CPU + # Skip idle threads if requested is_idle = not has_gil and not on_cpu - - # Skip idle threads if skip_idle is enabled if self.skip_idle and is_idle: continue if not frames: continue - # Process the stack + # Process stack once to get stack_index stack_index = self._process_stack(thread_data, frames) - # Add sample - cache references to avoid dictionary lookups + # Add samples with timestamps samples = thread_data["samples"] - samples["stack"].append(stack_index) - samples["time"].append(current_time) - samples["eventDelay"].append(None) + samples_stack = samples["stack"] + samples_time = samples["time"] + samples_delay = samples["eventDelay"] + + for t in times: + samples_stack.append(stack_index) + samples_time.append(t) + samples_delay.append(None) - # Track opcode state changes for interval markers (leaf frame only) - if self.opcodes_enabled: + # Handle opcodes + if self.opcodes_enabled and frames: leaf_frame = frames[0] filename, location, funcname, opcode = leaf_frame if isinstance(location, tuple): @@ -276,18 +278,15 @@ def collect(self, stack_frames, timestamp_us=None): current_state = (opcode, lineno, col_offset, funcname, filename) if tid not in self.opcode_state: - # First observation - start tracking - self.opcode_state[tid] = (*current_state, current_time) + self.opcode_state[tid] = (*current_state, first_time) elif 
self.opcode_state[tid][:5] != current_state: - # State changed - emit marker for previous state prev_opcode, prev_lineno, prev_col, prev_funcname, prev_filename, prev_start = self.opcode_state[tid] self._add_opcode_interval_marker( - tid, prev_opcode, prev_lineno, prev_col, prev_funcname, prev_start, current_time + tid, prev_opcode, prev_lineno, prev_col, prev_funcname, prev_start, first_time ) - # Start tracking new state - self.opcode_state[tid] = (*current_state, current_time) + self.opcode_state[tid] = (*current_state, first_time) - self.sample_count += 1 + self.sample_count += len(times) def _create_thread(self, tid): """Create a new thread structure with processed profile format.""" diff --git a/Lib/profiling/sampling/heatmap_collector.py b/Lib/profiling/sampling/heatmap_collector.py index 5b4c89283be08c..4e7e359bf8903b 100644 --- a/Lib/profiling/sampling/heatmap_collector.py +++ b/Lib/profiling/sampling/heatmap_collector.py @@ -518,7 +518,7 @@ def set_stats(self, sample_interval_usec, duration_sec, sample_rate, error_rate= } self.stats.update(kwargs) - def process_frames(self, frames, thread_id): + def process_frames(self, frames, thread_id, weight=1): """Process stack frames and count samples per line. Args: @@ -526,8 +526,9 @@ def process_frames(self, frames, thread_id): leaf-to-root order. location is (lineno, end_lineno, col_offset, end_col_offset). opcode is None if not gathered. 
thread_id: Thread ID for this stack trace + weight: Number of samples this stack represents (for batched RLE) """ - self._total_samples += 1 + self._total_samples += weight self._seen_lines.clear() for i, (filename, location, funcname, opcode) in enumerate(frames): @@ -545,15 +546,16 @@ def process_frames(self, frames, thread_id): self._seen_lines.add(line_key) self._record_line_sample(filename, lineno, funcname, is_leaf=is_leaf, - count_cumulative=count_cumulative) + count_cumulative=count_cumulative, weight=weight) if opcode is not None: # Set opcodes_enabled flag when we first encounter opcode data self.opcodes_enabled = True self._record_bytecode_sample(filename, lineno, opcode, - end_lineno, col_offset, end_col_offset) + end_lineno, col_offset, end_col_offset, + weight=weight) - # Build call graph for adjacent frames + # Build call graph for adjacent frames (relationships are deduplicated anyway) if i + 1 < len(frames): next_frame = frames[i + 1] next_lineno = extract_lineno(next_frame[1]) @@ -575,24 +577,25 @@ def _is_valid_frame(self, filename, lineno): return True def _record_line_sample(self, filename, lineno, funcname, is_leaf=False, - count_cumulative=True): + count_cumulative=True, weight=1): """Record a sample for a specific line.""" # Track cumulative samples (all occurrences in stack) if count_cumulative: - self.line_samples[(filename, lineno)] += 1 - self.file_samples[filename][lineno] += 1 + self.line_samples[(filename, lineno)] += weight + self.file_samples[filename][lineno] += weight # Track self/leaf samples (only when at top of stack) if is_leaf: - self.line_self_samples[(filename, lineno)] += 1 - self.file_self_samples[filename][lineno] += 1 + self.line_self_samples[(filename, lineno)] += weight + self.file_self_samples[filename][lineno] += weight # Record function definition location if funcname and (filename, funcname) not in self.function_definitions: self.function_definitions[(filename, funcname)] = lineno def _record_bytecode_sample(self, 
filename, lineno, opcode, - end_lineno=None, col_offset=None, end_col_offset=None): + end_lineno=None, col_offset=None, end_col_offset=None, + weight=1): """Record a sample for a specific bytecode instruction. Args: @@ -602,6 +605,7 @@ def _record_bytecode_sample(self, filename, lineno, opcode, end_lineno: End line number (may be -1 if not available) col_offset: Column offset in UTF-8 bytes (may be -1 if not available) end_col_offset: End column offset in UTF-8 bytes (may be -1 if not available) + weight: Number of samples this represents (for batched RLE) """ key = (filename, lineno) @@ -609,7 +613,7 @@ def _record_bytecode_sample(self, filename, lineno, opcode, if opcode not in self.line_opcodes[key]: self.line_opcodes[key][opcode] = {'count': 0, 'locations': set()} - self.line_opcodes[key][opcode]['count'] += 1 + self.line_opcodes[key][opcode]['count'] += weight # Store unique location info if column offset is available (not -1) if col_offset is not None and col_offset >= 0: diff --git a/Lib/profiling/sampling/pstats_collector.py b/Lib/profiling/sampling/pstats_collector.py index eb79df1dc93dba..1b2fe6a77278ee 100644 --- a/Lib/profiling/sampling/pstats_collector.py +++ b/Lib/profiling/sampling/pstats_collector.py @@ -18,7 +18,7 @@ def __init__(self, sample_interval_usec, *, skip_idle=False): self.skip_idle = skip_idle self._seen_locations = set() - def _process_frames(self, frames): + def _process_frames(self, frames, weight=1): """Process a single thread's frame stack.""" if not frames: return @@ -32,12 +32,12 @@ def _process_frames(self, frames): location = (frame.filename, lineno, frame.funcname) if location not in self._seen_locations: self._seen_locations.add(location) - self.result[location]["cumulative_calls"] += 1 + self.result[location]["cumulative_calls"] += weight # The top frame gets counted as an inline call (directly executing) top_lineno = extract_lineno(frames[0].location) top_location = (frames[0].filename, top_lineno, frames[0].funcname) - 
self.result[top_location]["direct_calls"] += 1 + self.result[top_location]["direct_calls"] += weight # Track caller-callee relationships for call graph for i in range(1, len(frames)): @@ -49,17 +49,12 @@ def _process_frames(self, frames): callee = (callee_frame.filename, callee_lineno, callee_frame.funcname) caller = (caller_frame.filename, caller_lineno, caller_frame.funcname) - self.callers[callee][caller] += 1 + self.callers[callee][caller] += weight - def collect(self, stack_frames, timestamp_us=None): - if stack_frames and hasattr(stack_frames[0], "awaited_by"): - # Async frame processing - for frames, thread_id, task_id in self._iter_async_frames(stack_frames): - self._process_frames(frames) - else: - # Regular frame processing - for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=self.skip_idle): - self._process_frames(frames) + def collect(self, stack_frames, timestamps_us=None): + weight = len(timestamps_us) if timestamps_us else 1 + for frames, _ in self._iter_stacks(stack_frames, skip_idle=self.skip_idle): + self._process_frames(frames, weight=weight) def export(self, filename): self.create_stats() diff --git a/Lib/profiling/sampling/stack_collector.py b/Lib/profiling/sampling/stack_collector.py index 8e75234ed5251e..55e643d0e9c8cb 100644 --- a/Lib/profiling/sampling/stack_collector.py +++ b/Lib/profiling/sampling/stack_collector.py @@ -18,21 +18,12 @@ def __init__(self, sample_interval_usec, *, skip_idle=False): self.sample_interval_usec = sample_interval_usec self.skip_idle = skip_idle - def collect(self, stack_frames, timestamp_us=None, skip_idle=False): - if stack_frames and hasattr(stack_frames[0], "awaited_by"): - # Async-aware mode: process async task frames - for frames, thread_id, task_id in self._iter_async_frames(stack_frames): - if not frames: - continue - self.process_frames(frames, thread_id) - else: - # Sync-only mode - for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=skip_idle): - if not frames: - 
continue - self.process_frames(frames, thread_id) + def collect(self, stack_frames, timestamps_us=None, skip_idle=False): + weight = len(timestamps_us) if timestamps_us else 1 + for frames, thread_id in self._iter_stacks(stack_frames, skip_idle=skip_idle): + self.process_frames(frames, thread_id, weight=weight) - def process_frames(self, frames, thread_id): + def process_frames(self, frames, thread_id, weight=1): pass @@ -41,13 +32,13 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.stack_counter = collections.Counter() - def process_frames(self, frames, thread_id): + def process_frames(self, frames, thread_id, weight=1): # Extract only (filename, lineno, funcname) - opcode not needed for collapsed stacks # frame is (filename, location, funcname, opcode) call_tree = tuple( (f[0], extract_lineno(f[1]), f[2]) for f in reversed(frames) ) - self.stack_counter[(call_tree, thread_id)] += 1 + self.stack_counter[(call_tree, thread_id)] += weight def export(self, filename): lines = [] @@ -96,23 +87,26 @@ def __init__(self, *args, **kwargs): # Per-thread statistics self.per_thread_stats = {} # {thread_id: {has_gil, on_cpu, gil_requested, unknown, has_exception, total, gc_samples}} - def collect(self, stack_frames, timestamp_us=None, skip_idle=False): + def collect(self, stack_frames, timestamps_us=None, skip_idle=False): """Override to track thread status statistics before processing frames.""" - # Increment sample count once per sample - self._sample_count += 1 + # Weight is number of timestamps (samples with identical stack) + weight = len(timestamps_us) if timestamps_us else 1 + + # Increment sample count by weight + self._sample_count += weight # Collect both aggregate and per-thread statistics using base method status_counts, has_gc_frame, per_thread_stats = self._collect_thread_status_stats(stack_frames) - # Merge aggregate status counts + # Merge aggregate status counts (multiply by weight) for key in status_counts: - 
self.thread_status_counts[key] += status_counts[key] + self.thread_status_counts[key] += status_counts[key] * weight # Update aggregate GC frame count if has_gc_frame: - self.samples_with_gc_frames += 1 + self.samples_with_gc_frames += weight - # Merge per-thread statistics + # Merge per-thread statistics (multiply by weight) for thread_id, stats in per_thread_stats.items(): if thread_id not in self.per_thread_stats: self.per_thread_stats[thread_id] = { @@ -125,10 +119,10 @@ def collect(self, stack_frames, timestamp_us=None, skip_idle=False): "gc_samples": 0, } for key, value in stats.items(): - self.per_thread_stats[thread_id][key] += value + self.per_thread_stats[thread_id][key] += value * weight # Call parent collect to process frames - super().collect(stack_frames, timestamp_us=timestamp_us, skip_idle=skip_idle) + super().collect(stack_frames, timestamps_us, skip_idle=skip_idle) def set_stats(self, sample_interval_usec, duration_sec, sample_rate, error_rate=None, missed_samples=None, mode=None): @@ -311,7 +305,7 @@ def convert_children(children, min_samples): "opcode_mapping": opcode_mapping } - def process_frames(self, frames, thread_id): + def process_frames(self, frames, thread_id, weight=1): """Process stack frames into flamegraph tree structure. Args: @@ -319,10 +313,11 @@ def process_frames(self, frames, thread_id): leaf-to-root order. location is (lineno, end_lineno, col_offset, end_col_offset). opcode is None if not gathered. 
thread_id: Thread ID for this stack trace + weight: Number of samples this stack represents (for batched RLE) """ # Reverse to root->leaf order for tree building - self._root["samples"] += 1 - self._total_samples += 1 + self._root["samples"] += weight + self._total_samples += weight self._root["threads"].add(thread_id) self._all_threads.add(thread_id) @@ -336,11 +331,11 @@ def process_frames(self, frames, thread_id): if node is None: node = {"samples": 0, "children": {}, "threads": set(), "opcodes": collections.Counter()} current["children"][func] = node - node["samples"] += 1 + node["samples"] += weight node["threads"].add(thread_id) if opcode is not None: - node["opcodes"][opcode] += 1 + node["opcodes"][opcode] += weight current = node diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py index ca37eec01f43d8..64bef181da9ba2 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py @@ -62,8 +62,10 @@ def __init__(self): self.by_thread = defaultdict(list) self.total_count = 0 - def collect(self, stack_frames, timestamp_us=None): + def collect(self, stack_frames, timestamps_us): """Capture the raw sample data.""" + # timestamps_us is a list; add one sample per timestamp + count = len(timestamps_us) for interp in stack_frames: for thread in interp.threads: frames = [] @@ -76,13 +78,10 @@ def collect(self, stack_frames, timestamp_us=None): } ) key = (interp.interpreter_id, thread.thread_id) - self.by_thread[key].append( - { - "status": thread.status, - "frames": frames, - } - ) - self.total_count += 1 + sample = {"status": thread.status, "frames": frames} + for _ in range(count): + self.by_thread[key].append(sample) + self.total_count += count def export(self, filename): pass diff --git a/Modules/_remote_debugging/binary_io_reader.c 
b/Modules/_remote_debugging/binary_io_reader.c index ef3d26b31cffdf..10381e8b62bc00 100644 --- a/Modules/_remote_debugging/binary_io_reader.c +++ b/Modules/_remote_debugging/binary_io_reader.c @@ -677,16 +677,14 @@ build_frame_list(RemoteDebuggingState *state, BinaryReader *reader, return NULL; } -/* Helper to build and emit a sample to the collector */ -static int -emit_sample(RemoteDebuggingState *state, PyObject *collector, - uint64_t thread_id, uint32_t interpreter_id, uint8_t status, - const uint32_t *frame_indices, size_t stack_depth, - BinaryReader *reader, uint64_t timestamp_us) +/* Helper to build sample_list from frame indices (shared by emit functions) */ +static PyObject * +build_sample_list(RemoteDebuggingState *state, BinaryReader *reader, + uint64_t thread_id, uint32_t interpreter_id, uint8_t status, + const uint32_t *frame_indices, size_t stack_depth) { PyObject *frame_list = NULL, *thread_info = NULL, *thread_list = NULL; - PyObject *interp_info = NULL, *sample_list = NULL, *result = NULL; - int ret = -1; + PyObject *interp_info = NULL, *sample_list = NULL; frame_list = build_frame_list(state, reader, frame_indices, stack_depth); if (!frame_list) { @@ -735,27 +733,54 @@ emit_sample(RemoteDebuggingState *state, PyObject *collector, goto error; } PyList_SET_ITEM(sample_list, 0, interp_info); - interp_info = NULL; - - /* Pass timestamp_us to collector - collectors use it if provided */ - PyObject *timestamp_obj = PyLong_FromUnsignedLongLong(timestamp_us); - if (!timestamp_obj) { - goto error; - } - result = PyObject_CallMethod(collector, "collect", "OO", sample_list, timestamp_obj); - Py_DECREF(timestamp_obj); - if (result) { - ret = 0; - } + return sample_list; error: - Py_XDECREF(result); Py_XDECREF(sample_list); Py_XDECREF(interp_info); Py_XDECREF(thread_list); Py_XDECREF(thread_info); Py_XDECREF(frame_list); - return ret; + return NULL; +} + +/* Helper to emit a sample to the collector. timestamps_list is borrowed. 
*/ +static int +emit_sample(RemoteDebuggingState *state, PyObject *collector, + uint64_t thread_id, uint32_t interpreter_id, uint8_t status, + const uint32_t *frame_indices, size_t stack_depth, + BinaryReader *reader, PyObject *timestamps_list) +{ + PyObject *sample_list = build_sample_list(state, reader, thread_id, + interpreter_id, status, + frame_indices, stack_depth); + if (!sample_list) { + return -1; + } + + PyObject *result = PyObject_CallMethod(collector, "collect", "OO", sample_list, timestamps_list); + Py_DECREF(sample_list); + + if (!result) { + return -1; + } + Py_DECREF(result); + return 0; +} + +/* Helper to trim timestamp list and emit batch. Returns 0 on success, -1 on error. */ +static int +emit_batch(RemoteDebuggingState *state, PyObject *collector, + uint64_t thread_id, uint32_t interpreter_id, uint8_t status, + const uint32_t *frame_indices, size_t stack_depth, + BinaryReader *reader, PyObject *timestamps_list, Py_ssize_t actual_size) +{ + /* Trim list to actual size */ + if (PyList_SetSlice(timestamps_list, actual_size, PyList_GET_SIZE(timestamps_list), NULL) < 0) { + return -1; + } + return emit_sample(state, collector, thread_id, interpreter_id, status, + frame_indices, stack_depth, reader, timestamps_list); } /* Helper to invoke progress callback, clearing any errors */ @@ -849,36 +874,72 @@ binary_reader_replay(BinaryReader *reader, PyObject *collector, PyObject *progre reader->stats.repeat_records++; reader->stats.repeat_samples += count; + /* Process RLE samples, batching by status */ + PyObject *timestamps_list = NULL; + uint8_t batch_status = 0; + Py_ssize_t batch_idx = 0; + for (uint32_t i = 0; i < count; i++) { size_t delta_prev_offset = offset; uint64_t delta = decode_varint_u64(reader->sample_data, &offset, reader->sample_data_size); - /* Detect varint decode failure: offset unchanged means error (overflow or truncated) */ if (offset == delta_prev_offset) { + Py_XDECREF(timestamps_list); PyErr_SetString(PyExc_ValueError, "Malformed 
varint in RLE sample data"); return -1; } if (offset >= reader->sample_data_size) { + Py_XDECREF(timestamps_list); PyErr_SetString(PyExc_ValueError, "Unexpected end of sample data in RLE"); return -1; } uint8_t status = reader->sample_data[offset++]; - ts->prev_timestamp += delta; - /* Emit sample using cached stack */ - if (emit_sample(state, collector, thread_id, interpreter_id, status, - ts->current_stack, ts->current_stack_depth, reader, - ts->prev_timestamp) < 0) { + /* Start new batch on first sample or status change */ + if (i == 0 || status != batch_status) { + if (timestamps_list) { + int rc = emit_batch(state, collector, thread_id, interpreter_id, + batch_status, ts->current_stack, ts->current_stack_depth, + reader, timestamps_list, batch_idx); + Py_DECREF(timestamps_list); + if (rc < 0) { + return -1; + } + } + timestamps_list = PyList_New(count - i); + if (!timestamps_list) { + return -1; + } + batch_status = status; + batch_idx = 0; + } + + PyObject *ts_obj = PyLong_FromUnsignedLongLong(ts->prev_timestamp); + if (!ts_obj) { + Py_DECREF(timestamps_list); return -1; } - replayed++; - reader->stats.total_samples++; + PyList_SET_ITEM(timestamps_list, batch_idx++, ts_obj); + } - /* Progress callback inside RLE loop for smooth updates */ - if (replayed % PROGRESS_CALLBACK_INTERVAL == 0) { - invoke_progress_callback(progress_callback, replayed, reader->sample_count); + /* Emit final batch */ + if (timestamps_list) { + int rc = emit_batch(state, collector, thread_id, interpreter_id, + batch_status, ts->current_stack, ts->current_stack_depth, + reader, timestamps_list, batch_idx); + Py_DECREF(timestamps_list); + if (rc < 0) { + return -1; } } + + replayed += count; + reader->stats.total_samples += count; + + /* Progress callback after batch */ + if (replayed % PROGRESS_CALLBACK_INTERVAL < count) { + invoke_progress_callback(progress_callback, replayed, reader->sample_count); + } break; } @@ -918,11 +979,25 @@ binary_reader_replay(BinaryReader *reader, PyObject 
*collector, PyObject *progre } reader->stats.stack_reconstructions++; + /* Build single-element timestamp list */ + PyObject *ts_obj = PyLong_FromUnsignedLongLong(ts->prev_timestamp); + if (!ts_obj) { + return -1; + } + PyObject *timestamps_list = PyList_New(1); + if (!timestamps_list) { + Py_DECREF(ts_obj); + return -1; + } + PyList_SET_ITEM(timestamps_list, 0, ts_obj); + if (emit_sample(state, collector, thread_id, interpreter_id, status, ts->current_stack, ts->current_stack_depth, reader, - ts->prev_timestamp) < 0) { + timestamps_list) < 0) { + Py_DECREF(timestamps_list); return -1; } + Py_DECREF(timestamps_list); replayed++; reader->stats.total_samples++; break;