From 086c79ab9a053e8b34253d6ed297718b88cbb3bc Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Sat, 11 Oct 2025 18:16:43 -0700 Subject: [PATCH 1/7] Use PyBytesWriter in pycore_blocks_output_buffer.h Currently, the _BlocksOutputBuffer code creates a list of bytes objects to handle the output data from compression libraries. This ends up being slow due to the output buffer code needing to copy each bytes element of the list into the final bytes object buffer at the end of compression. The new PyBytesWriter API introduced in PEP 782 is an ergonomic and fast method of writing data into a buffer that will later turn into a bytes object. Benchmarks show that using the PyBytesWriter API is 10-30% faster for decompression across a variety of settings. The performance gains are greatest when the decompressor is very performant, such as for Zstandard. Otherwise the decompressor can bottleneck decompression and the gains are more modest, but still sizable! --- .../internal/pycore_blocks_output_buffer.h | 113 ++++-------------- Modules/_bz2module.c | 4 +- Modules/_lzmamodule.c | 4 +- Modules/_zstd/buffer.h | 8 +- Modules/_zstd/compressor.c | 4 +- Modules/_zstd/decompressor.c | 2 +- Modules/zlibmodule.c | 12 +- 7 files changed, 43 insertions(+), 104 deletions(-) diff --git a/Include/internal/pycore_blocks_output_buffer.h b/Include/internal/pycore_blocks_output_buffer.h index 573e10359b7bd2..3ba979ad6ab189 100644 --- a/Include/internal/pycore_blocks_output_buffer.h +++ b/Include/internal/pycore_blocks_output_buffer.h @@ -46,11 +46,13 @@ extern "C" { typedef struct { // List of bytes objects - PyObject *list; + PyBytesWriter *writer; // Number of whole allocated size Py_ssize_t allocated; - // Max length of the buffer, negative number means unlimited length. + // Max length of the buffer, negative number means unlimited length Py_ssize_t max_length; + // Number of blocks of bytes. Used to calculate next allocation size + Py_ssize_t num_blocks; } _BlocksOutputBuffer; static const char unable_allocate_msg[] = "Unable to allocate output buffer."; @@ -107,11 +109,10 @@ _BlocksOutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer, const Py_ssize_t max_length, void **next_out) { - PyObject *b; Py_ssize_t block_size; - // ensure .list was set to NULL - assert(buffer->list == NULL); + // ensure .writer was set to NULL + assert(buffer->writer == NULL); // get block size if (0 <= max_length && max_length < BUFFER_BLOCK_SIZE[0]) { @@ -120,25 +121,17 @@ _BlocksOutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer, block_size = BUFFER_BLOCK_SIZE[0]; } - // the first block - b = PyBytes_FromStringAndSize(NULL, block_size); - if (b == NULL) { + buffer->writer = PyBytesWriter_Create(block_size); + if (buffer->writer == NULL) { return -1; } - // create the list - buffer->list = PyList_New(1); - if (buffer->list == NULL) { - Py_DECREF(b); - return -1; - } - PyList_SET_ITEM(buffer->list, 0, b); - // set variables buffer->allocated = block_size; buffer->max_length = max_length; + buffer->num_blocks = 1; - *next_out = PyBytes_AS_STRING(b); + *next_out = PyBytesWriter_GetData(buffer->writer); return block_size; } @@ -155,31 +148,22 @@ _BlocksOutputBuffer_InitWithSize(_BlocksOutputBuffer *buffer, const Py_ssize_t init_size, void **next_out) { - PyObject *b; - // ensure .list was set to NULL - assert(buffer->list == NULL); + // ensure .writer was set to NULL + assert(buffer->writer == NULL); - // the first block - b = PyBytes_FromStringAndSize(NULL, init_size); - if (b == NULL) { + buffer->writer = PyBytesWriter_Create(init_size); + if (buffer->writer == NULL) { PyErr_SetString(PyExc_MemoryError, unable_allocate_msg); return -1; } - // create the list - buffer->list = PyList_New(1); - if (buffer->list == NULL) { - Py_DECREF(b); - return -1; - } - PyList_SET_ITEM(buffer->list, 0, b); - // set variables buffer->allocated = init_size; buffer->max_length = -1; + buffer->num_blocks = 1; - *next_out = PyBytes_AS_STRING(b); + *next_out = PyBytesWriter_GetData(buffer->writer); return init_size; } @@ -193,8 +177,6 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer, void **next_out, const Py_ssize_t avail_out) { - PyObject *b; - const Py_ssize_t list_len = Py_SIZE(buffer->list); Py_ssize_t block_size; // ensure no gaps in the data @@ -205,8 +187,8 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer, } // get block size - if (list_len < (Py_ssize_t) Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE)) { - block_size = BUFFER_BLOCK_SIZE[list_len]; + if (buffer->num_blocks < (Py_ssize_t) Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE)) { + block_size = BUFFER_BLOCK_SIZE[buffer->num_blocks]; } else { block_size = BUFFER_BLOCK_SIZE[Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE) - 1]; } @@ -229,22 +211,18 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer, return -1; } - // create the block - b = PyBytes_FromStringAndSize(NULL, block_size); - if (b == NULL) { + if (PyBytesWriter_Grow(buffer->writer, block_size)) { PyErr_SetString(PyExc_MemoryError, unable_allocate_msg); return -1; } - if (PyList_Append(buffer->list, b) < 0) { - Py_DECREF(b); - return -1; - } - Py_DECREF(b); + + Py_ssize_t current_size = buffer->allocated; // set variables buffer->allocated += block_size; + buffer->num_blocks += 1; - *next_out = PyBytes_AS_STRING(b); + *next_out = PyBytesWriter_GetData(buffer->writer) + current_size; return block_size; } @@ -265,54 +243,15 @@ static inline PyObject * _BlocksOutputBuffer_Finish(_BlocksOutputBuffer *buffer, const Py_ssize_t avail_out) { - PyObject *result, *block; - const Py_ssize_t list_len = Py_SIZE(buffer->list); - - // fast path for single block - if ((list_len == 1 && avail_out == 0) || - (list_len == 2 && Py_SIZE(PyList_GET_ITEM(buffer->list, 1)) == avail_out)) - { - block = PyList_GET_ITEM(buffer->list, 0); - Py_INCREF(block); - - Py_CLEAR(buffer->list); - return block; - } - - // final bytes object - result = PyBytes_FromStringAndSize(NULL, buffer->allocated - avail_out); - if (result == NULL) { - PyErr_SetString(PyExc_MemoryError, unable_allocate_msg); - return NULL; - } - - // memory copy - if (list_len > 0) { - char *posi = PyBytes_AS_STRING(result); - - // blocks except the last one - Py_ssize_t i = 0; - for (; i < list_len-1; i++) { - block = PyList_GET_ITEM(buffer->list, i); - memcpy(posi, PyBytes_AS_STRING(block), Py_SIZE(block)); - posi += Py_SIZE(block); - } - // the last block - block = PyList_GET_ITEM(buffer->list, i); - memcpy(posi, PyBytes_AS_STRING(block), Py_SIZE(block) - avail_out); - } else { - assert(Py_SIZE(result) == 0); - } - - Py_CLEAR(buffer->list); - return result; + return PyBytesWriter_FinishWithSize(buffer->writer, + buffer->allocated - avail_out); } /* Clean up the buffer when an error occurred. */ static inline void _BlocksOutputBuffer_OnError(_BlocksOutputBuffer *buffer) { - Py_CLEAR(buffer->list); + PyBytesWriter_Discard(buffer->writer); } #ifdef __cplusplus diff --git a/Modules/_bz2module.c b/Modules/_bz2module.c index 2e4cc43a2c3f11..9721b493a19956 100644 --- a/Modules/_bz2module.c +++ b/Modules/_bz2module.c @@ -190,7 +190,7 @@ static PyObject * compress(BZ2Compressor *c, char *data, size_t len, int action) { PyObject *result; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; if (OutputBuffer_InitAndGrow(&buffer, -1, &c->bzs.next_out, &c->bzs.avail_out) < 0) { goto error; @@ -429,7 +429,7 @@ decompress_buf(BZ2Decompressor *d, Py_ssize_t max_length) compare against max_length and PyBytes_GET_SIZE we declare it as signed */ PyObject *result; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; bz_stream *bzs = &d->bzs; if (OutputBuffer_InitAndGrow(&buffer, max_length, &bzs->next_out, &bzs->avail_out) < 0) { diff --git a/Modules/_lzmamodule.c b/Modules/_lzmamodule.c index 3e8e37096ba6b4..6fc072f6d0a382 100644 --- a/Modules/_lzmamodule.c +++ b/Modules/_lzmamodule.c @@ -554,7 +554,7 @@ static PyObject * compress(Compressor *c, uint8_t *data, size_t len, lzma_action action) { PyObject *result; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; _lzma_state *state = PyType_GetModuleState(Py_TYPE(c)); assert(state != NULL); @@ -940,7 +940,7 @@ decompress_buf(Decompressor *d, Py_ssize_t max_length) { PyObject *result; lzma_stream *lzs = &d->lzs; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; _lzma_state *state = PyType_GetModuleState(Py_TYPE(d)); assert(state != NULL); diff --git a/Modules/_zstd/buffer.h b/Modules/_zstd/buffer.h index 0ac7bcb4ddc416..807c72c80dde8b 100644 --- a/Modules/_zstd/buffer.h +++ b/Modules/_zstd/buffer.h @@ -16,8 +16,8 @@ static inline int _OutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer, ZSTD_outBuffer *ob, Py_ssize_t max_length) { - /* Ensure .list was set to NULL */ - assert(buffer->list == NULL); + /* Ensure .writer was set to NULL */ + assert(buffer->writer == NULL); Py_ssize_t res = _BlocksOutputBuffer_InitAndGrow(buffer, max_length, &ob->dst); @@ -39,8 +39,8 @@ _OutputBuffer_InitWithSize(_BlocksOutputBuffer *buffer, ZSTD_outBuffer *ob, { Py_ssize_t block_size; - /* Ensure .list was set to NULL */ - assert(buffer->list == NULL); + /* Ensure .writer was set to NULL */ + assert(buffer->writer == NULL); /* Get block size */ if (0 <= max_length && max_length < init_size) { diff --git a/Modules/_zstd/compressor.c b/Modules/_zstd/compressor.c index 029c07113d4f45..f90bc9c5ab58b1 100644 --- a/Modules/_zstd/compressor.c +++ b/Modules/_zstd/compressor.c @@ -446,7 +446,7 @@ compress_lock_held(ZstdCompressor *self, Py_buffer *data, assert(PyMutex_IsLocked(&self->lock)); ZSTD_inBuffer in; ZSTD_outBuffer out; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; size_t zstd_ret; PyObject *ret; @@ -527,7 +527,7 @@ compress_mt_continue_lock_held(ZstdCompressor *self, Py_buffer *data) assert(PyMutex_IsLocked(&self->lock)); ZSTD_inBuffer in; ZSTD_outBuffer out; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; size_t zstd_ret; PyObject *ret; diff --git a/Modules/_zstd/decompressor.c b/Modules/_zstd/decompressor.c index 6592cad6690d49..13071b7a2bacf0 100644 --- a/Modules/_zstd/decompressor.c +++ b/Modules/_zstd/decompressor.c @@ -216,7 +216,7 @@ decompress_lock_held(ZstdDecompressor *self, ZSTD_inBuffer *in, { size_t zstd_ret; ZSTD_outBuffer out; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; PyObject *ret; /* Initialize the output buffer */ diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index f1312e687da71c..36c933bf618af0 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -344,7 +344,7 @@ zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits) PyObject *return_value; int flush; z_stream zst; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; zlibstate *state = get_zlib_state(module); @@ -445,7 +445,7 @@ zlib_decompress_impl(PyObject *module, Py_buffer *data, int wbits, Py_ssize_t ibuflen; int err, flush; z_stream zst; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; _Uint32Window window; // output buffer's UINT32_MAX sliding window zlibstate *state = get_zlib_state(module); @@ -774,7 +774,7 @@ zlib_Compress_compress_impl(compobject *self, PyTypeObject *cls, { PyObject *return_value; int err; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; zlibstate *state = PyType_GetModuleState(cls); ENTER_ZLIB(self); @@ -898,7 +898,7 @@ zlib_Decompress_decompress_impl(compobject *self, PyTypeObject *cls, int err = Z_OK; Py_ssize_t ibuflen; PyObject *return_value; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; PyObject *module = PyType_GetModule(cls); if (module == NULL) @@ -1005,7 +1005,7 @@ zlib_Compress_flush_impl(compobject *self, PyTypeObject *cls, int mode) { int err; PyObject *return_value; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; zlibstate *state = PyType_GetModuleState(cls); /* Flushing with Z_NO_FLUSH is a no-op, so there's no point in @@ -1267,7 +1267,7 @@ zlib_Decompress_flush_impl(compobject *self, PyTypeObject *cls, Py_buffer data; PyObject *return_value; Py_ssize_t ibuflen; - _BlocksOutputBuffer buffer = {.list = NULL}; + _BlocksOutputBuffer buffer = {.writer = NULL}; _Uint32Window window; // output buffer's UINT32_MAX sliding window PyObject *module = PyType_GetModule(cls); From 29a8230e5060a47396a93156849eaed3baedd26b Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Sat, 11 Oct 2025 18:48:08 -0700 Subject: [PATCH 2/7] Fix lint --- Include/internal/pycore_blocks_output_buffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Include/internal/pycore_blocks_output_buffer.h b/Include/internal/pycore_blocks_output_buffer.h index 3ba979ad6ab189..54b0d35bd071df 100644 --- a/Include/internal/pycore_blocks_output_buffer.h +++ b/Include/internal/pycore_blocks_output_buffer.h @@ -243,7 +243,7 @@ static inline PyObject * _BlocksOutputBuffer_Finish(_BlocksOutputBuffer *buffer, const Py_ssize_t avail_out) { - return PyBytesWriter_FinishWithSize(buffer->writer, + return PyBytesWriter_FinishWithSize(buffer->writer, buffer->allocated - avail_out); } From ffd48e406ee28a94cbd0afa905fb30564f554200 Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Sat, 11 Oct 2025 18:56:22 -0700 Subject: [PATCH 3/7] Cast data buffer to char * for Windows --- Include/internal/pycore_blocks_output_buffer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Include/internal/pycore_blocks_output_buffer.h b/Include/internal/pycore_blocks_output_buffer.h index 54b0d35bd071df..8141f24161aba0 100644 --- a/Include/internal/pycore_blocks_output_buffer.h +++ b/Include/internal/pycore_blocks_output_buffer.h @@ -222,7 +222,8 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer, buffer->allocated += block_size; buffer->num_blocks += 1; - *next_out = PyBytesWriter_GetData(buffer->writer) + current_size; + char* data = PyBytesWriter_GetData(buffer->writer); + *next_out = data + current_size; return block_size; } From 2336d0e5c2fbc26f7cf2f5754c52fdd2b2421e66 Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Sun, 12 Oct 2025 21:20:19 -0700 Subject: [PATCH 4/7] =?UTF-8?q?Respond=20to=20review=20from=20B=C3=A9n?= =?UTF-8?q?=C3=A9dikt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Include/internal/pycore_blocks_output_buffer.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Include/internal/pycore_blocks_output_buffer.h b/Include/internal/pycore_blocks_output_buffer.h index 8141f24161aba0..5e57352c1a8014 100644 --- a/Include/internal/pycore_blocks_output_buffer.h +++ b/Include/internal/pycore_blocks_output_buffer.h @@ -45,7 +45,7 @@ extern "C" { #endif typedef struct { - // List of bytes objects + // Bytes writer managing output buffer PyBytesWriter *writer; // Number of whole allocated size Py_ssize_t allocated; @@ -154,7 +154,7 @@ _BlocksOutputBuffer_InitWithSize(_BlocksOutputBuffer *buffer, buffer->writer = PyBytesWriter_Create(init_size); if (buffer->writer == NULL) { - PyErr_SetString(PyExc_MemoryError, unable_allocate_msg); + // PyBytesWriter_Create already sets an exception when out of memory. return -1; } @@ -244,6 +244,7 @@ static inline PyObject * _BlocksOutputBuffer_Finish(_BlocksOutputBuffer *buffer, const Py_ssize_t avail_out) { + assert(buffer->writer != NULL); return PyBytesWriter_FinishWithSize(buffer->writer, buffer->allocated - avail_out); } @@ -253,6 +254,7 @@ static inline void _BlocksOutputBuffer_OnError(_BlocksOutputBuffer *buffer) { PyBytesWriter_Discard(buffer->writer); + buffer->writer = NULL; } #ifdef __cplusplus From dfd1c25b83f8efe39e8be09e9e1eda7b205ea50a Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Mon, 13 Oct 2025 11:19:07 -0700 Subject: [PATCH 5/7] =?UTF-8?q?Respond=20to=20more=20review=20from=20B?= =?UTF-8?q?=C3=A9n=C3=A9dikt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Include/internal/pycore_blocks_output_buffer.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_blocks_output_buffer.h b/Include/internal/pycore_blocks_output_buffer.h index 5e57352c1a8014..93cc642f4c1af1 100644 --- a/Include/internal/pycore_blocks_output_buffer.h +++ b/Include/internal/pycore_blocks_output_buffer.h @@ -52,7 +52,7 @@ typedef struct { // Max length of the buffer, negative number means unlimited length Py_ssize_t max_length; // Number of blocks of bytes. Used to calculate next allocation size - Py_ssize_t num_blocks; + size_t num_blocks; } _BlocksOutputBuffer; static const char unable_allocate_msg[] = "Unable to allocate output buffer."; @@ -154,7 +154,6 @@ _BlocksOutputBuffer_InitWithSize(_BlocksOutputBuffer *buffer, buffer->writer = PyBytesWriter_Create(init_size); if (buffer->writer == NULL) { - // PyBytesWriter_Create already sets an exception when out of memory. return -1; } @@ -187,7 +186,7 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer, } // get block size - if (buffer->num_blocks < (Py_ssize_t) Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE)) { + if (buffer->num_blocks < Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE)) { block_size = BUFFER_BLOCK_SIZE[buffer->num_blocks]; } else { block_size = BUFFER_BLOCK_SIZE[Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE) - 1]; From e5f3661b6931cca86775bc7787116f8c829bf195 Mon Sep 17 00:00:00 2001 From: Emma Smith Date: Mon, 13 Oct 2025 11:21:00 -0700 Subject: [PATCH 6/7] Update Include/internal/pycore_blocks_output_buffer.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- Include/internal/pycore_blocks_output_buffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Include/internal/pycore_blocks_output_buffer.h b/Include/internal/pycore_blocks_output_buffer.h index 93cc642f4c1af1..3085f17b13c138 100644 --- a/Include/internal/pycore_blocks_output_buffer.h +++ b/Include/internal/pycore_blocks_output_buffer.h @@ -221,7 +221,7 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer, buffer->allocated += block_size; buffer->num_blocks += 1; - char* data = PyBytesWriter_GetData(buffer->writer); + char *data = PyBytesWriter_GetData(buffer->writer); *next_out = data + current_size; return block_size; } From 029c53c192c9569366f87ec53bf472355378c346 Mon Sep 17 00:00:00 2001 From: Emma Harper Smith Date: Mon, 13 Oct 2025 12:10:55 -0700 Subject: [PATCH 7/7] Refactor block size calculation --- Include/internal/pycore_blocks_output_buffer.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Include/internal/pycore_blocks_output_buffer.h b/Include/internal/pycore_blocks_output_buffer.h index 3085f17b13c138..016e7a18665859 100644 --- a/Include/internal/pycore_blocks_output_buffer.h +++ b/Include/internal/pycore_blocks_output_buffer.h @@ -186,11 +186,10 @@ _BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer, } // get block size - if (buffer->num_blocks < Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE)) { - block_size = BUFFER_BLOCK_SIZE[buffer->num_blocks]; - } else { - block_size = BUFFER_BLOCK_SIZE[Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE) - 1]; - } + size_t maxblock = Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE); + assert(maxblock >= 1); + size_t block_index = Py_MIN(buffer->num_blocks, maxblock - 1); + block_size = BUFFER_BLOCK_SIZE[block_index]; // check max_length if (buffer->max_length >= 0) {