From 783c0b129bf4ddc75ddfad98f93a105ed4cbafee Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Wed, 3 Dec 2025 14:36:47 -0800 Subject: [PATCH] gh-139871: Optimize bytearray construction with encoding When a `str` is encoded in `bytearray.__init__`, the encoder tends to create a new unique bytes object. Rather than allocating new memory and copying the bytes, use the already created bytes object as the bytearray's backing buffer. The bigger the `str`, the bigger the saving. Mean +- std dev: [main_encoding] 497 us +- 9 us -> [encoding] 14.2 us +- 0.3 us: 34.97x faster ```python import pyperf runner = pyperf.Runner() runner.timeit( name="encode", setup="a = 'a' * 1_000_000", stmt="bytearray(a, encoding='utf8')") ``` --- Objects/bytearrayobject.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index 99e1c9b13f7879..25cc0bfcbaba45 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -914,6 +914,10 @@ bytearray___init___impl(PyByteArrayObject *self, PyObject *arg, return -1; } + /* Should be caused by first init or the resize to 0. */ + assert(self->ob_bytes_object == Py_GetConstantBorrowed(Py_CONSTANT_EMPTY_BYTES)); + assert(self->ob_exports == 0); + /* Make a quick exit if no first argument */ if (arg == NULL) { if (encoding != NULL || errors != NULL) { @@ -935,9 +939,20 @@ bytearray___init___impl(PyByteArrayObject *self, PyObject *arg, return -1; } encoded = PyUnicode_AsEncodedString(arg, encoding, errors); - if (encoded == NULL) + if (encoded == NULL) { return -1; + } assert(PyBytes_Check(encoded)); + + /* Most encodes return a new unique bytes, just use it as buffer. */ + if (_PyObject_IsUniquelyReferenced(encoded) + && PyBytes_CheckExact(encoded)) + { + Py_ssize_t size = Py_SIZE(encoded); + self->ob_bytes_object = encoded; + bytearray_reinit_from_bytes(self, size, size); + return 0; + } new = bytearray_iconcat((PyObject*)self, encoded); Py_DECREF(encoded); if (new == NULL)