From 1711ebd4559313555e89d0f56294842779c7e900 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 1 Nov 2023 10:33:11 -0700 Subject: [PATCH] Fixed Python memory leak in map lookup. Previously we were allocating memory on the message's arena every time we performed a `map[key]` or `map.get(key)` operation. This is unnecessary, as the key's data is only needed ephemerally, for the duration of the lookup, and we can therefore alias the Python object's string data instead of copying it. This required fixing a bug in the convert.c operation. Previously in the `arena==NULL` case, if the user passes a bytes object instead of a unicode string, the code would return a pointer to a temporary Python object that had already been freed, leading to use-after-free. I fixed this by referencing the bytes object's data directly, and using utf8_range to verify the UTF-8. Fixes: https://github.com/protocolbuffers/protobuf/issues/14571 PiperOrigin-RevId: 578563555 --- python/BUILD | 1 + python/convert.c | 32 ++++++++++++------- .../google/protobuf/internal/message_test.py | 1 - python/map.c | 8 ++--- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/python/BUILD b/python/BUILD index 92e9c496c5c3..ea1dac316044 100644 --- a/python/BUILD +++ b/python/BUILD @@ -203,5 +203,6 @@ py_extension( "//upb/util:compare", "//upb/util:def_to_proto", "//upb/util:required_fields", + "@utf8_range", ], ) diff --git a/python/convert.c b/python/convert.c index 413c42f7359d..0c1174ba6b7a 100644 --- a/python/convert.c +++ b/python/convert.c @@ -35,6 +35,7 @@ #include "upb/message/map.h" #include "upb/reflection/message.h" #include "upb/util/compare.h" +#include "utf8_range.h" // Must be last. #include "upb/port/def.inc" @@ -259,20 +260,27 @@ bool PyUpb_PyToUpb(PyObject* obj, const upb_FieldDef* f, upb_MessageValue* val, } case kUpb_CType_String: { Py_ssize_t size; - const char* ptr; - PyObject* unicode = NULL; if (PyBytes_Check(obj)) { - unicode = obj = PyUnicode_FromEncodedObject(obj, "utf-8", NULL); - if (!obj) return false; + // Use the object's bytes if they are valid UTF-8. + char* ptr; + if (PyBytes_AsStringAndSize(obj, &ptr, &size) < 0) return false; + if (utf8_range2((const unsigned char*)ptr, size) != 0) { + // Invalid UTF-8. Try to convert the message to a Python Unicode + // object, even though we know this will fail, just to get the + // idiomatic Python error message. + obj = PyUnicode_FromEncodedObject(obj, "utf-8", NULL); + assert(!obj); + return false; + } + *val = PyUpb_MaybeCopyString(ptr, size, arena); + return true; + } else { + const char* ptr; + ptr = PyUnicode_AsUTF8AndSize(obj, &size); + if (PyErr_Occurred()) return false; + *val = PyUpb_MaybeCopyString(ptr, size, arena); + return true; } - ptr = PyUnicode_AsUTF8AndSize(obj, &size); - if (PyErr_Occurred()) { - Py_XDECREF(unicode); - return false; - } - *val = PyUpb_MaybeCopyString(ptr, size, arena); - Py_XDECREF(unicode); - return true; } case kUpb_CType_Message: PyErr_Format(PyExc_ValueError, "Message objects may not be assigned"); diff --git a/python/google/protobuf/internal/message_test.py b/python/google/protobuf/internal/message_test.py index 0bac00da023d..b0f1ae784fa6 100755 --- a/python/google/protobuf/internal/message_test.py +++ b/python/google/protobuf/internal/message_test.py @@ -48,7 +48,6 @@ warnings.simplefilter('error', DeprecationWarning) - @_parameterized.named_parameters(('_proto2', unittest_pb2), ('_proto3', unittest_proto3_arena_pb2)) @testing_refleaks.TestCase diff --git a/python/map.c b/python/map.c index a1d75de9a127..6bf12af43808 100644 --- a/python/map.c +++ b/python/map.c @@ -179,7 +179,7 @@ int PyUpb_MapContainer_AssignSubscript(PyObject* _self, PyObject* key, const upb_FieldDef* val_f = upb_MessageDef_Field(entry_m, 1); upb_Arena* arena = PyUpb_Arena_Get(self->arena); upb_MessageValue u_key, u_val; - if (!PyUpb_PyToUpb(key, key_f, &u_key, arena)) return -1; + if (!PyUpb_PyToUpb(key, key_f, &u_key, NULL)) return -1; if (val) { if (!PyUpb_PyToUpb(val, val_f, &u_val, arena)) return -1; @@ -200,9 +200,8 @@ PyObject* PyUpb_MapContainer_Subscript(PyObject* _self, PyObject* key) { const upb_MessageDef* entry_m = upb_FieldDef_MessageSubDef(f); const upb_FieldDef* key_f = upb_MessageDef_Field(entry_m, 0); const upb_FieldDef* val_f = upb_MessageDef_Field(entry_m, 1); - upb_Arena* arena = PyUpb_Arena_Get(self->arena); upb_MessageValue u_key, u_val; - if (!PyUpb_PyToUpb(key, key_f, &u_key, arena)) return NULL; + if (!PyUpb_PyToUpb(key, key_f, &u_key, NULL)) return NULL; if (!map || !upb_Map_Get(map, u_key, &u_val)) { map = PyUpb_MapContainer_EnsureReified(_self); upb_Arena* arena = PyUpb_Arena_Get(self->arena); @@ -256,9 +255,8 @@ static PyObject* PyUpb_MapContainer_Get(PyObject* _self, PyObject* args, const upb_MessageDef* entry_m = upb_FieldDef_MessageSubDef(f); const upb_FieldDef* key_f = upb_MessageDef_Field(entry_m, 0); const upb_FieldDef* val_f = upb_MessageDef_Field(entry_m, 1); - upb_Arena* arena = PyUpb_Arena_Get(self->arena); upb_MessageValue u_key, u_val; - if (!PyUpb_PyToUpb(key, key_f, &u_key, arena)) return NULL; + if (!PyUpb_PyToUpb(key, key_f, &u_key, NULL)) return NULL; if (map && upb_Map_Get(map, u_key, &u_val)) { return PyUpb_UpbToPy(u_val, val_f, self->arena); }