From 48ad552f6da2fe65f4d0b1834d459eed01191a96 Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Sun, 31 Jul 2022 13:36:58 +0900 Subject: [PATCH 1/6] gh-91146: More reduce allocation size of list from str.split/rsplit Co-authored-by: Inada Naoki --- Objects/unicodeobject.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 355d74fe3bbda7..93ae9f74e9e658 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9698,11 +9698,11 @@ split(PyObject *self, PyObject* out; len1 = PyUnicode_GET_LENGTH(self); kind1 = PyUnicode_KIND(self); - if (maxcount < 0) { - maxcount = len1; - } - if (substring == NULL) + if (substring == NULL) { + if (maxcount < 0) { + maxcount = (len1 + 1) / 2; + } switch (kind1) { case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self)) @@ -9728,9 +9728,13 @@ split(PyObject *self, default: Py_UNREACHABLE(); } + } kind2 = PyUnicode_KIND(substring); len2 = PyUnicode_GET_LENGTH(substring); + if (maxcount < 0) { + maxcount = len1 / len2 + 1; + } if (kind1 < kind2 || len1 < len2) { out = PyList_New(1); if (out == NULL) @@ -9785,11 +9789,11 @@ rsplit(PyObject *self, len1 = PyUnicode_GET_LENGTH(self); kind1 = PyUnicode_KIND(self); - if (maxcount < 0) { - maxcount = len1; - } - if (substring == NULL) + if (substring == NULL) { + if (maxcount < 0) { + maxcount = (len1 + 1) / 2; + } switch (kind1) { case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self)) @@ -9815,9 +9819,12 @@ rsplit(PyObject *self, default: Py_UNREACHABLE(); } - + } kind2 = PyUnicode_KIND(substring); len2 = PyUnicode_GET_LENGTH(substring); + if (maxcount < 0) { + maxcount = len1 / len2 + 1; + } if (kind1 < kind2 || len1 < len2) { out = PyList_New(1); if (out == NULL) From 605e0cf079ce547185ee552ac5c5e097dc1b705f Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Sun, 31 Jul 2022 13:51:37 +0900 Subject: [PATCH 2/6] Update NEWS.d --- .../2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst b/Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst index 52568dbedd1308..9172ca298e8095 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst @@ -1,2 +1,2 @@ Reduce allocation size of :class:`list` from :meth:`str.split` -and :meth:`str.rsplit`. Patch by Dong-hee Na. +and :meth:`str.rsplit`. Patch by Dong-hee Na and Inada Naoki. From cab2a61b06e5a502a6f4b407381e9969ad0c29a0 Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Sun, 31 Jul 2022 14:24:23 +0900 Subject: [PATCH 3/6] Handle divide by zero --- Objects/unicodeobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 93ae9f74e9e658..e34c9e9f67f855 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9733,7 +9733,7 @@ split(PyObject *self, kind2 = PyUnicode_KIND(substring); len2 = PyUnicode_GET_LENGTH(substring); if (maxcount < 0) { - maxcount = len1 / len2 + 1; + maxcount = len2 == 0 ? (len1 + 1) / 2 : (len1 / len2) + 1; } if (kind1 < kind2 || len1 < len2) { out = PyList_New(1); @@ -9823,7 +9823,7 @@ rsplit(PyObject *self, kind2 = PyUnicode_KIND(substring); len2 = PyUnicode_GET_LENGTH(substring); if (maxcount < 0) { - maxcount = len1 / len2 + 1; + maxcount = len2 == 0 ? 
(len1 + 1) / 2 : (len1 / len2) + 1; } if (kind1 < kind2 || len1 < len2) { out = PyList_New(1); From 2ebc918d03af0c60f95f4ca02a866ed13e895e33 Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Sun, 31 Jul 2022 15:04:24 +0900 Subject: [PATCH 4/6] Add comment for len2 == 0 --- Objects/unicodeobject.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e34c9e9f67f855..9466ebb509d683 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9733,7 +9733,8 @@ split(PyObject *self, kind2 = PyUnicode_KIND(substring); len2 = PyUnicode_GET_LENGTH(substring); if (maxcount < 0) { - maxcount = len2 == 0 ? (len1 + 1) / 2 : (len1 / len2) + 1; + // if len2 == 0, it will raise TypeError. + maxcount = len2 == 0 ? 0 : (len1 / len2) + 1; } if (kind1 < kind2 || len1 < len2) { out = PyList_New(1); @@ -9823,7 +9824,8 @@ rsplit(PyObject *self, kind2 = PyUnicode_KIND(substring); len2 = PyUnicode_GET_LENGTH(substring); if (maxcount < 0) { - maxcount = len2 == 0 ? (len1 + 1) / 2 : (len1 / len2) + 1; + // if len2 == 0, it will raise TypeError. + maxcount = len2 == 0 ? 
0 : (len1 / len2) + 1; } if (kind1 < kind2 || len1 < len2) { out = PyList_New(1); From b5edcd807ee72750b8f6b1c904ff30df7cc67c83 Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Sun, 31 Jul 2022 19:09:57 +0900 Subject: [PATCH 5/6] Avoid overflow error --- Objects/unicodeobject.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 9466ebb509d683..deeb79347d4b83 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9701,7 +9701,7 @@ split(PyObject *self, if (substring == NULL) { if (maxcount < 0) { - maxcount = (len1 + 1) / 2; + maxcount = (len1 - 1) / 2 + 1; } switch (kind1) { case PyUnicode_1BYTE_KIND: @@ -9733,7 +9733,7 @@ split(PyObject *self, kind2 = PyUnicode_KIND(substring); len2 = PyUnicode_GET_LENGTH(substring); if (maxcount < 0) { - // if len2 == 0, it will raise TypeError. + // if len2 == 0, it will raise ValueError. maxcount = len2 == 0 ? 0 : (len1 / len2) + 1; } if (kind1 < kind2 || len1 < len2) { @@ -9793,7 +9793,7 @@ rsplit(PyObject *self, if (substring == NULL) { if (maxcount < 0) { - maxcount = (len1 + 1) / 2; + maxcount = (len1 - 1) / 2 + 1; } switch (kind1) { case PyUnicode_1BYTE_KIND: @@ -9824,7 +9824,7 @@ rsplit(PyObject *self, kind2 = PyUnicode_KIND(substring); len2 = PyUnicode_GET_LENGTH(substring); if (maxcount < 0) { - // if len2 == 0, it will raise TypeError. + // if len2 == 0, it will raise ValueError. maxcount = len2 == 0 ? 
0 : (len1 / len2) + 1; } if (kind1 < kind2 || len1 < len2) { From 7f248ef0eae9ac862ea16ebef4b8a286181a156a Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Sun, 31 Jul 2022 20:12:08 +0900 Subject: [PATCH 6/6] Handle overflow case --- Objects/unicodeobject.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index deeb79347d4b83..7ff79953257ee6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9735,6 +9735,8 @@ split(PyObject *self, if (maxcount < 0) { // if len2 == 0, it will raise ValueError. maxcount = len2 == 0 ? 0 : (len1 / len2) + 1; + // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1 + maxcount = maxcount < 0 ? len1 : maxcount; } if (kind1 < kind2 || len1 < len2) { out = PyList_New(1); @@ -9826,6 +9828,8 @@ rsplit(PyObject *self, if (maxcount < 0) { // if len2 == 0, it will raise ValueError. maxcount = len2 == 0 ? 0 : (len1 / len2) + 1; + // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1 + maxcount = maxcount < 0 ? len1 : maxcount; } if (kind1 < kind2 || len1 < len2) { out = PyList_New(1);