From 3632624b11c864fc1875cd9509676e518f805a7d Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Fri, 24 May 2024 11:00:42 +0200 Subject: [PATCH 01/86] Support tuples for `find` & `rfind` --- Objects/bytes_methods.c | 40 ++++++++++++++++-- Objects/clinic/unicodeobject.c.h | 30 +++++-------- Objects/unicodeobject.c | 72 ++++++++++++++++++++++++++++---- 3 files changed, 111 insertions(+), 31 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 981aa57164385e..d9a2d9e692fcbf 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -557,10 +557,26 @@ find_internal(const char *str, Py_ssize_t len, } PyObject * -_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *sub, +_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = find_internal(str, len, "find", sub, start, end, +1); + Py_ssize_t result; + if (PyTuple_Check(subobj)) { + result = -1; + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *subbytes = PyTuple_GET_ITEM(subobj, i); + Py_ssize_t new_result = find_internal(str, len, "find", subbytes, + start, end, +1); + if (new_result == -2) { + return NULL; + } + if (new_result != -1 && (new_result < result || result == -1)) { + result = new_result; + } + } + return PyLong_FromSsize_t(result); + } + result = find_internal(str, len, "find", subobj, start, end, +1); if (result == -2) return NULL; return PyLong_FromSsize_t(result); @@ -582,10 +598,26 @@ _Py_bytes_index(const char *str, Py_ssize_t len, PyObject *sub, } PyObject * -_Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *sub, +_Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = find_internal(str, len, "rfind", sub, start, end, -1); + Py_ssize_t result; + if (PyTuple_Check(subobj)) { + result = -1; + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *subbytes = PyTuple_GET_ITEM(subobj, i); + Py_ssize_t new_result = find_internal(str, len, "rfind", subbytes, + start, end, -1); + if (new_result == -2) { + return NULL; + } + if (new_result > result) { + result = new_result; + } + } + return PyLong_FromSsize_t(result); + } + result = find_internal(str, len, "rfind", subobj, start, end, -1); if (result == -2) return NULL; return PyLong_FromSsize_t(result); diff --git a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h index 78e14b0021d006..87ee50fb4cce65 100644 --- a/Objects/clinic/unicodeobject.c.h +++ b/Objects/clinic/unicodeobject.c.h @@ -357,7 +357,7 @@ unicode_expandtabs(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyOb } PyDoc_STRVAR(unicode_find__doc__, -"find($self, sub[, start[, end]], /)\n" +"find($self, sub, start=None, end=None, /)\n" "--\n" "\n" "Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].\n" @@ -369,14 +369,14 @@ PyDoc_STRVAR(unicode_find__doc__, {"find", _PyCFunction_CAST(unicode_find), METH_FASTCALL, unicode_find__doc__}, static Py_ssize_t -unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t end); static PyObject * unicode_find(PyObject *str, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; - PyObject *substr; + PyObject *subobj; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; Py_ssize_t _return_value; @@ -384,11 +384,7 @@ unicode_find(PyObject *str, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("find", nargs, 1, 3)) { goto exit; } - if (!PyUnicode_Check(args[0])) { - _PyArg_BadArgument("find", "argument 1", "str", args[0]); - goto exit; - } - substr = args[0]; + subobj = args[0]; if (nargs < 2) { goto skip_optional; } @@ -402,7 +398,7 @@ unicode_find(PyObject *str, PyObject *const *args, Py_ssize_t nargs) goto exit; } skip_optional: - _return_value = unicode_find_impl(str, substr, start, end); + _return_value = unicode_find_impl(str, subobj, start, end); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -1060,7 +1056,7 @@ unicode_removesuffix(PyObject *self, PyObject *arg) } PyDoc_STRVAR(unicode_rfind__doc__, -"rfind($self, sub[, start[, end]], /)\n" +"rfind($self, sub, start=None, end=None, /)\n" "--\n" "\n" "Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].\n" @@ -1072,14 +1068,14 @@ PyDoc_STRVAR(unicode_rfind__doc__, {"rfind", _PyCFunction_CAST(unicode_rfind), METH_FASTCALL, unicode_rfind__doc__}, static Py_ssize_t -unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t end); static PyObject * unicode_rfind(PyObject *str, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; - PyObject *substr; + PyObject *subobj; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; Py_ssize_t _return_value; @@ -1087,11 +1083,7 @@ unicode_rfind(PyObject *str, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("rfind", nargs, 1, 3)) { goto exit; } - if (!PyUnicode_Check(args[0])) { - _PyArg_BadArgument("rfind", "argument 1", "str", args[0]); - goto exit; - } - substr = args[0]; + subobj = args[0]; if (nargs < 2) { goto skip_optional; } @@ -1105,7 +1097,7 @@ unicode_rfind(PyObject *str, PyObject *const *args, Py_ssize_t nargs) goto exit; } skip_optional: - _return_value = unicode_rfind_impl(str, substr, start, end); + _return_value = unicode_rfind_impl(str, subobj, start, end); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -1888,4 +1880,4 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=9fee62bd337f809b input=a9049054013a1b77]*/ +/*[clinic end generated code: output=1db638aa49eefba8 input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index afff37467caf32..e8b3615a3740b3 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11334,7 +11334,13 @@ unicode_expandtabs_impl(PyObject *self, int tabsize) } /*[clinic input] -str.find as unicode_find = str.count +str.find as unicode_find -> Py_ssize_t + + self as str: self + sub as subobj: object + start: slice_index(accept={int, NoneType}, c_default='0') = None + end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None + / Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end]. @@ -11343,11 +11349,36 @@ Return -1 on failure. [clinic start generated code]*/ static Py_ssize_t -unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) -/*[clinic end generated code: output=51dbe6255712e278 input=4a89d2d68ef57256]*/ +/*[clinic end generated code: output=80175735a6d549d0 input=51e7b530950ab304]*/ { - Py_ssize_t result = any_find_slice(str, substr, start, end, 1); + Py_ssize_t result; + if (PyTuple_Check(subobj)) { + result = -1; + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *substr = PyTuple_GET_ITEM(subobj, i); + if (!PyUnicode_Check(substr)) { + PyErr_Format(PyExc_TypeError, + "tuple for find must only contain str, " + "not %.100s", + Py_TYPE(substr)->tp_name); + return -1; + } + Py_ssize_t new_result = any_find_slice(str, substr, start, end, 1); + if (new_result != -1 && (new_result < result || result == -1)) { + result = new_result; + } + } + return result; + } + if (!PyUnicode_Check(subobj)) { + PyErr_Format(PyExc_TypeError, + "find first arg must be str or " + "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); + return -1; + } + result = any_find_slice(str, subobj, start, end, 1); if (result < 0) { return -1; } @@ -12496,7 +12527,7 @@ unicode_repr(PyObject *unicode) } /*[clinic input] -str.rfind as unicode_rfind = str.count +str.rfind as unicode_rfind = str.find Return the highest index in S where substring sub is found, such that sub is contained within S[start:end]. @@ -12505,11 +12536,36 @@ Return -1 on failure. [clinic start generated code]*/ static Py_ssize_t -unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) -/*[clinic end generated code: output=880b29f01dd014c8 input=898361fb71f59294]*/ +/*[clinic end generated code: output=9d316eee7b9f9bf0 input=23ae7964e8f70b35]*/ { - Py_ssize_t result = any_find_slice(str, substr, start, end, -1); + Py_ssize_t result; + if (PyTuple_Check(subobj)) { + result = -1; + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *substr = PyTuple_GET_ITEM(subobj, i); + if (!PyUnicode_Check(substr)) { + PyErr_Format(PyExc_TypeError, + "tuple for rfind must only contain str, " + "not %.100s", + Py_TYPE(substr)->tp_name); + return -1; + } + Py_ssize_t new_result = any_find_slice(str, substr, start, end, -1); + if (new_result > result) { + result = new_result; + } + } + return result; + } + if (!PyUnicode_Check(subobj)) { + PyErr_Format(PyExc_TypeError, + "rfind first arg must be str or " + "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); + return -1; + } + result = any_find_slice(str, subobj, start, end, -1); if (result < 0) { return -1; } From e39b040016c2591038fa865549c4133dbb84c534 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Fri, 24 May 2024 11:12:04 +0200 Subject: [PATCH 02/86] Update docs --- Doc/library/stdtypes.rst | 35 +++++++++++++++++++++++++---------- Objects/bytes_methods.c | 8 ++++---- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index c0a3d0b3a2a49e..d84bb5f383d750 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -1724,8 +1724,9 @@ expression support in the :mod:`re` module). .. method:: str.find(sub[, start[, end]]) Return the lowest index in the string where substring *sub* is found within - the slice ``s[start:end]``. Optional arguments *start* and *end* are - interpreted as in slice notation. Return ``-1`` if *sub* is not found. + the slice ``s[start:end]``. *sub* can also be a tuple of substrings to look + for. Optional arguments *start* and *end* are interpreted as in slice + notation. Return ``-1`` if *sub* is not found. .. note:: @@ -1736,6 +1737,9 @@ expression support in the :mod:`re` module). >>> 'Py' in 'Python' True + .. versionchanged:: 3.14 + *sub* can now be a tuple of substrings. + .. method:: str.format(*args, **kwargs) @@ -2030,8 +2034,12 @@ expression support in the :mod:`re` module). .. method:: str.rfind(sub[, start[, end]]) Return the highest index in the string where substring *sub* is found, such - that *sub* is contained within ``s[start:end]``. Optional arguments *start* - and *end* are interpreted as in slice notation. Return ``-1`` on failure. + that *sub* is contained within ``s[start:end]``. *sub* can also be a tuple + of substrings to look for. Optional arguments *start* and *end* are + interpreted as in slice notation. Return ``-1`` on failure. + + .. versionchanged:: 3.14 + *sub* can now be a tuple of substrings. .. method:: str.rindex(sub[, start[, end]]) @@ -2859,9 +2867,10 @@ arbitrary binary data. bytearray.find(sub[, start[, end]]) Return the lowest index in the data where the subsequence *sub* is found, - such that *sub* is contained in the slice ``s[start:end]``. Optional - arguments *start* and *end* are interpreted as in slice notation. Return - ``-1`` if *sub* is not found. + such that *sub* is contained in the slice ``s[start:end]``. *sub* can + also be a tuple of subsequences to look for. Optional arguments *start* + and *end* are interpreted as in slice notation. Return ``-1`` if *sub* + is not found. The subsequence to search for may be any :term:`bytes-like object` or an integer in the range 0 to 255. @@ -2878,6 +2887,9 @@ arbitrary binary data. .. versionchanged:: 3.3 Also accept an integer in the range 0 to 255 as the subsequence. + .. versionchanged:: 3.14 + *sub* can now be a tuple of subsequences. + .. method:: bytes.index(sub[, start[, end]]) bytearray.index(sub[, start[, end]]) @@ -2947,9 +2959,9 @@ arbitrary binary data. bytearray.rfind(sub[, start[, end]]) Return the highest index in the sequence where the subsequence *sub* is - found, such that *sub* is contained within ``s[start:end]``. Optional - arguments *start* and *end* are interpreted as in slice notation. Return - ``-1`` on failure. + found, such that *sub* is contained within ``s[start:end]``. *sub* can + also be a tuple of subsequences to look for. Optional arguments *start* + and *end* are interpreted as in slice notation. Return ``-1`` on failure. The subsequence to search for may be any :term:`bytes-like object` or an integer in the range 0 to 255. @@ -2957,6 +2969,9 @@ arbitrary binary data. .. versionchanged:: 3.3 Also accept an integer in the range 0 to 255 as the subsequence. + .. versionchanged:: 3.14 + *sub* can now be a tuple of subsequences. + .. method:: bytes.rindex(sub[, start[, end]]) bytearray.rindex(sub[, start[, end]]) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index d9a2d9e692fcbf..dd89df087af600 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -564,8 +564,8 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, if (PyTuple_Check(subobj)) { result = -1; for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - PyObject *subbytes = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t new_result = find_internal(str, len, "find", subbytes, + PyObject *subseq = PyTuple_GET_ITEM(subobj, i); + Py_ssize_t new_result = find_internal(str, len, "find", subseq, start, end, +1); if (new_result == -2) { return NULL; @@ -605,8 +605,8 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, if (PyTuple_Check(subobj)) { result = -1; for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - PyObject *subbytes = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t new_result = find_internal(str, len, "rfind", subbytes, + PyObject *subseq = PyTuple_GET_ITEM(subobj, i); + Py_ssize_t new_result = find_internal(str, len, "rfind", subseq, start, end, -1); if (new_result == -2) { return NULL; From cb905bcce9b5ce57d25b495509054b67e03668c8 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Fri, 24 May 2024 12:05:31 +0200 Subject: [PATCH 03/86] Add tests --- Lib/test/string_tests.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 9bb0ce7bb57f8b..8ffafc624d275b 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -217,6 +217,19 @@ def test_find(self): if loc != -1: self.assertEqual(i[loc:loc+len(j)], j) + # test tuple arguments + self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb')) + self.checkequal(2, '__aa__bb__', 'find', ('bb', 'aa')) + self.checkequal(-1, '__aa__bb__', 'find', ('cc', 'dd')) + self.checkequal(-1, '__aa__bb__', 'find', ()) + self.checkequal(6, '__aa__bb__', 'find', ('aa', 'bb'), 3) + self.checkequal(-1, '__aa__bb__', 'find', ('aa', 'cc'), 3) + self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 10) + self.checkequal(-1, '__aa__bb__', 'find', ('aa', 'bb'), 0, 3) + self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 4) + + self.checkraises(TypeError, 'hello', 'find', (42,)) + def test_rfind(self): self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') self.checkequal(12, 'abcdefghiabc', 'rfind', '') @@ -270,6 +283,18 @@ def test_rfind(self): # issue #15534 self.checkequal(0, '<......\u043c...', "rfind", "<") + # test tuple arguments + self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb')) + self.checkequal(6, '__aa__bb__', 'rfind', ('bb', 'aa')) + self.checkequal(-1, '__aa__bb__', 'rfind', ('cc', 'dd')) + self.checkequal(-1, '__aa__bb__', 'rfind', ()) + self.checkequal(-1, '__aa__bb__', 'rfind', ('aa', 'cc'), 3) + self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 0, 10) + self.checkequal(-1, '__aa__bb__', 'rfind', ('aa', 'bb'), 7, 10) + self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 6, 10) + + self.checkraises(TypeError, 'hello', 'rfind', (42,)) + def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') self.checkequal(3, 'abcdefghiabc', 'index', 'def') From 1807fd8200b2e6af21e544142bda717d40e557ee Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Fri, 24 May 2024 11:07:17 +0000 Subject: [PATCH 04/86] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst new file mode 100644 index 00000000000000..b656d5d2a87421 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst @@ -0,0 +1 @@ +Support tuples for :meth:`str.find`, :meth:`bytearray.find`, :meth:`bytes.find`, :meth:`str.rfind`, :meth:`bytearray.rfind` and :meth:`bytes.rfind`. From cca08fa31a7a6db305d8733b4b0d122f2dc4d1e9 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Fri, 24 May 2024 13:37:14 +0200 Subject: [PATCH 05/86] Apply suggestions from code review --- Lib/test/string_tests.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 8ffafc624d275b..45959fbe64771c 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -227,9 +227,6 @@ def test_find(self): self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 10) self.checkequal(-1, '__aa__bb__', 'find', ('aa', 'bb'), 0, 3) self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 4) - - self.checkraises(TypeError, 'hello', 'find', (42,)) - def test_rfind(self): self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') self.checkequal(12, 'abcdefghiabc', 'rfind', '') @@ -292,9 +289,6 @@ def test_rfind(self): self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 0, 10) self.checkequal(-1, '__aa__bb__', 'rfind', ('aa', 'bb'), 7, 10) self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 6, 10) - - self.checkraises(TypeError, 'hello', 'rfind', (42,)) - def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') self.checkequal(3, 'abcdefghiabc', 'index', 'def') From 302faa32d684c91d7e24a0cdd569b73dd9e7fae5 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Fri, 24 May 2024 13:38:07 +0200 Subject: [PATCH 06/86] Apply suggestions from code review --- Lib/test/string_tests.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 45959fbe64771c..0599999de1fde3 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -227,6 +227,7 @@ def test_find(self): self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 10) self.checkequal(-1, '__aa__bb__', 'find', ('aa', 'bb'), 0, 3) self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 4) + def test_rfind(self): self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') self.checkequal(12, 'abcdefghiabc', 'rfind', '') @@ -289,6 +290,7 @@ def test_rfind(self): self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 0, 10) self.checkequal(-1, '__aa__bb__', 'rfind', ('aa', 'bb'), 7, 10) self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 6, 10) + def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') self.checkequal(3, 'abcdefghiabc', 'index', 'def') From cb95578ab7e25cc6b90886ff17f55fb8c6c2691a Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Fri, 24 May 2024 14:13:31 +0200 Subject: [PATCH 07/86] Fix signature tests --- Lib/test/test_inspect/test_inspect.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_inspect/test_inspect.py b/Lib/test/test_inspect/test_inspect.py index 011d42f34b6461..6ba2d85fb00742 100644 --- a/Lib/test/test_inspect/test_inspect.py +++ b/Lib/test/test_inspect/test_inspect.py @@ -5414,7 +5414,7 @@ def test_builtins_have_signatures(self): 'dict': {'pop'}, 'int': {'__round__'}, 'memoryview': {'cast', 'hex'}, - 'str': {'count', 'endswith', 'find', 'index', 'maketrans', 'rfind', 'rindex', 'startswith'}, + 'str': {'count', 'endswith', 'index', 'maketrans', 'rindex', 'startswith'}, } self._test_module_has_signatures(builtins, no_signature, unsupported_signature, @@ -5589,7 +5589,7 @@ def test_typing_module_has_signatures(self): 'Generic': {'__class_getitem__', '__init_subclass__'}, } methods_unsupported_signature = { - 'Text': {'count', 'find', 'index', 'rfind', 'rindex', 'startswith', 'endswith', 'maketrans'}, + 'Text': {'count', 'index', 'rindex', 'startswith', 'endswith', 'maketrans'}, } self._test_module_has_signatures(typing, no_signature, methods_no_signature=methods_no_signature, From a35d3aedf4bb469d20029ccf87a6b0156e88b2ea Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Fri, 24 May 2024 22:48:59 +0200 Subject: [PATCH 08/86] Short circuit --- Objects/bytes_methods.c | 81 ++++++++++++++++++++++++++++++++--------- Objects/unicodeobject.c | 72 ++++++++++++++++++++++++------------ 2 files changed, 111 insertions(+), 42 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index dd89df087af600..0442d0bfc25f88 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -563,15 +563,38 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, Py_ssize_t result; if (PyTuple_Check(subobj)) { result = -1; - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - PyObject *subseq = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t new_result = find_internal(str, len, "find", subseq, - start, end, +1); - if (new_result == -2) { - return NULL; - } - if (new_result != -1 && (new_result < result || result == -1)) { - result = new_result; + ADJUST_INDICES(start, end, len); + // Work in batches of 10000 + for (; result == -1 && start <= end; start += 10000) { + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *subseq = PyTuple_GET_ITEM(subobj, i); + Py_ssize_t sublen; + Py_buffer subbuf; + if (!PyObject_CheckBuffer(subseq)) { + sublen = 1; + } + else if (PyObject_GetBuffer(subseq, &subbuf, + PyBUF_SIMPLE) != 0) + { + return NULL; + } + else { + sublen = subbuf.len; + } + Py_ssize_t cur_end = start + 10000 + sublen; + if (cur_end > end) { + cur_end = end; + } + Py_ssize_t new_result = find_internal(str, len, "find", subseq, + start, cur_end, +1); + if (new_result == -2) { + return NULL; + } + if (new_result != -1 && + (new_result < result || result == -1)) + { + result = new_result; + } } } return PyLong_FromSsize_t(result); @@ -604,15 +627,37 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, Py_ssize_t result; if (PyTuple_Check(subobj)) { result = -1; - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - PyObject *subseq = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t new_result = find_internal(str, len, "rfind", subseq, - start, end, -1); - if (new_result == -2) { - return NULL; - } - if (new_result > result) { - result = new_result; + ADJUST_INDICES(start, end, len); + // Work in batches of 10000 + for (; result == -1 && end >= start; end -= 10000) { + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *subseq = PyTuple_GET_ITEM(subobj, i); + Py_ssize_t sublen; + Py_buffer subbuf; + if (!PyObject_CheckBuffer(subseq)) { + sublen = 1; + } + else if (PyObject_GetBuffer(subseq, &subbuf, + PyBUF_SIMPLE) != 0) + { + return NULL; + } + else { + sublen = subbuf.len; + } + Py_ssize_t cur_start = end - 10000 - sublen; + if (cur_start < start) { + cur_start = start; + } + Py_ssize_t new_result = find_internal(str, len, "rfind", + subseq, cur_start, end, + -1); + if (new_result == -2) { + return NULL; + } + if (new_result > result) { + result = new_result; + } } } return PyLong_FromSsize_t(result); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e8b3615a3740b3..14350a820bd9c1 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11356,18 +11356,31 @@ unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t result; if (PyTuple_Check(subobj)) { result = -1; - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - PyObject *substr = PyTuple_GET_ITEM(subobj, i); - if (!PyUnicode_Check(substr)) { - PyErr_Format(PyExc_TypeError, - "tuple for find must only contain str, " - "not %.100s", - Py_TYPE(substr)->tp_name); - return -1; - } - Py_ssize_t new_result = any_find_slice(str, substr, start, end, 1); - if (new_result != -1 && (new_result < result || result == -1)) { - result = new_result; + Py_ssize_t len = PyUnicode_GET_LENGTH(str); + ADJUST_INDICES(start, end, len); + // Work in batches of 10000 + for (; result == -1 && start <= end; start += 10000) { + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *substr = PyTuple_GET_ITEM(subobj, i); + if (!PyUnicode_Check(substr)) { + PyErr_Format(PyExc_TypeError, + "tuple for find must only contain str, " + "not %.100s", + Py_TYPE(substr)->tp_name); + return -1; + } + Py_ssize_t sublen = PyUnicode_GET_LENGTH(substr); + Py_ssize_t cur_end = start + 10000 + sublen; + if (cur_end > end) { + cur_end = end; + } + Py_ssize_t new_result = any_find_slice(str, substr, start, + cur_end, 1); + if (new_result != -1 && + (new_result < result || result == -1)) + { + result = new_result; + } } } return result; @@ -12543,18 +12556,29 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t result; if (PyTuple_Check(subobj)) { result = -1; - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - PyObject *substr = PyTuple_GET_ITEM(subobj, i); - if (!PyUnicode_Check(substr)) { - PyErr_Format(PyExc_TypeError, - "tuple for rfind must only contain str, " - "not %.100s", - Py_TYPE(substr)->tp_name); - return -1; - } - Py_ssize_t new_result = any_find_slice(str, substr, start, end, -1); - if (new_result > result) { - result = new_result; + Py_ssize_t len = PyUnicode_GET_LENGTH(str); + ADJUST_INDICES(start, end, len); + // Work in batches of 10000 + for (; result == -1 && end >= start; end -= 10000) { + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *substr = PyTuple_GET_ITEM(subobj, i); + if (!PyUnicode_Check(substr)) { + PyErr_Format(PyExc_TypeError, + "tuple for rfind must only contain str, " + "not %.100s", + Py_TYPE(substr)->tp_name); + return -1; + } + Py_ssize_t sublen = PyUnicode_GET_LENGTH(substr); + Py_ssize_t cur_start = end - 10000 - sublen; + if (cur_start < start) { + cur_start = start; + } + Py_ssize_t new_result = any_find_slice(str, substr, cur_start, + end, -1); + if (new_result > result) { + result = new_result; + } } } return result; From 65c0a9ebb6ac85d6d1b792afe253bc19e2176bee Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 07:43:40 +0200 Subject: [PATCH 09/86] Fix start for `rfind` --- Objects/bytes_methods.c | 14 +++++++++----- Objects/unicodeobject.c | 13 ++++++++----- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 0442d0bfc25f88..6c5d9e6352b702 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -630,6 +630,10 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, ADJUST_INDICES(start, end, len); // Work in batches of 10000 for (; result == -1 && end >= start; end -= 10000) { + Py_ssize_t cur_start = end - 10000; + if (cur_start < start) { + cur_start = start; + } for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *subseq = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sublen; @@ -645,13 +649,13 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, else { sublen = subbuf.len; } - Py_ssize_t cur_start = end - 10000 - sublen; - if (cur_start < start) { - cur_start = start; + Py_ssize_t cur_end = end + sublen; + if (cur_end > end) { + cur_end = end; } Py_ssize_t new_result = find_internal(str, len, "rfind", - subseq, cur_start, end, - -1); + subseq, cur_start, + cur_end, -1); if (new_result == -2) { return NULL; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 14350a820bd9c1..46f95a68207637 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12560,6 +12560,10 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, ADJUST_INDICES(start, end, len); // Work in batches of 10000 for (; result == -1 && end >= start; end -= 10000) { + Py_ssize_t cur_start = end - 10000; + if (cur_start < start) { + cur_start = start; + } for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); if (!PyUnicode_Check(substr)) { @@ -12569,13 +12573,12 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_TYPE(substr)->tp_name); return -1; } - Py_ssize_t sublen = PyUnicode_GET_LENGTH(substr); - Py_ssize_t cur_start = end - 10000 - sublen; - if (cur_start < start) { - cur_start = start; + Py_ssize_t cur_end = end + PyUnicode_GET_LENGTH(substr); + if (cur_end > end) { + cur_end = end; } Py_ssize_t new_result = any_find_slice(str, substr, cur_start, - end, -1); + cur_end, -1); if (new_result > result) { result = new_result; } From 5cbb1f01ac02618323b807d873e75278b1d71e97 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 07:55:26 +0200 Subject: [PATCH 10/86] Refactor checks --- Objects/unicodeobject.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 46f95a68207637..de2ca9a09b4a63 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11355,6 +11355,16 @@ unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, { Py_ssize_t result; if (PyTuple_Check(subobj)) { + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *substr = PyTuple_GET_ITEM(subobj, i); + if (!PyUnicode_Check(substr)) { + PyErr_Format(PyExc_TypeError, + "tuple for find must only contain str, " + "not %.100s", + Py_TYPE(substr)->tp_name); + return -1; + } + } result = -1; Py_ssize_t len = PyUnicode_GET_LENGTH(str); ADJUST_INDICES(start, end, len); @@ -11362,13 +11372,6 @@ unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, for (; result == -1 && start <= end; start += 10000) { for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); - if (!PyUnicode_Check(substr)) { - PyErr_Format(PyExc_TypeError, - "tuple for find must only contain str, " - "not %.100s", - Py_TYPE(substr)->tp_name); - return -1; - } Py_ssize_t sublen = PyUnicode_GET_LENGTH(substr); Py_ssize_t cur_end = start + 10000 + sublen; if (cur_end > end) { @@ -12555,6 +12558,16 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, { Py_ssize_t result; if (PyTuple_Check(subobj)) { + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *substr = PyTuple_GET_ITEM(subobj, i); + if (!PyUnicode_Check(substr)) { + PyErr_Format(PyExc_TypeError, + "tuple for rfind must only contain str, " + "not %.100s", + Py_TYPE(substr)->tp_name); + return -1; + } + } result = -1; Py_ssize_t len = PyUnicode_GET_LENGTH(str); ADJUST_INDICES(start, end, len); @@ -12566,13 +12579,6 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, } for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); - if (!PyUnicode_Check(substr)) { - PyErr_Format(PyExc_TypeError, - "tuple for rfind must only contain str, " - "not %.100s", - Py_TYPE(substr)->tp_name); - return -1; - } Py_ssize_t cur_end = end + PyUnicode_GET_LENGTH(substr); if (cur_end > end) { cur_end = end; From 00b2b048ecfcf2b554128bc73bdb8b1a40cf4026 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 08:34:33 +0200 Subject: [PATCH 11/86] Fix end for `rfind` --- Objects/bytes_methods.c | 27 ++++++++++++++------------- Objects/unicodeobject.c | 27 ++++++++++++++------------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 6c5d9e6352b702..c681b78a8f0c60 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -581,12 +581,12 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, else { sublen = subbuf.len; } - Py_ssize_t cur_end = start + 10000 + sublen; - if (cur_end > end) { - cur_end = end; + Py_ssize_t sub_end = start + 10000 + sublen; + if (sub_end > end) { + sub_end = end; } Py_ssize_t new_result = find_internal(str, len, "find", subseq, - start, cur_end, +1); + start, sub_end, +1); if (new_result == -2) { return NULL; } @@ -629,10 +629,11 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, result = -1; ADJUST_INDICES(start, end, len); // Work in batches of 10000 - for (; result == -1 && end >= start; end -= 10000) { - Py_ssize_t cur_start = end - 10000; - if (cur_start < start) { - cur_start = start; + Py_ssize_t cur_end = end; + for (; result == -1 && cur_end >= start; cur_end -= 10000) { + Py_ssize_t sub_start = end - 10000; + if (sub_start < start) { + sub_start = start; } for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *subseq = PyTuple_GET_ITEM(subobj, i); @@ -649,13 +650,13 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, else { sublen = subbuf.len; } - Py_ssize_t cur_end = end + sublen; - if (cur_end > end) { - cur_end = end; + Py_ssize_t sub_end = cur_end + sublen; + if (sub_end > end) { + sub_end = end; } Py_ssize_t new_result = find_internal(str, len, "rfind", - subseq, cur_start, - cur_end, -1); + subseq, sub_start, + sub_end, -1); if (new_result == -2) { return NULL; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index de2ca9a09b4a63..95318a9c5a395a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11373,12 +11373,12 @@ unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sublen = PyUnicode_GET_LENGTH(substr); - Py_ssize_t cur_end = start + 10000 + sublen; - if (cur_end > end) { - cur_end = end; + Py_ssize_t sub_end = start + 10000 + sublen; + if (sub_end > end) { + sub_end = end; } Py_ssize_t new_result = any_find_slice(str, substr, start, - cur_end, 1); + sub_end, 1); if (new_result != -1 && (new_result < result || result == -1)) { @@ -12572,19 +12572,20 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t len = PyUnicode_GET_LENGTH(str); ADJUST_INDICES(start, end, len); // Work in batches of 10000 - for (; result == -1 && end >= start; end -= 10000) { - Py_ssize_t cur_start = end - 10000; - if (cur_start < start) { - cur_start = start; + Py_ssize_t cur_end = end; + for (; result == -1 && cur_end >= start; cur_end -= 10000) { + Py_ssize_t sub_start = end - 10000; + if (sub_start < start) { + sub_start = start; } for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t cur_end = end + PyUnicode_GET_LENGTH(substr); - if (cur_end > end) { - cur_end = end; + Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); + if (sub_end > end) { + sub_end = end; } - Py_ssize_t new_result = any_find_slice(str, substr, cur_start, - cur_end, -1); + Py_ssize_t new_result = any_find_slice(str, substr, sub_start, + sub_end, -1); if (new_result > result) { result = new_result; } From e1246037e7c024e6a64cbc9f72fae930a3ee54b3 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 09:21:17 +0200 Subject: [PATCH 12/86] Adjust indices --- Objects/bytes_methods.c | 29 +++++++++++++++-------------- Objects/unicodeobject.c | 16 ++++++++-------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index c681b78a8f0c60..7b8783d0e7dd7f 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -566,12 +566,13 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, ADJUST_INDICES(start, end, len); // Work in batches of 10000 for (; result == -1 && start <= end; start += 10000) { + Py_ssize_t cur_end = start + 10000; for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *subseq = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t sublen; + Py_ssize_t sub_len; Py_buffer subbuf; if (!PyObject_CheckBuffer(subseq)) { - sublen = 1; + sub_len = 1; } else if (PyObject_GetBuffer(subseq, &subbuf, PyBUF_SIMPLE) != 0) @@ -579,9 +580,9 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, return NULL; } else { - sublen = subbuf.len; + sub_len = subbuf.len; } - Py_ssize_t sub_end = start + 10000 + sublen; + Py_ssize_t sub_end = cur_end + sub_len; if (sub_end > end) { sub_end = end; } @@ -593,7 +594,7 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, if (new_result != -1 && (new_result < result || result == -1)) { - result = new_result; + result = cur_end = new_result; } } } @@ -631,16 +632,16 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, // Work in batches of 10000 Py_ssize_t cur_end = end; for (; result == -1 && cur_end >= start; cur_end -= 10000) { - Py_ssize_t sub_start = end - 10000; - if (sub_start < start) { - sub_start = start; + Py_ssize_t cur_start = cur_end - 10000; + if (cur_start < start) { + cur_start = start; } for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *subseq = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t sublen; + Py_ssize_t sub_len; Py_buffer subbuf; if (!PyObject_CheckBuffer(subseq)) { - sublen = 1; + sub_len = 1; } else if (PyObject_GetBuffer(subseq, &subbuf, PyBUF_SIMPLE) != 0) @@ -648,20 +649,20 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, return NULL; } else { - sublen = subbuf.len; + sub_len = subbuf.len; } - Py_ssize_t sub_end = cur_end + sublen; + Py_ssize_t sub_end = cur_end + sub_len; if (sub_end > end) { sub_end = end; } Py_ssize_t new_result = find_internal(str, len, "rfind", - subseq, sub_start, + subseq, cur_start, sub_end, -1); if (new_result == -2) { return NULL; } if (new_result > result) { - result = new_result; + result = cur_start = new_result; } } } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 95318a9c5a395a..4a878503970052 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11370,10 +11370,10 @@ unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, ADJUST_INDICES(start, end, len); // Work in batches of 10000 for (; result == -1 && start <= end; start += 10000) { + Py_ssize_t cur_end = start + 10000; for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t sublen = PyUnicode_GET_LENGTH(substr); - Py_ssize_t sub_end = start + 10000 + sublen; + Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); if (sub_end > end) { sub_end = end; } @@ -11382,7 +11382,7 @@ unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, if (new_result != -1 && (new_result < result || result == -1)) { - result = new_result; + result = cur_end = new_result; } } } @@ -12574,9 +12574,9 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, // Work in batches of 10000 Py_ssize_t cur_end = end; for (; result == -1 && cur_end >= start; cur_end -= 10000) { - Py_ssize_t sub_start = end - 10000; - if (sub_start < start) { - sub_start = start; + Py_ssize_t cur_start = cur_end - 10000; + if (cur_start < start) { + cur_start = start; } for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); @@ -12584,10 +12584,10 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, if (sub_end > end) { sub_end = end; } - Py_ssize_t new_result = any_find_slice(str, substr, sub_start, + Py_ssize_t new_result = any_find_slice(str, substr, cur_start, sub_end, -1); if (new_result > result) { - result = new_result; + result = cur_start = new_result; } } } From 41b0cd8bae77fa86e11a92ef3522261307151dee Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 09:39:23 +0200 Subject: [PATCH 13/86] Micro optimisation --- Objects/bytes_methods.c | 18 ++++++++++++------ Objects/unicodeobject.c | 18 ++++++++++++------ 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 7b8783d0e7dd7f..a169076e29fa43 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -591,10 +591,12 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, if (new_result == -2) { return NULL; } - if (new_result != -1 && - (new_result < result || result == -1)) - { - result = cur_end = new_result; + if (new_result != -1) { + if (new_result == start) { + return start; + } + cur_end = new_result - 1; + result = new_result; } } } @@ -661,8 +663,12 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, if (new_result == -2) { return NULL; } - if (new_result > result) { - result = cur_start = new_result; + if (new_result != 1) { + if (new_result == cur_end) { + return cur_end; + } + cur_start = new_result + 1; + result = new_result; } } } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4a878503970052..3254418edf2030 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11379,10 +11379,12 @@ unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, } Py_ssize_t new_result = any_find_slice(str, substr, start, sub_end, 1); - if (new_result != -1 && - (new_result < result || result == -1)) - { - result = cur_end = new_result; + if (new_result != -1) { + if (new_result == start) { + return start; + } + cur_end = new_result - 1; + result = new_result; } } } @@ -12586,8 +12588,12 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, } Py_ssize_t new_result = any_find_slice(str, substr, cur_start, sub_end, -1); - if (new_result > result) { - result = cur_start = new_result; + if (new_result != 1) { + if (new_result == cur_end) { + return cur_end; + } + cur_start = new_result + 1; + result = new_result; } } } From 7b83a22f01dee1c7f63854c4459c396f325940b7 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 09:43:03 +0200 Subject: [PATCH 14/86] Fix conversion --- Objects/bytes_methods.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index a169076e29fa43..c9dfd6c0783524 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -593,7 +593,7 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, } if (new_result != -1) { if (new_result == start) { - return start; + return PyLong_FromSsize_t(start); } cur_end = new_result - 1; result = new_result; @@ -665,7 +665,7 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, } if (new_result != 1) { if (new_result == cur_end) { - return cur_end; + return PyLong_FromSsize_t(cur_end); } cur_start = new_result + 1; result = new_result; From c90545862900fc6329e07103b02fc3607483eea3 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 10:06:07 +0200 Subject: [PATCH 15/86] Fix condition --- Objects/bytes_methods.c | 2 +- Objects/unicodeobject.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index c9dfd6c0783524..acfe69e9215745 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -663,7 +663,7 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, if (new_result == -2) { return NULL; } - if (new_result != 1) { + if (new_result != -1) { if (new_result == cur_end) { return PyLong_FromSsize_t(cur_end); } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3254418edf2030..18ea77b6c43cab 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12588,7 +12588,7 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, } Py_ssize_t new_result = any_find_slice(str, substr, cur_start, sub_end, -1); - if (new_result != 1) { + if (new_result != -1) { if (new_result == cur_end) { return cur_end; } From 5c79f248cc25ed1ad2885bbc3b2688538e3ef629 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 12:21:14 +0200 Subject: [PATCH 16/86] Add tests Co-authored-by: d.grigonis --- Lib/test/string_tests.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 0599999de1fde3..b1f6fc4c4f6af1 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -227,6 +227,8 @@ def test_find(self): self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 10) self.checkequal(-1, '__aa__bb__', 'find', ('aa', 'bb'), 0, 3) self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 4) + s = '_' * 9998 + 'aaaa' + '_' * 9998 + self.checkequal(9998, s, 'find', ('aaaa', 'bb')) def test_rfind(self): self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') @@ -290,6 +292,8 @@ def test_rfind(self): self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 0, 10) self.checkequal(-1, '__aa__bb__', 'rfind', ('aa', 'bb'), 7, 10) self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 6, 10) + s = '_' * 9998 + 'aaaa' + '_' * 9998 + self.checkequal(9998, s, 'rfind', ('aaaa', 'bb')) def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') From 148b4713e4381b293dcdca936d17fcf08212dcbc Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 13:11:44 +0200 Subject: [PATCH 17/86] Clarify documentation Co-authored-by: d.grigonis --- Doc/library/stdtypes.rst | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index d84bb5f383d750..9f9f05d53310e2 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -1725,8 +1725,9 @@ expression support in the :mod:`re` module). Return the lowest index in the string where substring *sub* is found within the slice ``s[start:end]``. *sub* can also be a tuple of substrings to look - for. Optional arguments *start* and *end* are interpreted as in slice - notation. Return ``-1`` if *sub* is not found. + for. In this case the returned index, if found, will be the index of the + first match. Optional arguments *start* and *end* are interpreted as in + slice notation. Return ``-1`` if *sub* is not found. .. note:: @@ -2035,7 +2036,8 @@ expression support in the :mod:`re` module). Return the highest index in the string where substring *sub* is found, such that *sub* is contained within ``s[start:end]``. *sub* can also be a tuple - of substrings to look for. Optional arguments *start* and *end* are + of substrings to look for. In this case the returned index, if found, will + be the index of the last match. Optional arguments *start* and *end* are interpreted as in slice notation. Return ``-1`` on failure. .. versionchanged:: 3.14 @@ -2868,9 +2870,10 @@ arbitrary binary data. Return the lowest index in the data where the subsequence *sub* is found, such that *sub* is contained in the slice ``s[start:end]``. *sub* can - also be a tuple of subsequences to look for. Optional arguments *start* - and *end* are interpreted as in slice notation. Return ``-1`` if *sub* - is not found. + also be a tuple of subsequences to look for. In this case the returned + index, if found, will be the index of the first match. Optional arguments + *start* and *end* are interpreted as in slice notation. Return ``-1`` if + *sub* is not found. The subsequence to search for may be any :term:`bytes-like object` or an integer in the range 0 to 255. @@ -2960,8 +2963,10 @@ arbitrary binary data. Return the highest index in the sequence where the subsequence *sub* is found, such that *sub* is contained within ``s[start:end]``. *sub* can - also be a tuple of subsequences to look for. Optional arguments *start* - and *end* are interpreted as in slice notation. Return ``-1`` on failure. + also be a tuple of subsequences to look for. In this case the returned + index, if found, will be the index of the last match. Optional arguments + *start* and *end* are interpreted as in slice notation. Return ``-1`` on + failure. The subsequence to search for may be any :term:`bytes-like object` or an integer in the range 0 to 255. From 351dc8384b28f24f2a4d1bcc7a72cbf62ac3aa87 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 14:22:03 +0200 Subject: [PATCH 18/86] Add constant --- Objects/bytes_methods.c | 14 ++++++++------ Objects/unicodeobject.c | 14 ++++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index acfe69e9215745..d15eccab18b2e0 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -556,6 +556,8 @@ find_internal(const char *str, Py_ssize_t len, return res; } +#define FIND_CHUNK_SIZE 10000 + PyObject * _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) @@ -564,9 +566,9 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, if (PyTuple_Check(subobj)) { result = -1; ADJUST_INDICES(start, end, len); - // Work in batches of 10000 - for (; result == -1 && start <= end; start += 10000) { - Py_ssize_t cur_end = start + 10000; + // Work in chunks + for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { + Py_ssize_t cur_end = start + FIND_CHUNK_SIZE; for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *subseq = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sub_len; @@ -631,10 +633,10 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, if (PyTuple_Check(subobj)) { result = -1; ADJUST_INDICES(start, end, len); - // Work in batches of 10000 + // Work in chunks Py_ssize_t cur_end = end; - for (; result == -1 && cur_end >= start; cur_end -= 10000) { - Py_ssize_t cur_start = cur_end - 10000; + for (; result == -1 && cur_end >= start; cur_end -= FIND_CHUNK_SIZE) { + Py_ssize_t cur_start = cur_end - FIND_CHUNK_SIZE; if (cur_start < start) { cur_start = start; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 18ea77b6c43cab..4e9e1285163bf7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11333,6 +11333,8 @@ unicode_expandtabs_impl(PyObject *self, int tabsize) return NULL; } +#define FIND_CHUNK_SIZE 10000 + /*[clinic input] str.find as unicode_find -> Py_ssize_t @@ -11368,9 +11370,9 @@ unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, result = -1; Py_ssize_t len = PyUnicode_GET_LENGTH(str); ADJUST_INDICES(start, end, len); - // Work in batches of 10000 - for (; result == -1 && start <= end; start += 10000) { - Py_ssize_t cur_end = start + 10000; + // Work in chunks + for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { + Py_ssize_t cur_end = start + FIND_CHUNK_SIZE; for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); @@ -12573,10 +12575,10 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, result = -1; Py_ssize_t len = PyUnicode_GET_LENGTH(str); ADJUST_INDICES(start, end, len); - // Work in batches of 10000 + // Work in chunks Py_ssize_t cur_end = end; - for (; result == -1 && cur_end >= start; cur_end -= 10000) { - Py_ssize_t cur_start = cur_end - 10000; + for (; result == -1 && cur_end >= start; cur_end -= FIND_CHUNK_SIZE) { + Py_ssize_t cur_start = cur_end - FIND_CHUNK_SIZE; if (cur_start < start) { cur_start = start; } From ddaf4b4234254f48dd93f8d6e729ef629a26113a Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 14:33:03 +0200 Subject: [PATCH 19/86] Duplicate constant --- Objects/bytes_methods.c | 5 +++-- Objects/unicodeobject.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index d15eccab18b2e0..9842393b77581b 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -557,6 +557,7 @@ find_internal(const char *str, Py_ssize_t len, } #define FIND_CHUNK_SIZE 10000 +#define RFIND_CHUNK_SIZE FIND_CHUNK_SIZE PyObject * _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, @@ -635,8 +636,8 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, ADJUST_INDICES(start, end, len); // Work in chunks Py_ssize_t cur_end = end; - for (; result == -1 && cur_end >= start; cur_end -= FIND_CHUNK_SIZE) { - Py_ssize_t cur_start = cur_end - FIND_CHUNK_SIZE; + for (; result == -1 && cur_end >= start; cur_end -= RFIND_CHUNK_SIZE) { + Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE; if (cur_start < start) { cur_start = start; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4e9e1285163bf7..9ec29e4984746a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11334,6 +11334,7 @@ unicode_expandtabs_impl(PyObject *self, int tabsize) } #define FIND_CHUNK_SIZE 10000 +#define RFIND_CHUNK_SIZE FIND_CHUNK_SIZE /*[clinic input] str.find as unicode_find -> Py_ssize_t @@ -12577,8 +12578,8 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, ADJUST_INDICES(start, end, len); // Work in chunks Py_ssize_t cur_end = end; - for (; result == -1 && cur_end >= start; cur_end -= FIND_CHUNK_SIZE) { - Py_ssize_t cur_start = cur_end - FIND_CHUNK_SIZE; + for (; result == -1 && cur_end >= start; cur_end -= RFIND_CHUNK_SIZE) { + Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE; if (cur_start < start) { cur_start = start; } From 2b044a112edb300267c27c1971f1005947a4b38d Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 14:59:47 +0200 Subject: [PATCH 20/86] Add tests --- Lib/test/string_tests.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index b1f6fc4c4f6af1..b388a68cce5466 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -229,6 +229,12 @@ def test_find(self): self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 4) s = '_' * 9998 + 'aaaa' + '_' * 9998 self.checkequal(9998, s, 'find', ('aaaa', 'bb')) + self.checkequal(2, 'foobar', 'find', ('ob', 'oba')) + self.checkequal(1, 'foobar', 'find', ('ob', 'oob')) + self.checkequal(0, '', 'find', ('',)) + self.checkequal(2, '__abcd__', 'find', ('cd', 'ab')) + self.checkequal(2, '__abc__', 'find', ('bc', 'ab')) + self.checkequal(1, 'a' + 'b' * 10000, 'find', ('b' * 10000,)) def test_rfind(self): self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') @@ -294,6 +300,13 @@ def test_rfind(self): self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 6, 10) s = '_' * 9998 + 'aaaa' + '_' * 9998 self.checkequal(9998, s, 'rfind', ('aaaa', 'bb')) + self.checkequal(2, 'foobar', 'rfind', ('oba', 'ob')) + self.checkequal(2, 'foobar', 'rfind', ('oob', 'ob')) + self.checkequal(0, '', 'rfind', ('',)) + self.checkequal(4, '__abcd__', 'rfind', ('ab', 'cd')) + self.checkequal(3, '__abc__', 'rfind', ('ab', 'bc')) + self.checkequal(0, 'b' * 10000 + 'a', 'rfind', ('b' * 10000,)) + def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') From a632f25eee44f1bef45105a8524b7ef71579c2ae Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 15:01:08 +0200 Subject: [PATCH 21/86] Remove newline --- Lib/test/string_tests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index b388a68cce5466..00baab39fa0c74 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -307,7 +307,6 @@ def test_rfind(self): self.checkequal(3, '__abc__', 'rfind', ('ab', 'bc')) self.checkequal(0, 'b' * 10000 + 'a', 'rfind', ('b' * 10000,)) - def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') self.checkequal(3, 'abcdefghiabc', 'index', 'def') From ef28dab3b89e3b96f4794c8ab03c50fd6c974e18 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Sat, 25 May 2024 15:09:27 +0200 Subject: [PATCH 22/86] Update Lib/test/string_tests.py Co-authored-by: d.grigonis --- Lib/test/string_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 00baab39fa0c74..a213e0ead48797 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -235,7 +235,7 @@ def test_find(self): self.checkequal(2, '__abcd__', 'find', ('cd', 'ab')) self.checkequal(2, '__abc__', 'find', ('bc', 'ab')) self.checkequal(1, 'a' + 'b' * 10000, 'find', ('b' * 10000,)) - +self.checkequal(1, 'ab' + 'c' * 100000, 'find', ('c' * 100000, 'b' + 'c' * 100000)) def test_rfind(self): self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') self.checkequal(12, 'abcdefghiabc', 'rfind', '') From 4207d54a4d97c1f5b34e1d0be4fdd3029ec8c4f8 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Sat, 25 May 2024 15:18:15 +0200 Subject: [PATCH 23/86] Update Lib/test/string_tests.py --- Lib/test/string_tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index a213e0ead48797..3f04dc08fe432a 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -235,7 +235,8 @@ def test_find(self): self.checkequal(2, '__abcd__', 'find', ('cd', 'ab')) self.checkequal(2, '__abc__', 'find', ('bc', 'ab')) self.checkequal(1, 'a' + 'b' * 10000, 'find', ('b' * 10000,)) -self.checkequal(1, 'ab' + 'c' * 100000, 'find', ('c' * 100000, 'b' + 'c' * 100000)) + self.checkequal(1, 'ab' + 'c' * 100000, 'find', ('c' * 100000, 'b' + 'c' * 100000)) + def test_rfind(self): self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') self.checkequal(12, 'abcdefghiabc', 'rfind', '') From 0dff482124e1dd8bdf34fee307b5ce26be5d8e1e Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Sat, 25 May 2024 16:26:19 +0200 Subject: [PATCH 24/86] Update Lib/test/string_tests.py Co-authored-by: d.grigonis --- Lib/test/string_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 3f04dc08fe432a..acd0ba07d01613 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -307,7 +307,7 @@ def test_rfind(self): self.checkequal(4, '__abcd__', 'rfind', ('ab', 'cd')) self.checkequal(3, '__abc__', 'rfind', ('ab', 'bc')) self.checkequal(0, 'b' * 10000 + 'a', 'rfind', ('b' * 10000,)) - + self.checkequal(2, 'ab' + 'c' * 100000, 'rfind', ('c' * 100000, 'b' + 'c' * 100000)) def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') self.checkequal(3, 'abcdefghiabc', 'index', 'def') From fc0d9eab37efe66cd54c56f5893143cd94bb43a5 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Sat, 25 May 2024 16:27:01 +0200 Subject: [PATCH 25/86] Update Lib/test/string_tests.py --- Lib/test/string_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index acd0ba07d01613..45ebce2e27d18c 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -308,6 +308,7 @@ def test_rfind(self): self.checkequal(3, '__abc__', 'rfind', ('ab', 'bc')) self.checkequal(0, 'b' * 10000 + 'a', 'rfind', ('b' * 10000,)) self.checkequal(2, 'ab' + 'c' * 100000, 'rfind', ('c' * 100000, 'b' + 'c' * 100000)) + def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') self.checkequal(3, 'abcdefghiabc', 'index', 'def') From cd317fd780f91ccc7673c600f945458e8c68e72e Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 17:26:03 +0200 Subject: [PATCH 26/86] Don't check twice on boundary --- Objects/bytes_methods.c | 4 ++-- Objects/unicodeobject.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 9842393b77581b..7b920be31e10e6 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -569,7 +569,7 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, ADJUST_INDICES(start, end, len); // Work in chunks for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { - Py_ssize_t cur_end = start + FIND_CHUNK_SIZE; + Py_ssize_t cur_end = start + FIND_CHUNK_SIZE - 1; for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *subseq = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sub_len; @@ -637,7 +637,7 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, // Work in chunks Py_ssize_t cur_end = end; for (; result == -1 && cur_end >= start; cur_end -= RFIND_CHUNK_SIZE) { - Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE; + Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE + 1; if (cur_start < start) { cur_start = start; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 9ec29e4984746a..3cbbaa8128f461 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11373,7 +11373,7 @@ unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, ADJUST_INDICES(start, end, len); // Work in chunks for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { - Py_ssize_t cur_end = start + FIND_CHUNK_SIZE; + Py_ssize_t cur_end = start + FIND_CHUNK_SIZE - 1; for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); @@ -12579,7 +12579,7 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, // Work in chunks Py_ssize_t cur_end = end; for (; result == -1 && cur_end >= start; cur_end -= RFIND_CHUNK_SIZE) { - Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE; + Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE + 1; if (cur_start < start) { cur_start = start; } From 43e8259de7552b4720eefdfae1806a68f692f335 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Sat, 25 May 2024 20:44:51 +0200 Subject: [PATCH 27/86] Apply suggestions from code review --- Lib/test/string_tests.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 45ebce2e27d18c..85da6a05cb4bae 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -227,6 +227,7 @@ def test_find(self): self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 10) self.checkequal(-1, '__aa__bb__', 'find', ('aa', 'bb'), 0, 3) self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 4) + self.checkraises(TypeError, 'hello', 'find', (None,)) s = '_' * 9998 + 'aaaa' + '_' * 9998 self.checkequal(9998, s, 'find', ('aaaa', 'bb')) self.checkequal(2, 'foobar', 'find', ('ob', 'oba')) @@ -235,7 +236,8 @@ def test_find(self): self.checkequal(2, '__abcd__', 'find', ('cd', 'ab')) self.checkequal(2, '__abc__', 'find', ('bc', 'ab')) self.checkequal(1, 'a' + 'b' * 10000, 'find', ('b' * 10000,)) - self.checkequal(1, 'ab' + 'c' * 100000, 'find', ('c' * 100000, 'b' + 'c' * 100000)) + s = 'ab' + 'c' * 100000 + self.checkequal(1, s, 'find', ('c' * 100000, 'b' + 'c' * 100000)) def test_rfind(self): self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') @@ -299,6 +301,7 @@ def test_rfind(self): self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 0, 10) self.checkequal(-1, '__aa__bb__', 'rfind', ('aa', 'bb'), 7, 10) self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 6, 10) + self.checkraises(TypeError, 'hello', 'rfind', (None,)) s = '_' * 9998 + 'aaaa' + '_' * 9998 self.checkequal(9998, s, 'rfind', ('aaaa', 'bb')) self.checkequal(2, 'foobar', 'rfind', ('oba', 'ob')) @@ -307,7 +310,8 @@ def test_rfind(self): self.checkequal(4, '__abcd__', 'rfind', ('ab', 'cd')) self.checkequal(3, '__abc__', 'rfind', ('ab', 'bc')) self.checkequal(0, 'b' * 10000 + 'a', 'rfind', ('b' * 10000,)) - self.checkequal(2, 'ab' + 'c' * 100000, 'rfind', ('c' * 100000, 'b' + 'c' * 100000)) + s = 'ab' + 'c' * 100000 + self.checkequal(2, s, 'rfind', ('c' * 100000, 'b' + 'c' * 100000)) def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') From 2524dc10439bd95d7217a3dca93057b544783605 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Sat, 25 May 2024 20:48:23 +0200 Subject: [PATCH 28/86] Apply suggestions from code review --- Lib/test/string_tests.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 85da6a05cb4bae..662236646518e7 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -238,6 +238,7 @@ def test_find(self): self.checkequal(1, 'a' + 'b' * 10000, 'find', ('b' * 10000,)) s = 'ab' + 'c' * 100000 self.checkequal(1, s, 'find', ('c' * 100000, 'b' + 'c' * 100000)) + self.checkequal(0, 'foobar', 'find', ('foo',)) def test_rfind(self): self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') @@ -312,6 +313,7 @@ def test_rfind(self): self.checkequal(0, 'b' * 10000 + 'a', 'rfind', ('b' * 10000,)) s = 'ab' + 'c' * 100000 self.checkequal(2, s, 'rfind', ('c' * 100000, 'b' + 'c' * 100000)) + self.checkequal(3, 'foo', 'rfind', ('',)) def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') From dbc8c941668d6f7ccb9d48334d86dc1db7e42ef2 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 20:53:26 +0200 Subject: [PATCH 29/86] Test bytes --- Lib/test/test_bytes.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index 9e1985bb3a7639..16caf2a49f4829 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -644,6 +644,10 @@ def test_find(self): ValueError, r'byte must be in range\(0, 256\)', b.find, index) + # test tuple arguments + self.assertEqual(b.find((i,)), 1) + self.assertEqual(b.find((w,)), -1) + def test_rfind(self): b = self.type2test(b'mississippi') i = 105 @@ -663,6 +667,10 @@ def test_rfind(self): self.assertEqual(b.rfind(i, 3, 9), 7) self.assertEqual(b.rfind(w, 1, 3), -1) + # test tuple arguments + self.assertEqual(b.rfind((i,)), 10) + self.assertEqual(b.rfind((w,)), -1) + def test_index(self): b = self.type2test(b'mississippi') i = 105 From 49a28a06b286ed31953c007e243bcad34bbb1d4b Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 20:59:07 +0200 Subject: [PATCH 30/86] Add more bytes tests --- Lib/test/test_bytes.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index 16caf2a49f4829..abcffc033f3b86 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -647,6 +647,8 @@ def test_find(self): # test tuple arguments self.assertEqual(b.find((i,)), 1) self.assertEqual(b.find((w,)), -1) + self.assertEqual(b.find((i, w)), 1) + self.assertEqual(b.find((w, i)), 1) def test_rfind(self): b = self.type2test(b'mississippi') @@ -670,6 +672,8 @@ def test_rfind(self): # test tuple arguments self.assertEqual(b.rfind((i,)), 10) self.assertEqual(b.rfind((w,)), -1) + self.assertEqual(b.rfind((i, w)), 10) + self.assertEqual(b.rfind((w, i)), 10) def test_index(self): b = self.type2test(b'mississippi') From 0bd606dd77d4a81914a0bd3f145f12858e7576f3 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 22:31:08 +0200 Subject: [PATCH 31/86] Support tuples for index & rindex --- Lib/test/string_tests.py | 10 ++ Lib/test/test_inspect/test_inspect.py | 4 +- Objects/bytes_methods.c | 97 +++++++++-------- Objects/clinic/unicodeobject.c.h | 30 ++---- Objects/unicodeobject.c | 149 +++++++++++++------------- 5 files changed, 152 insertions(+), 138 deletions(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 662236646518e7..7495a95bceb704 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -340,6 +340,11 @@ def test_index(self): else: self.checkraises(TypeError, 'hello', 'index', 42) + # test tuple arguments (should be wrapper around find) + self.checkequal(2, '__aa__bb__', 'index', ('aa', 'bb')) + self.checkequal(2, '__aa__bb__', 'index', ('aa', 'bb')) + self.checkraises(ValueError, '__aa__bb__', 'index', ('cc', 'dd')) + def test_rindex(self): self.checkequal(12, 'abcdefghiabc', 'rindex', '') self.checkequal(3, 'abcdefghiabc', 'rindex', 'def') @@ -366,6 +371,11 @@ def test_rindex(self): else: self.checkraises(TypeError, 'hello', 'rindex', 42) + # test tuple arguments (should be wrapper around rfind) + self.checkequal(6, '__aa__bb__', 'rindex', ('aa', 'bb')) + self.checkequal(6, '__aa__bb__', 'rindex', ('bb', 'aa')) + self.checkraises(ValueError, '__aa__bb__', 'rindex', ('cc', 'dd')) + def test_find_periodic_pattern(self): """Cover the special path for periodic patterns.""" def reference_find(p, s): diff --git a/Lib/test/test_inspect/test_inspect.py b/Lib/test/test_inspect/test_inspect.py index 6ba2d85fb00742..3f0536f10608a8 100644 --- a/Lib/test/test_inspect/test_inspect.py +++ b/Lib/test/test_inspect/test_inspect.py @@ -5414,7 +5414,7 @@ def test_builtins_have_signatures(self): 'dict': {'pop'}, 'int': {'__round__'}, 'memoryview': {'cast', 'hex'}, - 'str': {'count', 'endswith', 'index', 'maketrans', 'rindex', 'startswith'}, + 'str': {'count', 'endswith', 'maketrans', 'startswith'}, } self._test_module_has_signatures(builtins, no_signature, unsupported_signature, @@ -5589,7 +5589,7 @@ def test_typing_module_has_signatures(self): 'Generic': {'__class_getitem__', '__init_subclass__'}, } methods_unsupported_signature = { - 'Text': {'count', 'index', 'rindex', 'startswith', 'endswith', 'maketrans'}, + 'Text': {'count', 'startswith', 'endswith', 'maketrans'}, } self._test_module_has_signatures(typing, no_signature, methods_no_signature=methods_no_signature, diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 7b920be31e10e6..782c7fc5b3c394 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -559,13 +559,12 @@ find_internal(const char *str, Py_ssize_t len, #define FIND_CHUNK_SIZE 10000 #define RFIND_CHUNK_SIZE FIND_CHUNK_SIZE -PyObject * -_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, - Py_ssize_t start, Py_ssize_t end) +static Py_ssize_t +bytes_find_internal(const char *str, Py_ssize_t len, const char *function_name, + PyObject *subobj, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result; if (PyTuple_Check(subobj)) { - result = -1; + Py_ssize_t result = -1; ADJUST_INDICES(start, end, len); // Work in chunks for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { @@ -580,7 +579,7 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, else if (PyObject_GetBuffer(subseq, &subbuf, PyBUF_SIMPLE) != 0) { - return NULL; + return -2; } else { sub_len = subbuf.len; @@ -589,50 +588,54 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, if (sub_end > end) { sub_end = end; } - Py_ssize_t new_result = find_internal(str, len, "find", subseq, - start, sub_end, +1); + Py_ssize_t new_result = find_internal(str, len, function_name, + subseq, start, sub_end, + +1); if (new_result == -2) { - return NULL; + return -2; } if (new_result != -1) { if (new_result == start) { - return PyLong_FromSsize_t(start); + return start; } cur_end = new_result - 1; result = new_result; } } } - return PyLong_FromSsize_t(result); + return result; } - result = find_internal(str, len, "find", subobj, start, end, +1); - if (result == -2) - return NULL; - return PyLong_FromSsize_t(result); + return find_internal(str, len, function_name, subobj, start, end, +1); } PyObject * -_Py_bytes_index(const char *str, Py_ssize_t len, PyObject *sub, +_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, + Py_ssize_t start, Py_ssize_t end) +{ + Py_ssize_t result = bytes_find_internal(str, len, "find", subobj, start, + end); + return result == -2 ? NULL : PyLong_FromSsize_t(result); +} + +PyObject * +_Py_bytes_index(const char *str, Py_ssize_t len, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = find_internal(str, len, "index", sub, start, end, +1); - if (result == -2) - return NULL; + Py_ssize_t result = bytes_find_internal(str, len, "index", subobj, start, + end); if (result == -1) { - PyErr_SetString(PyExc_ValueError, - "subsection not found"); - return NULL; + PyErr_SetString(PyExc_ValueError, "subsection not found"); } - return PyLong_FromSsize_t(result); + return result < 0 ? NULL : PyLong_FromSsize_t(result); } -PyObject * -_Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, - Py_ssize_t start, Py_ssize_t end) +static Py_ssize_t +bytes_rfind_internal(const char *str, Py_ssize_t len, + const char *function_name, PyObject *subobj, + Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result; if (PyTuple_Check(subobj)) { - result = -1; + Py_ssize_t result = -1; ADJUST_INDICES(start, end, len); // Work in chunks Py_ssize_t cur_end = end; @@ -651,7 +654,7 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, else if (PyObject_GetBuffer(subseq, &subbuf, PyBUF_SIMPLE) != 0) { - return NULL; + return -2; } else { sub_len = subbuf.len; @@ -660,42 +663,46 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, if (sub_end > end) { sub_end = end; } - Py_ssize_t new_result = find_internal(str, len, "rfind", + Py_ssize_t new_result = find_internal(str, len, function_name, subseq, cur_start, sub_end, -1); if (new_result == -2) { - return NULL; + return -2; } if (new_result != -1) { if (new_result == cur_end) { - return PyLong_FromSsize_t(cur_end); + return cur_end; } cur_start = new_result + 1; result = new_result; } } } - return PyLong_FromSsize_t(result); + return result; } - result = find_internal(str, len, "rfind", subobj, start, end, -1); - if (result == -2) - return NULL; - return PyLong_FromSsize_t(result); + return find_internal(str, len, function_name, subobj, start, end, -1); } + PyObject * -_Py_bytes_rindex(const char *str, Py_ssize_t len, PyObject *sub, +_Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, + Py_ssize_t start, Py_ssize_t end) +{ + Py_ssize_t result = bytes_rfind_internal(str, len, "rfind", subobj, start, + end); + return result == -2 ? NULL : PyLong_FromSsize_t(result); +} + +PyObject * +_Py_bytes_rindex(const char *str, Py_ssize_t len, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = find_internal(str, len, "rindex", sub, start, end, -1); - if (result == -2) - return NULL; + Py_ssize_t result = bytes_rfind_internal(str, len, "rindex", subobj, start, + end); if (result == -1) { - PyErr_SetString(PyExc_ValueError, - "subsection not found"); - return NULL; + PyErr_SetString(PyExc_ValueError, "subsection not found"); } - return PyLong_FromSsize_t(result); + return result < 0 ? NULL : PyLong_FromSsize_t(result); } PyObject * diff --git a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h index 87ee50fb4cce65..ef86f67dcaee9a 100644 --- a/Objects/clinic/unicodeobject.c.h +++ b/Objects/clinic/unicodeobject.c.h @@ -409,7 +409,7 @@ unicode_find(PyObject *str, PyObject *const *args, Py_ssize_t nargs) } PyDoc_STRVAR(unicode_index__doc__, -"index($self, sub[, start[, end]], /)\n" +"index($self, sub, start=None, end=None, /)\n" "--\n" "\n" "Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].\n" @@ -421,14 +421,14 @@ PyDoc_STRVAR(unicode_index__doc__, {"index", _PyCFunction_CAST(unicode_index), METH_FASTCALL, unicode_index__doc__}, static Py_ssize_t -unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_index_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t end); static PyObject * unicode_index(PyObject *str, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; - PyObject *substr; + PyObject *subobj; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; Py_ssize_t _return_value; @@ -436,11 +436,7 @@ unicode_index(PyObject *str, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("index", nargs, 1, 3)) { goto exit; } - if (!PyUnicode_Check(args[0])) { - _PyArg_BadArgument("index", "argument 1", "str", args[0]); - goto exit; - } - substr = args[0]; + subobj = args[0]; if (nargs < 2) { goto skip_optional; } @@ -454,7 +450,7 @@ unicode_index(PyObject *str, PyObject *const *args, Py_ssize_t nargs) goto exit; } skip_optional: - _return_value = unicode_index_impl(str, substr, start, end); + _return_value = unicode_index_impl(str, subobj, start, end); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -1108,7 +1104,7 @@ unicode_rfind(PyObject *str, PyObject *const *args, Py_ssize_t nargs) } PyDoc_STRVAR(unicode_rindex__doc__, -"rindex($self, sub[, start[, end]], /)\n" +"rindex($self, sub, start=None, end=None, /)\n" "--\n" "\n" "Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].\n" @@ -1120,14 +1116,14 @@ PyDoc_STRVAR(unicode_rindex__doc__, {"rindex", _PyCFunction_CAST(unicode_rindex), METH_FASTCALL, unicode_rindex__doc__}, static Py_ssize_t -unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_rindex_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t end); static PyObject * unicode_rindex(PyObject *str, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; - PyObject *substr; + PyObject *subobj; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; Py_ssize_t _return_value; @@ -1135,11 +1131,7 @@ unicode_rindex(PyObject *str, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("rindex", nargs, 1, 3)) { goto exit; } - if (!PyUnicode_Check(args[0])) { - _PyArg_BadArgument("rindex", "argument 1", "str", args[0]); - goto exit; - } - substr = args[0]; + subobj = args[0]; if (nargs < 2) { goto skip_optional; } @@ -1153,7 +1145,7 @@ unicode_rindex(PyObject *str, PyObject *const *args, Py_ssize_t nargs) goto exit; } skip_optional: - _return_value = unicode_rindex_impl(str, substr, start, end); + _return_value = unicode_rindex_impl(str, subobj, start, end); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -1880,4 +1872,4 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=1db638aa49eefba8 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=5fb071b9e4dc87fa input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3cbbaa8128f461..3e76d36e543d6f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11336,39 +11336,22 @@ unicode_expandtabs_impl(PyObject *self, int tabsize) #define FIND_CHUNK_SIZE 10000 #define RFIND_CHUNK_SIZE FIND_CHUNK_SIZE -/*[clinic input] -str.find as unicode_find -> Py_ssize_t - - self as str: self - sub as subobj: object - start: slice_index(accept={int, NoneType}, c_default='0') = None - end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None - / - -Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end]. - -Optional arguments start and end are interpreted as in slice notation. -Return -1 on failure. -[clinic start generated code]*/ - static Py_ssize_t -unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, - Py_ssize_t end) -/*[clinic end generated code: output=80175735a6d549d0 input=51e7b530950ab304]*/ +unicode_find_internal(PyObject *str, const char *function_name, + PyObject *subobj, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result; if (PyTuple_Check(subobj)) { for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); if (!PyUnicode_Check(substr)) { PyErr_Format(PyExc_TypeError, - "tuple for find must only contain str, " - "not %.100s", + "tuple for %.200s must only contain str, " + "not %.100s", function_name, Py_TYPE(substr)->tp_name); - return -1; + return -2; } } - result = -1; + Py_ssize_t result = -1; Py_ssize_t len = PyUnicode_GET_LENGTH(str); ADJUST_INDICES(start, end, len); // Work in chunks @@ -11395,15 +11378,36 @@ unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, } if (!PyUnicode_Check(subobj)) { PyErr_Format(PyExc_TypeError, - "find first arg must be str or " - "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); - return -1; - } - result = any_find_slice(str, subobj, start, end, 1); - if (result < 0) { - return -1; + "find %.200s arg must be str or " + "a tuple of str, not %.100s", function_name, + Py_TYPE(subobj)->tp_name); + return -2; } - return result; + return any_find_slice(str, subobj, start, end, 1); +} + +/*[clinic input] +str.find as unicode_find -> Py_ssize_t + + self as str: self + sub as subobj: object + start: slice_index(accept={int, NoneType}, c_default='0') = None + end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None + / + +Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end]. + +Optional arguments start and end are interpreted as in slice notation. +Return -1 on failure. +[clinic start generated code]*/ + +static Py_ssize_t +unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, + Py_ssize_t end) +/*[clinic end generated code: output=80175735a6d549d0 input=51e7b530950ab304]*/ +{ + Py_ssize_t result = unicode_find_internal(str, "find", subobj, start, end); + return result < 0 ? -1 : result; } static PyObject * @@ -11447,7 +11451,7 @@ unicode_hash(PyObject *self) } /*[clinic input] -str.index as unicode_index = str.count +str.index as unicode_index = str.find Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end]. @@ -11456,18 +11460,16 @@ Raises ValueError when the substring is not found. [clinic start generated code]*/ static Py_ssize_t -unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_index_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) -/*[clinic end generated code: output=77558288837cdf40 input=d986aeac0be14a1c]*/ +/*[clinic end generated code: output=c9af24adf2f1f99e input=f0033cf1698b6108]*/ { - Py_ssize_t result = any_find_slice(str, substr, start, end, 1); + Py_ssize_t result = unicode_find_internal(str, "index", subobj, start, + end); if (result == -1) { PyErr_SetString(PyExc_ValueError, "substring not found"); } - else if (result < 0) { - return -1; - } - return result; + return result < 0 ? -1 : result; } /*[clinic input] @@ -12547,33 +12549,22 @@ unicode_repr(PyObject *unicode) return repr; } -/*[clinic input] -str.rfind as unicode_rfind = str.find - -Return the highest index in S where substring sub is found, such that sub is contained within S[start:end]. - -Optional arguments start and end are interpreted as in slice notation. -Return -1 on failure. -[clinic start generated code]*/ - static Py_ssize_t -unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, - Py_ssize_t end) -/*[clinic end generated code: output=9d316eee7b9f9bf0 input=23ae7964e8f70b35]*/ +unicode_rfind_internal(PyObject *str, const char *function_name, + PyObject *subobj, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result; if (PyTuple_Check(subobj)) { for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); if (!PyUnicode_Check(substr)) { PyErr_Format(PyExc_TypeError, - "tuple for rfind must only contain str, " - "not %.100s", + "tuple for %.200s must only contain str, " + "not %.100s", function_name, Py_TYPE(substr)->tp_name); - return -1; + return -2; } } - result = -1; + Py_ssize_t result = -1; Py_ssize_t len = PyUnicode_GET_LENGTH(str); ADJUST_INDICES(start, end, len); // Work in chunks @@ -12604,19 +12595,35 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, } if (!PyUnicode_Check(subobj)) { PyErr_Format(PyExc_TypeError, - "rfind first arg must be str or " - "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); - return -1; - } - result = any_find_slice(str, subobj, start, end, -1); - if (result < 0) { - return -1; + "%.200s first arg must be str or " + "a tuple of str, not %.100s", function_name, + Py_TYPE(subobj)->tp_name); + return -2; } - return result; + return any_find_slice(str, subobj, start, end, -1); } /*[clinic input] -str.rindex as unicode_rindex = str.count +str.rfind as unicode_rfind = str.find + +Return the highest index in S where substring sub is found, such that sub is contained within S[start:end]. + +Optional arguments start and end are interpreted as in slice notation. +Return -1 on failure. +[clinic start generated code]*/ + +static Py_ssize_t +unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, + Py_ssize_t end) +/*[clinic end generated code: output=9d316eee7b9f9bf0 input=23ae7964e8f70b35]*/ +{ + Py_ssize_t result = unicode_rfind_internal(str, "rfind", subobj, start, + end); + return result < 0 ? -1 : result; +} + +/*[clinic input] +str.rindex as unicode_rindex = str.find Return the highest index in S where substring sub is found, such that sub is contained within S[start:end]. @@ -12625,18 +12632,16 @@ Raises ValueError when the substring is not found. [clinic start generated code]*/ static Py_ssize_t -unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_rindex_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) -/*[clinic end generated code: output=5f3aef124c867fe1 input=35943dead6c1ea9d]*/ +/*[clinic end generated code: output=847d553d0dc10a86 input=990f3925b149c1bc]*/ { - Py_ssize_t result = any_find_slice(str, substr, start, end, -1); + Py_ssize_t result = unicode_rfind_internal(str, "rindex", subobj, start, + end); if (result == -1) { PyErr_SetString(PyExc_ValueError, "substring not found"); } - else if (result < 0) { - return -1; - } - return result; + return result < 0 ? -1 : result; } /*[clinic input] From b337fdcb1b8bd58998638ec6ac10388f8f2769a2 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Sat, 25 May 2024 22:33:45 +0200 Subject: [PATCH 32/86] Update Objects/bytes_methods.c --- Objects/bytes_methods.c | 1 - 1 file changed, 1 deletion(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 782c7fc5b3c394..62804a3a23be8c 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -683,7 +683,6 @@ bytes_rfind_internal(const char *str, Py_ssize_t len, return find_internal(str, len, function_name, subobj, start, end, -1); } - PyObject * _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) From e43373f9b0fb69b07eb8f9bf73c0b22bca1b55f9 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Sat, 25 May 2024 22:37:42 +0200 Subject: [PATCH 33/86] Update Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst --- .../2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst index b656d5d2a87421..c38c1cbf051393 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst @@ -1 +1,4 @@ -Support tuples for :meth:`str.find`, :meth:`bytearray.find`, :meth:`bytes.find`, :meth:`str.rfind`, :meth:`bytearray.rfind` and :meth:`bytes.rfind`. +Support tuples for :meth:`str.find`, :meth:`bytearray.find`, :meth:`bytes.find`, + :meth:`str.index`, :meth:`bytearray.index`, :meth:`bytes.index`, + :meth:`str.rfind`, :meth:`bytearray.rfind`, :meth:`bytes.rfind`, + :meth:`str.rindex`, :meth:`bytearray.rindex` and :meth:`bytes.rindex`. From b47b0e07faaea36a6157b0b4f0fb91253c9b3128 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 25 May 2024 22:42:18 +0200 Subject: [PATCH 34/86] Update docs --- Doc/library/stdtypes.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index 9f9f05d53310e2..b7aba0803e41c4 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -1794,6 +1794,9 @@ expression support in the :mod:`re` module). Like :meth:`~str.find`, but raise :exc:`ValueError` when the substring is not found. + .. versionchanged:: 3.14 + *sub* can now be a tuple of substrings. + .. method:: str.isalnum() @@ -2049,6 +2052,9 @@ expression support in the :mod:`re` module). Like :meth:`rfind` but raises :exc:`ValueError` when the substring *sub* is not found. + .. versionchanged:: 3.14 + *sub* can now be a tuple of substrings. + .. method:: str.rjust(width[, fillchar]) @@ -2906,6 +2912,9 @@ arbitrary binary data. .. versionchanged:: 3.3 Also accept an integer in the range 0 to 255 as the subsequence. + .. versionchanged:: 3.14 + *sub* can now be a tuple of subsequences. + .. method:: bytes.join(iterable) bytearray.join(iterable) @@ -2990,6 +2999,9 @@ arbitrary binary data. .. versionchanged:: 3.3 Also accept an integer in the range 0 to 255 as the subsequence. + .. versionchanged:: 3.14 + *sub* can now be a tuple of subsequences. + .. method:: bytes.rpartition(sep) bytearray.rpartition(sep) From 6f71b398755e718ec96d44fc5184d85e6dcf28cd Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 26 May 2024 07:08:46 +0200 Subject: [PATCH 35/86] Refactor code --- Objects/bytes_methods.c | 84 ++++++++--------- Objects/unicodeobject.c | 203 ++++++++++++++++++---------------------- 2 files changed, 127 insertions(+), 160 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 62804a3a23be8c..979f7350f4c77d 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -560,13 +560,18 @@ find_internal(const char *str, Py_ssize_t len, #define RFIND_CHUNK_SIZE FIND_CHUNK_SIZE static Py_ssize_t -bytes_find_internal(const char *str, Py_ssize_t len, const char *function_name, - PyObject *subobj, Py_ssize_t start, Py_ssize_t end) +find_first_internal(const char *str, Py_ssize_t len, const char *function_name, + PyObject *subobj, Py_ssize_t start, Py_ssize_t end, + int direction) { - if (PyTuple_Check(subobj)) { - Py_ssize_t result = -1; - ADJUST_INDICES(start, end, len); - // Work in chunks + if (!PyTuple_Check(subobj)) { + return find_internal(str, len, function_name, subobj, start, end, + direction); + } + Py_ssize_t result = -1; + ADJUST_INDICES(start, end, len); + // Work in chunks + if (direction > 0) { for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { Py_ssize_t cur_end = start + FIND_CHUNK_SIZE - 1; for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { @@ -603,41 +608,8 @@ bytes_find_internal(const char *str, Py_ssize_t len, const char *function_name, } } } - return result; } - return find_internal(str, len, function_name, subobj, start, end, +1); -} - -PyObject * -_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, - Py_ssize_t start, Py_ssize_t end) -{ - Py_ssize_t result = bytes_find_internal(str, len, "find", subobj, start, - end); - return result == -2 ? NULL : PyLong_FromSsize_t(result); -} - -PyObject * -_Py_bytes_index(const char *str, Py_ssize_t len, PyObject *subobj, - Py_ssize_t start, Py_ssize_t end) -{ - Py_ssize_t result = bytes_find_internal(str, len, "index", subobj, start, - end); - if (result == -1) { - PyErr_SetString(PyExc_ValueError, "subsection not found"); - } - return result < 0 ? NULL : PyLong_FromSsize_t(result); -} - -static Py_ssize_t -bytes_rfind_internal(const char *str, Py_ssize_t len, - const char *function_name, PyObject *subobj, - Py_ssize_t start, Py_ssize_t end) -{ - if (PyTuple_Check(subobj)) { - Py_ssize_t result = -1; - ADJUST_INDICES(start, end, len); - // Work in chunks + else { Py_ssize_t cur_end = end; for (; result == -1 && cur_end >= start; cur_end -= RFIND_CHUNK_SIZE) { Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE + 1; @@ -678,17 +650,37 @@ bytes_rfind_internal(const char *str, Py_ssize_t len, } } } - return result; } - return find_internal(str, len, function_name, subobj, start, end, -1); + return result; +} + +PyObject * +_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, + Py_ssize_t start, Py_ssize_t end) +{ + Py_ssize_t result = find_first_internal(str, len, "find", subobj, start, + end, +1); + return result == -2 ? NULL : PyLong_FromSsize_t(result); +} + +PyObject * +_Py_bytes_index(const char *str, Py_ssize_t len, PyObject *subobj, + Py_ssize_t start, Py_ssize_t end) +{ + Py_ssize_t result = find_first_internal(str, len, "index", subobj, start, + end, +1); + if (result == -1) { + PyErr_SetString(PyExc_ValueError, "subsection not found"); + } + return result < 0 ? NULL : PyLong_FromSsize_t(result); } PyObject * _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = bytes_rfind_internal(str, len, "rfind", subobj, start, - end); + Py_ssize_t result = find_first_internal(str, len, "rfind", subobj, start, + end, -1); return result == -2 ? NULL : PyLong_FromSsize_t(result); } @@ -696,8 +688,8 @@ PyObject * _Py_bytes_rindex(const char *str, Py_ssize_t len, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = bytes_rfind_internal(str, len, "rindex", subobj, start, - end); + Py_ssize_t result = find_first_internal(str, len, "rindex", subobj, start, + end, -1); if (result == -1) { PyErr_SetString(PyExc_ValueError, "subsection not found"); } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3e76d36e543d6f..3c51d5454b7ee9 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9137,6 +9137,87 @@ any_find_slice(PyObject* s1, PyObject* s2, return result; } +#define FIND_CHUNK_SIZE 10000 +#define RFIND_CHUNK_SIZE FIND_CHUNK_SIZE + +static Py_ssize_t +any_find_first_slice(PyObject *str, const char *function_name, + PyObject *subobj, Py_ssize_t start, Py_ssize_t end, + int direction) +{ + if (!PyTuple_Check(subobj)) { + if (!PyUnicode_Check(subobj)) { + PyErr_Format(PyExc_TypeError, + "find %.200s arg must be str or " + "a tuple of str, not %.100s", function_name, + Py_TYPE(subobj)->tp_name); + return -2; + } + return any_find_slice(str, subobj, start, end, direction); + } + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *substr = PyTuple_GET_ITEM(subobj, i); + if (!PyUnicode_Check(substr)) { + PyErr_Format(PyExc_TypeError, + "tuple for %.200s must only contain str, " + "not %.100s", function_name, + Py_TYPE(substr)->tp_name); + return -2; + } + } + Py_ssize_t result = -1; + Py_ssize_t len = PyUnicode_GET_LENGTH(str); + ADJUST_INDICES(start, end, len); + // Work in chunks + if (direction > 0) { + for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { + Py_ssize_t cur_end = start + FIND_CHUNK_SIZE - 1; + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *substr = PyTuple_GET_ITEM(subobj, i); + Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); + if (sub_end > end) { + sub_end = end; + } + Py_ssize_t new_result = any_find_slice(str, substr, start, + sub_end, +1); + if (new_result != -1) { + if (new_result == start) { + return start; + } + cur_end = new_result - 1; + result = new_result; + } + } + } + } + else { + Py_ssize_t cur_end = end; + for (; result == -1 && cur_end >= start; cur_end -= RFIND_CHUNK_SIZE) { + Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE + 1; + if (cur_start < start) { + cur_start = start; + } + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + PyObject *substr = PyTuple_GET_ITEM(subobj, i); + Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); + if (sub_end > end) { + sub_end = end; + } + Py_ssize_t new_result = any_find_slice(str, substr, cur_start, + sub_end, -1); + if (new_result != -1) { + if (new_result == cur_end) { + return cur_end; + } + cur_start = new_result + 1; + result = new_result; + } + } + } + } + return result; +} + /* _PyUnicode_InsertThousandsGrouping() helper functions */ #include "stringlib/localeutil.h" @@ -11333,59 +11414,6 @@ unicode_expandtabs_impl(PyObject *self, int tabsize) return NULL; } -#define FIND_CHUNK_SIZE 10000 -#define RFIND_CHUNK_SIZE FIND_CHUNK_SIZE - -static Py_ssize_t -unicode_find_internal(PyObject *str, const char *function_name, - PyObject *subobj, Py_ssize_t start, Py_ssize_t end) -{ - if (PyTuple_Check(subobj)) { - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - PyObject *substr = PyTuple_GET_ITEM(subobj, i); - if (!PyUnicode_Check(substr)) { - PyErr_Format(PyExc_TypeError, - "tuple for %.200s must only contain str, " - "not %.100s", function_name, - Py_TYPE(substr)->tp_name); - return -2; - } - } - Py_ssize_t result = -1; - Py_ssize_t len = PyUnicode_GET_LENGTH(str); - ADJUST_INDICES(start, end, len); - // Work in chunks - for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { - Py_ssize_t cur_end = start + FIND_CHUNK_SIZE - 1; - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - PyObject *substr = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); - if (sub_end > end) { - sub_end = end; - } - Py_ssize_t new_result = any_find_slice(str, substr, start, - sub_end, 1); - if (new_result != -1) { - if (new_result == start) { - return start; - } - cur_end = new_result - 1; - result = new_result; - } - } - } - return result; - } - if (!PyUnicode_Check(subobj)) { - PyErr_Format(PyExc_TypeError, - "find %.200s arg must be str or " - "a tuple of str, not %.100s", function_name, - Py_TYPE(subobj)->tp_name); - return -2; - } - return any_find_slice(str, subobj, start, end, 1); -} - /*[clinic input] str.find as unicode_find -> Py_ssize_t @@ -11406,7 +11434,8 @@ unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) /*[clinic end generated code: output=80175735a6d549d0 input=51e7b530950ab304]*/ { - Py_ssize_t result = unicode_find_internal(str, "find", subobj, start, end); + Py_ssize_t result = any_find_first_slice(str, "find", subobj, start, end, + +1); return result < 0 ? -1 : result; } @@ -11464,8 +11493,8 @@ unicode_index_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) /*[clinic end generated code: output=c9af24adf2f1f99e input=f0033cf1698b6108]*/ { - Py_ssize_t result = unicode_find_internal(str, "index", subobj, start, - end); + Py_ssize_t result = any_find_first_slice(str, "index", subobj, start, end, + +1); if (result == -1) { PyErr_SetString(PyExc_ValueError, "substring not found"); } @@ -12549,60 +12578,6 @@ unicode_repr(PyObject *unicode) return repr; } -static Py_ssize_t -unicode_rfind_internal(PyObject *str, const char *function_name, - PyObject *subobj, Py_ssize_t start, Py_ssize_t end) -{ - if (PyTuple_Check(subobj)) { - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - PyObject *substr = PyTuple_GET_ITEM(subobj, i); - if (!PyUnicode_Check(substr)) { - PyErr_Format(PyExc_TypeError, - "tuple for %.200s must only contain str, " - "not %.100s", function_name, - Py_TYPE(substr)->tp_name); - return -2; - } - } - Py_ssize_t result = -1; - Py_ssize_t len = PyUnicode_GET_LENGTH(str); - ADJUST_INDICES(start, end, len); - // Work in chunks - Py_ssize_t cur_end = end; - for (; result == -1 && cur_end >= start; cur_end -= RFIND_CHUNK_SIZE) { - Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE + 1; - if (cur_start < start) { - cur_start = start; - } - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - PyObject *substr = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); - if (sub_end > end) { - sub_end = end; - } - Py_ssize_t new_result = any_find_slice(str, substr, cur_start, - sub_end, -1); - if (new_result != -1) { - if (new_result == cur_end) { - return cur_end; - } - cur_start = new_result + 1; - result = new_result; - } - } - } - return result; - } - if (!PyUnicode_Check(subobj)) { - PyErr_Format(PyExc_TypeError, - "%.200s first arg must be str or " - "a tuple of str, not %.100s", function_name, - Py_TYPE(subobj)->tp_name); - return -2; - } - return any_find_slice(str, subobj, start, end, -1); -} - /*[clinic input] str.rfind as unicode_rfind = str.find @@ -12617,8 +12592,8 @@ unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) /*[clinic end generated code: output=9d316eee7b9f9bf0 input=23ae7964e8f70b35]*/ { - Py_ssize_t result = unicode_rfind_internal(str, "rfind", subobj, start, - end); + Py_ssize_t result = any_find_first_slice(str, "rfind", subobj, start, end, + -1); return result < 0 ? -1 : result; } @@ -12636,8 +12611,8 @@ unicode_rindex_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, Py_ssize_t end) /*[clinic end generated code: output=847d553d0dc10a86 input=990f3925b149c1bc]*/ { - Py_ssize_t result = unicode_rfind_internal(str, "rindex", subobj, start, - end); + Py_ssize_t result = any_find_first_slice(str, "rindex", subobj, start, end, + -1); if (result == -1) { PyErr_SetString(PyExc_ValueError, "substring not found"); } From 64ef311a10b8a86a57b772c06634ba9b26d0aa4d Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 26 May 2024 07:27:01 +0200 Subject: [PATCH 36/86] Fix error message --- Objects/unicodeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3c51d5454b7ee9..54fee72eca2503 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9148,7 +9148,7 @@ any_find_first_slice(PyObject *str, const char *function_name, if (!PyTuple_Check(subobj)) { if (!PyUnicode_Check(subobj)) { PyErr_Format(PyExc_TypeError, - "find %.200s arg must be str or " + "%.200s first arg must be str or " "a tuple of str, not %.100s", function_name, Py_TYPE(subobj)->tp_name); return -2; From a116f33b78c81b4947a9d1fc7b69af6ec3497f01 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 26 May 2024 14:45:21 +0200 Subject: [PATCH 37/86] Add asserts --- Objects/bytes_methods.c | 2 ++ Objects/unicodeobject.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 979f7350f4c77d..d188924e405b23 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -572,6 +572,7 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, ADJUST_INDICES(start, end, len); // Work in chunks if (direction > 0) { + assert(FIND_CHUNK_SIZE > 0); for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { Py_ssize_t cur_end = start + FIND_CHUNK_SIZE - 1; for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { @@ -611,6 +612,7 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, } else { Py_ssize_t cur_end = end; + assert(RFIND_CHUNK_SIZE > 0); for (; result == -1 && cur_end >= start; cur_end -= RFIND_CHUNK_SIZE) { Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE + 1; if (cur_start < start) { diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 54fee72eca2503..5c7ece40793ecc 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9170,6 +9170,7 @@ any_find_first_slice(PyObject *str, const char *function_name, ADJUST_INDICES(start, end, len); // Work in chunks if (direction > 0) { + assert(FIND_CHUNK_SIZE > 0); for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { Py_ssize_t cur_end = start + FIND_CHUNK_SIZE - 1; for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { @@ -9192,6 +9193,7 @@ any_find_first_slice(PyObject *str, const char *function_name, } else { Py_ssize_t cur_end = end; + assert(RFIND_CHUNK_SIZE > 0); for (; result == -1 && cur_end >= start; cur_end -= RFIND_CHUNK_SIZE) { Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE + 1; if (cur_start < start) { From e29828d40550744a2b0c350882c4d4f24c8e8a6a Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 26 May 2024 14:59:10 +0200 Subject: [PATCH 38/86] Remove unnecessary check --- Objects/bytes_methods.c | 6 ------ Objects/unicodeobject.c | 6 ------ 2 files changed, 12 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index d188924e405b23..b0a7e5748fb56c 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -591,9 +591,6 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, sub_len = subbuf.len; } Py_ssize_t sub_end = cur_end + sub_len; - if (sub_end > end) { - sub_end = end; - } Py_ssize_t new_result = find_internal(str, len, function_name, subseq, start, sub_end, +1); @@ -634,9 +631,6 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, sub_len = subbuf.len; } Py_ssize_t sub_end = cur_end + sub_len; - if (sub_end > end) { - sub_end = end; - } Py_ssize_t new_result = find_internal(str, len, function_name, subseq, cur_start, sub_end, -1); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5c7ece40793ecc..8c53361712bc25 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9176,9 +9176,6 @@ any_find_first_slice(PyObject *str, const char *function_name, for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); - if (sub_end > end) { - sub_end = end; - } Py_ssize_t new_result = any_find_slice(str, substr, start, sub_end, +1); if (new_result != -1) { @@ -9202,9 +9199,6 @@ any_find_first_slice(PyObject *str, const char *function_name, for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); - if (sub_end > end) { - sub_end = end; - } Py_ssize_t new_result = any_find_slice(str, substr, cur_start, sub_end, -1); if (new_result != -1) { From a85f84ac8dc77fba3edfe3fcb08fe80f0c64309a Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 26 May 2024 15:12:33 +0200 Subject: [PATCH 39/86] Revert "Remove unnecessary check" This reverts commit e29828d40550744a2b0c350882c4d4f24c8e8a6a. --- Objects/bytes_methods.c | 6 ++++++ Objects/unicodeobject.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index b0a7e5748fb56c..d188924e405b23 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -591,6 +591,9 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, sub_len = subbuf.len; } Py_ssize_t sub_end = cur_end + sub_len; + if (sub_end > end) { + sub_end = end; + } Py_ssize_t new_result = find_internal(str, len, function_name, subseq, start, sub_end, +1); @@ -631,6 +634,9 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, sub_len = subbuf.len; } Py_ssize_t sub_end = cur_end + sub_len; + if (sub_end > end) { + sub_end = end; + } Py_ssize_t new_result = find_internal(str, len, function_name, subseq, cur_start, sub_end, -1); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8c53361712bc25..5c7ece40793ecc 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9176,6 +9176,9 @@ any_find_first_slice(PyObject *str, const char *function_name, for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); + if (sub_end > end) { + sub_end = end; + } Py_ssize_t new_result = any_find_slice(str, substr, start, sub_end, +1); if (new_result != -1) { @@ -9199,6 +9202,9 @@ any_find_first_slice(PyObject *str, const char *function_name, for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); + if (sub_end > end) { + sub_end = end; + } Py_ssize_t new_result = any_find_slice(str, substr, cur_start, sub_end, -1); if (new_result != -1) { From ac19e87c3d2324ae7b2034c9a916765d520b63e1 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 26 May 2024 16:27:46 +0200 Subject: [PATCH 40/86] Optimise length of 0 & 1 --- Objects/bytes_methods.c | 13 +++++++++++-- Objects/unicodeobject.c | 14 +++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index d188924e405b23..6c97966548f328 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -568,6 +568,15 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, return find_internal(str, len, function_name, subobj, start, end, direction); } + Py_ssize_t tuple_len = PyTuple_GET_SIZE(subobj); + if (tuple_len == 0) { + return -1; + } + if (tuple_len == 1) { + PyObject *subseq = PyTuple_GET_ITEM(subobj, 0); + return find_internal(str, len, function_name, subseq, start, end, + direction); + } Py_ssize_t result = -1; ADJUST_INDICES(start, end, len); // Work in chunks @@ -575,7 +584,7 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, assert(FIND_CHUNK_SIZE > 0); for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { Py_ssize_t cur_end = start + FIND_CHUNK_SIZE - 1; - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *subseq = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sub_len; Py_buffer subbuf; @@ -618,7 +627,7 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, if (cur_start < start) { cur_start = start; } - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *subseq = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sub_len; Py_buffer subbuf; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5c7ece40793ecc..17221d9c9b44b0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9155,7 +9155,11 @@ any_find_first_slice(PyObject *str, const char *function_name, } return any_find_slice(str, subobj, start, end, direction); } - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + Py_ssize_t tuple_len = PyTuple_GET_SIZE(subobj); + if (tuple_len == 0) { + return -1; + } + for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); if (!PyUnicode_Check(substr)) { PyErr_Format(PyExc_TypeError, @@ -9165,6 +9169,10 @@ any_find_first_slice(PyObject *str, const char *function_name, return -2; } } + if (tuple_len == 1) { + PyObject *substr = PyTuple_GET_ITEM(subobj, 0); + return any_find_slice(str, substr, start, end, direction); + } Py_ssize_t result = -1; Py_ssize_t len = PyUnicode_GET_LENGTH(str); ADJUST_INDICES(start, end, len); @@ -9173,7 +9181,7 @@ any_find_first_slice(PyObject *str, const char *function_name, assert(FIND_CHUNK_SIZE > 0); for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { Py_ssize_t cur_end = start + FIND_CHUNK_SIZE - 1; - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); if (sub_end > end) { @@ -9199,7 +9207,7 @@ any_find_first_slice(PyObject *str, const char *function_name, if (cur_start < start) { cur_start = start; } - for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(subobj); i++) { + for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); if (sub_end > end) { From b62e8b4f92bd12323eb81af98f37a8962bfe5881 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 26 May 2024 16:39:45 +0200 Subject: [PATCH 41/86] Avoid testing with tuples of 1 item --- Lib/test/string_tests.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 7495a95bceb704..fab7e2cefe15ab 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -218,6 +218,8 @@ def test_find(self): self.assertEqual(i[loc:loc+len(j)], j) # test tuple arguments + self.checkequal(0, 'foo', 'find', ('foo',)) + self.checkequal(-1, 'foo', 'find', ('bar',)) self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb')) self.checkequal(2, '__aa__bb__', 'find', ('bb', 'aa')) self.checkequal(-1, '__aa__bb__', 'find', ('cc', 'dd')) @@ -227,18 +229,18 @@ def test_find(self): self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 10) self.checkequal(-1, '__aa__bb__', 'find', ('aa', 'bb'), 0, 3) self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 4) - self.checkraises(TypeError, 'hello', 'find', (None,)) + self.checkraises(TypeError, 'hello', 'find', (1.0, 2.0)) s = '_' * 9998 + 'aaaa' + '_' * 9998 self.checkequal(9998, s, 'find', ('aaaa', 'bb')) self.checkequal(2, 'foobar', 'find', ('ob', 'oba')) self.checkequal(1, 'foobar', 'find', ('ob', 'oob')) - self.checkequal(0, '', 'find', ('',)) + self.checkequal(0, '', 'find', ('', '_')) self.checkequal(2, '__abcd__', 'find', ('cd', 'ab')) self.checkequal(2, '__abc__', 'find', ('bc', 'ab')) - self.checkequal(1, 'a' + 'b' * 10000, 'find', ('b' * 10000,)) + self.checkequal(1, 'a' + 'b' * 10000, 'find', ('b' * 10000, 'c')) s = 'ab' + 'c' * 100000 self.checkequal(1, s, 'find', ('c' * 100000, 'b' + 'c' * 100000)) - self.checkequal(0, 'foobar', 'find', ('foo',)) + self.checkequal(0, 'foobar', 'find', ('foo', 'bar')) def test_rfind(self): self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') @@ -294,6 +296,8 @@ def test_rfind(self): self.checkequal(0, '<......\u043c...', "rfind", "<") # test tuple arguments + self.checkequal(0, 'foo', 'rfind', ('foo',)) + self.checkequal(-1, 'foo', 'rfind', ('bar',)) self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb')) self.checkequal(6, '__aa__bb__', 'rfind', ('bb', 'aa')) self.checkequal(-1, '__aa__bb__', 'rfind', ('cc', 'dd')) @@ -302,18 +306,18 @@ def test_rfind(self): self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 0, 10) self.checkequal(-1, '__aa__bb__', 'rfind', ('aa', 'bb'), 7, 10) self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 6, 10) - self.checkraises(TypeError, 'hello', 'rfind', (None,)) + self.checkraises(TypeError, 'hello', 'rfind', (1.0, 2.0)) s = '_' * 9998 + 'aaaa' + '_' * 9998 self.checkequal(9998, s, 'rfind', ('aaaa', 'bb')) self.checkequal(2, 'foobar', 'rfind', ('oba', 'ob')) self.checkequal(2, 'foobar', 'rfind', ('oob', 'ob')) - self.checkequal(0, '', 'rfind', ('',)) + self.checkequal(0, '', 'rfind', ('', '_')) self.checkequal(4, '__abcd__', 'rfind', ('ab', 'cd')) self.checkequal(3, '__abc__', 'rfind', ('ab', 'bc')) - self.checkequal(0, 'b' * 10000 + 'a', 'rfind', ('b' * 10000,)) + self.checkequal(0, 'b' * 10000 + 'a', 'rfind', ('b' * 10000, 'c')) s = 'ab' + 'c' * 100000 self.checkequal(2, s, 'rfind', ('c' * 100000, 'b' + 'c' * 100000)) - self.checkequal(3, 'foo', 'rfind', ('',)) + self.checkequal(3, 'foo', 'rfind', ('', 'foo')) def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') From b6492db7dacbca8e3fd795d7f5a84d86ee4c26fc Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 26 May 2024 20:30:00 +0200 Subject: [PATCH 42/86] Simplify news. Co-authored-by: Serhiy Storchaka --- .../2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst index c38c1cbf051393..85472e64dc94dd 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst @@ -1,4 +1 @@ -Support tuples for :meth:`str.find`, :meth:`bytearray.find`, :meth:`bytes.find`, - :meth:`str.index`, :meth:`bytearray.index`, :meth:`bytes.index`, - :meth:`str.rfind`, :meth:`bytearray.rfind`, :meth:`bytes.rfind`, - :meth:`str.rindex`, :meth:`bytearray.rindex` and :meth:`bytes.rindex`. +Support tuples for for :class:`str`, :class:`bytes` and :class:`bytearray` methods ``find()``, ``index()``, ``rfind()`` and ``rindex()``. From dd23e04039e39cb28a871ee9d515b62eca37de4b Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 26 May 2024 20:31:55 +0200 Subject: [PATCH 43/86] Fix indentation --- Doc/library/stdtypes.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index b7aba0803e41c4..5354575c8a3530 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -2913,7 +2913,7 @@ arbitrary binary data. Also accept an integer in the range 0 to 255 as the subsequence. .. versionchanged:: 3.14 - *sub* can now be a tuple of subsequences. + *sub* can now be a tuple of subsequences. .. method:: bytes.join(iterable) @@ -3000,7 +3000,7 @@ arbitrary binary data. Also accept an integer in the range 0 to 255 as the subsequence. .. versionchanged:: 3.14 - *sub* can now be a tuple of subsequences. + *sub* can now be a tuple of subsequences. .. method:: bytes.rpartition(sep) From 38d2df8ceba41b6948b958dd32aafda6a11b691c Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 26 May 2024 20:34:34 +0200 Subject: [PATCH 44/86] Handle -2 --- Objects/unicodeobject.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 17221d9c9b44b0..8b0f2fc993263a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9189,6 +9189,9 @@ any_find_first_slice(PyObject *str, const char *function_name, } Py_ssize_t new_result = any_find_slice(str, substr, start, sub_end, +1); + if (new_result == -2) { + return -2; + } if (new_result != -1) { if (new_result == start) { return start; @@ -9215,6 +9218,9 @@ any_find_first_slice(PyObject *str, const char *function_name, } Py_ssize_t new_result = any_find_slice(str, substr, cur_start, sub_end, -1); + if (new_result == -2) { + return -2; + } if (new_result != -1) { if (new_result == cur_end) { return cur_end; From 223cb1bb92aac65aa458b51d107b1e9980decfd6 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Mon, 27 May 2024 10:27:36 +0200 Subject: [PATCH 45/86] Update Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst --- .../2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst index 85472e64dc94dd..1fddd4ca4ad26e 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-24-11-07-16.gh-issue-118184.EK4di_.rst @@ -1 +1 @@ -Support tuples for for :class:`str`, :class:`bytes` and :class:`bytearray` methods ``find()``, ``index()``, ``rfind()`` and ``rindex()``. +Support tuples for :class:`str`, :class:`bytes` and :class:`bytearray` methods ``find()``, ``index()``, ``rfind()`` and ``rindex()``. From bc29c92d00ea91152371a37b7740cb06583796a2 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Mon, 27 May 2024 13:35:03 +0200 Subject: [PATCH 46/86] Guard overflow --- Objects/bytes_methods.c | 49 +++++++++++++++++++++++++---------------- Objects/unicodeobject.c | 39 +++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 32 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 6c97966548f328..37da9e86e079b7 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -579,15 +579,20 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, } Py_ssize_t result = -1; ADJUST_INDICES(start, end, len); - // Work in chunks if (direction > 0) { assert(FIND_CHUNK_SIZE > 0); - for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { - Py_ssize_t cur_end = start + FIND_CHUNK_SIZE - 1; + for (; result == -1; start += FIND_CHUNK_SIZE) { + Py_ssize_t cur_end; + if (start > end - FIND_CHUNK_SIZE + 1) { // Guard overflow + cur_end = end; + } + else { + cur_end = start - 1 + FIND_CHUNK_SIZE; + } for (Py_ssize_t i = 0; i < tuple_len; i++) { - PyObject *subseq = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t sub_len; Py_buffer subbuf; + Py_ssize_t new_result, sub_len; + PyObject *subseq = PyTuple_GET_ITEM(subobj, i); if (!PyObject_CheckBuffer(subseq)) { sub_len = 1; } @@ -599,13 +604,14 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, else { sub_len = subbuf.len; } - Py_ssize_t sub_end = cur_end + sub_len; - if (sub_end > end) { - sub_end = end; + if (cur_end >= end - sub_len) { // Guard overflow + new_result = find_internal(str, len, function_name, subseq, + start, end, +1); + } + else { + new_result = find_internal(str, len, function_name, subseq, + start, cur_end + sub_len, +1); } - Py_ssize_t new_result = find_internal(str, len, function_name, - subseq, start, sub_end, - +1); if (new_result == -2) { return -2; } @@ -617,6 +623,9 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, result = new_result; } } + if (start > end - FIND_CHUNK_SIZE) { + break; // Guard overflow + } } } else { @@ -628,9 +637,9 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, cur_start = start; } for (Py_ssize_t i = 0; i < tuple_len; i++) { - PyObject *subseq = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t sub_len; Py_buffer subbuf; + Py_ssize_t new_result, sub_len; + PyObject *subseq = PyTuple_GET_ITEM(subobj, i); if (!PyObject_CheckBuffer(subseq)) { sub_len = 1; } @@ -642,13 +651,15 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, else { sub_len = subbuf.len; } - Py_ssize_t sub_end = cur_end + sub_len; - if (sub_end > end) { - sub_end = end; + if (cur_end >= end - sub_len) { // Guard overflow + new_result = find_internal(str, len, function_name, subseq, + cur_start, end, -1); + } + else { + new_result = find_internal(str, len, function_name, subseq, + cur_start, cur_end + sub_len, + -1); } - Py_ssize_t new_result = find_internal(str, len, function_name, - subseq, cur_start, - sub_end, -1); if (new_result == -2) { return -2; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8b0f2fc993263a..20a767f4d9f26d 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9176,19 +9176,26 @@ any_find_first_slice(PyObject *str, const char *function_name, Py_ssize_t result = -1; Py_ssize_t len = PyUnicode_GET_LENGTH(str); ADJUST_INDICES(start, end, len); - // Work in chunks if (direction > 0) { assert(FIND_CHUNK_SIZE > 0); - for (; result == -1 && start <= end; start += FIND_CHUNK_SIZE) { - Py_ssize_t cur_end = start + FIND_CHUNK_SIZE - 1; + for (; result == -1; start += FIND_CHUNK_SIZE) { + Py_ssize_t cur_end; + if (start > end - FIND_CHUNK_SIZE + 1) { // Guard overflow + cur_end = end; + } + else { + cur_end = start - 1 + FIND_CHUNK_SIZE; + } for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); - if (sub_end > end) { - sub_end = end; + Py_ssize_t new_result, sub_len = PyUnicode_GET_LENGTH(substr); + if (cur_end >= end - sub_len) { // Guard overflow + new_result = any_find_slice(str, substr, start, end, +1); + } + else { + new_result = any_find_slice(str, substr, start, + cur_end + sub_len, +1); } - Py_ssize_t new_result = any_find_slice(str, substr, start, - sub_end, +1); if (new_result == -2) { return -2; } @@ -9200,6 +9207,9 @@ any_find_first_slice(PyObject *str, const char *function_name, result = new_result; } } + if (start > end - FIND_CHUNK_SIZE) { + break; // Guard overflow + } } } else { @@ -9212,12 +9222,15 @@ any_find_first_slice(PyObject *str, const char *function_name, } for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t sub_end = cur_end + PyUnicode_GET_LENGTH(substr); - if (sub_end > end) { - sub_end = end; + Py_ssize_t new_result, sub_len = PyUnicode_GET_LENGTH(substr); + if (cur_end >= end - sub_len) { // Guard overflow + new_result = any_find_slice(str, substr, cur_start, end, + -1); + } + else { + new_result = any_find_slice(str, substr, cur_start, + cur_end + sub_len, -1); } - Py_ssize_t new_result = any_find_slice(str, substr, cur_start, - sub_end, -1); if (new_result == -2) { return -2; } From f14ee7dae4839faa664473ef05996f97417a8472 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Mon, 27 May 2024 15:05:18 +0200 Subject: [PATCH 47/86] Tweak `FIND_CHUNK_SIZE` --- Lib/test/string_tests.py | 22 ++++++++++++---------- Objects/bytes_methods.c | 2 +- Objects/unicodeobject.c | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index fab7e2cefe15ab..ec1c2e8c0efe75 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -218,6 +218,7 @@ def test_find(self): self.assertEqual(i[loc:loc+len(j)], j) # test tuple arguments + N = 1000 # FIND_CHUNK_SIZE self.checkequal(0, 'foo', 'find', ('foo',)) self.checkequal(-1, 'foo', 'find', ('bar',)) self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb')) @@ -230,16 +231,16 @@ def test_find(self): self.checkequal(-1, '__aa__bb__', 'find', ('aa', 'bb'), 0, 3) self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 4) self.checkraises(TypeError, 'hello', 'find', (1.0, 2.0)) - s = '_' * 9998 + 'aaaa' + '_' * 9998 - self.checkequal(9998, s, 'find', ('aaaa', 'bb')) + s = '_' * (N - 2) + 'aaaa' + '_' * (N - 2) + self.checkequal((N - 2), s, 'find', ('aaaa', 'bb')) self.checkequal(2, 'foobar', 'find', ('ob', 'oba')) self.checkequal(1, 'foobar', 'find', ('ob', 'oob')) self.checkequal(0, '', 'find', ('', '_')) self.checkequal(2, '__abcd__', 'find', ('cd', 'ab')) self.checkequal(2, '__abc__', 'find', ('bc', 'ab')) - self.checkequal(1, 'a' + 'b' * 10000, 'find', ('b' * 10000, 'c')) - s = 'ab' + 'c' * 100000 - self.checkequal(1, s, 'find', ('c' * 100000, 'b' + 'c' * 100000)) + self.checkequal(1, 'a' + 'b' * N, 'find', ('b' * N, 'c')) + s = 'ab' + 'c' * (10 * N) + self.checkequal(1, s, 'find', ('c' * (10 * N), 'b' + 'c' * (10 * N))) self.checkequal(0, 'foobar', 'find', ('foo', 'bar')) def test_rfind(self): @@ -296,6 +297,7 @@ def test_rfind(self): self.checkequal(0, '<......\u043c...', "rfind", "<") # test tuple arguments + N = 1000 # RFIND_CHUNK_SIZE self.checkequal(0, 'foo', 'rfind', ('foo',)) self.checkequal(-1, 'foo', 'rfind', ('bar',)) self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb')) @@ -307,16 +309,16 @@ def test_rfind(self): self.checkequal(-1, '__aa__bb__', 'rfind', ('aa', 'bb'), 7, 10) self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 6, 10) self.checkraises(TypeError, 'hello', 'rfind', (1.0, 2.0)) - s = '_' * 9998 + 'aaaa' + '_' * 9998 - self.checkequal(9998, s, 'rfind', ('aaaa', 'bb')) + s = '_' * (N - 2) + 'aaaa' + '_' * (N - 2) + self.checkequal((N - 2), s, 'rfind', ('aaaa', 'bb')) self.checkequal(2, 'foobar', 'rfind', ('oba', 'ob')) self.checkequal(2, 'foobar', 'rfind', ('oob', 'ob')) self.checkequal(0, '', 'rfind', ('', '_')) self.checkequal(4, '__abcd__', 'rfind', ('ab', 'cd')) self.checkequal(3, '__abc__', 'rfind', ('ab', 'bc')) - self.checkequal(0, 'b' * 10000 + 'a', 'rfind', ('b' * 10000, 'c')) - s = 'ab' + 'c' * 100000 - self.checkequal(2, s, 'rfind', ('c' * 100000, 'b' + 'c' * 100000)) + self.checkequal(0, 'b' * N + 'a', 'rfind', ('b' * N, 'c')) + s = 'ab' + 'c' * (10 * N) + self.checkequal(2, s, 'rfind', ('c' * (10 * N), 'b' + 'c' * (10 * N))) self.checkequal(3, 'foo', 'rfind', ('', 'foo')) def test_index(self): diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 37da9e86e079b7..03f094627541d5 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -556,7 +556,7 @@ find_internal(const char *str, Py_ssize_t len, return res; } -#define FIND_CHUNK_SIZE 10000 +#define FIND_CHUNK_SIZE 1000 #define RFIND_CHUNK_SIZE FIND_CHUNK_SIZE static Py_ssize_t diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 20a767f4d9f26d..679eda562b2c51 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9137,7 +9137,7 @@ any_find_slice(PyObject* s1, PyObject* s2, return result; } -#define FIND_CHUNK_SIZE 10000 +#define FIND_CHUNK_SIZE 1000 #define RFIND_CHUNK_SIZE FIND_CHUNK_SIZE static Py_ssize_t From 3606e00fec0e6a79afd86f83a4d5d97e00e289a2 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Tue, 28 May 2024 20:07:40 +0200 Subject: [PATCH 48/86] Refer to `re` & `regex` --- Doc/library/stdtypes.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index 5354575c8a3530..a35fd2a743f564 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -1729,6 +1729,9 @@ expression support in the :mod:`re` module). first match. Optional arguments *start* and *end* are interpreted as in slice notation. Return ``-1`` if *sub* is not found. + .. seealso:: + The :mod:`re` module, which provides advanced pattern matching. + .. note:: The :meth:`~str.find` method should be used only if you need to know the @@ -2043,6 +2046,9 @@ expression support in the :mod:`re` module). be the index of the last match. Optional arguments *start* and *end* are interpreted as in slice notation. Return ``-1`` on failure. + .. seealso:: + The third-party :pypi:`regex` module, which provides advanced pattern matching. + .. versionchanged:: 3.14 *sub* can now be a tuple of substrings. @@ -2884,6 +2890,9 @@ arbitrary binary data. The subsequence to search for may be any :term:`bytes-like object` or an integer in the range 0 to 255. + .. seealso:: + The :mod:`re` module, which provides advanced pattern matching. + .. note:: The :meth:`~bytes.find` method should be used only if you need to know the @@ -2980,6 +2989,9 @@ arbitrary binary data. The subsequence to search for may be any :term:`bytes-like object` or an integer in the range 0 to 255. + .. seealso:: + The third-party :pypi:`regex` module, which provides advanced pattern matching. + .. versionchanged:: 3.3 Also accept an integer in the range 0 to 255 as the subsequence. From 9e2006c7316f37f00956fcd690af352d1ec0a72d Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Tue, 28 May 2024 23:04:54 +0200 Subject: [PATCH 49/86] Release buffer --- Objects/bytes_methods.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 03f094627541d5..bd9e9ceb564a30 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -603,6 +603,7 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, } else { sub_len = subbuf.len; + PyBuffer_Release(&subbuf); } if (cur_end >= end - sub_len) { // Guard overflow new_result = find_internal(str, len, function_name, subseq, From fb48c413fd744ce24c71720dd85b15d1a22fc3c5 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Wed, 29 May 2024 06:56:52 +0200 Subject: [PATCH 50/86] Release other buffer --- Objects/bytes_methods.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index bd9e9ceb564a30..09033599c40708 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -651,6 +651,7 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, } else { sub_len = subbuf.len; + PyBuffer_Release(&subbuf); } if (cur_end >= end - sub_len) { // Guard overflow new_result = find_internal(str, len, function_name, subseq, From 308174cec081e92e690f166d22918309c8cd235e Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Wed, 29 May 2024 08:03:42 +0200 Subject: [PATCH 51/86] Save lengths --- Objects/bytes_methods.c | 49 ++++++++++++++++++----------------------- Objects/unicodeobject.c | 6 +++-- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 09033599c40708..ba67ae3da71518 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -577,6 +577,23 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, return find_internal(str, len, function_name, subseq, start, end, direction); } + Py_ssize_t sub_lengths[tuple_len]; + for (Py_ssize_t i = 0; i < tuple_len; i++) { + PyObject *subseq = PyTuple_GET_ITEM(subobj, i); + Py_ssize_t sub_len; + if (!PyObject_CheckBuffer(subseq)) { + sub_len = 1; + } + else if (PyObject_GetBuffer(subseq, &subbuf, PyBUF_SIMPLE) != 0) + { + return -2; + } + else { + sub_len = subbuf.len; + PyBuffer_Release(&subbuf); + } + sub_lengths[i] = sub_len; + } Py_ssize_t result = -1; ADJUST_INDICES(start, end, len); if (direction > 0) { @@ -590,21 +607,9 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, cur_end = start - 1 + FIND_CHUNK_SIZE; } for (Py_ssize_t i = 0; i < tuple_len; i++) { - Py_buffer subbuf; - Py_ssize_t new_result, sub_len; PyObject *subseq = PyTuple_GET_ITEM(subobj, i); - if (!PyObject_CheckBuffer(subseq)) { - sub_len = 1; - } - else if (PyObject_GetBuffer(subseq, &subbuf, - PyBUF_SIMPLE) != 0) - { - return -2; - } - else { - sub_len = subbuf.len; - PyBuffer_Release(&subbuf); - } + Py_ssize_t sub_len = sub_lengths[i]; + Py_ssize_t new_result; if (cur_end >= end - sub_len) { // Guard overflow new_result = find_internal(str, len, function_name, subseq, start, end, +1); @@ -638,21 +643,9 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, cur_start = start; } for (Py_ssize_t i = 0; i < tuple_len; i++) { - Py_buffer subbuf; - Py_ssize_t new_result, sub_len; PyObject *subseq = PyTuple_GET_ITEM(subobj, i); - if (!PyObject_CheckBuffer(subseq)) { - sub_len = 1; - } - else if (PyObject_GetBuffer(subseq, &subbuf, - PyBUF_SIMPLE) != 0) - { - return -2; - } - else { - sub_len = subbuf.len; - PyBuffer_Release(&subbuf); - } + Py_ssize_t sub_len = sub_lengths[i]; + Py_ssize_t new_result; if (cur_end >= end - sub_len) { // Guard overflow new_result = find_internal(str, len, function_name, subseq, cur_start, end, -1); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 679eda562b2c51..5f853115520923 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9188,7 +9188,8 @@ any_find_first_slice(PyObject *str, const char *function_name, } for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t new_result, sub_len = PyUnicode_GET_LENGTH(substr); + Py_ssize_t sub_len = PyUnicode_GET_LENGTH(substr); + Py_ssize_t new_result; if (cur_end >= end - sub_len) { // Guard overflow new_result = any_find_slice(str, substr, start, end, +1); } @@ -9222,7 +9223,8 @@ any_find_first_slice(PyObject *str, const char *function_name, } for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t new_result, sub_len = PyUnicode_GET_LENGTH(substr); + Py_ssize_t sub_len = PyUnicode_GET_LENGTH(substr); + Py_ssize_t new_result; if (cur_end >= end - sub_len) { // Guard overflow new_result = any_find_slice(str, substr, cur_start, end, -1); From 6a3d651b96fb3b4c0b0cb9501f62baaae94ec4f6 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Wed, 29 May 2024 08:20:52 +0200 Subject: [PATCH 52/86] malloc --- Objects/bytes_methods.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index ba67ae3da71518..0a0ce2bf7c94d2 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -577,16 +577,24 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, return find_internal(str, len, function_name, subseq, start, end, direction); } - Py_ssize_t sub_lengths[tuple_len]; + Py_ssize_t* sub_lengths = PyMem_RawMalloc(((size_t)tuple_len + 1) * + sizeof(wchar_t)); + if (!sub_lengths) { + PyErr_NoMemory(); + return -2; + } + Py_ssize_t result = -1; for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *subseq = PyTuple_GET_ITEM(subobj, i); + Py_buffer subbuf; Py_ssize_t sub_len; if (!PyObject_CheckBuffer(subseq)) { sub_len = 1; } else if (PyObject_GetBuffer(subseq, &subbuf, PyBUF_SIMPLE) != 0) { - return -2; + result = -2; + goto exit; } else { sub_len = subbuf.len; @@ -594,7 +602,6 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, } sub_lengths[i] = sub_len; } - Py_ssize_t result = -1; ADJUST_INDICES(start, end, len); if (direction > 0) { assert(FIND_CHUNK_SIZE > 0); @@ -619,11 +626,13 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, start, cur_end + sub_len, +1); } if (new_result == -2) { - return -2; + result = -2; + goto exit; } if (new_result != -1) { if (new_result == start) { - return start; + result = start; + goto exit; } cur_end = new_result - 1; result = new_result; @@ -656,11 +665,13 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, -1); } if (new_result == -2) { - return -2; + result = -2; + goto exit; } if (new_result != -1) { if (new_result == cur_end) { - return cur_end; + result = cur_end; + goto exit; } cur_start = new_result + 1; result = new_result; @@ -668,6 +679,8 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, } } } +exit: + PyMem_RawFree(sub_lengths); return result; } From 3227e63bd2b0ddc8038b0937a42669e66133a0a0 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Wed, 29 May 2024 08:51:11 +0200 Subject: [PATCH 53/86] Fix malloc --- Objects/bytes_methods.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 0a0ce2bf7c94d2..bfa6b4d9a0a3ea 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -577,8 +577,12 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, return find_internal(str, len, function_name, subseq, start, end, direction); } - Py_ssize_t* sub_lengths = PyMem_RawMalloc(((size_t)tuple_len + 1) * - sizeof(wchar_t)); + if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_ssize_t)) { + PyErr_SetString(PyExc_OverflowError, "tuple is too long"); + return -2; + } + Py_ssize_t *sub_lengths = PyMem_RawMalloc(((size_t)tuple_len) * + sizeof(Py_ssize_t)); if (!sub_lengths) { PyErr_NoMemory(); return -2; From 70d673f8b6a3f33ead1a2a9cd13eebc23122a8ac Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 1 Jun 2024 11:34:18 +0200 Subject: [PATCH 54/86] Store needles for bytes --- Lib/test/test_bytes.py | 12 ---- Objects/bytes_methods.c | 122 ++++++++++++++++++++++++---------------- 2 files changed, 74 insertions(+), 60 deletions(-) diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index abcffc033f3b86..9e1985bb3a7639 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -644,12 +644,6 @@ def test_find(self): ValueError, r'byte must be in range\(0, 256\)', b.find, index) - # test tuple arguments - self.assertEqual(b.find((i,)), 1) - self.assertEqual(b.find((w,)), -1) - self.assertEqual(b.find((i, w)), 1) - self.assertEqual(b.find((w, i)), 1) - def test_rfind(self): b = self.type2test(b'mississippi') i = 105 @@ -669,12 +663,6 @@ def test_rfind(self): self.assertEqual(b.rfind(i, 3, 9), 7) self.assertEqual(b.rfind(w, 1, 3), -1) - # test tuple arguments - self.assertEqual(b.rfind((i,)), 10) - self.assertEqual(b.rfind((w,)), -1) - self.assertEqual(b.rfind((i, w)), 10) - self.assertEqual(b.rfind((w, i)), 10) - def test_index(self): b = self.type2test(b'mississippi') i = 105 diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index bfa6b4d9a0a3ea..ff3faf109a0685 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -496,17 +496,42 @@ parse_args_finds_byte(const char *function_name, PyObject **subobj, char *byte) start = 0; \ } +Py_ssize_t +fast_find_internal(const char *str, Py_ssize_t len, + const char *sub, Py_ssize_t sub_len, + Py_ssize_t start, Py_ssize_t end, + int direction) +{ + if (end - start < sub_len) + return -1; + else if (sub_len == 1) { + Py_ssize_t res; + if (direction > 0) + res = stringlib_find_char(str + start, end - start, *sub); + else + res = stringlib_rfind_char(str + start, end - start, *sub); + if (res >= 0) + res += start; + return res; + } + else { + if (direction > 0) + return stringlib_find_slice(str, len, sub, sub_len, start, end); + else + return stringlib_rfind_slice(str, len, sub, sub_len, start, end); + } +} + Py_LOCAL_INLINE(Py_ssize_t) find_internal(const char *str, Py_ssize_t len, const char *function_name, PyObject *subobj, Py_ssize_t start, Py_ssize_t end, - int dir) + int direction) { char byte; Py_buffer subbuf; const char *sub; Py_ssize_t sub_len; - Py_ssize_t res; if (!parse_args_finds_byte(function_name, &subobj, &byte)) { return -2; @@ -525,30 +550,8 @@ find_internal(const char *str, Py_ssize_t len, } ADJUST_INDICES(start, end, len); - if (end - start < sub_len) - res = -1; - else if (sub_len == 1) { - if (dir > 0) - res = stringlib_find_char( - str + start, end - start, - *sub); - else - res = stringlib_rfind_char( - str + start, end - start, - *sub); - if (res >= 0) - res += start; - } - else { - if (dir > 0) - res = stringlib_find_slice( - str, len, - sub, sub_len, start, end); - else - res = stringlib_rfind_slice( - str, len, - sub, sub_len, start, end); - } + Py_ssize_t res = fast_find_internal(str, len, sub, sub_len, start, end, + direction); if (subobj) PyBuffer_Release(&subbuf); @@ -577,12 +580,26 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, return find_internal(str, len, function_name, subseq, start, end, direction); } - if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_ssize_t)) { + if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char) || + (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char *) || + (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_ssize_t)) + { PyErr_SetString(PyExc_OverflowError, "tuple is too long"); return -2; } - Py_ssize_t *sub_lengths = PyMem_RawMalloc(((size_t)tuple_len) * - sizeof(Py_ssize_t)); + char *bytes = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(char)); + const char **subs = NULL; + Py_ssize_t *sub_lengths = NULL; + if (!bytes) { + PyErr_NoMemory(); + return -2; + } + subs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(char *)); + if (!subs) { + PyErr_NoMemory(); + return -2; + } + sub_lengths = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(Py_ssize_t)); if (!sub_lengths) { PyErr_NoMemory(); return -2; @@ -590,20 +607,26 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, Py_ssize_t result = -1; for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *subseq = PyTuple_GET_ITEM(subobj, i); + char byte = bytes[i]; Py_buffer subbuf; + const char *sub; Py_ssize_t sub_len; - if (!PyObject_CheckBuffer(subseq)) { - sub_len = 1; - } - else if (PyObject_GetBuffer(subseq, &subbuf, PyBUF_SIMPLE) != 0) - { - result = -2; - goto exit; + if (!parse_args_finds_byte(function_name, &subseq, &byte)) { + return -2; } - else { + else if (subseq) { + if (PyObject_GetBuffer(subseq, &subbuf, PyBUF_SIMPLE) != 0) { + return -2; + } + sub = subbuf.buf; sub_len = subbuf.len; PyBuffer_Release(&subbuf); } + else { + sub = &byte; + sub_len = 1; + } + subs[i] = sub; sub_lengths[i] = sub_len; } ADJUST_INDICES(start, end, len); @@ -618,16 +641,17 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, cur_end = start - 1 + FIND_CHUNK_SIZE; } for (Py_ssize_t i = 0; i < tuple_len; i++) { - PyObject *subseq = PyTuple_GET_ITEM(subobj, i); + const char *sub = subs[i]; Py_ssize_t sub_len = sub_lengths[i]; Py_ssize_t new_result; if (cur_end >= end - sub_len) { // Guard overflow - new_result = find_internal(str, len, function_name, subseq, - start, end, +1); + new_result = fast_find_internal(str, len, sub, sub_len, + start, end, +1); } else { - new_result = find_internal(str, len, function_name, subseq, - start, cur_end + sub_len, +1); + new_result = fast_find_internal(str, len, sub, sub_len, + start, cur_end + sub_len, + +1); } if (new_result == -2) { result = -2; @@ -656,17 +680,17 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, cur_start = start; } for (Py_ssize_t i = 0; i < tuple_len; i++) { - PyObject *subseq = PyTuple_GET_ITEM(subobj, i); + const char *sub = subs[i]; Py_ssize_t sub_len = sub_lengths[i]; Py_ssize_t new_result; if (cur_end >= end - sub_len) { // Guard overflow - new_result = find_internal(str, len, function_name, subseq, - cur_start, end, -1); + new_result = fast_find_internal(str, len, sub, sub_len, + cur_start, end, -1); } else { - new_result = find_internal(str, len, function_name, subseq, - cur_start, cur_end + sub_len, - -1); + new_result = fast_find_internal(str, len, sub, sub_len, + cur_start, + cur_end + sub_len, -1); } if (new_result == -2) { result = -2; @@ -684,6 +708,8 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, } } exit: + PyMem_RawFree(bytes); + PyMem_RawFree(subs); PyMem_RawFree(sub_lengths); return result; } From 7b205b3b8fc5bb0f9de7b4aed457c244b0f51b78 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 1 Jun 2024 11:37:52 +0200 Subject: [PATCH 55/86] Revert test --- Lib/test/test_bytes.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index 9e1985bb3a7639..abcffc033f3b86 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -644,6 +644,12 @@ def test_find(self): ValueError, r'byte must be in range\(0, 256\)', b.find, index) + # test tuple arguments + self.assertEqual(b.find((i,)), 1) + self.assertEqual(b.find((w,)), -1) + self.assertEqual(b.find((i, w)), 1) + self.assertEqual(b.find((w, i)), 1) + def test_rfind(self): b = self.type2test(b'mississippi') i = 105 @@ -663,6 +669,12 @@ def test_rfind(self): self.assertEqual(b.rfind(i, 3, 9), 7) self.assertEqual(b.rfind(w, 1, 3), -1) + # test tuple arguments + self.assertEqual(b.rfind((i,)), 10) + self.assertEqual(b.rfind((w,)), -1) + self.assertEqual(b.rfind((i, w)), 10) + self.assertEqual(b.rfind((w, i)), 10) + def test_index(self): b = self.type2test(b'mississippi') i = 105 From 0664cedaeffbd17b37ba1297b0b0e0086dad5f42 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 1 Jun 2024 14:47:49 +0200 Subject: [PATCH 56/86] Restructure code --- Objects/bytes_methods.c | 104 ++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 51 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index ff3faf109a0685..81e21448e553b7 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -505,14 +505,14 @@ fast_find_internal(const char *str, Py_ssize_t len, if (end - start < sub_len) return -1; else if (sub_len == 1) { - Py_ssize_t res; + Py_ssize_t result; if (direction > 0) - res = stringlib_find_char(str + start, end - start, *sub); + result = stringlib_find_char(str + start, end - start, *sub); else - res = stringlib_rfind_char(str + start, end - start, *sub); - if (res >= 0) - res += start; - return res; + result = stringlib_rfind_char(str + start, end - start, *sub); + if (result >= 0) + result += start; + return result; } else { if (direction > 0) @@ -526,7 +526,7 @@ Py_LOCAL_INLINE(Py_ssize_t) find_internal(const char *str, Py_ssize_t len, const char *function_name, PyObject *subobj, Py_ssize_t start, Py_ssize_t end, - int direction) + int dir) { char byte; Py_buffer subbuf; @@ -543,6 +543,7 @@ find_internal(const char *str, Py_ssize_t len, sub = subbuf.buf; sub_len = subbuf.len; + PyBuffer_Release(&subbuf); } else { sub = &byte; @@ -550,13 +551,7 @@ find_internal(const char *str, Py_ssize_t len, } ADJUST_INDICES(start, end, len); - Py_ssize_t res = fast_find_internal(str, len, sub, sub_len, start, end, - direction); - - if (subobj) - PyBuffer_Release(&subbuf); - - return res; + return fast_find_internal(str, len, sub, sub_len, start, end, dir); } #define FIND_CHUNK_SIZE 1000 @@ -571,65 +566,80 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, return find_internal(str, len, function_name, subobj, start, end, direction); } + Py_ssize_t result = -2; // Error + char *bytes = NULL; + const char **subs = NULL; + Py_ssize_t *sub_lengths = NULL; + + /* ALLOCATE MEMORY */ Py_ssize_t tuple_len = PyTuple_GET_SIZE(subobj); - if (tuple_len == 0) { - return -1; - } - if (tuple_len == 1) { - PyObject *subseq = PyTuple_GET_ITEM(subobj, 0); - return find_internal(str, len, function_name, subseq, start, end, - direction); - } if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char) || (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char *) || (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_ssize_t)) { PyErr_SetString(PyExc_OverflowError, "tuple is too long"); - return -2; + goto exit; } - char *bytes = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(char)); - const char **subs = NULL; - Py_ssize_t *sub_lengths = NULL; + bytes = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(char)); if (!bytes) { PyErr_NoMemory(); - return -2; + goto exit; } subs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(char *)); if (!subs) { PyErr_NoMemory(); - return -2; + goto exit; } sub_lengths = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(Py_ssize_t)); if (!sub_lengths) { PyErr_NoMemory(); - return -2; + goto exit; } - Py_ssize_t result = -1; + + /* STORE SUBSTRINGS */ + ADJUST_INDICES(start, end, len); + Py_ssize_t slice_len = end - start; + Py_ssize_t subs_len = 0; for (Py_ssize_t i = 0; i < tuple_len; i++) { - PyObject *subseq = PyTuple_GET_ITEM(subobj, i); - char byte = bytes[i]; Py_buffer subbuf; const char *sub; Py_ssize_t sub_len; - if (!parse_args_finds_byte(function_name, &subseq, &byte)) { - return -2; + + PyObject *subseq = PyTuple_GET_ITEM(subobj, i); + char *byte = &bytes[subs_len]; + if (!parse_args_finds_byte(function_name, &subseq, byte)) { + goto exit; } else if (subseq) { if (PyObject_GetBuffer(subseq, &subbuf, PyBUF_SIMPLE) != 0) { - return -2; + goto exit; } sub = subbuf.buf; sub_len = subbuf.len; PyBuffer_Release(&subbuf); } else { - sub = &byte; + sub = byte; sub_len = 1; } - subs[i] = sub; - sub_lengths[i] = sub_len; + if (sub_len > slice_len) { + continue; + } + subs[subs_len] = sub; + sub_lengths[subs_len] = sub_len; + subs_len++; + } + + /* FIND SUBSTRINGS */ + result = -1; // Not found (yet) + if (subs_len == 0) { + goto exit; + } + if (subs_len == 1) { + result = fast_find_internal(str, len, subs[0], sub_lengths[0], start, + end, direction); + goto exit; } - ADJUST_INDICES(start, end, len); if (direction > 0) { assert(FIND_CHUNK_SIZE > 0); for (; result == -1; start += FIND_CHUNK_SIZE) { @@ -640,10 +650,10 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, else { cur_end = start - 1 + FIND_CHUNK_SIZE; } - for (Py_ssize_t i = 0; i < tuple_len; i++) { + for (Py_ssize_t i = 0; i < subs_len; i++) { + Py_ssize_t new_result; const char *sub = subs[i]; Py_ssize_t sub_len = sub_lengths[i]; - Py_ssize_t new_result; if (cur_end >= end - sub_len) { // Guard overflow new_result = fast_find_internal(str, len, sub, sub_len, start, end, +1); @@ -653,10 +663,6 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, start, cur_end + sub_len, +1); } - if (new_result == -2) { - result = -2; - goto exit; - } if (new_result != -1) { if (new_result == start) { result = start; @@ -679,10 +685,10 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, if (cur_start < start) { cur_start = start; } - for (Py_ssize_t i = 0; i < tuple_len; i++) { + for (Py_ssize_t i = 0; i < subs_len; i++) { + Py_ssize_t new_result; const char *sub = subs[i]; Py_ssize_t sub_len = sub_lengths[i]; - Py_ssize_t new_result; if (cur_end >= end - sub_len) { // Guard overflow new_result = fast_find_internal(str, len, sub, sub_len, cur_start, end, -1); @@ -692,10 +698,6 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, cur_start, cur_end + sub_len, -1); } - if (new_result == -2) { - result = -2; - goto exit; - } if (new_result != -1) { if (new_result == cur_end) { result = cur_end; From b1327420964b118fce27be56bd27b6cc71c1448e Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 1 Jun 2024 14:55:36 +0200 Subject: [PATCH 57/86] Fix smelly symbol --- Objects/bytes_methods.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 81e21448e553b7..9fd595a5911731 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -497,10 +497,10 @@ parse_args_finds_byte(const char *function_name, PyObject **subobj, char *byte) } Py_ssize_t -fast_find_internal(const char *str, Py_ssize_t len, - const char *sub, Py_ssize_t sub_len, - Py_ssize_t start, Py_ssize_t end, - int direction) +_Py_fast_find(const char *str, Py_ssize_t len, + const char *sub, Py_ssize_t sub_len, + Py_ssize_t start, Py_ssize_t end, + int direction) { if (end - start < sub_len) return -1; @@ -551,7 +551,7 @@ find_internal(const char *str, Py_ssize_t len, } ADJUST_INDICES(start, end, len); - return fast_find_internal(str, len, sub, sub_len, start, end, dir); + return _Py_fast_find(str, len, sub, sub_len, start, end, dir); } #define FIND_CHUNK_SIZE 1000 @@ -636,8 +636,8 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, goto exit; } if (subs_len == 1) { - result = fast_find_internal(str, len, subs[0], sub_lengths[0], start, - end, direction); + result = _Py_fast_find(str, len, subs[0], sub_lengths[0], start, end, + direction); goto exit; } if (direction > 0) { @@ -655,13 +655,12 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, const char *sub = subs[i]; Py_ssize_t sub_len = sub_lengths[i]; if (cur_end >= end - sub_len) { // Guard overflow - new_result = fast_find_internal(str, len, sub, sub_len, - start, end, +1); + new_result = _Py_fast_find(str, len, sub, sub_len, start, + end, +1); } else { - new_result = fast_find_internal(str, len, sub, sub_len, - start, cur_end + sub_len, - +1); + new_result = _Py_fast_find(str, len, sub, sub_len, start, + cur_end + sub_len, +1); } if (new_result != -1) { if (new_result == start) { @@ -690,13 +689,13 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, const char *sub = subs[i]; Py_ssize_t sub_len = sub_lengths[i]; if (cur_end >= end - sub_len) { // Guard overflow - new_result = fast_find_internal(str, len, sub, sub_len, - cur_start, end, -1); + new_result = _Py_fast_find(str, len, sub, sub_len, + cur_start, end, -1); } else { - new_result = fast_find_internal(str, len, sub, sub_len, - cur_start, - cur_end + sub_len, -1); + new_result = _Py_fast_find(str, len, sub, sub_len, + cur_start, cur_end + sub_len, + -1); } if (new_result != -1) { if (new_result == cur_end) { From 8189c66e13815f1e788454bbd09c53f191d18deb Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 1 Jun 2024 14:57:05 +0200 Subject: [PATCH 58/86] Make static --- Objects/bytes_methods.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 9fd595a5911731..c17c25f5ab1d98 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -496,7 +496,7 @@ parse_args_finds_byte(const char *function_name, PyObject **subobj, char *byte) start = 0; \ } -Py_ssize_t +static Py_ssize_t _Py_fast_find(const char *str, Py_ssize_t len, const char *sub, Py_ssize_t sub_len, Py_ssize_t start, Py_ssize_t end, From 53d3a070ec6871d628cece20bd38d973931228f8 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 1 Jun 2024 17:44:30 +0200 Subject: [PATCH 59/86] Remove variable --- Objects/bytes_methods.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index c17c25f5ab1d98..821317640504ed 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -598,7 +598,6 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, /* STORE SUBSTRINGS */ ADJUST_INDICES(start, end, len); - Py_ssize_t slice_len = end - start; Py_ssize_t subs_len = 0; for (Py_ssize_t i = 0; i < tuple_len; i++) { Py_buffer subbuf; @@ -622,7 +621,7 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, sub = byte; sub_len = 1; } - if (sub_len > slice_len) { + if (sub_len > end - start) { continue; } subs[subs_len] = sub; From 648725de19a3252b8230d451c22f240f1d2b6dcb Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 1 Jun 2024 17:47:38 +0200 Subject: [PATCH 60/86] Reverse comparison --- Objects/bytes_methods.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 821317640504ed..824034f55f1af0 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -502,7 +502,7 @@ _Py_fast_find(const char *str, Py_ssize_t len, Py_ssize_t start, Py_ssize_t end, int direction) { - if (end - start < sub_len) + if (sub_len > end - start) return -1; else if (sub_len == 1) { Py_ssize_t result; From 4fe06fb513fd8de942de2fd09bfc8ab33de95cc5 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 1 Jun 2024 17:50:22 +0200 Subject: [PATCH 61/86] Add brackets --- Objects/bytes_methods.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 824034f55f1af0..ed9a6609442635 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -502,23 +502,29 @@ _Py_fast_find(const char *str, Py_ssize_t len, Py_ssize_t start, Py_ssize_t end, int direction) { - if (sub_len > end - start) + if (sub_len > end - start) { return -1; + } else if (sub_len == 1) { Py_ssize_t result; - if (direction > 0) + if (direction > 0) { result = stringlib_find_char(str + start, end - start, *sub); - else + } + else { result = stringlib_rfind_char(str + start, end - start, *sub); - if (result >= 0) + } + if (result >= 0) { result += start; + } return result; } else { - if (direction > 0) + if (direction > 0) { return stringlib_find_slice(str, len, sub, sub_len, start, end); - else + } + else { return stringlib_rfind_slice(str, len, sub, sub_len, start, end); + } } } From 145f45dff4c0579f7db500ded904223ad31df224 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 1 Jun 2024 17:52:53 +0200 Subject: [PATCH 62/86] Remove continue --- Objects/bytes_methods.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index ed9a6609442635..6172876d695f48 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -627,12 +627,11 @@ find_first_internal(const char *str, Py_ssize_t len, const char *function_name, sub = byte; sub_len = 1; } - if (sub_len > end - start) { - continue; + if (sub_len <= end - start) { + subs[subs_len] = sub; + sub_lengths[subs_len] = sub_len; + subs_len++; } - subs[subs_len] = sub; - sub_lengths[subs_len] = sub_len; - subs_len++; } /* FIND SUBSTRINGS */ From c96775ce027602ce5a3c134fca9772b193797a0b Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 1 Jun 2024 19:13:15 +0200 Subject: [PATCH 63/86] 2 arguments per line --- Objects/bytes_methods.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 6172876d695f48..40f0d72d085ac7 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -564,8 +564,9 @@ find_internal(const char *str, Py_ssize_t len, #define RFIND_CHUNK_SIZE FIND_CHUNK_SIZE static Py_ssize_t -find_first_internal(const char *str, Py_ssize_t len, const char *function_name, - PyObject *subobj, Py_ssize_t start, Py_ssize_t end, +find_first_internal(const char *str, Py_ssize_t len, + const char *function_name, PyObject *subobj, + Py_ssize_t start, Py_ssize_t end, int direction) { if (!PyTuple_Check(subobj)) { From b4722c4ad63abdf0904283d48d63987d8ca24e9a Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 1 Jun 2024 19:40:34 +0200 Subject: [PATCH 64/86] Exclude long needles --- Objects/unicodeobject.c | 77 +++++++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 19 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5f853115520923..55ea2354a18296 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9155,10 +9155,33 @@ any_find_first_slice(PyObject *str, const char *function_name, } return any_find_slice(str, subobj, start, end, direction); } + Py_ssize_t result = -2; // Error + PyObject **substrs = NULL; + Py_ssize_t *sub_lengths = NULL; + + /* ALLOCATE MEMORY */ Py_ssize_t tuple_len = PyTuple_GET_SIZE(subobj); - if (tuple_len == 0) { - return -1; + if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(PyObject *) || + (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_ssize_t)) + { + PyErr_SetString(PyExc_OverflowError, "tuple is too long"); + goto exit; + } + substrs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(PyObject *)); + if (!substrs) { + PyErr_NoMemory(); + goto exit; } + sub_lengths = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(Py_ssize_t)); + if (!sub_lengths) { + PyErr_NoMemory(); + goto exit; + } + + /* STORE SUBSTRINGS */ + Py_ssize_t len = PyUnicode_GET_LENGTH(str); + ADJUST_INDICES(start, end, len); + Py_ssize_t subs_len = 0; for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *substr = PyTuple_GET_ITEM(subobj, i); if (!PyUnicode_Check(substr)) { @@ -9166,16 +9189,25 @@ any_find_first_slice(PyObject *str, const char *function_name, "tuple for %.200s must only contain str, " "not %.100s", function_name, Py_TYPE(substr)->tp_name); - return -2; + goto exit; + } + Py_ssize_t sub_len = PyUnicode_GET_LENGTH(substr); + if (sub_len <= end - start) { + substrs[subs_len] = substr; + sub_lengths[subs_len] = sub_len; + subs_len++; } } - if (tuple_len == 1) { - PyObject *substr = PyTuple_GET_ITEM(subobj, 0); - return any_find_slice(str, substr, start, end, direction); + + /* FIND SUBSTRINGS */ + result = -1; // Not found (yet) + if (subs_len == 0) { + goto exit; + } + if (subs_len == 1) { + result = any_find_slice(str, substrs[0], start, end, direction); + goto exit; } - Py_ssize_t result = -1; - Py_ssize_t len = PyUnicode_GET_LENGTH(str); - ADJUST_INDICES(start, end, len); if (direction > 0) { assert(FIND_CHUNK_SIZE > 0); for (; result == -1; start += FIND_CHUNK_SIZE) { @@ -9186,10 +9218,10 @@ any_find_first_slice(PyObject *str, const char *function_name, else { cur_end = start - 1 + FIND_CHUNK_SIZE; } - for (Py_ssize_t i = 0; i < tuple_len; i++) { - PyObject *substr = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t sub_len = PyUnicode_GET_LENGTH(substr); + for (Py_ssize_t i = 0; i < subs_len; i++) { Py_ssize_t new_result; + PyObject *substr = substrs[i]; + Py_ssize_t sub_len = sub_lengths[i]; if (cur_end >= end - sub_len) { // Guard overflow new_result = any_find_slice(str, substr, start, end, +1); } @@ -9198,11 +9230,13 @@ any_find_first_slice(PyObject *str, const char *function_name, cur_end + sub_len, +1); } if (new_result == -2) { - return -2; + result = -2; + goto exit; } if (new_result != -1) { if (new_result == start) { - return start; + result = start; + goto exit; } cur_end = new_result - 1; result = new_result; @@ -9221,10 +9255,10 @@ any_find_first_slice(PyObject *str, const char *function_name, if (cur_start < start) { cur_start = start; } - for (Py_ssize_t i = 0; i < tuple_len; i++) { - PyObject *substr = PyTuple_GET_ITEM(subobj, i); - Py_ssize_t sub_len = PyUnicode_GET_LENGTH(substr); + for (Py_ssize_t i = 0; i < subs_len; i++) { Py_ssize_t new_result; + PyObject *substr = substrs[i]; + Py_ssize_t sub_len = sub_lengths[i]; if (cur_end >= end - sub_len) { // Guard overflow new_result = any_find_slice(str, substr, cur_start, end, -1); @@ -9234,11 +9268,13 @@ any_find_first_slice(PyObject *str, const char *function_name, cur_end + sub_len, -1); } if (new_result == -2) { - return -2; + result = -2; + goto exit; } if (new_result != -1) { if (new_result == cur_end) { - return cur_end; + result = cur_end; + goto exit; } cur_start = new_result + 1; result = new_result; @@ -9246,6 +9282,9 @@ any_find_first_slice(PyObject *str, const char *function_name, } } } +exit: + PyMem_RawFree(substrs); + PyMem_RawFree(sub_lengths); return result; } From 5c8751a7a25e578615ed1f3ee5f38b103a59be24 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sat, 1 Jun 2024 19:51:23 +0200 Subject: [PATCH 65/86] Include needles with a larger kind --- Objects/unicodeobject.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 55ea2354a18296..907ac4d6dc7b0f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9179,6 +9179,7 @@ any_find_first_slice(PyObject *str, const char *function_name, } /* STORE SUBSTRINGS */ + int kind = PyUnicode_KIND(str); Py_ssize_t len = PyUnicode_GET_LENGTH(str); ADJUST_INDICES(start, end, len); Py_ssize_t subs_len = 0; @@ -9191,6 +9192,10 @@ any_find_first_slice(PyObject *str, const char *function_name, Py_TYPE(substr)->tp_name); goto exit; } + int sub_kind = PyUnicode_KIND(substr); + if (sub_kind > kind) { + continue; + } Py_ssize_t sub_len = PyUnicode_GET_LENGTH(substr); if (sub_len <= end - start) { substrs[subs_len] = substr; From c219cf5a48a5ec4057fd37e47aaa4e9087f31d6f Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 15:20:53 +0200 Subject: [PATCH 66/86] fast find for strings --- Objects/bytes_methods.c | 56 +++++----- Objects/unicodeobject.c | 230 +++++++++++++++++++++++++--------------- 2 files changed, 178 insertions(+), 108 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 40f0d72d085ac7..3cb86b1b9cf61f 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -502,6 +502,8 @@ _Py_fast_find(const char *str, Py_ssize_t len, Py_ssize_t start, Py_ssize_t end, int direction) { + assert(start >= 0); + assert(end <= len); if (sub_len > end - start) { return -1; } @@ -513,10 +515,10 @@ _Py_fast_find(const char *str, Py_ssize_t len, else { result = stringlib_rfind_char(str + start, end - start, *sub); } - if (result >= 0) { - result += start; + if (result == -1) { + return -1; } - return result; + return start + result; } else { if (direction > 0) { @@ -573,15 +575,16 @@ find_first_internal(const char *str, Py_ssize_t len, return find_internal(str, len, function_name, subobj, start, end, direction); } - Py_ssize_t result = -2; // Error + Py_ssize_t result, tuple_len, subs_len; char *bytes = NULL; const char **subs = NULL; - Py_ssize_t *sub_lengths = NULL; + Py_ssize_t *sub_lens = NULL; /* ALLOCATE MEMORY */ - Py_ssize_t tuple_len = PyTuple_GET_SIZE(subobj); + result = -2; // Error + tuple_len = PyTuple_GET_SIZE(subobj); if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char) || - (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char *) || + (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char*) || (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_ssize_t)) { PyErr_SetString(PyExc_OverflowError, "tuple is too long"); @@ -592,27 +595,28 @@ find_first_internal(const char *str, Py_ssize_t len, PyErr_NoMemory(); goto exit; } - subs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(char *)); + subs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(char*)); if (!subs) { PyErr_NoMemory(); goto exit; } - sub_lengths = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(Py_ssize_t)); - if (!sub_lengths) { + sub_lens = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(Py_ssize_t)); + if (!sub_lens) { PyErr_NoMemory(); goto exit; } /* STORE SUBSTRINGS */ ADJUST_INDICES(start, end, len); - Py_ssize_t subs_len = 0; + subs_len = 0; for (Py_ssize_t i = 0; i < tuple_len; i++) { + PyObject *subseq; + char *byte, *sub; Py_buffer subbuf; - const char *sub; Py_ssize_t sub_len; - PyObject *subseq = PyTuple_GET_ITEM(subobj, i); - char *byte = &bytes[subs_len]; + subseq = PyTuple_GET_ITEM(subobj, i); + byte = &bytes[subs_len]; if (!parse_args_finds_byte(function_name, &subseq, byte)) { goto exit; } @@ -630,7 +634,7 @@ find_first_internal(const char *str, Py_ssize_t len, } if (sub_len <= end - start) { subs[subs_len] = sub; - sub_lengths[subs_len] = sub_len; + sub_lens[subs_len] = sub_len; subs_len++; } } @@ -641,7 +645,7 @@ find_first_internal(const char *str, Py_ssize_t len, goto exit; } if (subs_len == 1) { - result = _Py_fast_find(str, len, subs[0], sub_lengths[0], start, end, + result = _Py_fast_find(str, len, subs[0], sub_lens[0], start, end, direction); goto exit; } @@ -656,9 +660,11 @@ find_first_internal(const char *str, Py_ssize_t len, cur_end = start - 1 + FIND_CHUNK_SIZE; } for (Py_ssize_t i = 0; i < subs_len; i++) { - Py_ssize_t new_result; - const char *sub = subs[i]; - Py_ssize_t sub_len = sub_lengths[i]; + const char *sub; + Py_ssize_t sub_len, new_result; + + sub = subs[i]; + sub_len = sub_lens[i]; if (cur_end >= end - sub_len) { // Guard overflow new_result = _Py_fast_find(str, len, sub, sub_len, start, end, +1); @@ -682,17 +688,19 @@ find_first_internal(const char *str, Py_ssize_t len, } } else { - Py_ssize_t cur_end = end; assert(RFIND_CHUNK_SIZE > 0); + Py_ssize_t cur_end = end; for (; result == -1 && cur_end >= start; cur_end -= RFIND_CHUNK_SIZE) { Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE + 1; if (cur_start < start) { cur_start = start; } for (Py_ssize_t i = 0; i < subs_len; i++) { - Py_ssize_t new_result; - const char *sub = subs[i]; - Py_ssize_t sub_len = sub_lengths[i]; + const char *sub; + Py_ssize_t sub_len, new_result; + + sub = subs[i]; + sub_len = sub_lens[i]; if (cur_end >= end - sub_len) { // Guard overflow new_result = _Py_fast_find(str, len, sub, sub_len, cur_start, end, -1); @@ -716,7 +724,7 @@ find_first_internal(const char *str, Py_ssize_t len, exit: PyMem_RawFree(bytes); PyMem_RawFree(subs); - PyMem_RawFree(sub_lengths); + PyMem_RawFree(sub_lens); return result; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 907ac4d6dc7b0f..c9c21e8eb6e0e1 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9055,93 +9055,119 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) } static Py_ssize_t -any_find_slice(PyObject* s1, PyObject* s2, - Py_ssize_t start, - Py_ssize_t end, - int direction) +fast_find(const void *str, int kind, int len, + const void *sub, int sub_kind, int sub_len, + Py_ssize_t start, Py_ssize_t end, + int isascii, int direction) { - int kind1, kind2; - const void *buf1, *buf2; - Py_ssize_t len1, len2, result; - - kind1 = PyUnicode_KIND(s1); - kind2 = PyUnicode_KIND(s2); - if (kind1 < kind2) - return -1; + Py_ssize_t result; - len1 = PyUnicode_GET_LENGTH(s1); - len2 = PyUnicode_GET_LENGTH(s2); - ADJUST_INDICES(start, end, len1); - if (end - start < len2) + assert(sub_kind <= kind); + assert(start >= 0); + assert(end <= len); + if (sub_len > end - start) { return -1; - - buf1 = PyUnicode_DATA(s1); - buf2 = PyUnicode_DATA(s2); - if (len2 == 1) { - Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); - result = findchar((const char *)buf1 + kind1*start, - kind1, end - start, ch, direction); - if (result == -1) + } + else if (sub_len == 1) { + Py_UCS4 ch = PyUnicode_READ(sub_kind, sub, 0); + result = findchar((const char *)str + kind * start, kind, end - start, + ch, direction); + if (result == -1) { return -1; - else - return start + result; + } + return start + result; } - if (kind2 != kind1) { - buf2 = unicode_askind(kind2, buf2, len2, kind1); - if (!buf2) + if (sub_kind != kind) { + sub = unicode_askind(sub_kind, sub, sub_len, kind); + if (!sub) { return -2; + } } if (direction > 0) { - switch (kind1) { + switch (kind) { case PyUnicode_1BYTE_KIND: - if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) - result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); - else - result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); + if (isascii) { + result = asciilib_find_slice(str, len, sub, sub_len, start, end); + } + else { + result = ucs1lib_find_slice(str, len, sub, sub_len, start, end); + } break; case PyUnicode_2BYTE_KIND: - result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); + result = ucs2lib_find_slice(str, len, sub, sub_len, start, end); break; case PyUnicode_4BYTE_KIND: - result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); + result = ucs4lib_find_slice(str, len, sub, sub_len, start, end); break; default: Py_UNREACHABLE(); } } else { - switch (kind1) { + switch (kind) { case PyUnicode_1BYTE_KIND: - if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) - result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); - else - result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); + if (isascii) { + result = asciilib_rfind_slice(str, len, sub, sub_len, start, end); + } + else { + result = ucs1lib_rfind_slice(str, len, sub, sub_len, start, end); + } break; case PyUnicode_2BYTE_KIND: - result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); + result = ucs2lib_rfind_slice(str, len, sub, sub_len, start, end); break; case PyUnicode_4BYTE_KIND: - result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); + result = ucs4lib_rfind_slice(str, len, sub, sub_len, start, end); break; default: Py_UNREACHABLE(); } } - assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2))); - if (kind2 != kind1) - PyMem_Free((void *)buf2); + if (sub_kind != kind) { + PyMem_Free((void *)sub); + } return result; } +static Py_ssize_t +any_find_slice(PyObject* s1, PyObject* s2, + Py_ssize_t start, Py_ssize_t end, + int direction) +{ + int kind1, kind2, isascii1, isascii2; + const void *buf1, *buf2; + Py_ssize_t len1, len2; + + kind1 = PyUnicode_KIND(s1); + kind2 = PyUnicode_KIND(s2); + if (kind2 > kind1) { + return -1; + } + + isascii1 = PyUnicode_IS_ASCII(s1); + isascii2 = PyUnicode_IS_ASCII(s2); + if (!isascii2 && isascii1) { + return -1; + } + + buf1 = PyUnicode_DATA(s1); + buf2 = PyUnicode_DATA(s2); + len1 = PyUnicode_GET_LENGTH(s1); + len2 = PyUnicode_GET_LENGTH(s2); + ADJUST_INDICES(start, end, len1); + return fast_find(buf1, kind1, len1, buf2, kind2, len2, start, end, + isascii1, direction); +} + #define FIND_CHUNK_SIZE 1000 #define RFIND_CHUNK_SIZE FIND_CHUNK_SIZE static Py_ssize_t -any_find_first_slice(PyObject *str, const char *function_name, +any_find_first_slice(PyObject *strobj, const char *function_name, PyObject *subobj, Py_ssize_t start, Py_ssize_t end, int direction) { @@ -9153,53 +9179,75 @@ any_find_first_slice(PyObject *str, const char *function_name, Py_TYPE(subobj)->tp_name); return -2; } - return any_find_slice(str, subobj, start, end, direction); + return any_find_slice(strobj, subobj, start, end, direction); } - Py_ssize_t result = -2; // Error - PyObject **substrs = NULL; - Py_ssize_t *sub_lengths = NULL; + Py_ssize_t result, tuple_len, len, subs_len; + const void **subs = NULL; + int *sub_kinds = NULL; + Py_ssize_t *sub_lens = NULL; + const void *str; + int kind, isascii; /* ALLOCATE MEMORY */ - Py_ssize_t tuple_len = PyTuple_GET_SIZE(subobj); - if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(PyObject *) || + result = -2; // Error + tuple_len = PyTuple_GET_SIZE(subobj); + if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(void*) || + (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(int) || (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_ssize_t)) { PyErr_SetString(PyExc_OverflowError, "tuple is too long"); goto exit; } - substrs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(PyObject *)); - if (!substrs) { + subs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(void*)); + if (!subs) { + PyErr_NoMemory(); + goto exit; + } + sub_kinds = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(int)); + if (!sub_kinds) { PyErr_NoMemory(); goto exit; } - sub_lengths = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(Py_ssize_t)); - if (!sub_lengths) { + sub_lens = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(Py_ssize_t)); + if (!sub_lens) { PyErr_NoMemory(); goto exit; } /* STORE SUBSTRINGS */ - int kind = PyUnicode_KIND(str); - Py_ssize_t len = PyUnicode_GET_LENGTH(str); + str = PyUnicode_DATA(strobj); + kind = PyUnicode_KIND(strobj); + isascii = PyUnicode_IS_ASCII(strobj); + len = PyUnicode_GET_LENGTH(strobj); ADJUST_INDICES(start, end, len); - Py_ssize_t subs_len = 0; + subs_len = 0; for (Py_ssize_t i = 0; i < tuple_len; i++) { - PyObject *substr = PyTuple_GET_ITEM(subobj, i); + PyObject *substr; + int sub_kind, sub_isascii; + const void *sub; + Py_ssize_t sub_len; + + substr = PyTuple_GET_ITEM(subobj, i); if (!PyUnicode_Check(substr)) { PyErr_Format(PyExc_TypeError, "tuple for %.200s must only contain str, " - "not %.100s", function_name, - Py_TYPE(substr)->tp_name); + "not %.100s", function_name, Py_TYPE(substr)->tp_name); goto exit; } - int sub_kind = PyUnicode_KIND(substr); + sub_kind = PyUnicode_KIND(substr); if (sub_kind > kind) { continue; } - Py_ssize_t sub_len = PyUnicode_GET_LENGTH(substr); + sub_isascii = PyUnicode_IS_ASCII(substr); + if (!sub_isascii && isascii) { + continue; + } + sub = PyUnicode_DATA(substr); + sub_len = PyUnicode_GET_LENGTH(substr); if (sub_len <= end - start) { - substrs[subs_len] = substr; - sub_lengths[subs_len] = sub_len; + subs[subs_len] = sub; + sub_kinds[subs_len] = sub_kind; + sub_lens[subs_len] = sub_len; subs_len++; } } @@ -9210,7 +9258,8 @@ any_find_first_slice(PyObject *str, const char *function_name, goto exit; } if (subs_len == 1) { - result = any_find_slice(str, substrs[0], start, end, direction); + result = fast_find(str, kind, len, subs[0], sub_kinds[0], sub_lens[0], + start, end, isascii, direction); goto exit; } if (direction > 0) { @@ -9224,15 +9273,21 @@ any_find_first_slice(PyObject *str, const char *function_name, cur_end = start - 1 + FIND_CHUNK_SIZE; } for (Py_ssize_t i = 0; i < subs_len; i++) { - Py_ssize_t new_result; - PyObject *substr = substrs[i]; - Py_ssize_t sub_len = sub_lengths[i]; + const void *sub; + int sub_kind; + Py_ssize_t sub_len, new_result; + + sub = subs[i]; + sub_kind = sub_kinds[i]; + sub_len = sub_lens[i]; if (cur_end >= end - sub_len) { // Guard overflow - new_result = any_find_slice(str, substr, start, end, +1); + new_result = fast_find(str, kind, len, sub, sub_kind, + sub_len, start, end, isascii, +1); } else { - new_result = any_find_slice(str, substr, start, - cur_end + sub_len, +1); + new_result = fast_find(str, kind, len, sub, sub_kind, + sub_len, start, cur_end + sub_len, + isascii, +1); } if (new_result == -2) { result = -2; @@ -9253,24 +9308,30 @@ any_find_first_slice(PyObject *str, const char *function_name, } } else { - Py_ssize_t cur_end = end; assert(RFIND_CHUNK_SIZE > 0); + Py_ssize_t cur_end = end; for (; result == -1 && cur_end >= start; cur_end -= RFIND_CHUNK_SIZE) { Py_ssize_t cur_start = cur_end - RFIND_CHUNK_SIZE + 1; if (cur_start < start) { cur_start = start; } for (Py_ssize_t i = 0; i < subs_len; i++) { - Py_ssize_t new_result; - PyObject *substr = substrs[i]; - Py_ssize_t sub_len = sub_lengths[i]; + const void *sub; + int sub_kind; + Py_ssize_t sub_len, new_result; + + sub = subs[i]; + sub_kind = sub_kinds[i]; + sub_len = sub_lens[i]; if (cur_end >= end - sub_len) { // Guard overflow - new_result = any_find_slice(str, substr, cur_start, end, - -1); + new_result = fast_find(str, kind, len, sub, sub_kind, + sub_len, cur_start, end, isascii, + -1); } else { - new_result = any_find_slice(str, substr, cur_start, - cur_end + sub_len, -1); + new_result = fast_find(str, kind, len, sub, sub_kind, + sub_len, cur_start, + cur_end + sub_len, isascii, -1); } if (new_result == -2) { result = -2; @@ -9288,8 +9349,9 @@ any_find_first_slice(PyObject *str, const char *function_name, } } exit: - PyMem_RawFree(substrs); - PyMem_RawFree(sub_lengths); + PyMem_RawFree(subs); + PyMem_RawFree(sub_kinds); + PyMem_RawFree(sub_lens); return result; } From ccbfa0ef4c5fae3baba6683e4007a833f002bb97 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 15:29:13 +0200 Subject: [PATCH 67/86] Fix argument type --- Objects/bytes_methods.c | 22 +++++++++++----------- Objects/unicodeobject.c | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 3cb86b1b9cf61f..83bae052eb7fb1 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -729,20 +729,20 @@ find_first_internal(const char *str, Py_ssize_t len, } PyObject * -_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *subobj, +_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *sub, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = find_first_internal(str, len, "find", subobj, start, - end, +1); + Py_ssize_t result = find_first_internal(str, len, "find", sub, start, end, + +1); return result == -2 ? NULL : PyLong_FromSsize_t(result); } PyObject * -_Py_bytes_index(const char *str, Py_ssize_t len, PyObject *subobj, +_Py_bytes_index(const char *str, Py_ssize_t len, PyObject *sub, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = find_first_internal(str, len, "index", subobj, start, - end, +1); + Py_ssize_t result = find_first_internal(str, len, "index", sub, start, end, + +1); if (result == -1) { PyErr_SetString(PyExc_ValueError, "subsection not found"); } @@ -750,19 +750,19 @@ _Py_bytes_index(const char *str, Py_ssize_t len, PyObject *subobj, } PyObject * -_Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *subobj, +_Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *sub, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = find_first_internal(str, len, "rfind", subobj, start, - end, -1); + Py_ssize_t result = find_first_internal(str, len, "rfind", sub, start, end, + -1); return result == -2 ? NULL : PyLong_FromSsize_t(result); } PyObject * -_Py_bytes_rindex(const char *str, Py_ssize_t len, PyObject *subobj, +_Py_bytes_rindex(const char *str, Py_ssize_t len, PyObject *sub, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = find_first_internal(str, len, "rindex", subobj, start, + Py_ssize_t result = find_first_internal(str, len, "rindex", sub, start, end, -1); if (result == -1) { PyErr_SetString(PyExc_ValueError, "subsection not found"); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c9c21e8eb6e0e1..ded121b24d69db 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9055,8 +9055,8 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) } static Py_ssize_t -fast_find(const void *str, int kind, int len, - const void *sub, int sub_kind, int sub_len, +fast_find(const void *str, int kind, Py_ssize_t len, + const void *sub, int sub_kind, Py_ssize_t sub_len, Py_ssize_t start, Py_ssize_t end, int isascii, int direction) { From aada7f5a616586b3039fe21fa53a08b064efca70 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 15:32:57 +0200 Subject: [PATCH 68/86] Rename argument --- Objects/clinic/unicodeobject.c.h | 34 ++++++++++++++++---------------- Objects/unicodeobject.c | 27 ++++++++++++------------- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h index ef86f67dcaee9a..700fb7dd9a487c 100644 --- a/Objects/clinic/unicodeobject.c.h +++ b/Objects/clinic/unicodeobject.c.h @@ -369,14 +369,14 @@ PyDoc_STRVAR(unicode_find__doc__, {"find", _PyCFunction_CAST(unicode_find), METH_FASTCALL, unicode_find__doc__}, static Py_ssize_t -unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, +unicode_find_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end); static PyObject * unicode_find(PyObject *str, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; - PyObject *subobj; + PyObject *sub; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; Py_ssize_t _return_value; @@ -384,7 +384,7 @@ unicode_find(PyObject *str, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("find", nargs, 1, 3)) { goto exit; } - subobj = args[0]; + sub = args[0]; if (nargs < 2) { goto skip_optional; } @@ -398,7 +398,7 @@ unicode_find(PyObject *str, PyObject *const *args, Py_ssize_t nargs) goto exit; } skip_optional: - _return_value = unicode_find_impl(str, subobj, start, end); + _return_value = unicode_find_impl(str, sub, start, end); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -421,14 +421,14 @@ PyDoc_STRVAR(unicode_index__doc__, {"index", _PyCFunction_CAST(unicode_index), METH_FASTCALL, unicode_index__doc__}, static Py_ssize_t -unicode_index_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, +unicode_index_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end); static PyObject * unicode_index(PyObject *str, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; - PyObject *subobj; + PyObject *sub; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; Py_ssize_t _return_value; @@ -436,7 +436,7 @@ unicode_index(PyObject *str, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("index", nargs, 1, 3)) { goto exit; } - subobj = args[0]; + sub = args[0]; if (nargs < 2) { goto skip_optional; } @@ -450,7 +450,7 @@ unicode_index(PyObject *str, PyObject *const *args, Py_ssize_t nargs) goto exit; } skip_optional: - _return_value = unicode_index_impl(str, subobj, start, end); + _return_value = unicode_index_impl(str, sub, start, end); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -1064,14 +1064,14 @@ PyDoc_STRVAR(unicode_rfind__doc__, {"rfind", _PyCFunction_CAST(unicode_rfind), METH_FASTCALL, unicode_rfind__doc__}, static Py_ssize_t -unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, +unicode_rfind_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end); static PyObject * unicode_rfind(PyObject *str, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; - PyObject *subobj; + PyObject *sub; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; Py_ssize_t _return_value; @@ -1079,7 +1079,7 @@ unicode_rfind(PyObject *str, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("rfind", nargs, 1, 3)) { goto exit; } - subobj = args[0]; + sub = args[0]; if (nargs < 2) { goto skip_optional; } @@ -1093,7 +1093,7 @@ unicode_rfind(PyObject *str, PyObject *const *args, Py_ssize_t nargs) goto exit; } skip_optional: - _return_value = unicode_rfind_impl(str, subobj, start, end); + _return_value = unicode_rfind_impl(str, sub, start, end); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -1116,14 +1116,14 @@ PyDoc_STRVAR(unicode_rindex__doc__, {"rindex", _PyCFunction_CAST(unicode_rindex), METH_FASTCALL, unicode_rindex__doc__}, static Py_ssize_t -unicode_rindex_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, +unicode_rindex_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end); static PyObject * unicode_rindex(PyObject *str, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; - PyObject *subobj; + PyObject *sub; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; Py_ssize_t _return_value; @@ -1131,7 +1131,7 @@ unicode_rindex(PyObject *str, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("rindex", nargs, 1, 3)) { goto exit; } - subobj = args[0]; + sub = args[0]; if (nargs < 2) { goto skip_optional; } @@ -1145,7 +1145,7 @@ unicode_rindex(PyObject *str, PyObject *const *args, Py_ssize_t nargs) goto exit; } skip_optional: - _return_value = unicode_rindex_impl(str, subobj, start, end); + _return_value = unicode_rindex_impl(str, sub, start, end); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -1872,4 +1872,4 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=5fb071b9e4dc87fa input=a9049054013a1b77]*/ +/*[clinic end generated code: output=fb38686a525c786a input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ded121b24d69db..b17cc6c431c4cd 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11555,7 +11555,7 @@ unicode_expandtabs_impl(PyObject *self, int tabsize) str.find as unicode_find -> Py_ssize_t self as str: self - sub as subobj: object + sub: object start: slice_index(accept={int, NoneType}, c_default='0') = None end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None / @@ -11567,12 +11567,11 @@ Return -1 on failure. [clinic start generated code]*/ static Py_ssize_t -unicode_find_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, +unicode_find_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end) -/*[clinic end generated code: output=80175735a6d549d0 input=51e7b530950ab304]*/ +/*[clinic end generated code: output=da52b0913b08a960 input=a236fecd6e36a36a]*/ { - Py_ssize_t result = any_find_first_slice(str, "find", subobj, start, end, - +1); + Py_ssize_t result = any_find_first_slice(str, "find", sub, start, end, +1); return result < 0 ? -1 : result; } @@ -11626,11 +11625,11 @@ Raises ValueError when the substring is not found. [clinic start generated code]*/ static Py_ssize_t -unicode_index_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, +unicode_index_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end) -/*[clinic end generated code: output=c9af24adf2f1f99e input=f0033cf1698b6108]*/ +/*[clinic end generated code: output=4f3129c11e833e01 input=f0033cf1698b6108]*/ { - Py_ssize_t result = any_find_first_slice(str, "index", subobj, start, end, + Py_ssize_t result = any_find_first_slice(str, "index", sub, start, end, +1); if (result == -1) { PyErr_SetString(PyExc_ValueError, "substring not found"); @@ -12725,11 +12724,11 @@ Return -1 on failure. [clinic start generated code]*/ static Py_ssize_t -unicode_rfind_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, +unicode_rfind_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end) -/*[clinic end generated code: output=9d316eee7b9f9bf0 input=23ae7964e8f70b35]*/ +/*[clinic end generated code: output=0576fddc53b8616e input=23ae7964e8f70b35]*/ { - Py_ssize_t result = any_find_first_slice(str, "rfind", subobj, start, end, + Py_ssize_t result = any_find_first_slice(str, "rfind", sub, start, end, -1); return result < 0 ? -1 : result; } @@ -12744,11 +12743,11 @@ Raises ValueError when the substring is not found. [clinic start generated code]*/ static Py_ssize_t -unicode_rindex_impl(PyObject *str, PyObject *subobj, Py_ssize_t start, +unicode_rindex_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end) -/*[clinic end generated code: output=847d553d0dc10a86 input=990f3925b149c1bc]*/ +/*[clinic end generated code: output=137ad2933d200f38 input=990f3925b149c1bc]*/ { - Py_ssize_t result = any_find_first_slice(str, "rindex", subobj, start, end, + Py_ssize_t result = any_find_first_slice(str, "rindex", sub, start, end, -1); if (result == -1) { PyErr_SetString(PyExc_ValueError, "substring not found"); From 090ddeebb334de2e27864b1f5069a253d9e5c097 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 21:36:40 +0200 Subject: [PATCH 69/86] Decrease diff --- Objects/unicodeobject.c | 72 +++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index b17cc6c431c4cd..43880bdcbe8a00 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9055,80 +9055,74 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) } static Py_ssize_t -fast_find(const void *str, int kind, Py_ssize_t len, - const void *sub, int sub_kind, Py_ssize_t sub_len, +fast_find(const void *buf1, int kind1, Py_ssize_t len1, + const void *buf2, int kind2, Py_ssize_t len2, Py_ssize_t start, Py_ssize_t end, - int isascii, int direction) + int isascii1, int direction) { Py_ssize_t result; - assert(sub_kind <= kind); + assert(kind2 <= kind1); assert(start >= 0); - assert(end <= len); - if (sub_len > end - start) { + assert(end <= len1); + if (end - start < len2) return -1; - } - else if (sub_len == 1) { - Py_UCS4 ch = PyUnicode_READ(sub_kind, sub, 0); - result = findchar((const char *)str + kind * start, kind, end - start, - ch, direction); - if (result == -1) { + + if (len2 == 1) { + Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); + result = findchar((const char *)buf1 + kind1*start, + kind1, end - start, ch, direction); + if (result == -1) return -1; - } - return start + result; + else + return start + result; } - if (sub_kind != kind) { - sub = unicode_askind(sub_kind, sub, sub_len, kind); - if (!sub) { + if (kind2 != kind1) { + buf2 = unicode_askind(kind2, buf2, len2, kind1); + if (!buf2) return -2; - } } if (direction > 0) { - switch (kind) { + switch (kind1) { case PyUnicode_1BYTE_KIND: - if (isascii) { - result = asciilib_find_slice(str, len, sub, sub_len, start, end); - } - else { - result = ucs1lib_find_slice(str, len, sub, sub_len, start, end); - } + if (isascii1) + result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); + else + result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); break; case PyUnicode_2BYTE_KIND: - result = ucs2lib_find_slice(str, len, sub, sub_len, start, end); + result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); break; case PyUnicode_4BYTE_KIND: - result = ucs4lib_find_slice(str, len, sub, sub_len, start, end); + result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); break; default: Py_UNREACHABLE(); } } else { - switch (kind) { + switch (kind1) { case PyUnicode_1BYTE_KIND: - if (isascii) { - result = asciilib_rfind_slice(str, len, sub, sub_len, start, end); - } - else { - result = ucs1lib_rfind_slice(str, len, sub, sub_len, start, end); - } + if (isascii1) + result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); + else + result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); break; case PyUnicode_2BYTE_KIND: - result = ucs2lib_rfind_slice(str, len, sub, sub_len, start, end); + result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); break; case PyUnicode_4BYTE_KIND: - result = ucs4lib_rfind_slice(str, len, sub, sub_len, start, end); + result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); break; default: Py_UNREACHABLE(); } } - if (sub_kind != kind) { - PyMem_Free((void *)sub); - } + if (kind2 != kind1) + PyMem_Free((void *)buf2); return result; } From 6b85fd714b6814670fa6866625019bfdf64bca98 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 21:56:02 +0200 Subject: [PATCH 70/86] Decrease diff 2 --- Objects/bytes_methods.c | 55 ++++++++++++++++++++++++----------------- Objects/unicodeobject.c | 19 +++++++------- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 83bae052eb7fb1..059804a88f7bae 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -500,34 +500,38 @@ static Py_ssize_t _Py_fast_find(const char *str, Py_ssize_t len, const char *sub, Py_ssize_t sub_len, Py_ssize_t start, Py_ssize_t end, - int direction) + int dir) { + Py_ssize_t res; + assert(start >= 0); assert(end <= len); - if (sub_len > end - start) { - return -1; - } + if (end - start < sub_len) + res = -1; else if (sub_len == 1) { - Py_ssize_t result; - if (direction > 0) { - result = stringlib_find_char(str + start, end - start, *sub); - } - else { - result = stringlib_rfind_char(str + start, end - start, *sub); - } - if (result == -1) { - return -1; - } - return start + result; + if (dir > 0) + res = stringlib_find_char( + str + start, end - start, + *sub); + else + res = stringlib_rfind_char( + str + start, end - start, + *sub); + if (res >= 0) + res += start; } else { - if (direction > 0) { - return stringlib_find_slice(str, len, sub, sub_len, start, end); - } - else { - return stringlib_rfind_slice(str, len, sub, sub_len, start, end); - } + if (dir > 0) + res = stringlib_find_slice( + str, len, + sub, sub_len, start, end); + else + res = stringlib_rfind_slice( + str, len, + sub, sub_len, start, end); } + + return res; } Py_LOCAL_INLINE(Py_ssize_t) @@ -540,6 +544,7 @@ find_internal(const char *str, Py_ssize_t len, Py_buffer subbuf; const char *sub; Py_ssize_t sub_len; + Py_ssize_t res; if (!parse_args_finds_byte(function_name, &subobj, &byte)) { return -2; @@ -551,7 +556,6 @@ find_internal(const char *str, Py_ssize_t len, sub = subbuf.buf; sub_len = subbuf.len; - PyBuffer_Release(&subbuf); } else { sub = &byte; @@ -559,7 +563,12 @@ find_internal(const char *str, Py_ssize_t len, } ADJUST_INDICES(start, end, len); - return _Py_fast_find(str, len, sub, sub_len, start, end, dir); + res = _Py_fast_find(str, len, sub, sub_len, start, end, dir); + + if (subobj) + PyBuffer_Release(&subbuf); + + return res; } #define FIND_CHUNK_SIZE 1000 diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 43880bdcbe8a00..3a2b40f9f27fe6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9134,27 +9134,28 @@ any_find_slice(PyObject* s1, PyObject* s2, { int kind1, kind2, isascii1, isascii2; const void *buf1, *buf2; - Py_ssize_t len1, len2; + Py_ssize_t len1, len2, result; kind1 = PyUnicode_KIND(s1); kind2 = PyUnicode_KIND(s2); - if (kind2 > kind1) { + if (kind2 > kind1) return -1; - } isascii1 = PyUnicode_IS_ASCII(s1); isascii2 = PyUnicode_IS_ASCII(s2); - if (!isascii2 && isascii1) { + if (!isascii2 && isascii1) return -1; - } - buf1 = PyUnicode_DATA(s1); - buf2 = PyUnicode_DATA(s2); len1 = PyUnicode_GET_LENGTH(s1); len2 = PyUnicode_GET_LENGTH(s2); ADJUST_INDICES(start, end, len1); - return fast_find(buf1, kind1, len1, buf2, kind2, len2, start, end, - isascii1, direction); + + buf1 = PyUnicode_DATA(s1); + buf2 = PyUnicode_DATA(s2); + result = fast_find(buf1, kind1, len1, buf2, kind2, len2, start, end, + isascii1, direction); + + return result; } #define FIND_CHUNK_SIZE 1000 From 41a6c20e6c1dd91d407a91952fab2b56a01cfdac Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 22:03:49 +0200 Subject: [PATCH 71/86] Decrease diff 3 --- Objects/bytes_methods.c | 24 ++++++++++++++++++------ Objects/unicodeobject.c | 22 +++++++++++++++++----- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 059804a88f7bae..2cabbf36894020 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -743,7 +743,9 @@ _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *sub, { Py_ssize_t result = find_first_internal(str, len, "find", sub, start, end, +1); - return result == -2 ? NULL : PyLong_FromSsize_t(result); + if (result == -2) + return NULL; + return PyLong_FromSsize_t(result); } PyObject * @@ -752,10 +754,14 @@ _Py_bytes_index(const char *str, Py_ssize_t len, PyObject *sub, { Py_ssize_t result = find_first_internal(str, len, "index", sub, start, end, +1); + if (result == -2) + return NULL; if (result == -1) { - PyErr_SetString(PyExc_ValueError, "subsection not found"); + PyErr_SetString(PyExc_ValueError, + "subsection not found"); + return NULL; } - return result < 0 ? NULL : PyLong_FromSsize_t(result); + return PyLong_FromSsize_t(result); } PyObject * @@ -764,7 +770,9 @@ _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *sub, { Py_ssize_t result = find_first_internal(str, len, "rfind", sub, start, end, -1); - return result == -2 ? NULL : PyLong_FromSsize_t(result); + if (result == -2) + return NULL; + return PyLong_FromSsize_t(result); } PyObject * @@ -773,10 +781,14 @@ _Py_bytes_rindex(const char *str, Py_ssize_t len, PyObject *sub, { Py_ssize_t result = find_first_internal(str, len, "rindex", sub, start, end, -1); + if (result == -2) + return NULL; if (result == -1) { - PyErr_SetString(PyExc_ValueError, "subsection not found"); + PyErr_SetString(PyExc_ValueError, + "subsection not found"); + return NULL; } - return result < 0 ? NULL : PyLong_FromSsize_t(result); + return PyLong_FromSsize_t(result); } PyObject * diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3a2b40f9f27fe6..fabd9281c631ef 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9138,7 +9138,7 @@ any_find_slice(PyObject* s1, PyObject* s2, kind1 = PyUnicode_KIND(s1); kind2 = PyUnicode_KIND(s2); - if (kind2 > kind1) + if (kind1 < kind2) return -1; isascii1 = PyUnicode_IS_ASCII(s1); @@ -11567,7 +11567,10 @@ unicode_find_impl(PyObject *str, PyObject *sub, Py_ssize_t start, /*[clinic end generated code: output=da52b0913b08a960 input=a236fecd6e36a36a]*/ { Py_ssize_t result = any_find_first_slice(str, "find", sub, start, end, +1); - return result < 0 ? -1 : result; + if (result < 0) { + return -1; + } + return result; } static PyObject * @@ -11629,7 +11632,10 @@ unicode_index_impl(PyObject *str, PyObject *sub, Py_ssize_t start, if (result == -1) { PyErr_SetString(PyExc_ValueError, "substring not found"); } - return result < 0 ? -1 : result; + else if (result < 0) { + return -1; + } + return result; } /*[clinic input] @@ -12725,7 +12731,10 @@ unicode_rfind_impl(PyObject *str, PyObject *sub, Py_ssize_t start, { Py_ssize_t result = any_find_first_slice(str, "rfind", sub, start, end, -1); - return result < 0 ? -1 : result; + if (result < 0) { + return -1; + } + return result; } /*[clinic input] @@ -12747,7 +12756,10 @@ unicode_rindex_impl(PyObject *str, PyObject *sub, Py_ssize_t start, if (result == -1) { PyErr_SetString(PyExc_ValueError, "substring not found"); } - return result < 0 ? -1 : result; + else if (result < 0) { + return -1; + } + return result; } /*[clinic input] From 460effa9873184cd9606cc5b0f84a07003161ae5 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 22:14:18 +0200 Subject: [PATCH 72/86] Remove continue --- Objects/unicodeobject.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index fabd9281c631ef..1673367749a2d7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9219,7 +9219,6 @@ any_find_first_slice(PyObject *strobj, const char *function_name, for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *substr; int sub_kind, sub_isascii; - const void *sub; Py_ssize_t sub_len; substr = PyTuple_GET_ITEM(subobj, i); @@ -9230,17 +9229,12 @@ any_find_first_slice(PyObject *strobj, const char *function_name, goto exit; } sub_kind = PyUnicode_KIND(substr); - if (sub_kind > kind) { - continue; - } sub_isascii = PyUnicode_IS_ASCII(substr); - if (!sub_isascii && isascii) { - continue; - } - sub = PyUnicode_DATA(substr); sub_len = PyUnicode_GET_LENGTH(substr); - if (sub_len <= end - start) { - subs[subs_len] = sub; + if (!(sub_kind > kind || !sub_isascii && isascii || + sub_len > end - start)) + { + subs[subs_len] = PyUnicode_DATA(substr); sub_kinds[subs_len] = sub_kind; sub_lens[subs_len] = sub_len; subs_len++; From d1c4af6860645ef6d40d2414b2abd7d2661573a9 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 22:29:16 +0200 Subject: [PATCH 73/86] Parentheses --- Objects/unicodeobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1673367749a2d7..8e25f7f4a8f718 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9231,8 +9231,8 @@ any_find_first_slice(PyObject *strobj, const char *function_name, sub_kind = PyUnicode_KIND(substr); sub_isascii = PyUnicode_IS_ASCII(substr); sub_len = PyUnicode_GET_LENGTH(substr); - if (!(sub_kind > kind || !sub_isascii && isascii || - sub_len > end - start)) + if (sub_kind <= kind && (sub_isascii || !isascii) && + sub_len <= end - start) { subs[subs_len] = PyUnicode_DATA(substr); sub_kinds[subs_len] = sub_kind; From c19ddcfa690594df4dfada8afdd2198f4970aa19 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 22:53:08 +0200 Subject: [PATCH 74/86] Store converted needles on the heap --- Objects/bytes_methods.c | 4 ++-- Objects/unicodeobject.c | 46 +++++++++++++++++++++++++++++------------ 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 2cabbf36894020..db0e8f49ce9b2a 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -593,7 +593,7 @@ find_first_internal(const char *str, Py_ssize_t len, result = -2; // Error tuple_len = PyTuple_GET_SIZE(subobj); if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char) || - (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char*) || + (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char *) || (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_ssize_t)) { PyErr_SetString(PyExc_OverflowError, "tuple is too long"); @@ -604,7 +604,7 @@ find_first_internal(const char *str, Py_ssize_t len, PyErr_NoMemory(); goto exit; } - subs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(char*)); + subs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(char *)); if (!subs) { PyErr_NoMemory(); goto exit; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8e25f7f4a8f718..1db58e4d18c6b6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9078,12 +9078,6 @@ fast_find(const void *buf1, int kind1, Py_ssize_t len1, return start + result; } - if (kind2 != kind1) { - buf2 = unicode_askind(kind2, buf2, len2, kind1); - if (!buf2) - return -2; - } - if (direction > 0) { switch (kind1) { case PyUnicode_1BYTE_KIND: @@ -9121,9 +9115,6 @@ fast_find(const void *buf1, int kind1, Py_ssize_t len1, } } - if (kind2 != kind1) - PyMem_Free((void *)buf2); - return result; } @@ -9152,9 +9143,19 @@ any_find_slice(PyObject* s1, PyObject* s2, buf1 = PyUnicode_DATA(s1); buf2 = PyUnicode_DATA(s2); + if (len2 != 1 && kind2 != kind1) { + buf2 = unicode_askind(kind2, buf2, len2, kind1); + if (!buf2) + return -2; + } + result = fast_find(buf1, kind1, len1, buf2, kind2, len2, start, end, isascii1, direction); + assert((len2 != 1 && kind2 != kind1) == (buf2 != PyUnicode_DATA(s2))); + if (len2 != 1 && kind2 != kind1) + PyMem_Free((void *)buf2); + return result; } @@ -9177,7 +9178,7 @@ any_find_first_slice(PyObject *strobj, const char *function_name, return any_find_slice(strobj, subobj, start, end, direction); } Py_ssize_t result, tuple_len, len, subs_len; - const void **subs = NULL; + const void **heap_subs = NULL, **subs = NULL; int *sub_kinds = NULL; Py_ssize_t *sub_lens = NULL; const void *str; @@ -9186,14 +9187,19 @@ any_find_first_slice(PyObject *strobj, const char *function_name, /* ALLOCATE MEMORY */ result = -2; // Error tuple_len = PyTuple_GET_SIZE(subobj); - if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(void*) || + if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(void *) || (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(int) || (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_ssize_t)) { PyErr_SetString(PyExc_OverflowError, "tuple is too long"); goto exit; } - subs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(void*)); + heap_subs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(void *)); + if (!heap_subs) { + PyErr_NoMemory(); + goto exit; + } + subs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(void *)); if (!subs) { PyErr_NoMemory(); goto exit; @@ -9234,7 +9240,15 @@ any_find_first_slice(PyObject *strobj, const char *function_name, if (sub_kind <= kind && (sub_isascii || !isascii) && sub_len <= end - start) { - subs[subs_len] = PyUnicode_DATA(substr); + const void *sub = PyUnicode_DATA(substr); + if (sub_len != 1 && sub_kind != kind) { + sub = unicode_askind(sub_kind, sub, sub_len, kind); + if (!sub) { + return -2; + } + heap_subs[subs_len] = sub; + } + subs[subs_len] = sub; sub_kinds[subs_len] = sub_kind; sub_lens[subs_len] = sub_len; subs_len++; @@ -9338,6 +9352,12 @@ any_find_first_slice(PyObject *strobj, const char *function_name, } } exit: + if (heap_subs) { + for (Py_ssize_t i = 0; i < subs_len; i++) { + PyMem_Free((void *)heap_subs[i]); + } + } + PyMem_RawFree(heap_subs); PyMem_RawFree(subs); PyMem_RawFree(sub_kinds); PyMem_RawFree(sub_lens); From 6beae49d8781fd8aa000e0864aac06b060b3c057 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 22:56:11 +0200 Subject: [PATCH 75/86] cleanup --- Objects/unicodeobject.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1db58e4d18c6b6..3111b48a405c2a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9244,7 +9244,7 @@ any_find_first_slice(PyObject *strobj, const char *function_name, if (sub_len != 1 && sub_kind != kind) { sub = unicode_askind(sub_kind, sub, sub_len, kind); if (!sub) { - return -2; + goto exit; } heap_subs[subs_len] = sub; } @@ -9292,10 +9292,6 @@ any_find_first_slice(PyObject *strobj, const char *function_name, sub_len, start, cur_end + sub_len, isascii, +1); } - if (new_result == -2) { - result = -2; - goto exit; - } if (new_result != -1) { if (new_result == start) { result = start; @@ -9336,10 +9332,6 @@ any_find_first_slice(PyObject *strobj, const char *function_name, sub_len, cur_start, cur_end + sub_len, isascii, -1); } - if (new_result == -2) { - result = -2; - goto exit; - } if (new_result != -1) { if (new_result == cur_end) { result = cur_end; From ff514beaf1771e881e38a28fea9fbfa6f32b60b6 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 22:58:12 +0200 Subject: [PATCH 76/86] Fix uninitialised variable --- Objects/unicodeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3111b48a405c2a..38ab80539c12d2 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9186,6 +9186,7 @@ any_find_first_slice(PyObject *strobj, const char *function_name, /* ALLOCATE MEMORY */ result = -2; // Error + subs_len = 0; tuple_len = PyTuple_GET_SIZE(subobj); if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(void *) || (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(int) || @@ -9221,7 +9222,6 @@ any_find_first_slice(PyObject *strobj, const char *function_name, isascii = PyUnicode_IS_ASCII(strobj); len = PyUnicode_GET_LENGTH(strobj); ADJUST_INDICES(start, end, len); - subs_len = 0; for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *substr; int sub_kind, sub_isascii; From ff6eea201b3d135bf479634834fafea81ffb62c6 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 23:07:55 +0200 Subject: [PATCH 77/86] Try to prevent segmentation fault --- Objects/unicodeobject.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 38ab80539c12d2..479bda9d58afac 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9346,7 +9346,10 @@ any_find_first_slice(PyObject *strobj, const char *function_name, exit: if (heap_subs) { for (Py_ssize_t i = 0; i < subs_len; i++) { - PyMem_Free((void *)heap_subs[i]); + void *heap_sub = heap_subs[i]; + if (heap_sub) { + PyMem_Free(heap_sub); + } } } PyMem_RawFree(heap_subs); From 0cbf03a7322654ff5b93ad1b27599f072978ebd3 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 23:09:26 +0200 Subject: [PATCH 78/86] Fix cast --- Objects/unicodeobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 479bda9d58afac..51c1b7fb7d4d4f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9346,9 +9346,9 @@ any_find_first_slice(PyObject *strobj, const char *function_name, exit: if (heap_subs) { for (Py_ssize_t i = 0; i < subs_len; i++) { - void *heap_sub = heap_subs[i]; + const void *heap_sub = heap_subs[i]; if (heap_sub) { - PyMem_Free(heap_sub); + PyMem_Free((void *)heap_sub); } } } From d412046f7e321b2c309f88319bda362b1b228af8 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 23:55:34 +0200 Subject: [PATCH 79/86] Revert "Fix cast" This reverts commit 0cbf03a7322654ff5b93ad1b27599f072978ebd3. --- Objects/unicodeobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 51c1b7fb7d4d4f..479bda9d58afac 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9346,9 +9346,9 @@ any_find_first_slice(PyObject *strobj, const char *function_name, exit: if (heap_subs) { for (Py_ssize_t i = 0; i < subs_len; i++) { - const void *heap_sub = heap_subs[i]; + void *heap_sub = heap_subs[i]; if (heap_sub) { - PyMem_Free((void *)heap_sub); + PyMem_Free(heap_sub); } } } From 168fe84c264e7435bde3e67111a34da1bf9b40ff Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Sun, 2 Jun 2024 23:57:45 +0200 Subject: [PATCH 80/86] Revert "Try to prevent segmentation fault" This reverts commit ff6eea201b3d135bf479634834fafea81ffb62c6. --- Objects/unicodeobject.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 479bda9d58afac..38ab80539c12d2 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9346,10 +9346,7 @@ any_find_first_slice(PyObject *strobj, const char *function_name, exit: if (heap_subs) { for (Py_ssize_t i = 0; i < subs_len; i++) { - void *heap_sub = heap_subs[i]; - if (heap_sub) { - PyMem_Free(heap_sub); - } + PyMem_Free((void *)heap_subs[i]); } } PyMem_RawFree(heap_subs); From 41b11e5d626d13922cb4980f0a03383e2e926359 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Mon, 3 Jun 2024 07:03:11 +0200 Subject: [PATCH 81/86] Uninitialised memory? --- Objects/unicodeobject.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 38ab80539c12d2..b9cdf88c9e03bb 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9248,6 +9248,9 @@ any_find_first_slice(PyObject *strobj, const char *function_name, } heap_subs[subs_len] = sub; } + else { + heap_subs[subs_len] = NULL; + } subs[subs_len] = sub; sub_kinds[subs_len] = sub_kind; sub_lens[subs_len] = sub_len; From ffe115264f7ae2a3cc8770931a3e8fe0ac843150 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Mon, 3 Jun 2024 08:23:46 +0200 Subject: [PATCH 82/86] More tests --- Lib/test/string_tests.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index ec1c2e8c0efe75..a0e463223e1dda 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -180,8 +180,10 @@ def test_find(self): if self.contains_bytes: self.checkequal(-1, 'hello', 'find', 42) + self.checkequal(-1, 'hello', 'find', (42,)) else: self.checkraises(TypeError, 'hello', 'find', 42) + self.checkraises(TypeError, 'hello', 'find', (42,)) self.checkequal(0, '', 'find', '') self.checkequal(-1, '', 'find', '', 1, 1) @@ -230,7 +232,7 @@ def test_find(self): self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 10) self.checkequal(-1, '__aa__bb__', 'find', ('aa', 'bb'), 0, 3) self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 4) - self.checkraises(TypeError, 'hello', 'find', (1.0, 2.0)) + self.checkraises(TypeError, 'hello', 'find', (None,)) s = '_' * (N - 2) + 'aaaa' + '_' * (N - 2) self.checkequal((N - 2), s, 'find', ('aaaa', 'bb')) self.checkequal(2, 'foobar', 'find', ('ob', 'oba')) @@ -242,6 +244,7 @@ def test_find(self): s = 'ab' + 'c' * (10 * N) self.checkequal(1, s, 'find', ('c' * (10 * N), 'b' + 'c' * (10 * N))) self.checkequal(0, 'foobar', 'find', ('foo', 'bar')) + self.checkequal(-1, 'foo', 'find', ('foobar',)) def test_rfind(self): self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') @@ -264,8 +267,10 @@ def test_rfind(self): if self.contains_bytes: self.checkequal(-1, 'hello', 'rfind', 42) + self.checkequal(-1, 'hello', 'rfind', (42,)) else: self.checkraises(TypeError, 'hello', 'rfind', 42) + self.checkraises(TypeError, 'hello', 'rfind', (42,)) # For a variety of combinations, # verify that str.rfind() matches __contains__ @@ -308,18 +313,19 @@ def test_rfind(self): self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 0, 10) self.checkequal(-1, '__aa__bb__', 'rfind', ('aa', 'bb'), 7, 10) self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 6, 10) - self.checkraises(TypeError, 'hello', 'rfind', (1.0, 2.0)) + self.checkraises(TypeError, 'hello', 'rfind', (None,)) s = '_' * (N - 2) + 'aaaa' + '_' * (N - 2) self.checkequal((N - 2), s, 'rfind', ('aaaa', 'bb')) self.checkequal(2, 'foobar', 'rfind', ('oba', 'ob')) self.checkequal(2, 'foobar', 'rfind', ('oob', 'ob')) - self.checkequal(0, '', 'rfind', ('', '_')) + self.checkequal(1, '_', 'rfind', ('', 'a')) self.checkequal(4, '__abcd__', 'rfind', ('ab', 'cd')) self.checkequal(3, '__abc__', 'rfind', ('ab', 'bc')) self.checkequal(0, 'b' * N + 'a', 'rfind', ('b' * N, 'c')) s = 'ab' + 'c' * (10 * N) self.checkequal(2, s, 'rfind', ('c' * (10 * N), 'b' + 'c' * (10 * N))) self.checkequal(3, 'foo', 'rfind', ('', 'foo')) + self.checkequal(-1, 'foo', 'rfind', ('foobar',)) def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') From ac91f79a9dfd012ecceac0ab918bbe4720f65caa Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Mon, 3 Jun 2024 08:29:26 +0200 Subject: [PATCH 83/86] Rename parameter --- Objects/unicodeobject.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index b9cdf88c9e03bb..2ffa17fc9cb2cd 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9058,7 +9058,7 @@ static Py_ssize_t fast_find(const void *buf1, int kind1, Py_ssize_t len1, const void *buf2, int kind2, Py_ssize_t len2, Py_ssize_t start, Py_ssize_t end, - int isascii1, int direction) + int isascii, int direction) { Py_ssize_t result; @@ -9081,7 +9081,7 @@ fast_find(const void *buf1, int kind1, Py_ssize_t len1, if (direction > 0) { switch (kind1) { case PyUnicode_1BYTE_KIND: - if (isascii1) + if (isascii) result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); else result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); @@ -9099,7 +9099,7 @@ fast_find(const void *buf1, int kind1, Py_ssize_t len1, else { switch (kind1) { case PyUnicode_1BYTE_KIND: - if (isascii1) + if (isascii) result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); else result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); From 67519928549d6ddcc084e6723245bc0643f2cb5a Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Mon, 3 Jun 2024 08:33:12 +0200 Subject: [PATCH 84/86] Unnest --- Objects/unicodeobject.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2ffa17fc9cb2cd..5b256d9af89105 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9241,16 +9241,16 @@ any_find_first_slice(PyObject *strobj, const char *function_name, sub_len <= end - start) { const void *sub = PyUnicode_DATA(substr); - if (sub_len != 1 && sub_kind != kind) { + if (sub_len == 1 || sub_kind == kind) { + heap_subs[subs_len] = NULL; + } + else { sub = unicode_askind(sub_kind, sub, sub_len, kind); if (!sub) { goto exit; } heap_subs[subs_len] = sub; } - else { - heap_subs[subs_len] = NULL; - } subs[subs_len] = sub; sub_kinds[subs_len] = sub_kind; sub_lens[subs_len] = sub_len; From 44aebd10cab7196919ee93a47a9a8aa882269024 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Mon, 3 Jun 2024 09:33:53 +0200 Subject: [PATCH 85/86] Keep buffers acquired during search --- Objects/bytes_methods.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index db0e8f49ce9b2a..2b8c6208e8d956 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -586,6 +586,7 @@ find_first_internal(const char *str, Py_ssize_t len, } Py_ssize_t result, tuple_len, subs_len; char *bytes = NULL; + Py_buffer *buffers = NULL; const char **subs = NULL; Py_ssize_t *sub_lens = NULL; @@ -593,6 +594,7 @@ find_first_internal(const char *str, Py_ssize_t len, result = -2; // Error tuple_len = PyTuple_GET_SIZE(subobj); if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char) || + (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_buffer) || (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char *) || (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_ssize_t)) { @@ -604,6 +606,11 @@ find_first_internal(const char *str, Py_ssize_t len, PyErr_NoMemory(); goto exit; } + buffers = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(Py_buffer)); + if (!buffers) { + PyErr_NoMemory(); + goto exit; + } subs = PyMem_RawMalloc(((size_t)tuple_len) * sizeof(char *)); if (!subs) { PyErr_NoMemory(); @@ -621,21 +628,21 @@ find_first_internal(const char *str, Py_ssize_t len, for (Py_ssize_t i = 0; i < tuple_len; i++) { PyObject *subseq; char *byte, *sub; - Py_buffer subbuf; + Py_buffer *buffer; Py_ssize_t sub_len; subseq = PyTuple_GET_ITEM(subobj, i); byte = &bytes[subs_len]; + buffer = &buffers[i]; if (!parse_args_finds_byte(function_name, &subseq, byte)) { goto exit; } else if (subseq) { - if (PyObject_GetBuffer(subseq, &subbuf, PyBUF_SIMPLE) != 0) { + if (PyObject_GetBuffer(subseq, buffer, PyBUF_SIMPLE) != 0) { goto exit; } - sub = subbuf.buf; - sub_len = subbuf.len; - PyBuffer_Release(&subbuf); + sub = buffer->buf; + sub_len = buffer->len; } else { sub = byte; @@ -732,6 +739,12 @@ find_first_internal(const char *str, Py_ssize_t len, } exit: PyMem_RawFree(bytes); + if (buffers) { + for (Py_ssize_t i = 0; i < tuple_len; i++) { + PyBuffer_Release(&buffers[i]); + } + } + PyMem_RawFree(buffers); PyMem_RawFree(subs); PyMem_RawFree(sub_lens); return result; From 9a51fd9c1a5d119028ff77e26b3a4cc12c732104 Mon Sep 17 00:00:00 2001 From: Nineteendo Date: Mon, 3 Jun 2024 10:14:26 +0200 Subject: [PATCH 86/86] Add `buffers_len` --- Objects/bytes_methods.c | 8 +++++--- Objects/unicodeobject.c | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 2b8c6208e8d956..1b31cfaef55d21 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -584,7 +584,7 @@ find_first_internal(const char *str, Py_ssize_t len, return find_internal(str, len, function_name, subobj, start, end, direction); } - Py_ssize_t result, tuple_len, subs_len; + Py_ssize_t result, buffers_len, tuple_len, subs_len; char *bytes = NULL; Py_buffer *buffers = NULL; const char **subs = NULL; @@ -592,6 +592,7 @@ find_first_internal(const char *str, Py_ssize_t len, /* ALLOCATE MEMORY */ result = -2; // Error + buffers_len = 0; tuple_len = PyTuple_GET_SIZE(subobj); if ((size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(char) || (size_t)tuple_len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_buffer) || @@ -633,7 +634,7 @@ find_first_internal(const char *str, Py_ssize_t len, subseq = PyTuple_GET_ITEM(subobj, i); byte = &bytes[subs_len]; - buffer = &buffers[i]; + buffer = &buffers[buffers_len]; if (!parse_args_finds_byte(function_name, &subseq, byte)) { goto exit; } @@ -643,6 +644,7 @@ find_first_internal(const char *str, Py_ssize_t len, } sub = buffer->buf; sub_len = buffer->len; + buffers_len++; } else { sub = byte; @@ -740,7 +742,7 @@ find_first_internal(const char *str, Py_ssize_t len, exit: PyMem_RawFree(bytes); if (buffers) { - for (Py_ssize_t i = 0; i < tuple_len; i++) { + for (Py_ssize_t i = 0; i < buffers_len; i++) { PyBuffer_Release(&buffers[i]); } } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5b256d9af89105..435dd48ba5ee56 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9177,7 +9177,7 @@ any_find_first_slice(PyObject *strobj, const char *function_name, } return any_find_slice(strobj, subobj, start, end, direction); } - Py_ssize_t result, tuple_len, len, subs_len; + Py_ssize_t result, subs_len, tuple_len, len; const void **heap_subs = NULL, **subs = NULL; int *sub_kinds = NULL; Py_ssize_t *sub_lens = NULL;