From 283b2f8ffcf2e9c832b295f7a48255a054e555fb Mon Sep 17 00:00:00 2001 From: vaggelisd Date: Fri, 8 May 2026 17:04:15 +0300 Subject: [PATCH 1/3] [mypyc] Add `librt.strings.isidentifier` codepoint primitive True if a codepoint can start a valid identifier (XID_Start, per PEP 3131). The ASCII fast path covers `[A-Za-z_]` inline; non-ASCII codepoints round-trip through PyUnicode_FromOrdinal + PyUnicode_IsIdentifier so the answer matches str.isidentifier on a 1-character string. The non-ASCII path is the first allocating helper in this series, so its body lives out-of-line in codepoint_extra_ops.c (it would otherwise be emitted as a separate copy in every translation unit that includes the header). On OOM it swallows the exception via PyErr_Clear() and returns False, which keeps the function ERR_NEVER. Documented at the call site so callers don't get a surprising silent failure. Stack: depends on the librt.strings.isspace primitive. --- mypy/typeshed/stubs/librt/librt/strings.pyi | 1 + mypyc/build.py | 7 ++++++- mypyc/lib-rt/codepoint_extra_ops.c | 16 ++++++++++++++-- mypyc/lib-rt/codepoint_extra_ops.h | 19 +++++++++++++++++++ mypyc/lib-rt/setup.py | 1 + mypyc/lib-rt/strings/librt_strings.c | 4 ++++ mypyc/primitives/librt_strings_ops.py | 12 ++++++++++++ mypyc/test-data/irbuild-librt-strings.test | 14 ++++++++++++++ mypyc/test-data/run-librt-strings.test | 5 ++++- 9 files changed, 75 insertions(+), 4 deletions(-) diff --git a/mypy/typeshed/stubs/librt/librt/strings.pyi b/mypy/typeshed/stubs/librt/librt/strings.pyi index 01aee3ff758d..7a028f9e7859 100644 --- a/mypy/typeshed/stubs/librt/librt/strings.pyi +++ b/mypy/typeshed/stubs/librt/librt/strings.pyi @@ -47,3 +47,4 @@ def isspace(c: i32, /) -> bool: ... def isdigit(c: i32, /) -> bool: ... def isalnum(c: i32, /) -> bool: ... def isalpha(c: i32, /) -> bool: ... +def isidentifier(c: i32, /) -> bool: ... 
diff --git a/mypyc/build.py b/mypyc/build.py index 13bd50fef3b1..4dad74a4e349 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -54,7 +54,12 @@ class ModDesc(NamedTuple): LIBRT_MODULES = [ ModDesc("librt.internal", ["internal/librt_internal.c"], [], ["internal"]), - ModDesc("librt.strings", ["strings/librt_strings.c"], [], ["strings"]), + ModDesc( + "librt.strings", + ["strings/librt_strings.c", "codepoint_extra_ops.c"], + ["codepoint_extra_ops.h"], + ["strings"], + ), ModDesc( "librt.base64", [ diff --git a/mypyc/lib-rt/codepoint_extra_ops.c b/mypyc/lib-rt/codepoint_extra_ops.c index ca03eba4e6f5..c66351141dbf 100644 --- a/mypyc/lib-rt/codepoint_extra_ops.c +++ b/mypyc/lib-rt/codepoint_extra_ops.c @@ -4,5 +4,17 @@ // The classification helpers and the ASCII fast paths for case conversion // stay inline in codepoint_extra_ops.h; this file holds the slow paths // that round-trip through PyUnicode_FromOrdinal and CPython's Unicode -// machinery. Currently empty; populated as later commits add -// isidentifier, toupper, and tolower. +// machinery. + +bool LibRTStrings_IsIdentifier_slow(int32_t c) { + PyObject *s = PyUnicode_FromOrdinal((int)c); + if (s == NULL) { + // OOM. Swallow and return false to keep the function ERR_NEVER; + // callers expect a defined answer, not a propagated exception. + PyErr_Clear(); + return false; + } + int r = PyUnicode_IsIdentifier(s); + Py_DECREF(s); + return r == 1; +} diff --git a/mypyc/lib-rt/codepoint_extra_ops.h b/mypyc/lib-rt/codepoint_extra_ops.h index bb83f92e4b87..6c2a6c12b564 100644 --- a/mypyc/lib-rt/codepoint_extra_ops.h +++ b/mypyc/lib-rt/codepoint_extra_ops.h @@ -25,4 +25,23 @@ static inline bool LibRTStrings_IsAlpha(int32_t c) { return c >= 0 && Py_UNICODE_ISALPHA((Py_UCS4)c); } +// Slow path for non-ASCII isidentifier; defined out-of-line in +// codepoint_extra_ops.c because it allocates and calls into CPython. 
+bool LibRTStrings_IsIdentifier_slow(int32_t c); + +// True if c could start a valid identifier (matches XID_Start +// semantics, which is what str.isidentifier reports for a 1-character +// string). The ASCII fast path covers `[A-Za-z_]` inline; non-ASCII +// delegates to PyUnicode_IsIdentifier for correct PEP 3131 handling. +// Returns false on OOM in the slow path (the function stays ERR_NEVER). +static inline bool LibRTStrings_IsIdentifier(int32_t c) { + if (c < 0) return false; + if (c < 128) { + return (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || c == '_'; + } + return LibRTStrings_IsIdentifier_slow(c); +} + #endif // MYPYC_CODEPOINT_EXTRA_OPS_H diff --git a/mypyc/lib-rt/setup.py b/mypyc/lib-rt/setup.py index 371b322ca18b..1801c4e7dfa9 100644 --- a/mypyc/lib-rt/setup.py +++ b/mypyc/lib-rt/setup.py @@ -103,6 +103,7 @@ def run(self) -> None: "librt.strings", [ "strings/librt_strings.c", + "codepoint_extra_ops.c", "init.c", "int_ops.c", "exc_ops.c", diff --git a/mypyc/lib-rt/strings/librt_strings.c b/mypyc/lib-rt/strings/librt_strings.c index cbc3e5f753fa..62b4edffcd7f 100644 --- a/mypyc/lib-rt/strings/librt_strings.c +++ b/mypyc/lib-rt/strings/librt_strings.c @@ -1194,6 +1194,7 @@ DEFINE_CP_BOOL_WRAPPER(isspace, LibRTStrings_IsSpace) DEFINE_CP_BOOL_WRAPPER(isdigit, LibRTStrings_IsDigit) DEFINE_CP_BOOL_WRAPPER(isalnum, LibRTStrings_IsAlnum) DEFINE_CP_BOOL_WRAPPER(isalpha, LibRTStrings_IsAlpha) +DEFINE_CP_BOOL_WRAPPER(isidentifier, LibRTStrings_IsIdentifier) static PyMethodDef librt_strings_module_methods[] = { {"write_i16_le", (PyCFunction) write_i16_le, METH_FASTCALL, @@ -1268,6 +1269,9 @@ static PyMethodDef librt_strings_module_methods[] = { {"isalpha", cp_isalpha, METH_O, PyDoc_STR("Test whether a codepoint (i32) is a Unicode letter.") }, + {"isidentifier", cp_isidentifier, METH_O, + PyDoc_STR("Test whether a codepoint (i32) is a valid identifier start (XID_Start).") + }, {NULL, NULL, 0, NULL} }; diff --git a/mypyc/primitives/librt_strings_ops.py 
b/mypyc/primitives/librt_strings_ops.py index 93fa717cf529..312d5a16195b 100644 --- a/mypyc/primitives/librt_strings_ops.py +++ b/mypyc/primitives/librt_strings_ops.py @@ -431,3 +431,15 @@ error_kind=ERR_NEVER, dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], ) + +# isidentifier checks XID_Start semantics for a single codepoint, matching +# str.isidentifier() on a 1-character string. The non-ASCII path allocates +# but swallows OOM (returning False), keeping the function ERR_NEVER. +function_op( + name="librt.strings.isidentifier", + arg_types=[int32_rprimitive], + return_type=bool_rprimitive, + c_function_name="LibRTStrings_IsIdentifier", + error_kind=ERR_NEVER, + dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], +) diff --git a/mypyc/test-data/irbuild-librt-strings.test b/mypyc/test-data/irbuild-librt-strings.test index e5d18b6eb852..e3aaa49bd6f9 100644 --- a/mypyc/test-data/irbuild-librt-strings.test +++ b/mypyc/test-data/irbuild-librt-strings.test @@ -387,3 +387,17 @@ def is_a(c): L0: r0 = LibRTStrings_IsAlpha(c) return r0 + +[case testLibrtStringsIsIdentifierIR] +from librt.strings import isidentifier +from mypy_extensions import i32 + +def is_id(c: i32) -> bool: + return isidentifier(c) +[out] +def is_id(c): + c :: i32 + r0 :: bool +L0: + r0 = LibRTStrings_IsIdentifier(c) + return r0 diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index aa38c713d384..0a3320ff6522 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -1443,7 +1443,7 @@ def test_new_without_init_is_usable() -> None: [case testLibrtStringsCodepointClassifiers_librt] from typing import Any from mypy_extensions import i32 -from librt.strings import isspace, isdigit, isalnum, isalpha +from librt.strings import isspace, isdigit, isalnum, isalpha, isidentifier from testutil import assertRaises @@ -1455,6 +1455,7 @@ def test_codepoint_classifiers() -> None: assert not isdigit(bad) assert not isalnum(bad) 
assert not isalpha(bad) + assert not isidentifier(bad) # Verify each codepoint primitive agrees with the matching str method # across all Unicode codepoints, including the ord(chr(i)) round-trip. # Any forces generic dispatch on the str side. @@ -1466,6 +1467,7 @@ def test_codepoint_classifiers() -> None: assert isdigit(o) == isdigit(i) == a.isdigit() assert isalnum(o) == isalnum(i) == a.isalnum() assert isalpha(o) == isalpha(i) == a.isalpha() + assert isidentifier(o) == isidentifier(i) == a.isidentifier() def test_codepoint_classifiers_via_any() -> None: @@ -1476,6 +1478,7 @@ def test_codepoint_classifiers_via_any() -> None: (isdigit, "5", "a"), (isalnum, "A", " "), (isalpha, "A", " "), + (isidentifier, "A", "0"), ): f: Any = fn assert f(ord(true_input)) is True From 472aab01704158c2bc9b441e5b0f8a34750ae228 Mon Sep 17 00:00:00 2001 From: vaggelisd Date: Fri, 22 May 2026 18:31:16 +0300 Subject: [PATCH 2/3] Address review: abort on OOM, keep _extra_ops out of librt build - codepoint_extra_ops.h: include CPy.h and move the isidentifier slow path inline into LibRTStrings_IsIdentifier. Aborts via CPyError_OutOfMemory on allocation failure (the helper is ERR_NEVER, so returning a silently-wrong bool under memory pressure was the wrong contract). Matches the pattern in the sibling _extra_ops.h files (all bodies static inline, CPy.h included for runtime helpers). - codepoint_extra_ops.c: reduce to a single-line shim that #includes the header. Exists only so SourceDep("codepoint_extra_ops.c") pulls the header into user mypyc extensions in include_runtime_files mode. - build.py / lib-rt/setup.py: drop codepoint_extra_ops.c from the librt.strings module sources. The _extra_ops.c files are mypyc-internal (linked into user extensions via SourceDep in mypyc/ir/deps.py); the librt.strings Python module shouldn't depend on them, matching how bytes_extra_ops, str_extra_ops, etc. are organized. librt.strings now picks up LibRTStrings_IsIdentifier via #include of the header. 
--- mypyc/build.py | 7 +------ mypyc/lib-rt/codepoint_extra_ops.c | 24 +++++------------------- mypyc/lib-rt/codepoint_extra_ops.h | 16 ++++++++++------ mypyc/lib-rt/setup.py | 1 - 4 files changed, 16 insertions(+), 32 deletions(-) diff --git a/mypyc/build.py b/mypyc/build.py index 4dad74a4e349..13bd50fef3b1 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -54,12 +54,7 @@ class ModDesc(NamedTuple): LIBRT_MODULES = [ ModDesc("librt.internal", ["internal/librt_internal.c"], [], ["internal"]), - ModDesc( - "librt.strings", - ["strings/librt_strings.c", "codepoint_extra_ops.c"], - ["codepoint_extra_ops.h"], - ["strings"], - ), + ModDesc("librt.strings", ["strings/librt_strings.c"], [], ["strings"]), ModDesc( "librt.base64", [ diff --git a/mypyc/lib-rt/codepoint_extra_ops.c b/mypyc/lib-rt/codepoint_extra_ops.c index c66351141dbf..3eba41727d25 100644 --- a/mypyc/lib-rt/codepoint_extra_ops.c +++ b/mypyc/lib-rt/codepoint_extra_ops.c @@ -1,20 +1,6 @@ +// All codepoint helper bodies live in codepoint_extra_ops.h as static +// inline. This translation unit exists so the header is pulled into +// mypyc-compiled extensions via SourceDep("codepoint_extra_ops.c") in +// mypyc/ir/deps.py (which, in include_runtime_files mode, emits +// `#include ` into the generated __native.c). #include "codepoint_extra_ops.h" - -// Out-of-line bodies for codepoint helpers that are too large to inline. -// The classification helpers and the ASCII fast paths for case conversion -// stay inline in codepoint_extra_ops.h; this file holds the slow paths -// that round-trip through PyUnicode_FromOrdinal and CPython's Unicode -// machinery. - -bool LibRTStrings_IsIdentifier_slow(int32_t c) { - PyObject *s = PyUnicode_FromOrdinal((int)c); - if (s == NULL) { - // OOM. Swallow and return false to keep the function ERR_NEVER; - // callers expect a defined answer, not a propagated exception. 
- PyErr_Clear(); - return false; - } - int r = PyUnicode_IsIdentifier(s); - Py_DECREF(s); - return r == 1; -} diff --git a/mypyc/lib-rt/codepoint_extra_ops.h b/mypyc/lib-rt/codepoint_extra_ops.h index 6c2a6c12b564..8d7201fdd70a 100644 --- a/mypyc/lib-rt/codepoint_extra_ops.h +++ b/mypyc/lib-rt/codepoint_extra_ops.h @@ -4,6 +4,7 @@ #include #include #include +#include "CPy.h" // Codepoint helpers for librt.strings. // Inputs are signed int32_t for compatibility with mypyc's i32 type. @@ -25,15 +26,12 @@ static inline bool LibRTStrings_IsAlpha(int32_t c) { return c >= 0 && Py_UNICODE_ISALPHA((Py_UCS4)c); } -// Slow path for non-ASCII isidentifier; defined out-of-line in -// codepoint_extra_ops.c because it allocates and calls into CPython. -bool LibRTStrings_IsIdentifier_slow(int32_t c); - // True if c could start a valid identifier (matches XID_Start // semantics, which is what str.isidentifier reports for a 1-character // string). The ASCII fast path covers `[A-Za-z_]` inline; non-ASCII // delegates to PyUnicode_IsIdentifier for correct PEP 3131 handling. -// Returns false on OOM in the slow path (the function stays ERR_NEVER). +// Aborts via CPyError_OutOfMemory on allocation failure, so this helper +// stays ERR_NEVER. 
static inline bool LibRTStrings_IsIdentifier(int32_t c) { if (c < 0) return false; if (c < 128) { @@ -41,7 +39,13 @@ static inline bool LibRTStrings_IsIdentifier(int32_t c) { || (c >= 'A' && c <= 'Z') || c == '_'; } - return LibRTStrings_IsIdentifier_slow(c); + PyObject *s = PyUnicode_FromOrdinal((int)c); + if (s == NULL) { + CPyError_OutOfMemory(); + } + int r = PyUnicode_IsIdentifier(s); + Py_DECREF(s); + return r == 1; } #endif // MYPYC_CODEPOINT_EXTRA_OPS_H diff --git a/mypyc/lib-rt/setup.py b/mypyc/lib-rt/setup.py index 1801c4e7dfa9..371b322ca18b 100644 --- a/mypyc/lib-rt/setup.py +++ b/mypyc/lib-rt/setup.py @@ -103,7 +103,6 @@ def run(self) -> None: "librt.strings", [ "strings/librt_strings.c", - "codepoint_extra_ops.c", "init.c", "int_ops.c", "exc_ops.c", From 35aabc815613aa895550bd29e86014d0d2c1f948 Mon Sep 17 00:00:00 2001 From: vaggelisd Date: Mon, 25 May 2026 17:43:45 +0300 Subject: [PATCH 3/3] Route codepoint helpers through the capsule API Per follow-up review on #21522, the codepoint classifiers belong with the rest of the librt.strings public surface rather than in a shared inline header, since they implement Python-visible librt.strings functions (not mypyc-internal codegen helpers like the other _extra_ops files). Move them out of codepoint_extra_ops.h and into librt_strings.c as proper non-static functions, exposed to mypyc-compiled callers via the capsule API the same way every other LibRTStrings_* function works. This keeps the librt module files independent of mypyc-internal _extra_ops headers, matching the pattern used by BytesWriter_internal etc. The cost is one indirect call per use vs. the previous inlined macro; still substantially faster than the Python method dispatch the primitives are replacing. - librt_strings.h: bump LIBRT_STRINGS_API_VERSION 4->5, LIBRT_STRINGS_API_LEN 14->19. - librt_strings_api.h: add 5 macro entries for LibRTStrings_API[14..18]. 
- librt_strings.c: define the 5 helpers; register them in the capsule table; drop `#include "codepoint_extra_ops.h"`. - mypyc/ir/deps.py: delete CODEPOINT_EXTRA_OPS. - mypyc/primitives/librt_strings_ops.py: drop the CODEPOINT_EXTRA_OPS dep from the five codepoint primitives. - Delete codepoint_extra_ops.{c,h}. --- mypyc/ir/deps.py | 1 - mypyc/lib-rt/codepoint_extra_ops.c | 6 --- mypyc/lib-rt/codepoint_extra_ops.h | 51 -------------------- mypyc/lib-rt/strings/librt_strings.c | 59 ++++++++++++++++++++---- mypyc/lib-rt/strings/librt_strings.h | 4 +- mypyc/lib-rt/strings/librt_strings_api.h | 6 +++ mypyc/primitives/librt_strings_ops.py | 19 +++----- 7 files changed, 64 insertions(+), 82 deletions(-) delete mode 100644 mypyc/lib-rt/codepoint_extra_ops.c delete mode 100644 mypyc/lib-rt/codepoint_extra_ops.h diff --git a/mypyc/ir/deps.py b/mypyc/ir/deps.py index 0cf58c83c27b..751845d3a324 100644 --- a/mypyc/ir/deps.py +++ b/mypyc/ir/deps.py @@ -116,5 +116,4 @@ def get_header(self) -> str: STRING_WRITER_EXTRA_OPS: Final = SourceDep("stringwriter_extra_ops.c") BYTEARRAY_EXTRA_OPS: Final = SourceDep("bytearray_extra_ops.c") STR_EXTRA_OPS: Final = SourceDep("str_extra_ops.c") -CODEPOINT_EXTRA_OPS: Final = SourceDep("codepoint_extra_ops.c") VECS_EXTRA_OPS: Final = SourceDep("vecs_extra_ops.c") diff --git a/mypyc/lib-rt/codepoint_extra_ops.c b/mypyc/lib-rt/codepoint_extra_ops.c deleted file mode 100644 index 3eba41727d25..000000000000 --- a/mypyc/lib-rt/codepoint_extra_ops.c +++ /dev/null @@ -1,6 +0,0 @@ -// All codepoint helper bodies live in codepoint_extra_ops.h as static -// inline. This translation unit exists so the header is pulled into -// mypyc-compiled extensions via SourceDep("codepoint_extra_ops.c") in -// mypyc/ir/deps.py (which, in include_runtime_files mode, emits -// `#include ` into the generated __native.c). 
-#include "codepoint_extra_ops.h" diff --git a/mypyc/lib-rt/codepoint_extra_ops.h b/mypyc/lib-rt/codepoint_extra_ops.h deleted file mode 100644 index 8d7201fdd70a..000000000000 --- a/mypyc/lib-rt/codepoint_extra_ops.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef MYPYC_CODEPOINT_EXTRA_OPS_H -#define MYPYC_CODEPOINT_EXTRA_OPS_H - -#include -#include -#include -#include "CPy.h" - -// Codepoint helpers for librt.strings. -// Inputs are signed int32_t for compatibility with mypyc's i32 type. -// Negative values are treated as non-codepoints and return false. - -static inline bool LibRTStrings_IsSpace(int32_t c) { - return c >= 0 && Py_UNICODE_ISSPACE((Py_UCS4)c); -} - -static inline bool LibRTStrings_IsDigit(int32_t c) { - return c >= 0 && Py_UNICODE_ISDIGIT((Py_UCS4)c); -} - -static inline bool LibRTStrings_IsAlnum(int32_t c) { - return c >= 0 && Py_UNICODE_ISALNUM((Py_UCS4)c); -} - -static inline bool LibRTStrings_IsAlpha(int32_t c) { - return c >= 0 && Py_UNICODE_ISALPHA((Py_UCS4)c); -} - -// True if c could start a valid identifier (matches XID_Start -// semantics, which is what str.isidentifier reports for a 1-character -// string). The ASCII fast path covers `[A-Za-z_]` inline; non-ASCII -// delegates to PyUnicode_IsIdentifier for correct PEP 3131 handling. -// Aborts via CPyError_OutOfMemory on allocation failure, so this helper -// stays ERR_NEVER. 
-static inline bool LibRTStrings_IsIdentifier(int32_t c) { - if (c < 0) return false; - if (c < 128) { - return (c >= 'a' && c <= 'z') - || (c >= 'A' && c <= 'Z') - || c == '_'; - } - PyObject *s = PyUnicode_FromOrdinal((int)c); - if (s == NULL) { - CPyError_OutOfMemory(); - } - int r = PyUnicode_IsIdentifier(s); - Py_DECREF(s); - return r == 1; -} - -#endif // MYPYC_CODEPOINT_EXTRA_OPS_H diff --git a/mypyc/lib-rt/strings/librt_strings.c b/mypyc/lib-rt/strings/librt_strings.c index 62b4edffcd7f..d5245af9183f 100644 --- a/mypyc/lib-rt/strings/librt_strings.c +++ b/mypyc/lib-rt/strings/librt_strings.c @@ -4,7 +4,6 @@ #include #include #include "CPy.h" -#include "codepoint_extra_ops.h" #include "librt_strings.h" #define CPY_BOOL_ERROR 2 @@ -1154,15 +1153,50 @@ read_f64_be(PyObject *module, PyObject *const *args, size_t nargs) { return PyFloat_FromDouble(CPyBytes_ReadF64BEUnsafe(data + index)); } -// Codepoint classification helpers exposed to interpreted callers. -// The C-side names are prefixed `cp_` to avoid colliding with libc's -// isspace / isdigit / etc. Compiled callers go through the -// LibRTStrings_* static inlines in codepoint_extra_ops.h instead. -// -// All wrappers parse a single int argument as i32 (codepoint) and -// dispatch to the corresponding LibRTStrings_* function. The parse -// step accepts any int but rejects values outside the i32 range with -// OverflowError, matching the input domain of the compiled fast path. +// Codepoint classification helpers. Inputs are signed i32 for compatibility +// with mypyc's int32_rprimitive; negative values are non-codepoints and +// return false. Mypyc-compiled callers reach these through the librt.strings +// capsule API (see librt_strings_api.h); interpreted callers go through the +// `cp_*` Python wrappers below. 
+
+bool LibRTStrings_IsSpace(int32_t c) {
+    return c >= 0 && Py_UNICODE_ISSPACE((Py_UCS4)c);
+}
+
+bool LibRTStrings_IsDigit(int32_t c) {
+    return c >= 0 && Py_UNICODE_ISDIGIT((Py_UCS4)c);
+}
+
+bool LibRTStrings_IsAlnum(int32_t c) {
+    return c >= 0 && Py_UNICODE_ISALNUM((Py_UCS4)c);
+}
+
+bool LibRTStrings_IsAlpha(int32_t c) {
+    return c >= 0 && Py_UNICODE_ISALPHA((Py_UCS4)c);
+}
+
+// True if c could start a valid identifier (XID_Start, per PEP 3131).
+// c outside [0, 0x10FFFF] is rejected first: PyUnicode_FromOrdinal raises
+// ValueError there, which would otherwise hit the abort-on-NULL path.
+// ASCII `[A-Za-z_]` is inline; a real OOM aborts to keep this ERR_NEVER.
+bool LibRTStrings_IsIdentifier(int32_t c) {
+    if (c < 0 || c > 0x10FFFF) return false;
+    if (c < 128) {
+        return (c >= 'a' && c <= 'z')
+            || (c >= 'A' && c <= 'Z')
+            || c == '_';
+    }
+    PyObject *s = PyUnicode_FromOrdinal((int)c);
+    if (s == NULL) {
+        CPyError_OutOfMemory();
+    }
+    int r = PyUnicode_IsIdentifier(s);
+    Py_DECREF(s);
+    return r == 1;
+}
+
+// Python-level wrappers (`cp_*`) for interpreted callers. The C-side names
+// are prefixed `cp_` to avoid colliding with libc's isspace etc.
 
 // Parse a Python int as i32 codepoint. Returns 0 on success and writes
 // the value to *out; returns -1 on error with a Python exception set.
@@ -1317,6 +1351,11 @@ librt_strings_module_exec(PyObject *m) (void *)StringWriter_type_internal, (void *)StringWriter_write_internal, (void *)grow_string_buffer, + (void *)LibRTStrings_IsSpace, + (void *)LibRTStrings_IsDigit, + (void *)LibRTStrings_IsAlnum, + (void *)LibRTStrings_IsAlpha, + (void *)LibRTStrings_IsIdentifier, }; PyObject *c_api_object = PyCapsule_New((void *)librt_strings_api, "librt.strings._C_API", NULL); if (PyModule_Add(m, "_C_API", c_api_object) < 0) { diff --git a/mypyc/lib-rt/strings/librt_strings.h b/mypyc/lib-rt/strings/librt_strings.h index e6236f795092..903cda6b0918 100644 --- a/mypyc/lib-rt/strings/librt_strings.h +++ b/mypyc/lib-rt/strings/librt_strings.h @@ -13,11 +13,11 @@ // API version -- more recent versions must maintain backward compatibility, i.e. // we can add new features but not remove or change existing features (unless // ABI version is changed, but see the comment above). -#define LIBRT_STRINGS_API_VERSION 4 +#define LIBRT_STRINGS_API_VERSION 5 // Number of functions in the capsule API. If you add a new function, also increase // LIBRT_STRINGS_API_VERSION. 
-#define LIBRT_STRINGS_API_LEN 14 +#define LIBRT_STRINGS_API_LEN 19 typedef struct { PyObject_HEAD diff --git a/mypyc/lib-rt/strings/librt_strings_api.h b/mypyc/lib-rt/strings/librt_strings_api.h index 536b90ad7f21..406543190daf 100644 --- a/mypyc/lib-rt/strings/librt_strings_api.h +++ b/mypyc/lib-rt/strings/librt_strings_api.h @@ -6,6 +6,7 @@ import_librt_strings(void); #include #include +#include #include "librt_strings.h" extern void *LibRTStrings_API[LIBRT_STRINGS_API_LEN]; @@ -24,6 +25,11 @@ extern void *LibRTStrings_API[LIBRT_STRINGS_API_LEN]; #define LibRTStrings_StringWriter_type_internal (*(PyTypeObject* (*)(void)) LibRTStrings_API[11]) #define LibRTStrings_StringWriter_write_internal (*(char (*)(PyObject *source, PyObject *value)) LibRTStrings_API[12]) #define LibRTStrings_grow_string_buffer (*(bool (*)(StringWriterObject *obj, Py_ssize_t n)) LibRTStrings_API[13]) +#define LibRTStrings_IsSpace (*(bool (*)(int32_t c)) LibRTStrings_API[14]) +#define LibRTStrings_IsDigit (*(bool (*)(int32_t c)) LibRTStrings_API[15]) +#define LibRTStrings_IsAlnum (*(bool (*)(int32_t c)) LibRTStrings_API[16]) +#define LibRTStrings_IsAlpha (*(bool (*)(int32_t c)) LibRTStrings_API[17]) +#define LibRTStrings_IsIdentifier (*(bool (*)(int32_t c)) LibRTStrings_API[18]) static inline bool CPyBytesWriter_Check(PyObject *obj) { diff --git a/mypyc/primitives/librt_strings_ops.py b/mypyc/primitives/librt_strings_ops.py index 312d5a16195b..f025c6e95b71 100644 --- a/mypyc/primitives/librt_strings_ops.py +++ b/mypyc/primitives/librt_strings_ops.py @@ -1,9 +1,4 @@ -from mypyc.ir.deps import ( - BYTES_WRITER_EXTRA_OPS, - CODEPOINT_EXTRA_OPS, - LIBRT_STRINGS, - STRING_WRITER_EXTRA_OPS, -) +from mypyc.ir.deps import BYTES_WRITER_EXTRA_OPS, LIBRT_STRINGS, STRING_WRITER_EXTRA_OPS from mypyc.ir.ops import ERR_MAGIC, ERR_MAGIC_OVERLAPPING, ERR_NEVER from mypyc.ir.rtypes import ( bool_rprimitive, @@ -402,7 +397,7 @@ return_type=bool_rprimitive, c_function_name="LibRTStrings_IsSpace", 
error_kind=ERR_NEVER, - dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], + dependencies=[LIBRT_STRINGS], ) function_op( @@ -411,7 +406,7 @@ return_type=bool_rprimitive, c_function_name="LibRTStrings_IsDigit", error_kind=ERR_NEVER, - dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], + dependencies=[LIBRT_STRINGS], ) function_op( @@ -420,7 +415,7 @@ return_type=bool_rprimitive, c_function_name="LibRTStrings_IsAlnum", error_kind=ERR_NEVER, - dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], + dependencies=[LIBRT_STRINGS], ) function_op( @@ -429,17 +424,17 @@ return_type=bool_rprimitive, c_function_name="LibRTStrings_IsAlpha", error_kind=ERR_NEVER, - dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], + dependencies=[LIBRT_STRINGS], ) # isidentifier checks XID_Start semantics for a single codepoint, matching # str.isidentifier() on a 1-character string. The non-ASCII path allocates -# but swallows OOM (returning False), keeping the function ERR_NEVER. +# and aborts via CPyError_OutOfMemory on failure, so this stays ERR_NEVER. function_op( name="librt.strings.isidentifier", arg_types=[int32_rprimitive], return_type=bool_rprimitive, c_function_name="LibRTStrings_IsIdentifier", error_kind=ERR_NEVER, - dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], + dependencies=[LIBRT_STRINGS], )