From c0511ccb72195839d277f2371fba914d1f9f24f5 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Fri, 17 Oct 2025 20:44:51 +0100 Subject: [PATCH 01/15] Commit --- Doc/library/unicodedata.rst | 26 +++++++ Doc/whatsnew/3.15.rst | 4 + Include/cpython/unicodeobject.h | 8 ++ Lib/test/test_unicodedata.py | 27 +++++++ ...-10-17-20-42-38.gh-issue-129117.X9jr4p.rst | 2 + Modules/clinic/unicodedata.c.h | 74 ++++++++++++++++++- Modules/unicodedata.c | 58 +++++++++++++++ 7 files changed, 198 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst index 0369cd99c47c18..60d91acb91ec7e 100644 --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -144,6 +144,32 @@ following functions: 1 +.. function:: isidstart(chr, /) + + Return ``True`` if the character has the ``XID_Start`` property, ``False`` + otherwise. For example:: + + >>> unicodedata.isidstart('S') + True + >>> unicodedata.isidstart('0') + False + + .. versionadded:: next + + +.. function:: isidcontinue(chr, /) + + Return ``True`` if the character has the ``XID_Continue`` property, ``False`` + otherwise. For example:: + + >>> unicodedata.isidcontinue('S') + True + >>> unicodedata.isidcontinue(' ') + False + + .. versionadded:: next + + .. function:: decomposition(chr) Returns the character decomposition mapping assigned to the character diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index d3ae7c21a0358b..5e85bc26ce39b3 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -768,6 +768,10 @@ unicodedata * The Unicode database has been updated to Unicode 17.0.0. +* Add :func:`unicodedata.isidstart` and :func:`unicodedata.isidcontinue` + functions. + (Contributed by Stan Ulbrych in :gh:`129117`.) 
+ wave ---- diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 73e3bc44d6c9ca..5e061e84950cf5 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,6 +733,14 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); +PyAPI_FUNC(int) _PyUnicode_IsXidStart( + Py_UCS4 ch /* Unicode character */ + ); + +PyAPI_FUNC(int) _PyUnicode_IsXidContinue( + Py_UCS4 ch /* Unicode character */ + ); + // Helper array used by Py_UNICODE_ISSPACE(). PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 8013eaf6e9d851..f4947c06feddc6 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -276,6 +276,33 @@ def test_east_asian_width_9_0_changes(self): self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N') self.assertEqual(self.db.east_asian_width('\u231a'), 'W') + def test_isidstart(self): + self.assertTrue(self.db.isidstart('S')) + self.assertTrue(self.db.isidstart('\u0AD0')) # GUJARATI OM + self.assertTrue(self.db.isidstart('\u0EC6')) # LAO KO LA + self.assertTrue(self.db.isidstart('\u17DC')) # KHMER SIGN AVAKRAHASANYA + self.assertTrue(self.db.isidstart('\uA015')) # YI SYLLABLE WU + self.assertTrue(self.db.isidstart('\uFE7B')) # ARABIC KASRA MEDIAL FORM + + self.assertFalse(self.db.isidstart(' ')) + self.assertRaises(TypeError, self.db.isidstart) + self.assertRaises(TypeError, self.db.isidstart, 'xx') + + def test_isidcontinue(self): + self.assertTrue(self.db.isidcontinue('S')) + self.assertTrue(self.db.isidcontinue('_')) + self.assertTrue(self.db.isidcontinue('0')) + self.assertTrue(self.db.isidcontinue('\u00BA')) # MASCULINE ORDINAL INDICATOR + self.assertTrue(self.db.isidcontinue('\u0640')) # ARABIC TATWEEL + self.assertTrue(self.db.isidcontinue('\u0710')) # SYRIAC LETTER ALAPH + self.assertTrue(self.db.isidcontinue('\u0B3E')) # ORIYA VOWEL SIGN AA + 
self.assertTrue(self.db.isidcontinue('\u17D7')) # KHMER SIGN LEK TOO + + self.assertFalse(self.db.isidcontinue(' ')) + self.assertFalse(self.db.isidstart('0')) + self.assertRaises(TypeError, self.db.isidcontinue) + self.assertRaises(TypeError, self.db.isidcontinue, 'xx') + class UnicodeMiscTest(UnicodeDatabaseTest): @cpython_only diff --git a/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst b/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst new file mode 100644 index 00000000000000..da77a5e02158a9 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst @@ -0,0 +1,2 @@ +:mod:`unicodedata`: Add :func:`~unicodedata.isidstart` and +:func:`~unicodedata.isidcontinue` functions. diff --git a/Modules/clinic/unicodedata.c.h b/Modules/clinic/unicodedata.c.h index 345440eeee89a6..5e968cf63d2c24 100644 --- a/Modules/clinic/unicodedata.c.h +++ b/Modules/clinic/unicodedata.c.h @@ -518,6 +518,78 @@ unicodedata_UCD_name(PyObject *self, PyObject *const *args, Py_ssize_t nargs) return return_value; } +PyDoc_STRVAR(unicodedata_UCD_isidstart__doc__, +"isidstart($self, chr, /)\n" +"--\n" +"\n" +"Return True if the character has the XID_Start property, else False."); + +#define UNICODEDATA_UCD_ISIDSTART_METHODDEF \ + {"isidstart", (PyCFunction)unicodedata_UCD_isidstart, METH_O, unicodedata_UCD_isidstart__doc__}, + +static PyObject * +unicodedata_UCD_isidstart_impl(PyObject *self, int chr); + +static PyObject * +unicodedata_UCD_isidstart(PyObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + int chr; + + if (!PyUnicode_Check(arg)) { + _PyArg_BadArgument("isidstart", "argument", "a unicode character", arg); + goto exit; + } + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_Format(PyExc_TypeError, + "isidstart(): argument must be a unicode character, " + "not a string of length %zd", + PyUnicode_GET_LENGTH(arg)); + goto exit; + } + chr = PyUnicode_READ_CHAR(arg, 0); + return_value = 
unicodedata_UCD_isidstart_impl(self, chr); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_UCD_isidcontinue__doc__, +"isidcontinue($self, chr, /)\n" +"--\n" +"\n" +"Return True if the character has the XID_Continue property, else False."); + +#define UNICODEDATA_UCD_ISIDCONTINUE_METHODDEF \ + {"isidcontinue", (PyCFunction)unicodedata_UCD_isidcontinue, METH_O, unicodedata_UCD_isidcontinue__doc__}, + +static PyObject * +unicodedata_UCD_isidcontinue_impl(PyObject *self, int chr); + +static PyObject * +unicodedata_UCD_isidcontinue(PyObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + int chr; + + if (!PyUnicode_Check(arg)) { + _PyArg_BadArgument("isidcontinue", "argument", "a unicode character", arg); + goto exit; + } + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_Format(PyExc_TypeError, + "isidcontinue(): argument must be a unicode character, " + "not a string of length %zd", + PyUnicode_GET_LENGTH(arg)); + goto exit; + } + chr = PyUnicode_READ_CHAR(arg, 0); + return_value = unicodedata_UCD_isidcontinue_impl(self, chr); + +exit: + return return_value; +} + PyDoc_STRVAR(unicodedata_UCD_lookup__doc__, "lookup($self, name, /)\n" "--\n" @@ -549,4 +621,4 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg) exit: return return_value; } -/*[clinic end generated code: output=8a59d430cee41058 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=33bfeb04e3eded40 input=a9049054013a1b77]*/ diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index a3699beff7da01..3833b0595b02a0 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -1525,6 +1525,62 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value) return PyUnicode_FromString(name); } +/*[clinic input] +unicodedata.UCD.isidstart + + self: self + chr: int(accept={str}) + / + +Return True if the character has the XID_Start property, else False. 
+ +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_isidstart_impl(PyObject *self, int chr) +/*[clinic end generated code: output=29fbeaf6491d9f85 input=b71b6b1b2db3c16d]*/ +{ + Py_UCS4 c = (Py_UCS4)chr; + + if (UCD_Check(self)) { + const change_record *old = get_old_record(self, c); + if (old->category_changed == 0) { + /* unassigned */ + Py_RETURN_FALSE; + } + } + + return PyBool_FromLong(_PyUnicode_IsXidStart(c)); +} + +/*[clinic input] +unicodedata.UCD.isidcontinue + + self: self + chr: int(accept={str}) + / + +Return True if the character has the XID_Continue property, else False. + +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_isidcontinue_impl(PyObject *self, int chr) +/*[clinic end generated code: output=5ae694da0ee16534 input=01b4ccd399484e6b]*/ +{ + Py_UCS4 c = (Py_UCS4)chr; + + if (UCD_Check(self)) { + const change_record *old = get_old_record(self, c); + if (old->category_changed == 0) { + /* unassigned */ + Py_RETURN_FALSE; + } + } + + return PyBool_FromLong(_PyUnicode_IsXidContinue(c)); +} + /*[clinic input] unicodedata.UCD.lookup @@ -1590,6 +1646,8 @@ static PyMethodDef unicodedata_functions[] = { UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF UNICODEDATA_UCD_DECOMPOSITION_METHODDEF UNICODEDATA_UCD_NAME_METHODDEF + UNICODEDATA_UCD_ISIDSTART_METHODDEF + UNICODEDATA_UCD_ISIDCONTINUE_METHODDEF UNICODEDATA_UCD_LOOKUP_METHODDEF UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF UNICODEDATA_UCD_NORMALIZE_METHODDEF From 83bb3a9561d7386387487c00548b7334d910ad60 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sat, 18 Oct 2025 09:15:54 +0100 Subject: [PATCH 02/15] Fix linking on windows and refactor test --- Include/internal/pycore_unicodeobject.h | 4 ++-- Lib/test/test_unicodedata.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index b83039c1869f23..28f63ba2b94883 100644 --- a/Include/internal/pycore_unicodeobject.h +++ 
b/Include/internal/pycore_unicodeobject.h @@ -77,8 +77,8 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) /* --- Characters Type APIs ----------------------------------------------- */ -extern int _PyUnicode_IsXidStart(Py_UCS4 ch); -extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); +PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch); +PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch); extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index f4947c06feddc6..d677b06a57daba 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -285,6 +285,7 @@ def test_isidstart(self): self.assertTrue(self.db.isidstart('\uFE7B')) # ARABIC KASRA MEDIAL FORM self.assertFalse(self.db.isidstart(' ')) + self.assertFalse(self.db.isidstart('0')) self.assertRaises(TypeError, self.db.isidstart) self.assertRaises(TypeError, self.db.isidstart, 'xx') @@ -299,7 +300,6 @@ def test_isidcontinue(self): self.assertTrue(self.db.isidcontinue('\u17D7')) # KHMER SIGN LEK TOO self.assertFalse(self.db.isidcontinue(' ')) - self.assertFalse(self.db.isidstart('0')) self.assertRaises(TypeError, self.db.isidcontinue) self.assertRaises(TypeError, self.db.isidcontinue, 'xx') From 571c6223172234ff28ece4b01eec832971492cc7 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sat, 18 Oct 2025 23:03:41 +0100 Subject: [PATCH 03/15] Move to pycore_unicodedata.h --- Include/cpython/unicodeobject.h | 8 -------- Include/internal/pycore_unicodedata.h | 17 +++++++++++++++++ Include/internal/pycore_unicodeobject.h | 4 ++-- Makefile.pre.in | 1 + Modules/unicodedata.c | 1 + Objects/unicodectype.c | 1 + PCbuild/pythoncore.vcxproj | 1 + 7 files changed, 23 insertions(+), 10 deletions(-) create mode 100644 Include/internal/pycore_unicodedata.h diff --git 
a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 5e061e84950cf5..73e3bc44d6c9ca 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,14 +733,6 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); -PyAPI_FUNC(int) _PyUnicode_IsXidStart( - Py_UCS4 ch /* Unicode character */ - ); - -PyAPI_FUNC(int) _PyUnicode_IsXidContinue( - Py_UCS4 ch /* Unicode character */ - ); - // Helper array used by Py_UNICODE_ISSPACE(). PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; diff --git a/Include/internal/pycore_unicodedata.h b/Include/internal/pycore_unicodedata.h new file mode 100644 index 00000000000000..dc2f50607f8d99 --- /dev/null +++ b/Include/internal/pycore_unicodedata.h @@ -0,0 +1,17 @@ +#ifndef Py_INTERNAL_UNICODEDATA_H +#define Py_INTERNAL_UNICODEDATA_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch); +PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_UNICODEDATA_H */ \ No newline at end of file diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index 28f63ba2b94883..b83039c1869f23 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -77,8 +77,8 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) /* --- Characters Type APIs ----------------------------------------------- */ -PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch); -PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch); +extern int _PyUnicode_IsXidStart(Py_UCS4 ch); +extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); diff --git 
a/Makefile.pre.in b/Makefile.pre.in index 19423c11545c19..914e28bf298a0a 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1433,6 +1433,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_typeobject.h \ $(srcdir)/Include/internal/pycore_typevarobject.h \ $(srcdir)/Include/internal/pycore_ucnhash.h \ + $(srcdir)/Include/internal/pycore_unicodedata.h \ $(srcdir)/Include/internal/pycore_unicodeobject.h \ $(srcdir)/Include/internal/pycore_unicodeobject_generated.h \ $(srcdir)/Include/internal/pycore_unionobject.h \ diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 3833b0595b02a0..84b351dbfff0cf 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -19,6 +19,7 @@ #include "Python.h" #include "pycore_object.h" // _PyObject_VisitType() #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI +#include "pycore_unicodedata.h" // _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue #include #include // offsetof() diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 7cd0dca3d13545..52edce4db5118d 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -9,6 +9,7 @@ */ #include "Python.h" +#include "pycore_unicodedata.h" // _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue #define ALPHA_MASK 0x01 #define DECIMAL_MASK 0x02 diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 2657ee5c444e60..fe87c7c5bc2830 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -328,6 +328,7 @@ + From 904386513a3d0304e0a589c02e3f91a80203b5e9 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sat, 18 Oct 2025 23:04:49 +0100 Subject: [PATCH 04/15] lint --- Include/internal/pycore_unicodedata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Include/internal/pycore_unicodedata.h b/Include/internal/pycore_unicodedata.h index dc2f50607f8d99..4dc5becda92c5b 100644 --- a/Include/internal/pycore_unicodedata.h +++ b/Include/internal/pycore_unicodedata.h @@ -14,4 +14,4 @@ PyAPI_FUNC(int) 
_PyUnicode_IsXidContinue(Py_UCS4 ch); #ifdef __cplusplus } #endif -#endif /* !Py_INTERNAL_UNICODEDATA_H */ \ No newline at end of file +#endif /* !Py_INTERNAL_UNICODEDATA_H */ From cf197af67d65baa44049cb8f8feeeb99c611cf90 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Wed, 29 Oct 2025 13:36:19 +0000 Subject: [PATCH 05/15] Part of review --- Doc/library/unicodedata.rst | 10 ++++++---- Modules/unicodedata.c | 12 ++++-------- Objects/unicodectype.c | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst index 60d91acb91ec7e..140ba4bfde174a 100644 --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -146,8 +146,9 @@ following functions: .. function:: isidstart(chr, /) - Return ``True`` if the character has the ``XID_Start`` property, ``False`` - otherwise. For example:: + Return ``True`` if *chr* is a valid identifier start per the + `Unicode Standard Annex #31 `_, + that is, it has the ``XID_Start`` property, ``False`` otherwise. For example:: >>> unicodedata.isidstart('S') True @@ -159,8 +160,9 @@ following functions: .. function:: isidcontinue(chr, /) - Return ``True`` if the character has the ``XID_Continue`` property, ``False`` - otherwise. For example:: + Return ``True`` if *chr* is a valid identifier charcter per the + `Unicode Standard Annex #31 `_, + that is, it has the ``XID_Continue`` property, ``False`` otherwise. 
For example:: >>> unicodedata.isidcontinue('S') True diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 84b351dbfff0cf..357649e96f0459 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -1541,17 +1541,15 @@ static PyObject * unicodedata_UCD_isidstart_impl(PyObject *self, int chr) /*[clinic end generated code: output=29fbeaf6491d9f85 input=b71b6b1b2db3c16d]*/ { - Py_UCS4 c = (Py_UCS4)chr; - if (UCD_Check(self)) { - const change_record *old = get_old_record(self, c); + const change_record *old = get_old_record(self, chr); if (old->category_changed == 0) { /* unassigned */ Py_RETURN_FALSE; } } - return PyBool_FromLong(_PyUnicode_IsXidStart(c)); + return PyBool_FromLong(_PyUnicode_IsXidStart(chr)); } /*[clinic input] @@ -1569,17 +1567,15 @@ static PyObject * unicodedata_UCD_isidcontinue_impl(PyObject *self, int chr) /*[clinic end generated code: output=5ae694da0ee16534 input=01b4ccd399484e6b]*/ { - Py_UCS4 c = (Py_UCS4)chr; - if (UCD_Check(self)) { - const change_record *old = get_old_record(self, c); + const change_record *old = get_old_record(self, chr); if (old->category_changed == 0) { /* unassigned */ Py_RETURN_FALSE; } } - return PyBool_FromLong(_PyUnicode_IsXidContinue(c)); + return PyBool_FromLong(_PyUnicode_IsXidContinue(chr)); } /*[clinic input] diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 52edce4db5118d..37ebc1e0d13b81 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -9,7 +9,7 @@ */ #include "Python.h" -#include "pycore_unicodedata.h" // _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue +#include "pycore_unicodedata.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue() #define ALPHA_MASK 0x01 #define DECIMAL_MASK 0x02 From b24b994a8cee7ce6f083a26dec730bd866cae899 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Wed, 29 Oct 2025 13:46:40 +0000 Subject: [PATCH 06/15] Review --- Include/internal/pycore_unicodedata.h | 17 ----------------- Include/internal/pycore_unicodeobject.h | 6 
++++-- Makefile.pre.in | 1 - Modules/unicodedata.c | 2 +- Objects/unicodectype.c | 2 +- PCbuild/pythoncore.vcxproj | 1 - 6 files changed, 6 insertions(+), 23 deletions(-) delete mode 100644 Include/internal/pycore_unicodedata.h diff --git a/Include/internal/pycore_unicodedata.h b/Include/internal/pycore_unicodedata.h deleted file mode 100644 index 4dc5becda92c5b..00000000000000 --- a/Include/internal/pycore_unicodedata.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef Py_INTERNAL_UNICODEDATA_H -#define Py_INTERNAL_UNICODEDATA_H -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef Py_BUILD_CORE -# error "this header requires Py_BUILD_CORE define" -#endif - -PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch); -PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch); - -#ifdef __cplusplus -} -#endif -#endif /* !Py_INTERNAL_UNICODEDATA_H */ diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index b83039c1869f23..055038338ec481 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -77,8 +77,10 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) /* --- Characters Type APIs ----------------------------------------------- */ -extern int _PyUnicode_IsXidStart(Py_UCS4 ch); -extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); +// Export for 'unicodedata' shared extension. 
+PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch); +PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch); + extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); diff --git a/Makefile.pre.in b/Makefile.pre.in index 914e28bf298a0a..19423c11545c19 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1433,7 +1433,6 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_typeobject.h \ $(srcdir)/Include/internal/pycore_typevarobject.h \ $(srcdir)/Include/internal/pycore_ucnhash.h \ - $(srcdir)/Include/internal/pycore_unicodedata.h \ $(srcdir)/Include/internal/pycore_unicodeobject.h \ $(srcdir)/Include/internal/pycore_unicodeobject_generated.h \ $(srcdir)/Include/internal/pycore_unionobject.h \ diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 357649e96f0459..33a41d8f958113 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -19,7 +19,7 @@ #include "Python.h" #include "pycore_object.h" // _PyObject_VisitType() #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI -#include "pycore_unicodedata.h" // _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue +#include "pycore_unicodeobject.h" // _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue #include #include // offsetof() diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 37ebc1e0d13b81..33046367ab007b 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -9,7 +9,7 @@ */ #include "Python.h" -#include "pycore_unicodedata.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue() +#include "pycore_unicodeobject.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue() #define ALPHA_MASK 0x01 #define DECIMAL_MASK 0x02 diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index fe87c7c5bc2830..2657ee5c444e60 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -328,7 +328,6 @@ - From 
b9ae70d3294f0318694972f3a5d11c184c09c135 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Wed, 29 Oct 2025 15:34:40 +0000 Subject: [PATCH 07/15] Revert "Review" This reverts commit b24b994a8cee7ce6f083a26dec730bd866cae899. --- Include/internal/pycore_unicodedata.h | 17 +++++++++++++++++ Include/internal/pycore_unicodeobject.h | 6 ++---- Makefile.pre.in | 1 + Modules/unicodedata.c | 2 +- Objects/unicodectype.c | 2 +- PCbuild/pythoncore.vcxproj | 1 + 6 files changed, 23 insertions(+), 6 deletions(-) create mode 100644 Include/internal/pycore_unicodedata.h diff --git a/Include/internal/pycore_unicodedata.h b/Include/internal/pycore_unicodedata.h new file mode 100644 index 00000000000000..4dc5becda92c5b --- /dev/null +++ b/Include/internal/pycore_unicodedata.h @@ -0,0 +1,17 @@ +#ifndef Py_INTERNAL_UNICODEDATA_H +#define Py_INTERNAL_UNICODEDATA_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch); +PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_UNICODEDATA_H */ diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index 055038338ec481..b83039c1869f23 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -77,10 +77,8 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) /* --- Characters Type APIs ----------------------------------------------- */ -// Export for 'unicodedata' shared extension. 
-PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch); -PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch); - +extern int _PyUnicode_IsXidStart(Py_UCS4 ch); +extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); diff --git a/Makefile.pre.in b/Makefile.pre.in index 19423c11545c19..914e28bf298a0a 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1433,6 +1433,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_typeobject.h \ $(srcdir)/Include/internal/pycore_typevarobject.h \ $(srcdir)/Include/internal/pycore_ucnhash.h \ + $(srcdir)/Include/internal/pycore_unicodedata.h \ $(srcdir)/Include/internal/pycore_unicodeobject.h \ $(srcdir)/Include/internal/pycore_unicodeobject_generated.h \ $(srcdir)/Include/internal/pycore_unionobject.h \ diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 33a41d8f958113..357649e96f0459 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -19,7 +19,7 @@ #include "Python.h" #include "pycore_object.h" // _PyObject_VisitType() #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI -#include "pycore_unicodeobject.h" // _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue +#include "pycore_unicodedata.h" // _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue #include #include // offsetof() diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 33046367ab007b..37ebc1e0d13b81 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -9,7 +9,7 @@ */ #include "Python.h" -#include "pycore_unicodeobject.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue() +#include "pycore_unicodedata.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue() #define ALPHA_MASK 0x01 #define DECIMAL_MASK 0x02 diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 2657ee5c444e60..fe87c7c5bc2830 100644 --- 
a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -328,6 +328,7 @@ + From dc0752fba84afa19be4f1aa1394d2c1f25076baa Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Wed, 29 Oct 2025 15:39:06 +0000 Subject: [PATCH 08/15] Third times the charm --- .../{pycore_unicodedata.h => pycore_unicodectype.h} | 6 +++--- Makefile.pre.in | 2 +- Modules/unicodedata.c | 2 +- Objects/unicodectype.c | 2 +- PCbuild/pythoncore.vcxproj | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) rename Include/internal/{pycore_unicodedata.h => pycore_unicodectype.h} (69%) diff --git a/Include/internal/pycore_unicodedata.h b/Include/internal/pycore_unicodectype.h similarity index 69% rename from Include/internal/pycore_unicodedata.h rename to Include/internal/pycore_unicodectype.h index 4dc5becda92c5b..ce37b44492e9a0 100644 --- a/Include/internal/pycore_unicodedata.h +++ b/Include/internal/pycore_unicodectype.h @@ -1,5 +1,5 @@ -#ifndef Py_INTERNAL_UNICODEDATA_H -#define Py_INTERNAL_UNICODEDATA_H +#ifndef Py_INTERNAL_UNICODECTYPE_H +#define Py_INTERNAL_UNICODECTYPE_H #ifdef __cplusplus extern "C" { #endif @@ -14,4 +14,4 @@ PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch); #ifdef __cplusplus } #endif -#endif /* !Py_INTERNAL_UNICODEDATA_H */ +#endif /* !Py_INTERNAL_UNICODECTYPE_H */ diff --git a/Makefile.pre.in b/Makefile.pre.in index 914e28bf298a0a..5db1d11d9f6fcd 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1433,9 +1433,9 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_typeobject.h \ $(srcdir)/Include/internal/pycore_typevarobject.h \ $(srcdir)/Include/internal/pycore_ucnhash.h \ - $(srcdir)/Include/internal/pycore_unicodedata.h \ $(srcdir)/Include/internal/pycore_unicodeobject.h \ $(srcdir)/Include/internal/pycore_unicodeobject_generated.h \ + $(srcdir)/Include/internal/pycore_unicodectype.h \ $(srcdir)/Include/internal/pycore_unionobject.h \ $(srcdir)/Include/internal/pycore_uniqueid.h \ $(srcdir)/Include/internal/pycore_uop.h \ diff --git 
a/Modules/unicodedata.c b/Modules/unicodedata.c index 357649e96f0459..24ac647a944528 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -19,7 +19,7 @@ #include "Python.h" #include "pycore_object.h" // _PyObject_VisitType() #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI -#include "pycore_unicodedata.h" // _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue +#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue #include #include // offsetof() diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 37ebc1e0d13b81..fdd380190ac1ec 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -9,7 +9,7 @@ */ #include "Python.h" -#include "pycore_unicodedata.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue() +#include "pycore_unicodectype.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue() #define ALPHA_MASK 0x01 #define DECIMAL_MASK 0x02 diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index fe87c7c5bc2830..5c653b3d0cad83 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -328,9 +328,9 @@ - + From c958c2a80b57947e8aaecd2ad2e331ef06aae5de Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Wed, 29 Oct 2025 16:26:18 +0000 Subject: [PATCH 09/15] Review --- Include/internal/pycore_unicodeobject.h | 2 -- Makefile.pre.in | 2 +- Modules/unicodedata.c | 2 +- Objects/unicodeobject.c | 1 + PCbuild/pythoncore.vcxproj | 2 +- PCbuild/pythoncore.vcxproj.filters | 3 +++ 6 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index b83039c1869f23..bdcb7b807db462 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -77,8 +77,6 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) /* --- Characters Type APIs ----------------------------------------------- */ -extern int _PyUnicode_IsXidStart(Py_UCS4 ch); 
-extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); diff --git a/Makefile.pre.in b/Makefile.pre.in index 5db1d11d9f6fcd..0a1b8d028addad 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1433,9 +1433,9 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_typeobject.h \ $(srcdir)/Include/internal/pycore_typevarobject.h \ $(srcdir)/Include/internal/pycore_ucnhash.h \ + $(srcdir)/Include/internal/pycore_unicodectype.h \ $(srcdir)/Include/internal/pycore_unicodeobject.h \ $(srcdir)/Include/internal/pycore_unicodeobject_generated.h \ - $(srcdir)/Include/internal/pycore_unicodectype.h \ $(srcdir)/Include/internal/pycore_unionobject.h \ $(srcdir)/Include/internal/pycore_uniqueid.h \ $(srcdir)/Include/internal/pycore_uop.h \ diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 24ac647a944528..da4e0ddedc2a44 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -19,7 +19,7 @@ #include "Python.h" #include "pycore_object.h" // _PyObject_VisitType() #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI -#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue +#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart() #include #include // offsetof() diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index d4549b70d4dabc..26f5faf146f2ac 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -57,6 +57,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding() #include "pycore_pystate.h" // _PyInterpreterState_GET() #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI +#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart #include "pycore_unicodeobject.h" // struct _Py_unicode_state #include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings() diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 5c653b3d0cad83..a101c1b45cf25c 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -328,9 +328,9 @@ + - diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 9c12be6e9356a6..e3f261c2b92ab9 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -528,6 +528,9 @@ Include\cpython + + Include\internal + Include\internal From 33fe65f83b06e7a202eb538c5e5e0e02b051e581 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Wed, 29 Oct 2025 16:52:07 +0000 Subject: [PATCH 10/15] Move the rest --- Include/internal/pycore_unicodectype.h | 8 ++++++++ Include/internal/pycore_unicodeobject.h | 10 ---------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/Include/internal/pycore_unicodectype.h b/Include/internal/pycore_unicodectype.h index ce37b44492e9a0..523bdb56b09cde 100644 --- a/Include/internal/pycore_unicodectype.h +++ b/Include/internal/pycore_unicodectype.h @@ -8,6 +8,14 @@ extern "C" { # error "this header requires Py_BUILD_CORE define" #endif +extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); +extern int _PyUnicode_IsCased(Py_UCS4 ch); + +// Export for 'unicodedata' shared extension. 
PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch); PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch); diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index bdcb7b807db462..f384fad8713adc 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -74,16 +74,6 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) return 0; } - -/* --- Characters Type APIs ----------------------------------------------- */ - -extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); -extern int _PyUnicode_IsCased(Py_UCS4 ch); - /* --- Unicode API -------------------------------------------------------- */ // Export for '_json' shared extension From 14c953605bfca1e1312e474dc98e9bc9bad73fcb Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Wed, 29 Oct 2025 19:07:39 +0000 Subject: [PATCH 11/15] Split sentence in docs --- Doc/library/unicodedata.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst index 140ba4bfde174a..baead2a99ca7dd 100644 --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -148,7 +148,8 @@ following functions: Return ``True`` if *chr* is a valid identifier start per the `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_, - that is, it has the ``XID_Start`` property, ``False`` otherwise. For example:: + that is, it has the ``XID_Start`` property. Return ``False`` otherwise. + For example:: >>> unicodedata.isidstart('S') True @@ -162,7 +163,8 @@ following functions: Return ``True`` if *chr* is a valid identifier charcter per the `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_, - that is, it has the ``XID_Continue`` property, ``False`` otherwise. 
For example:: + that is, it has the ``XID_Continue`` property. Return ``False`` otherwise. + For example:: >>> unicodedata.isidcontinue('S') True From 6fe1ab897535b52cee1d895fea9b0ee82246997c Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Wed, 29 Oct 2025 19:19:33 +0000 Subject: [PATCH 12/15] "X" --- Doc/library/unicodedata.rst | 4 +-- Lib/test/test_unicodedata.py | 52 +++++++++++++++++----------------- Modules/clinic/unicodedata.c.h | 38 ++++++++++++------------- Modules/unicodedata.c | 16 +++++------ 4 files changed, 55 insertions(+), 55 deletions(-) diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst index baead2a99ca7dd..fe57f5a71437b2 100644 --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -144,7 +144,7 @@ following functions: 1 -.. function:: isidstart(chr, /) +.. function:: isxidstart(chr, /) Return ``True`` if *chr* is a valid identifier start per the `Unicode Standard Annex #31 `_, @@ -159,7 +159,7 @@ following functions: .. versionadded:: next -.. function:: isidcontinue(chr, /) +.. 
function:: isxidcontinue(chr, /) Return ``True`` if *chr* is a valid identifier charcter per the `Unicode Standard Annex #31 `_, diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index d677b06a57daba..a3c22a4f27ee77 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -276,32 +276,32 @@ def test_east_asian_width_9_0_changes(self): self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N') self.assertEqual(self.db.east_asian_width('\u231a'), 'W') - def test_isidstart(self): - self.assertTrue(self.db.isidstart('S')) - self.assertTrue(self.db.isidstart('\u0AD0')) # GUJARATI OM - self.assertTrue(self.db.isidstart('\u0EC6')) # LAO KO LA - self.assertTrue(self.db.isidstart('\u17DC')) # KHMER SIGN AVAKRAHASANYA - self.assertTrue(self.db.isidstart('\uA015')) # YI SYLLABLE WU - self.assertTrue(self.db.isidstart('\uFE7B')) # ARABIC KASRA MEDIAL FORM - - self.assertFalse(self.db.isidstart(' ')) - self.assertFalse(self.db.isidstart('0')) - self.assertRaises(TypeError, self.db.isidstart) - self.assertRaises(TypeError, self.db.isidstart, 'xx') - - def test_isidcontinue(self): - self.assertTrue(self.db.isidcontinue('S')) - self.assertTrue(self.db.isidcontinue('_')) - self.assertTrue(self.db.isidcontinue('0')) - self.assertTrue(self.db.isidcontinue('\u00BA')) # MASCULINE ORDINAL INDICATOR - self.assertTrue(self.db.isidcontinue('\u0640')) # ARABIC TATWEEL - self.assertTrue(self.db.isidcontinue('\u0710')) # SYRIAC LETTER ALAPH - self.assertTrue(self.db.isidcontinue('\u0B3E')) # ORIYA VOWEL SIGN AA - self.assertTrue(self.db.isidcontinue('\u17D7')) # KHMER SIGN LEK TOO - - self.assertFalse(self.db.isidcontinue(' ')) - self.assertRaises(TypeError, self.db.isidcontinue) - self.assertRaises(TypeError, self.db.isidcontinue, 'xx') + def test_isxidstart(self): + self.assertTrue(self.db.isxidstart('S')) + self.assertTrue(self.db.isxidstart('\u0AD0')) # GUJARATI OM + self.assertTrue(self.db.isxidstart('\u0EC6')) # LAO KO LA + 
self.assertTrue(self.db.isxidstart('\u17DC')) # KHMER SIGN AVAKRAHASANYA + self.assertTrue(self.db.isxidstart('\uA015')) # YI SYLLABLE WU + self.assertTrue(self.db.isxidstart('\uFE7B')) # ARABIC KASRA MEDIAL FORM + + self.assertFalse(self.db.isxidstart(' ')) + self.assertFalse(self.db.isxidstart('0')) + self.assertRaises(TypeError, self.db.isxidstart) + self.assertRaises(TypeError, self.db.isxidstart, 'xx') + + def test_isxidcontinue(self): + self.assertTrue(self.db.isxidcontinue('S')) + self.assertTrue(self.db.isxidcontinue('_')) + self.assertTrue(self.db.isxidcontinue('0')) + self.assertTrue(self.db.isxidcontinue('\u00BA')) # MASCULINE ORDINAL INDICATOR + self.assertTrue(self.db.isxidcontinue('\u0640')) # ARABIC TATWEEL + self.assertTrue(self.db.isxidcontinue('\u0710')) # SYRIAC LETTER ALAPH + self.assertTrue(self.db.isxidcontinue('\u0B3E')) # ORIYA VOWEL SIGN AA + self.assertTrue(self.db.isxidcontinue('\u17D7')) # KHMER SIGN LEK TOO + + self.assertFalse(self.db.isxidcontinue(' ')) + self.assertRaises(TypeError, self.db.isxidcontinue) + self.assertRaises(TypeError, self.db.isxidcontinue, 'xx') class UnicodeMiscTest(UnicodeDatabaseTest): diff --git a/Modules/clinic/unicodedata.c.h b/Modules/clinic/unicodedata.c.h index 5e968cf63d2c24..5fcba083c2f4ce 100644 --- a/Modules/clinic/unicodedata.c.h +++ b/Modules/clinic/unicodedata.c.h @@ -518,73 +518,73 @@ unicodedata_UCD_name(PyObject *self, PyObject *const *args, Py_ssize_t nargs) return return_value; } -PyDoc_STRVAR(unicodedata_UCD_isidstart__doc__, -"isidstart($self, chr, /)\n" +PyDoc_STRVAR(unicodedata_UCD_isxidstart__doc__, +"isxidstart($self, chr, /)\n" "--\n" "\n" "Return True if the character has the XID_Start property, else False."); -#define UNICODEDATA_UCD_ISIDSTART_METHODDEF \ - {"isidstart", (PyCFunction)unicodedata_UCD_isidstart, METH_O, unicodedata_UCD_isidstart__doc__}, +#define UNICODEDATA_UCD_ISXIDSTART_METHODDEF \ + {"isxidstart", (PyCFunction)unicodedata_UCD_isxidstart, METH_O, 
unicodedata_UCD_isxidstart__doc__}, static PyObject * -unicodedata_UCD_isidstart_impl(PyObject *self, int chr); +unicodedata_UCD_isxidstart_impl(PyObject *self, int chr); static PyObject * -unicodedata_UCD_isidstart(PyObject *self, PyObject *arg) +unicodedata_UCD_isxidstart(PyObject *self, PyObject *arg) { PyObject *return_value = NULL; int chr; if (!PyUnicode_Check(arg)) { - _PyArg_BadArgument("isidstart", "argument", "a unicode character", arg); + _PyArg_BadArgument("isxidstart", "argument", "a unicode character", arg); goto exit; } if (PyUnicode_GET_LENGTH(arg) != 1) { PyErr_Format(PyExc_TypeError, - "isidstart(): argument must be a unicode character, " + "isxidstart(): argument must be a unicode character, " "not a string of length %zd", PyUnicode_GET_LENGTH(arg)); goto exit; } chr = PyUnicode_READ_CHAR(arg, 0); - return_value = unicodedata_UCD_isidstart_impl(self, chr); + return_value = unicodedata_UCD_isxidstart_impl(self, chr); exit: return return_value; } -PyDoc_STRVAR(unicodedata_UCD_isidcontinue__doc__, -"isidcontinue($self, chr, /)\n" +PyDoc_STRVAR(unicodedata_UCD_isxidcontinue__doc__, +"isxidcontinue($self, chr, /)\n" "--\n" "\n" "Return True if the character has the XID_Continue property, else False."); -#define UNICODEDATA_UCD_ISIDCONTINUE_METHODDEF \ - {"isidcontinue", (PyCFunction)unicodedata_UCD_isidcontinue, METH_O, unicodedata_UCD_isidcontinue__doc__}, +#define UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF \ + {"isxidcontinue", (PyCFunction)unicodedata_UCD_isxidcontinue, METH_O, unicodedata_UCD_isxidcontinue__doc__}, static PyObject * -unicodedata_UCD_isidcontinue_impl(PyObject *self, int chr); +unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr); static PyObject * -unicodedata_UCD_isidcontinue(PyObject *self, PyObject *arg) +unicodedata_UCD_isxidcontinue(PyObject *self, PyObject *arg) { PyObject *return_value = NULL; int chr; if (!PyUnicode_Check(arg)) { - _PyArg_BadArgument("isidcontinue", "argument", "a unicode character", arg); + 
_PyArg_BadArgument("isxidcontinue", "argument", "a unicode character", arg); goto exit; } if (PyUnicode_GET_LENGTH(arg) != 1) { PyErr_Format(PyExc_TypeError, - "isidcontinue(): argument must be a unicode character, " + "isxidcontinue(): argument must be a unicode character, " "not a string of length %zd", PyUnicode_GET_LENGTH(arg)); goto exit; } chr = PyUnicode_READ_CHAR(arg, 0); - return_value = unicodedata_UCD_isidcontinue_impl(self, chr); + return_value = unicodedata_UCD_isxidcontinue_impl(self, chr); exit: return return_value; @@ -621,4 +621,4 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg) exit: return return_value; } -/*[clinic end generated code: output=33bfeb04e3eded40 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=c5e56c8f6bb80f93 input=a9049054013a1b77]*/ diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index da4e0ddedc2a44..a6094676d4194c 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -1527,7 +1527,7 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value) } /*[clinic input] -unicodedata.UCD.isidstart +unicodedata.UCD.isxidstart self: self chr: int(accept={str}) @@ -1538,8 +1538,8 @@ Return True if the character has the XID_Start property, else False. [clinic start generated code]*/ static PyObject * -unicodedata_UCD_isidstart_impl(PyObject *self, int chr) -/*[clinic end generated code: output=29fbeaf6491d9f85 input=b71b6b1b2db3c16d]*/ +unicodedata_UCD_isxidstart_impl(PyObject *self, int chr) +/*[clinic end generated code: output=944005823c72c3ef input=9353f88d709c21fb]*/ { if (UCD_Check(self)) { const change_record *old = get_old_record(self, chr); @@ -1553,7 +1553,7 @@ unicodedata_UCD_isidstart_impl(PyObject *self, int chr) } /*[clinic input] -unicodedata.UCD.isidcontinue +unicodedata.UCD.isxidcontinue self: self chr: int(accept={str}) @@ -1564,8 +1564,8 @@ Return True if the character has the XID_Continue property, else False. 
[clinic start generated code]*/ static PyObject * -unicodedata_UCD_isidcontinue_impl(PyObject *self, int chr) -/*[clinic end generated code: output=5ae694da0ee16534 input=01b4ccd399484e6b]*/ +unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr) +/*[clinic end generated code: output=9438dcbff5ca3e41 input=bbb8dd3ac0d2d709]*/ { if (UCD_Check(self)) { const change_record *old = get_old_record(self, chr); @@ -1643,8 +1643,8 @@ static PyMethodDef unicodedata_functions[] = { UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF UNICODEDATA_UCD_DECOMPOSITION_METHODDEF UNICODEDATA_UCD_NAME_METHODDEF - UNICODEDATA_UCD_ISIDSTART_METHODDEF - UNICODEDATA_UCD_ISIDCONTINUE_METHODDEF + UNICODEDATA_UCD_ISXIDSTART_METHODDEF + UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF UNICODEDATA_UCD_LOOKUP_METHODDEF UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF UNICODEDATA_UCD_NORMALIZE_METHODDEF From 5ccc2cdb07464ad966d9eb41e1218b601fb2ad16 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Wed, 29 Oct 2025 19:22:17 +0000 Subject: [PATCH 13/15] More "X" --- Doc/library/unicodedata.rst | 8 ++++---- Doc/whatsnew/3.15.rst | 2 +- .../2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst index fe57f5a71437b2..082dc0299de9c5 100644 --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -151,9 +151,9 @@ following functions: that is, it has the ``XID_Start`` property. Return ``False`` otherwise. For example:: - >>> unicodedata.isidstart('S') + >>> unicodedata.isxidstart('S') True - >>> unicodedata.isidstart('0') + >>> unicodedata.isxidstart('0') False .. versionadded:: next @@ -166,9 +166,9 @@ following functions: that is, it has the ``XID_Continue`` property. Return ``False`` otherwise. For example:: - >>> unicodedata.isidcontinue('S') + >>> unicodedata.isxidcontinue('S') True - >>> unicodedata.isidcontinue(' ') + >>> unicodedata.isxidcontinue(' ') False .. 
versionadded:: next diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 5e85bc26ce39b3..13e36a64ff0a30 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -768,7 +768,7 @@ unicodedata * The Unicode database has been updated to Unicode 17.0.0. -* Add :func:`unicodedata.isidstart` and :func:`unicodedata.isidcontinue` +* Add :func:`unicodedata.isxidstart` and :func:`unicodedata.isxidcontinue` functions. (Contributed by Stan Ulbrych in :gh:`129117`.) diff --git a/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst b/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst index da77a5e02158a9..d34538e97efaa0 100644 --- a/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst +++ b/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst @@ -1,2 +1,2 @@ -:mod:`unicodedata`: Add :func:`~unicodedata.isidstart` and -:func:`~unicodedata.isidcontinue` functions. +:mod:`unicodedata`: Add :func:`~unicodedata.isxidstart` and +:func:`~unicodedata.isxidcontinue` functions. From 4900fb76f2a28432e43bea6e8a39e83a27661e2f Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Wed, 29 Oct 2025 20:11:55 +0000 Subject: [PATCH 14/15] Expand What's New and blurb --- Doc/whatsnew/3.15.rst | 3 ++- .../Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 13e36a64ff0a30..92933105447049 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -769,7 +769,8 @@ unicodedata * The Unicode database has been updated to Unicode 17.0.0. * Add :func:`unicodedata.isxidstart` and :func:`unicodedata.isxidcontinue` - functions. + functions to check whether a character can start or continue a + `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier. (Contributed by Stan Ulbrych in :gh:`129117`.) 
diff --git a/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst b/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst index d34538e97efaa0..8767b1bb4837ad 100644 --- a/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst +++ b/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst @@ -1,2 +1,3 @@ :mod:`unicodedata`: Add :func:`~unicodedata.isxidstart` and -:func:`~unicodedata.isxidcontinue` functions. +:func:`~unicodedata.isxidcontinue` functions to check whether a character can +start or continue a `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier. From 50300f4bdef57702e30286eef8dfa24751838eb4 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 30 Oct 2025 10:51:03 +0100 Subject: [PATCH 15/15] Update Doc/library/unicodedata.rst --- Doc/library/unicodedata.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst index 082dc0299de9c5..c49bf641704616 100644 --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -161,7 +161,7 @@ following functions: .. function:: isxidcontinue(chr, /) - Return ``True`` if *chr* is a valid identifier charcter per the + Return ``True`` if *chr* is a valid identifier character per the `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_, that is, it has the ``XID_Continue`` property. Return ``False`` otherwise. For example::