diff --git a/Lib/locale.py b/Lib/locale.py
index 37cafb4a601b3c..457acd0e449532 100644
--- a/Lib/locale.py
+++ b/Lib/locale.py
@@ -97,6 +97,26 @@ def setlocale(category, value=None):
 if 'strcoll' not in globals():
     strcoll = _strcoll
 
+if sys.platform.startswith(('freebsd', 'dragonfly')):
+    # On FreeBSD and DragonFly BSD, wcsxfrm() fails with EINVAL for
+    # 'Å' (U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE) and
+    # 'Å' (U+212B ANGSTROM SIGN) on non-C locales.
+    # As a workaround, replace them with
+    # 'å' (U+00E5 LATIN SMALL LETTER A WITH RING ABOVE).
+    # To preserve the relative order of these characters according to
+    # wcscoll(), add a digit 0-2.
+    # NOTE: match the 'dragonfly' prefix, not 'dragonflybsd': sys.platform
+    # is derived from the lowercased ``uname -s`` ("DragonFly").
+    _strxfrm = strxfrm
+    def strxfrm(string, /):
+        """Transform string for locale-aware comparison (with workaround)."""
+        if (not string.isascii() and
+            _setlocale(LC_COLLATE) not in ('C', 'C.UTF-8', 'POSIX') and
+            ('\xe5' in string or '\xc5' in string or '\u212b' in string)):
+            string = string.replace('\xe5', '\xe50')
+            string = string.replace('\xc5', '\xe51')
+            string = string.replace('\u212b', '\xe52')
+        return _strxfrm(string)
 
 _localeconv = localeconv
 
diff --git a/Lib/test/test_locale.py b/Lib/test/test_locale.py
index 01b1e754d04219..d532cc92c4205d 100644
--- a/Lib/test/test_locale.py
+++ b/Lib/test/test_locale.py
@@ -332,12 +332,14 @@ def test_strcoll(self):
         self.assertLess(locale.strcoll('a', 'b'), 0)
         self.assertEqual(locale.strcoll('a', 'a'), 0)
         self.assertGreater(locale.strcoll('b', 'a'), 0)
+        self.assertLess(locale.strcoll('A', 'B'), 0)
         # embedded null character
         self.assertRaises(ValueError, locale.strcoll, 'a\0', 'a')
         self.assertRaises(ValueError, locale.strcoll, 'a', 'a\0')
 
     def test_strxfrm(self):
         self.assertLess(locale.strxfrm('a'), locale.strxfrm('b'))
+        self.assertLess(locale.strxfrm('A'), locale.strxfrm('B'))
         # embedded null character
         self.assertRaises(ValueError, locale.strxfrm, 'a\0')
 
@@ -351,8 +353,7 @@ def setUp(self):
         enc = codecs.lookup(locale.getencoding() or 'ascii').name
         if enc not in ('utf-8', 'iso8859-1', 'cp1252'):
             raise unittest.SkipTest('encoding not suitable')
-        if enc != 'iso8859-1' and (sys.platform == 'darwin' or is_android or
-                                   sys.platform.startswith('freebsd')):
+        if enc != 'iso8859-1' and (sys.platform == 'darwin' or is_android):
             raise unittest.SkipTest('wcscoll/wcsxfrm have known bugs')
         BaseLocalizedTest.setUp(self)
 
@@ -363,6 +364,10 @@ def setUp(self):
                      "gh-124108: NetBSD doesn't support UTF-8 for LC_COLLATE")
     def test_strcoll_with_diacritic(self):
         self.assertLess(locale.strcoll('à', 'b'), 0)
+        self.assertLess(locale.strcoll('À', 'B'), 0)
+        self.assertLess(locale.strcoll('å', 'b'), 0)
+        self.assertLess(locale.strcoll('\xc5', 'B'), 0)
+        self.assertLess(locale.strcoll('\u212b', 'B'), 0)
 
     @unittest.skipIf(sys.platform.startswith('aix'),
                      'bpo-29972: broken test on AIX')
@@ -371,6 +376,28 @@ def test_strcoll_with_diacritic(self):
                      "gh-124108: NetBSD doesn't support UTF-8 for LC_COLLATE")
     def test_strxfrm_with_diacritic(self):
         self.assertLess(locale.strxfrm('à'), locale.strxfrm('b'))
+        self.assertLess(locale.strxfrm('À'), locale.strxfrm('B'))
+        self.assertLess(locale.strxfrm('å'), locale.strxfrm('b'))
+        # gh-130567: Should not fail with OSError EINVAL.
+        self.assertLess(locale.strxfrm('\xc5'), locale.strxfrm('B'))
+        self.assertLess(locale.strxfrm('\u212b'), locale.strxfrm('B'))
+
+    def test_strxfrm_strcoll_consistency(self):
+        enc = codecs.lookup(locale.getencoding() or 'ascii').name
+        if enc != 'utf-8':
+            self.skipTest('strcoll() and strxfrm() can be inconsistent on non-UTF-8 locale')
+        def check(a, b):
+            r = locale.strcoll(a, b)
+            if r < 0:
+                self.assertLess(locale.strxfrm(a), locale.strxfrm(b))
+            elif r > 0:
+                self.assertGreater(locale.strxfrm(a), locale.strxfrm(b))
+            else:
+                self.assertEqual(locale.strxfrm(a), locale.strxfrm(b))
+        check('à', 'À')
+        check('å', '\xc5')  # 'Å' U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE
+        check('å', '\u212b')  # 'Å' U+212B ANGSTROM SIGN
+        check('\xc5', '\u212b')
 
 
 class NormalizeTest(unittest.TestCase):
diff --git a/Misc/NEWS.d/next/Library/2025-09-07-11-29-49.gh-issue-130567.zZRq0v.rst b/Misc/NEWS.d/next/Library/2025-09-07-11-29-49.gh-issue-130567.zZRq0v.rst
new file mode 100644
index 00000000000000..ac93272b8f791e
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-07-11-29-49.gh-issue-130567.zZRq0v.rst
@@ -0,0 +1,3 @@
+Fix :func:`locale.strxfrm` failure on FreeBSD and DragonFlyBSD for strings
+containing characters 'Å' (U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE) or
+'Å' (U+212B ANGSTROM SIGN).