From 3cc6f0615e1944ff4af52236aa9df10709768f54 Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Sat, 28 Jan 2023 20:31:46 +0900 Subject: [PATCH 1/5] gh-101372: Fix unicodedata.is_normalized to properly handle the UCD 3.2.0 --- Lib/test/test_unicodedata.py | 6 ++++++ .../2023-01-28-20-31-42.gh-issue-101372.8BcpCC.rst | 2 ++ Modules/unicodedata.c | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-01-28-20-31-42.gh-issue-101372.8BcpCC.rst diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 74503c89e559a0..1fe6d9d7bf4f86 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -216,6 +216,12 @@ def test_issue29456(self): self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) + def test_unicode_3_2_0(self): + for x in range(0x110000): + for form in ('NFC', 'NFD', 'NFKC', 'NFKD'): + normalized = self.db.ucd_3_2_0.normalize(form, chr(x)) + self.assertTrue(self.db.ucd_3_2_0.is_normalized(form, normalized)) + def test_east_asian_width(self): eaw = self.db.east_asian_width self.assertRaises(TypeError, eaw, b'a') diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-01-28-20-31-42.gh-issue-101372.8BcpCC.rst b/Misc/NEWS.d/next/Core and Builtins/2023-01-28-20-31-42.gh-issue-101372.8BcpCC.rst new file mode 100644 index 00000000000000..65a207e3f7e436 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-01-28-20-31-42.gh-issue-101372.8BcpCC.rst @@ -0,0 +1,2 @@ +Fix :func:`~unicodedata.is_normalized` to properly handle the UCD 3.2.0 +cases. Patch by Dong-hee Na. diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 59fccd4b834dd3..c108f14871f946 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -800,7 +800,7 @@ is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k, { /* UCD 3.2.0 is requested, quickchecks must be disabled. 
*/ if (UCD_Check(self)) { - return NO; + return MAYBE; } if (PyUnicode_IS_ASCII(input)) { From 9e663af2557c00bbd7dbbd1cb681a9ef825634e9 Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Sat, 28 Jan 2023 20:34:45 +0900 Subject: [PATCH 2/5] Rename the test --- Lib/test/test_unicodedata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 1fe6d9d7bf4f86..7a2e1151685234 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -216,7 +216,7 @@ def test_issue29456(self): self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) - def test_unicode_3_2_0(self): + def test_is_normalized_unicode_3_2_0(self): for x in range(0x110000): for form in ('NFC', 'NFD', 'NFKC', 'NFKD'): normalized = self.db.ucd_3_2_0.normalize(form, chr(x)) From d8edb5461b555d4ee2150155e67b191d1cb4c3a4 Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Sun, 29 Jan 2023 00:33:58 +0900 Subject: [PATCH 3/5] Sampling test sets --- Lib/test/test_unicodedata.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 7a2e1151685234..781ae2f5fc154a 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -8,6 +8,7 @@ import hashlib from http.client import HTTPException +import random import sys import unicodedata import unittest @@ -217,10 +218,12 @@ def test_issue29456(self): self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) def test_is_normalized_unicode_3_2_0(self): - for x in range(0x110000): - for form in ('NFC', 'NFD', 'NFKC', 'NFKD'): - normalized = self.db.ucd_3_2_0.normalize(form, chr(x)) - self.assertTrue(self.db.ucd_3_2_0.is_normalized(form, normalized)) + sample_chrs = random.sample(range(0x110000), 100) + for form in ('NFC', 'NFD', 'NFKC', 'NFKD'): + with self.subTest(form=form): + for x in 
sample_chrs: + norm = self.db.ucd_3_2_0.normalize(form, chr(x)) + self.assertTrue(self.db.ucd_3_2_0.is_normalized(form, norm)) def test_east_asian_width(self): eaw = self.db.east_asian_width From 080b6ccc0623b0a54b6840934ba7b917b66884ac Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Sun, 29 Jan 2023 00:37:51 +0900 Subject: [PATCH 4/5] Add multicharacter case --- Lib/test/test_unicodedata.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 781ae2f5fc154a..d3bee6d336077e 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -225,6 +225,13 @@ def test_is_normalized_unicode_3_2_0(self): norm = self.db.ucd_3_2_0.normalize(form, chr(x)) self.assertTrue(self.db.ucd_3_2_0.is_normalized(form, norm)) + def test_is_normalized_unicode_3_2_0_multicharacter(self): + sample_chrs = random.sample(range(0x110000), 100) + for form in ('NFC', 'NFD', 'NFKC', 'NFKD'): + s = ''.join(map(chr, sample_chrs)) + norm = self.db.ucd_3_2_0.normalize(form, s) + self.assertTrue(self.db.ucd_3_2_0.is_normalized(form, norm)) + def test_east_asian_width(self): eaw = self.db.east_asian_width self.assertRaises(TypeError, eaw, b'a') From c1976653b1fa15a08f4ff0008d001e1bbec0b602 Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Mon, 6 Feb 2023 11:36:01 +0900 Subject: [PATCH 5/5] Address code review --- Lib/test/test_unicodedata.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index d3bee6d336077e..74503c89e559a0 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -8,7 +8,6 @@ import hashlib from http.client import HTTPException -import random import sys import unicodedata import unittest @@ -217,21 +216,6 @@ def test_issue29456(self): self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) - def 
test_is_normalized_unicode_3_2_0(self): - sample_chrs = random.sample(range(0x110000), 100) - for form in ('NFC', 'NFD', 'NFKC', 'NFKD'): - with self.subTest(form=form): - for x in sample_chrs: - norm = self.db.ucd_3_2_0.normalize(form, chr(x)) - self.assertTrue(self.db.ucd_3_2_0.is_normalized(form, norm)) - - def test_is_normalized_unicode_3_2_0_multicharacter(self): - sample_chrs = random.sample(range(0x110000), 100) - for form in ('NFC', 'NFD', 'NFKC', 'NFKD'): - s = ''.join(map(chr, sample_chrs)) - norm = self.db.ucd_3_2_0.normalize(form, s) - self.assertTrue(self.db.ucd_3_2_0.is_normalized(form, norm)) - def test_east_asian_width(self): eaw = self.db.east_asian_width self.assertRaises(TypeError, eaw, b'a')