gh-85287: Change codecs to raise precise UnicodeEncodeError and UnicodeDecodeError (#113674)

Co-authored-by: Inada Naoki <songofacandy@gmail.com>
python · Mar 17, 2024 · 649857a · 649857a
1 parent c514a97
commit 649857a
Show file tree

Hide file tree

Showing 9 changed files with 306 additions and 81 deletions.
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
@@ -11,7 +11,7 @@
 sace_prefix = "xn--"
 
 # This assumes query strings, so AllowUnassigned is true
-def nameprep(label):
+def nameprep(label):  # type: (str) -> str
     # Map
     newlabel = []
     for c in label:
@@ -25,7 +25,7 @@ def nameprep(label):
     label = unicodedata.normalize("NFKC", label)
 
     # Prohibit
-    for c in label:
+    for i, c in enumerate(label):
         if stringprep.in_table_c12(c) or \
            stringprep.in_table_c22(c) or \
            stringprep.in_table_c3(c) or \
@@ -35,7 +35,7 @@ def nameprep(label):
            stringprep.in_table_c7(c) or \
            stringprep.in_table_c8(c) or \
            stringprep.in_table_c9(c):
-            raise UnicodeError("Invalid character %r" % c)
+            raise UnicodeEncodeError("idna", label, i, i+1, f"Invalid character {c!r}")
 
     # Check bidi
     RandAL = [stringprep.in_table_d1(x) for x in label]
@@ -46,59 +46,73 @@ def nameprep(label):
         # This is table C.8, which was already checked
         # 2) If a string contains any RandALCat character, the string
         # MUST NOT contain any LCat character.
-        if any(stringprep.in_table_d2(x) for x in label):
-            raise UnicodeError("Violation of BIDI requirement 2")
+        for i, x in enumerate(label):
+            if stringprep.in_table_d2(x):
+                raise UnicodeEncodeError("idna", label, i, i+1,
+                                         "Violation of BIDI requirement 2")
         # 3) If a string contains any RandALCat character, a
         # RandALCat character MUST be the first character of the
         # string, and a RandALCat character MUST be the last
         # character of the string.
-        if not RandAL[0] or not RandAL[-1]:
-            raise UnicodeError("Violation of BIDI requirement 3")
+        if not RandAL[0]:
+            raise UnicodeEncodeError("idna", label, 0, 1,
+                                     "Violation of BIDI requirement 3")
+        if not RandAL[-1]:
+            raise UnicodeEncodeError("idna", label, len(label)-1, len(label),
+                                     "Violation of BIDI requirement 3")
 
     return label
 
-def ToASCII(label):
+def ToASCII(label):  # type: (str) -> bytes
     try:
         # Step 1: try ASCII
-        label = label.encode("ascii")
-    except UnicodeError:
+        label_ascii = label.encode("ascii")
+    except UnicodeEncodeError:
         pass
     else:
         # Skip to step 3: UseSTD3ASCIIRules is false, so
         # Skip to step 8.
-        if 0 < len(label) < 64:
-            return label
-        raise UnicodeError("label empty or too long")
+        if 0 < len(label_ascii) < 64:
+            return label_ascii
+        if len(label) == 0:
+            raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
+        else:
+            raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
 
     # Step 2: nameprep
     label = nameprep(label)
 
     # Step 3: UseSTD3ASCIIRules is false
     # Step 4: try ASCII
     try:
-        label = label.encode("ascii")
-    except UnicodeError:
+        label_ascii = label.encode("ascii")
+    except UnicodeEncodeError:
         pass
     else:
         # Skip to step 8.
         if 0 < len(label) < 64:
-            return label
-        raise UnicodeError("label empty or too long")
+            return label_ascii
+        if len(label) == 0:
+            raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
+        else:
+            raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
 
     # Step 5: Check ACE prefix
-    if label[:4].lower() == sace_prefix:
-        raise UnicodeError("Label starts with ACE prefix")
+    if label.lower().startswith(sace_prefix):
+        raise UnicodeEncodeError(
+            "idna", label, 0, len(sace_prefix), "Label starts with ACE prefix")
 
     # Step 6: Encode with PUNYCODE
-    label = label.encode("punycode")
+    label_ascii = label.encode("punycode")
 
     # Step 7: Prepend ACE prefix
-    label = ace_prefix + label
+    label_ascii = ace_prefix + label_ascii
 
     # Step 8: Check size
-    if 0 < len(label) < 64:
-        return label
-    raise UnicodeError("label empty or too long")
+    # do not check for empty as we prepend ace_prefix.
+    if len(label_ascii) < 64:
+        return label_ascii
+    raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
 
 def ToUnicode(label):
     if len(label) > 1024:
@@ -110,41 +124,51 @@ def ToUnicode(label):
         # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
         # preventing us from wasting time decoding a big thing that'll just
         # hit the actual <= 63 length limit in Step 6.
-        raise UnicodeError("label way too long")
+        if isinstance(label, str):
+            label = label.encode("utf-8", errors="backslashreplace")
+        raise UnicodeDecodeError("idna", label, 0, len(label), "label way too long")
     # Step 1: Check for ASCII
     if isinstance(label, bytes):
         pure_ascii = True
     else:
         try:
             label = label.encode("ascii")
             pure_ascii = True
-        except UnicodeError:
+        except UnicodeEncodeError:
             pure_ascii = False
     if not pure_ascii:
+        assert isinstance(label, str)
         # Step 2: Perform nameprep
         label = nameprep(label)
         # It doesn't say this, but apparently, it should be ASCII now
         try:
             label = label.encode("ascii")
-        except UnicodeError:
-            raise UnicodeError("Invalid character in IDN label")
+        except UnicodeEncodeError as exc:
+            raise UnicodeEncodeError("idna", label, exc.start, exc.end,
+                                     "Invalid character in IDN label")
     # Step 3: Check for ACE prefix
-    if not label[:4].lower() == ace_prefix:
+    assert isinstance(label, bytes)
+    if not label.lower().startswith(ace_prefix):
         return str(label, "ascii")
 
     # Step 4: Remove ACE prefix
     label1 = label[len(ace_prefix):]
 
     # Step 5: Decode using PUNYCODE
-    result = label1.decode("punycode")
+    try:
+        result = label1.decode("punycode")
+    except UnicodeDecodeError as exc:
+        offset = len(ace_prefix)
+        raise UnicodeDecodeError("idna", label, offset+exc.start, offset+exc.end, exc.reason)
 
     # Step 6: Apply ToASCII
     label2 = ToASCII(result)
 
     # Step 7: Compare the result of step 6 with the one of step 3
     # label2 will already be in lower case.
     if str(label, "ascii").lower() != str(label2, "ascii"):
-        raise UnicodeError("IDNA does not round-trip", label, label2)
+        raise UnicodeDecodeError("idna", label, 0, len(label),
+                                 f"IDNA does not round-trip, '{label!r}' != '{label2!r}'")
 
     # Step 8: return the result of step 5
     return result
@@ -156,7 +180,7 @@ def encode(self, input, errors='strict'):
 
         if errors != 'strict':
             # IDNA is quite clear that implementations must be strict
-            raise UnicodeError("unsupported error handling "+errors)
+            raise UnicodeError(f"Unsupported error handling: {errors}")
 
         if not input:
             return b'', 0
@@ -168,11 +192,16 @@ def encode(self, input, errors='strict'):
         else:
             # ASCII name: fast path
             labels = result.split(b'.')
-            for label in labels[:-1]:
-                if not (0 < len(label) < 64):
-                    raise UnicodeError("label empty or too long")
-            if len(labels[-1]) >= 64:
-                raise UnicodeError("label too long")
+            for i, label in enumerate(labels[:-1]):
+                if len(label) == 0:
+                    offset = sum(len(l) for l in labels[:i]) + i
+                    raise UnicodeEncodeError("idna", input, offset, offset+1,
+                                             "label empty")
+            for i, label in enumerate(labels):
+                if len(label) >= 64:
+                    offset = sum(len(l) for l in labels[:i]) + i
+                    raise UnicodeEncodeError("idna", input, offset, offset+len(label),
+                                             "label too long")
             return result, len(input)
 
         result = bytearray()
@@ -182,17 +211,27 @@ def encode(self, input, errors='strict'):
             del labels[-1]
         else:
             trailing_dot = b''
-        for label in labels:
+        for i, label in enumerate(labels):
             if result:
                 # Join with U+002E
                 result.extend(b'.')
-            result.extend(ToASCII(label))
+            try:
+                result.extend(ToASCII(label))
+            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
+                offset = sum(len(l) for l in labels[:i]) + i
+                raise UnicodeEncodeError(
+                    "idna",
+                    input,
+                    offset + exc.start,
+                    offset + exc.end,
+                    exc.reason,
+                )
         return bytes(result+trailing_dot), len(input)
 
     def decode(self, input, errors='strict'):
 
         if errors != 'strict':
-            raise UnicodeError("Unsupported error handling "+errors)
+            raise UnicodeError(f"Unsupported error handling: {errors}")
 
         if not input:
             return "", 0
@@ -218,16 +257,23 @@ def decode(self, input, errors='strict'):
             trailing_dot = ''
 
         result = []
-        for label in labels:
-            result.append(ToUnicode(label))
+        for i, label in enumerate(labels):
+            try:
+                u_label = ToUnicode(label)
+            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
+                offset = sum(len(x) for x in labels[:i]) + len(labels[:i])
+                raise UnicodeDecodeError(
+                    "idna", input, offset+exc.start, offset+exc.end, exc.reason)
+            else:
+                result.append(u_label)
 
         return ".".join(result)+trailing_dot, len(input)
 
 class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
     def _buffer_encode(self, input, errors, final):
         if errors != 'strict':
             # IDNA is quite clear that implementations must be strict
-            raise UnicodeError("unsupported error handling "+errors)
+            raise UnicodeError(f"Unsupported error handling: {errors}")
 
         if not input:
             return (b'', 0)
@@ -251,7 +297,16 @@ def _buffer_encode(self, input, errors, final):
                 # Join with U+002E
                 result.extend(b'.')
                 size += 1
-            result.extend(ToASCII(label))
+            try:
+                result.extend(ToASCII(label))
+            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
+                raise UnicodeEncodeError(
+                    "idna",
+                    input,
+                    size + exc.start,
+                    size + exc.end,
+                    exc.reason,
+                )
             size += len(label)
 
         result += trailing_dot
@@ -261,7 +316,7 @@ def _buffer_encode(self, input, errors, final):
 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
     def _buffer_decode(self, input, errors, final):
         if errors != 'strict':
-            raise UnicodeError("Unsupported error handling "+errors)
+            raise UnicodeError(f"Unsupported error handling: {errors}")
 
         if not input:
             return ("", 0)
@@ -271,7 +326,11 @@ def _buffer_decode(self, input, errors, final):
             labels = dots.split(input)
         else:
             # Must be ASCII string
-            input = str(input, "ascii")
+            try:
+                input = str(input, "ascii")
+            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
+                raise UnicodeDecodeError("idna", input,
+                                         exc.start, exc.end, exc.reason)
             labels = input.split(".")
 
         trailing_dot = ''
@@ -288,7 +347,18 @@ def _buffer_decode(self, input, errors, final):
         result = []
         size = 0
         for label in labels:
-            result.append(ToUnicode(label))
+            try:
+                u_label = ToUnicode(label)
+            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
+                raise UnicodeDecodeError(
+                    "idna",
+                    input.encode("ascii", errors="backslashreplace"),
+                    size + exc.start,
+                    size + exc.end,
+                    exc.reason,
+                )
+            else:
+                result.append(u_label)
             if size:
                 size += 1
             size += len(label)