Skip to content

Commit

Permalink
gh-85287: Change codecs to raise precise UnicodeEncodeError and Unico…
Browse files Browse the repository at this point in the history
…deDecodeError (#113674)

Co-authored-by: Inada Naoki <songofacandy@gmail.com>
  • Loading branch information
jjsloboda and methane committed Mar 17, 2024
1 parent c514a97 commit 649857a
Show file tree
Hide file tree
Showing 9 changed files with 306 additions and 81 deletions.
164 changes: 117 additions & 47 deletions Lib/encodings/idna.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
sace_prefix = "xn--"

# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
def nameprep(label): # type: (str) -> str
# Map
newlabel = []
for c in label:
Expand All @@ -25,7 +25,7 @@ def nameprep(label):
label = unicodedata.normalize("NFKC", label)

# Prohibit
for c in label:
for i, c in enumerate(label):
if stringprep.in_table_c12(c) or \
stringprep.in_table_c22(c) or \
stringprep.in_table_c3(c) or \
Expand All @@ -35,7 +35,7 @@ def nameprep(label):
stringprep.in_table_c7(c) or \
stringprep.in_table_c8(c) or \
stringprep.in_table_c9(c):
raise UnicodeError("Invalid character %r" % c)
raise UnicodeEncodeError("idna", label, i, i+1, f"Invalid character {c!r}")

# Check bidi
RandAL = [stringprep.in_table_d1(x) for x in label]
Expand All @@ -46,59 +46,73 @@ def nameprep(label):
# This is table C.8, which was already checked
# 2) If a string contains any RandALCat character, the string
# MUST NOT contain any LCat character.
if any(stringprep.in_table_d2(x) for x in label):
raise UnicodeError("Violation of BIDI requirement 2")
for i, x in enumerate(label):
if stringprep.in_table_d2(x):
raise UnicodeEncodeError("idna", label, i, i+1,
"Violation of BIDI requirement 2")
# 3) If a string contains any RandALCat character, a
# RandALCat character MUST be the first character of the
# string, and a RandALCat character MUST be the last
# character of the string.
if not RandAL[0] or not RandAL[-1]:
raise UnicodeError("Violation of BIDI requirement 3")
if not RandAL[0]:
raise UnicodeEncodeError("idna", label, 0, 1,
"Violation of BIDI requirement 3")
if not RandAL[-1]:
raise UnicodeEncodeError("idna", label, len(label)-1, len(label),
"Violation of BIDI requirement 3")

return label

def ToASCII(label):
def ToASCII(label): # type: (str) -> bytes
try:
# Step 1: try ASCII
label = label.encode("ascii")
except UnicodeError:
label_ascii = label.encode("ascii")
except UnicodeEncodeError:
pass
else:
# Skip to step 3: UseSTD3ASCIIRules is false, so
# Skip to step 8.
if 0 < len(label) < 64:
return label
raise UnicodeError("label empty or too long")
if 0 < len(label_ascii) < 64:
return label_ascii
if len(label) == 0:
raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
else:
raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")

# Step 2: nameprep
label = nameprep(label)

# Step 3: UseSTD3ASCIIRules is false
# Step 4: try ASCII
try:
label = label.encode("ascii")
except UnicodeError:
label_ascii = label.encode("ascii")
except UnicodeEncodeError:
pass
else:
# Skip to step 8.
if 0 < len(label) < 64:
return label
raise UnicodeError("label empty or too long")
return label_ascii
if len(label) == 0:
raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
else:
raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")

# Step 5: Check ACE prefix
if label[:4].lower() == sace_prefix:
raise UnicodeError("Label starts with ACE prefix")
if label.lower().startswith(sace_prefix):
raise UnicodeEncodeError(
"idna", label, 0, len(sace_prefix), "Label starts with ACE prefix")

# Step 6: Encode with PUNYCODE
label = label.encode("punycode")
label_ascii = label.encode("punycode")

# Step 7: Prepend ACE prefix
label = ace_prefix + label
label_ascii = ace_prefix + label_ascii

# Step 8: Check size
if 0 < len(label) < 64:
return label
raise UnicodeError("label empty or too long")
# do not check for empty as we prepend ace_prefix.
if len(label_ascii) < 64:
return label_ascii
raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")

def ToUnicode(label):
if len(label) > 1024:
Expand All @@ -110,41 +124,51 @@ def ToUnicode(label):
# per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
# preventing us from wasting time decoding a big thing that'll just
# hit the actual <= 63 length limit in Step 6.
raise UnicodeError("label way too long")
if isinstance(label, str):
label = label.encode("utf-8", errors="backslashreplace")
raise UnicodeDecodeError("idna", label, 0, len(label), "label way too long")
# Step 1: Check for ASCII
if isinstance(label, bytes):
pure_ascii = True
else:
try:
label = label.encode("ascii")
pure_ascii = True
except UnicodeError:
except UnicodeEncodeError:
pure_ascii = False
if not pure_ascii:
assert isinstance(label, str)
# Step 2: Perform nameprep
label = nameprep(label)
# It doesn't say this, but apparently, it should be ASCII now
try:
label = label.encode("ascii")
except UnicodeError:
raise UnicodeError("Invalid character in IDN label")
except UnicodeEncodeError as exc:
raise UnicodeEncodeError("idna", label, exc.start, exc.end,
"Invalid character in IDN label")
# Step 3: Check for ACE prefix
if not label[:4].lower() == ace_prefix:
assert isinstance(label, bytes)
if not label.lower().startswith(ace_prefix):
return str(label, "ascii")

# Step 4: Remove ACE prefix
label1 = label[len(ace_prefix):]

# Step 5: Decode using PUNYCODE
result = label1.decode("punycode")
try:
result = label1.decode("punycode")
except UnicodeDecodeError as exc:
offset = len(ace_prefix)
raise UnicodeDecodeError("idna", label, offset+exc.start, offset+exc.end, exc.reason)

# Step 6: Apply ToASCII
label2 = ToASCII(result)

# Step 7: Compare the result of step 6 with the one of step 3
# label2 will already be in lower case.
if str(label, "ascii").lower() != str(label2, "ascii"):
raise UnicodeError("IDNA does not round-trip", label, label2)
raise UnicodeDecodeError("idna", label, 0, len(label),
f"IDNA does not round-trip, '{label!r}' != '{label2!r}'")

# Step 8: return the result of step 5
return result
Expand All @@ -156,7 +180,7 @@ def encode(self, input, errors='strict'):

if errors != 'strict':
# IDNA is quite clear that implementations must be strict
raise UnicodeError("unsupported error handling "+errors)
raise UnicodeError(f"Unsupported error handling: {errors}")

if not input:
return b'', 0
Expand All @@ -168,11 +192,16 @@ def encode(self, input, errors='strict'):
else:
# ASCII name: fast path
labels = result.split(b'.')
for label in labels[:-1]:
if not (0 < len(label) < 64):
raise UnicodeError("label empty or too long")
if len(labels[-1]) >= 64:
raise UnicodeError("label too long")
for i, label in enumerate(labels[:-1]):
if len(label) == 0:
offset = sum(len(l) for l in labels[:i]) + i
raise UnicodeEncodeError("idna", input, offset, offset+1,
"label empty")
for i, label in enumerate(labels):
if len(label) >= 64:
offset = sum(len(l) for l in labels[:i]) + i
raise UnicodeEncodeError("idna", input, offset, offset+len(label),
"label too long")
return result, len(input)

result = bytearray()
Expand All @@ -182,17 +211,27 @@ def encode(self, input, errors='strict'):
del labels[-1]
else:
trailing_dot = b''
for label in labels:
for i, label in enumerate(labels):
if result:
# Join with U+002E
result.extend(b'.')
result.extend(ToASCII(label))
try:
result.extend(ToASCII(label))
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
offset = sum(len(l) for l in labels[:i]) + i
raise UnicodeEncodeError(
"idna",
input,
offset + exc.start,
offset + exc.end,
exc.reason,
)
return bytes(result+trailing_dot), len(input)

def decode(self, input, errors='strict'):

if errors != 'strict':
raise UnicodeError("Unsupported error handling "+errors)
raise UnicodeError(f"Unsupported error handling: {errors}")

if not input:
return "", 0
Expand All @@ -218,16 +257,23 @@ def decode(self, input, errors='strict'):
trailing_dot = ''

result = []
for label in labels:
result.append(ToUnicode(label))
for i, label in enumerate(labels):
try:
u_label = ToUnicode(label)
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
offset = sum(len(x) for x in labels[:i]) + len(labels[:i])
raise UnicodeDecodeError(
"idna", input, offset+exc.start, offset+exc.end, exc.reason)
else:
result.append(u_label)

return ".".join(result)+trailing_dot, len(input)

class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
def _buffer_encode(self, input, errors, final):
if errors != 'strict':
# IDNA is quite clear that implementations must be strict
raise UnicodeError("unsupported error handling "+errors)
raise UnicodeError(f"Unsupported error handling: {errors}")

if not input:
return (b'', 0)
Expand All @@ -251,7 +297,16 @@ def _buffer_encode(self, input, errors, final):
# Join with U+002E
result.extend(b'.')
size += 1
result.extend(ToASCII(label))
try:
result.extend(ToASCII(label))
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
raise UnicodeEncodeError(
"idna",
input,
size + exc.start,
size + exc.end,
exc.reason,
)
size += len(label)

result += trailing_dot
Expand All @@ -261,7 +316,7 @@ def _buffer_encode(self, input, errors, final):
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def _buffer_decode(self, input, errors, final):
if errors != 'strict':
raise UnicodeError("Unsupported error handling "+errors)
raise UnicodeError("Unsupported error handling: {errors}")

if not input:
return ("", 0)
Expand All @@ -271,7 +326,11 @@ def _buffer_decode(self, input, errors, final):
labels = dots.split(input)
else:
# Must be ASCII string
input = str(input, "ascii")
try:
input = str(input, "ascii")
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
raise UnicodeDecodeError("idna", input,
exc.start, exc.end, exc.reason)
labels = input.split(".")

trailing_dot = ''
Expand All @@ -288,7 +347,18 @@ def _buffer_decode(self, input, errors, final):
result = []
size = 0
for label in labels:
result.append(ToUnicode(label))
try:
u_label = ToUnicode(label)
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
raise UnicodeDecodeError(
"idna",
input.encode("ascii", errors="backslashreplace"),
size + exc.start,
size + exc.end,
exc.reason,
)
else:
result.append(u_label)
if size:
size += 1
size += len(label)
Expand Down

0 comments on commit 649857a

Please sign in to comment.