Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions Doc/library/email.policy.rst
Original file line number Diff line number Diff line change
Expand Up @@ -403,11 +403,26 @@ added matters. To illustrate::
.. attribute:: utf8

If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in
headers by encoding them as "encoded words". If ``True``, follow
:rfc:`6532` and use ``utf-8`` encoding for headers. Messages
headers by encoding them as :rfc:`2047` "encoded words". If ``True``,
follow :rfc:`6532` and use ``utf-8`` encoding for headers. Messages
formatted in this way may be passed to SMTP servers that support
the ``SMTPUTF8`` extension (:rfc:`6531`).

When ``False``, the generator will raise
:exc:`~email.errors.HeaderWriteError` if any header includes non-ASCII
characters in a context where :rfc:`2047` does not permit encoded words.
This particularly applies to mailboxes ("addr-spec") with non-ASCII
characters, which can be created via
:class:`~email.headerregistry.Address`. To use a mailbox with a non-ASCII
domain name with ``utf8=False``, first encode the domain using the
third-party :pypi:`idna` or :pypi:`uts46` module or with
:mod:`encodings.idna`. It is not possible to use a non-ASCII username
("local-part") in a mailbox when ``utf8=False``.

.. versionchanged:: 3.15
Can trigger the raising of :exc:`~email.errors.HeaderWriteError`.
(Earlier versions incorrectly applied :rfc:`2047` in certain contexts,
most notably in addr-specs.)

.. attribute:: refold_source

Expand Down
10 changes: 10 additions & 0 deletions Doc/whatsnew/3.15.rst
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,16 @@ faulthandler
(Contributed by Eric Froemling in :gh:`149085`.)


email
-----

* Email generators now raise an error when an :class:`.EmailMessage` cannot be
accurately flattened due to a non-ASCII email address (mailbox) in an address
header. Options for supporting Email Address Internationalization (EAI) are
discussed in :attr:`.EmailPolicy.utf8`.
(Contributed by R David Murray and Mike Edmunds in :gh:`122540`.)


functools
---------

Expand Down
91 changes: 72 additions & 19 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,7 @@ def all_defects(self):
def startswith_fws(self):
return self[0].startswith_fws()

@property
def as_ew_allowed(self):
"""True if all top level tokens of this part may be RFC2047 encoded."""
return all(part.as_ew_allowed for part in self)
as_ew_allowed = True

@property
def comments(self):
Expand Down Expand Up @@ -429,6 +426,7 @@ def addr_spec(self):
class AngleAddr(TokenList):

token_type = 'angle-addr'
as_ew_allowed = False

@property
def local_part(self):
Expand Down Expand Up @@ -847,26 +845,22 @@ def params(self):

class ContentType(ParameterizedHeaderValue):
token_type = 'content-type'
as_ew_allowed = False
maintype = 'text'
subtype = 'plain'


class ContentDisposition(ParameterizedHeaderValue):
token_type = 'content-disposition'
as_ew_allowed = False
content_disposition = None


class ContentTransferEncoding(TokenList):
token_type = 'content-transfer-encoding'
as_ew_allowed = False
cte = '7bit'


class HeaderLabel(TokenList):
token_type = 'header-label'
as_ew_allowed = False


class MsgID(TokenList):
Expand Down Expand Up @@ -2838,13 +2832,68 @@ def _steal_trailing_WSP_if_exists(lines):


def _refold_parse_tree(parse_tree, *, policy):
    """Fold the text of parse_tree and return it as one string.

    The folded lines are joined with policy.linesep, and the result also
    ends with policy.linesep.  Trees whose as_ew_allowed attribute is true
    are folded by _refold_with_ew(); all others by _refold_without_ew().
    """
    # A max_line_length of 0 or None means there is no limit at all.
    maxlen = policy.max_line_length or sys.maxsize
    encoding = 'utf-8' if policy.utf8 else 'us-ascii'
    # The folder appends the output lines to this list in place.
    lines = ['']
    refold = (
        _refold_with_ew if parse_tree.as_ew_allowed else _refold_without_ew
    )
    refold(parse_tree, lines, maxlen, encoding, policy=policy)
    linesep = policy.linesep
    return linesep.join(lines) + linesep

def _refold_without_ew(parse_tree, lines, maxlen, encoding, *, policy):
    """Fold parse_tree into lines without producing any encoded words.

    lines is a list of strings that is extended in place; nothing is
    returned.  The text of each part is appended to the last line when it
    fits within maxlen.  Otherwise it is moved to a new line, split into its
    sub-parts, or, when neither is possible, emitted unfolded.

    A part whose text cannot be encoded with encoding is passed through only
    if it carries an UndecodableBytesDefect.  Failing that, the
    UnicodeEncodeError propagates when policy.utf8 is true and
    errors.HeaderWriteError is raised when it is false.
    """
    pending = list(parse_tree)
    while pending:
        part = pending.pop(0)
        text = str(part)
        try:
            text.encode(encoding)
        except UnicodeEncodeError:
            has_garbage = any(
                isinstance(defect, errors.UndecodableBytesDefect)
                for defect in part.all_defects
            )
            # Garbage data left over from parsing a message in binary mode
            # is passed through untouched.  Not good, but the best we can do.
            if not has_garbage:
                if policy.utf8:
                    # If this happens, it's a programmer error.
                    raise
                raise errors.HeaderWriteError(
                    f"Non-ASCII {part.token_type} '{part}' is invalid"
                    " under current policy setting (utf8=False)"
                )
        room = maxlen - len(lines[-1])
        if len(text) <= room:
            lines[-1] += text
            continue
        # The part does not fit on the current line.  The RFC asks for
        # breaks at "major syntactic breaks", so if this part is one, see
        # whether it fits on a line of its own.
        if part.syntactic_break and len(text) + 1 <= maxlen:
            leading_wsp = _steal_trailing_WSP_if_exists(lines)
            if leading_wsp or part.startswith_fws():
                lines.append(leading_wsp + text)
                continue
        if not hasattr(part, 'encode'):
            # Not a terminal: queue its sub-parts ahead of the remaining
            # parts and try to fold those instead.
            pending[:0] = list(part)
            continue
        # No way to wrap this part was found, so give up on folding it.
        leading_wsp = _steal_trailing_WSP_if_exists(lines)
        if leading_wsp or part.startswith_fws():
            lines.append(leading_wsp + text)
        else:
            # It cannot be folded onto the next line either.
            lines[-1] += text


def _refold_with_ew(parse_tree, lines, maxlen, encoding, *, policy):
"""Return string of contents of parse_tree folded according to RFC rules.

"""
last_word_is_ew = False
last_ew = None # if there is an encoded word in the last line of lines,
# points to the encoded word's first character
Expand All @@ -2858,6 +2907,11 @@ def _refold_parse_tree(parse_tree, *, policy):
if part is end_ew_not_allowed:
wrap_as_ew_blocked -= 1
continue
if part.token_type == 'mime-parameters':
# Mime parameter folding (using RFC2231) is extra special.
_fold_mime_parameters(part, lines, maxlen, encoding)
last_word_is_ew = False
continue
tstr = str(part)
if not want_encoding:
if part.token_type in ('ptext', 'vtext'):
Expand All @@ -2879,14 +2933,11 @@ def _refold_parse_tree(parse_tree, *, policy):
charset = 'utf-8'
want_encoding = True

if part.token_type == 'mime-parameters':
# Mime parameter folding (using RFC2231) is extra special.
_fold_mime_parameters(part, lines, maxlen, encoding)
last_word_is_ew = False
continue

if want_encoding and not wrap_as_ew_blocked:
if not part.as_ew_allowed:
if any(
not x.as_ew_allowed for x in part
if hasattr(x, 'as_ew_allowed')
):
want_encoding = False
last_ew = None
if part.syntactic_break:
Expand Down Expand Up @@ -2967,6 +3018,8 @@ def _refold_parse_tree(parse_tree, *, policy):
[ValueTerminal(make_quoted_pairs(p), 'ptext')
for p in newparts] +
[ValueTerminal('"', 'ptext')])
_refold_without_ew(newparts, lines, maxlen, encoding, policy=policy)
continue
if part.token_type == 'comment':
newparts = (
[ValueTerminal('(', 'ptext')] +
Expand Down Expand Up @@ -2994,7 +3047,7 @@ def _refold_parse_tree(parse_tree, *, policy):
lines[-1] += tstr
last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP))

return policy.linesep.join(lines) + policy.linesep
return

def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew):
"""Fold string to_encode into lines as encoded word, combining if allowed.
Expand Down
8 changes: 5 additions & 3 deletions Lib/test/test_email/test__header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3364,10 +3364,12 @@ def test_fold_unfoldable_element_stealing_whitespace(self):
self._test(token, expected, policy=policy)

def test_encoded_word_with_undecodable_bytes(self):
self._test(parser.get_address_list(
' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?='
self._test(
parser.get_address_list(
' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?='
' <xyz@abc.com>'
)[0],
' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?=\n',
' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?= <xyz@abc.com>\n',
)


Expand Down
99 changes: 97 additions & 2 deletions Lib/test/test_email/test_generator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import io
import re
import textwrap
import unittest
import random
Expand Down Expand Up @@ -295,6 +296,69 @@ def test_keep_long_encoded_newlines(self):
g.flatten(msg)
self.assertEqual(s.getvalue(), self.typ(expected))

def test_non_ascii_addr_spec_raises(self):
    # Non-ASCII characters are not permitted anywhere in an addr-spec, so
    # an address the programmer built that way is an error under
    # utf8=False.  (See also test_non_ascii_addr_spec_preserved.)
    ascii_policy = self.policy.clone(utf8=False, max_line_length=20)
    generator = self.genclass(self.ioclass(), policy=ascii_policy)
    # XXX Which token is reported isn't part of a behavioral spec and may
    # change in the future.
    cases = (
        ('wők@example.com', 'wők', 'local-part'),
        ('wok@exàmple.com', 'exàmple.com', 'domain'),
        ('wők@exàmple.com', 'wők', 'local-part'),
        (
            '"Name, for display" <wők@example.com>',
            'wők@example.com',
            'addr-spec',
        ),
        (
            'Näyttönimi <wők@example.com>',
            'wők@example.com',
            'addr-spec',
        ),
        (
            '"a lőng quoted string as the local part"@example.com',
            'a lőng quoted string as the local part',
            'local-part',
        ),
    )
    for address, badtoken, partname in cases:
        with self.subTest(address=address):
            # One lookahead per fragment: each must occur somewhere in the
            # error message, in any order, compared case-insensitively.
            fragments = (
                'non-ascii',
                re.escape(badtoken),
                partname,
                'policy.*utf8',
            )
            expected_error = '(?i)' + ''.join(
                f'(?=.*{fragment})' for fragment in fragments
            )
            message = EmailMessage()
            message['To'] = address
            with self.assertRaisesRegex(
                email.errors.HeaderWriteError, expected_error
            ):
                generator.flatten(message)

def test_local_part_quoted_string_wrapped_correctly(self):
# Flatten a message whose To header holds an angle-addr with a long
# quoted-string local-part, and check that the generated output splits
# that header inside the quoted string (after "in a").
# NOTE(review): the message is created with max_line_length=20 while the
# generator below uses max_line_length=30 -- confirm the difference is
# intentional.
msg = self.msgmaker(self.typ(textwrap.dedent("""\
To: <"a long local part in a quoted string"@example.com>
Subject: test

None
""")), policy=self.policy.clone(max_line_length=20))
# NOTE(review): the indentation inside this literal looks like it was lost
# in this view; a folded continuation line would presumably begin with
# whitespace -- verify against the real file.
expected = textwrap.dedent("""\
To: <"a long local part in a
quoted string"@example.com>
Subject: test

None
""")
s = self.ioclass()
g = self.genclass(s, policy=self.policy.clone(max_line_length=30))
g.flatten(msg)
# self.typ() converts the expected text before comparing it with the
# stream contents.
self.assertEqual(s.getvalue(), self.typ(expected))

def _test_boundary_detection(self, linesep):
# Generate a boundary token in the same way as _make_boundary
token = random.randrange(sys.maxsize)
Expand Down Expand Up @@ -515,12 +579,12 @@ def test_cte_type_7bit_transforms_8bit_cte(self):

def test_smtputf8_policy(self):
msg = EmailMessage()
msg['From'] = "Páolo <főo@bar.com>"
msg['From'] = "Páolo <főo@bàr.com>"
msg['To'] = 'Dinsdale'
msg['Subject'] = 'Nudge nudge, wink, wink \u1F609'
msg.set_content("oh là là, know what I mean, know what I mean?")
expected = textwrap.dedent("""\
From: Páolo <főo@bar.com>
From: Páolo <főo@bàr.com>
To: Dinsdale
Subject: Nudge nudge, wink, wink \u1F609
Content-Type: text/plain; charset="utf-8"
Expand Down Expand Up @@ -555,6 +619,37 @@ def test_smtp_policy(self):
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)

def test_non_ascii_addr_spec_preserved(self):
    # When a defective non-ASCII addr-spec comes from parsing an existing
    # message, flattening must leave it unchanged rather than raise.
    # (See also test_non_ascii_addr_spec_raises.)
    default_policy = policy.default
    raw_header = (
        'To: jörg@example.com, "But a long name still works with refold_source" <jörg@example.com>'
    )
    parsed = message_from_bytes(raw_header.encode(), policy=default_policy)
    output = io.BytesIO()
    BytesGenerator(output, policy=default_policy).flatten(parsed)
    self.assertEqual(
        output.getvalue(),
        b'To: j\xc3\xb6rg@example.com,\n'
        b' "But a long name still works with refold_source" <j\xc3\xb6rg@example.com>\n'
        b'\n',
    )

def test_idna_encoding_preserved(self):
    # A domain that is already IDNA-encoded must pass through as-is:
    # nothing tries to decode it.
    ascii_domain = '☕.example'.encode('idna').decode()  # IDNA 2003
    message = EmailMessage()
    message["To"] = Address(username='jörg', domain=ascii_domain)
    output = io.BytesIO()
    utf8_policy = policy.default.clone(utf8=True)
    BytesGenerator(output, policy=utf8_policy).flatten(message)
    self.assertEqual(
        output.getvalue(),
        'To: jörg@xn--53h.example\n\n'.encode(),
    )


if __name__ == '__main__':
unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for
a mailbox with non-ASCII characters in its domain. Under a policy with
:attr:`~email.policy.EmailPolicy.utf8` set to ``False``, attempting to serialize
such a message will now raise an :exc:`~email.errors.HeaderWriteError`.
Either apply an appropriate IDNA encoding to convert the domain to ASCII before
serialization, or use :data:`email.policy.SMTPUTF8` (or another policy with
``utf8=True``) to correctly pass through the internationalized domain name
as Unicode characters.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for
a mailbox with non-ASCII characters in its local-part. Under a policy with
:attr:`~email.policy.EmailPolicy.utf8` set to ``False``, attempting to serialize
such a message will now raise an :exc:`~email.errors.HeaderWriteError`.
There is no valid 7-bit encoding for an internationalized local-part. Use
:data:`email.policy.SMTPUTF8` (or another policy with ``utf8=True``) to
correctly pass through the local-part as Unicode characters.
Loading