Skip to content

Commit

Permalink
[3.9] gh-91810: ElementTree: Use text file's encoding by default in XML declaration (GH-91903) (GH-92665)
Browse files Browse the repository at this point in the history

The ElementTree method write() and the function tostring() now use the text
file's encoding ("UTF-8" if not available) instead of the locale encoding in
the XML declaration when encoding="unicode" is specified.
(cherry picked from commit 707839b)


Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>

Automerge-Triggered-By: GH:serhiy-storchaka
  • Loading branch information
miss-islington committed May 11, 2022
1 parent 3f2113d commit bfc88d3
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 30 deletions.
31 changes: 15 additions & 16 deletions Lib/test/test_xml_etree.py
Expand Up @@ -10,7 +10,6 @@
import html
import io
import itertools
import locale
import operator
import os
import pickle
Expand Down Expand Up @@ -960,15 +959,13 @@ def test_tostring_xml_declaration(self):

def test_tostring_xml_declaration_unicode_encoding(self):
elem = ET.XML('<body><tag/></body>')
preferredencoding = locale.getpreferredencoding()
self.assertEqual(
f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>",
ET.tostring(elem, encoding='unicode', xml_declaration=True)
ET.tostring(elem, encoding='unicode', xml_declaration=True),
"<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>"
)

def test_tostring_xml_declaration_cases(self):
elem = ET.XML('<body><tag>ø</tag></body>')
preferredencoding = locale.getpreferredencoding()
TESTCASES = [
# (expected_retval, encoding, xml_declaration)
# ... xml_declaration = None
Expand All @@ -995,7 +992,7 @@ def test_tostring_xml_declaration_cases(self):
b"<body><tag>&#248;</tag></body>", 'US-ASCII', True),
(b"<?xml version='1.0' encoding='ISO-8859-1'?>\n"
b"<body><tag>\xf8</tag></body>", 'ISO-8859-1', True),
(f"<?xml version='1.0' encoding='{preferredencoding}'?>\n"
("<?xml version='1.0' encoding='utf-8'?>\n"
"<body><tag>ø</tag></body>", 'unicode', True),

]
Expand Down Expand Up @@ -1033,11 +1030,10 @@ def test_tostringlist_xml_declaration(self):
b"<?xml version='1.0' encoding='us-ascii'?>\n<body><tag /></body>"
)

preferredencoding = locale.getpreferredencoding()
stringlist = ET.tostringlist(elem, encoding='unicode', xml_declaration=True)
self.assertEqual(
''.join(stringlist),
f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>"
"<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>"
)
self.assertRegex(stringlist[0], r"^<\?xml version='1.0' encoding='.+'?>")
self.assertEqual(['<body', '>', '<tag', ' />', '</body>'], stringlist[1:])
Expand Down Expand Up @@ -3681,17 +3677,16 @@ def test_write_to_filename_as_unicode(self):
encoding = f.encoding
support.unlink(TESTFN)

try:
'\xf8'.encode(encoding)
except UnicodeEncodeError:
self.skipTest(f'default file encoding {encoding} not supported')

tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
tree.write(TESTFN, encoding='unicode')
with open(TESTFN, 'rb') as f:
data = f.read()
expected = "<site>\xf8</site>".encode(encoding, 'xmlcharrefreplace')
self.assertEqual(data, expected)
if encoding.lower() in ('utf-8', 'ascii'):
self.assertEqual(data, expected)
else:
self.assertIn(b"<?xml version='1.0' encoding=", data)
self.assertIn(expected, data)

def test_write_to_text_file(self):
self.addCleanup(support.unlink, TESTFN)
Expand All @@ -3706,13 +3701,17 @@ def test_write_to_text_file(self):
tree.write(f, encoding='unicode')
self.assertFalse(f.closed)
with open(TESTFN, 'rb') as f:
self.assertEqual(f.read(), b'''<site>&#248;</site>''')
self.assertEqual(f.read(), convlinesep(
b'''<?xml version='1.0' encoding='ascii'?>\n'''
b'''<site>&#248;</site>'''))

with open(TESTFN, 'w', encoding='ISO-8859-1') as f:
tree.write(f, encoding='unicode')
self.assertFalse(f.closed)
with open(TESTFN, 'rb') as f:
self.assertEqual(f.read(), b'''<site>\xf8</site>''')
self.assertEqual(f.read(), convlinesep(
b'''<?xml version='1.0' encoding='ISO-8859-1'?>\n'''
b'''<site>\xf8</site>'''))

def test_write_to_binary_file(self):
self.addCleanup(support.unlink, TESTFN)
Expand Down
23 changes: 9 additions & 14 deletions Lib/xml/etree/ElementTree.py
Expand Up @@ -728,16 +728,10 @@ def write(self, file_or_filename,
encoding = "utf-8"
else:
encoding = "us-ascii"
enc_lower = encoding.lower()
with _get_writer(file_or_filename, enc_lower) as write:
with _get_writer(file_or_filename, encoding) as (write, declared_encoding):
if method == "xml" and (xml_declaration or
(xml_declaration is None and
enc_lower not in ("utf-8", "us-ascii", "unicode"))):
declared_encoding = encoding
if enc_lower == "unicode":
# Retrieve the default encoding for the xml declaration
import locale
declared_encoding = locale.getpreferredencoding()
declared_encoding.lower() not in ("utf-8", "us-ascii"))):
write("<?xml version='1.0' encoding='%s'?>\n" % (
declared_encoding,))
if method == "text":
Expand All @@ -762,19 +756,20 @@ def _get_writer(file_or_filename, encoding):
write = file_or_filename.write
except AttributeError:
# file_or_filename is a file name
if encoding == "unicode":
file = open(file_or_filename, "w")
if encoding.lower() == "unicode":
file = open(file_or_filename, "w",
errors="xmlcharrefreplace")
else:
file = open(file_or_filename, "w", encoding=encoding,
errors="xmlcharrefreplace")
with file:
yield file.write
yield file.write, file.encoding
else:
# file_or_filename is a file-like object
# encoding determines if it is a text or binary writer
if encoding == "unicode":
if encoding.lower() == "unicode":
# use a text writer as is
yield write
yield write, getattr(file_or_filename, "encoding", None) or "utf-8"
else:
# wrap a binary writer with TextIOWrapper
with contextlib.ExitStack() as stack:
Expand Down Expand Up @@ -805,7 +800,7 @@ def _get_writer(file_or_filename, encoding):
# Keep the original file open when the TextIOWrapper is
# destroyed
stack.callback(file.detach)
yield file.write
yield file.write, encoding

def _namespaces(elem, default_namespace=None):
# identify namespaces used in this tree
Expand Down
@@ -0,0 +1,5 @@
The :class:`~xml.etree.ElementTree.ElementTree` method
:meth:`~xml.etree.ElementTree.ElementTree.write` and the function
:func:`~xml.etree.ElementTree.tostring` now use the text file's encoding
("UTF-8" if not available) instead of the locale encoding in the XML
declaration when ``encoding="unicode"`` is specified.

0 comments on commit bfc88d3

Please sign in to comment.