From f6f868c39309b2d3ebebef24b8091c09d046e417 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sat, 17 Feb 2018 19:21:28 +0100 Subject: [PATCH 1/4] bpo-20928: bring elementtree's XInclude support en-par with the implementation in lxml by adding support for recursive includes and a base-URL. --- Lib/test/test_xml_etree.py | 32 ++++++++++++++++++++++++++++++++ Lib/xml/etree/ElementInclude.py | 32 +++++++++++++++++++++++++------- 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index f2e3f8d8b22bc9..b2ef25e5da599e 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1281,6 +1281,31 @@ def test_unknown_event(self): """ +XINCLUDE["Recursive1.xml"] = """\ + + +

The following is the source code of Recursive2.xml:

+ +
+""" + +XINCLUDE["Recursive2.xml"] = """\ + + +

The following is the source code of Recursive3.xml:

+ +
+""" + +XINCLUDE["Recursive3.xml"] = """\ + + +

The following is the source code of Recursive1.xml:

+ +
+""" + + class XIncludeTest(unittest.TestCase): def xinclude_loader(self, href, parse="xml", encoding=None): @@ -1414,6 +1439,13 @@ def test_xinclude_failures(self): "xi:fallback tag must be child of xi:include " "('{http://www.w3.org/2001/XInclude}fallback')") + document = self.xinclude_loader("Recursive1.xml") + with self.assertRaises(SyntaxError) as cm: + ElementInclude.include(document, self.xinclude_loader) + self.assertEqual(str(cm.exception), + "recursive include of Recursive2.xml") + + # -------------------------------------------------------------------- # reported bugs diff --git a/Lib/xml/etree/ElementInclude.py b/Lib/xml/etree/ElementInclude.py index 963470e3b15c71..94d53dcfabb07e 100644 --- a/Lib/xml/etree/ElementInclude.py +++ b/Lib/xml/etree/ElementInclude.py @@ -50,6 +50,7 @@ import copy from . import ElementTree +from urllib.parse import urljoin XINCLUDE = "{http://www.w3.org/2001/XInclude}" @@ -92,13 +93,22 @@ def default_loader(href, parse, encoding=None): # @param loader Optional resource loader. If omitted, it defaults # to {@link default_loader}. If given, it should be a callable # that implements the same interface as default_loader. +# @param base_url The base URL of the original file, to resolve +# relative include file references. # @throws FatalIncludeError If the function fails to include a given # resource, or if the tree contains malformed XInclude elements. -# @throws OSError If the function fails to load a given resource. +# @throws IOError If the function fails to load a given resource. +# @returns the node or its replacement if it was an XInclude node -def include(elem, loader=None): +def include(elem, loader=None, base_url=None): + if hasattr(elem, 'getroot'): + elem = elem.getroot() if loader is None: loader = default_loader + _include(elem, loader, base_url, set()) + + +def _include(elem, loader, base_url, _parent_hrefs): # look for xinclude elements i = 0 while i < len(elem): @@ -106,14 +116,20 @@ def include(elem, loader=None): if e.tag == XINCLUDE_INCLUDE: # process xinclude directive href = e.get("href") + if base_url: + href = urljoin(base_url, href) parse = e.get("parse", "xml") if parse == "xml": + if href in _parent_hrefs: + raise FatalIncludeError("recursive include of %s" % href) + _parent_hrefs.add(href) node = loader(href, parse) if node is None: raise FatalIncludeError( "cannot load %r as %r" % (href, parse) ) - node = copy.copy(node) + node = copy.copy(node) # FIXME: this makes little sense with recursive includes + _include(node, loader, href, _parent_hrefs) if e.tail: node.tail = (node.tail or "") + e.tail elem[i] = node @@ -123,11 +139,13 @@ def include(elem, loader=None): raise FatalIncludeError( "cannot load %r as %r" % (href, parse) ) + if e.tail: + text += e.tail if i: node = elem[i-1] - node.tail = (node.tail or "") + text + (e.tail or "") + node.tail = (node.tail or "") + text else: - elem.text = (elem.text or "") + text + (e.tail or "") + elem.text = (elem.text or "") + text del elem[i] continue else: @@ -139,5 +157,5 @@ def include(elem, loader=None): "xi:fallback tag must be child of xi:include (%r)" % e.tag ) else: - include(e, loader) - i = i + 1 + _include(e, loader, base_url, _parent_hrefs) + i += 1 From 93d384499de01fd5e392f318dd13346d576e7e07 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 23 Feb 2018 18:02:19 +0100 Subject: [PATCH 2/4] bpo-20928: Support xincluding the same file multiple times, just not recursively. --- Lib/test/test_xml_etree.py | 18 ++++++++++++++++++ Lib/xml/etree/ElementInclude.py | 1 + 2 files changed, 19 insertions(+) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index b2ef25e5da599e..a4178619847094 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1261,6 +1261,17 @@ def test_unknown_event(self): """.format(html.escape(SIMPLE_XMLFILE, True)) +XINCLUDE["include_c1_repeated.xml"] = """\ + + +

The following is the source code of Recursive1.xml:

+ + + + +
+""" + # # badly formatted xi:include tags @@ -1407,6 +1418,13 @@ def test_xinclude(self): ' \n' '') # C5 + def test_xinclude_repeated(self): + from xml.etree import ElementInclude + + document = self.xinclude_loader("include_c1_repeated.xml") + ElementInclude.include(document, self.xinclude_loader) + self.assertEqual(1+4*2, len(document.findall(".//p"))) + def test_xinclude_failures(self): from xml.etree import ElementInclude diff --git a/Lib/xml/etree/ElementInclude.py b/Lib/xml/etree/ElementInclude.py index 94d53dcfabb07e..3375caed2c574b 100644 --- a/Lib/xml/etree/ElementInclude.py +++ b/Lib/xml/etree/ElementInclude.py @@ -130,6 +130,7 @@ def _include(elem, loader, base_url, _parent_hrefs): ) node = copy.copy(node) # FIXME: this makes little sense with recursive includes _include(node, loader, href, _parent_hrefs) + _parent_hrefs.remove(href) if e.tail: node.tail = (node.tail or "") + e.tail elem[i] = node From 641bd818e8c2ba257fbfe1ee6f23ebb6c2836a14 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 23 Feb 2018 18:36:18 +0100 Subject: [PATCH 3/4] bpo-20928: Add 'max_depth' parameter to xinclude that limits the maximum recursion depth to 6 by default. --- Lib/test/test_xml_etree.py | 34 ++++++++++++++++++++++++++++++++- Lib/xml/etree/ElementInclude.py | 33 +++++++++++++++++++++++++++----- 2 files changed, 61 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index a4178619847094..ae349e6a87830b 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1457,12 +1457,44 @@ def test_xinclude_failures(self): "xi:fallback tag must be child of xi:include " "('{http://www.w3.org/2001/XInclude}fallback')") + # Test infinitely recursive includes. document = self.xinclude_loader("Recursive1.xml") - with self.assertRaises(SyntaxError) as cm: + with self.assertRaises(ElementInclude.FatalIncludeError) as cm: ElementInclude.include(document, self.xinclude_loader) self.assertEqual(str(cm.exception), "recursive include of Recursive2.xml") + # Test 'max_depth' limitation. + document = self.xinclude_loader("Recursive1.xml") + with self.assertRaises(ElementInclude.FatalIncludeError) as cm: + ElementInclude.include(document, self.xinclude_loader, max_depth=None) + self.assertEqual(str(cm.exception), + "recursive include of Recursive2.xml") + + document = self.xinclude_loader("Recursive1.xml") + with self.assertRaises(ElementInclude.LimitedRecursiveIncludeError) as cm: + ElementInclude.include(document, self.xinclude_loader, max_depth=0) + self.assertEqual(str(cm.exception), + "maximum xinclude depth reached when including file Recursive2.xml") + + document = self.xinclude_loader("Recursive1.xml") + with self.assertRaises(ElementInclude.LimitedRecursiveIncludeError) as cm: + ElementInclude.include(document, self.xinclude_loader, max_depth=1) + self.assertEqual(str(cm.exception), + "maximum xinclude depth reached when including file Recursive3.xml") + + document = self.xinclude_loader("Recursive1.xml") + with self.assertRaises(ElementInclude.LimitedRecursiveIncludeError) as cm: + ElementInclude.include(document, self.xinclude_loader, max_depth=2) + self.assertEqual(str(cm.exception), + "maximum xinclude depth reached when including file Recursive1.xml") + + document = self.xinclude_loader("Recursive1.xml") + with self.assertRaises(ElementInclude.FatalIncludeError) as cm: + ElementInclude.include(document, self.xinclude_loader, max_depth=3) + self.assertEqual(str(cm.exception), + "recursive include of Recursive2.xml") + # -------------------------------------------------------------------- # reported bugs diff --git a/Lib/xml/etree/ElementInclude.py b/Lib/xml/etree/ElementInclude.py index 3375caed2c574b..5303062716c47a 100644 --- a/Lib/xml/etree/ElementInclude.py +++ b/Lib/xml/etree/ElementInclude.py @@ -57,12 +57,21 @@ XINCLUDE_INCLUDE = XINCLUDE + "include" XINCLUDE_FALLBACK = XINCLUDE + "fallback" +# For security reasons, the inclusion depth is limited to this read-only value by default. +DEFAULT_MAX_INCLUSION_DEPTH = 6 + + ## # Fatal include error. class FatalIncludeError(SyntaxError): pass + +class LimitedRecursiveIncludeError(FatalIncludeError): + pass + + ## # Default loader. This loader reads an included resource from disk. # @@ -95,20 +104,31 @@ def default_loader(href, parse, encoding=None): # that implements the same interface as default_loader. # @param base_url The base URL of the original file, to resolve # relative include file references. +# @param max_depth The maximum number of recursive inclusions. +# Limited to reduce the risk of malicious content explosion. +# Pass a negative value to disable the limitation. +# @throws LimitedRecursiveIncludeError If the {@link max_depth} was exceeded. # @throws FatalIncludeError If the function fails to include a given # resource, or if the tree contains malformed XInclude elements. # @throws IOError If the function fails to load a given resource. # @returns the node or its replacement if it was an XInclude node -def include(elem, loader=None, base_url=None): +def include(elem, loader=None, base_url=None, + max_depth=DEFAULT_MAX_INCLUSION_DEPTH): + if max_depth is None: + max_depth = -1 + elif max_depth < 0: + raise ValueError("expected non-negative depth or None for 'max_depth', got %r" % max_depth) + if hasattr(elem, 'getroot'): elem = elem.getroot() if loader is None: loader = default_loader - _include(elem, loader, base_url, set()) + + _include(elem, loader, base_url, max_depth, set()) -def _include(elem, loader, base_url, _parent_hrefs): +def _include(elem, loader, base_url, max_depth, _parent_hrefs): # look for xinclude elements i = 0 while i < len(elem): @@ -122,6 +142,9 @@ def _include(elem, loader, base_url, _parent_hrefs): if parse == "xml": if href in _parent_hrefs: raise FatalIncludeError("recursive include of %s" % href) + if max_depth == 0: + raise LimitedRecursiveIncludeError( + "maximum xinclude depth reached when including file %s" % href) _parent_hrefs.add(href) node = loader(href, parse) if node is None: @@ -129,7 +152,7 @@ def _include(elem, loader, base_url, _parent_hrefs): "cannot load %r as %r" % (href, parse) ) node = copy.copy(node) # FIXME: this makes little sense with recursive includes - _include(node, loader, href, _parent_hrefs) + _include(node, loader, href, max_depth - 1, _parent_hrefs) _parent_hrefs.remove(href) if e.tail: node.tail = (node.tail or "") + e.tail @@ -158,5 +181,5 @@ def _include(elem, loader, base_url, _parent_hrefs): "xi:fallback tag must be child of xi:include (%r)" % e.tag ) else: - _include(e, loader, base_url, _parent_hrefs) + _include(e, loader, base_url, max_depth, _parent_hrefs) i += 1 From 7f0348391b7670cd983cc1b505104da70b80be82 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 30 Mar 2018 18:24:50 +0200 Subject: [PATCH 4/4] Add news entry for updated ElementInclude support --- .../NEWS.d/next/Library/2018-03-30-16-18-12.bpo-20928.ieXu6I.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2018-03-30-16-18-12.bpo-20928.ieXu6I.rst diff --git a/Misc/NEWS.d/next/Library/2018-03-30-16-18-12.bpo-20928.ieXu6I.rst b/Misc/NEWS.d/next/Library/2018-03-30-16-18-12.bpo-20928.ieXu6I.rst new file mode 100644 index 00000000000000..2585400907799d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-03-30-16-18-12.bpo-20928.ieXu6I.rst @@ -0,0 +1 @@ +ElementTree supports recursive XInclude processing. Patch by Stefan Behnel.