Merge branch '2.11-redos' into 2.11

Gallaecio committed Feb 14, 2024
2 parents 809bfac + 5e5a920 commit 479619b
Showing 9 changed files with 219 additions and 41 deletions.
10 changes: 7 additions & 3 deletions docs/faq.rst
@@ -297,9 +297,13 @@ build the DOM of the entire feed in memory, and this can be quite slow and
consume a lot of memory.

In order to avoid parsing the entire feed at once in memory, you can use
the functions ``xmliter`` and ``csviter`` from ``scrapy.utils.iterators``
module. In fact, this is what the feed spiders (see :ref:`topics-spiders`) use
under the cover.
the :func:`~scrapy.utils.iterators.xmliter_lxml` and
:func:`~scrapy.utils.iterators.csviter` functions. In fact, this is what
:class:`~scrapy.spiders.XMLFeedSpider` uses.

.. autofunction:: scrapy.utils.iterators.xmliter_lxml

.. autofunction:: scrapy.utils.iterators.csviter

Does Scrapy manage cookies automatically?
-----------------------------------------
31 changes: 30 additions & 1 deletion docs/news.rst
@@ -19,6 +19,25 @@ Highlights:
Security bug fixes
~~~~~~~~~~~~~~~~~~

- Addressed `ReDoS vulnerabilities`_:

- ``scrapy.utils.iterators.xmliter`` is now deprecated in favor of
:func:`~scrapy.utils.iterators.xmliter_lxml`, which
:class:`~scrapy.spiders.XMLFeedSpider` now uses.

To minimize the impact of this change on existing code,
:func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
the node namespace with a prefix in the node name, and big files with
highly nested trees when using libxml2 2.7+.

- Fixed regular expressions in the implementation of the
:func:`~scrapy.utils.response.open_in_browser` function.

Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information.

.. _ReDoS vulnerabilities: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
.. _cc65-xxvf-f7r9 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9

- :setting:`DOWNLOAD_MAXSIZE` and :setting:`DOWNLOAD_WARNSIZE` now also apply
to the decompressed response body. Please, see the `7j7m-v7m3-jqm7 security
advisory`_ for more information.
@@ -2951,14 +2970,24 @@ affect subclasses:

(:issue:`3884`)


.. _release-1.8.4:

Scrapy 1.8.4 (unreleased)
-------------------------

**Security bug fixes:**

- Due to its `ReDoS vulnerabilities`_, ``scrapy.utils.iterators.xmliter`` is
now deprecated in favor of :func:`~scrapy.utils.iterators.xmliter_lxml`,
which :class:`~scrapy.spiders.XMLFeedSpider` now uses.

To minimize the impact of this change on existing code,
:func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
the node namespace as a prefix in the node name, and big files with highly
nested trees when using libxml2 2.7+.

Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information.

- :setting:`DOWNLOAD_MAXSIZE` and :setting:`DOWNLOAD_WARNSIZE` now also apply
to the decompressed response body. Please, see the `7j7m-v7m3-jqm7 security
advisory`_ for more information.
18 changes: 3 additions & 15 deletions docs/topics/debug.rst
@@ -125,26 +125,16 @@ Fortunately, the :command:`shell` is your bread and butter in this case (see
See also: :ref:`topics-shell-inspect-response`.


Open in browser
===============

Sometimes you just want to see how a certain response looks in a browser, you
can use the ``open_in_browser`` function for that. Here is an example of how
you would use it:

.. code-block:: python
can use the :func:`~scrapy.utils.response.open_in_browser` function for that:

from scrapy.utils.response import open_in_browser
.. autofunction:: scrapy.utils.response.open_in_browser


def parse_details(self, response):
if "item name" not in response.body:
open_in_browser(response)
``open_in_browser`` will open a browser with the response received by Scrapy at
that point, adjusting the `base tag`_ so that images and styles are displayed
properly.

Logging
=======

@@ -163,8 +153,6 @@ available in all future runs should they be necessary again:
For more information, check the :ref:`topics-logging` section.

.. _base tag: https://www.w3schools.com/tags/tag_base.asp

.. _debug-vscode:

Visual Studio Code
4 changes: 2 additions & 2 deletions scrapy/spiders/feed.py
@@ -7,7 +7,7 @@
from scrapy.exceptions import NotConfigured, NotSupported
from scrapy.selector import Selector
from scrapy.spiders import Spider
from scrapy.utils.iterators import csviter, xmliter
from scrapy.utils.iterators import csviter, xmliter_lxml
from scrapy.utils.spider import iterate_spider_output


@@ -84,7 +84,7 @@ def _parse(self, response, **kwargs):
return self.parse_nodes(response, nodes)

def _iternodes(self, response):
for node in xmliter(response, self.itertag):
for node in xmliter_lxml(response, self.itertag):
self._register_namespaces(node)
yield node

41 changes: 37 additions & 4 deletions scrapy/utils/iterators.py
@@ -16,7 +16,11 @@
cast,
overload,
)
from warnings import warn

from lxml import etree

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Response, TextResponse
from scrapy.selector import Selector
from scrapy.utils.python import re_rsearch, to_unicode
@@ -38,6 +42,16 @@ def xmliter(
- a unicode string
- a string encoded as utf-8
"""
warn(
(
"xmliter is deprecated and its use strongly discouraged because "
"it is vulnerable to ReDoS attacks. Use xmliter_lxml instead. See "
"https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9"
),
ScrapyDeprecationWarning,
stacklevel=2,
)

nodename_patt = re.escape(nodename)

DOCUMENT_HEADER_RE = re.compile(r"<\?xml[^>]+>\s*", re.S)
@@ -81,15 +95,34 @@ def xmliter_lxml(
namespace: Optional[str] = None,
prefix: str = "x",
) -> Generator[Selector, Any, None]:
from lxml import etree

reader = _StreamReader(obj)
tag = f"{{{namespace}}}{nodename}" if namespace else nodename
iterable = etree.iterparse(
cast("SupportsReadClose[bytes]", reader), tag=tag, encoding=reader.encoding
cast("SupportsReadClose[bytes]", reader),
encoding=reader.encoding,
events=("end", "start-ns"),
huge_tree=True,
)
selxpath = "//" + (f"{prefix}:{nodename}" if namespace else nodename)
for _, node in iterable:
needs_namespace_resolution = not namespace and ":" in nodename
if needs_namespace_resolution:
prefix, nodename = nodename.split(":", maxsplit=1)
for event, data in iterable:
if event == "start-ns":
assert isinstance(data, tuple)
if needs_namespace_resolution:
_prefix, _namespace = data
if _prefix != prefix:
continue
namespace = _namespace
needs_namespace_resolution = False
selxpath = f"//{prefix}:{nodename}"
tag = f"{{{namespace}}}{nodename}"
continue
assert isinstance(data, etree._Element)
node = data
if node.tag != tag:
continue
nodetext = etree.tostring(node, encoding="unicode")
node.clear()
xs = Selector(text=nodetext, type="xml")
35 changes: 30 additions & 5 deletions scrapy/utils/response.py
@@ -74,25 +74,50 @@ def response_httprepr(response: Response) -> bytes:
return b"".join(values)


def _remove_html_comments(body):
start = body.find(b"<!--")
while start != -1:
end = body.find(b"-->", start + 1)
if end == -1:
return body[:start]
else:
body = body[:start] + body[end + 3 :]
start = body.find(b"<!--")
return body


def open_in_browser(
response: Union[
"scrapy.http.response.html.HtmlResponse",
"scrapy.http.response.text.TextResponse",
],
_openfunc: Callable[[str], Any] = webbrowser.open,
) -> Any:
"""Open the given response in a local web browser, populating the <base>
tag for external links to work
"""Open *response* in a local web browser, adjusting the `base tag`_ for
external links to work, e.g. so that images and styles are displayed.
.. _base tag: https://www.w3schools.com/tags/tag_base.asp
For example:
.. code-block:: python
from scrapy.utils.response import open_in_browser
def parse_details(self, response):
if "item name" not in response.body:
open_in_browser(response)
"""
from scrapy.http import HtmlResponse, TextResponse

# XXX: this implementation is a bit dirty and could be improved
body = response.body
if isinstance(response, HtmlResponse):
if b"<base" not in body:
repl = rf'\1<base href="{response.url}">'
body = re.sub(b"<!--.*?-->", b"", body, flags=re.DOTALL)
body = re.sub(rb"(<head(?:>|\s.*?>))", to_bytes(repl), body)
            body = _remove_html_comments(body)
            repl = rf'\g<0><base href="{response.url}">'
            body = re.sub(rb"<head(?:[^<>]*?>)", to_bytes(repl), body, count=1)
ext = ".html"
elif isinstance(response, TextResponse):
ext = ".txt"
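The ``_remove_html_comments`` helper above replaces the earlier ``<!--.*?-->`` regular expression with plain ``bytes.find`` scanning, which cannot backtrack and therefore cannot be forced into ReDoS. A standalone copy (renamed for illustration):

```python
# Standalone copy of the scanning approach above, renamed for
# illustration: str.find()-style scanning instead of a regex means no
# backtracking, hence no ReDoS.
def remove_html_comments(body: bytes) -> bytes:
    start = body.find(b"<!--")
    while start != -1:
        end = body.find(b"-->", start + 1)
        if end == -1:
            # Unterminated comment: everything from it onward is dropped.
            return body[:start]
        body = body[:start] + body[end + 3 :]
        start = body.find(b"<!--")
    return body


stripped = remove_html_comments(b"<p>a</p><!-- hidden --><p>b</p>")
truncated = remove_html_comments(b"<p>a</p><!-- never closed")
```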
4 changes: 2 additions & 2 deletions tests/test_spider.py
@@ -151,10 +151,10 @@ def test_register_namespace(self):
body = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns:x="http://www.google.com/schemas/sitemap/0.84"
xmlns:y="http://www.example.com/schemas/extras/1.0">
<url><x:loc>http://www.example.com/Special-Offers.html</loc><y:updated>2009-08-16</updated>
<url><x:loc>http://www.example.com/Special-Offers.html</x:loc><y:updated>2009-08-16</y:updated>
<other value="bar" y:custom="fuu"/>
</url>
<url><loc>http://www.example.com/</loc><y:updated>2009-08-16</updated><other value="foo"/></url>
<url><loc>http://www.example.com/</loc><y:updated>2009-08-16</y:updated><other value="foo"/></url>
</urlset>"""
response = XmlResponse(url="http://example.com/sitemap.xml", body=body)

40 changes: 31 additions & 9 deletions tests/test_utils_iterators.py
@@ -1,14 +1,14 @@
from pytest import mark
import pytest
from twisted.trial import unittest

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Response, TextResponse, XmlResponse
from scrapy.utils.iterators import _body_or_str, csviter, xmliter, xmliter_lxml
from tests import get_testdata


class XmliterTestCase(unittest.TestCase):
xmliter = staticmethod(xmliter)

class XmliterBaseTestCase:
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
@@ -40,6 +40,7 @@ def test_xmliter(self):
attrs, [("001", ["Name 1"], ["Type 1"]), ("002", ["Name 2"], ["Type 2"])]
)

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_unusual_node(self):
body = b"""<?xml version="1.0" encoding="UTF-8"?>
<root>
@@ -53,6 +54,7 @@ def test_xmliter_unusual_node(self):
]
self.assertEqual(nodenames, [["matchme..."]])

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_unicode(self):
# example taken from https://github.com/scrapy/scrapy/issues/1665
body = """<?xml version="1.0" encoding="UTF-8"?>
@@ -112,6 +114,7 @@ def test_xmliter_unicode(self):
[("26", ["-"], ["80"]), ("21", ["Ab"], ["76"]), ("27", ["A"], ["27"])],
)

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_text(self):
body = (
'<?xml version="1.0" encoding="UTF-8"?>'
@@ -123,6 +126,7 @@ def test_xmliter_text(self):
[["one"], ["two"]],
)

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_namespaces(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
@@ -162,6 +166,7 @@ def test_xmliter_namespaces(self):
self.assertEqual(node.xpath("id/text()").getall(), [])
self.assertEqual(node.xpath("price/text()").getall(), [])

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_namespaced_nodename(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
@@ -190,6 +195,7 @@ def test_xmliter_namespaced_nodename(self):
["http://www.mydummycompany.com/images/item1.jpg"],
)

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_namespaced_nodename_missing(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
@@ -214,6 +220,7 @@ def test_xmliter_namespaced_nodename_missing(self):
with self.assertRaises(StopIteration):
next(my_iter)

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_exception(self):
body = (
'<?xml version="1.0" encoding="UTF-8"?>'
@@ -226,10 +233,12 @@ def test_xmliter_exception(self):

self.assertRaises(StopIteration, next, iter)

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_objtype_exception(self):
i = self.xmliter(42, "product")
self.assertRaises(TypeError, next, i)

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_encoding(self):
body = (
b'<?xml version="1.0" encoding="ISO-8859-9"?>\n'
@@ -244,12 +253,25 @@ def test_xmliter_encoding(self):
)


class LxmlXmliterTestCase(XmliterTestCase):
xmliter = staticmethod(xmliter_lxml)
class XmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
xmliter = staticmethod(xmliter)

@mark.xfail(reason="known bug of the current implementation")
def test_xmliter_namespaced_nodename(self):
super().test_xmliter_namespaced_nodename()
def test_deprecation(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
<products>
<product></product>
</products>
"""
with pytest.warns(
ScrapyDeprecationWarning,
match="xmliter",
):
next(self.xmliter(body, "product"))


class LxmlXmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
xmliter = staticmethod(xmliter_lxml)

def test_xmliter_iterate_namespace(self):
body = b"""
