Backport the latest 2.11 implementation
Gallaecio committed Feb 14, 2024
1 parent 684d4ef commit 73e7c0e
Showing 6 changed files with 88 additions and 18 deletions.
10 changes: 7 additions & 3 deletions docs/faq.rst
@@ -280,9 +280,13 @@ build the DOM of the entire feed in memory, and this can be quite slow and
consume a lot of memory.

In order to avoid parsing the entire feed at once in memory, you can use
the functions ``xmliter`` and ``csviter`` from ``scrapy.utils.iterators``
module. In fact, this is what the feed spiders (see :ref:`topics-spiders`) use
under the cover.
the :func:`~scrapy.utils.iterators.xmliter_lxml` and
:func:`~scrapy.utils.iterators.csviter` functions. In fact, this is what
:class:`~scrapy.spiders.XMLFeedSpider` uses.

.. autofunction:: scrapy.utils.iterators.xmliter_lxml

.. autofunction:: scrapy.utils.iterators.csviter

Does Scrapy manage cookies automatically?
-----------------------------------------
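
As a side note on the FAQ entry above, here is a minimal usage sketch of the two documented functions; the URL, node name and column names are made up for illustration and are not taken from the Scrapy docs:

    # Hedged sketch: the URL, node name and column names below are hypothetical.
    from scrapy.http import TextResponse, XmlResponse
    from scrapy.utils.iterators import csviter, xmliter_lxml

    xml_body = (
        b'<?xml version="1.0" encoding="UTF-8"?>'
        b"<products><product><id>1</id><name>foo</name></product>"
        b"<product><id>2</id><name>bar</name></product></products>"
    )
    xml_response = XmlResponse(url="http://www.example.com/products.xml", body=xml_body)

    # xmliter_lxml yields one Selector per <product> node, so the whole
    # document is never built in memory at once.
    for product in xmliter_lxml(xml_response, "product"):
        print(product.xpath("./id/text()").get(), product.xpath("./name/text()").get())

    csv_response = TextResponse(
        url="http://www.example.com/products.csv",
        body=b"id,name\n1,foo\n2,bar\n",
    )
    # csviter yields one dict per CSV row, keyed by the header row.
    for row in csviter(csv_response):
        print(row["id"], row["name"])
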
13 changes: 10 additions & 3 deletions docs/news.rst
@@ -13,11 +13,18 @@ Scrapy 1.8.4 (unreleased)

**Security bug fix:**

- Fixed regular expressions susceptible to a `ReDoS attack`_ affecting the
``iternodes`` node iterator of :class:`~scrapy.spiders.XMLFeedSpider`.
- Due to its `ReDoS vulnerabilities`_, ``scrapy.utils.iterators.xmliter`` is
now deprecated in favor of :func:`~scrapy.utils.iterators.xmliter_lxml`,
which :class:`~scrapy.spiders.XMLFeedSpider` now uses.

To minimize the impact of this change on existing code,
:func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
the node namespace as a prefix in the node name, and big files with highly
nested trees when using libxml2 2.7+.

Please, see the `cc65-xxvf-f7r9 security advisory`_ for more information.

.. _ReDoS attack: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
.. _ReDoS vulnerabilities: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
.. _cc65-xxvf-f7r9 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9

.. _release-1.8.3:
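
To make the compatibility note above concrete, here is a hedged migration sketch; the feed body and URL are made up, and both calls are expected to yield the same nodes:

    # Hedged sketch: the feed body and URL below are hypothetical.
    from scrapy.http import XmlResponse
    from scrapy.utils.iterators import xmliter_lxml

    body = b"""<?xml version="1.0" encoding="UTF-8"?>
    <rss xmlns:g="http://base.google.com/ns/1.0">
      <channel>
        <item><g:image_link>http://www.example.com/a.jpg</g:image_link></item>
      </channel>
    </rss>"""
    response = XmlResponse(url="http://www.example.com/feed.xml", body=body)

    # Explicit namespace URI plus prefix, as xmliter_lxml already accepted:
    for node in xmliter_lxml(
        response, "image_link", namespace="http://base.google.com/ns/1.0", prefix="g"
    ):
        print(node.xpath("text()").get())

    # Namespace given as a prefix inside the node name, the form this backport
    # adds so that calls written against xmliter keep working:
    for node in xmliter_lxml(response, "g:image_link"):
        print(node.xpath("text()").get())
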
4 changes: 2 additions & 2 deletions scrapy/spiders/feed.py
@@ -5,7 +5,7 @@
See documentation in docs/topics/spiders.rst
"""
from scrapy.spiders import Spider
from scrapy.utils.iterators import xmliter, csviter
from scrapy.utils.iterators import csviter, xmliter_lxml
from scrapy.utils.spider import iterate_spider_output
from scrapy.selector import Selector
from scrapy.exceptions import NotConfigured, NotSupported
@@ -82,7 +82,7 @@ def parse(self, response):
return self.parse_nodes(response, nodes)

def _iternodes(self, response):
for node in xmliter(response, self.itertag):
for node in xmliter_lxml(response, self.itertag):
self._register_namespaces(node)
yield node

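
Because the change above only swaps the node iterator used by the default iternodes mode, existing XMLFeedSpider subclasses need no code changes. A minimal sketch, assuming a hypothetical feed URL and element names:

    # Minimal XMLFeedSpider sketch; the URL and element names are hypothetical.
    from scrapy.spiders import XMLFeedSpider


    class ProductFeedSpider(XMLFeedSpider):
        name = "product_feed"
        start_urls = ["http://www.example.com/products.xml"]
        iterator = "iternodes"  # the default; nodes now come from xmliter_lxml
        itertag = "product"

        def parse_node(self, response, node):
            # node is a Selector for a single <product> element
            yield {
                "id": node.xpath("./id/text()").get(),
                "name": node.xpath("./name/text()").get(),
            }
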
41 changes: 38 additions & 3 deletions scrapy/utils/iterators.py
@@ -6,8 +6,12 @@
except ImportError:
from io import BytesIO
from io import StringIO
from warnings import warn

import six
from lxml import etree

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import TextResponse, Response
from scrapy.selector import Selector
from scrapy.utils.python import re_rsearch, to_unicode
@@ -24,6 +28,15 @@ def xmliter(obj, nodename):
- a unicode string
- a string encoded as utf-8
"""
warn(
(
"xmliter is deprecated and its use strongly discouraged because "
"it is vulnerable to ReDoS attacks. Use xmliter_lxml instead. See "
"https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9"
),
ScrapyDeprecationWarning,
stacklevel=2,
)
nodename_patt = re.escape(nodename)

HEADER_START_RE = re.compile(r'^(.{,1024}?)<\s*%s(?:\s|>)' % nodename_patt, re.S)
@@ -42,12 +55,34 @@


def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
from lxml import etree
reader = _StreamReader(obj)
tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
iterable = etree.iterparse(
reader,
encoding=reader.encoding,
events=("end", "start-ns"),
huge_tree=True,
)
selxpath = '//' + ('%s:%s' % (prefix, nodename) if namespace else nodename)
for _, node in iterable:
needs_namespace_resolution = not namespace and ":" in nodename
if needs_namespace_resolution:
prefix, nodename = nodename.split(":", maxsplit=1)
for event, data in iterable:
if event == "start-ns":
assert isinstance(data, tuple)
if needs_namespace_resolution:
_prefix, _namespace = data
if _prefix != prefix:
continue
namespace = _namespace
needs_namespace_resolution = False
selxpath = "//{prefix}:{nodename}".format(prefix=prefix, nodename=nodename)
tag = "{{{namespace}}}{nodename}".format(namespace=namespace, nodename=nodename)
continue
assert isinstance(data, etree._Element)
node = data
if node.tag != tag:
continue
nodetext = etree.tostring(node, encoding='unicode')
node.clear()
xs = Selector(text=nodetext, type='xml')
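
For readers unfamiliar with the start-ns handling in the new loop above, here is a standalone sketch of the same lxml technique, independent of Scrapy and using a made-up document; it is not Scrapy's own implementation:

    # Standalone sketch of start-ns namespace resolution with lxml.
    from io import BytesIO

    from lxml import etree

    body = b"""<root xmlns:g="http://base.google.com/ns/1.0">
      <g:item>first</g:item>
      <g:item>second</g:item>
    </root>"""

    prefix, localname = "g:item".split(":", 1)
    tag = None  # resolved once the matching start-ns event arrives

    # huge_tree lets libxml2 2.7+ accept unusually deep or large documents.
    iterable = etree.iterparse(
        BytesIO(body), events=("end", "start-ns"), huge_tree=True
    )
    for event, data in iterable:
        if event == "start-ns":
            ns_prefix, ns_uri = data  # start-ns yields a (prefix, uri) pair
            if ns_prefix == prefix:
                tag = "{%s}%s" % (ns_uri, localname)
            continue
        # "end" events fire for every element; keep only the resolved tag.
        if tag is not None and data.tag == tag:
            print(data.text)
            data.clear()  # free the finished element to bound memory use

Clearing each element after use is what keeps memory flat on large feeds, which is the point of the streaming approach in both xmliter_lxml and this sketch.
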
4 changes: 2 additions & 2 deletions tests/test_spider.py
@@ -122,8 +122,8 @@ def test_register_namespace(self):
body = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns:x="http://www.google.com/schemas/sitemap/0.84"
xmlns:y="http://www.example.com/schemas/extras/1.0">
<url><x:loc>http://www.example.com/Special-Offers.html</loc><y:updated>2009-08-16</updated><other value="bar" y:custom="fuu"/></url>
<url><loc>http://www.example.com/</loc><y:updated>2009-08-16</updated><other value="foo"/></url>
<url><x:loc>http://www.example.com/Special-Offers.html</x:loc><y:updated>2009-08-16</y:updated><other value="bar" y:custom="fuu"/></url>
<url><loc>http://www.example.com/</loc><y:updated>2009-08-16</y:updated><other value="foo"/></url>
</urlset>"""
response = XmlResponse(url='http://example.com/sitemap.xml', body=body)

34 changes: 29 additions & 5 deletions tests/test_utils_iterators.py
@@ -3,17 +3,18 @@
import six
from twisted.trial import unittest

import pytest

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.iterators import csviter, xmliter, _body_or_str, xmliter_lxml
from scrapy.http import XmlResponse, TextResponse, Response
from tests import get_testdata

FOOBAR_NL = u"foo\nbar"


class XmliterTestCase(unittest.TestCase):

xmliter = staticmethod(xmliter)

class XmliterBaseTestCase:
@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter(self):
body = b"""<?xml version="1.0" encoding="UTF-8"?>\
<products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">\
@@ -38,6 +39,7 @@ def test_xmliter(self):
self.assertEqual(attrs,
[('001', ['Name 1'], ['Type 1']), ('002', ['Name 2'], ['Type 2'])])

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_unusual_node(self):
body = b"""<?xml version="1.0" encoding="UTF-8"?>
<root>
@@ -50,6 +52,7 @@ def test_xmliter_unusual_node(self):
for e in self.xmliter(response, 'matchme...')]
self.assertEqual(nodenames, [['matchme...']])

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_unicode(self):
# example taken from https://github.com/scrapy/scrapy/issues/1665
body = u"""<?xml version="1.0" encoding="UTF-8"?>
@@ -105,12 +108,14 @@ def test_xmliter_unicode(self):
(u'21', [u'Ab'], [u'76']),
(u'27', [u'A'], [u'27'])])

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_text(self):
body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""

self.assertEqual([x.xpath("text()").getall() for x in self.xmliter(body, 'product')],
[[u'one'], [u'two']])

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_namespaces(self):
body = b"""\
<?xml version="1.0" encoding="UTF-8"?>
@@ -145,6 +150,7 @@ def test_xmliter_namespaces(self):
self.assertEqual(node.xpath('id/text()').getall(), [])
self.assertEqual(node.xpath('price/text()').getall(), [])

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_exception(self):
body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""

@@ -154,10 +160,12 @@

self.assertRaises(StopIteration, next, iter)

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_objtype_exception(self):
i = self.xmliter(42, 'product')
self.assertRaises(AssertionError, next, i)

@pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
def test_xmliter_encoding(self):
body = b'<?xml version="1.0" encoding="ISO-8859-9"?>\n<xml>\n <item>Some Turkish Characters \xd6\xc7\xde\xdd\xd0\xdc \xfc\xf0\xfd\xfe\xe7\xf6</item>\n</xml>\n\n'
response = XmlResponse('http://www.example.com', body=body)
@@ -166,8 +174,24 @@ def test_xmliter_encoding(self):
u'<item>Some Turkish Characters \xd6\xc7\u015e\u0130\u011e\xdc \xfc\u011f\u0131\u015f\xe7\xf6</item>'
)

class XmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
xmliter = staticmethod(xmliter)

def test_deprecation(self):
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
<products>
<product></product>
</products>
"""
with pytest.warns(
ScrapyDeprecationWarning,
match="xmliter",
):
next(self.xmliter(body, "product"))


class LxmlXmliterTestCase(XmliterTestCase):
class LxmlXmliterTestCase(XmliterBaseTestCase, unittest.TestCase):
xmliter = staticmethod(xmliter_lxml)

def test_xmliter_iterate_namespace(self):
