Address create_root_node memory issues (#269)

scrapy · Feb 24, 2023 · 5b28b54
1 parent f73b390
commit 5b28b54
Show file tree

Hide file tree

Showing 3 changed files with 94 additions and 8 deletions.
diff --git a/parsel/selector.py b/parsel/selector.py
@@ -90,14 +90,20 @@ def create_root_node(
     parser_cls: Type[_ParserType],
     base_url: Optional[str] = None,
     huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
+    body: bytes = b"",
+    encoding: str = "utf8",
 ) -> etree._Element:
     """Create root node for text using given parser class."""
-    body = text.strip().replace("\x00", "").encode("utf8") or b"<html/>"
+    if not text:
+        body = body.replace(b"\x00", b"").strip()
+    else:
+        body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
+
     if huge_tree and LXML_SUPPORTS_HUGE_TREE:
-        parser = parser_cls(recover=True, encoding="utf8", huge_tree=True)
+        parser = parser_cls(recover=True, encoding=encoding, huge_tree=True)
         root = etree.fromstring(body, parser=parser, base_url=base_url)
     else:
-        parser = parser_cls(recover=True, encoding="utf8")
+        parser = parser_cls(recover=True, encoding=encoding)
         root = etree.fromstring(body, parser=parser, base_url=base_url)
         for error in parser.error_log:
             if "use XML_PARSE_HUGE option" in error.message:
@@ -317,6 +323,7 @@ class Selector:
         "type",
         "_expr",
         "root",
+        "body",
         "__weakref__",
         "_parser",
         "_csstranslator",
@@ -341,6 +348,8 @@ def __init__(
         self,
         text: Optional[str] = None,
         type: Optional[str] = None,
+        body: bytes = b"",
+        encoding: str = "utf8",
         namespaces: Optional[Mapping[str, str]] = None,
         root: Optional[Any] = None,
         base_url: Optional[str] = None,
@@ -363,8 +372,18 @@ def __init__(
                 msg = f"text argument should be of type str, got {text.__class__}"
                 raise TypeError(msg)
             root = self._get_root(text, base_url, huge_tree)
+        elif body:
+            if not isinstance(body, bytes):
+                msg = f"body argument should be of type bytes, got {body.__class__}"
+                raise TypeError(msg)
+            root = self._get_root(
+                base_url=base_url,
+                huge_tree=huge_tree,
+                body=body,
+                encoding=encoding,
+            )
         elif root is None:
-            raise ValueError("Selector needs either text or root argument")
+            raise ValueError("Selector needs text, body, or root arguments")
 
         self.namespaces = dict(self._default_namespaces)
         if namespaces is not None:
@@ -377,12 +396,19 @@ def __getstate__(self) -> Any:
 
     def _get_root(
         self,
-        text: str,
+        text: str = "",
         base_url: Optional[str] = None,
         huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
+        body: bytes = b"",
+        encoding: str = "utf8",
     ) -> etree._Element:
+        # Build the root element by delegating to create_root_node, passing
+        # self._parser as the parser class. Every argument is forwarded
+        # unchanged; create_root_node only falls back to `body` when `text`
+        # is empty, and uses `encoding` both to encode `text` and to
+        # configure the parser.
         return create_root_node(
-            text, self._parser, base_url=base_url, huge_tree=huge_tree
+            text,
+            self._parser,
+            base_url=base_url,
+            huge_tree=huge_tree,
+            body=body,
+            encoding=encoding,
         )
 
     def xpath(

diff --git a/tests/test_selector.py b/tests/test_selector.py
@@ -5,7 +5,7 @@
 import pickle
 
 import typing
-from typing import Any
+from typing import Any, Optional, Mapping
 
 from lxml import etree
 from lxml.html import HtmlElement
@@ -15,6 +15,7 @@
 from parsel.selector import (
     CannotRemoveElementWithoutRoot,
     CannotRemoveElementWithoutParent,
+    LXML_SUPPORTS_HUGE_TREE,
 )
 
 
@@ -421,7 +422,7 @@ def test_error_for_unknown_selector_type(self) -> None:
     def test_text_or_root_is_required(self) -> None:
+        # Calling the selector class with no arguments at all must raise
+        # ValueError carrying the message matched below.
         self.assertRaisesRegex(
             ValueError,
-            "Selector needs either text or root argument",
+            "Selector needs text, body, or root arguments",
             self.sscls,
         )
 
@@ -1336,3 +1337,57 @@ def test_drop_with_xml_type(self) -> None:
         assert el.root.getparent() is not None
         el.drop()
         assert sel.get() == "<a><c/></a>"
+
+
+# Selector subclass used by the test suite: any non-empty `text` passed to
+# the constructor is encoded to bytes and handed to Selector as `body`
+# instead, so the selector is built from bytes input rather than from str.
+class SelectorBytesInput(Selector):
+    def __init__(
+        self,
+        text: Optional[str] = None,
+        type: Optional[str] = None,
+        body: bytes = b"",
+        encoding: str = "utf8",
+        namespaces: Optional[Mapping[str, str]] = None,
+        root: Optional[Any] = None,
+        base_url: Optional[str] = None,
+        _expr: Optional[str] = None,
+        huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
+    ) -> None:
+        # A truthy `text` replaces whatever `body` was given and is then
+        # cleared. An empty string or None is left as-is, in which case the
+        # caller-supplied `body` is forwarded untouched.
+        if text:
+            body = bytes(text, encoding=encoding)
+            text = None
+        # All remaining arguments are passed through to Selector unchanged.
+        super().__init__(
+            text=text,
+            type=type,
+            body=body,
+            encoding=encoding,
+            namespaces=namespaces,
+            root=root,
+            base_url=base_url,
+            _expr=_expr,
+            huge_tree=huge_tree,
+        )
+
+
+# Re-runs the SelectorTestCase suite with SelectorBytesInput as the selector
+# class under test (`sscls`).
+class SelectorTestCaseBytes(SelectorTestCase):
+    sscls = SelectorBytesInput
+
+    # NOTE(review): the three methods below have empty bodies. Presumably
+    # they override same-named tests inherited from SelectorTestCase to
+    # disable them for bytes input; the reason is not visible here - confirm.
+    def test_representation_slice(self) -> None:
+        pass
+
+    def test_representation_unicode_query(self) -> None:
+        pass
+
+    def test_weakref_slots(self) -> None:
+        pass
+
+    def test_check_text_argument_type(self) -> None:
+        # Passing a str (not bytes) as `body` must raise TypeError with a
+        # message matching the pattern below.
+        self.assertRaisesRegex(
+            TypeError,
+            "body argument should be of type",
+            self.sscls,
+            body="<html/>",
+        )
+
+
+# Re-runs the ExsltTestCase suite with SelectorBytesInput as the selector
+# class under test (`sscls`).
+class ExsltTestCaseBytes(ExsltTestCase):
+    sscls = SelectorBytesInput
diff --git a/tests/test_selector_csstranslator.py b/tests/test_selector_csstranslator.py
@@ -236,3 +236,8 @@ def test_nested_selector(self) -> None:
     )
     def test_pseudoclass_has(self) -> None:
         self.assertEqual(self.x("p:has(b)::text"), ["lorem ipsum text"])
+
+
+# Variant of CSSSelectorTest whose fixture selector is built from bytes.
+class CSSSelectorTestBytes(CSSSelectorTest):
+    def setUp(self) -> None:
+        # Build self.sel from HTMLBODY encoded as UTF-8 bytes, passed via the
+        # `body` argument. `self.sscls` is not defined in this class;
+        # presumably it is inherited from CSSSelectorTest - confirm.
+        self.sel = self.sscls(body=bytes(HTMLBODY, encoding="utf8"))