Skip to content

Commit

Permalink
Address create_root_node memory issues (#269)
Browse files Browse the repository at this point in the history
  • Loading branch information
GeorgeA92 committed Feb 24, 2023
1 parent f73b390 commit 5b28b54
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 8 deletions.
38 changes: 32 additions & 6 deletions parsel/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,20 @@ def create_root_node(
parser_cls: Type[_ParserType],
base_url: Optional[str] = None,
huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
body: bytes = b"",
encoding: str = "utf8",
) -> etree._Element:
"""Create root node for text using given parser class."""
body = text.strip().replace("\x00", "").encode("utf8") or b"<html/>"
if not text:
body = body.replace(b"\x00", b"").strip()
else:
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"

if huge_tree and LXML_SUPPORTS_HUGE_TREE:
parser = parser_cls(recover=True, encoding="utf8", huge_tree=True)
parser = parser_cls(recover=True, encoding=encoding, huge_tree=True)
root = etree.fromstring(body, parser=parser, base_url=base_url)
else:
parser = parser_cls(recover=True, encoding="utf8")
parser = parser_cls(recover=True, encoding=encoding)
root = etree.fromstring(body, parser=parser, base_url=base_url)
for error in parser.error_log:
if "use XML_PARSE_HUGE option" in error.message:
Expand Down Expand Up @@ -317,6 +323,7 @@ class Selector:
"type",
"_expr",
"root",
"body",
"__weakref__",
"_parser",
"_csstranslator",
Expand All @@ -341,6 +348,8 @@ def __init__(
self,
text: Optional[str] = None,
type: Optional[str] = None,
body: bytes = b"",
encoding: str = "utf8",
namespaces: Optional[Mapping[str, str]] = None,
root: Optional[Any] = None,
base_url: Optional[str] = None,
Expand All @@ -363,8 +372,18 @@ def __init__(
msg = f"text argument should be of type str, got {text.__class__}"
raise TypeError(msg)
root = self._get_root(text, base_url, huge_tree)
elif body:
if not isinstance(body, bytes):
msg = f"body argument should be of type bytes, got {body.__class__}"
raise TypeError(msg)
root = self._get_root(
base_url=base_url,
huge_tree=huge_tree,
body=body,
encoding=encoding,
)
elif root is None:
raise ValueError("Selector needs either text or root argument")
raise ValueError("Selector needs text, body, or root arguments")

self.namespaces = dict(self._default_namespaces)
if namespaces is not None:
Expand All @@ -377,12 +396,19 @@ def __getstate__(self) -> Any:

def _get_root(
self,
text: str,
text: str = "",
base_url: Optional[str] = None,
huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
body: bytes = b"",
encoding: str = "utf8",
) -> etree._Element:
return create_root_node(
text, self._parser, base_url=base_url, huge_tree=huge_tree
text,
self._parser,
base_url=base_url,
huge_tree=huge_tree,
body=body,
encoding=encoding,
)

def xpath(
Expand Down
59 changes: 57 additions & 2 deletions tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pickle

import typing
from typing import Any
from typing import Any, Optional, Mapping

from lxml import etree
from lxml.html import HtmlElement
Expand All @@ -15,6 +15,7 @@
from parsel.selector import (
CannotRemoveElementWithoutRoot,
CannotRemoveElementWithoutParent,
LXML_SUPPORTS_HUGE_TREE,
)


Expand Down Expand Up @@ -421,7 +422,7 @@ def test_error_for_unknown_selector_type(self) -> None:
def test_text_or_root_is_required(self) -> None:
self.assertRaisesRegex(
ValueError,
"Selector needs either text or root argument",
"Selector needs text, body, or root arguments",
self.sscls,
)

Expand Down Expand Up @@ -1336,3 +1337,57 @@ def test_drop_with_xml_type(self) -> None:
assert el.root.getparent() is not None
el.drop()
assert sel.get() == "<a><c/></a>"


class SelectorBytesInput(Selector):
    """Selector subclass that forwards a non-empty ``text`` as encoded ``body``.

    A truthy ``text`` is converted to bytes using ``encoding`` and handed to
    the parent constructor as ``body`` (with ``text`` cleared). All other
    arguments are forwarded unchanged.
    """

    def __init__(
        self,
        text: Optional[str] = None,
        type: Optional[str] = None,
        body: bytes = b"",
        encoding: str = "utf8",
        namespaces: Optional[Mapping[str, str]] = None,
        root: Optional[Any] = None,
        base_url: Optional[str] = None,
        _expr: Optional[str] = None,
        huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
    ) -> None:
        forwarded_text, forwarded_body = text, body
        if text:
            # Re-route the textual input through the bytes code path.
            forwarded_text = None
            forwarded_body = bytes(text, encoding=encoding)

        super().__init__(
            text=forwarded_text,
            type=type,
            body=forwarded_body,
            encoding=encoding,
            namespaces=namespaces,
            root=root,
            base_url=base_url,
            _expr=_expr,
            huge_tree=huge_tree,
        )


class SelectorTestCaseBytes(SelectorTestCase):
    # Inherits SelectorTestCase with the selector class swapped for the
    # bytes-input variant.
    sscls = SelectorBytesInput

    # The next three overrides turn the inherited tests of the same name into
    # no-ops for this class (the reason is not stated here).
    def test_representation_slice(self) -> None:
        return None

    def test_representation_unicode_query(self) -> None:
        return None

    def test_weakref_slots(self) -> None:
        return None

    def test_check_text_argument_type(self) -> None:
        # Passing a str as ``body`` must raise the body-specific TypeError.
        with self.assertRaisesRegex(TypeError, "body argument should be of type"):
            self.sscls(body="<html/>")


class ExsltTestCaseBytes(ExsltTestCase):
    # Same test case as ExsltTestCase, with the selector class swapped for
    # the bytes-input variant.
    sscls = SelectorBytesInput
5 changes: 5 additions & 0 deletions tests/test_selector_csstranslator.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,3 +236,8 @@ def test_nested_selector(self) -> None:
)
def test_pseudoclass_has(self) -> None:
self.assertEqual(self.x("p:has(b)::text"), ["lorem ipsum text"])


class CSSSelectorTestBytes(CSSSelectorTest):
    # Same test case as CSSSelectorTest, except the selector under test is
    # built from UTF-8 encoded bytes passed via ``body``.
    def setUp(self) -> None:
        html_bytes = bytes(HTMLBODY, encoding="utf8")
        self.sel = self.sscls(body=html_bytes)

0 comments on commit 5b28b54

Please sign in to comment.