Skip to content

Commit

Permalink
Full typing (#265)
Browse files Browse the repository at this point in the history
  • Loading branch information
wRAR committed Feb 15, 2023
1 parent 55248fa commit f73b390
Show file tree
Hide file tree
Showing 12 changed files with 168 additions and 79 deletions.
1 change: 1 addition & 0 deletions .bandit.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
skips:
- B101
- B320
- B410
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[flake8]
ignore = E203
ignore = E203,W503
per-file-ignores =
docs/conftest.py:E501
parsel/csstranslator.py:E501
Expand Down
84 changes: 66 additions & 18 deletions parsel/csstranslator.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,38 @@
from functools import lru_cache
from typing import TYPE_CHECKING, Any, Optional

from cssselect import GenericTranslator as OriginalGenericTranslator
from cssselect import HTMLTranslator as OriginalHTMLTranslator
from cssselect.xpath import XPathExpr as OriginalXPathExpr
from cssselect.xpath import ExpressionError
from cssselect.parser import FunctionalPseudoElement
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement


if TYPE_CHECKING:
# typing.Self requires Python 3.11
from typing_extensions import Self


class XPathExpr(OriginalXPathExpr):

textnode = False
attribute = None
textnode: bool = False
attribute: Optional[str] = None

@classmethod
def from_xpath(cls, xpath, textnode=False, attribute=None):
def from_xpath(
    cls,
    xpath: OriginalXPathExpr,
    textnode: bool = False,
    attribute: Optional[str] = None,
) -> "Self":
    """Build an instance of ``cls`` from an existing XPath expression.

    The ``path``, ``element`` and ``condition`` of ``xpath`` are copied
    into the new object, and the ``textnode`` / ``attribute`` markers are
    set from the given arguments.
    """
    expr = cls(
        path=xpath.path,
        element=xpath.element,
        condition=xpath.condition,
    )
    expr.textnode, expr.attribute = textnode, attribute
    return expr

def __str__(self):
def __str__(self) -> str:
path = super().__str__()
if self.textnode:
if path == "*":
Expand All @@ -38,46 +49,77 @@ def __str__(self):

return path

def join(self, combiner, other, *args, **kwargs):
def join(
    self: "Self",
    combiner: str,
    other: OriginalXPathExpr,
    *args: Any,
    **kwargs: Any,
) -> "Self":
    """Join ``other`` onto this expression and adopt its pseudo-element state.

    The actual joining is delegated to the parent class; afterwards the
    ``textnode`` and ``attribute`` markers of ``other`` are copied onto
    ``self``.

    :raises ValueError: if ``other`` is not an instance of this module's
        ``XPathExpr`` (or a subclass of it).
    :return: ``self``
    """
    if not isinstance(other, XPathExpr):
        raise ValueError(
            f"Expressions of type {__name__}.XPathExpr can only join expressions"
            f" of the same type (or its descendants), got {type(other)}"
        )
    super().join(combiner, other, *args, **kwargs)
    # The markers are taken from ``other``, overwriting whatever ``self`` had.
    self.textnode = other.textnode
    self.attribute = other.attribute
    return self


if TYPE_CHECKING:
    # requires Python 3.8
    from typing import Protocol

    class TranslatorProtocol(Protocol):
        """Structural type for the translator methods used by the mixin.

        Matched by e.g. cssselect.GenericTranslator and
        cssselect.HTMLTranslator.
        """

        def xpath_element(self, selector: Element) -> OriginalXPathExpr:
            ...

        def css_to_xpath(self, css: str, prefix: str = ...) -> str:
            ...


class TranslatorMixin:
"""This mixin adds support to CSS pseudo elements via dynamic dispatch.
Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
"""

def xpath_element(self, selector):
xpath = super().xpath_element(selector)
def xpath_element(self: "TranslatorProtocol", selector: Element) -> XPathExpr:
    """Translate ``selector`` via the parent class and wrap the result.

    The expression returned by the parent ``xpath_element`` is converted
    with ``XPathExpr.from_xpath``.
    """
    # https://github.com/python/mypy/issues/12344
    parent_expr = super().xpath_element(selector)  # type: ignore[safe-super]
    return XPathExpr.from_xpath(parent_expr)

def xpath_pseudo_element(self, xpath, pseudo_element):
def xpath_pseudo_element(
self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
) -> OriginalXPathExpr:
"""
Dispatch method that transforms XPath to support pseudo-element
"""
if isinstance(pseudo_element, FunctionalPseudoElement):
method = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
method = getattr(self, method, None)
method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
method = getattr(self, method_name, None)
if not method:
raise ExpressionError(
f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
)
xpath = method(xpath, pseudo_element)
else:
method = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
method = getattr(self, method, None)
method_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
method = getattr(self, method_name, None)
if not method:
raise ExpressionError(
f"The pseudo-element ::{pseudo_element} is unknown"
)
xpath = method(xpath)
return xpath

def xpath_attr_functional_pseudo_element(self, xpath, function):
def xpath_attr_functional_pseudo_element(
self, xpath: OriginalXPathExpr, function: FunctionalPseudoElement
) -> XPathExpr:
"""Support selecting attribute values using ::attr() pseudo-element"""
if function.argument_types() not in (["STRING"], ["IDENT"]):
raise ExpressionError(
Expand All @@ -87,26 +129,32 @@ def xpath_attr_functional_pseudo_element(self, xpath, function):
xpath, attribute=function.arguments[0].value
)

def xpath_text_simple_pseudo_element(self, xpath):
def xpath_text_simple_pseudo_element(self, xpath: OriginalXPathExpr) -> XPathExpr:
    """Handle the ``::text`` pseudo-element.

    Returns a copy of ``xpath`` (via ``XPathExpr.from_xpath``) with its
    ``textnode`` marker set to ``True``.
    """
    text_expr = XPathExpr.from_xpath(xpath, textnode=True)
    return text_expr


class GenericTranslator(TranslatorMixin, OriginalGenericTranslator):
@lru_cache(maxsize=256)
def css_to_xpath(self, css, prefix="descendant-or-self::"):
def css_to_xpath(
self, css: str, prefix: str = "descendant-or-self::"
) -> str:
return super().css_to_xpath(css, prefix)


class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
@lru_cache(maxsize=256)
def css_to_xpath(self, css, prefix="descendant-or-self::"):
def css_to_xpath(
self, css: str, prefix: str = "descendant-or-self::"
) -> str:
return super().css_to_xpath(css, prefix)


_translator = HTMLTranslator()


def css2xpath(query):
def css2xpath(query: str) -> str:
    """Translate the CSS ``query`` to XPath using the module-level translator."""
    xpath_query = _translator.css_to_xpath(query)
    return xpath_query
26 changes: 12 additions & 14 deletions parsel/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ class CannotDropElementWithoutParent(CannotRemoveElementWithoutParent):
pass


class SafeXMLParser(etree.XMLParser):
def __init__(self, *args, **kwargs) -> None:
class SafeXMLParser(etree.XMLParser): # type: ignore[type-arg]
def __init__(self, *args: Any, **kwargs: Any) -> None:
kwargs.setdefault("resolve_entities", False)
super().__init__(*args, **kwargs)

Expand Down Expand Up @@ -95,11 +95,10 @@ def create_root_node(
body = text.strip().replace("\x00", "").encode("utf8") or b"<html/>"
if huge_tree and LXML_SUPPORTS_HUGE_TREE:
parser = parser_cls(recover=True, encoding="utf8", huge_tree=True)
# the stub wrongly thinks base_url can't be None
root = etree.fromstring(body, parser=parser, base_url=base_url) # type: ignore[arg-type]
root = etree.fromstring(body, parser=parser, base_url=base_url)
else:
parser = parser_cls(recover=True, encoding="utf8")
root = etree.fromstring(body, parser=parser, base_url=base_url) # type: ignore[arg-type]
root = etree.fromstring(body, parser=parser, base_url=base_url)
for error in parser.error_log:
if "use XML_PARSE_HUGE option" in error.message:
warnings.warn(
Expand Down Expand Up @@ -143,7 +142,7 @@ def xpath(
self,
xpath: str,
namespaces: Optional[Mapping[str, str]] = None,
**kwargs,
**kwargs: Any,
) -> "SelectorList[_SelectorType]":
"""
Call the ``.xpath()`` method for each element in this list and return
Expand Down Expand Up @@ -230,7 +229,7 @@ def re_first(
for el in iflatten(
x.re(regex, replace_entities=replace_entities) for x in self
):
return el
return typing.cast(str, el)
return default

def getall(self) -> List[str]:
Expand Down Expand Up @@ -390,7 +389,7 @@ def xpath(
self: _SelectorType,
query: str,
namespaces: Optional[Mapping[str, str]] = None,
**kwargs,
**kwargs: Any,
) -> SelectorList[_SelectorType]:
"""
Find nodes matching the xpath ``query`` and return the result as a
Expand Down Expand Up @@ -563,10 +562,7 @@ def remove_namespaces(self) -> None:
# loop on element attributes also
for an in el.attrib:
if an.startswith("{"):
# this cast shouldn't be needed as pop never returns None
el.attrib[an.split("}", 1)[1]] = typing.cast(
str, el.attrib.pop(an)
)
el.attrib[an.split("}", 1)[1]] = el.attrib.pop(an)
# remove namespace declarations
etree.cleanup_namespaces(self.root)

Expand Down Expand Up @@ -599,7 +595,7 @@ def remove(self) -> None:
"are you trying to remove a root element?"
)

def drop(self):
def drop(self) -> None:
"""
Drop matched nodes from the parent element.
"""
Expand All @@ -616,9 +612,11 @@ def drop(self):

try:
if self.type == "xml":
if parent is None:
raise ValueError("This node has no parent")
parent.remove(self.root)
else:
self.root.drop_tree()
typing.cast(html.HtmlElement, self.root).drop_tree()
except (AttributeError, AssertionError):
# 'NoneType' object has no attribute 'drop'
raise CannotDropElementWithoutParent(
Expand Down
6 changes: 3 additions & 3 deletions parsel/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import re
from typing import Any, List, Pattern, Union, cast, Match
from typing import Any, Iterable, Iterator, List, Match, Pattern, Union, cast
from w3lib.html import replace_entities as w3lib_replace_entities


def flatten(x):
def flatten(x: Iterable[Any]) -> List[Any]:
"""flatten(sequence) -> list
Returns a single, flat list which contains all elements retrieved
from the sequence and all recursively contained sub-sequences
Expand All @@ -21,7 +21,7 @@ def flatten(x):
return list(iflatten(x))


def iflatten(x):
def iflatten(x: Iterable[Any]) -> Iterator[Any]:
"""iflatten(sequence) -> Iterator
Similar to ``.flatten()``, but returns iterator instead"""
for el in x:
Expand Down
11 changes: 7 additions & 4 deletions parsel/xpathfuncs.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import re
from typing import Any, Callable, Optional

from lxml import etree

from w3lib.html import HTML5_WHITESPACE


regex = f"[{HTML5_WHITESPACE}]+"
replace_html5_whitespaces = re.compile(regex).sub


def set_xpathfunc(fname, func):
def set_xpathfunc(fname: str, func: Optional[Callable]) -> None: # type: ignore[type-arg]
"""Register a custom extension function to use in XPath expressions.
The function ``func`` registered under ``fname`` identifier will be called
Expand All @@ -21,18 +24,18 @@ def set_xpathfunc(fname, func):
.. _`in lxml documentation`: https://lxml.de/extensions.html#xpath-extension-functions
"""
ns_fns = etree.FunctionNamespace(None)
ns_fns = etree.FunctionNamespace(None) # type: ignore[attr-defined]
if func is not None:
ns_fns[fname] = func
else:
del ns_fns[fname]


def setup():
def setup() -> None:
    """Register ``has_class`` as the ``has-class`` XPath extension function."""
    set_xpathfunc(fname="has-class", func=has_class)


def has_class(context, *classes):
def has_class(context: Any, *classes: str) -> bool:
"""has-class function.
Return True if all ``classes`` are present in element's class attr.
Expand Down
2 changes: 1 addition & 1 deletion tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -1108,7 +1108,7 @@ def test_remove_root_element_selector(self) -> None:
sel.css("body").drop()
self.assertEqual(sel.get(), "<html></html>")

def test_deep_nesting(self):
def test_deep_nesting(self) -> None:
lxml_version = parse_version(etree.__version__)
lxml_huge_tree_version = parse_version("4.2")

Expand Down

0 comments on commit f73b390

Please sign in to comment.