Full typing (#265)

scrapy · Feb 15, 2023 · f73b390
1 parent 55248fa
commit f73b390
Show file tree

Hide file tree

Showing 12 changed files with 168 additions and 79 deletions.
diff --git a/.bandit.yml b/.bandit.yml
@@ -1,3 +1,4 @@
 skips:
+- B101
 - B320
 - B410
diff --git a/.flake8 b/.flake8
@@ -1,5 +1,5 @@
 [flake8]
-ignore = E203
+ignore = E203,W503
 per-file-ignores =
     docs/conftest.py:E501
     parsel/csstranslator.py:E501

diff --git a/parsel/csstranslator.py b/parsel/csstranslator.py
@@ -1,27 +1,38 @@
 from functools import lru_cache
+from typing import TYPE_CHECKING, Any, Optional
 
 from cssselect import GenericTranslator as OriginalGenericTranslator
 from cssselect import HTMLTranslator as OriginalHTMLTranslator
 from cssselect.xpath import XPathExpr as OriginalXPathExpr
 from cssselect.xpath import ExpressionError
-from cssselect.parser import FunctionalPseudoElement
+from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
+
+
+if TYPE_CHECKING:
+    # typing.Self requires Python 3.11
+    from typing_extensions import Self
 
 
 class XPathExpr(OriginalXPathExpr):
 
-    textnode = False
-    attribute = None
+    textnode: bool = False
+    attribute: Optional[str] = None
 
     @classmethod
-    def from_xpath(cls, xpath, textnode=False, attribute=None):
+    def from_xpath(
+        cls,
+        xpath: OriginalXPathExpr,
+        textnode: bool = False,
+        attribute: Optional[str] = None,
+    ) -> "Self":
         x = cls(
             path=xpath.path, element=xpath.element, condition=xpath.condition
         )
         x.textnode = textnode
         x.attribute = attribute
         return x
 
-    def __str__(self):
+    def __str__(self) -> str:
         path = super().__str__()
         if self.textnode:
             if path == "*":
@@ -38,46 +49,77 @@ def __str__(self):
 
         return path
 
-    def join(self, combiner, other, *args, **kwargs):
+    def join(
+        self: "Self",
+        combiner: str,
+        other: OriginalXPathExpr,
+        *args: Any,
+        **kwargs: Any,
+    ) -> "Self":
+        if not isinstance(other, XPathExpr):
+            raise ValueError(
+                f"Expressions of type {__name__}.XPathExpr can only join expressions"
+                f" of the same type (or its descendants), got {type(other)}"
+            )
         super().join(combiner, other, *args, **kwargs)
         self.textnode = other.textnode
         self.attribute = other.attribute
         return self
 
 
+if TYPE_CHECKING:
+    # requires Python 3.8
+    from typing import Protocol
+
+    # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
+    class TranslatorProtocol(Protocol):
+        def xpath_element(self, selector: Element) -> OriginalXPathExpr:
+            pass
+
+        def css_to_xpath(self, css: str, prefix: str = ...) -> str:
+            pass
+
+
 class TranslatorMixin:
     """This mixin adds support to CSS pseudo elements via dynamic dispatch.
 
     Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
     """
 
-    def xpath_element(self, selector):
-        xpath = super().xpath_element(selector)
+    def xpath_element(
+        self: "TranslatorProtocol", selector: Element
+    ) -> XPathExpr:
+        # https://github.com/python/mypy/issues/12344
+        xpath = super().xpath_element(selector)  # type: ignore[safe-super]
         return XPathExpr.from_xpath(xpath)
 
-    def xpath_pseudo_element(self, xpath, pseudo_element):
+    def xpath_pseudo_element(
+        self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
+    ) -> OriginalXPathExpr:
         """
         Dispatch method that transforms XPath to support pseudo-element
         """
         if isinstance(pseudo_element, FunctionalPseudoElement):
-            method = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
-            method = getattr(self, method, None)
+            method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
+            method = getattr(self, method_name, None)
             if not method:
                 raise ExpressionError(
                     f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
                 )
             xpath = method(xpath, pseudo_element)
         else:
-            method = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
-            method = getattr(self, method, None)
+            method_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
+            method = getattr(self, method_name, None)
             if not method:
                 raise ExpressionError(
                     f"The pseudo-element ::{pseudo_element} is unknown"
                 )
             xpath = method(xpath)
         return xpath
 
-    def xpath_attr_functional_pseudo_element(self, xpath, function):
+    def xpath_attr_functional_pseudo_element(
+        self, xpath: OriginalXPathExpr, function: FunctionalPseudoElement
+    ) -> XPathExpr:
         """Support selecting attribute values using ::attr() pseudo-element"""
         if function.argument_types() not in (["STRING"], ["IDENT"]):
             raise ExpressionError(
@@ -87,26 +129,32 @@ def xpath_attr_functional_pseudo_element(self, xpath, function):
             xpath, attribute=function.arguments[0].value
         )
 
-    def xpath_text_simple_pseudo_element(self, xpath):
+    def xpath_text_simple_pseudo_element(
+        self, xpath: OriginalXPathExpr
+    ) -> XPathExpr:
         """Support selecting text nodes using ::text pseudo-element"""
         return XPathExpr.from_xpath(xpath, textnode=True)
 
 
 class GenericTranslator(TranslatorMixin, OriginalGenericTranslator):
     @lru_cache(maxsize=256)
-    def css_to_xpath(self, css, prefix="descendant-or-self::"):
+    def css_to_xpath(
+        self, css: str, prefix: str = "descendant-or-self::"
+    ) -> str:
         return super().css_to_xpath(css, prefix)
 
 
 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
     @lru_cache(maxsize=256)
-    def css_to_xpath(self, css, prefix="descendant-or-self::"):
+    def css_to_xpath(
+        self, css: str, prefix: str = "descendant-or-self::"
+    ) -> str:
         return super().css_to_xpath(css, prefix)
 
 
 _translator = HTMLTranslator()
 
 
-def css2xpath(query):
+def css2xpath(query: str) -> str:
     "Return translated XPath version of a given CSS query"
     return _translator.css_to_xpath(query)
diff --git a/parsel/selector.py b/parsel/selector.py
@@ -56,8 +56,8 @@ class CannotDropElementWithoutParent(CannotRemoveElementWithoutParent):
     pass
 
 
-class SafeXMLParser(etree.XMLParser):
-    def __init__(self, *args, **kwargs) -> None:
+class SafeXMLParser(etree.XMLParser):  # type: ignore[type-arg]
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
         kwargs.setdefault("resolve_entities", False)
         super().__init__(*args, **kwargs)
 
@@ -95,11 +95,10 @@ def create_root_node(
     body = text.strip().replace("\x00", "").encode("utf8") or b"<html/>"
     if huge_tree and LXML_SUPPORTS_HUGE_TREE:
         parser = parser_cls(recover=True, encoding="utf8", huge_tree=True)
-        # the stub wrongly thinks base_url can't be None
-        root = etree.fromstring(body, parser=parser, base_url=base_url)  # type: ignore[arg-type]
+        root = etree.fromstring(body, parser=parser, base_url=base_url)
     else:
         parser = parser_cls(recover=True, encoding="utf8")
-        root = etree.fromstring(body, parser=parser, base_url=base_url)  # type: ignore[arg-type]
+        root = etree.fromstring(body, parser=parser, base_url=base_url)
         for error in parser.error_log:
             if "use XML_PARSE_HUGE option" in error.message:
                 warnings.warn(
@@ -143,7 +142,7 @@ def xpath(
         self,
         xpath: str,
         namespaces: Optional[Mapping[str, str]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> "SelectorList[_SelectorType]":
         """
         Call the ``.xpath()`` method for each element in this list and return
@@ -230,7 +229,7 @@ def re_first(
         for el in iflatten(
             x.re(regex, replace_entities=replace_entities) for x in self
         ):
-            return el
+            return typing.cast(str, el)
         return default
 
     def getall(self) -> List[str]:
@@ -390,7 +389,7 @@ def xpath(
         self: _SelectorType,
         query: str,
         namespaces: Optional[Mapping[str, str]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> SelectorList[_SelectorType]:
         """
         Find nodes matching the xpath ``query`` and return the result as a
@@ -563,10 +562,7 @@ def remove_namespaces(self) -> None:
             # loop on element attributes also
             for an in el.attrib:
                 if an.startswith("{"):
-                    # this cast shouldn't be needed as pop never returns None
-                    el.attrib[an.split("}", 1)[1]] = typing.cast(
-                        str, el.attrib.pop(an)
-                    )
+                    el.attrib[an.split("}", 1)[1]] = el.attrib.pop(an)
         # remove namespace declarations
         etree.cleanup_namespaces(self.root)
 
@@ -599,7 +595,7 @@ def remove(self) -> None:
                 "are you trying to remove a root element?"
             )
 
-    def drop(self):
+    def drop(self) -> None:
         """
         Drop matched nodes from the parent element.
         """
@@ -616,9 +612,11 @@ def drop(self):
 
         try:
             if self.type == "xml":
+                if parent is None:
+                    raise ValueError("This node has no parent")
                 parent.remove(self.root)
             else:
-                self.root.drop_tree()
+                typing.cast(html.HtmlElement, self.root).drop_tree()
         except (AttributeError, AssertionError):
             # 'NoneType' object has no attribute 'drop'
             raise CannotDropElementWithoutParent(

diff --git a/parsel/utils.py b/parsel/utils.py
@@ -1,9 +1,9 @@
 import re
-from typing import Any, List, Pattern, Union, cast, Match
+from typing import Any, Iterable, Iterator, List, Match, Pattern, Union, cast
 from w3lib.html import replace_entities as w3lib_replace_entities
 
 
-def flatten(x):
+def flatten(x: Iterable[Any]) -> List[Any]:
     """flatten(sequence) -> list
     Returns a single, flat list which contains all elements retrieved
     from the sequence and all recursively contained sub-sequences
@@ -21,7 +21,7 @@ def flatten(x):
     return list(iflatten(x))
 
 
-def iflatten(x):
+def iflatten(x: Iterable[Any]) -> Iterator[Any]:
     """iflatten(sequence) -> Iterator
     Similar to ``.flatten()``, but returns iterator instead"""
     for el in x:

diff --git a/parsel/xpathfuncs.py b/parsel/xpathfuncs.py
@@ -1,13 +1,16 @@
 import re
+from typing import Any, Callable, Optional
+
 from lxml import etree
 
 from w3lib.html import HTML5_WHITESPACE
 
+
 regex = f"[{HTML5_WHITESPACE}]+"
 replace_html5_whitespaces = re.compile(regex).sub
 
 
-def set_xpathfunc(fname, func):
+def set_xpathfunc(fname: str, func: Optional[Callable]) -> None:  # type: ignore[type-arg]
     """Register a custom extension function to use in XPath expressions.
 
     The function ``func`` registered under ``fname`` identifier will be called
@@ -21,18 +24,18 @@ def set_xpathfunc(fname, func):
     .. _`in lxml documentation`: https://lxml.de/extensions.html#xpath-extension-functions
 
     """
-    ns_fns = etree.FunctionNamespace(None)
+    ns_fns = etree.FunctionNamespace(None)  # type: ignore[attr-defined]
     if func is not None:
         ns_fns[fname] = func
     else:
         del ns_fns[fname]
 
 
-def setup():
+def setup() -> None:
     set_xpathfunc("has-class", has_class)
 
 
-def has_class(context, *classes):
+def has_class(context: Any, *classes: str) -> bool:
     """has-class function.
 
     Return True if all ``classes`` are present in element's class attr.

diff --git a/tests/test_selector.py b/tests/test_selector.py
@@ -1108,7 +1108,7 @@ def test_remove_root_element_selector(self) -> None:
         sel.css("body").drop()
         self.assertEqual(sel.get(), "<html></html>")
 
-    def test_deep_nesting(self):
+    def test_deep_nesting(self) -> None:
         lxml_version = parse_version(etree.__version__)
         lxml_huge_tree_version = parse_version("4.2")