Commit

Merge pull request #218 from pcorpet/typing
wRAR committed Jul 30, 2021
2 parents 56bec18 + 9de2cd2 commit dc2eb7d
Showing 13 changed files with 262 additions and 120 deletions.
4 changes: 4 additions & 0 deletions .coveragerc
@@ -1,3 +1,7 @@
[run]
branch = true
include = parsel/*

[report]
exclude_lines =
@typing.overload
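
The new exclude_lines rule keeps coverage from flagging @typing.overload stubs: an overload-decorated body exists only for the type checker and never executes at runtime, so it would otherwise show up as uncovered. A minimal sketch of the pattern (hypothetical function, not part of this PR):

    import typing
    from typing import List, Optional

    @typing.overload
    def first_text(values: List[str], default: None = None) -> Optional[str]:
        pass  # stub body: never runs, hence the coverage exclusion

    @typing.overload
    def first_text(values: List[str], default: str) -> str:
        pass  # stub body: never runs

    def first_text(values: List[str], default: Optional[str] = None) -> Optional[str]:
        # The only body that executes at runtime.
        return values[0] if values else default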
3 changes: 3 additions & 0 deletions .github/workflows/checks.yml
@@ -20,6 +20,9 @@ jobs:
- python-version: 3.8 # Keep in sync with .readthedocs.yml
env:
TOXENV: docs
- python-version: 3.9
env:
TOXENV: typing

steps:
- uses: actions/checkout@v2
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -3,6 +3,7 @@ include CONTRIBUTING.rst
include NEWS
include LICENSE
include README.rst
include py.typed

recursive-include tests *
recursive-exclude * __pycache__
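
Including py.typed in MANIFEST.in only covers the source distribution; PEP 561 also requires the marker to be installed inside the package so type checkers pick up the inline annotations. The corresponding setup.py change is not shown in this excerpt, but a typical declaration looks roughly like this (hypothetical sketch, not the actual file):

    from setuptools import setup

    setup(
        name='parsel',
        packages=['parsel'],
        # Ship the PEP 561 marker next to the installed modules.
        package_data={'parsel': ['py.typed']},
        # Type checkers cannot read inline annotations from zipped installs.
        zip_safe=False,
    )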
2 changes: 2 additions & 0 deletions NEWS
@@ -3,6 +3,8 @@
History
-------

* Add PEP 561-style type information

1.6.0 (2020-05-07)
~~~~~~~~~~~~~~~~~~

8 changes: 7 additions & 1 deletion parsel/__init__.py
@@ -6,9 +6,15 @@
__author__ = 'Scrapy project'
__email__ = 'info@scrapy.org'
__version__ = '1.6.0'
__all__ = [
'Selector',
'SelectorList',
'css2xpath',
'xpathfuncs',
]

from parsel.selector import Selector, SelectorList # NOQA
from parsel.csstranslator import css2xpath # NOQA
from parsel import xpathfuncs # NOQA
from parsel import xpathfuncs # NOQA

xpathfuncs.setup()
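
A common motivation for spelling out __all__ in a typed package is mypy's no_implicit_reexport check (part of --strict): names that are merely imported into __init__.py are not treated as public unless they appear in __all__ or are imported with an explicit alias. A small usage sketch (assumes parsel with this change installed):

    from parsel import Selector  # accepted: 'Selector' is listed in parsel.__all__

    sel = Selector(text='<p>hello</p>')
    print(sel.css('p::text').get())  # -> 'hello'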
115 changes: 83 additions & 32 deletions parsel/selector.py
@@ -2,11 +2,16 @@
XPath selectors based on lxml
"""

import typing
from typing import Any, Dict, List, Optional, Mapping, Pattern, Union

from lxml import etree, html

from .utils import flatten, iflatten, extract_regex, shorten
from .csstranslator import HTMLTranslator, GenericTranslator

_SelectorType = typing.TypeVar('_SelectorType', bound='Selector')


class CannotRemoveElementWithoutRoot(Exception):
pass
@@ -17,7 +22,7 @@ class CannotRemoveElementWithoutParent(Exception):


class SafeXMLParser(etree.XMLParser):
def __init__(self, *args, **kwargs):
def __init__(self, *args, **kwargs) -> None:
kwargs.setdefault('resolve_entities', False)
super().__init__(*args, **kwargs)

@@ -32,7 +37,7 @@ def __init__(self, *args, **kwargs):
}


def _st(st):
def _st(st: Optional[str]) -> str:
if st is None:
return 'html'
elif st in _ctgroup:
@@ -52,20 +57,33 @@ def create_root_node(text, parser_cls, base_url=None):
return root


class SelectorList(list):
class SelectorList(List[_SelectorType]):
"""
The :class:`SelectorList` class is a subclass of the builtin ``list``
class, which provides a few additional methods.
"""

def __getitem__(self, pos):
@typing.overload
def __getitem__(self, pos: int) -> _SelectorType:
pass

@typing.overload
def __getitem__(self, pos: slice) -> 'SelectorList[_SelectorType]':
pass

def __getitem__(self, pos: Union[int, slice]) -> Union[_SelectorType, 'SelectorList[_SelectorType]']:
o = super().__getitem__(pos)
return self.__class__(o) if isinstance(pos, slice) else o

def __getstate__(self):
def __getstate__(self) -> None:
raise TypeError("can't pickle SelectorList objects")

def xpath(self, xpath, namespaces=None, **kwargs):
def xpath(
self,
xpath: str,
namespaces: Optional[Mapping[str, str]] = None,
**kwargs,
) -> 'SelectorList[_SelectorType]':
"""
Call the ``.xpath()`` method for each element in this list and return
their results flattened as another :class:`SelectorList`.
@@ -84,7 +102,7 @@ def xpath(self, xpath, namespaces=None, **kwargs):
"""
return self.__class__(flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]))

def css(self, query):
def css(self, query: str) -> 'SelectorList[_SelectorType]':
"""
Call the ``.css()`` method for each element in this list and return
their results flattened as another :class:`SelectorList`.
@@ -93,7 +111,7 @@ def css(self, query):
"""
return self.__class__(flatten([x.css(query) for x in self]))

def re(self, regex, replace_entities=True):
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> List[str]:
"""
Call the ``.re()`` method for each element in this list and return
their results flattened, as a list of unicode strings.
@@ -105,7 +123,18 @@ def re(self, regex, replace_entities=True):
"""
return flatten([x.re(regex, replace_entities=replace_entities) for x in self])

def re_first(self, regex, default=None, replace_entities=True):
@typing.overload
def re_first(self, regex: Union[str, Pattern[str]], default: None = None,
replace_entities: bool = True) -> Optional[str]:
pass

@typing.overload
def re_first(self, regex: Union[str, Pattern[str]], default: str,
replace_entities: bool = True) -> str:
pass

def re_first(self, regex: Union[str, Pattern[str]], default: Optional[str] = None,
replace_entities: bool = True) -> Optional[str]:
"""
Call the ``.re()`` method for the first element in this list and
return the result in an unicode string. If the list is empty or the
@@ -121,15 +150,23 @@ def re_first(self, regex, default=None, replace_entities=True):
return el
return default

def getall(self):
def getall(self) -> List[str]:
"""
Call the ``.get()`` method for each element is this list and return
their results flattened, as a list of unicode strings.
"""
return [x.get() for x in self]
extract = getall

def get(self, default=None):
@typing.overload
def get(self, default: None = None) -> Optional[str]:
pass

@typing.overload
def get(self, default: str) -> str:
pass

def get(self, default: Optional[str] = None) -> Optional[str]:
"""
Return the result of ``.get()`` for the first element in this list.
If the list is empty, return the default value.
@@ -140,15 +177,15 @@ def get(self, default=None):
extract_first = get

@property
def attrib(self):
def attrib(self) -> Mapping[str, str]:
"""Return the attributes dictionary for the first element.
If the list is empty, return an empty dict.
"""
for x in self:
return x.attrib
return {}

def remove(self):
def remove(self) -> None:
"""
Remove matched nodes from the parent for each element in this list.
"""
@@ -173,7 +210,7 @@ class Selector:
__slots__ = ['text', 'namespaces', 'type', '_expr', 'root',
'__weakref__', '_parser', '_csstranslator', '_tostring_method']

_default_type = None
_default_type: Optional[str] = None
_default_namespaces = {
"re": "http://exslt.org/regular-expressions",

@@ -186,10 +223,11 @@ class Selector:
"set": "http://exslt.org/sets"
}
_lxml_smart_strings = False
selectorlist_cls = SelectorList
selectorlist_cls = SelectorList['Selector']

def __init__(self, text=None, type=None, namespaces=None, root=None,
base_url=None, _expr=None):
def __init__(self, text: Optional[str] = None, type: Optional[str] = None,
namespaces: Optional[Mapping[str, str]] = None, root: Optional[Any] = None,
base_url: Optional[str] = None, _expr: Optional[str] = None) -> None:
self.type = st = _st(type or self._default_type)
self._parser = _ctgroup[st]['_parser']
self._csstranslator = _ctgroup[st]['_csstranslator']
@@ -211,13 +249,15 @@ def __init__(self, text=None, type=None, namespaces=None, root=None,
self.root = root
self._expr = _expr

def __getstate__(self):
def __getstate__(self) -> Any:
raise TypeError("can't pickle Selector objects")

def _get_root(self, text, base_url=None):
def _get_root(self, text: str, base_url: Optional[str] = None) -> Any:
return create_root_node(text, self._parser, base_url=base_url)

def xpath(self, query, namespaces=None, **kwargs):
def xpath(
self: _SelectorType, query: str, namespaces: Optional[Mapping[str, str]] = None, **kwargs,
) -> SelectorList[_SelectorType]:
"""
Find nodes matching the xpath ``query`` and return the result as a
:class:`SelectorList` instance with all elements flattened. List
@@ -259,7 +299,7 @@ def xpath(self, query, namespaces=None, **kwargs):
for x in result]
return self.selectorlist_cls(result)

def css(self, query):
def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
"""
Apply the given CSS selector and return a :class:`SelectorList` instance.
@@ -272,10 +312,10 @@ def css(self, query):
"""
return self.xpath(self._css2xpath(query))

def _css2xpath(self, query):
def _css2xpath(self, query: str) -> Any:
return self._csstranslator.css_to_xpath(query)

def re(self, regex, replace_entities=True):
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> List[str]:
"""
Apply the given regex and return a list of unicode strings with the
matches.
@@ -290,7 +330,18 @@ def re(self, regex, replace_entities=True):
"""
return extract_regex(regex, self.get(), replace_entities=replace_entities)

def re_first(self, regex, default=None, replace_entities=True):
@typing.overload
def re_first(self, regex: Union[str, Pattern[str]], default: None = None,
replace_entities: bool = True) -> Optional[str]:
pass

@typing.overload
def re_first(self, regex: Union[str, Pattern[str]], default: str,
replace_entities: bool = True) -> str:
pass

def re_first(self, regex: Union[str, Pattern[str]], default: Optional[str] = None,
replace_entities: bool = True) -> Optional[str]:
"""
Apply the given regex and return the first unicode string which
matches. If there is no match, return the default value (``None`` if
@@ -303,7 +354,7 @@ def re_first(self, regex, default=None, replace_entities=True):
"""
return next(iflatten(self.re(regex, replace_entities=replace_entities)), default)

def get(self):
def get(self) -> str:
"""
Serialize and return the matched nodes in a single unicode string.
Percent encoded content is unquoted.
@@ -322,21 +373,21 @@ def get(self):
return str(self.root)
extract = get

def getall(self):
def getall(self) -> List[str]:
"""
Serialize and return the matched node in a 1-element list of unicode strings.
"""
return [self.get()]

def register_namespace(self, prefix, uri):
def register_namespace(self, prefix: str, uri: str) -> None:
"""
Register the given namespace to be used in this :class:`Selector`.
Without registering namespaces you can't select or extract data from
non-standard namespaces. See :ref:`selector-examples-xml`.
"""
self.namespaces[prefix] = uri

def remove_namespaces(self):
def remove_namespaces(self) -> None:
"""
Remove all namespaces, allowing to traverse the document using
namespace-less xpaths. See :ref:`removing-namespaces`.
@@ -351,7 +402,7 @@ def remove_namespaces(self):
# remove namespace declarations
etree.cleanup_namespaces(self.root)

def remove(self):
def remove(self) -> None:
"""
Remove matched nodes from the parent element.
"""
@@ -376,12 +427,12 @@ def remove(self):
)

@property
def attrib(self):
def attrib(self) -> Dict[str, str]:
"""Return the attributes dictionary for underlying element.
"""
return dict(self.root.attrib)

def __bool__(self):
def __bool__(self) -> bool:
"""
Return ``True`` if there is any real content selected or ``False``
otherwise. In other words, the boolean value of a :class:`Selector` is
@@ -390,7 +441,7 @@ def __bool__(self):
return bool(self.get())
__nonzero__ = __bool__

def __str__(self):
def __str__(self) -> str:
data = repr(shorten(self.get(), width=40))
return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)
__repr__ = __str__
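
The paired re_first()/get() overloads encode a useful contract: with no default (or an explicit None) the result may be None, while a str default removes None from the return type. A usage sketch (not part of the diff; assumes parsel with this PR applied):

    from parsel import Selector

    sel = Selector(text='<p id="price">12.50</p>')

    maybe = sel.css('#price::text').re_first(r'\d+\.\d+')
    # Type checkers see Optional[str], so a None check is required.
    if maybe is not None:
        print(float(maybe))

    price = sel.css('#price::text').re_first(r'\d+\.\d+', default='0.00')
    print(float(price))  # Type checkers see str; no None check needed.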
8 changes: 5 additions & 3 deletions parsel/utils.py
@@ -1,4 +1,5 @@
import re
from typing import Any, List, Pattern, Union
from w3lib.html import replace_entities as w3lib_replace_entities


@@ -31,7 +32,7 @@ def iflatten(x):
yield el


def _is_listlike(x):
def _is_listlike(x: Any) -> bool:
"""
>>> _is_listlike("foo")
False
@@ -55,7 +56,8 @@ def _is_listlike(x):
return hasattr(x, "__iter__") and not isinstance(x, (str, bytes))


def extract_regex(regex, text, replace_entities=True):
def extract_regex(regex: Union[str, Pattern[str]], text: str,
replace_entities: bool = True) -> List[str]:
"""Extract a list of unicode strings from the given text/encoding using the following policies:
* if the regex contains a named group called "extract" that will be returned
* if the regex contains multiple numbered groups, all those will be returned (flattened)
@@ -82,7 +84,7 @@ def extract_regex(regex, text, replace_entities=True):
return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]


def shorten(text, width, suffix='...'):
def shorten(text: str, width: int, suffix: str = '...') -> str:
"""Truncate the given text to fit in the given width."""
if len(text) <= width:
return text
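
extract_regex() now accepts Union[str, Pattern[str]], i.e. either a pattern string or a precompiled re.Pattern. A minimal, self-contained sketch of that accept-both idiom (hypothetical helper, not the parsel implementation):

    import re
    from typing import List, Pattern, Union

    def findall_either(regex: Union[str, Pattern[str]], text: str) -> List[str]:
        # Normalize: compile plain strings, pass compiled patterns through.
        compiled = re.compile(regex) if isinstance(regex, str) else regex
        return compiled.findall(text)

    print(findall_either(r'\d+', 'a1 b22'))                    # ['1', '22']
    print(findall_either(re.compile(r'[a-z]+\d+'), 'a1 b22'))  # ['a1', 'b22']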
Empty file added py.typed
Empty file.
1 change: 1 addition & 0 deletions pylintrc
@@ -1,4 +1,5 @@
[MASTER]
ignore=tests/typing
persistent=no

[MESSAGES CONTROL]
