From c9b8e5ac4a295c79abe1a2e6fb90217e33c187db Mon Sep 17 00:00:00 2001 From: Elias Dorneles Date: Thu, 20 Aug 2015 19:10:28 -0300 Subject: [PATCH 1/4] setting up local doc build to use RTD theme --- Makefile | 1 + docs/conf.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9ab6a096..2a83dc87 100644 --- a/Makefile +++ b/Makefile @@ -50,6 +50,7 @@ coverage: python -m webbrowser htmlcov/index.html docs: + ( python -c 'import sphinx_rtd_theme' 2>/dev/null || pip install sphinx_rtd_theme ) rm -f docs/parsel.rst rm -f docs/modules.rst sphinx-apidoc -o docs/ parsel diff --git a/docs/conf.py b/docs/conf.py index dee626da..e68c650c 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -111,7 +111,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the From bd61b43a69794227e4a2f23596797dafceb3d2a1 Mon Sep 17 00:00:00 2001 From: Elias Dorneles Date: Thu, 20 Aug 2015 19:49:20 -0300 Subject: [PATCH 2/4] move API docs to docstrings and refer to them using autodocs directives --- docs/usage.rst | 93 ++-------------------------------------------- parsel/selector.py | 73 ++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 89 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index 49780029..7b743703 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -519,102 +519,17 @@ to use the ``.`` in the XPath expressions that will follow. API reference ============= -.. module:: parsel.selector - :synopsis: Selector class -.. class:: Selector(text, type=None) - - :class:`Selector` allows you to select parts of an XML or HTML text using CSS - or XPath expressions and extract data from it. 
- - ``text`` is utf-8 encoded text (unicode object in Python 3 or str in Python 3) - - ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default). - If ``type`` is ``None``, the selector defaults to ``"html"``. - - .. method:: xpath(query) - - Find nodes matching the xpath ``query`` and return the result as a - :class:`SelectorList` instance with all elements flattened. List - elements implement :class:`Selector` interface too. - - ``query`` is a string containing the XPATH query to apply. - - .. method:: css(query) - - Apply the given CSS selector and return a :class:`SelectorList` instance. - - ``query`` is a string containing the CSS selector to apply. - - In the background, CSS queries are translated into XPath queries using - `cssselect`_ library and run ``.xpath()`` method. - - .. method:: extract() - - Serialize and return the matched nodes as a list of unicode strings. - Percent encoded content is unquoted. - - .. method:: re(regex) - - Apply the given regex and return a list of unicode strings with the - matches. - - ``regex`` can be either a compiled regular expression or a string which - will be compiled to a regular expression using ``re.compile(regex)`` - - .. method:: register_namespace(prefix, uri) - - Register the given namespace to be used in this :class:`Selector`. - Without registering namespaces you can't select or extract data from - non-standard namespaces. See examples below. - - .. method:: remove_namespaces() - - Remove all namespaces, allowing to traverse the document using - namespace-less xpaths. See example below. - - .. method:: __nonzero__() - - Returns ``True`` if there is any real content selected or ``False`` - otherwise. In other words, the boolean value of a :class:`Selector` is - given by the contents it selects. +.. autoclass:: parsel.selector.Selector + :members: SelectorList objects -------------------- -.. 
class:: SelectorList - - The :class:`SelectorList` class is a subclass of the builtin ``list`` - class, which provides a few additional methods. - - .. method:: xpath(query) - - Call the ``.xpath()`` method for each element in this list and return - their results flattened as another :class:`SelectorList`. - - ``query`` is the same argument as the one in :meth:`Selector.xpath` - - .. method:: css(query) - - Call the ``.css()`` method for each element in this list and return - their results flattened as another :class:`SelectorList`. - - ``query`` is the same argument as the one in :meth:`Selector.css` - - .. method:: extract() - - Call the ``.extract()`` method for each element is this list and return - their results flattened, as a list of unicode strings. - - .. method:: re() - - Call the ``.re()`` method for each element is this list and return - their results flattened, as a list of unicode strings. - - .. method:: __nonzero__() +.. autoclass:: parsel.selector.SelectorList + :members: - returns True if the list is not empty, False otherwise. Selector examples on HTML text diff --git a/parsel/selector.py b/parsel/selector.py index 9d172656..d3381a6c 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -42,6 +42,10 @@ def create_root_node(text, parser_cls, base_url=None): class SelectorList(list): + """ + The :class:`SelectorList` class is a subclass of the builtin ``list`` + class, which provides a few additional methods. + """ # __getslice__ is deprecated but `list` builtin implements it only in Py2 def __getslice__(self, i, j): @@ -53,12 +57,28 @@ def __getitem__(self, pos): return self.__class__(o) if isinstance(pos, slice) else o def xpath(self, xpath): + """ + Call the ``.xpath()`` method for each element in this list and return + their results flattened as another :class:`SelectorList`. 
+ + ``query`` is the same argument as the one in :meth:`Selector.xpath` + """ return self.__class__(flatten([x.xpath(xpath) for x in self])) def css(self, xpath): + """ + Call the ``.css()`` method for each element in this list and return + their results flattened as another :class:`SelectorList`. + + ``query`` is the same argument as the one in :meth:`Selector.css` + """ return self.__class__(flatten([x.css(xpath) for x in self])) def re(self, regex): + """ + Call the ``.re()`` method for each element in this list and return + their results flattened, as a list of unicode strings. + """ return flatten([x.re(regex) for x in self]) def re_first(self, regex): @@ -66,6 +86,10 @@ def re_first(self, regex): return el def extract(self): + """ + Call the ``.extract()`` method for each element in this list and return + their results flattened, as a list of unicode strings. + """ return [x.extract() for x in self] def extract_first(self, default=None): @@ -76,6 +100,15 @@ def extract_first(self, default=None): class Selector(object): + """ + :class:`Selector` allows you to select parts of an XML or HTML text using CSS + or XPath expressions and extract data from it. + + ``text`` is utf-8 encoded text (unicode object in Python 3 or str in Python 3) + + ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default). + If ``type`` is ``None``, the selector defaults to ``"html"``. + """ __slots__ = ['text', 'namespaces', 'type', '_expr', 'root', '__weakref__', '_parser', '_csstranslator', '_tostring_method'] @@ -119,6 +152,13 @@ def _get_root(self, text, base_url=None): return create_root_node(text, self._parser, base_url=base_url) def xpath(self, query): + """ + Find nodes matching the xpath ``query`` and return the result as a + :class:`SelectorList` instance with all elements flattened. List + elements implement :class:`Selector` interface too. + + ``query`` is a string containing the XPATH query to apply. 
+ """ try: xpathev = self.root.xpath except AttributeError: @@ -141,15 +181,34 @@ def xpath(self, query): return self.selectorlist_cls(result) def css(self, query): + """ + Apply the given CSS selector and return a :class:`SelectorList` instance. + + ``query`` is a string containing the CSS selector to apply. + + In the background, CSS queries are translated into XPath queries using + `cssselect`_ library and run ``.xpath()`` method. + """ return self.xpath(self._css2xpath(query)) def _css2xpath(self, query): return self._csstranslator.css_to_xpath(query) def re(self, regex): + """ + Apply the given regex and return a list of unicode strings with the + matches. + + ``regex`` can be either a compiled regular expression or a string which + will be compiled to a regular expression using ``re.compile(regex)`` + """ return extract_regex(regex, self.extract()) def extract(self): + """ + Serialize and return the matched nodes as a list of unicode strings. + Percent encoded content is unquoted. + """ try: return etree.tostring(self.root, method=self._tostring_method, @@ -164,9 +223,18 @@ def extract(self): return six.text_type(self.root) def register_namespace(self, prefix, uri): + """ + Register the given namespace to be used in this :class:`Selector`. + Without registering namespaces you can't select or extract data from + non-standard namespaces. See examples below. + """ self.namespaces[prefix] = uri def remove_namespaces(self): + """ + Remove all namespaces, allowing to traverse the document using + namespace-less xpaths. See example below. + """ for el in self.root.iter('*'): if el.tag.startswith('{'): el.tag = el.tag.split('}', 1)[1] @@ -176,6 +244,11 @@ def remove_namespaces(self): el.attrib[an.split('}', 1)[1]] = el.attrib.pop(an) def __bool__(self): + """ + Returns ``True`` if there is any real content selected or ``False`` + otherwise. In other words, the boolean value of a :class:`Selector` is + given by the contents it selects. 
+ """ return bool(self.extract()) __nonzero__ = __bool__ From 524ea3eda55b931d16774b3c7181c9b94fa33b46 Mon Sep 17 00:00:00 2001 From: Elias Dorneles Date: Fri, 21 Aug 2015 15:15:14 -0300 Subject: [PATCH 3/4] use proper references in the docstrings --- docs/usage.rst | 2 ++ parsel/selector.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index 7b743703..db4672c6 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -558,6 +558,8 @@ an HTML text like this:: print node.xpath("@class").extract() +.. _selector-examples-xml: + Selector examples on XML text ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/parsel/selector.py b/parsel/selector.py index d3381a6c..88863df0 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -226,14 +226,14 @@ def register_namespace(self, prefix, uri): """ Register the given namespace to be used in this :class:`Selector`. Without registering namespaces you can't select or extract data from - non-standard namespaces. See examples below. + non-standard namespaces. See :ref:`selector-examples-xml`. """ self.namespaces[prefix] = uri def remove_namespaces(self): """ Remove all namespaces, allowing to traverse the document using - namespace-less xpaths. See example below. + namespace-less xpaths. See :ref:`removing-namespaces`. """ for el in self.root.iter('*'): if el.tag.startswith('{'): From 945105fb791cea1a2c8286c90d23ba7e67ac8d15 Mon Sep 17 00:00:00 2001 From: Elias Dorneles Date: Fri, 21 Aug 2015 15:54:03 -0300 Subject: [PATCH 4/4] minor grammar and style fixes --- parsel/selector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index 88863df0..282f2e13 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -104,7 +104,7 @@ class Selector(object): :class:`Selector` allows you to select parts of an XML or HTML text using CSS or XPath expressions and extract data from it. 
- ``text`` is utf-8 encoded text (unicode object in Python 3 or str in Python 3) + ``text`` is a ``unicode`` object in Python 2 or a ``str`` object in Python 3 ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default). If ``type`` is ``None``, the selector defaults to ``"html"``. @@ -245,7 +245,7 @@ def remove_namespaces(self): def __bool__(self): """ - Returns ``True`` if there is any real content selected or ``False`` + Return ``True`` if there is any real content selected or ``False`` otherwise. In other words, the boolean value of a :class:`Selector` is given by the contents it selects. """