Merge pull request #181 from EchoShoot/master
Support JMESPath now
wRAR committed Apr 11, 2023
2 parents 5b28b54 + 53d5146 commit bcfd94e
Showing 11 changed files with 615 additions and 115 deletions.
1 change: 0 additions & 1 deletion .coveragerc
@@ -1,6 +1,5 @@
[run]
branch = true
include = parsel/*

[report]
exclude_lines =
1 change: 1 addition & 0 deletions .gitignore
@@ -24,6 +24,7 @@ pip-log.txt

# Unit test / coverage reports
.coverage
/coverage.xml
.tox
nosetests.xml
htmlcov
42 changes: 29 additions & 13 deletions README.rst
@@ -19,9 +19,16 @@ Parsel
:alt: Coverage report


Parsel is a BSD-licensed Python_ library to extract and remove data from HTML_
and XML_ using XPath_ and CSS_ selectors, optionally combined with
`regular expressions`_.
Parsel is a BSD-licensed Python_ library to extract data from HTML_, JSON_, and
XML_ documents.

It supports:

- CSS_ and XPath_ expressions for HTML and XML documents

- JMESPath_ expressions for JSON documents

- `Regular expressions`_

Find the Parsel online documentation at https://parsel.readthedocs.org.

@@ -30,15 +37,18 @@ Example (`open online demo`_):
.. code-block:: python
>>> from parsel import Selector
>>> selector = Selector(text="""<html>
<body>
<h1>Hello, Parsel!</h1>
<ul>
<li><a href="http://example.com">Link 1</a></li>
<li><a href="http://scrapy.org">Link 2</a></li>
</ul>
</body>
</html>""")
>>> text = """
<html>
<body>
<h1>Hello, Parsel!</h1>
<ul>
<li><a href="http://example.com">Link 1</a></li>
<li><a href="http://scrapy.org">Link 2</a></li>
</ul>
<script type="application/json">{"a": ["b", "c"]}</script>
</body>
</html>"""
>>> selector = Selector(text=text)
>>> selector.css('h1::text').get()
'Hello, Parsel!'
>>> selector.xpath('//h1/text()').re(r'\w+')
@@ -47,12 +57,18 @@ Example (`open online demo`_):
... print(li.xpath('.//@href').get())
http://example.com
http://scrapy.org
>>> selector.css('script::text').jmespath("a").get()
'b'
>>> selector.css('script::text').jmespath("a").getall()
['b', 'c']
.. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets
.. _HTML: https://en.wikipedia.org/wiki/HTML
.. _JMESPath: https://jmespath.org/
.. _JSON: https://en.wikipedia.org/wiki/JSON
.. _open online demo: https://colab.research.google.com/drive/149VFa6Px3wg7S3SEnUqk--TyBrKplxCN#forceEdit=true&sandboxMode=true
.. _Python: https://www.python.org/
.. _regular expressions: https://docs.python.org/library/re.html
.. _XML: https://en.wikipedia.org/wiki/XML
.. _XPath: https://en.wikipedia.org/wiki/XPath

2 changes: 2 additions & 0 deletions docs/conf.py
@@ -134,6 +134,8 @@

# nitpicky = True # https://github.com/scrapy/cssselect/pull/110
nitpick_ignore = [
("py:class", "ExpressionError"),
("py:class", "SelectorSyntaxError"),
("py:class", "cssselect.xpath.GenericTranslator"),
("py:class", "cssselect.xpath.HTMLTranslator"),
("py:class", "cssselect.xpath.XPathExpr"),
68 changes: 44 additions & 24 deletions docs/usage.rst
@@ -4,32 +4,38 @@
Usage
=====

Create a :class:`~parsel.selector.Selector` object for the HTML or XML text
that you want to parse::
Create a :class:`~parsel.selector.Selector` object for your input text.

For HTML or XML, use `CSS`_ or `XPath`_ expressions to select data::

>>> from parsel import Selector
>>> text = "<html><body><h1>Hello, Parsel!</h1></body></html>"
>>> selector = Selector(text=text)
>>> html_text = "<html><body><h1>Hello, Parsel!</h1></body></html>"
>>> html_selector = Selector(text=html_text)
>>> html_selector.css('h1')
[<Selector query='descendant-or-self::h1' data='<h1>Hello, Parsel!</h1>'>]
>>> html_selector.xpath('//h1') # the same, but now with XPath
[<Selector query='//h1' data='<h1>Hello, Parsel!</h1>'>]

Then use `CSS`_ or `XPath`_ expressions to select elements::
For JSON, use `JMESPath`_ expressions to select data::

>>> selector.css('h1')
[<Selector xpath='descendant-or-self::h1' data='<h1>Hello, Parsel!</h1>'>]
>>> selector.xpath('//h1') # the same, but now with XPath
[<Selector xpath='//h1' data='<h1>Hello, Parsel!</h1>'>]
>>> json_text = '{"title":"Hello, Parsel!"}'
>>> json_selector = Selector(text=json_text)
>>> json_selector.jmespath('title')
[<Selector query='title' data='Hello, Parsel!'>]

And extract data from those elements::

>>> selector.css('h1::text').get()
>>> html_selector.xpath('//h1/text()').get()
'Hello, Parsel!'
>>> selector.xpath('//h1/text()').getall()
>>> json_selector.jmespath('title').getall()
['Hello, Parsel!']

.. _CSS: https://www.w3.org/TR/selectors
.. _XPath: https://www.w3.org/TR/xpath
.. _JMESPath: https://jmespath.org/

Learning CSS and XPath
======================
Learning expression languages
=============================

`CSS`_ is a language for applying styles to HTML documents. It defines
selectors to associate those styles with specific HTML elements. Resources to
@@ -39,20 +45,34 @@ learn CSS_ selectors include:

- `XPath/CSS Equivalents in Wikibooks`_

Parsel support for CSS selectors comes from cssselect, so read about `CSS
selectors supported by cssselect`_.

.. _CSS selectors supported by cssselect: https://cssselect.readthedocs.io/en/latest/#supported-selectors
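
For instance, a minimal sketch (the markup is made up for illustration) of a
structural pseudo-class that cssselect supports::

>>> from parsel import Selector
>>> sel = Selector(text="<ul><li>first</li><li>second</li></ul>")  # illustrative markup
>>> sel.css("li:nth-child(2)::text").get()
'second'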

`XPath`_ is a language for selecting nodes in XML documents, which can also be
used with HTML. Resources to learn XPath_ include:

- `XPath Tutorial in W3Schools`_

- `XPath cheatsheet`_

You can use either CSS_ or XPath_. CSS_ is usually more readable, but some
things can only be done with XPath_.
For HTML and XML input, you can use either CSS_ or XPath_. CSS_ is usually
more readable, but some things can only be done with XPath_.
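
For example, a minimal sketch (markup made up for illustration) of selecting an
element by its text content, which XPath can do but standard CSS selectors cannot::

>>> from parsel import Selector
>>> sel = Selector(text='<a href="http://example.com">Link 1</a><a href="http://scrapy.org">Link 2</a>')  # illustrative markup
>>> sel.xpath('//a[contains(text(), "Link 2")]/@href').get()
'http://scrapy.org'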

JMESPath_ allows you to declaratively specify how to extract elements from
a JSON document. Resources to learn JMESPath_ include:

- `JMESPath Tutorial`_

- `JMESPath Specification`_
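
For example, a minimal sketch (the JSON is made up for illustration) of a
JMESPath projection with Parsel::

>>> from parsel import Selector
>>> sel = Selector(text='{"users": [{"name": "Ann"}, {"name": "Bob"}]}')  # illustrative JSON
>>> sel.jmespath("users[*].name").getall()
['Ann', 'Bob']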

.. _CSS selectors in the MDN: https://developer.mozilla.org/en-US/docs/Learn/CSS/Building_blocks/Selectors
.. _XPath cheatsheet: https://devhints.io/xpath
.. _XPath Tutorial in W3Schools: https://www.w3schools.com/xml/xpath_intro.asp
.. _XPath/CSS Equivalents in Wikibooks: https://en.wikibooks.org/wiki/XPath/CSS_Equivalents
.. _JMESPath Tutorial: https://jmespath.org/tutorial.html
.. _JMESPath Specification: https://jmespath.org/specification.html


Using selectors
@@ -95,12 +115,12 @@ So, by looking at the :ref:`HTML code <topics-selectors-htmlcode>` of that
page, let's construct an XPath for selecting the text inside the title tag::

>>> selector.xpath('//title/text()')
[<Selector xpath='//title/text()' data='Example website'>]
[<Selector query='//title/text()' data='Example website'>]

You can also ask the same thing using CSS instead::

>>> selector.css('title::text')
[<Selector xpath='descendant-or-self::title/text()' data='Example website'>]
[<Selector query='descendant-or-self::title/text()' data='Example website'>]

To actually extract the textual data, you must call the selector ``.get()``
or ``.getall()`` methods, as follows::
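
(A minimal sketch; the output below is assumed from the title selected above.)

>>> selector.xpath('//title/text()').get()  # output assumed
'Example website'
>>> selector.xpath('//title/text()').getall()
['Example website']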
@@ -597,10 +617,10 @@ returns ``True`` for nodes that have all of the specified HTML classes::
... """)
...
>>> sel.xpath('//p[has-class("foo")]')
[<Selector xpath='//p[has-class("foo")]' data='<p class="foo bar-baz">First</p>'>,
<Selector xpath='//p[has-class("foo")]' data='<p class="foo">Second</p>'>]
[<Selector query='//p[has-class("foo")]' data='<p class="foo bar-baz">First</p>'>,
<Selector query='//p[has-class("foo")]' data='<p class="foo">Second</p>'>]
>>> sel.xpath('//p[has-class("foo", "bar-baz")]')
[<Selector xpath='//p[has-class("foo", "bar-baz")]' data='<p class="foo bar-baz">First</p>'>]
[<Selector query='//p[has-class("foo", "bar-baz")]' data='<p class="foo bar-baz">First</p>'>]
>>> sel.xpath('//p[has-class("foo", "bar")]')
[]

@@ -1011,8 +1031,8 @@ directly by their names::

>>> sel.remove_namespaces()
>>> sel.xpath("//link")
[<Selector xpath='//link' data='<link rel="alternate" type="text/html...'>,
<Selector xpath='//link' data='<link rel="next" type="application/at...'>,
[<Selector query='//link' data='<link rel="alternate" type="text/html...'>,
<Selector query='//link' data='<link rel="next" type="application/at...'>,
...]

If you wonder why the namespace removal procedure isn't called always by default
@@ -1057,8 +1077,8 @@ And try to select the links again, now using an "atom:" prefix
for the "link" node test::

>>> sel.xpath("//atom:link", namespaces={"atom": "http://www.w3.org/2005/Atom"})
[<Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
<Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
[<Selector query='//atom:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
<Selector query='//atom:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
...]

You can pass several namespaces (here we're using shorter 1-letter prefixes)::
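
(A minimal sketch; the prefix names and the second namespace URI are assumed
for illustration.)

>>> namespaces = {"a": "http://www.w3.org/2005/Atom", "x": "http://www.w3.org/1999/xhtml"}  # prefixes assumed
>>> sel.xpath("//a:link", namespaces=namespaces)
[<Selector query='//a:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
 ...]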
