# XPath

_"XPath is a language for addressing parts of an XML document"_ -- http://www.w3.org/TR/xpath/

# Document model

In [1]:
import lxml.html

sample ="""
<html>
 <head>
  <title>Ceci est un titre</title>
  <meta http-equiv="content-type" content="text/html;charset=utf-8" />
 </head>
 <body>
  <div>
   <div>
    <p>Ceci est un paragraphe.</p>
    <p>Ceci est-il <a href="page2.html">un lien</a>?</p>
    <br>
    Apparemment.
   </div>
   <div>
    Rien &agrave; ajouter.
    Sauf cet <a href="page3.html">autre lien</a>.
    <!-- Et ce commentaire -->
   </div>
  </div>
 </body>
</html>
"""

root = lxml.html.fromstring(sample)

The model considers different node types:

- element nodes: `<p>...</p>`
- attribute nodes: `href="page2.html"`
- comment nodes: `<!-- ceci est un commentaire -->`
- text nodes ("Character data"): `Rien à ajouter`
- 3 others: processing instructions, root nodes and namespace nodes

Nodes are ordered within the tree: they follow the order in which they appear in the HTML source (a.k.a. "document order").

In [2]:
# lxml's .iter() outputs elements in document order
# http://lxml.de/tutorial.html#tree-iteration

for element in root.iter():
    print "--------------"
    print lxml.html.tostring(element)[:100]+"..."

--------------
<html>
 <head>
  <title>Ceci est un titre</title>
  <meta http-equiv="content-type" content="text/ht...
--------------
<head>
  <title>Ceci est un titre</title>
  <meta http-equiv="content-type" content="text/html;chars...
--------------
<title>Ceci est un titre</title>
  ...
--------------
<meta http-equiv="content-type" content="text/html;charset=utf-8">
 ...
--------------
<body>
  <div>
   <div>
    <p>Ceci est un paragraphe.</p>
    <p>Ceci est-il <a href="page2.html">u...
--------------
<div>
   <div>
    <p>Ceci est un paragraphe.</p>
    <p>Ceci est-il <a href="page2.html">un lien</a...
--------------
<div>
    <p>Ceci est un paragraphe.</p>
    <p>Ceci est-il <a href="page2.html">un lien</a>?</p>
  ...
--------------
<p>Ceci est un paragraphe.</p>
    ...
--------------
<p>Ceci est-il <a href="page2.html">un lien</a>?</p>
    ...
--------------
<a href="page2.html">un lien</a>?...
--------------
<br>
    Apparemment.
   ...
--------------
<div>
    Rien &#224; a

# Simple expressions to explore node types

In [3]:
# A / by itself selects the root node of the document containing the context node.
# the root node is NOT <html>...</html>
# <html> is a child of the root node: cf. http://www.w3.org/TR/xpath/#root-node
root.xpath('/')

[]

In [7]:
root.xpath('/ child :: html')

[<Element html at 0x7f40fc08d520>]

In [5]:
# no <p> child of the root node
root.xpath('/p')

[]

In [6]:
# all descendants of the root node: all elements
root.xpath('//*') # lxml outputs nodes in document order

[<Element html at 0x7f3188856a48>,
 <Element head at 0x7f3178652838>,
 <Element title at 0x7f3178652890>,
 <Element meta at 0x7f31786528e8>,
 <Element body at 0x7f3178652940>,
 <Element div at 0x7f3178652998>,
 <Element div at 0x7f31786529f0>,
 <Element p at 0x7f3178652a48>,
 <Element p at 0x7f3178652aa0>,
 <Element a at 0x7f3178652af8>,
 <Element br at 0x7f3178652b50>,
 <Element div at 0x7f3178652ba8>,
 <Element a at 0x7f3178652c00>]

In [7]:
from pprint import pprint

for e in root.xpath('//*'):
    print "---------------"
    print lxml.html.tostring(e)[:100]+"..."

---------------
<html>
 <head>
  <title>Ceci est un titre</title>
  <meta http-equiv="content-type" content="text/ht...
---------------
<head>
  <title>Ceci est un titre</title>
  <meta http-equiv="content-type" content="text/html;chars...
---------------
<title>Ceci est un titre</title>
  ...
---------------
<meta http-equiv="content-type" content="text/html;charset=utf-8">
 ...
---------------
<body>
  <div>
   <div>
    <p>Ceci est un paragraphe.</p>
    <p>Ceci est-il <a href="page2.html">u...
---------------
<div>
   <div>
    <p>Ceci est un paragraphe.</p>
    <p>Ceci est-il <a href="page2.html">un lien</a...
---------------
<div>
    <p>Ceci est un paragraphe.</p>
    <p>Ceci est-il <a href="page2.html">un lien</a>?</p>
  ...
---------------
<p>Ceci est un paragraphe.</p>
    ...
---------------
<p>Ceci est-il <a href="page2.html">un lien</a>?</p>
    ...
---------------
<a href="page2.html">un lien</a>?...
---------------
<br>
    Apparemment.
   ...
---------------
<div>
    R

In [8]:
# `*` does not select _all nodes_, but only element nodes, neither text nor attributes
root.xpath('//node()') == root.xpath('//*')

False

In [9]:
root.xpath('//node()')

[<Element html at 0x7f3188856a48>,
 '\n ',
 <Element head at 0x7f3178652838>,
 '\n  ',
 <Element title at 0x7f3178652890>,
 'Ceci est un titre',
 '\n  ',
 <Element meta at 0x7f31786528e8>,
 '\n ',
 '\n ',
 <Element body at 0x7f3178652940>,
 '\n  ',
 <Element div at 0x7f3178652998>,
 '\n   ',
 <Element div at 0x7f31786529f0>,
 '\n    ',
 <Element p at 0x7f3178652a48>,
 'Ceci est un paragraphe.',
 '\n    ',
 <Element p at 0x7f3178652aa0>,
 'Ceci est-il ',
 <Element a at 0x7f3178652af8>,
 'un lien',
 '?',
 '\n    ',
 <Element br at 0x7f3178652b50>,
 '\n    Apparemment.\n   ',
 '\n   ',
 <Element div at 0x7f3178652ba8>,
 u'\n    Rien \xe0 ajouter.\n    Sauf cet ',
 <Element a at 0x7f3178652c00>,
 'autre lien',
 '.\n    ',
 <!-- Et ce commentaire -->,
 '\n   ',
 '\n  ',
 '\n ',
 '\n']

In [10]:
# select text nodes
root.xpath('//text()')

['\n ',
 '\n  ',
 'Ceci est un titre',
 '\n  ',
 '\n ',
 '\n ',
 '\n  ',
 '\n   ',
 '\n    ',
 'Ceci est un paragraphe.',
 '\n    ',
 'Ceci est-il ',
 'un lien',
 '?',
 '\n    ',
 '\n    Apparemment.\n   ',
 '\n   ',
 u'\n    Rien \xe0 ajouter.\n    Sauf cet ',
 'autre lien',
 '.\n    ',
 '\n   ',
 '\n  ',
 '\n ',
 '\n']

In [11]:
# comment nodes
root.xpath('//comment()')

[<!-- Et ce commentaire -->]

In [12]:
# you cannot do that...
root.xpath('//attribute()')

XPathEvalError: Invalid expression

In [13]:
# but "@*" and you get attribute values
root.xpath('//@*')

['content-type', 'text/html;charset=utf-8', 'page2.html', 'page3.html']

In [14]:
root.xpath('//attribute::*')

['content-type', 'text/html;charset=utf-8', 'page2.html', 'page3.html']

## What about _not_ selecting all nodes of a type? Use predicates

In [15]:
# paragraphs that contain a link
paras = root.xpath('//p[a]')
#                       ^
#                       |
#                  this is a predicate; here: "the context node has at least one `a` child element"
paras, map(lxml.html.tostring, paras)

([<Element p at 0x7f3178652aa0>],
 ['<p>Ceci est-il <a href="page2.html">un lien</a>?</p>\n    '])

In [16]:
# divs that contain a paragraph that itself contains a link
divs = root.xpath('//div[p[a]]')

divs, map(lxml.html.tostring, divs)

([<Element div at 0x7f31786529f0>],
 ['<div>\n    <p>Ceci est un paragraphe.</p>\n    <p>Ceci est-il <a href="page2.html">un lien</a>?</p>\n    <br>\n    Apparemment.\n   </div>\n   '])

In [17]:
# by the way, whitespace is not important (except for `//`...)
(
    root.xpath('// div [    p [a ]     ]'),
    
    root.xpath('''
        //div[
                    p[
                        a
                    ]     
                ]
                '''),
    
    root.xpath('/ / div[p[a]]')
)

([<Element div at 0x7f31786529f0>], [<Element div at 0x7f31786529f0>], [])

In [18]:
# selecting using attributes or attribute values
root.xpath('//a[@href="page2.html"]')

[<Element a at 0x7f3178652af8>]

In [19]:
# expressions as predicates, nested predicates
(
    root.xpath('//p[   a/@href="page2.html"    ]'),
    root.xpath('//p[   a[@href="page2.html"]   ]'),
)

([<Element p at 0x7f3178652aa0>], [<Element p at 0x7f3178652aa0>])

In [20]:
# selecting using text content
root.xpath('//p[contains(., "paragraphe")]')

[<Element p at 0x7f3178652a48>]

In [21]:
# selecting using text content with exact match
# (no match here: the paragraph's string-value is "Ceci est un paragraphe." -- with a final "."
#  that this literal lacks; an exact match compares the whole string-value)
root.xpath('//p[.="Ceci est un paragraphe"]')

[]

In [22]:
# Exact matches compare the whole string-value, so whitespace in text nodes is important.
# XPath has various string functions to help, e.g. normalize-space().
# NOTE: this expression still returns [] because the literal lacks the final "."
# of the paragraph text ("Ceci est un paragraphe.").
root.xpath('//p[ normalize-space(.) = "Ceci est un paragraphe" ]')

[]

In [23]:
# selecting by position (relative to its parent node)
root.xpath('//p[1]'), root.xpath('//p[last()]'), 

([<Element p at 0x7f3178652a48>], [<Element p at 0x7f3178652aa0>])

In [24]:
# selecting by position, careful when grouping
root.xpath('//div[2]'), root.xpath('(//div)[2]'), 
#                 ^                         ^
#                 |                         |
#   second div child (of the context node)
#                                           |
#                           second div in whole document

([<Element div at 0x7f3178652ba8>], [<Element div at 0x7f31786529f0>])

# LocationPaths & Axes

What we've seen so far are all **LocationPaths** expressions.

LocationPaths are the most common XPath expressions.

- `{/}? step {/step}*`
- each step is: _axis_ '::' _node-test()_ _[predicates]*_

## Absolute vs. relative LocationPaths

_"A relative location path consists of a sequence of one or more location steps separated by / "_ -- http://www.w3.org/TR/xpath/#location-paths

_"An absolute location path consists of / optionally followed by a relative location path. (...) the location path selects the set of nodes that would be selected by the relative location path **relative to the root node** of the document containing the context node."_

In [25]:
for div in root.xpath('//div'):
    print div.xpath('//text()') # this will select all text nodes from the root node down

['\n ', '\n  ', 'Ceci est un titre', '\n  ', '\n ', '\n ', '\n  ', '\n   ', '\n    ', 'Ceci est un paragraphe.', '\n    ', 'Ceci est-il ', 'un lien', '?', '\n    ', '\n    Apparemment.\n   ', '\n   ', u'\n    Rien \xe0 ajouter.\n    Sauf cet ', 'autre lien', '.\n    ', '\n   ', '\n  ', '\n ', '\n']
['\n ', '\n  ', 'Ceci est un titre', '\n  ', '\n ', '\n ', '\n  ', '\n   ', '\n    ', 'Ceci est un paragraphe.', '\n    ', 'Ceci est-il ', 'un lien', '?', '\n    ', '\n    Apparemment.\n   ', '\n   ', u'\n    Rien \xe0 ajouter.\n    Sauf cet ', 'autre lien', '.\n    ', '\n   ', '\n  ', '\n ', '\n']
['\n ', '\n  ', 'Ceci est un titre', '\n  ', '\n ', '\n ', '\n  ', '\n   ', '\n    ', 'Ceci est un paragraphe.', '\n    ', 'Ceci est-il ', 'un lien', '?', '\n    ', '\n    Apparemment.\n   ', '\n   ', u'\n    Rien \xe0 ajouter.\n    Sauf cet ', 'autre lien', '.\n    ', '\n   ', '\n  ', '\n ', '\n']


In [26]:
# advice: use only relative location paths to avoid confusion
for div in root.xpath('.//div'):
    print div.xpath('.//text()') # only selects text nodes relative to each div

['\n   ', '\n    ', 'Ceci est un paragraphe.', '\n    ', 'Ceci est-il ', 'un lien', '?', '\n    ', '\n    Apparemment.\n   ', '\n   ', u'\n    Rien \xe0 ajouter.\n    Sauf cet ', 'autre lien', '.\n    ', '\n   ', '\n  ']
['\n    ', 'Ceci est un paragraphe.', '\n    ', 'Ceci est-il ', 'un lien', '?', '\n    ', '\n    Apparemment.\n   ']
[u'\n    Rien \xe0 ajouter.\n    Sauf cet ', 'autre lien', '.\n    ', '\n   ']


## Axes

Inside the document, for each **element** node, XPath defines several axes, to navigate in the document from the context node (where you are in the tree)

- self (where you are right now)
- ancestor (and ancestor-or-self)
- descendant (and descendant-or-self)
- preceding-sibling, following-sibling


Abbreviated syntax is very helpful and much easier to read:

- **`//`** is short for **`/descendant-or-self::node()/`**
- **.** is short for **`self::node()`**. 

E.g. **`.//para`** is short for **`self::node()/descendant-or-self::node()/child::para`**

In [27]:
(
    root.xpath('//*') == root.xpath('.//*'),
    root.xpath('.//*') == root.xpath('self::node()/descendant-or-self::node()/child::*'),
    
    # which is equivalent to descendant elements
    root.xpath('.//*') == root.xpath('descendant::*'),
)

(False, True, True)

In [28]:
# root node
root.xpath('/self::node()')

[]

In [29]:
# child of root
root.xpath('/child::node()')

[<Element html at 0x7f3188856a48>]

In [30]:
# not so strange after all: lxml's `root` variable is the <html> element, not the XPath root node,
# so a relative path evaluated on it uses <html> as context node and the child axis yields <html>'s children
root.xpath('child::node()')

['\n ',
 <Element head at 0x7f3178652838>,
 '\n ',
 <Element body at 0x7f3178652940>,
 '\n']

A node-set is selected from a LocationPath step by step, from left to right:

1. _The initial sequence of steps selects a set of nodes relative to a context node._
2. _Each node in that set is used as a context node for the following step._
3. _The sets of nodes identified by that step are unioned together. The set of nodes identified by the composition of the steps is this union._


```
step1 / step2 / step3 / step4:

1. context node --> step1 --> node-set-1
2. for each node in node-set-1, node --> step2 --> node-set appended to node-set-2
3. node-set-2   --> step3 --> node-set-3
4. node-set-3   --> step4 --> node-set-4 --> result of expression selection
```

In [31]:
abbrpath = "/html/body/div//p"

steps = ['/html', 'body', 'div', 'descendant-or-self::node()', 'p']
relpath = "/".join(steps)
print relpath
root.xpath(relpath) == root.xpath(abbrpath)

/html/body/div/descendant-or-self::node()/p


True

In [32]:
inodeset = [root]
for cnt, step in enumerate(steps, start=1):
    print "%2d: '%s'" % (cnt, step),
    nodeset = []
    for node in inodeset:
        try:
            selected = node.xpath(step)
            nodeset.extend(selected)
        except:
            pass
    print len(nodeset)
    inodeset = nodeset
nodeset == root.xpath(abbrpath)

 1: '/html' 1
 2: 'body' 1
 3: 'div' 1
 4: 'descendant-or-self::node()' 24
 5: 'p' 2


True

# XPath expressions return types

So far we have seen that XPath expressions can return node-sets (elements, text nodes, comments and attributes, which lxml returns as their values).

But XPath expressions can also return:
 
- a floating point number
- a boolean
- a string

In [33]:
# return node-sets (using LocationPaths)
(
    root.xpath('//p'),
    root.xpath('//a'),
    
    # The | operator computes the union of its operands, which must be node-sets.
    # http://www.w3.org/TR/xpath/#node-sets
    root.xpath('//p | //div'),
)


([<Element p at 0x7f3178652a48>, <Element p at 0x7f3178652aa0>],
 [<Element a at 0x7f3178652af8>, <Element a at 0x7f3178652c00>],
 [<Element div at 0x7f3178652998>,
  <Element div at 0x7f31786529f0>,
  <Element p at 0x7f3178652a48>,
  <Element p at 0x7f3178652aa0>,
  <Element div at 0x7f3178652ba8>])

In [34]:
# return floating point numbers
(
    root.xpath('42'), # I know, not very useful
    root.xpath('count(//p)'),
    root.xpath('count(//div)'),
    root.xpath('count(//div | //p)')
)

(42.0, 2.0, 3.0, 5.0)

In [35]:
# return booleans
(
    root.xpath('false()'),
    root.xpath('boolean(//p)'),
    root.xpath('boolean(//img)'), # no img tags in document
    root.xpath('count(//div) = count(//p)'),
    root.xpath('count(//meta/@*) = count(//a)'),
)

(False, True, False, False, True)

In [36]:
# string functions
# http://www.w3.org/TR/xpath/#section-String-Functions
(    
    root.xpath('"a string"'),
    root.xpath('string(//div)'),
    root.xpath('normalize-space(//div)'),
    root.xpath('string(//comment())'),
)

('a string',
 u'\n   \n    Ceci est un paragraphe.\n    Ceci est-il un lien?\n    \n    Apparemment.\n   \n   \n    Rien \xe0 ajouter.\n    Sauf cet autre lien.\n    \n   \n  ',
 u'Ceci est un paragraphe. Ceci est-il un lien? Apparemment. Rien \xe0 ajouter. Sauf cet autre lien.',
 ' Et ce commentaire ')

Have you noticed something?
You get only 1 string back with `string()`, even though `//div` selects several elements.

This is a bit hidden in the XPath 1.0 specification:

```
Function: string string(object?)

The string function converts an object to a string as follows:

A node-set is converted to a string by returning the string-value of the node in the node-set that is first in document order. If the node-set is empty, an empty string is returned.
```

In [37]:
(    
    root.xpath('string(//div)') == root.xpath('string((//div)[1])'),
    root.xpath('string(//div)') == root.xpath('string((//div)[2])'),
)

(True, False)

# Web scraping use cases

## Extracting attribute (values)

In [8]:
htmlsource= """<div class="second">
      Rien &agrave; ajouter.
      Sauf cet <a href="page3.html">autre lien</a>. 
      <!-- Et ce commentaire -->
    </div>"""

In [9]:
import lxml.html
root = lxml.html.fromstring(htmlsource)

In [12]:
root.xpath('//div[@class="second"]/text()')

[u'\n      Rien \xe0 ajouter.\n      Sauf cet ', '. \n      ', '\n    ']

In [13]:
root.xpath('//div[@class="second"]//text()')

[u'\n      Rien \xe0 ajouter.\n      Sauf cet ',
 'autre lien',
 '. \n      ',
 '\n    ']

In [14]:
root.xpath('string(//div[@class="second"])')

u'\n      Rien \xe0 ajouter.\n      Sauf cet autre lien. \n      \n    '

## Loop on elements

In [64]:
# source HTML from http://www.sarenza.com/new-balance-u420-s807876-p0000022369
# (read from a local copy; the `with` block closes the file once it is read)
with open('New Balance U420 (Noir) - Baskets chez Sarenza (238259).html') as f:
    htmlsource = f.read()

In [65]:
import parsel

# here I know it's UTF8. Scrapy would determine this by itself
selector = parsel.Selector(text=htmlsource.decode('utf8'))

In [66]:
from pprint import pprint

print selector.css('div.detail-product').extract_first()

<div class="detail-product">
                <div class="row-panel ">
                    <h2 class="title-panel large">
                        <span>On en pense quoi ?</span>
                    </h2>
                    <div class="panel">
                        <div class="mask-text">
                            <p itemprop="description">
                                Avec ses baskets U420, New Balance réédite la 420, sa fameuse basket devenue légendaire pour son look, sa légèreté et son confort! Choisissez le modèle qui vous sied et portez la légende à vos pieds avec les New Balance U420!<br>La semelle est antibactérienne. <br>Baskets New Balance U420 pour homme, en cuir et textile, à lacets (6 œillets), montées sur une semelle de gomme anti-traces. La semelle intérieure est amovible.<br>
                            </p>
                        </div>
                    </div>
                </div>
                <ul>
                    
                     

In [67]:
for list_item in selector.css('div.detail-product > ul > li'):
    print list_item.xpath('./strong/text()').extract_first()
    print list_item.xpath('./strong/following-sibling::text()').extract_first()

Type

                                Baskets
                            
Boutique

                                Petites Pointures
                            
Saison

                                Automne/Hiver 2015
                            
Pays de fabrication

                                Viêt Nam
                            
Pointure de réf.

                                40
                            
Ref

                                22369
                            
Dessus / Tige

                                Cuir/Textile
                            
Doublure

                                Textile
                            
Semelle amovible

                                Oui
                            
Semelle intérieure

                                Textile
                            
Semelle extérieure

                                Gomme
                            
Construction

                                Soudé


In [68]:
for list_item in selector.css('div.detail-product > ul > li'):
    print "---------"
    print list_item.xpath('string(./strong)').extract_first()
    print list_item.xpath('normalize-space(./strong/following-sibling::text())').extract_first()

---------
Type
Baskets
---------
Boutique
Petites Pointures
---------
Saison
Automne/Hiver 2015
---------
Pays de fabrication
Viêt Nam
---------
Pointure de réf.
40
---------
Ref
22369
---------
Dessus / Tige
Cuir/Textile
---------
Doublure
Textile
---------
Semelle amovible
Oui
---------
Semelle intérieure
Textile
---------
Semelle extérieure
Gomme
---------
Construction
Soudé


In [70]:
# all together, for the slides
from pprint import pprint
import parsel

selector = parsel.Selector(text=u"""<div class='detail-product'>
    <ul>
        <li><strong>Type</strong> Baskets</li>
        <li><strong>Boutique</strong> Petites Pointures</li>
        <li><strong>Saison</strong> Automne/Hiver 2015</li>
        <li><strong>Pays de fabrication</strong> Viêt Nam</li>
        <li><strong>Pointure de réf.</strong> 40</li>
        <li><strong>Ref</strong> 22369</li>
        <li><strong>Dessus / Tige</strong> Cuir/Textile</li>
        <li><strong>Doublure</strong> Textile</li>
        <li><strong>Semelle amovible</strong> Oui</li>
        <li><strong>Semelle int&eacute;rieure</strong> Textile</li>
        <li><strong>Semelle ext&eacute;rieure</strong> Gomme</li>
        <li><strong>Construction</strong> Soudé</li>
    </ul>
</div>""")

pprint(
    dict((li.xpath('string(./strong)').extract_first(),
          li.xpath('normalize-space(./strong/following-sibling::text())').extract_first())
         for li in selector.css('div.detail-product > ul > li'))
)

{u'Boutique': u'Petites Pointures',
 u'Construction': u'Soud\xe9',
 u'Dessus / Tige': u'Cuir/Textile',
 u'Doublure': u'Textile',
 u'Pays de fabrication': u'Vi\xeat Nam',
 u'Pointure de r\xe9f.': u'40',
 u'Ref': u'22369',
 u'Saison': u'Automne/Hiver 2015',
 u'Semelle amovible': u'Oui',
 u'Semelle ext\xe9rieure': u'Gomme',
 u'Semelle int\xe9rieure': u'Textile',
 u'Type': u'Baskets'}


In [62]:
import parslepy

rules = {
    "product(div.detail-product)": {
        "details(./ul/li)": [
            {
                "k": "parslepy:strip(strong)",
                "v": "parslepy:strip(./strong/following-sibling::text())"
            }
        ]
    }
}
parselet = parslepy.Parselet(rules)
pprint(parselet.parse_fromstring(htmlsource))

{'product': {'details': [{'k': 'Type Baskets', 'v': 'Baskets'},
                         {'k': 'Boutique Petites Pointures',
                          'v': 'Petites Pointures'},
                         {'k': 'Saison Automne/Hiver 2015',
                          'v': 'Automne/Hiver 2015'},
                         {'k': u'Pays de fabrication Vi\xeat Nam',
                          'v': u'Vi\xeat Nam'},
                         {'k': u'Pointure de r\xe9f. 40', 'v': '40'},
                         {'k': 'Ref 22369', 'v': '22369'},
                         {'k': 'Dessus / Tige Cuir/Textile',
                          'v': 'Cuir/Textile'},
                         {'k': 'Doublure Textile', 'v': 'Textile'},
                         {'k': 'Semelle amovible Oui', 'v': 'Oui'},
                         {'k': u'Semelle int\xe9rieure Textile',
                          'v': 'Textile'},
                         {'k': u'Semelle ext\xe9rieure Gomme', 'v': 'Gomme'},
                         {'k': u'

## Extracting data from JavaScript code

In [72]:
import parsel

# source HTML from http://eur.shoreprojects.com/collections/watches/products/cowes
# (read from a local copy; the `with` block closes the file once it is read)
with open('Cowes _ Shore Projects.html') as f:
    htmlsource = f.read()

# here I know it's UTF8. Scrapy would determine this by itself
selector = parsel.Selector(text=htmlsource.decode('utf8'))

In [73]:
selector.xpath('.//script/text()').extract()

[u' Shopify.money_format = "&euro;{{amount_no_decimals}} EUR"; ',
 u'\n//<![CDATA[\n      var Shopify = Shopify || {};\n      Shopify.shop = "otherwayeur.myshopify.com";\n      Shopify.theme = {"name":"Shore Projects","id":8992956,"theme_store_id":null,"role":"main"};\n\n//]]>\n',
 u'\n//<![CDATA[\n    (function() {\n      function asyncLoad() {\n        var urls = ["http:\\/\\/beacon.riskified.com?shop=otherwayeur.myshopify.com"];\n        for (var i = 0; i < urls.length; i++) {\n          var s = document.createElement(\'script\');\n          s.type = \'text/javascript\';\n          s.async = true;\n          s.src = urls[i];\n          var x = document.getElementsByTagName(\'script\')[0];\n          x.parentNode.insertBefore(s, x);\n        }\n      }\n      window.attachEvent ? window.attachEvent(\'onload\', asyncLoad) : window.addEventListener(\'load\', asyncLoad, false);\n    })();\n\n//]]>\n',
 u'\n//<![CDATA[\nvar __st={"a":7162745,"offset":3600,"reqid":"cd39460a-41eb-421f-9297

In [82]:
# I cheated a bit, I know where the interesting bit of Javascript is...
jssnippet = selector.xpath('.//comment()[.="END PRODUCT"]/following-sibling::script[1]/text()').extract_first()
print jssnippet


jQuery(function($) {
  var selectCallback = function(variant, selector) {
    if (variant) {
      if (variant.available) {
        // Selected a valid variant that is available.
        $('#add').removeClass('disabled').removeAttr('disabled').val('Add to Cart').fadeTo(200,1);
        if($('.btn.add-to-cart.xmas').length)
        {
        	$('.btn.add-to-cart.xmas').val($('.btn.add-to-cart.xmas').data("label").replace('<span class=money>', '').replace('</span>',''));
        	$('.btn.add-to-cart.xmas').addClass('disabled').attr('disabled', 'disabled');
        }
      } else {
        // Variant is sold out.
        $('#add').val('Sold Out').addClass('disabled').attr('disabled', 'disabled').fadeTo(200,0.5);  
        if($('.btn.add-to-cart.xmas').length)
        {
        	$('.btn.add-to-cart.xmas').val($('.btn.add-to-cart.xmas').data("label").replace('<span class=money>', '').replace('</span>',''));
        	$('.btn.add-to-cart.xmas').addClass('disabled').attr('disabled', 'disabled'

In [80]:
# we're interested in "new Shopify.OptionSelectors..."

In [84]:
import js2xml

jstree = js2xml.parse(jssnippet)

In [96]:
# 2nd argument of the "new" construct
#objs = jstree.xpath('//*/arguments[string="product-select"]/*[2]')
objs = jstree.xpath('//arguments/string[.="product-select"]/following-sibling::*[1]')

In [97]:
js2xml.jsonlike.make_dict(objs[0])

{'onVariantSelected': 'selectCallback',
 'product': {'available': True,
  'compare_at_price': None,
  'compare_at_price_max': 0,
  'compare_at_price_min': 0,
  'compare_at_price_varies': False,
  'content': '<div class="short">Brushed Silver<br> Silver Face</div>\n<div class="long">Inspired by the look of venerable explorer watches, our watches are designed to mirror vintage timepieces with modern design features and are fully equipped to both stand out and weather the demands of a modern lifestyle.<br><br><b>Outer Casing:</b> Silver Plated 316L Stainless Steel<br> <b>Movement:</b> Miyota Quartz Movement<br><b>Glass:</b> Scratch Resistant Sapphire Crystal<br> <b>Water Proof:</b> 100 meters<br> <b>Dial Diameter &amp; Thickness:</b> 39mm &amp; 7.9mm<br><span class="straplabelcontainer"><b>STRAP CHOICE:</b> <span class="straplabel"></span></span>\n</div>',
  'created_at': '2015-02-21T11:40:01+00:00',
  'description': '<div class="short">Brushed Silver<br> Silver Face</div>\n<div class="lo

### Google maps examples

In [119]:
import js2xml
import parsel

htmlsource = r"""<html>
  <!-- from https://developers.google.com/maps/documentation/javascript/examples/marker-simple -->
  <body>
    <div id="map"></div>
    <script>
function initMap() {
  var myLatLng = {lat: -25.363, lng: 131.044};
  var map = new google.maps.Map(document.getElementById('map'), {
    zoom: 4,
    center: myLatLng
  });
  var marker = new google.maps.Marker({
    position: myLatLng,
    map: map,
    title: 'Hello World!'
  });
}
    </script>
  </body>
</html>"""

selector = parsel.Selector(text=htmlsource.decode('utf8'))
jssnippet = selector.css('div#map + script::text').extract_first()
jstree = js2xml.parse(jssnippet)

js2xml.jsonlike.getall(jstree)

[{'lat': -25.363, 'lng': 131.044},
 {'center': 'myLatLng', 'zoom': 4},
 {'map': 'map', 'position': 'myLatLng', 'title': 'Hello World!'}]

In [120]:
js2xml.jsonlike.make_dict(jstree.xpath('//object[1]')[0])

{'lat': -25.363, 'lng': 131.044}

## EXSLT set operations

In [2]:
"""
Microdata parser

Piece of code extracted from:
* http://blog.scrapinghub.com/2014/06/18/extracting-schema-org-microdata-using-scrapy-selectors-and-xpath/

Ported to lxml
follows http://www.w3.org/TR/microdata/#json

"""

import collections
import urlparse

import lxml.etree
import lxml.html


class MicrodataExtractor(object):
    """Extract HTML microdata items from a document as JSON-like dicts.

    The output layout follows http://www.w3.org/TR/microdata/#json.

    With ``nested=True`` (the default), an item used as a property value is
    embedded in its parent item and reported only once. With ``nested=False``
    every item is reported at top level with a document-order ``iid``, and an
    item used as a property value is replaced by ``{"iid_ref": <iid>}``.
    """

    # every element starting an item, the context node included
    _xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
    # properties of the context item itself: all descendant @itemprop elements
    # minus those located inside a nested item (EXSLT set difference)
    _xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
                                                  .//*[@itemscope]//*[@itemprop])""",
                                namespaces = {"set": "http://exslt.org/sets"})
    # 1-based rank of an item among all @itemscope elements, in document order
    _xp_item_docid = lxml.etree.XPath("""count(preceding::*[@itemscope])
                                        + count(ancestor::*[@itemscope])
                                        + 1""")

    def __init__(self, nested=True):
        """Create an extractor; `nested` selects the output layout (see class doc)."""
        self.nested = nested
        # Per-document state. It is initialised here so that extract_items()
        # and extract_item() can also be called without going through extract().
        self.url = None
        self.items_seen = set()

    def get_docid(self, node):
        """Return the 1-based document-order rank of `node` among item elements."""
        return int(self._xp_item_docid(node))

    def extract(self, htmlstring, url='http://www.example.com/', encoding="UTF-8"):
        """Parse `htmlstring` and return ``{"items": [...]}``.

        `url` is the base URL used to resolve URL-valued properties and
        `encoding` is handed to the lxml HTML parser.
        """
        self.url = url
        parser = lxml.html.HTMLParser(encoding=encoding)
        lxmldoc = lxml.html.fromstring(htmlstring, parser=parser)
        return self.extract_items(lxmldoc)

    def extract_items(self, document):
        """Return ``{"items": [...]}`` for an already parsed lxml `document`."""
        # start from a clean slate so that ids seen in a previously processed
        # document cannot hide items of this one
        self.items_seen = set()
        candidates = [self.extract_item(node) for node in self._xp_item(document)]
        # extract_item() returns None for items already embedded in a parent item
        return {"items": [item for item in candidates if item]}

    def extract_item(self, node):
        """Build the dict for the item rooted at `node`.

        Returns None in nested mode when the item has already been extracted.
        """
        docid = self.get_docid(node)

        if self.nested:
            if docid in self.items_seen:
                return None
            self.items_seen.add(docid)

        item = {}
        if not self.nested:
            item["iid"] = docid
        types = node.get('itemtype', '').split()
        if types:
            item["type"] = types

            # the global identifier is only looked at for typed items
            global_id = node.get('itemid')
            if global_id:
                item["id"] = global_id.strip()

        properties = collections.defaultdict(list)
        for name, value in self.extract_properties(node):
            properties[name].append(value)

        # a property with a single value is stored as a scalar, not a 1-item list
        item["properties"] = dict(
            (name, values[0] if len(values) == 1 else values)
            for name, values in properties.items())

        # not in the specs, but extract_textContent(node) could be stored
        # under item["textContent"] if that ever proves handy

        return item

    def extract_properties(self, node):
        """Yield ``(name, value)`` pairs for the properties owned by item `node`."""
        for prop in self._xp_prop(node):
            for name, value in self.extract_property(prop):
                yield name, value

    def extract_property(self, node):
        """Return ``(name, value)`` pairs for one @itemprop element.

        @itemprop may hold several space-separated names; they all share the
        same value.
        """
        names = node.get("itemprop").split()
        value = self.extract_property_value(node)
        return [(name, value) for name in names]

    def extract_property_value(self, node):
        """Return the value of a property element, depending on its tag.

        See http://www.w3.org/TR/microdata/#values
        """
        if node.get("itemscope") is not None:
            # the value is itself an item: embed it, or reference it by id
            if self.nested:
                return self.extract_item(node)
            else:
                return {"iid_ref": self.get_docid(node)}

        elif node.tag == "meta":
            return node.get("content", "")

        elif node.tag in ("audio", "embed", "iframe", "img", "source", "track", "video"):
            return urlparse.urljoin(self.url, node.get("src", ""))

        elif node.tag in ("a", "area", "link"):
            return urlparse.urljoin(self.url, node.get("href", ""))

        elif node.tag in ("object",):
            return node.get("data", "")

        elif node.tag in ("data", "meter"):
            return node.get("value", "")

        elif node.tag in ("time",):
            return node.get("datetime", "")

        # not in W3C specs but used in schema.org examples
        elif node.get("content"):
            return node.get("content")

        else:
            return self.extract_textContent(node)

    def extract_textContent(self, node):
        """Return the stripped text content of `node`, without its tail text."""
        return lxml.html.tostring(node, method="text", encoding=unicode,
                    with_tail=False).strip()

In [3]:
h = """<div itemscope itemtype ="http://schema.org/Movie">
  <h1 itemprop="name">Avatar</h1>
  <div itemprop="director" itemscope itemtype="http://schema.org/Person">
  Director: <span itemprop="name">James Cameron</span> (born <span itemprop="birthDate">August 16, 1954</span>)
  </div>
  <span itemprop="genre">Science fiction</span>
  <a href="../movies/avatar-theatrical-trailer.html" itemprop="trailer">Trailer</a>
</div>"""

In [7]:
import pprint

mde = MicrodataExtractor()
pprint.pprint(mde.extract(h))

{'items': [{'properties': {'director': {'properties': {'birthDate': u'August 16, 1954',
                                                       'name': u'James Cameron'},
                                        'type': ['http://schema.org/Person']},
                           'genre': u'Science fiction',
                           'name': u'Avatar',
                           'trailer': 'http://www.example.com/../movies/avatar-theatrical-trailer.html'},
            'type': ['http://schema.org/Movie']}]}
