In [1]:
from selectolax.parser import HTMLParser

In [3]:
# Sample document parsed by the examples that follow (until `html` is reassigned).
html = """
<body>
    <span id="vspan"></span>
    <h1>Welcome to selectolax tutorial</h1>
    <div id="text">
        <p class='p3' style='display:none;'>Excepteur <i>sint</i> occaecat cupidatat non proident</p>
        <p class='p3' vid>Lorem ipsum</p>
    </div>
    <div>
        <p id='stext'>Lorem ipsum dolor sit amet, ea quo modus meliore platonem.</p>
    </div>
</body>
"""

#### Select all p tags with class p3

In [4]:
selector = "p.p3"
divider = '---------------------'

# Report the markup, attributes, text, tag names and last child of every match.
parsed = HTMLParser(html)
for match in parsed.css(selector):
    print(divider)
    print('Node: %s' % match.html)
    print('attributes: %s' % match.attributes)
    print('node text: %s' % match.text(deep=True, separator='', strip=False))
    print('tag: %s' % match.tag)
    print('parent tag: %s' % match.parent.tag)
    tail = match.last_child
    if tail:
        print('last child inside current node: %s' % tail.html)
    print(divider + '\n')

---------------------
Node: <p class="p3" style="display:none;">Excepteur <i>sint</i> occaecat cupidatat non proident</p>
attributes: {'class': 'p3', 'style': 'display:none;'}
node text: Excepteur sint occaecat cupidatat non proident
tag: p
parent tag: div
last child inside current node:  occaecat cupidatat non proident
---------------------

---------------------
Node: <p class="p3" vid="">Lorem ipsum</p>
attributes: {'class': 'p3', 'vid': None}
node text: Lorem ipsum
tag: p
parent tag: div
last child inside current node: Lorem ipsum
---------------------



#### Select first match

In [5]:
# css_first returns only the first node matching the selector.
heading = HTMLParser(html).css_first('h1')
print("H1: %s" % heading.text())

H1: Welcome to selectolax tutorial


#### Default return value if there are no matches

In [6]:
# The document has no <title>, so the supplied default is returned instead.
title = HTMLParser(html).css_first('title', default='not-found')
print("Title: %s" % title)

Title: not-found


#### Strictly one match

If there are multiple matches, you will get an error.

In [7]:
# With strict=True, css_first raises ValueError when the selector matches more
# than one node. The error is caught and printed so the demonstration is still
# visible while the notebook keeps running under "Restart & Run All".
try:
    HTMLParser(html).css_first("p.p3", default='not-found', strict=True)
except ValueError as exc:
    print("ValueError: %s" % exc)

ValueError: Expected 1 match, but found 2 matches

#### Print parent of p#stext

In [8]:
# Locate the paragraph by id, then show the markup of the element containing it.
stext = HTMLParser(html).css_first('p#stext')
print(stext.parent.html)

<div>
        <p id="stext">Lorem ipsum dolor sit amet, ea quo modus meliore platonem.</p>
    </div>


#### Nested selectors

In [9]:
# Selectors can be applied to a node returned by a previous selector.
text_div = HTMLParser(html).css_first('div#text')
text_div.css_first('p:nth-child(2)').html

'<p class="p3" vid="">Lorem ipsum</p>'

#### Iterate over all nodes on the current level

In [10]:
# For every matching container, print the tag and markup of each node that iter() yields.
text_divs = HTMLParser(html).css("div#text")
for container in text_divs:
    for child in container.iter():
        print(child.tag, child.html)


p <p class="p3" style="display:none;">Excepteur <i>sint</i> occaecat cupidatat non proident</p>
p <p class="p3" vid="">Lorem ipsum</p>


#### Tag removal 

In [11]:
html_parser = HTMLParser(html)

# Drop every <p> element from the tree, then show what is left of the body.
for paragraph in html_parser.tags('p'):
    paragraph.decompose()

print(html_parser.body.html)

<body>
    <span id="vspan"></span>
    <h1>Welcome to selectolax tutorial</h1>
    <div id="text">
        
        
    </div>
    <div>
        
    </div>

</body>


#### Tag unwrapping

In [12]:
# Print the unmodified source markup.
print(html)


<body>
    <span id="vspan"></span>
    <h1>Welcome to selectolax tutorial</h1>
    <div id="text">
        <p class='p3' style='display:none;'>Excepteur <i>sint</i> occaecat cupidatat non proident</p>
        <p class='p3' vid>Lorem ipsum</p>
    </div>
    <div>
        <p id='stext'>Lorem ipsum dolor sit amet, ea quo modus meliore platonem.</p>
    </div>
</body>



In [13]:
# Tags listed here are unwrapped; the recorded output shows their text staying in place.
tags_to_unwrap = ['p', 'i']

html_parser = HTMLParser(html)
html_parser.unwrap_tags(tags_to_unwrap)
print(html_parser.body.html)

<body>
    <span id="vspan"></span>
    <h1>Welcome to selectolax tutorial</h1>
    <div id="text">
        Excepteur sint occaecat cupidatat non proident
        Lorem ipsum
    </div>
    <div>
        Lorem ipsum dolor sit amet, ea quo modus meliore platonem.
    </div>

</body>


#### Attribute manipulation

In [14]:
html_parser = HTMLParser(html)
node = html_parser.css_first('div#text')
attrs = node.attrs

# Add a new attribute and overwrite an existing one.
attrs['data'] = 'secrect data'
attrs['id'] = 'new_id'
print(node.attributes)

# Remove an attribute; the change shows up in both the mapping and the markup.
del attrs['id']
print(node.attributes)
print(node.html)

{'data': 'secrect data', 'id': 'new_id'}
{'data': 'secrect data'}
<div data="secrect data">
        <p class="p3" style="display:none;">Excepteur <i>sint</i> occaecat cupidatat non proident</p>
        <p class="p3" vid="">Lorem ipsum</p>
    </div>


#### Tree traversal

In [15]:
html_parser = HTMLParser(html)
for element in html_parser.root.traverse():
    if element.tag != '-text':
        print(element.tag)
        continue

    # NOTE(review): the recorded output lists only tag names, so this text
    # branch does not appear to run with traverse()'s default arguments;
    # confirm whether text nodes should be requested explicitly.
    stripped = element.text(deep=True).strip()
    if stripped:
        print(stripped)

html
head
body
span
h1
div
p
i
p
div
p


#### Encoding detection

In [16]:
html = "<div>Привет мир!</div>"
# The encoding detector needs raw bytes, so encode the text as cp1251 first.
html_bytes = bytes(html, 'cp1251')

In [17]:
# Display the cp1251-encoded bytes.
html_bytes

b'<div>\xcf\xf0\xe8\xe2\xe5\xf2 \xec\xe8\xf0!</div>'

In [18]:
# Let the parser guess the encoding of the raw bytes and report its guess.
detecting_parser = HTMLParser(html_bytes, detect_encoding=True)
detecting_parser.input_encoding

'WINDOWS-1251'

#### Encoding detection using meta tags

In [19]:
# Encode markup that declares its charset in a <meta> tag, then ask the parser
# to take meta tags into account when detecting the encoding.
html = bytes('<head><meta charset="WINDOWS-1251"></head>', 'cp1251')
meta_parser = HTMLParser(html, detect_encoding=True, use_meta_tags=True)
meta_parser.input_encoding

'WINDOWS-1251'

In [20]:
# Same markup, but encoded as UTF-8 while the <meta> tag still says WINDOWS-1251.
html_utf = bytes('<head><meta charset="WINDOWS-1251"></head>', 'utf-8')
utf_parser = HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True)
utf_parser.input_encoding

'WINDOWS-1251'

### Advanced selector

Advanced selector mimics some of the features of XPath that are missing in CSS.

The ``select`` method accepts an optional initial CSS selector.

In [21]:
html = """
<script>
 var super_variable = 100;
</script>
<script>
 console.log('debug');
</script>
"""
tree = HTMLParser(html)

# Start from all <script> tags, then keep those whose text contains "super".
script_query = tree.select('script').text_contains("super")
[script.text() for script in script_query.matches]


['\n var super_variable = 100;\n']

### CSS chaining

Chaining allows executing multiple CSS selectors against the current scope quickly. Each new call filters previous results.

In [2]:
html = """
<div id="container">
    <span class="red"></span>
    <span class="green"></span>
    <span class="red"></span>
    <span class="green"></span>
</div>
"""
tree = HTMLParser(html)

# Each chained css() call filters the matches produced by the previous one.
red_spans = tree.select('div').css("span").css(".red").matches
print([span.html for span in red_spans])


['<span class="red"></span>', '<span class="red"></span>']


### Inserting nodes

In [None]:
html = """
<div id="container">
    <span class="red"></span>
    <span class="green"></span>
    <span class="red"></span>
    <span class="green"></span>
</div>
"""
tree = HTMLParser(html)

# All lookups go through `tree`, the parser built in this cell, so the cell
# does not depend on a parser left over from an earlier cell.

# Insert text
dest_node = tree.css_first('.red')
dest_node.insert_before("Hello")

# Insert nodes
# The insert_* methods take text or a node, so pass the parsed fragment's
# root element rather than the HTMLParser object itself.
subtree = HTMLParser("<div>Hi</div>")
dest_node = tree.css_first('.red')
dest_node.insert_before(subtree.body.child)

# Insert before, after, or append inside
# A fresh fragment is parsed for each insertion so that no single node is
# attached in more than one place.
dest_node = tree.css_first('.green')
dest_node.insert_before(HTMLParser("<div>Car</div>").body.child)
dest_node.insert_after(HTMLParser("<div>Car</div>").body.child)
dest_node.insert_child(HTMLParser("<div>Car</div>").body.child)