In [1]:
from selectolax.parser import HTMLParser

In [25]:
# Sample HTML document that the selector examples below parse.
html = """
<body>
    <span id="vspan"></span>
    <h1>Welcome to selectolax tutorial</h1>
    <div id="text">
        <p class='p3' style='display:none;'>Excepteur sint occaecat cupidatat non proident</p>
        <p class='p3' vid>Lorem ipsum</p>
    </div>
    <div>
        <p id='stext'>Lorem ipsum dolor sit amet, ea quo modus meliore platonem.</p>
    </div>
</body>
"""

#### Select all p tags with class p3

In [26]:
selector = "p.p3"
separator = '---------------------'

# Parse the document once, then describe every node matched by the selector.
tree = HTMLParser(html)
for node in tree.css(selector):
    print(separator)
    print('Node: %s' % node.html)
    print('attributes: %s' % node.attributes)
    print('node text: %s' % node.text)
    print('tag: %s' % node.tag)
    print('parent tag: %s' % node.parent.tag)
    # Only report the last child when the node actually has one.
    tail = node.last_child
    if tail:
        print('last child inside current node: %s' % tail.html)
    print(separator + '\n')

---------------------
Node: <p class="p3" style="display:none;">Excepteur sint occaecat cupidatat non proident</p>
attributes: {'style': 'display:none;', 'class': 'p3'}
node text: Excepteur sint occaecat cupidatat non proident
tag: p
parent tag: div
last child inside current node: Excepteur sint occaecat cupidatat non proident
---------------------

---------------------
Node: <p class="p3" vid="">Lorem ipsum</p>
attributes: {'vid': None, 'class': 'p3'}
node text: Lorem ipsum
tag: p
parent tag: div
last child inside current node: Lorem ipsum
---------------------



#### Select first match

In [24]:
# css_first returns only the first node matching the selector.
heading = HTMLParser(html).css_first('h1')
print("H1: %s" % heading.text)

H1: Welcome to selectolax tutorial


#### Default return value if there are no matches

In [25]:
# The document has no <title> tag, so the supplied default is returned instead.
title = HTMLParser(html).css_first('title', default='not-found')
print("Title: %s" % title)

Title: not-found


#### Strictly one match

In [26]:
# With strict=True, css_first raises ValueError when the selector matches more
# than one node; "p.p3" matches two <p> tags in the sample document.
# The error is caught and printed so this demonstration does not abort
# "Restart & Run All" before the remaining cells execute.
try:
    HTMLParser(html).css_first("p.p3", default='not-found', strict=True)
except ValueError as exc:
    print("ValueError: %s" % exc)

ValueError: Excepted 1 match, but found 2 matches

#### Print parent of p#stext

In [35]:
# Locate the paragraph by id, then show the markup of the element that contains it.
stext = HTMLParser(html).css_first('p#stext')
stext.parent.html

'<div>\n        <p id="stext">Lorem ipsum dolor sit amet, ea quo modus meliore platonem.</p>\n    </div>'

#### Nested selectors

In [4]:
# A matched node can itself be queried: search inside div#text only.
text_div = HTMLParser(html).css_first('div#text')
text_div.css_first('p:nth-child(2)').html

'<p class="p3" vid="">Lorem ipsum</p>'

#### Encoding detection

In [14]:
html = "<div>Привет мир!</div>"
# The encoding detector operates on bytes rather than str, so encode the
# text as cp1251 first.
html_bytes = bytes(html, 'cp1251')

In [3]:
# Inspect the raw cp1251-encoded bytes.
html_bytes

b'<div>\xcf\xf0\xe8\xe2\xe5\xf2 \xec\xe8\xf0!</div>'

In [15]:
# Ask the parser to detect the encoding of the byte input and report the result.
detected = HTMLParser(html_bytes, detect_encoding=True)
detected.input_encoding

'WINDOWS-1251'

#### Encoding detection using meta tags

In [20]:
# Document whose <meta charset> declares WINDOWS-1251, encoded as cp1251.
html = bytes('<head><meta charset="WINDOWS-1251"></head>', 'cp1251')
meta_parser = HTMLParser(html, detect_encoding=True, use_meta_tags=True)
meta_parser.input_encoding

'WINDOWS-1251'

In [19]:
# Same markup, but encoded as UTF-8 while the <meta charset> still declares WINDOWS-1251.
html_utf = bytes('<head><meta charset="WINDOWS-1251"></head>', 'utf-8')
utf_parser = HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True)
utf_parser.input_encoding

'WINDOWS-1251'