/
benchmark.py
118 lines (89 loc) · 3.44 KB
/
benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# coding:utf-8
"""A simple benchmark that measures speed of lxml and selectolax.
How the benchmark works
-----------------------
For each page, we extract:
1) Title
2) Number of script tags
3) The ``href`` attribute from all links
4) The content of the Meta description tag
"""
import functools
import json
import time
from bs4 import BeautifulSoup
from html5_parser import parse
from lxml.html import fromstring
from selectolax.parser import HTMLParser
from selectolax.lexbor import LexborHTMLParser
bad_urls = []
def bs4_parser(html_content):
    """Benchmark body for BeautifulSoup.

    Extracts the four benchmark fields from *html_content*: page title,
    ``href`` of every link, number of ``<script>`` tags, and the content
    of the meta-description tag.

    Fixes vs. the original:
    - dropped the dead ``parser=HTMLParser`` parameter (it was never used
      and its default leaked in from the selectolax variant);
    - only anchors WITH an href are collected (``href=True``), matching
      the ``a[href]`` / ``//a[@href]`` selectors of the other parsers so
      all engines do comparable work.

    :param html_content: raw HTML of one page.
    :raises AssertionError: if the page lacks a title, has fewer than
        five links, or contains no script tags.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    title_text = soup.title.string
    assert title_text
    a_hrefs = [a.attrs.get('href', '') for a in soup.find_all('a', href=True)]
    assert len(a_hrefs) >= 5, 'href'
    num_script_tags = len(soup.find_all('script'))
    assert num_script_tags > 0, 'script'
    meta_description = soup.find('meta', attrs={"name": "description"})
    if meta_description:
        meta_content = meta_description.get('content')
def selectolax_parser(html_content, parser=HTMLParser):
    """Benchmark body for selectolax.

    *parser* selects the engine: the default Modest-based ``HTMLParser``,
    or ``LexborHTMLParser`` when bound via ``functools.partial``.
    Extracts title, link hrefs, script-tag count, and the
    meta-description content; asserts the page looks non-trivial.
    """
    tree = parser(html_content)
    node = tree.css_first('title')
    title_text = node.text() if node else ""
    assert title_text
    a_hrefs = [link.attrs.get('href', '') for link in tree.css('a[href]')]
    assert len(a_hrefs) >= 5, 'href'
    num_script_tags = len(tree.css('script'))
    assert num_script_tags > 0, 'script'
    meta_description = tree.css_first('meta[name="description"]')
    if meta_description:
        meta_content = meta_description.attrs.sget('content', '')
def lxml_parser(html_content):
    """Benchmark body for lxml.html.

    Extracts title text, link hrefs, script-tag count, and the
    meta-description content from *html_content* via XPath.

    :param html_content: raw HTML of one page.
    :raises AssertionError: if the page lacks a title, has fewer than
        five links, or contains no script tags.
    """
    tree = fromstring(html_content)
    title_text = tree.xpath('//title/text()')
    assert title_text, 'title'
    a_hrefs = [a.attrib.get('href', '') for a in tree.xpath('//a[@href]')]
    assert len(a_hrefs) >= 5, 'href'
    num_script_tags = len(tree.xpath('//script'))
    assert num_script_tags > 0, 'script'
    # BUG FIX: the original relative path 'meta[@name="description"]' only
    # matched <meta> elements that are DIRECT children of the root <html>;
    # meta tags live inside <head>, so the lookup never succeeded.  The
    # descendant axis '//' searches the whole document.
    meta_description = tree.xpath('//meta[@name="description"]')
    if meta_description:
        meta_content = meta_description[0].attrib.get('content', '')
def html5_parser(html_content):
    """Benchmark body for html5-parser (html5_parser.parse + lxml XPath).

    Extracts title text, link hrefs, script-tag count, and the
    meta-description content from *html_content*.

    :param html_content: raw HTML of one page.
    :raises AssertionError: if the page lacks a title, has fewer than
        five links, or contains no script tags.
    """
    tree = parse(html_content)
    title_text = tree.xpath('//title/text()')
    assert title_text, 'title'
    a_hrefs = [a.attrib.get('href', '') for a in tree.xpath('//a[@href]')]
    assert len(a_hrefs) >= 5, 'href'
    num_script_tags = len(tree.xpath('//script'))
    assert num_script_tags > 0, 'script'
    # BUG FIX: same as lxml_parser — the relative path only matched direct
    # children of the root element, never the <meta> tags in <head>; use the
    # descendant axis '//' to search the whole document.
    meta_description = tree.xpath('//meta[@name="description"]')
    if meta_description:
        meta_content = meta_description[0].attrib.get('content', '')
def _perform_test(pages, parse_func):
for page in pages:
parse_func(page['html'])
def main():
    """Load the benchmark corpus and time each parser over every page.

    Reads one JSON document per line from ``pages/pages.json`` (each with
    an ``'html'`` key) and prints the wall-clock time each parser takes
    to process the whole corpus.
    """
    #
    # The corpus contains 754 main pages from the top internet domains
    # (according to Alexa rank), roughly 324MB of HTML data.  Because of
    # potential copyright infringements, it is not published.
    #
    # FIX: open the file with a context manager — the original leaked the
    # file handle by never closing it.
    with open('pages/pages.json', 'rt') as pages_file:
        html_pages = [json.loads(line) for line in pages_file]
    available_parsers = [
        ('bs4', bs4_parser,),
        ('lxml', lxml_parser,),
        ('html5_parser', html5_parser,),
        ('modest', selectolax_parser,),
        ('lexbor', functools.partial(selectolax_parser, parser=LexborHTMLParser)),
    ]
    for parser_name, parser in available_parsers:
        start = time.time()
        _perform_test(html_pages, parser)
        print('%r: %s' % (parser_name, time.time() - start))
# Run the benchmark only when executed as a script, not on import.
if __name__ == '__main__':
    main()