Skip to content

Commit

Permalink
add parser_cls argument, changes default html parser to html.HTMLParser
Browse files Browse the repository at this point in the history
This changes the default HTML parser to html.HTMLParser, and also
introduces a parameter in Selector to specify another parser class
if desired.

The parser_cls parameter lets users who care a great deal about
performance plug in a custom parser class if desired.

This will affect Scrapy because it just uses the default here, but it
doesn't seem to have a noticeable impact on performance, as per @kmike's
benchmark shared here:

https://gist.github.com/kmike/af647777cef39c3d01071905d176c006
  • Loading branch information
eliasdorneles committed Aug 10, 2016
1 parent 1bba625 commit 13eb040
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 4 deletions.
8 changes: 4 additions & 4 deletions parsel/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys

import six
from lxml import etree
from lxml import etree, html

from .utils import flatten, iflatten, extract_regex
from .csstranslator import HTMLTranslator, GenericTranslator
Expand All @@ -17,7 +17,7 @@ def __init__(self, *args, **kwargs):
super(SafeXMLParser, self).__init__(*args, **kwargs)

_ctgroup = {
'html': {'_parser': etree.HTMLParser,
'html': {'_parser': html.HTMLParser,
'_csstranslator': HTMLTranslator(),
'_tostring_method': 'html'},
'xml': {'_parser': SafeXMLParser,
Expand Down Expand Up @@ -139,9 +139,9 @@ class Selector(object):
selectorlist_cls = SelectorList

def __init__(self, text=None, type=None, namespaces=None, root=None,
base_url=None, _expr=None):
base_url=None, _expr=None, parser_cls=None):
self.type = st = _st(type or self._default_type)
self._parser = _ctgroup[st]['_parser']
self._parser = parser_cls or _ctgroup[st]['_parser']
self._csstranslator = _ctgroup[st]['_csstranslator']
self._tostring_method = _ctgroup[st]['_tostring_method']

Expand Down
10 changes: 10 additions & 0 deletions tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,15 @@ def test_configure_base_url(self):
sel = self.sscls(text=u'nothing', base_url='http://example.com')
self.assertEquals(u'http://example.com', sel.root.base)

def test_custom_parser_cls(self):
    """Check that ``parser_cls`` selects the parser used to build the tree.

    The test asserts that elements parsed with ``etree.HTMLParser`` do not
    expose ``make_links_absolute``, while elements parsed with
    ``html.HTMLParser`` do.
    """
    from lxml import etree, html
    text = u"<a>example</a>"

    # Build the selector and run the query outside the assertion context so
    # that only the attribute lookup itself is allowed to raise; an
    # unrelated AttributeError during construction must fail the test.
    sel = self.sscls(text=text, parser_cls=etree.HTMLParser)
    link = sel.xpath('//a')[0].root
    with self.assertRaisesRegexp(
            AttributeError, "has no attribute 'make_links_absolute'"):
        # Bare attribute access is intentional: the lookup is what raises.
        link.make_links_absolute

    # Use self.sscls (like the other tests in this class) so subclassed
    # test cases exercise their own selector class.
    sel = self.sscls(text=text, parser_cls=html.HTMLParser)
    self.assertIsNotNone(sel.xpath('//a')[0].root.make_links_absolute)

def test_extending_selector(self):
class MySelectorList(Selector.selectorlist_cls):
Expand All @@ -418,6 +427,7 @@ class MySelector(Selector):
self.assertIsInstance(sel.css('div'), MySelectorList)
self.assertIsInstance(sel.css('div')[0], MySelector)


class ExsltTestCase(unittest.TestCase):

sscls = Selector
Expand Down

0 comments on commit 13eb040

Please sign in to comment.