Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

100% Coverage #15

Merged
merged 1 commit into from
Aug 14, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions parsel/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,14 @@ def create_root_node(text, parser_cls, base_url=None):

class SelectorList(list):

# __getslice__ is deprecated but `list` builtin implements it only in Py2
def __getslice__(self, i, j):
    """Return ``self[i:j]`` as an instance of this list's own class.

    Only Python 2's ``list`` implements ``__getslice__``; the result of the
    base-class slice is re-wrapped with ``self.__class__``.
    """
    o = super(SelectorList, self).__getslice__(i, j)
    return self.__class__(o)

def __getitem__(self, pos):
    """Return the item at ``pos``, or a list of this class when ``pos`` is a slice."""
    o = super(SelectorList, self).__getitem__(pos)
    # A slice lookup is re-wrapped with this class; a single item is
    # returned exactly as the base class produced it.
    return self.__class__(o) if isinstance(pos, slice) else o

def xpath(self, xpath):
    """Call ``.xpath(xpath)`` on every element of this list.

    The per-element results are flattened and returned as a new instance of
    this list's own class.
    """
    return self.__class__(flatten([x.xpath(xpath) for x in self]))
Expand Down Expand Up @@ -158,8 +164,6 @@ def extract(self):
return six.text_type(self.root)

def register_namespace(self, prefix, uri):
    """Map the namespace ``prefix`` to ``uri`` in ``self.namespaces``."""
    # Create the mapping on first use so registration also works when
    # ``self.namespaces`` is still None.
    if self.namespaces is None:
        self.namespaces = {}
    self.namespaces[prefix] = uri

def remove_namespaces(self):
Expand All @@ -171,8 +175,9 @@ def remove_namespaces(self):
if an.startswith('{'):
el.attrib[an.split('}', 1)[1]] = el.attrib.pop(an)

def __bool__(self):
    """Return the truth value of the extracted content, ``bool(self.extract())``."""
    return bool(self.extract())
# Python 2 consults ``__nonzero__`` for truth testing; share one implementation.
__nonzero__ = __bool__

def __str__(self):
data = repr(self.extract()[:40])
Expand Down
41 changes: 13 additions & 28 deletions parsel/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,63 +25,48 @@ def iflatten(x):
"""iflatten(sequence) -> iterator
Similar to ``.flatten()``, but returns iterator instead"""
for el in x:
if is_listlike(el):
if _is_listlike(el):
for el_ in flatten(el):
yield el_
else:
yield el


def _is_listlike(x):
    """Return True if ``x`` has ``__iter__`` and is not a text or bytes string.

    >>> _is_listlike("foo")
    False
    >>> _is_listlike(5)
    False
    >>> _is_listlike(b"foo")
    False
    >>> _is_listlike([b"foo"])
    True
    >>> _is_listlike((b"foo",))
    True
    >>> _is_listlike({})
    True
    >>> _is_listlike(set())
    True
    >>> _is_listlike((x for x in range(3)))
    True
    >>> _is_listlike(six.moves.xrange(5))
    True
    """
    return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))


# Earlier public spelling of the same helper, kept so code that still calls
# ``is_listlike`` continues to work.
is_listlike = _is_listlike


def to_unicode(text, encoding='utf-8', errors='strict'):
    """Return the unicode representation of a bytes object `text`.

    If `text` is already a unicode object, it is returned as-is; otherwise
    the result of ``text.decode(encoding, errors)`` is returned.
    """
    if isinstance(text, six.text_type):
        return text
    return text.decode(encoding, errors)


def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned

    ``regex`` may be a pattern string (compiled with ``re.UNICODE``) or an
    already compiled pattern. ``encoding`` is only used to decode matches
    that are not already unicode strings.
    """
    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]  # named group
    except (AttributeError, IndexError):
        # AttributeError: ``search`` found nothing and returned None.
        # IndexError: the pattern has no group named "extract".
        strings = regex.findall(text)  # full regex or numbered groups

    results = []
    for s in flatten(strings):
        if not isinstance(s, six.text_type):
            s = s.decode(encoding)
        results.append(replace_entities(s, keep=['lt', 'amp']))
    return results
53 changes: 40 additions & 13 deletions tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,33 @@ def test_differences_parsing_xml_vs_html(self):
self.assertEqual(xs.xpath("//div").extract(),
[u'<div><img src="a.jpg"><p>Hello</p></img></div>'])

def test_error_for_unknown_selector_type(self):
    """Constructing a selector with an unrecognised ``type`` raises ValueError."""
    self.assertRaises(ValueError, self.sscls, text=u'', type='_na_')

def test_text_or_root_is_required(self):
    """Constructing a selector with no arguments raises ValueError with a clear message."""
    self.assertRaisesRegexp(ValueError,
                            'Selector needs either text or root argument',
                            self.sscls)

def test_bool(self):
    """A selector is falsy when it extracts to an empty string, truthy otherwise."""
    text = u'<a href="" >false</a><a href="nonempty">true</a>'
    hs = self.sscls(text=text, type='html')

    # First anchor: empty ``href`` attribute.
    falsish = hs.xpath('//a/@href')[0]
    self.assertEqual(falsish.extract(), u'')
    self.assertFalse(falsish)

    # Second anchor: non-empty ``href`` attribute.
    trueish = hs.xpath('//a/@href')[1]
    self.assertEqual(trueish.extract(), u'nonempty')
    self.assertTrue(trueish)

def test_slicing(self):
    """Indexing returns a selector; slicing returns a selector list."""
    text = u'<div><p>1</p><p>2</p><p>3</p></div>'
    hs = self.sscls(text=text, type='html')

    # Integer index -> single selector; slice -> selector list class.
    self.assertIsInstance(hs.css('p')[2], self.sscls)
    self.assertIsInstance(hs.css('p')[2:3], self.sscls.selectorlist_cls)
    self.assertIsInstance(hs.css('p')[:2], self.sscls.selectorlist_cls)

    # The slices contain the expected elements.
    self.assertEqual(hs.css('p')[2:3].extract(), [u'<p>3</p>'])
    self.assertEqual(hs.css('p')[1:3].extract(), [u'<p>2</p>', u'<p>3</p>'])

def test_nested_selectors(self):
"""Nested selector tests"""
body = u"""<body>
Expand Down Expand Up @@ -378,6 +405,19 @@ def test_configure_base_url(self):
self.assertEquals(u'http://example.com', sel.root.base)


def test_extending_selector(self):
    """A Selector subclass with its own ``selectorlist_cls`` yields those types."""
    class MySelectorList(Selector.selectorlist_cls):
        pass

    class MySelector(Selector):
        selectorlist_cls = MySelectorList

    sel = MySelector(text=u'<html><div>foo</div></html>')
    # Both query methods return the custom list, whose items are the
    # custom selector class.
    self.assertIsInstance(sel.xpath('//div'), MySelectorList)
    self.assertIsInstance(sel.xpath('//div')[0], MySelector)
    self.assertIsInstance(sel.css('div'), MySelectorList)
    self.assertIsInstance(sel.css('div')[0], MySelector)

class ExsltTestCase(unittest.TestCase):

sscls = Selector
Expand Down Expand Up @@ -493,16 +533,3 @@ def test_set(self):
//div[@itemtype="http://schema.org/Event"]
//*[@itemscope]/*/@itemprop)''').extract(),
[u'url', u'name', u'startDate', u'location', u'offers'])

def test_extending_selector(self):
    """A Selector subclass with its own ``selectorlist_cls`` yields those types."""
    class MySelectorList(Selector.selectorlist_cls):
        pass

    class MySelector(Selector):
        selectorlist_cls = MySelectorList

    sel = MySelector(text=u'<html><div>foo</div></html>')
    # Both query methods return the custom list, whose items are the
    # custom selector class.
    self.assertIsInstance(sel.xpath('//div'), MySelectorList)
    self.assertIsInstance(sel.xpath('//div')[0], MySelector)
    self.assertIsInstance(sel.css('div'), MySelectorList)
    self.assertIsInstance(sel.css('div')[0], MySelector)