diff --git a/parsel/selector.py b/parsel/selector.py index 53c3fbdb..9d172656 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -43,8 +43,14 @@ def create_root_node(text, parser_cls, base_url=None): class SelectorList(list): + # __getslice__ is deprecated but `list` builtin implements it only in Py2 def __getslice__(self, i, j): - return self.__class__(list.__getslice__(self, i, j)) + o = super(SelectorList, self).__getslice__(i, j) + return self.__class__(o) + + def __getitem__(self, pos): + o = super(SelectorList, self).__getitem__(pos) + return self.__class__(o) if isinstance(pos, slice) else o def xpath(self, xpath): return self.__class__(flatten([x.xpath(xpath) for x in self])) @@ -158,8 +164,6 @@ def extract(self): return six.text_type(self.root) def register_namespace(self, prefix, uri): - if self.namespaces is None: - self.namespaces = {} self.namespaces[prefix] = uri def remove_namespaces(self): @@ -171,8 +175,9 @@ def remove_namespaces(self): if an.startswith('{'): el.attrib[an.split('}', 1)[1]] = el.attrib.pop(an) - def __nonzero__(self): + def __bool__(self): return bool(self.extract()) + __nonzero__ = __bool__ def __str__(self): data = repr(self.extract()[:40]) diff --git a/parsel/utils.py b/parsel/utils.py index d1c7e59a..5c2bdef5 100644 --- a/parsel/utils.py +++ b/parsel/utils.py @@ -25,52 +25,43 @@ def iflatten(x): """iflatten(sequence) -> iterator Similar to ``.flatten()``, but returns iterator instead""" for el in x: - if is_listlike(el): + if _is_listlike(el): for el_ in flatten(el): yield el_ else: yield el -def is_listlike(x): +def _is_listlike(x): """ - >>> is_listlike("foo") + >>> _is_listlike("foo") False - >>> is_listlike(5) + >>> _is_listlike(5) False - >>> is_listlike(b"foo") + >>> _is_listlike(b"foo") False - >>> is_listlike([b"foo"]) + >>> _is_listlike([b"foo"]) True - >>> is_listlike((b"foo",)) + >>> _is_listlike((b"foo",)) True - >>> is_listlike({}) + >>> _is_listlike({}) True - >>> is_listlike(set()) + >>> _is_listlike(set()) True - >>> is_listlike((x for x in range(3))) + >>> _is_listlike((x for x in range(3))) True - >>> is_listlike(six.moves.xrange(5)) + >>> _is_listlike(six.moves.xrange(5)) True """ return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes)) -def to_unicode(text, encoding='utf-8', errors='strict'): - """Return the unicode representation of a bytes object `text`. If `text` - is already an unicode object, return it as-is.""" - if isinstance(text, six.text_type): - return text - return text.decode(encoding, errors) - - -def extract_regex(regex, text, encoding='utf-8'): +def extract_regex(regex, text): """Extract a list of unicode strings from the given text/encoding using the following policies: * if the regex contains a named group called "extract" that will be returned * if the regex contains multiple numbered groups, all those will be returned (flattened) * if the regex doesn't contain any group the entire regex matching is returned """ - if isinstance(regex, six.string_types): regex = re.compile(regex, re.UNICODE) @@ -78,10 +69,4 @@ def extract_regex(regex, text, encoding='utf-8'): strings = [regex.search(text).group('extract')] # named group except: strings = regex.findall(text) # full regex or numbered groups - strings = flatten(strings) - - if isinstance(text, six.text_type): - return [replace_entities(s, keep=['lt', 'amp']) for s in strings] - else: - return [replace_entities(to_unicode(s, encoding), keep=['lt', 'amp']) - for s in strings] + return [replace_entities(s, keep=['lt', 'amp']) for s in flatten(strings)] diff --git a/tests/test_selector.py b/tests/test_selector.py index 6515d728..29446d4e 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -132,6 +132,33 @@ def test_differences_parsing_xml_vs_html(self): self.assertEqual(xs.xpath("//div").extract(), [u'

Hello

']) + def test_error_for_unknown_selector_type(self): + self.assertRaises(ValueError, self.sscls, text=u'', type='_na_') + + def test_text_or_root_is_required(self): + self.assertRaisesRegexp(ValueError, + 'Selector needs either text or root argument', + self.sscls) + + def test_bool(self): + text = u'falsetrue' + hs = self.sscls(text=text, type='html') + falsish = hs.xpath('//a/@href')[0] + self.assertEqual(falsish.extract(), u'') + self.assertFalse(falsish) + trueish = hs.xpath('//a/@href')[1] + self.assertEqual(trueish.extract(), u'nonempty') + self.assertTrue(trueish) + + def test_slicing(self): + text = u'

1

2

3

' + hs = self.sscls(text=text, type='html') + self.assertIsInstance(hs.css('p')[2], self.sscls) + self.assertIsInstance(hs.css('p')[2:3], self.sscls.selectorlist_cls) + self.assertIsInstance(hs.css('p')[:2], self.sscls.selectorlist_cls) + self.assertEqual(hs.css('p')[2:3].extract(), [u'

3

']) + self.assertEqual(hs.css('p')[1:3].extract(), [u'

2

', u'

3

']) + def test_nested_selectors(self): """Nested selector tests""" body = u""" @@ -378,6 +405,19 @@ def test_configure_base_url(self): self.assertEquals(u'http://example.com', sel.root.base) + def test_extending_selector(self): + class MySelectorList(Selector.selectorlist_cls): + pass + + class MySelector(Selector): + selectorlist_cls = MySelectorList + + sel = MySelector(text=u'
foo
') + self.assertIsInstance(sel.xpath('//div'), MySelectorList) + self.assertIsInstance(sel.xpath('//div')[0], MySelector) + self.assertIsInstance(sel.css('div'), MySelectorList) + self.assertIsInstance(sel.css('div')[0], MySelector) + class ExsltTestCase(unittest.TestCase): sscls = Selector @@ -493,16 +533,3 @@ def test_set(self): //div[@itemtype="http://schema.org/Event"] //*[@itemscope]/*/@itemprop)''').extract(), [u'url', u'name', u'startDate', u'location', u'offers']) - - def test_extending_selector(self): - class MySelectorList(Selector.selectorlist_cls): - pass - - class MySelector(Selector): - selectorlist_cls = MySelectorList - - sel = MySelector(text=u'
foo
') - self.assertIsInstance(sel.xpath('//div'), MySelectorList) - self.assertIsInstance(sel.xpath('//div')[0], MySelector) - self.assertIsInstance(sel.css('div'), MySelectorList) - self.assertIsInstance(sel.css('div')[0], MySelector)