Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Loading…

Add extract_first() method to SelectorList #572

Closed
wants to merge 6 commits into from

5 participants

@shirk3y

Related to discussion #568

@dangra
Owner

cool, missing tests and docs.

scrapy/selector/unified.py
@@ -172,6 +172,10 @@ def re(self, regex):
def extract(self):
return [x.extract() for x in self]
+ def extract_first(self):
+ for x in self.extract():
@kmike Owner
kmike added a note

This is a bit inefficient: there is no need to build the full [x.extract() for x in self] list if we're only interested in the first value.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
@shirk3y

Yep, I optimized it a little.

docs/topics/selectors.rst
@@ -117,6 +117,16 @@ method, as follows::
>>> sel.xpath('//title/text()').extract()
[u'Example website']
+If you want to extract only first matched element, you must call the selector ``.extract_first()``
@kmike Owner
kmike added a note

I think "must" is too strong here - there are other means of taking the first matched element.

@shirk3y
shirk3y added a note

Agreed

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
docs/topics/selectors.rst
@@ -117,6 +117,16 @@ method, as follows::
>>> sel.xpath('//title/text()').extract()
[u'Example website']
+If you want to extract only first matched element, you must call the selector ``.extract_first()``
+
+ >>> sel.xpath('//ul/li').extract_first()
+ u'First list element'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
docs/topics/selectors.rst
@@ -117,6 +117,16 @@ method, as follows::
>>> sel.xpath('//title/text()').extract()
[u'Example website']
+If you want to extract only first matched element, you must call the selector ``.extract_first()``
+
+ >>> sel.xpath('//ul/li').extract_first()
+ u'First list element'
+
+It returns ``None`` if no element was found:
+
+ >>> sel.xpath('//ul/li[999]').extract_first()
+ None
@kmike Owner
kmike added a note

The Python shell doesn't print None in such cases. Maybe write "... is None" on the >>> line and show True as the output?

@shirk3y
shirk3y added a note

That's true, I'll fix it soon

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
@ananana ananana referenced this pull request
Closed

Selectorlist extract first #624

@kreedz

What about this feature? Will it be implemented?

@kmike
Owner

I like this feature and I think we should add it. Every other library has this feature, even browsers have it via document.querySelector. There is a follow-up PR which fixes issues with this PR (#624). The problem is that we haven't agreed on it yet - see #568.

@kmike
Owner

Closing it in favor of #624.

@kmike kmike closed this
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
This page is out of date. Refresh to see the latest.
View
10 docs/topics/selectors.rst
@@ -117,6 +117,16 @@ method, as follows::
>>> sel.xpath('//title/text()').extract()
[u'Example website']
+If you want to extract only the first matched element, you can call the selector's ``.extract_first()`` method
+
+ >>> sel.xpath('//div[@id="images"]/a/text()').extract_first()
+ u'Name: My image 1 '
+
+It returns ``None`` if no element was found:
+
+ >>> sel.xpath('//div[@id="not-exists"]/text()').extract_first() is None
+ True
+
Notice that CSS selectors can select text or attribute nodes using CSS3
pseudo-elements::
View
10 scrapy/selector/unified.py
@@ -6,7 +6,7 @@
from scrapy.utils.misc import extract_regex
from scrapy.utils.trackref import object_ref
-from scrapy.utils.python import unicode_to_str, flatten
+from scrapy.utils.python import unicode_to_str, flatten, iflatten
from scrapy.utils.decorator import deprecated
from scrapy.http import HtmlResponse, XmlResponse
from .lxmldocument import LxmlDocument
@@ -169,9 +169,17 @@ def css(self, xpath):
def re(self, regex):
return flatten([x.re(regex) for x in self])
+ def re_first(self, regex):
+ for el in iflatten((x.re(regex) for x in self)):
+ return el
+
def extract(self):
return [x.extract() for x in self]
+ def extract_first(self):
+ for x in self:
+ return x.extract()
+
@deprecated(use_instead='.extract()')
def extract_unquoted(self):
return [x.extract_unquoted() for x in self]
View
35 scrapy/tests/test_selector.py
@@ -37,6 +37,41 @@ def test_simple_selection(self):
self.assertEqual([x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
[u'12'])
+ def test_extract_first(self):
+ """Test if extract_first() returns first element"""
+ body = '<ul><li id="1">1</li><li id="2">2</li></ul>'
+ response = TextResponse(url="http://example.com", body=body)
+ sel = self.sscls(response)
+
+ self.assertEqual(sel.xpath('//ul/li/text()').extract_first(),
+ sel.xpath('//ul/li/text()').extract()[0])
+
+ self.assertEqual(sel.xpath('//ul/li[@id="1"]/text()').extract_first(),
+ sel.xpath('//ul/li[@id="1"]/text()').extract()[0])
+
+ self.assertEqual(sel.xpath('//ul/li[2]/text()').extract_first(),
+ sel.xpath('//ul/li/text()').extract()[1])
+
+ self.assertEqual(sel.xpath('/ul/li[@id="doesnt-exist"]/text()').extract_first(), None)
+
+ def test_re_first(self):
+ """Test if re_first() returns first matched element"""
+ body = '<ul><li id="1">1</li><li id="2">2</li></ul>'
+ response = TextResponse(url="http://example.com", body=body)
+ sel = self.sscls(response)
+
+ self.assertEqual(sel.xpath('//ul/li/text()').re_first('\d'),
+ sel.xpath('//ul/li/text()').re('\d')[0])
+
+ self.assertEqual(sel.xpath('//ul/li[@id="1"]/text()').re_first('\d'),
+ sel.xpath('//ul/li[@id="1"]/text()').re('\d')[0])
+
+ self.assertEqual(sel.xpath('//ul/li[2]/text()').re_first('\d'),
+ sel.xpath('//ul/li/text()').re('\d')[1])
+
+ self.assertEqual(sel.xpath('/ul/li/text()').re_first('\w+'), None)
+ self.assertEqual(sel.xpath('/ul/li[@id="doesnt-exist"]/text()').re_first('\d'), None)
+
def test_select_unicode_query(self):
body = u"<p><input name='\xa9' value='1'/></p>"
response = TextResponse(url="http://example.com", body=body, encoding='utf8')
View
15 scrapy/utils/python.py
@@ -42,13 +42,20 @@ def flatten(x):
>>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
[1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]"""
- result = []
+ return list(iflatten(x))
+
+
+def iflatten(x):
+ """iflatten(sequence) -> iterator
+
+ Similar to ``.flatten()``, but returns iterator instead"""
+
for el in x:
if hasattr(el, "__iter__"):
- result.extend(flatten(el))
+ for el_ in flatten(el):
+ yield el_
else:
- result.append(el)
- return result
+ yield el
def unique(list_, key=lambda x: x):
Something went wrong with that request. Please try again.