Commit

Merge branch 'master' into py3-linkextractors
Conflicts:
	scrapy/linkextractors/lxmlhtml.py
	tests/test_linkextractors.py
kmike committed Aug 27, 2015
2 parents ff24cbb + aa31811 commit 9bfe6ec
Showing 5 changed files with 22 additions and 6 deletions.
4 changes: 2 additions & 2 deletions scrapy/linkextractors/lxmlhtml.py
@@ -7,7 +7,7 @@
import lxml.etree as etree

from scrapy.link import Link
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list, to_native_str
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.utils.response import get_base_url
@@ -60,7 +60,7 @@ def _extract_links(self, selector, response_url, response_encoding, base_url):
# to fix relative links after process_value
url = urljoin(response_url, url)
link = Link(url, _collect_string_content(el) or u'',
nofollow=True if el.get('rel') == 'nofollow' else False)
nofollow=rel_has_nofollow(el.get('rel')))
links.append(link)
return self._deduplicate_if_needed(links)

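Note (not part of the commit): in HTML, rel is a space-separated list of link types, so the old strict comparison el.get('rel') == 'nofollow' only matched when nofollow was the sole value and missed links such as <a rel="external nofollow">. A rough sketch of the difference, using an illustrative rel value:

    rel = "external nofollow"
    old_nofollow = (rel == "nofollow")                             # False: multi-valued rel missed
    new_nofollow = rel is not None and "nofollow" in rel.split()   # True: the check rel_has_nofollow performs
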
4 changes: 2 additions & 2 deletions scrapy/linkextractors/sgml.py
@@ -9,7 +9,7 @@
from scrapy.selector import Selector
from scrapy.link import Link
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list, to_unicode
from scrapy.utils.response import get_base_url
from scrapy.exceptions import ScrapyDeprecationWarning
@@ -80,7 +80,7 @@ def unknown_starttag(self, tag, attrs):
if self.scan_attr(attr):
url = self.process_value(value)
if url is not None:
link = Link(url=url, nofollow=True if dict(attrs).get('rel') == 'nofollow' else False)
link = Link(url=url, nofollow=rel_has_nofollow(dict(attrs).get('rel')))
self.links.append(link)
self.current_link = link

5 changes: 5 additions & 0 deletions scrapy/utils/misc.py
@@ -112,3 +112,8 @@ def md5sum(file):
break
m.update(d)
return m.hexdigest()

def rel_has_nofollow(rel):
"""Return True if link rel attribute has nofollow type"""
return True if rel is not None and 'nofollow' in rel.split() else False

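A few illustrative calls to the new helper (expected results inferred from its definition above, not output recorded in the commit):

    from scrapy.utils.misc import rel_has_nofollow

    rel_has_nofollow("nofollow")           # True
    rel_has_nofollow("external nofollow")  # True  (space-separated rel values are split)
    rel_has_nofollow("blah")               # False
    rel_has_nofollow(None)                 # False (missing rel attribute)
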
13 changes: 11 additions & 2 deletions tests/test_linkextractors.py
@@ -112,6 +112,9 @@ def test_nofollow(self):
<div>
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
</div>
<div>
<p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
</div>
</body></html>"""
response = HtmlResponse("http://example.org/somepage/index.html", body=html)

@@ -121,6 +124,7 @@ def test_nofollow(self):
Link(url='http://example.org/follow.html', text=u'Follow this link'),
Link(url='http://example.org/nofollow.html', text=u'Dont follow this one', nofollow=True),
Link(url='http://example.org/nofollow2.html', text=u'Choose to follow or not'),
Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True),
])

def test_matches(self):
@@ -369,6 +373,9 @@ def test_xhtml(self):
<div>
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
</div>
<div>
<p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
</div>
</body>
</html>
"""
@@ -380,7 +387,8 @@ def test_xhtml(self):
[Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True)]
)

response = XmlResponse("http://example.com/index.xhtml", body=xhtml)
@@ -390,7 +398,8 @@ def test_xhtml(self):
[Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
Link(url='http://google.com/something', text=u'External link not to follow', nofollow=True)]
)

def test_link_wrong_href(self):
2 changes: 2 additions & 0 deletions tests/test_linkextractors_deprecated.py
@@ -149,12 +149,14 @@ def test_link_nofollow(self):
html = """
<a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
<a href="about.html">About us</a>
<a href="http://google.com/something" rel="external nofollow">Something</a>
"""
response = HtmlResponse("http://example.org/page.html", body=html)
lx = SgmlLinkExtractor()
self.assertEqual([link for link in lx.extract_links(response)], [
Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
Link(url='http://google.com/something', text=u'Something', nofollow=True),
])

