Skip to content
Permalink
Browse files

Extract links from XHTML documents with MIME-Type "application/xml"

"application/xhtml+xml" is already interpreted as HTML
and the link extractor already extracts links from it without problems.

Only for XML documents can the namespaces in tags be an issue.

Fixes #780

Do both the tag and attribute tests in the _iter_links() method
  • Loading branch information
redapple authored and dangra committed Jul 11, 2014
1 parent 91a1106 commit ed1f376d2b3cb9eb3bd1fef41c5a7be79531775d
Showing with 61 additions and 16 deletions.
  1. +15 −15 scrapy/contrib/linkextractors/lxmlhtml.py
  2. +46 −1 scrapy/tests/test_contrib_linkextractors.py
@@ -36,30 +36,30 @@ def __init__(self, tag="a", attr="href", process=None, unique=False):

def _iter_links(self, document):
    """Yield candidate link attributes found in a parsed document.

    Walks ``document.iter(etree.Element)`` and keeps only the elements
    accepted by ``self.scan_tag`` and, on those, only the attributes
    accepted by ``self.scan_attr``.  Both filters are applied here so
    callers do not have to repeat them.

    :param document: parsed tree root exposing ``iter()`` and elements
        with ``tag`` / ``attrib`` (presumably an lxml element -- confirm
        against the caller).
    :returns: generator of ``(element, attribute_name, attribute_value)``
        tuples.
    """
    for el in document.iter(etree.Element):
        # Test the tag after passing it through _nons(): in documents parsed
        # as XML the raw tag can carry a namespace, which would never match
        # a plain tag name such as "a".
        # NOTE(review): assumes _nons() strips the namespace part -- confirm.
        if not self.scan_tag(_nons(el.tag)):
            continue
        attribs = el.attrib
        for attrib in attribs:
            if not self.scan_attr(attrib):
                continue
            yield (el, attrib, attribs[attrib])

def _extract_links(self, selector, response_url, response_encoding, base_url):
    """Build ``Link`` objects for every attribute yielded by ``_iter_links``.

    Tag and attribute filtering is done once, inside ``_iter_links``; this
    method only resolves, post-processes and wraps the attribute values.

    :param selector: object whose ``_root`` attribute is the parsed document.
    :param response_url: URL used to re-resolve the processed value.
    :param response_encoding: encoding applied when the processed value is
        a ``unicode`` object.
    :param base_url: URL against which the raw attribute value is resolved.
    :returns: list of ``Link`` objects; de-duplicated by ``link.url`` through
        ``unique_list`` when ``self.unique`` is true.
    """
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector._root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        attr_val = urljoin(base_url, attr_val)
        url = self.process_attr(attr_val)
        if url is None:
            # process_attr() returned None: no link is produced for this value
            continue
        if isinstance(url, unicode):
            url = url.encode(response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(url, _collect_string_content(el) or u'',
                    nofollow=el.get('rel') == 'nofollow')
        links.append(link)

    return unique_list(links, key=lambda link: link.url) \
        if self.unique else links
@@ -1,7 +1,7 @@
import re
import unittest
from scrapy.contrib.linkextractors.regex import RegexLinkExtractor
from scrapy.http import HtmlResponse
from scrapy.http import HtmlResponse, XmlResponse
from scrapy.link import Link
from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor
@@ -419,6 +419,51 @@ def test_tags_attrs(self):
Link(url='http://example.com/get?id=2', text=u'Item 2', fragment='', nofollow=False)
])

def test_xhtml(self):
    # One XHTML document is fed to the extractor twice: first wrapped in an
    # HtmlResponse, then in an XmlResponse.  Both must give the same links.
    xhtml = """
<?xml version="1.0"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>XHTML document title</title>
</head>
<body>
<div class='links'>
<p><a href="/about.html">About us</a></p>
</div>
<div>
<p><a href="/follow.html">Follow this link</a></p>
</div>
<div>
<p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
</div>
<div>
<p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
</div>
</body>
</html>
"""

    # Only rel="nofollow" sets the nofollow flag; rel="blah" does not.
    expected_links = [
        Link(url='http://example.com/about.html', text=u'About us', fragment='', nofollow=False),
        Link(url='http://example.com/follow.html', text=u'Follow this link', fragment='', nofollow=False),
        Link(url='http://example.com/nofollow.html', text=u'Dont follow this one', fragment='', nofollow=True),
        Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False),
    ]

    # Same order as before: HTML response first, then XML response, with a
    # fresh extractor instance for each.
    for response_cls in (HtmlResponse, XmlResponse):
        response = response_cls("http://example.com/index.xhtml", body=xhtml)
        extractor = self.extractor_cls()
        self.assertEqual(extractor.extract_links(response), expected_links)


class LxmlLinkExtractorTestCase(SgmlLinkExtractorTestCase):

0 comments on commit ed1f376

Please sign in to comment.
You can’t perform that action at this time.