diff --git a/scrapy/linkextractors/htmlparser.py b/scrapy/linkextractors/htmlparser.py
index 202340f538f..dcc261b319f 100644
--- a/scrapy/linkextractors/htmlparser.py
+++ b/scrapy/linkextractors/htmlparser.py
@@ -41,7 +41,10 @@ def _extract_links(self, response_text, response_url, response_encoding):
for link in links:
if isinstance(link.url, unicode):
link.url = link.url.encode(response_encoding)
- link.url = urljoin(base_url, link.url)
+ try:
+ link.url = urljoin(base_url, link.url)
+ except ValueError:
+ continue
link.url = safe_url_string(link.url, response_encoding)
link.text = link.text.decode(response_encoding)
ret.append(link)
diff --git a/scrapy/linkextractors/lxmlhtml.py b/scrapy/linkextractors/lxmlhtml.py
index 606a45212a1..e9fa521f392 100644
--- a/scrapy/linkextractors/lxmlhtml.py
+++ b/scrapy/linkextractors/lxmlhtml.py
@@ -49,10 +49,14 @@ def _extract_links(self, selector, response_url, response_encoding, base_url):
# hacky way to get the underlying lxml parsed document
for el, attr, attr_val in self._iter_links(selector.root):
# pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
- attr_val = urljoin(base_url, attr_val)
- url = self.process_attr(attr_val)
- if url is None:
- continue
+ try:
+ attr_val = urljoin(base_url, attr_val)
+ except ValueError:
+ continue # skipping bogus links
+ else:
+ url = self.process_attr(attr_val)
+ if url is None:
+ continue
if isinstance(url, unicode):
url = url.encode(response_encoding)
# to fix relative links after process_value
diff --git a/scrapy/linkextractors/regex.py b/scrapy/linkextractors/regex.py
index 905eb89692a..b6f8d5d30dc 100644
--- a/scrapy/linkextractors/regex.py
+++ b/scrapy/linkextractors/regex.py
@@ -14,16 +14,25 @@ def clean_link(link_text):
"""Remove leading and trailing whitespace and punctuation"""
return link_text.strip("\t\r\n '\"")
+
class RegexLinkExtractor(SgmlLinkExtractor):
"""High performant link extractor"""
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
+ def clean_text(text):
+ return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()
+
+ def clean_url(url):
+ clean_url = ''
+ try:
+ clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
+ except ValueError:
+ pass
+ return clean_url
+
if base_url is None:
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
- clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
- clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
-
links_text = linkre.findall(response_text)
return [Link(clean_url(url).encode(response_encoding),
clean_text(text))
diff --git a/scrapy/linkextractors/sgml.py b/scrapy/linkextractors/sgml.py
index 4a6a24254ac..acecd9c0d59 100644
--- a/scrapy/linkextractors/sgml.py
+++ b/scrapy/linkextractors/sgml.py
@@ -42,7 +42,10 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur
for link in self.links:
if isinstance(link.url, unicode):
link.url = link.url.encode(response_encoding)
- link.url = urljoin(base_url, link.url)
+ try:
+ link.url = urljoin(base_url, link.url)
+ except ValueError:
+ continue
link.url = safe_url_string(link.url, response_encoding)
link.text = to_unicode(link.text, response_encoding, errors='replace').strip()
ret.append(link)
diff --git a/tests/test_linkextractors.py b/tests/test_linkextractors.py
index 948289f8f89..d78b25f2580 100644
--- a/tests/test_linkextractors.py
+++ b/tests/test_linkextractors.py
@@ -491,10 +491,36 @@ def test_xhtml(self):
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
)
+ def test_link_wrong_href(self):
+ html = """
+ <a href="http://example.org/item1.html">Item 1</a>
+ <a href="http://[example.org/item2.html">Item 2</a>
+ <a href="http://example.org/item3.html">Item 3</a>
+ """
+ response = HtmlResponse("http://example.org/index.html", body=html)
+ lx = self.extractor_cls()
+ self.assertEqual([link for link in lx.extract_links(response)], [
+ Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+ Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+ ])
+
class LxmlLinkExtractorTestCase(SgmlLinkExtractorTestCase):
extractor_cls = LxmlLinkExtractor
+ def test_link_wrong_href(self):
+ html = """
+ <a href="http://example.org/item1.html">Item 1</a>
+ <a href="http://[example.org/item2.html">Item 2</a>
+ <a href="http://example.org/item3.html">Item 3</a>
+ """
+ response = HtmlResponse("http://example.org/index.html", body=html)
+ lx = self.extractor_cls()
+ self.assertEqual([link for link in lx.extract_links(response)], [
+ Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+ Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+ ])
+
class HtmlParserLinkExtractorTestCase(unittest.TestCase):
@@ -512,6 +538,19 @@ def test_extraction(self):
Link(url='http://www.google.com/something', text=u''),
Link(url='http://example.com/innertag.html', text=u'inner tag'),])
+ def test_link_wrong_href(self):
+ html = """
+ <a href="http://example.org/item1.html">Item 1</a>
+ <a href="http://[example.org/item2.html">Item 2</a>
+ <a href="http://example.org/item3.html">Item 3</a>
+ """
+ response = HtmlResponse("http://example.org/index.html", body=html)
+ lx = HtmlParserLinkExtractor()
+ self.assertEqual([link for link in lx.extract_links(response)], [
+ Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+ Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+ ])
+
class RegexLinkExtractorTestCase(unittest.TestCase):
@@ -528,6 +567,19 @@ def test_extraction(self):
Link(url='http://www.google.com/something', text=u''),
Link(url='http://example.com/innertag.html', text=u'inner tag'),])
+ def test_link_wrong_href(self):
+ html = """
+ <a href="http://example.org/item1.html">Item 1</a>
+ <a href="http://[example.org/item2.html">Item 2</a>
+ <a href="http://example.org/item3.html">Item 3</a>
+ """
+ response = HtmlResponse("http://example.org/index.html", body=html)
+ lx = RegexLinkExtractor()
+ self.assertEqual([link for link in lx.extract_links(response)], [
+ Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
+ Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
+ ])
+
if __name__ == "__main__":
unittest.main()