Skip to content

Commit

Permalink
Merge pull request #1669 from redapple/bogus-links
Browse files Browse the repository at this point in the history
[MRG+1] [1.0.x backport] [LinkExtractors] Ignore bogus links
  • Loading branch information
eliasdorneles committed Jan 19, 2016
2 parents 4b3ea07 + 108195e commit 9396e0e
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 9 deletions.
5 changes: 4 additions & 1 deletion scrapy/linkextractors/htmlparser.py
Expand Up @@ -41,7 +41,10 @@ def _extract_links(self, response_text, response_url, response_encoding):
for link in links:
if isinstance(link.url, unicode):
link.url = link.url.encode(response_encoding)
link.url = urljoin(base_url, link.url)
try:
link.url = urljoin(base_url, link.url)
except ValueError:
continue
link.url = safe_url_string(link.url, response_encoding)
link.text = link.text.decode(response_encoding)
ret.append(link)
Expand Down
12 changes: 8 additions & 4 deletions scrapy/linkextractors/lxmlhtml.py
Expand Up @@ -49,10 +49,14 @@ def _extract_links(self, selector, response_url, response_encoding, base_url):
# hacky way to get the underlying lxml parsed document
for el, attr, attr_val in self._iter_links(selector._root):
# pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
attr_val = urljoin(base_url, attr_val)
url = self.process_attr(attr_val)
if url is None:
continue
try:
attr_val = urljoin(base_url, attr_val)
except ValueError:
continue # skipping bogus links
else:
url = self.process_attr(attr_val)
if url is None:
continue
if isinstance(url, unicode):
url = url.encode(response_encoding)
# to fix relative links after process_value
Expand Down
15 changes: 12 additions & 3 deletions scrapy/linkextractors/regex.py
Expand Up @@ -14,16 +14,25 @@ def clean_link(link_text):
"""Remove leading and trailing whitespace and punctuation"""
return link_text.strip("\t\r\n '\"")


class RegexLinkExtractor(SgmlLinkExtractor):
"""High performant link extractor"""

def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
def clean_text(text):
return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()

def clean_url(url):
clean_url = ''
try:
clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
except ValueError:
pass
return clean_url

if base_url is None:
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

links_text = linkre.findall(response_text)
return [Link(clean_url(url).encode(response_encoding),
clean_text(text))
Expand Down
5 changes: 4 additions & 1 deletion scrapy/linkextractors/sgml.py
Expand Up @@ -42,7 +42,10 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur
for link in self.links:
if isinstance(link.url, unicode):
link.url = link.url.encode(response_encoding)
link.url = urljoin(base_url, link.url)
try:
link.url = urljoin(base_url, link.url)
except ValueError:
continue
link.url = safe_url_string(link.url, response_encoding)
link.text = str_to_unicode(link.text, response_encoding, errors='replace').strip()
ret.append(link)
Expand Down
52 changes: 52 additions & 0 deletions tests/test_linkextractors.py
Expand Up @@ -491,10 +491,36 @@ def test_xhtml(self):
Link(url='http://example.com/nofollow2.html', text=u'Choose to follow or not', fragment='', nofollow=False)]
)

def test_link_wrong_href(self):
    """A link whose href makes urljoin() raise (malformed bracketed
    netloc) must be silently dropped; valid neighbours survive."""
    page = """
<a href="http://example.org/item1.html">Item 1</a>
<a href="http://[example.org/item2.html">Item 2</a>
<a href="http://example.org/item3.html">Item 3</a>
"""
    response = HtmlResponse("http://example.org/index.html", body=page)
    extractor = self.extractor_cls()
    extracted = list(extractor.extract_links(response))
    self.assertEqual(extracted, [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ])


class LxmlLinkExtractorTestCase(SgmlLinkExtractorTestCase):
    """Runs the shared SGML extractor suite against LxmlLinkExtractor."""

    extractor_cls = LxmlLinkExtractor

    def test_link_wrong_href(self):
        """A link whose href makes urljoin() raise (malformed bracketed
        netloc) must be silently dropped; valid neighbours survive."""
        page = """
<a href="http://example.org/item1.html">Item 1</a>
<a href="http://[example.org/item2.html">Item 2</a>
<a href="http://example.org/item3.html">Item 3</a>
"""
        response = HtmlResponse("http://example.org/index.html", body=page)
        extractor = self.extractor_cls()
        extracted = list(extractor.extract_links(response))
        self.assertEqual(extracted, [
            Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
            Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
        ])


class HtmlParserLinkExtractorTestCase(unittest.TestCase):

Expand All @@ -512,6 +538,19 @@ def test_extraction(self):
Link(url='http://www.google.com/something', text=u''),
Link(url='http://example.com/innertag.html', text=u'inner tag'),])

def test_link_wrong_href(self):
    """HtmlParserLinkExtractor skips an href that urljoin() rejects
    and still yields the links before and after it."""
    page = """
<a href="http://example.org/item1.html">Item 1</a>
<a href="http://[example.org/item2.html">Item 2</a>
<a href="http://example.org/item3.html">Item 3</a>
"""
    response = HtmlResponse("http://example.org/index.html", body=page)
    extractor = HtmlParserLinkExtractor()
    extracted = list(extractor.extract_links(response))
    self.assertEqual(extracted, [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ])


class RegexLinkExtractorTestCase(unittest.TestCase):

Expand All @@ -528,6 +567,19 @@ def test_extraction(self):
Link(url='http://www.google.com/something', text=u''),
Link(url='http://example.com/innertag.html', text=u'inner tag'),])

def test_link_wrong_href(self):
    """RegexLinkExtractor skips an href that urljoin() rejects
    and still yields the links before and after it."""
    page = """
<a href="http://example.org/item1.html">Item 1</a>
<a href="http://[example.org/item2.html">Item 2</a>
<a href="http://example.org/item3.html">Item 3</a>
"""
    response = HtmlResponse("http://example.org/index.html", body=page)
    extractor = RegexLinkExtractor()
    extracted = list(extractor.extract_links(response))
    self.assertEqual(extracted, [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ])


if __name__ == "__main__":
unittest.main()

0 comments on commit 9396e0e

Please sign in to comment.