Skip to content

Commit

Permalink
Merge pull request #565 from dangra/562-sgmlinkextractor
Browse files Browse the repository at this point in the history
Replace unencodable code points with XML character references (numeric HTML entities)
  • Loading branch information
pablohoffman committed Jan 28, 2014
2 parents 116a1df + 66829c9 commit 21f0e40
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 1 deletion.
2 changes: 1 addition & 1 deletion scrapy/contrib/linkextractors/sgml.py
Expand Up @@ -121,7 +121,7 @@ def extract_links(self, response):
body = u''.join(f
for x in self.restrict_xpaths
for f in sel.xpath(x).extract()
).encode(response.encoding)
).encode(response.encoding, errors='xmlcharrefreplace')
else:
body = response.body

Expand Down
7 changes: 7 additions & 0 deletions scrapy/tests/test_contrib_linkextractors.py
Expand Up @@ -236,6 +236,13 @@ def test_restrict_xpaths_encoding(self):
self.assertEqual(lx.extract_links(response),
[Link(url='http://example.org/about.html', text=u'About us\xa3')])

def test_restrict_xpaths_with_html_entities(self):
# Checks link extraction under restrict_xpaths when the href contains
# HTML entities (&hearts; and &euro;) and the response is declared as
# iso8859-15.
# NOTE(review): presumably this guards the restrict_xpaths code path that
# re-encodes the selected fragment to the response encoding; the heart
# character has no iso8859-15 representation -- confirm against
# SgmlLinkExtractor.extract_links.
html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='iso8859-15')
# Only links found under <p> elements are considered.
links = SgmlLinkExtractor(restrict_xpaths='//p').extract_links(response)
# The expected URL is absolute, with both entity characters percent-encoded
# as their UTF-8 byte sequences (%E2%99%A5 and %E2%82%AC).
self.assertEqual(links,
[Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC', text=u'text')])

def test_restrict_xpaths_concat_in_handle_data(self):
"""html entities cause SGMLParser to call handle_data hook twice"""
body = """<html><body><div><a href="/foo">&gt;\xbe\xa9&lt;\xb6\xab</a></body></html>"""
Expand Down

0 comments on commit 21f0e40

Please sign in to comment.