Merge pull request #565 from dangra/562-sgmlinkextractor

Replace unencodable code points with numeric character references (e.g. `&#9829;`) when encoding the restricted body
scrapy · Jan 28, 2014 · 21f0e40 · 21f0e40
2 parents 116a1df + 66829c9
commit 21f0e40
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 1 deletion.
diff --git a/scrapy/contrib/linkextractors/sgml.py b/scrapy/contrib/linkextractors/sgml.py
@@ -121,7 +121,7 @@ def extract_links(self, response):
             body = u''.join(f
                             for x in self.restrict_xpaths
                             for f in sel.xpath(x).extract()
-                            ).encode(response.encoding)
+                            ).encode(response.encoding, errors='xmlcharrefreplace')
         else:
             body = response.body
 

diff --git a/scrapy/tests/test_contrib_linkextractors.py b/scrapy/tests/test_contrib_linkextractors.py
@@ -236,6 +236,13 @@ def test_restrict_xpaths_encoding(self):
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/about.html', text=u'About us\xa3')])
 
+    def test_restrict_xpaths_with_html_entities(self):
+        html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
+        response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='iso8859-15')
+        links = SgmlLinkExtractor(restrict_xpaths='//p').extract_links(response)
+        self.assertEqual(links,
+                         [Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC', text=u'text')])
+
     def test_restrict_xpaths_concat_in_handle_data(self):
         """html entities cause SGMLParser to call handle_data hook twice"""
         body = """<html><body><div><a href="/foo">&gt;\xbe\xa9&lt;\xb6\xab</a></body></html>"""