Skip to content
Permalink
Browse files

SgmlLinkExtractor - fix for parsing <area> tag with Unicode present

  • Loading branch information
yakxxx authored and dangra committed Sep 2, 2014
1 parent 4677eed commit 49b40f01ad168cef9bc03c953480397234b13877
Showing with 12 additions and 0 deletions.
  1. +1 −0 scrapy/contrib/linkextractors/sgml.py
  2. +11 −0 scrapy/tests/test_contrib_linkextractors.py
@@ -59,6 +59,7 @@ def reset(self):
FixedSGMLParser.reset(self)
self.links = []
self.base_url = None
self.current_link = None

def unknown_starttag(self, tag, attrs):
if tag == 'base':
@@ -284,6 +284,17 @@ def test_restrict_xpaths_concat_in_handle_data(self):
[Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
fragment='', nofollow=False)])

def test_area_tag_with_unicode_present(self):
    """Check that an <area> link is extracted when non-ASCII bytes precede it.

    The same extractor instance is run over the response several times and
    only the result of the last run is compared with the expected link.
    """
    html = """<html><body>\xbe\xa9<map><area href="http://example.org/foo" /></map></body></html>"""
    page = HtmlResponse("http://example.org", body=html, encoding='utf-8')
    extractor = self.extractor_cls()
    # Three preliminary passes on the same extractor; their results are
    # discarded and only the fourth pass is asserted on.
    for _ in range(3):
        extractor.extract_links(page)
    found = extractor.extract_links(page)
    self.assertEqual(
        found,
        [Link(url='http://example.org/foo', text=u'',
              fragment='', nofollow=False)],
    )

def test_encoded_url(self):
body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')

0 comments on commit 49b40f0

Please sign in to comment.