Browse files

Unescape HTML entities in the parser, fixes #14

  • Loading branch information...
1 parent 466656a · commit 3ff5c18072574d2de0c879b5d8d798412c7833a4 · @coleifer committed Dec 19, 2012
Showing 3 changed files with 18 additions and 3 deletions.
  1. +2 −2 micawber/parsers.py
  2. +15 −0 micawber/tests.py
  3. +1 −1 runtests.py
View
4 micawber/parsers.py
@@ -118,7 +118,7 @@ def parse_html(html, providers, urlize_all=True, handler=full_handler, block_han
if not BeautifulSoup:
raise Exception('Unable to parse HTML, please install BeautifulSoup or use the text parser')
- soup = BeautifulSoup(html)
+ soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
for url in soup.findAll(text=re.compile(url_re)):
if not _inside_skip(url):
@@ -136,7 +136,7 @@ def extract_html(html, providers, **params):
if not BeautifulSoup:
raise Exception('Unable to parse HTML, please install BeautifulSoup or use the text parser')
- soup = BeautifulSoup(html)
+ soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
all_urls = set()
urls = []
extracted_urls = {}
View
15 micawber/tests.py
@@ -217,3 +217,18 @@ def test_outside_of_markup(self):
for url, expected in self.full_pairs.items():
parsed = parse_html(frame % (url), test_pr)
self.assertEqual(parsed, frame % (expected))
+
+ def test_html_entities(self):
+ frame_html = '<p>test %s</p><p><a href="foo">%s</a></p>'
+
+ for url, expected in self.data_pairs.items():
+ esc_url = url.replace('&', '&amp;')
+ all_urls, extracted = extract_html(frame_html % (esc_url, esc_url), test_pr)
+ self.assertEqual(all_urls, [url])
+
+ if 'url' not in expected:
+ expected['url'] = url
+ self.assertEqual(extracted, {url: expected})
+
+ rendered = parse_html('<p>%s</p>' % esc_url, test_pr)
+ self.assertEqual(rendered, '<p>%s</p>' % self.full_pairs[url])
View
2 runtests.py
@@ -11,7 +11,7 @@ def run_django_tests():
import django
except ImportError:
print 'Skipping django tests'
- return failures, errors
+ return
else:
print 'Running django integration tests'

0 comments on commit 3ff5c18

Please sign in to comment.