Skip to content
Browse files

move base HTMLParser to lib/__init__.py

Also fix an issue with <tag foo>, where the attribute foo has no value, which
caused an uncaught exception in cgi.escape.

Sharing the same base parser code between hyphenation.py and summarize.py
should prevent further serious bugs in the HTMLParser subclasses.
  • Loading branch information...
1 parent bfdeceb commit 42ef5a07a436c3372ceb747f6fa2eb02765d86cc @posativ committed Apr 12, 2012
Showing with 128 additions and 69 deletions.
  1. +2 −38 acrylamid/filters/hyphenation.py
  2. +14 −31 acrylamid/filters/summarize.py
  3. +56 −0 acrylamid/lib/__init__.py
  4. +56 −0 tests/test_lib.py
View
40 acrylamid/filters/hyphenation.py
@@ -4,11 +4,11 @@
from acrylamid.filters import Filter
from acrylamid.filters import log
from acrylamid.utils import cached_property
+from acrylamid.lib import HTMLParser, HTMLParseError
import re
import os
import codecs
-from HTMLParser import HTMLParser, HTMLParseError
from cgi import escape
from os.path import join, dirname, basename
@@ -103,27 +103,10 @@ class Separator(HTMLParser):
math and em tags."""
def __init__(self, html, hyphenationfunc, length=10):
- HTMLParser.__init__(self)
self.hyphenate = hyphenationfunc
self.length = length
- self.result = []
- self.stack = []
- self.feed(html)
-
- def handle_starttag(self, tag, attrs):
- """Apply and stack each read tag until we reach maxword."""
-
- def tagify(tag, attrs):
- """convert parsed tag back into a html tag"""
- if attrs:
- return '<%s %s>' % (tag, ' '.join(['%s="%s"' % (k, escape(v))
- for k, v in attrs]))
- else:
- return '<%s>' % tag
-
- self.stack.append(tag)
- self.result.append(tagify(tag, attrs))
+ HTMLParser.__init__(self, html)
def handle_data(self, data):
"""Hyphenate words longer than 10 characters."""
@@ -139,25 +122,6 @@ def handle_data(self, data):
self.result.append(data)
- def handle_endtag(self, tag):
- """Until we reach not the maxwords limit, we can safely pop every ending tag,
- added by handle_starttag. Afterwards, we apply missing endings tags if missing."""
- try:
- self.stack.pop()
- except IndexError:
- pass
- self.result.append('</%s>' % tag)
-
- def handle_startendtag(self, tag, attrs):
- s = '<%s %s/>' % (tag, ' '.join(['%s="%s"' % (k, escape(v)) for k, v in attrs]))
- self.result.append(s)
-
- def handle_entityref(self, name):
- self.result.append('&' + name + ';')
-
- def handle_charref(self, char):
- self.result.append('&#' + char + ';')
-
def build(lang):
"""build the Hyphenator from given language. If you want add more, see
View
45 acrylamid/filters/summarize.py
@@ -4,39 +4,25 @@
from acrylamid import log
from acrylamid.filters import Filter
-from HTMLParser import HTMLParser, HTMLParseError
+from acrylamid.lib import HTMLParser, HTMLParseError
from cgi import escape
class Summarizer(HTMLParser):
def __init__(self, text, href, link, mode, maxwords=100):
- HTMLParser.__init__(self)
-
self.maxwords = maxwords
self.href = href
self.link = link
self.mode = mode
-
- self.summarized = []
self.words = 0
- self.stack = []
- self.feed(text)
+ HTMLParser.__init__(self, text)
def handle_starttag(self, tag, attrs):
# Apply and stack each tag until we reach maxword.
-
- def tagify(tag, attrs):
- if attrs:
- return '<%s %s>' % (tag, ' '.join(['%s="%s"' % (k, escape(v))
- for k, v in attrs]))
- else:
- return '<%s>' % tag
-
if self.words < self.maxwords:
- self.stack.append(tag)
- self.summarized.append(tagify(tag, attrs))
+ super(Summarizer, self).handle_starttag(tag, attrs)
def handle_data(self, data):
# append words
@@ -45,51 +31,48 @@ def handle_data(self, data):
else:
ws = data.count(' ')
if ws + self.words < self.maxwords:
- self.summarized.append(data)
+ self.result.append(data)
else:
words = data.split(' ')
- self.summarized.append(' '.join(words[:self.maxwords - self.words]) + ' ')
+ self.result.append(' '.join(words[:self.maxwords - self.words]) + ' ')
self.words += ws
def handle_endtag(self, tag):
# If we are behind the word limit, append out link in various modes, else append tag
if self.words < self.maxwords:
- self.summarized.append('</%s>' % self.stack.pop())
+ self.result.append('</%s>' % self.stack.pop())
elif self.stack:
# this injects the link to the end of the current tag
if self.mode == 0:
- self.summarized.append(self.link % self.href)
+ self.result.append(self.link % self.href)
# now we append all stored tags
for x in self.stack[:]:
# this adds the link if it's not inside a given tag, prefered way
if self.mode == 1:
if not filter(lambda t: t in ['code', 'pre', 'b', 'a', 'em'], self.stack):
- self.summarized.append(self.link % self.href)
+ self.result.append(self.link % self.href)
self.mode = -1
- self.summarized.append('</%s>' % self.stack.pop())
+ self.result.append('</%s>' % self.stack.pop())
# this adds the link when the stack is empty
if self.mode == 2:
- self.summarized.append(self.link % self.href)
+ self.result.append(self.link % self.href)
def handle_startendtag(self, tag, attrs):
if self.words < self.maxwords:
- s = '<%s %s/>' % (tag, ' '.join(['%s="%s"' % (k, escape(v)) for k, v in attrs]))
- self.summarized.append(s)
+ super(Summarizer, self).handle_startendtag(tag, attrs)
def handle_entityref(self, entity):
- # handle &shy; correctly
if self.words < self.maxwords:
- self.summarized.append('&' + entity + ';')
+ super(Summarizer, self).handle_entityref(entity)
def handle_charref(self, char):
- # handle charrefs
if self.words < self.maxwords:
- self.summarized.append('&#' + char + ';')
+ super(Summarizer, self).handle_charref(char)
class Summarize(Filter):
@@ -119,7 +102,7 @@ def transform(self, content, entry, *args):
try:
X = Summarizer(content, self.path+entry.permalink, self.link, self.mode, maxwords)
- return ''.join(X.summarized)
+ return ''.join(X.result)
except HTMLParseError as e:
log.warn('%s: %s in %s' % (e.__class__.__name__, e.msg, entry.filename))
return content
View
56 acrylamid/lib/__init__.py
@@ -0,0 +1,56 @@
+# -*- encoding: utf-8 -*-
+#
+# Copyright 2012 posativ <info@posativ.org>. All rights reserved.
+# License: BSD Style, 2 clauses. see acrylamid/__init__.py
+
+from HTMLParser import HTMLParser as SystemsParser, HTMLParseError
+from cgi import escape
+
+
+def format(attrs):
+ res = []
+ for key, value in attrs:
+ if value is None:
+ res.append(key)
+ else:
+ res.append('%s="%s"' % (key, escape(value)))
+ return ' '.join(res)
+
+
+class HTMLParser(object, SystemsParser):
+ """A more useful base HTMLParser. This class contains the processed but untouched
+ result in self.result. It is intended to use this class to avoid HTML mess up."""
+
+ def __init__(self, html):
+ SystemsParser.__init__(self)
+ self.result = []
+ self.stack = []
+
+ self.feed(html)
+
+ def handle_starttag(self, tag, attrs):
+
+ self.stack.append(tag)
+ self.result.append('<%s %s>' % (tag, format(attrs)) if attrs else '<%s>' % tag)
+
+ def handle_data(self, data):
+ self.result.append(data)
+
+ def handle_endtag(self, tag):
+ try:
+ self.stack.pop()
+ except IndexError:
+ pass
+ self.result.append('</%s>' % tag)
+
+ def handle_startendtag(self, tag, attrs):
+ self.result.append('<%s %s/>' % (tag, format(attrs)))
+
+ def handle_entityref(self, name):
+ self.result.append('&' + name + ';')
+
+ def handle_charref(self, char):
+ self.result.append('&#' + char + ';')
+
+
+__all__ = ['HTMLParser', 'HTMLParseError']
View
56 tests/test_lib.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+try:
+ import unittest2 as unittest
+except ImportError:
+ import unittest # NOQA
+
+from acrylamid.lib import HTMLParser
+f = lambda x: ''.join(HTMLParser(x).result)
+
+
+class TestHTMLParser(unittest.TestCase):
+
+ def test_starttag(self):
+
+ examples = [
+ '<p></p>',
+ '<p id="foo"></p>',
+ '<script src="/js/foo.js" type="text/javascript"></script>',
+ '<iframe allowfullscreen></iframe>',
+ ]
+
+ for example in examples:
+ self.assertEqual(f(example), example)
+
+ def test_data(self):
+ assert f('<p>Data!1</p>') == '<p>Data!1</p>'
+
+ def test_endtag(self):
+
+ examples = [
+ '<p></p></p>',
+ '</p>'*3,
+ ]
+
+ for example in examples:
+ self.assertEqual(f(example), example)
+
+ def test_startendtag(self):
+
+ examples = [
+ '<br />',
+ '<link id="foo" attr="bar"/>'
+ ]
+
+ for example in examples:
+ self.assertEqual(f(example), example)
+
+ def test_entityref(self):
+
+ self.assertEqual(f('<span>&amp;</span>'), '<span>&amp;</span>')
+ self.assertEqual(f('<span>&foo;</span>'), '<span>&foo;</span>')
+
+ def test_charref(self):
+
+ self.assertEqual(f('<span>&#1234;</span>'), '<span>&#1234;</span>')

0 comments on commit 42ef5a0

Please sign in to comment.
Something went wrong with that request. Please try again.