Skip to content

Commit

Permalink
Merge pull request #485 from gloomy-ghost/bleach
Browse files: browse the repository at this point in the history
 Use bleach to sanitize HTML
  • Loading branch information
jelmer committed Oct 1, 2018
2 parents 8e37a88 + 13426ca commit f6271a0
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 40 deletions.
8 changes: 3 additions & 5 deletions isso/tests/test_html.py
Expand Up @@ -59,7 +59,6 @@ def test_github_flavoured_markdown(self):
print("Hello, World")
</code></pre>""")

@unittest.skipIf(html.HTML5LIB_VERSION <= html.HTML5LIB_SIMPLETREE, "backport")
def test_sanitizer(self):
sanitizer = html.Sanitizer(elements=[], attributes=[])
examples = [
Expand All @@ -73,19 +72,18 @@ def test_sanitizer(self):

for (input, expected) in examples:
if isinstance(expected, list):
self.assertIn(html.sanitize(sanitizer, input), expected)
self.assertIn(sanitizer.sanitize(input), expected)
else:
self.assertEqual(html.sanitize(sanitizer, input), expected)
self.assertEqual(sanitizer.sanitize(input), expected)

@unittest.skipIf(html.HTML5LIB_VERSION <= html.HTML5LIB_SIMPLETREE, "backport")
def test_sanitizer_extensions(self):
sanitizer = html.Sanitizer(elements=["img"], attributes=["src"])
examples = [
('<img src="cat.gif" />', '<img src="cat.gif">'),
('<script src="doge.js"></script>', '')]

for (input, expected) in examples:
self.assertEqual(html.sanitize(sanitizer, input), expected)
self.assertEqual(sanitizer.sanitize(input), expected)

def test_render(self):
conf = config.new({
Expand Down
56 changes: 24 additions & 32 deletions isso/utils/html.py
Expand Up @@ -6,61 +6,53 @@

from distutils.version import LooseVersion as Version

HTML5LIB_VERSION = Version(pkg_resources.get_distribution("html5lib").version)
HTML5LIB_SIMPLETREE = Version("0.95")

import html5lib
from html5lib.sanitizer import HTMLSanitizer
from html5lib.serializer import HTMLSerializer

import bleach
import misaka


def Sanitizer(elements, attributes):

class Inner(HTMLSanitizer):
class Sanitizer(object):

def __init__(self, elements, attributes):
# attributes found in Sundown's HTML serializer [1]
# except for <img> tag,
# because images are not generated anyways.
#
# [1] https://github.com/vmg/sundown/blob/master/html/html.c
allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li",
self.elements = ["a", "p", "hr", "br", "ol", "ul", "li",
"pre", "code", "blockquote",
"del", "ins", "strong", "em",
"h1", "h2", "h3", "h4", "h5", "h6",
"table", "thead", "tbody", "th", "td"] + elements

# href for <a> and align for <table>
allowed_attributes = ["align", "href"] + attributes

# remove disallowed tokens from the output
def disallowed_token(self, token, token_type):
return None
self.attributes = ["align", "href"] + attributes

return Inner


def sanitize(tokenizer, document):
def sanitize(self, text):
clean_html = bleach.clean(text, tags=self.elements,
attributes=self.attributes, strip=True)

parser = html5lib.HTMLParser(tokenizer=tokenizer)
domtree = parser.parseFragment(document)
def set_links(attrs, new=False):
href_key = (None, u'href')

if HTML5LIB_VERSION > HTML5LIB_SIMPLETREE:
builder = "etree"
if href_key not in attrs:
return attrs
if attrs[href_key].startswith(u'mailto:'):
return attrs

for link in domtree.findall(".//{http://www.w3.org/1999/xhtml}a"):
if link.get('href', None):
link.set("rel", "nofollow noopener")
rel_key = (None, u'rel')
rel_values = [val for val in attrs.get(rel_key, u'').split(u' ') if val]

else:
builder = "simpletree"
for value in [u'nofollow', u'noopener']:
if value not in [rel_val.lower() for rel_val in rel_values]:
rel_values.append(value)

stream = html5lib.treewalkers.getTreeWalker(builder)(domtree)
serializer = HTMLSerializer(
quote_attr_values=True, omit_optional_tags=False)
attrs[rel_key] = u' '.join(rel_values)
return attrs

return serializer.render(stream)
linker = bleach.linkifier.Linker(callbacks=[set_links])
return linker.linkify(clean_html)


def Markdown(extensions=("strikethrough", "superscript", "autolink",
Expand Down Expand Up @@ -100,7 +92,7 @@ def __init__(self, conf):
conf.getlist("allowed-elements"),
conf.getlist("allowed-attributes"))

self._render = lambda text: sanitize(sanitizer, parser(text))
self._render = lambda text: sanitizer.sanitize(parser(text))

# Render ``text`` by passing it to the callable stored in ``self._render`` and returning that callable's result.
def render(self, text):
return self._render(text)
4 changes: 2 additions & 2 deletions setup.py
Expand Up @@ -5,8 +5,8 @@

from setuptools import setup, find_packages

requires = ['itsdangerous', 'Jinja2', 'misaka>=2.0,<3.0', 'html5lib<0.9999999',
'werkzeug>=0.9']
requires = ['itsdangerous', 'Jinja2', 'misaka>=2.0,<3.0', 'html5lib',
'werkzeug>=0.9', 'bleach']

if sys.version_info < (2, 7):
raise SystemExit("Python 2 versions < 2.7 are not supported.")
Expand Down
3 changes: 2 additions & 1 deletion tox.ini
Expand Up @@ -15,7 +15,8 @@ deps =

[testenv:debian]
deps=
html5lib==0.95
bleach
html5lib
ipaddr==2.1.10
itsdangerous==0.22
misaka==1.0.2
Expand Down

0 comments on commit f6271a0

Please sign in to comment.