From c2878f117fdd6f7ecce44f9846dc54b6cfceb48f Mon Sep 17 00:00:00 2001 From: Ruairi Fahy Date: Tue, 21 Jun 2016 11:13:55 +0100 Subject: [PATCH] Handle void tags when processing as safehtml #92 --- scrapely/extractors.py | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/scrapely/extractors.py b/scrapely/extractors.py index 26f281d..20dab22 100644 --- a/scrapely/extractors.py +++ b/scrapely/extractors.py @@ -51,7 +51,12 @@ } # tags whoose content will be completely removed (recursively) # (overrides tags_to_keep and tags_to_replace) -_TAGS_TO_PURGE = ('script', 'img', 'input') +_TAGS_TO_PURGE = ('script', 'style', 'img', 'input') +# tags that are automatically closed in HTML4 and HTML5 +_VOID_TAGS = frozenset([ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', + 'link', 'meta', 'param', 'source', 'track', 'wbr' +]) def htmlregion(text): @@ -104,7 +109,7 @@ def text(region): def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE, - tags_to_purge=_TAGS_TO_PURGE): + tags_to_purge=_TAGS_TO_PURGE): """Creates an HTML subset, using a whitelist of HTML tags. The HTML generated is safe for display on a website,without escaping and @@ -117,7 +122,7 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE, opening and closing tag is removed. For example: - >>> t = lambda s: safehtml(htmlregion(s)) + >>> t = lambda s, keep=_TAGS_TO_KEEP: safehtml(htmlregion(s), keep) >>> t(u'test test') u'test test' @@ -125,8 +130,8 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE, >>> t(u'test') u'test' - replace_tags define tags that are converted. By default all headers, bold and indenting - are converted to strong and em. + replace_tags define tags that are converted. By default all headers, bold + and indenting are converted to strong and em. >>> t(u'

header

test bold indent') u'header test bold indent' @@ -145,14 +150,27 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE, >>> t(u'

test
test

') u'

test
test

' + Include or exclude tags that you want + >>> t(u'Keep and
tags') + u'Keep and tags' + >>> tags = set(list(_TAGS_TO_KEEP)[:] + ['meta', 'hr']) + >>> t(u'Keep and
tags', tags) + u'Keep and
tags
' + + Handle void tags when purged + >>> t(u'Keep content around img tag') + u'Keep content around img tag' + """ tagstack = [] + def _process_tag(tag): tagstr = replace_tags.get(tag.tag, tag.tag) if tagstr not in allowed_tags: return if tag.tag_type == HtmlTagType.OPEN_TAG: - tagstack.append(tagstr) + if tag.tag not in _VOID_TAGS: + tagstack.append(tagstr) return u"<%s>" % tagstr elif tag.tag_type == HtmlTagType.CLOSE_TAG: try: @@ -188,7 +206,8 @@ def _process_markup(region, textf, tagf, tags_to_purge=_TAGS_TO_PURGE): tag = fragment.tag if tag in tags_to_purge: # if opening, keep going until closed - if fragment.tag_type == HtmlTagType.OPEN_TAG: + if (fragment.tag_type == HtmlTagType.OPEN_TAG and + tag not in _VOID_TAGS): for probe in fiter: if isinstance(probe, HtmlTag) and \ probe.tag == tag and \