Skip to content

Commit

Permalink
Strip control characters and non-characters before parsing
Browse files Browse the repository at this point in the history
This brings Sanitize into compliance with the HTML Standard's
preprocessing guidelines:

https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream

Fixes #179
  • Loading branch information
rgrove committed Sep 8, 2019
1 parent 18b872e commit 0d4158f
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 99 deletions.
23 changes: 19 additions & 4 deletions lib/sanitize.rb
Expand Up @@ -19,18 +19,33 @@
class Sanitize
attr_reader :config

# Matches one or more control characters that should be removed from HTML
# before parsing, as defined by the HTML living standard.
#
# The character class covers U+0001-U+0008, U+000B, U+000E-U+001F and
# U+007F-U+009F. It leaves out U+0000 (NULL) as well as tab (U+0009), line
# feed (U+000A), form feed (U+000C) and carriage return (U+000D), so those
# characters are never matched by this pattern.
#
# - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
# - https://infra.spec.whatwg.org/#control
REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u

# Matches one or more non-characters that should be removed from HTML before
# parsing, as defined by the HTML living standard.
#
# The character class is the range U+FDD0-U+FDEF plus the two code points
# ending in FFFE and FFFF of each plane, listed individually from
# U+FFFE/U+FFFF up to U+10FFFE/U+10FFFF.
#
# - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
# - https://infra.spec.whatwg.org/#noncharacter
REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u

# Matches an attribute value that could be treated by a browser as a URL
# with a protocol prefix, such as "http:" or "javascript:". Any string of zero
# or more characters followed by a colon is considered a match, even if the
# colon is encoded as an entity and even if it's an incomplete entity (which
# IE6 and Opera will still parse).
#
# Details of the pattern:
#
# - It is anchored at the start of the string and skips leading whitespace.
# - Capture group 1 holds the text found before the colon. That text may not
#   contain "/" or "#", so values where either character precedes the first
#   colon do not match.
# - The colon may be literal, a decimal entity ("&#58", with any number of
#   leading zeros) or a hexadecimal entity ("&#x3a", likewise); the trailing
#   semicolon of the entity is not required.
# - Matching is case-insensitive.
REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i

# Matches Unicode characters that should be stripped from HTML before passing
# it to the parser.
# Matches one or more characters that should be stripped from HTML before
# parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
# `REGEX_HTML_NON_CHARACTERS`.
#
# http://www.w3.org/TR/unicode-xml/#Charlist
REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
# https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
#
# Both component patterns are interpolated into one non-capturing alternation,
# so a single match is either a run of control characters or a run of
# non-characters.
REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u

#--
# Class Methods
Expand Down
30 changes: 30 additions & 0 deletions test/test_sanitize.rb
Expand Up @@ -38,6 +38,21 @@
@s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>"
end

it 'should normalize newlines' do
  # Input mixes CRLF pairs, a bare LF and bare CRs; five LFs are expected back.
  mixed_newlines = "a\r\n\n\r\r\r\nz"
  normalized = "<html>a\n\n\n\n\nz</html>"

  @s.document(mixed_newlines).must_equal normalized
end

it 'should strip control characters (except ASCII whitespace)' do
  # Tab, line feed, form feed and space are expected to pass through untouched.
  kept = "\t\n\f\u0020"
  stripped = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
  html = "a#{stripped}#{kept}z"

  @s.document(html).must_equal "<html>a#{kept}z</html>"
end

it 'should strip non-characters' do
  # Endpoints of U+FDD0-U+FDEF plus the FFFE/FFFF pair of every plane.
  non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
  result = @s.document("a#{non_chars}z")

  result.must_equal "<html>az</html>"
end

describe 'when html body exceeds Nokogumbo::DEFAULT_MAX_TREE_DEPTH' do
let(:content) do
content = nest_html_content('<b>foo</b>', Nokogumbo::DEFAULT_MAX_TREE_DEPTH)
Expand Down Expand Up @@ -85,6 +100,21 @@
@s.fragment('<b>foo</b>'.freeze).must_equal 'foo'
end

it 'should normalize newlines' do
  # Input mixes CRLF pairs, a bare LF and bare CRs; five LFs are expected back.
  mixed_newlines = "a\r\n\n\r\r\r\nz"
  normalized = "a\n\n\n\n\nz"

  @s.fragment(mixed_newlines).must_equal normalized
end

it 'should strip control characters (except ASCII whitespace)' do
  # Tab, line feed, form feed and space are expected to pass through untouched.
  kept = "\t\n\f\u0020"
  stripped = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
  html = "a#{stripped}#{kept}z"

  @s.fragment(html).must_equal "a#{kept}z"
end

it 'should strip non-characters' do
  # Endpoints of U+FDD0-U+FDEF plus the FFFE/FFFF pair of every plane.
  non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
  result = @s.fragment("a#{non_chars}z")

  result.must_equal "az"
end

describe 'when html body exceeds Nokogumbo::DEFAULT_MAX_TREE_DEPTH' do
let(:content) do
content = nest_html_content('<b>foo</b>', Nokogumbo::DEFAULT_MAX_TREE_DEPTH)
Expand Down
95 changes: 0 additions & 95 deletions test/test_unicode.rb

This file was deleted.

0 comments on commit 0d4158f

Please sign in to comment.