Strip control characters and non-characters before parsing

This brings Sanitize into compliance with the HTML Standard's preprocessing guidelines:
https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream

Fixes #179
rgrove · Sep 8, 2019 · 0d4158f
1 parent 18b872e
commit 0d4158f
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 99 deletions.
diff --git a/lib/sanitize.rb b/lib/sanitize.rb
@@ -19,18 +19,33 @@
 class Sanitize
   attr_reader :config
 
+  # Matches one or more control characters (except NULL and ASCII whitespace)
+  # to be removed from HTML before parsing, per the HTML living standard.
+  #
+  # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  # -   https://infra.spec.whatwg.org/#control
+  REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
+
+  # Matches one or more non-characters that should be removed from HTML before
+  # parsing, as defined by the HTML living standard.
+  #
+  # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  # -   https://infra.spec.whatwg.org/#noncharacter
+  REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
+
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
   # colon is encoded as an entity and even if it's an incomplete entity (which
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
 
-  # Matches Unicode characters that should be stripped from HTML before passing
-  # it to the parser.
+  # Matches one or more characters that should be stripped from HTML before
+  # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
+  # `REGEX_HTML_NON_CHARACTERS`.
   #
-  # http://www.w3.org/TR/unicode-xml/#Charlist
-  REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
+  # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
 
   #--
   # Class Methods

diff --git a/test/test_sanitize.rb b/test/test_sanitize.rb
@@ -38,6 +38,21 @@
         @s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>"
       end
 
+      it 'should normalize newlines' do
+        @s.document("a\r\n\n\r\r\r\nz").must_equal "<html>a\n\n\n\n\nz</html>"
+      end
+
+      it 'should strip control characters (except ASCII whitespace)' do
+        sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
+        whitespace = "\t\n\f\u0020"
+        @s.document("a#{sample_control_chars}#{whitespace}z").must_equal "<html>a#{whitespace}z</html>"
+      end
+
+      it 'should strip non-characters' do
+        sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
+        @s.document("a#{sample_non_chars}z").must_equal "<html>az</html>"
+      end
+
       describe 'when html body exceeds Nokogumbo::DEFAULT_MAX_TREE_DEPTH' do
         let(:content) do
           content = nest_html_content('<b>foo</b>', Nokogumbo::DEFAULT_MAX_TREE_DEPTH)
@@ -85,6 +100,21 @@
         @s.fragment('<b>foo</b>'.freeze).must_equal 'foo'
       end
 
+      it 'should normalize newlines' do
+        @s.fragment("a\r\n\n\r\r\r\nz").must_equal "a\n\n\n\n\nz"
+      end
+
+      it 'should strip control characters (except ASCII whitespace)' do
+        sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
+        whitespace = "\t\n\f\u0020"
+        @s.fragment("a#{sample_control_chars}#{whitespace}z").must_equal "a#{whitespace}z"
+      end
+
+      it 'should strip non-characters' do
+        sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
+        @s.fragment("a#{sample_non_chars}z").must_equal "az"
+      end
+
       describe 'when html body exceeds Nokogumbo::DEFAULT_MAX_TREE_DEPTH' do
         let(:content) do
           content = nest_html_content('<b>foo</b>', Nokogumbo::DEFAULT_MAX_TREE_DEPTH)

diff --git a/test/test_unicode.rb b/test/test_unicode.rb