Strip unsuitable Unicode characters before parsing HTML.

http://www.w3.org/TR/unicode-xml/#Charlist Closes #106.
rgrove · Jun 13, 2014 · bf0d753 · bf0d753
1 parent 6ac8080
commit bf0d753
Show file tree

Hide file tree

Showing 2 changed files with 107 additions and 2 deletions.
diff --git a/lib/sanitize.rb b/lib/sanitize.rb
@@ -26,6 +26,12 @@ class Sanitize
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
 
+  # Matches Unicode characters that should be stripped from HTML before passing
+  # it to the parser.
+  #
+  # http://www.w3.org/TR/unicode-xml/#Charlist
+  REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
+
   #--
   # Class Methods
   #++
@@ -101,7 +107,7 @@ def initialize(config = {})
   def document(html)
     return '' unless html
 
-    doc = Nokogiri::HTML5.parse(html)
+    doc = Nokogiri::HTML5.parse(preprocess(html))
     node!(doc)
     to_html(doc)
   end
@@ -113,7 +119,8 @@ def document(html)
   def fragment(html)
     return '' unless html
 
-    doc = Nokogiri::HTML5.parse("<html><body>#{html}")
+    html = preprocess(html)
+    doc  = Nokogiri::HTML5.parse("<html><body>#{html}")
 
     # Hack to allow fragments containing <body>. Borrowed from
     # Nokogiri::HTML::DocumentFragment.
@@ -161,6 +168,20 @@ def node!(node)
 
   private
 
+  # Returns a UTF-8 copy of html (via to_s) with unsuitable Unicode chars removed.
+  def preprocess(html)
+    html = html.to_s.dup
+    # Work on the copy: encode! and gsub! below modify their receiver in place.
+    unless html.encoding.name == 'UTF-8'
+      html.encode!('UTF-8',
+        :invalid => :replace,
+        :undef   => :replace)
+    end
+
+    html.gsub!(REGEX_UNSUITABLE_CHARS, '')
+    html
+  end
+
   def to_html(node)
     replace_meta = false
 

diff --git a/test/test_unicode.rb b/test/test_unicode.rb
@@ -0,0 +1,84 @@
+# encoding: utf-8
+require_relative 'common'
+
+describe 'Unicode' do
+  make_my_diffs_pretty!
+  parallelize_me!
+
+  # http://www.w3.org/TR/unicode-xml/#Charlist
+  describe 'Unsuitable characters' do
+    before do
+      @s = Sanitize.new(Sanitize::Config::RELAXED)
+    end
+
+    it 'should strip deprecated grave and acute clones' do
+      @s.document("a\u0340b\u0341c").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.fragment("a\u0340b\u0341c").must_equal 'abc'
+    end
+
+    it 'should strip deprecated Khmer characters' do
+      @s.document("a\u17a3b\u17d3c").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.fragment("a\u17a3b\u17d3c").must_equal 'abc'
+    end
+
+    it 'should strip line and paragraph separator punctuation' do
+      @s.document("a\u2028b\u2029c").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.fragment("a\u2028b\u2029c").must_equal 'abc'
+    end
+
+    it 'should strip bidi embedding control characters' do
+      @s.document("a\u202ab\u202bc\u202cd\u202de\u202e")
+        .must_equal "<html><head></head><body>abcde</body></html>\n"
+
+      @s.fragment("a\u202ab\u202bc\u202cd\u202de\u202e")
+        .must_equal 'abcde'
+    end
+
+    it 'should strip deprecated symmetric swapping characters' do
+      @s.document("a\u206ab\u206bc").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.fragment("a\u206ab\u206bc").must_equal 'abc'
+    end
+
+    it 'should strip deprecated Arabic form shaping characters' do
+      @s.document("a\u206cb\u206dc").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.fragment("a\u206cb\u206dc").must_equal 'abc'
+    end
+
+    it 'should strip deprecated National digit shape characters' do
+      @s.document("a\u206eb\u206fc").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.fragment("a\u206eb\u206fc").must_equal 'abc'
+    end
+
+    it 'should strip interlinear annotation characters' do
+      @s.document("a\ufff9b\ufffac\ufffb").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.fragment("a\ufff9b\ufffac\ufffb").must_equal 'abc'
+    end
+
+    it 'should strip BOM/zero-width non-breaking space characters' do
+      @s.document("a\ufeffbc").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.fragment("a\ufeffbc").must_equal 'abc'
+    end
+
+    it 'should strip object replacement characters' do
+      @s.document("a\ufffcbc").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.fragment("a\ufffcbc").must_equal 'abc'
+    end
+
+    it 'should strip musical notation scoping characters' do
+      @s.document("a\u{1d173}b\u{1d174}c\u{1d175}d\u{1d176}e\u{1d177}f\u{1d178}g\u{1d179}h\u{1d17a}")
+        .must_equal "<html><head></head><body>abcdefgh</body></html>\n"
+
+      @s.fragment("a\u{1d173}b\u{1d174}c\u{1d175}d\u{1d176}e\u{1d177}f\u{1d178}g\u{1d179}h\u{1d17a}")
+        .must_equal 'abcdefgh'
+    end
+
+    it 'should strip language tag code point characters' do
+      str = 'a'
+      (0xE0000..0xE007F).each {|n| str << [n].pack('U') }
+      str << 'b'
+
+      @s.document(str).must_equal "<html><head></head><body>ab</body></html>\n"
+      @s.fragment(str).must_equal 'ab'
+    end
+  end
+end