diff --git a/lib/sanitize.rb b/lib/sanitize.rb
index 75d384e..42f18e2 100644
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@@ -26,6 +26,12 @@ class Sanitize
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
 
+  # Matches Unicode characters that should be stripped from HTML before passing
+  # it to the parser.
+  #
+  # http://www.w3.org/TR/unicode-xml/#Charlist
+  REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
+
   #--
   # Class Methods
   #++
@@ -101,7 +107,7 @@ def initialize(config = {})
   def document(html)
     return '' unless html
 
-    doc = Nokogiri::HTML5.parse(html)
+    doc = Nokogiri::HTML5.parse(preprocess(html))
     node!(doc)
     to_html(doc)
   end
@@ -113,7 +119,8 @@ def document(html)
   def fragment(html)
     return '' unless html
 
-    doc = Nokogiri::HTML5.parse("<html><body>#{html}")
+    html = preprocess(html)
+    doc = Nokogiri::HTML5.parse("<html><body>#{html}")
 
     # Hack to allow fragments containing <body>. Borrowed from
     # Nokogiri::HTML::DocumentFragment.
@@ -161,6 +168,23 @@ def node!(node)
 
   private
 
+  # Preprocesses HTML before parsing to remove undesirable Unicode chars.
+  #
+  # Works on a UTF-8 copy of the input, so the object passed in by the caller
+  # is never modified. Returns the cleaned copy.
+  def preprocess(html)
+    html = html.to_s.dup
+
+    unless html.encoding.name == 'UTF-8'
+      html.encode!('UTF-8',
+        :invalid => :replace,
+        :undef   => :replace)
+    end
+
+    html.gsub!(REGEX_UNSUITABLE_CHARS, '')
+    html
+  end
+
   def to_html(node)
     replace_meta = false
 
diff --git a/test/test_unicode.rb b/test/test_unicode.rb
new file mode 100644
index 0000000..f6d0c94
--- /dev/null
+++ b/test/test_unicode.rb
@@ -0,0 +1,84 @@
+# encoding: utf-8
+require_relative 'common'
+
+describe 'Unicode' do
+  make_my_diffs_pretty!
+  parallelize_me!
+
+  # http://www.w3.org/TR/unicode-xml/#Charlist
+  describe 'Unsuitable characters' do
+    before do
+      @s = Sanitize.new(Sanitize::Config::RELAXED)
+    end
+
+    it 'should strip deprecated grave and acute clones' do
+      @s.document("a\u0340b\u0341c").must_equal "abc\n"
+      @s.fragment("a\u0340b\u0341c").must_equal 'abc'
+    end
+
+    it 'should strip deprecated Khmer characters' do
+      @s.document("a\u17a3b\u17d3c").must_equal "abc\n"
+      @s.fragment("a\u17a3b\u17d3c").must_equal 'abc'
+    end
+
+    it 'should strip line and paragraph separator punctuation' do
+      @s.document("a\u2028b\u2029c").must_equal "abc\n"
+      @s.fragment("a\u2028b\u2029c").must_equal 'abc'
+    end
+
+    it 'should strip bidi embedding control characters' do
+      @s.document("a\u202ab\u202bc\u202cd\u202de\u202e")
+        .must_equal "abcde\n"
+
+      @s.fragment("a\u202ab\u202bc\u202cd\u202de\u202e")
+        .must_equal 'abcde'
+    end
+
+    it 'should strip deprecated symmetric swapping characters' do
+      @s.document("a\u206ab\u206bc").must_equal "abc\n"
+      @s.fragment("a\u206ab\u206bc").must_equal 'abc'
+    end
+
+    it 'should strip deprecated Arabic form shaping characters' do
+      @s.document("a\u206cb\u206dc").must_equal "abc\n"
+      @s.fragment("a\u206cb\u206dc").must_equal 'abc'
+    end
+
+    it 'should strip deprecated National digit shape characters' do
+      @s.document("a\u206eb\u206fc").must_equal "abc\n"
+      @s.fragment("a\u206eb\u206fc").must_equal 'abc'
+    end
+
+    it 'should strip interlinear annotation characters' do
+      @s.document("a\ufff9b\ufffac\ufffb").must_equal "abc\n"
+      @s.fragment("a\ufff9b\ufffac\ufffb").must_equal 'abc'
+    end
+
+    it 'should strip BOM/zero-width non-breaking space characters' do
+      @s.document("a\ufeffbc").must_equal "abc\n"
+      @s.fragment("a\ufeffbc").must_equal 'abc'
+    end
+
+    it 'should strip object replacement characters' do
+      @s.document("a\ufffcbc").must_equal "abc\n"
+      @s.fragment("a\ufffcbc").must_equal 'abc'
+    end
+
+    it 'should strip musical notation scoping characters' do
+      @s.document("a\u{1d173}b\u{1d174}c\u{1d175}d\u{1d176}e\u{1d177}f\u{1d178}g\u{1d179}h\u{1d17a}")
+        .must_equal "abcdefgh\n"
+
+      @s.fragment("a\u{1d173}b\u{1d174}c\u{1d175}d\u{1d176}e\u{1d177}f\u{1d178}g\u{1d179}h\u{1d17a}")
+        .must_equal 'abcdefgh'
+    end
+
+    it 'should strip language tag code point characters' do
+      str = 'a'
+      (0xE0000..0xE007F).each {|n| str << [n].pack('U') }
+      str << 'b'
+
+      @s.document(str).must_equal "ab\n"
+      @s.fragment(str).must_equal 'ab'
+    end
+  end
+end