Skip to content

Commit

Permalink
Strip unsuitable Unicode characters before parsing HTML.
Browse files Browse the repository at this point in the history
  • Loading branch information
rgrove committed Jun 13, 2014
1 parent 6ac8080 commit bf0d753
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 2 deletions.
25 changes: 23 additions & 2 deletions lib/sanitize.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ class Sanitize
# IE6 and Opera will still parse).
REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i

# Matches Unicode characters that should be stripped from HTML before passing
# it to the parser. The character class covers, in order:
#
#   U+0340, U+0341       deprecated grave and acute clones
#   U+17A3, U+17D3       deprecated Khmer characters
#   U+2028, U+2029       line and paragraph separators
#   U+202A..U+202E       bidi embedding controls
#   U+206A..U+206F       deprecated symmetric swapping, Arabic form shaping
#                        and national digit shape characters
#   U+FFF9..U+FFFB       interlinear annotation characters
#   U+FEFF               BOM / zero-width non-breaking space
#   U+FFFC               object replacement character
#   U+1D173..U+1D17A     musical notation scoping characters
#   U+E0000..U+E007F     language tag code points
#
# http://www.w3.org/TR/unicode-xml/#Charlist
REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u

#--
# Class Methods
#++
Expand Down Expand Up @@ -101,7 +107,7 @@ def initialize(config = {})
# Parses +html+ as a full HTML5 document, runs node! over the parsed
# document, and returns the result of to_html. Returns an empty string when
# +html+ is nil or false.
def document(html)
return '' unless html

# NOTE(review): this listing is a rendered diff. The next line appears to be
# the pre-change statement and the line after it its replacement, which
# passes the input through preprocess before parsing. Confirm against the
# real file.
doc = Nokogiri::HTML5.parse(html)
doc = Nokogiri::HTML5.parse(preprocess(html))
node!(doc)
to_html(doc)
end
Expand All @@ -113,7 +119,8 @@ def document(html)
def fragment(html)
return '' unless html

doc = Nokogiri::HTML5.parse("<html><body>#{html}")
html = preprocess(html)
doc = Nokogiri::HTML5.parse("<html><body>#{html}")

# Hack to allow fragments containing <body>. Borrowed from
# Nokogiri::HTML::DocumentFragment.
Expand Down Expand Up @@ -161,6 +168,20 @@ def node!(node)

private

# Preprocesses HTML before parsing to remove undesirable Unicode chars.
#
# html - the input to clean. Anything responding to #to_s is accepted; the
#        caller's object is never modified.
#
# Returns a new UTF-8 String with every character matched by
# REGEX_UNSUITABLE_CHARS removed.
def preprocess(html)
  # Work on a private copy. Without the assignment the in-place calls below
  # would mutate the caller's string, raise on frozen strings, and fail
  # outright on non-String input.
  html = html.to_s.dup

  # Strings already tagged as UTF-8 are left as-is; anything else is
  # transcoded, replacing bytes that are invalid or have no UTF-8 mapping.
  unless html.encoding.name == 'UTF-8'
    html.encode!('UTF-8',
      :invalid => :replace,
      :undef   => :replace)
  end

  # gsub! returns nil when nothing matched, so return the string explicitly.
  html.gsub!(REGEX_UNSUITABLE_CHARS, '')
  html
end

def to_html(node)
replace_meta = false

Expand Down
84 changes: 84 additions & 0 deletions test/test_unicode.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# encoding: utf-8
require_relative 'common'

describe 'Unicode' do
  make_my_diffs_pretty!
  parallelize_me!

  # http://www.w3.org/TR/unicode-xml/#Charlist
  describe 'Unsuitable characters' do
    before do
      @s = Sanitize.new(Sanitize::Config::RELAXED)
    end

    # Expects +input+ to come out as +expected+ both when sanitized as a full
    # document (wrapped in the html/head/body skeleton) and as a fragment.
    def assert_stripped(input, expected)
      @s.document(input).must_equal "<html><head></head><body>#{expected}</body></html>\n"
      @s.fragment(input).must_equal expected
    end

    it 'should strip deprecated grave and acute clones' do
      assert_stripped("a\u0340b\u0341c", 'abc')
    end

    it 'should strip deprecated Khmer characters' do
      assert_stripped("a\u17a3b\u17d3c", 'abc')
    end

    it 'should strip line and paragraph separator punctuation' do
      assert_stripped("a\u2028b\u2029c", 'abc')
    end

    it 'should strip bidi embedding control characters' do
      assert_stripped("a\u202ab\u202bc\u202cd\u202de\u202e", 'abcde')
    end

    it 'should strip deprecated symmetric swapping characters' do
      assert_stripped("a\u206ab\u206bc", 'abc')
    end

    it 'should strip deprecated Arabic form shaping characters' do
      assert_stripped("a\u206cb\u206dc", 'abc')
    end

    it 'should strip deprecated National digit shape characters' do
      assert_stripped("a\u206eb\u206fc", 'abc')
    end

    it 'should strip interlinear annotation characters' do
      assert_stripped("a\ufff9b\ufffac\ufffb", 'abc')
    end

    it 'should strip BOM/zero-width non-breaking space characters' do
      assert_stripped("a\ufeffbc", 'abc')
    end

    it 'should strip object replacement characters' do
      assert_stripped("a\ufffcbc", 'abc')
    end

    it 'should strip musical notation scoping characters' do
      scoped = "a\u{1d173}b\u{1d174}c\u{1d175}d\u{1d176}e\u{1d177}f\u{1d178}g\u{1d179}h\u{1d17a}"
      assert_stripped(scoped, 'abcdefgh')
    end

    it 'should strip language tag code point characters' do
      # Every code point in the language tag block, sandwiched between a and b.
      tags = (0xE0000..0xE007F).to_a.pack('U*')
      assert_stripped("a#{tags}b", 'ab')
    end
  end
end

0 comments on commit bf0d753

Please sign in to comment.