Skip to content

Commit 01629a1

Browse files
committed
fix: Prevent code injection due to improper escaping in libxml2 >= 2.9.2
When Sanitize <= 4.6.2 is used in combination with libxml2 >= 2.9.2, a specially crafted HTML fragment can cause libxml2 to generate improperly escaped output, allowing non-whitelisted attributes to be used on whitelisted elements. Sanitize now performs additional escaping on affected attributes to prevent this. Many thanks to the Shopify Application Security Team for responsibly reporting this issue. Fixes #176
1 parent 0eee92e commit 01629a1

File tree

3 files changed

+149
-20
lines changed

3 files changed

+149
-20
lines changed

Diff for: lib/sanitize/transformers/clean_element.rb

+74-19
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,31 @@ class Sanitize; module Transformers; class CleanElement
1818
# http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
1919
REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
2020

21+
# Attributes that need additional escaping on `<a>` elements due to unsafe
22+
# libxml2 behavior.
23+
UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
24+
name
25+
])
26+
27+
# Attributes that need additional escaping on all elements due to unsafe
28+
# libxml2 behavior.
29+
UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
30+
action
31+
href
32+
src
33+
])
34+
35+
# Mapping of original characters to escape sequences for characters that
36+
# should be escaped in attributes affected by unsafe libxml2 behavior.
37+
UNSAFE_LIBXML_ESCAPE_CHARS = {
38+
' ' => '%20',
39+
'"' => '%22'
40+
}
41+
42+
# Regex that matches any single character that needs to be escaped in
43+
# attributes affected by unsafe libxml2 behavior.
44+
UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
45+
2146
def initialize(config)
2247
@add_attributes = config[:add_attributes]
2348
@attributes = config[:attributes].dup
@@ -92,31 +117,61 @@ def call(env)
92117
node.attribute_nodes.each do |attr|
93118
attr_name = attr.name.downcase
94119

95-
if attr_whitelist.include?(attr_name)
96-
# The attribute is whitelisted.
120+
unless attr_whitelist.include?(attr_name)
121+
# The attribute isn't whitelisted.
122+
123+
if allow_data_attributes && attr_name.start_with?('data-')
124+
# Arbitrary data attributes are allowed. If this is a valid data
125+
# attribute, continue.
126+
next if attr_name =~ REGEX_DATA_ATTR
127+
end
128+
129+
# Either the attribute isn't a valid data attribute or arbitrary data
130+
# attributes aren't allowed. Remove the attribute.
131+
attr.unlink
132+
next
133+
end
134+
135+
# The attribute is whitelisted.
97136

98-
# Remove any attributes that use unacceptable protocols.
99-
if @protocols.include?(name) && @protocols[name].include?(attr_name)
100-
attr_protocols = @protocols[name][attr_name]
137+
# Remove any attributes that use unacceptable protocols.
138+
if @protocols.include?(name) && @protocols[name].include?(attr_name)
139+
attr_protocols = @protocols[name][attr_name]
101140

102-
if attr.value =~ REGEX_PROTOCOL
103-
attr.unlink unless attr_protocols.include?($1.downcase)
104-
else
105-
attr.unlink unless attr_protocols.include?(:relative)
141+
if attr.value =~ REGEX_PROTOCOL
142+
unless attr_protocols.include?($1.downcase)
143+
attr.unlink
144+
next
106145
end
107-
end
108-
else
109-
# The attribute isn't whitelisted.
110146

111-
if allow_data_attributes && attr_name.start_with?('data-')
112-
# Arbitrary data attributes are allowed. Verify that the attribute
113-
# is a valid data attribute.
114-
attr.unlink unless attr_name =~ REGEX_DATA_ATTR
115147
else
116-
# Either the attribute isn't a data attribute, or arbitrary data
117-
# attributes aren't allowed. Remove the attribute.
118-
attr.unlink
148+
unless attr_protocols.include?(:relative)
149+
attr.unlink
150+
next
151+
end
119152
end
153+
154+
# Leading and trailing whitespace around URLs is ignored at parse
155+
# time. Stripping it here prevents it from being escaped by the
156+
# libxml2 workaround below.
157+
attr.value = attr.value.strip
158+
end
159+
160+
# libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
161+
# attempt to preserve server-side includes. This can result in XSS since
162+
# an unescaped double quote can allow an attacker to inject a
163+
# non-whitelisted attribute.
164+
#
165+
# Sanitize works around this by implementing its own escaping for
166+
# affected attributes, some of which can exist on any element and some
167+
# of which can only exist on `<a>` elements.
168+
#
169+
# The relevant libxml2 code is here:
170+
# <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
171+
if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
172+
(name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
173+
174+
attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
120175
end
121176
end
122177
end

Diff for: test/test_clean_element.rb

+11-1
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@
234234

235235
it 'should not choke on valueless attributes' do
236236
@s.fragment('foo <a href>foo</a> bar')
237-
.must_equal 'foo <a href="" rel="nofollow">foo</a> bar'
237+
.must_equal 'foo <a href rel="nofollow">foo</a> bar'
238238
end
239239

240240
it 'should downcase attribute names' do
@@ -300,6 +300,16 @@
300300
}).must_equal input
301301
end
302302

303+
it "should not allow relative URLs when relative URLs aren't whitelisted" do
304+
input = '<a href="/foo/bar">Link</a>'
305+
306+
Sanitize.fragment(input,
307+
:elements => ['a'],
308+
:attributes => {'a' => ['href']},
309+
:protocols => {'a' => {'href' => ['http']}}
310+
).must_equal '<a>Link</a>'
311+
end
312+
303313
it 'should allow relative URLs containing colons when the colon is not in the first path segment' do
304314
input = '<a href="/wiki/Special:Random">Random Page</a>'
305315

Diff for: test/test_malicious_html.rb

+64
Original file line numberDiff line numberDiff line change
@@ -125,4 +125,68 @@
125125
must_equal '&lt;alert("XSS");//&lt;'
126126
end
127127
end
128+
129+
# libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
130+
# attempt to preserve server-side includes. This can result in XSS since an
131+
# unescaped double quote can allow an attacker to inject a non-whitelisted
132+
# attribute. Sanitize works around this by implementing its own escaping for
133+
# affected attributes.
134+
#
135+
# The relevant libxml2 code is here:
136+
# <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
137+
describe 'unsafe libxml2 server-side includes in attributes' do
138+
tag_configs = [
139+
{
140+
tag_name: 'a',
141+
escaped_attrs: %w[ action href src name ],
142+
unescaped_attrs: []
143+
},
144+
145+
{
146+
tag_name: 'div',
147+
escaped_attrs: %w[ action href src ],
148+
unescaped_attrs: %w[ name ]
149+
}
150+
]
151+
152+
before do
153+
@s = Sanitize.new({
154+
elements: %w[ a div ],
155+
156+
attributes: {
157+
all: %w[ action href src name ]
158+
}
159+
})
160+
end
161+
162+
tag_configs.each do |tag_config|
163+
tag_name = tag_config[:tag_name]
164+
165+
tag_config[:escaped_attrs].each do |attr_name|
166+
input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
167+
168+
it 'should escape unsafe characters in attributes' do
169+
@s.fragment(input).must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
170+
end
171+
172+
it 'should round-trip to the same output' do
173+
output = @s.fragment(input)
174+
@s.fragment(output).must_equal(output)
175+
end
176+
end
177+
178+
tag_config[:unescaped_attrs].each do |attr_name|
179+
input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
180+
181+
it 'should not escape characters unnecessarily' do
182+
@s.fragment(input).must_equal(input)
183+
end
184+
185+
it 'should round-trip to the same output' do
186+
output = @s.fragment(input)
187+
@s.fragment(output).must_equal(output)
188+
end
189+
end
190+
end
191+
end
128192
end

0 commit comments

Comments
 (0)