@@ -18,6 +18,31 @@ class Sanitize; module Transformers; class CleanElement
1818 # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
1919 REGEX_DATA_ATTR = /\A data-(?!xml)[a-z_][\w .\u00E0 -\u00F6 \u00F8 -\u017F \u01DD -\u02AF -]*\z /u
2020
# Attributes that need additional escaping on `<a>` elements due to unsafe
# libxml2 behavior. Frozen so this shared lookup set can't be modified at
# runtime.
UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
  name
]).freeze

# Attributes that need additional escaping on all elements due to unsafe
# libxml2 behavior. Frozen so this shared lookup set can't be modified at
# runtime.
UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
  action
  href
  src
]).freeze

# Mapping of original characters to escape sequences for characters that
# should be escaped in attributes affected by unsafe libxml2 behavior.
#
# This hash is passed to `String#gsub` as the replacement table, so it must
# contain an entry for every character UNSAFE_LIBXML_ESCAPE_REGEX can match;
# a matched character with no entry would be replaced by an empty string.
UNSAFE_LIBXML_ESCAPE_CHARS = {
  ' ' => '%20',
  '"' => '%22'
}.freeze

# Regex that matches any single character that needs to be escaped in
# attributes affected by unsafe libxml2 behavior. Keep the character class in
# sync with the keys of UNSAFE_LIBXML_ESCAPE_CHARS.
UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
45+
2146 def initialize ( config )
2247 @add_attributes = config [ :add_attributes ]
2348 @attributes = config [ :attributes ] . dup
@@ -92,31 +117,61 @@ def call(env)
92117 node . attribute_nodes . each do |attr |
93118 attr_name = attr . name . downcase
94119
95- if attr_whitelist . include? ( attr_name )
96- # The attribute is whitelisted.
120+ unless attr_whitelist . include? ( attr_name )
121+ # The attribute isn't whitelisted.
122+
123+ if allow_data_attributes && attr_name . start_with? ( 'data-' )
124+ # Arbitrary data attributes are allowed. If this is a data
125+ # attribute, continue.
126+ next if attr_name =~ REGEX_DATA_ATTR
127+ end
128+
129+ # Either the attribute isn't a data attribute or arbitrary data
130+ # attributes aren't allowed. Remove the attribute.
131+ attr . unlink
132+ next
133+ end
134+
135+ # The attribute is whitelisted.
97136
98- # Remove any attributes that use unacceptable protocols.
99- if @protocols . include? ( name ) && @protocols [ name ] . include? ( attr_name )
100- attr_protocols = @protocols [ name ] [ attr_name ]
137+ # Remove any attributes that use unacceptable protocols.
138+ if @protocols . include? ( name ) && @protocols [ name ] . include? ( attr_name )
139+ attr_protocols = @protocols [ name ] [ attr_name ]
101140
102- if attr . value =~ REGEX_PROTOCOL
103- attr . unlink unless attr_protocols . include? ( $1. downcase )
104- else
105- attr . unlink unless attr_protocols . include? ( :relative )
141+ if attr . value =~ REGEX_PROTOCOL
142+ unless attr_protocols . include? ( $1. downcase )
143+ attr . unlink
144+ next
106145 end
107- end
108- else
109- # The attribute isn't whitelisted.
110146
111- if allow_data_attributes && attr_name . start_with? ( 'data-' )
112- # Arbitrary data attributes are allowed. Verify that the attribute
113- # is a valid data attribute.
114- attr . unlink unless attr_name =~ REGEX_DATA_ATTR
115147 else
116- # Either the attribute isn't a data attribute, or arbitrary data
117- # attributes aren't allowed. Remove the attribute.
118- attr . unlink
148+ unless attr_protocols . include? ( :relative )
149+ attr . unlink
150+ next
151+ end
119152 end
153+
154+ # Leading and trailing whitespace around URLs is ignored at parse
155+ # time. Stripping it here prevents it from being escaped by the
156+ # libxml2 workaround below.
157+ attr . value = attr . value . strip
158+ end
159+
160+ # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
161+ # attempt to preserve server-side includes. This can result in XSS since
162+ # an unescaped double quote can allow an attacker to inject a
163+ # non-whitelisted attribute.
164+ #
165+ # Sanitize works around this by implementing its own escaping for
166+ # affected attributes, some of which can exist on any element and some
167+ # of which can only exist on `<a>` elements.
168+ #
169+ # The relevant libxml2 code is here:
170+ # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
171+ if UNSAFE_LIBXML_ATTRS_GLOBAL . include? ( attr_name ) ||
172+ ( name == 'a' && UNSAFE_LIBXML_ATTRS_A . include? ( attr_name ) )
173+
174+ attr . value = attr . value . gsub ( UNSAFE_LIBXML_ESCAPE_REGEX , UNSAFE_LIBXML_ESCAPE_CHARS )
120175 end
121176 end
122177 end
0 commit comments