Permalink
Browse files

Refactor sanitizer helpers into HTML classes and make it easy to swap…

… them out with custom implementations. Closes #10129.  [rick]

git-svn-id: http://svn-commit.rubyonrails.org/rails/trunk@8213 5ecf4fe2-1ee6-0310-87b1-e25e094e27de
  • Loading branch information...
technoweenie committed Nov 26, 2007
1 parent bd5ed65 commit 1af084ecda66a8e1b4eb3a51a07ebca85bf2e419
View
@@ -1,5 +1,7 @@
*SVN*
+* Refactor sanitizer helpers into HTML classes and make it easy to swap them out with custom implementations. Closes #10129. [rick]
+
* Add deprecation for old subtemplate syntax for ActionMailer templates, use render :partial [rick]
* Fix TemplateError so it doesn't bomb on exceptions while running tests [rick]
@@ -1,6 +1,7 @@
require 'html/tokenizer'
require 'html/node'
require 'html/selector'
+require 'html/sanitizer'
module HTML #:nodoc:
# A top-level HTMl document. You give it a body of text, and it will parse that
@@ -0,0 +1,173 @@
+module HTML
+ class Sanitizer
+ def sanitize(text, options = {})
+ return text unless sanitizeable?(text)
+ tokenize(text, options).join
+ end
+
+ def sanitizeable?(text)
+ !(text.nil? || text.empty? || !text.index("<"))
+ end
+
+ protected
+ def tokenize(text, options)
+ tokenizer = HTML::Tokenizer.new(text)
+ result = []
+ while token = tokenizer.next
+ node = Node.parse(nil, 0, 0, token, false)
+ process_node node, result, options
+ end
+ result
+ end
+
+ def process_node(node, result, options)
+ result << node.to_s
+ end
+ end
+
+ class FullSanitizer < Sanitizer
+ def sanitize(text, options = {})
+ result = super
+ # strip any comments, and if they have a newline at the end (ie. line with
+ # only a comment) strip that too
+ result.gsub!(/<!--(.*?)-->[\n]?/m, "") if result
+ # Recurse - handle all dirty nested tags
+ result == text ? result : sanitize(result, options)
+ end
+
+ def process_node(node, result, options)
+ result << node.to_s if node.class == HTML::Text
+ end
+ end
+
+ class LinkSanitizer < FullSanitizer
+ cattr_accessor :included_tags, :instance_writer => false
+ self.included_tags = Set.new(%w(a href))
+
+ def sanitizeable?(text)
+ !(text.nil? || text.empty? || !((text.index("<a") || text.index("<href")) && text.index(">")))
+ end
+
+ protected
+ def process_node(node, result, options)
+ result << node.to_s unless node.is_a?(HTML::Tag) && included_tags.include?(node.name)
+ end
+ end
+
+ class WhiteListSanitizer < Sanitizer
+ [:protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
+ :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties].each do |attr|
+ class_inheritable_accessor attr, :instance_writer => false
+ end
+
+ # A regular expression of the valid characters used to separate protocols like
+ # the ':' in 'http://foo.com'
+ self.protocol_separator = /:|(&#0*58)|(&#x70)|(%|&#37;)3A/
+
+ # Specifies a Set of HTML attributes that can have URIs.
+ self.uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc))
+
+ # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
+ # to just escaping harmless tags like &lt;font&gt;
+ self.bad_tags = Set.new(%w(script))
+
+ # Specifies the default Set of tags that the #sanitize helper will allow unscathed.
+ self.allowed_tags = Set.new(%w(strong em b i p code pre tt output samp kbd var sub
+ sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dt dd abbr
+ acronym a img blockquote del ins fieldset legend))
+
+ # Specifies the default Set of html attributes that the #sanitize helper will leave
+ # in the allowed tag.
+ self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))
+
+ # Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept.
+ self.allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto
+ feed svn urn aim rsync tag ssh sftp rtsp afs))
+
+ # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
+ self.allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse
+ border-color border-left-color border-right-color border-top-color clear color cursor direction display
+ elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
+ overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
+ speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
+ width))
+
+ # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
+ self.allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center
+ collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
+ nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))
+
+ # Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers.
+ self.shorthand_css_properties = Set.new(%w(background border margin padding))
+
+ # Sanitizes a block of css code. Used by #sanitize when it comes across a style attribute
+ def sanitize_css(style)
+ # disallow urls
+ style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
+
+ # gauntlet
+ if style !~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/ ||
+ style !~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
+ return ''
+ end
+
+ clean = []
+ style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
+ if allowed_css_properties.include?(prop.downcase)
+ clean << prop + ': ' + val + ';'
+ elsif shorthand_css_properties.include?(prop.split('-')[0].downcase)
+ unless val.split().any? do |keyword|
+ !allowed_css_keywords.include?(keyword) &&
+ keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
+ end
+ clean << prop + ': ' + val + ';'
+ end
+ end
+ end
+ clean.join(' ')
+ end
+
+ protected
+ def tokenize(text, options)
+ options[:parent] = []
+ options[:attributes] ||= allowed_attributes
+ options[:tags] ||= allowed_tags
+ super
+ end
+
+ def process_node(node, result, options)
+ result << case node
+ when HTML::Tag
+ if node.closing == :close
+ options[:parent].shift
+ else
+ options[:parent].unshift node.name
+ end
+
+ process_attributes_for node, options
+
+ options[:tags].include?(node.name) ? node : nil
+ else
+ bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "&lt;")
+ end
+ end
+
+ def process_attributes_for(node, options)
+ return unless node.attributes
+ node.attributes.keys.each do |attr_name|
+ value = node.attributes[attr_name].to_s
+
+ if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value)
+ node.attributes.delete(attr_name)
+ else
+ node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(value)
+ end
+ end
+ end
+
+ def contains_bad_protocols?(attr_name, value)
+ uri_attributes.include?(attr_name) &&
+ (value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(%|&#37;)3A/ && !allowed_protocols.include?(value.split(protocol_separator).first))
+ end
+ end
+end
Oops, something went wrong.

0 comments on commit 1af084e

Please sign in to comment.