Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
261 lines (207 sloc) 7.871 kb
# encoding: utf-8
require 'nokogumbo'
require 'set'
require_relative 'sanitize/version'
require_relative 'sanitize/config'
require_relative 'sanitize/config/default'
require_relative 'sanitize/config/restricted'
require_relative 'sanitize/config/basic'
require_relative 'sanitize/config/relaxed'
require_relative 'sanitize/css'
require_relative 'sanitize/transformers/clean_cdata'
require_relative 'sanitize/transformers/clean_comment'
require_relative 'sanitize/transformers/clean_css'
require_relative 'sanitize/transformers/clean_doctype'
require_relative 'sanitize/transformers/clean_element'
class Sanitize
  # Raised when sanitization cannot proceed with the current configuration
  # (for example, sanitizing a full document without whitelisting `<html>`).
  class Error < StandardError; end

  # The merged configuration Hash this instance was initialized with.
  attr_reader :config

  # Matches an attribute value that could be treated by a browser as a URL
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
  # or more characters followed by a colon is considered a match, even if the
  # colon is encoded as an entity and even if it's an incomplete entity (which
  # IE6 and Opera will still parse).
  REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i

  # Matches Unicode characters that should be stripped from HTML before passing
  # it to the parser.
  #
  # http://www.w3.org/TR/unicode-xml/#Charlist
  REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u

  #--
  # Class Methods
  #++

  # Returns a sanitized copy of the given full _html_ document, using the
  # settings in _config_ if specified.
  #
  # When sanitizing a document, the `<html>` element must be whitelisted or an
  # error will be raised. If this is undesirable, you should probably use
  # {#fragment} instead.
  def self.document(html, config = {})
    new(config).document(html)
  end

  # Returns a sanitized copy of the given _html_ fragment, using the settings in
  # _config_ if specified.
  def self.fragment(html, config = {})
    new(config).fragment(html)
  end

  # Sanitizes the given `Nokogiri::XML::Node` instance and all its children,
  # modifying it in place, and returns the node.
  def self.node!(node, config = {})
    new(config).node!(node)
  end

  # Aliases for pre-3.0.0 backcompat.
  class << self
    # @deprecated Use {.document} instead.
    alias_method :clean_document, :document

    # @deprecated Use {.fragment} instead.
    alias_method :clean, :fragment

    # @deprecated Use {.node!} instead.
    alias_method :clean_node!, :node!
  end

  #--
  # Instance Methods
  #++

  # Returns a new Sanitize object initialized with the settings in _config_,
  # merged on top of `Config::DEFAULT`.
  def initialize(config = {})
    @config = Config.merge(Config::DEFAULT, config)

    # Work on a copy so the configured transformer list is never mutated.
    @transformers = Array(@config[:transformers].dup)

    # Default transformers always run at the end of the chain, after any custom
    # transformers.
    @transformers << Transformers::CleanComment unless @config[:allow_comments]
    @transformers << Transformers::CleanDoctype unless @config[:allow_doctype]

    # CSS transformers are only needed when `<style>` elements or `style`
    # attributes are allowed. Both share a single Sanitize::CSS instance.
    if @config[:elements].include?('style')
      scss = Sanitize::CSS.new(config)
      @transformers << Transformers::CSS::CleanElement.new(scss)
    end

    if @config[:attributes].values.any? { |attrs| attrs.include?('style') }
      scss ||= Sanitize::CSS.new(config)
      @transformers << Transformers::CSS::CleanAttribute.new(scss)
    end

    @transformers <<
      Transformers::CleanCDATA <<
      Transformers::CleanElement.new(@config)
  end

  # Returns a sanitized copy of the given _html_ document, or an empty String
  # if _html_ is `nil` or `false`.
  #
  # When sanitizing a document, the `<html>` element must be whitelisted or an
  # error will be raised. If this is undesirable, you should probably use
  # {#fragment} instead.
  def document(html)
    return '' unless html

    doc = Nokogiri::HTML5.parse(preprocess(html))
    node!(doc)
    to_html(doc)
  end

  # @deprecated Use {#document} instead.
  alias_method :clean_document, :document

  # Returns a sanitized copy of the given _html_ fragment, or an empty String
  # if _html_ is `nil` or `false`.
  def fragment(html)
    return '' unless html

    html = preprocess(html)
    doc  = Nokogiri::HTML5.parse("<html><body>#{html}")

    # Hack to allow fragments containing <body>. Borrowed from
    # Nokogiri::HTML::DocumentFragment.
    path = html =~ /\A<body(?:\s|>)/i ? '/html/body' : '/html/body/node()'

    # Move the parsed nodes into a fragment so that only they are sanitized and
    # serialized, not the wrapping document.
    frag = doc.fragment
    doc.xpath(path).each { |parsed| frag << parsed }

    node!(frag)
    to_html(frag)
  end

  # @deprecated Use {#fragment} instead.
  alias_method :clean, :fragment

  # Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it
  # in place. Returns the same _node_.
  #
  # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
  # whitelisted or an error will be raised.
  #
  # Raises ArgumentError if _node_ is not a `Nokogiri::XML::Node`, and
  # Sanitize::Error if a document is given without `<html>` being whitelisted.
  def node!(node)
    unless node.is_a?(Nokogiri::XML::Node)
      raise ArgumentError, 'node must be a Nokogiri::XML::Node'
    end

    if node.is_a?(Nokogiri::XML::Document) && !@config[:elements].include?('html')
      raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
    end

    # Nodes that transformers have explicitly whitelisted during this run. The
    # set is shared across the whole traversal.
    node_whitelist = Set.new

    traverse(node) { |current| transform_node!(current, node_whitelist) }
    node
  end

  # @deprecated Use {#node!} instead.
  alias_method :clean_node!, :node!

  private

  # Preprocesses HTML before parsing: returns a UTF-8 copy of _html_ with
  # invalid byte sequences replaced and undesirable Unicode chars removed. The
  # argument itself is never modified.
  def preprocess(html)
    html = html.to_s.dup

    if html.encoding.name == 'UTF-8'
      # A String tagged as UTF-8 may still contain invalid byte sequences, which
      # would make the gsub! below raise ArgumentError. Replace them, mirroring
      # what the transcoding branch does for other encodings. String#scrub!
      # only exists on Ruby 2.1+, hence the respond_to? guard.
      html.scrub! if html.respond_to?(:scrub!) && !html.valid_encoding?
    else
      html.encode!('UTF-8',
        :invalid => :replace,
        :undef   => :replace)
    end

    html.gsub!(REGEX_UNSUITABLE_CHARS, '')
    html
  end

  # Serializes _node_ to an HTML String without any added formatting.
  def to_html(node)
    strip_meta = false

    # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
    # meta tag to all serialized HTML documents.
    #
    # https://github.com/sparklemotion/nokogiri/issues/1008
    if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
       node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE

      # Only replace the content-type meta tag if <meta> isn't whitelisted or
      # the original document didn't actually include a content-type meta tag.
      strip_meta = !@config[:elements].include?('meta') ||
        node.xpath('/html/head/meta[@http-equiv]').none? { |meta|
          meta['http-equiv'].downcase == 'content-type'
        }
    end

    so = Nokogiri::XML::Node::SaveOptions

    # Serialize to HTML without any formatting to prevent Nokogiri from adding
    # newlines after certain tags.
    html = node.to_html(
      :encoding  => 'utf-8',
      :indent    => 0,
      :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
    )

    if strip_meta
      regex_meta = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
      html.gsub!(regex_meta, '\1')
    end

    html
  end

  # Runs every transformer, in order, against _node_ and returns _node_.
  #
  # Each transformer is called with an environment Hash. When a transformer
  # returns a Hash whose `:node_whitelist` entry is enumerable, those nodes are
  # added to the shared _node_whitelist_, so later transformers see them as
  # whitelisted.
  def transform_node!(node, node_whitelist)
    @transformers.each do |transformer|
      result = transformer.call(
        :config         => @config,
        :is_whitelisted => node_whitelist.include?(node),
        :node           => node,
        :node_name      => node.name.downcase,
        :node_whitelist => node_whitelist
      )

      if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
        node_whitelist.merge(result[:node_whitelist])
      end
    end

    node
  end

  # Performs top-down traversal of the given node, operating first on the node
  # itself, then traversing each child (if any) in order.
  #
  # The block may unlink or reparent the node it is given, so the previous
  # sibling is remembered before descending and used to resume afterwards.
  def traverse(node, &block)
    block.call(node)

    child = node.child

    while child
      prev = child.previous_sibling
      traverse(child, &block)

      if child.parent != node
        # The child was unlinked or reparented, so traverse the previous node's
        # next sibling, or the parent's first child if there is no previous
        # node.
        child = prev ? prev.next_sibling : node.child
      else
        child = child.next_sibling
      end
    end
  end
end
Jump to Line
Something went wrong with that request. Please try again.