diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 30a6faf..222b475 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -47,7 +47,6 @@ jobs:
- run: bundle exec rake
jruby:
- continue-on-error: true # nokogiri on jruby has different behavior
strategy:
fail-fast: false
matrix:
diff --git a/.rubocop.yml b/.rubocop.yml
index fa00f06..b96841e 100644
--- a/.rubocop.yml
+++ b/.rubocop.yml
@@ -343,3 +343,6 @@ Minitest/SkipEnsure:
Minitest/UnreachableAssertion:
Enabled: true
+
+Minitest/NoAssertions:
+ Enabled: true
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d9cd3b6..80b52e1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,15 @@
+## next / unreleased
+
+* `SafeListSanitizer` allows `time` tag and `lang` attribute by default.
+
+ *Mike Dalessio*
+
+* `Rails::Html::XPATHS_TO_REMOVE` has been removed. It's not necessary with the existing sanitizers,
+ and should have been a private constant all along anyway.
+
+ *Mike Dalessio*
+
+
## 1.5.0 / 2023-01-20
* `SafeListSanitizer`, `PermitScrubber`, and `TargetScrubber` now all support pruning of unsafe tags.
diff --git a/lib/rails/html/sanitizer.rb b/lib/rails/html/sanitizer.rb
index dc50430..531ff0d 100644
--- a/lib/rails/html/sanitizer.rb
+++ b/lib/rails/html/sanitizer.rb
@@ -2,8 +2,6 @@
module Rails
module Html
- XPATHS_TO_REMOVE = %w{.//script .//form comment()}
-
class Sanitizer # :nodoc:
def sanitize(html, options = {})
raise NotImplementedError, "subclasses must implement sanitize method."
@@ -33,7 +31,6 @@ def sanitize(html, options = {})
loofah_fragment = Loofah.fragment(html)
- remove_xpaths(loofah_fragment, XPATHS_TO_REMOVE)
loofah_fragment.scrub!(TextOnlyScrubber.new)
properly_encode(loofah_fragment, encoding: "UTF-8")
@@ -106,10 +103,65 @@ class << self
attr_accessor :allowed_tags
attr_accessor :allowed_attributes
end
- self.allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
- sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dl dt dd abbr
- acronym a img blockquote del ins))
- self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))
+ self.allowed_tags = Set.new([
+ "a",
+ "abbr",
+ "acronym",
+ "address",
+ "b",
+ "big",
+ "blockquote",
+ "br",
+ "cite",
+ "code",
+ "dd",
+ "del",
+ "dfn",
+ "div",
+ "dl",
+ "dt",
+ "em",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "hr",
+ "i",
+ "img",
+ "ins",
+ "kbd",
+ "li",
+ "ol",
+ "p",
+ "pre",
+ "samp",
+ "small",
+ "span",
+ "strong",
+ "sub",
+ "sup",
+ "time",
+ "tt",
+ "ul",
+ "var",
+ ])
+ self.allowed_attributes = Set.new([
+ "abbr",
+ "alt",
+ "cite",
+ "class",
+ "datetime",
+ "height",
+ "href",
+ "lang",
+ "name",
+ "src",
+ "title",
+ "width",
+ "xml:lang",
+ ])
def initialize(prune: false)
@permit_scrubber = PermitScrubber.new(prune: prune)
@@ -129,7 +181,6 @@ def sanitize(html, options = {})
@permit_scrubber.attributes = allowed_attributes(options)
loofah_fragment.scrub!(@permit_scrubber)
else
- remove_xpaths(loofah_fragment, XPATHS_TO_REMOVE)
loofah_fragment.scrub!(:strip)
end
diff --git a/test/sanitizer_test.rb b/test/sanitizer_test.rb
index 41b274e..dae5001 100644
--- a/test/sanitizer_test.rb
+++ b/test/sanitizer_test.rb
@@ -6,9 +6,22 @@
puts Nokogiri::VERSION_INFO
+#
+# NOTE that many of these tests contain multiple acceptable results.
+#
+# In some cases, this is because of how the HTML4 parser's recovery behavior changed in libxml2
+# 2.9.14 and 2.10.0. For more details, see:
+#
+# - https://github.com/sparklemotion/nokogiri/releases/tag/v1.13.5
+# - https://gitlab.gnome.org/GNOME/libxml2/-/issues/380
+#
+# In other cases, multiple acceptable results are provided because Nokogiri's vendored libxml2 is
+# patched to entity-escape server-side includes (aks "SSI", aka ``).
+#
+# In many other cases, it's because the parser used by Nokogiri on JRuby (xerces+nekohtml) parses
+# slightly differently than libxml2 in edge cases.
+#
class SanitizersTest < Minitest::Test
- include Rails::Dom::Testing::Assertions::DomAssertions
-
def test_sanitizer_sanitize_raises_not_implemented_error
assert_raises NotImplementedError do
Rails::Html::Sanitizer.new.sanitize("")
@@ -20,7 +33,16 @@ def test_sanitize_nested_script
end
def test_sanitize_nested_script_in_style
- assert_equal '<script>alert("XSS");</script>', safe_list_sanitize('alert("XSS");/', tags: %w(em))
+ input = 'alert("XSS");/'
+ result = safe_list_sanitize(input, tags: %w(em))
+ acceptable_results = [
+ # libxml2
+ %{<script>alert("XSS");</script>},
+ # xerces+neko. unavoidable double-escaping, see loofah/docs/2022-10-decision-on-cdata-nodes.md
+ %{<script>alert(\"XSS\");<</style>/script>},
+ ]
+
+ assert_includes(acceptable_results, result)
end
class XpathRemovalTestSanitizer < Rails::Html::Sanitizer
@@ -56,8 +78,15 @@ def test_remove_xpaths_called_with_enumerable_xpaths
def test_strip_tags_with_quote
input = '<" hi'
- expected = libxml_2_9_14_recovery_lt? ? %{<" hi} : %{ hi}
- assert_equal(expected, full_sanitize(input))
+ result = full_sanitize(input)
+ acceptable_results = [
+ # libxml2 >= 2.9.14 and xerces+neko
+ %{<" hi},
+ # other libxml2
+ %{ hi},
+ ]
+
+ assert_includes(acceptable_results, result)
end
def test_strip_invalid_html
@@ -72,27 +101,54 @@ def test_strip_nested_tags
def test_strip_tags_multiline
expected = %{This is a test.\n\n\n\nIt no longer contains any HTML.\n}
- input = %{
This is a test.\n\n\n\nIt no longer contains any HTML.
\n}
+ input = %{This is a test.
\n\n\n\nIt no longer contains any HTML.
\n}
assert_equal expected, full_sanitize(input)
end
def test_remove_unclosed_tags
input = "This is <-- not\n a comment here."
- expected = libxml_2_9_14_recovery_lt? ? %{This is <-- not\n a comment here.} : %{This is }
- assert_equal(expected, full_sanitize(input))
+ result = full_sanitize(input)
+ acceptable_results = [
+ # libxml2 >= 2.9.14 and xerces+neko
+ %{This is <-- not\n a comment here.},
+ # other libxml2
+ %{This is },
+ ]
+
+ assert_includes(acceptable_results, result)
end
def test_strip_cdata
input = "This has a ]]> here."
- expected = libxml_2_9_14_recovery_lt_bang? ? %{This has a <![CDATA[]]> here.} : %{This has a ]]> here.}
- assert_equal(expected, full_sanitize(input))
+ result = full_sanitize(input)
+ acceptable_results = [
+ # libxml2 = 2.9.14
+ %{This has a <![CDATA[]]> here.},
+ # other libxml2
+ %{This has a ]]> here.},
+ # xerces+neko
+ %{This has a here.},
+ ]
+
+ assert_includes(acceptable_results, result)
end
def test_strip_unclosed_cdata
input = "This has an unclosed ]] here..."
- expected = libxml_2_9_14_recovery_lt_bang? ? %{This has an unclosed <![CDATA[]] here...} : %{This has an unclosed ]] here...}
- assert_equal(expected, full_sanitize(input))
+
+ result = safe_list_sanitize(input)
+
+ acceptable_results = [
+ # libxml2 = 2.9.14
+ %{This has an unclosed <![CDATA[]] here...},
+ # other libxml2
+ %{This has an unclosed ]] here...},
+ # xerces+neko
+ %{This has an unclosed }
+ ]
+
+ assert_includes(acceptable_results, result)
end
def test_strip_blank_string
@@ -168,7 +224,20 @@ def test_sanitize_form
end
def test_sanitize_plaintext
- assert_sanitized "foo", "foo"
+ # note that the `plaintext` tag has been deprecated since HTML 2
+ # https://developer.mozilla.org/en-US/docs/Web/HTML/Element/plaintext
+ input = "foo"
+ result = safe_list_sanitize(input)
+ acceptable_results = [
+ # libxml2
+ "foo",
+ # xerces+nekohtml-unit
+ "<span>foo</span></plaintext>",
+ # xerces+cyberneko
+ "<span>foo</span>"
+ ]
+
+ assert_includes(acceptable_results, result)
end
def test_sanitize_script
@@ -187,16 +256,7 @@ def test_sanitize_javascript_href
def test_sanitize_image_src
raw = %{src="javascript:bang" foo, bar}
- assert_sanitized raw, %{src="javascript:bang" foo, bar}
- end
-
- tags = Loofah::HTML5::SafeList::ALLOWED_ELEMENTS - %w(script form)
- tags.each do |tag_name|
- define_method "test_should_allow_#{tag_name}_tag" do
- scope_allowed_tags(tags) do
- assert_sanitized "start <#{tag_name} title=\"1\" onclick=\"foo\">foo bar baz#{tag_name}> end", %(start <#{tag_name} title="1">foo bar baz#{tag_name}> end)
- end
- end
+ assert_sanitized raw, %{src="javascript:bang" foo, bar}
end
def test_should_allow_anchors
@@ -206,8 +266,20 @@ def test_should_allow_anchors
def test_video_poster_sanitization
scope_allowed_tags(%w(video)) do
scope_allowed_attributes %w(src poster) do
- assert_sanitized %(), %()
- assert_sanitized %(), %()
+ expected = if RUBY_PLATFORM == "java"
+ # xerces+nekohtml alphabetizes the attributes! FML.
+ %()
+ else
+ %()
+ end
+ assert_sanitized(
+ %(),
+ expected,
+ )
+ assert_sanitized(
+ %(),
+ %(),
+ )
end
end
end
@@ -219,16 +291,33 @@ def test_allow_colons_in_path_component
%w(src width height alt).each do |img_attr|
define_method "test_should_allow_image_#{img_attr}_attribute" do
- assert_sanitized %(), %()
+ assert_sanitized %(), %()
end
end
+ def test_lang_and_xml_lang
+ # https://html.spec.whatwg.org/multipage/dom.html#the-lang-and-xml:lang-attributes
+ #
+ # 3.2.6.2 The lang and xml:lang attributes
+ #
+ # ... Authors must not use the lang attribute in the XML namespace on HTML elements in HTML
+ # documents. To ease migration to and from XML, authors may specify an attribute in no namespace
+ # with no prefix and with the literal localname "xml:lang" on HTML elements in HTML documents,
+ # but such attributes must only be specified if a lang attribute in no namespace is also
+ # specified, and both attributes must have the same value when compared in an ASCII
+ # case-insensitive manner.
+ input = expected = "foo
"
+ assert_sanitized(input, expected)
+ end
+
def test_should_handle_non_html
assert_sanitized "abc"
end
def test_should_handle_blank_text
- [nil, "", " "].each { |blank| assert_sanitized blank }
+ assert_nil(safe_list_sanitize(nil))
+ assert_equal("", safe_list_sanitize(""))
+ assert_equal(" ", safe_list_sanitize(" "))
end
def test_setting_allowed_tags_affects_sanitization
@@ -287,6 +376,7 @@ def test_should_allow_custom_tags_with_custom_attributes
def test_scrub_style_if_style_attribute_option_is_passed
input = ''
actual = safe_list_sanitize(input, attributes: %w(style))
+
assert_includes(['', ''], actual)
end
@@ -313,28 +403,30 @@ def scrubber.scrub(node); node.name = "h1"; end
def test_should_accept_loofah_inheriting_scrubber
scrubber = Loofah::Scrubber.new
- def scrubber.scrub(node); node.name = "h1"; end
+ def scrubber.scrub(node); node.replace("#{node.inner_html}
"); end
html = ""
assert_equal "hello!
", safe_list_sanitize(html, scrubber: scrubber)
end
def test_should_accept_loofah_scrubber_that_wraps_a_block
- scrubber = Loofah::Scrubber.new { |node| node.name = "h1" }
+ scrubber = Loofah::Scrubber.new { |node| node.replace("#{node.inner_html}
") }
html = ""
assert_equal "hello!
", safe_list_sanitize(html, scrubber: scrubber)
end
def test_custom_scrubber_takes_precedence_over_other_options
- scrubber = Loofah::Scrubber.new { |node| node.name = "h1" }
+ scrubber = Loofah::Scrubber.new { |node| node.replace("#{node.inner_html}
") }
html = ""
assert_equal "hello!
", safe_list_sanitize(html, scrubber: scrubber, tags: ["foo"])
end
- [%w(img src), %w(a href)].each do |(tag, attr)|
- define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
- assert_sanitized %(<#{tag} #{attr}="javascript:bang" title="1">boo#{tag}>), %(<#{tag} title="1">boo#{tag}>)
- end
+ def test_should_strip_src_attribute_in_img_with_bad_protocols
+ assert_sanitized %(), %()
+ end
+
+ def test_should_strip_href_attribute_in_a_with_bad_protocols
+ assert_sanitized %(boo), %(boo)
end
def test_should_block_script_tag
@@ -366,7 +458,16 @@ def test_should_not_fall_for_xss_image_hack_with_uppercase_tags
end
def test_should_sanitize_tag_broken_up_by_null
- assert_sanitized %(alert(\"XSS\")), ""
+ input = %(alert(\"XSS\"))
+ result = safe_list_sanitize(input)
+ acceptable_results = [
+ # libxml2
+ "",
+ # xerces+neko
+ 'alert("XSS")',
+ ]
+
+ assert_includes(acceptable_results, result)
end
def test_should_sanitize_invalid_script_tag
@@ -375,7 +476,19 @@ def test_should_sanitize_invalid_script_tag
def test_should_sanitize_script_tag_with_multiple_open_brackets
assert_sanitized %(<), "<alert(\"XSS\");//<"
- assert_sanitized %(