From 47f62553135ef1190e4c1d769abf2de00b4b7aa9 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Wed, 17 May 2023 17:11:41 -0400 Subject: [PATCH] fix: ensure LinkSanitizer returns UTF-8 encoded strings Previously, 23622984 ensured SafeListSanitizer returned UTF-8 encoded strings, and 49dfc158 introduced FullSanitizer which did the same. This behavior is now being added to the remaining sanitizer, LinkSanitizer. --- CHANGELOG.md | 9 +++++++-- lib/rails/html/sanitizer.rb | 10 ++-------- test/sanitizer_test.rb | 18 ++++++++++++++++-- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b13d09..4600b79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,12 +32,17 @@ *Mike Dalessio* +* `LinkSanitizer` always returns UTF-8 encoded strings. `SafeListSanitizer` and `FullSanitizer` + already ensured this encoding. + + *Mike Dalessio* + * `SafeListSanitizer` allows `time` tag and `lang` attribute by default. *Mike Dalessio* -* `Rails::Html::XPATHS_TO_REMOVE` has been removed. It's not necessary with the existing sanitizers, - and should have been a private constant all along anyway. +* The constant `Rails::Html::XPATHS_TO_REMOVE` has been removed. It's not necessary with the + existing sanitizers, and should have been a private constant all along anyway. 
*Mike Dalessio* diff --git a/lib/rails/html/sanitizer.rb b/lib/rails/html/sanitizer.rb index a0ba4a3..a6c81f1 100644 --- a/lib/rails/html/sanitizer.rb +++ b/lib/rails/html/sanitizer.rb @@ -182,12 +182,6 @@ def serialize(fragment) properly_encode(fragment, encoding: "UTF-8") end end - - module SimpleString - def serialize(fragment) - fragment.to_s - end - end end end end @@ -242,7 +236,7 @@ class LinkSanitizer < Rails::HTML::Sanitizer include HTML::Concern::ComposedSanitize include HTML::Concern::Parser::HTML4 include HTML::Concern::Scrubber::Link - include HTML::Concern::Serializer::SimpleString + include HTML::Concern::Serializer::UTF8Encode end # == Rails::HTML4::SafeListSanitizer @@ -352,7 +346,7 @@ class LinkSanitizer < Rails::HTML::Sanitizer include HTML::Concern::ComposedSanitize include HTML::Concern::Parser::HTML5 include HTML::Concern::Scrubber::Link - include HTML::Concern::Serializer::SimpleString + include HTML::Concern::Serializer::UTF8Encode end # == Rails::HTML5::SafeListSanitizer diff --git a/test/sanitizer_test.rb b/test/sanitizer_test.rb index 3cde41a..4e7a66a 100644 --- a/test/sanitizer_test.rb +++ b/test/sanitizer_test.rb @@ -174,6 +174,13 @@ def test_full_sanitize_respect_html_escaping_of_the_given_string assert_equal "omg <script>BOM</script>", full_sanitize("omg <script>BOM</script>") end + def test_sanitize_ascii_8bit_string + full_sanitize("<div><a>hello</a></div>".encode("ASCII-8BIT")).tap do |sanitized| + assert_equal "hello", sanitized + assert_equal Encoding::UTF_8, sanitized.encoding + end + end + protected def full_sanitize(input, options = {}) module_under_test::FullSanitizer.new.sanitize(input, options) @@ -223,6 +230,13 @@ def test_strip_links_with_linkception assert_equal "Magic", link_sanitize("Magic") end + def test_sanitize_ascii_8bit_string + link_sanitize("<div><a>hello</a></div>".encode("ASCII-8BIT")).tap do |sanitized| + assert_equal "<div>hello</div>", sanitized + assert_equal Encoding::UTF_8, sanitized.encoding + end + end + protected def link_sanitize(input, options = {}) module_under_test::LinkSanitizer.new.sanitize(input, options) @@ -671,8 +685,8 @@ def test_x03a_legitimate end def test_sanitize_ascii_8bit_string - safe_list_sanitize("hello".encode("ASCII-8BIT")).tap do |sanitized| - assert_equal "hello", sanitized + safe_list_sanitize("<div><a>hello</a></div>".encode("ASCII-8BIT")).tap do |sanitized| + assert_equal "<div><a>hello</a></div>", sanitized assert_equal Encoding::UTF_8, sanitized.encoding end end