fix: empty fragment encoding

Also improves test coverage around fragment encoding. Closes #2649.
sparklemotion · Sep 19, 2022 · a582143 · a582143
1 parent 75af7e8
commit a582143
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 36 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -42,6 +42,7 @@ This version of Nokogiri uses [`jar-dependencies`](https://github.com/mkristian/
 ### Fixed
 
 * `SAX::Parser`'s `encoding` attribute will not be clobbered when an alternative encoding is passed into `SAX::Parser#parse_io`. [[#1942](https://github.com/sparklemotion/nokogiri/issues/1942)] (Thanks, [@kp666](https://github.com/kp666)!)
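+* An empty `HTML4::DocumentFragment` now serializes to a string with the proper encoding (the `encoding` option passed to the serializer, otherwise the document's encoding). Previously the resulting empty string was encoded as `US-ASCII`. [[#2649](https://github.com/sparklemotion/nokogiri/issues/2649)]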
 * [CRuby] UTF-16-encoded documents longer than ~4000 code points now serialize properly. Previously the serialized document was corrupted when it exceeded the length of libxml2's internal string buffer. [[#752](https://github.com/sparklemotion/nokogiri/issues/752)]
 * [CRuby] The HTML5 parser now correctly handles text at the end of `form` elements.
 * [CRuby] `HTML5::Document#fragment` now always uses `body` as the parsing context. Previously, fragments were parsed in the context of the associated document's root node, which allowed for inconsistent parsing. [[#2553](https://github.com/sparklemotion/nokogiri/issues/2553)]

diff --git a/lib/nokogiri/xml/node_set.rb b/lib/nokogiri/xml/node_set.rb
@@ -280,7 +280,12 @@ def to_html(*args)
           options[:save_with] ||= Node::SaveOptions::DEFAULT_HTML
           args.insert(0, options)
         end
-        map { |x| x.to_html(*args) }.join
+        if empty?
+          encoding = (args.first.is_a?(Hash) ? args.first[:encoding] : nil) || document.encoding
+          "".encode(encoding)
+        else
+          map { |x| x.to_html(*args) }.join
+        end
       end
 
       ###

diff --git a/test/html4/test_document_fragment.rb b/test/html4/test_document_fragment.rb
@@ -15,19 +15,6 @@ def test_ascii_8bit_encoding
           assert_equal("hello", Nokogiri::HTML4::DocumentFragment.parse(s).to_html)
         end
 
-        def test_inspect_encoding
-          fragment = "<div>こんにちは！</div>".encode("EUC-JP")
-          f = Nokogiri::HTML4::DocumentFragment.parse(fragment)
-          assert_equal("こんにちは！", f.content)
-        end
-
-        def test_html_parse_encoding
-          fragment = "<div>こんにちは！</div>".encode("EUC-JP")
-          f = Nokogiri::HTML4.fragment(fragment)
-          assert_equal("EUC-JP", f.document.encoding)
-          assert_equal("こんにちは！", f.content)
-        end
-
         def test_unlink_empty_document
           frag = Nokogiri::HTML4::DocumentFragment.parse("").unlink # must_not_raise
           assert_nil(frag.parent)
@@ -38,20 +25,6 @@ def test_colons_are_not_removed
           assert_match(/3:30/, doc.to_s)
         end
 
-        def test_parse_encoding
-          fragment = "<div>hello world</div>"
-          f = Nokogiri::HTML4::DocumentFragment.parse(fragment, "ISO-8859-1")
-          assert_equal("ISO-8859-1", f.document.encoding)
-          assert_equal("hello world", f.content)
-        end
-
-        def test_html_parse_with_encoding
-          fragment = "<div>hello world</div>"
-          f = Nokogiri::HTML4.fragment(fragment, "ISO-8859-1")
-          assert_equal("ISO-8859-1", f.document.encoding)
-          assert_equal("hello world", f.content)
-        end
-
         def test_parse_in_context
           assert_equal("<br>", html.root.parse("<br />").to_s)
         end
@@ -76,14 +49,6 @@ def test_ancestors_search
           assert(li.matches?("li"))
         end
 
-        def test_fun_encoding
-          string = %(<body>こんにちは</body>)
-          html = Nokogiri::HTML4::DocumentFragment.parse(
-            string
-          ).to_html(encoding: "UTF-8")
-          assert_equal(string, html)
-        end
-
         def test_new
           assert(Nokogiri::HTML4::DocumentFragment.new(html))
         end
@@ -306,6 +271,62 @@ def test_dup_should_create_an_html_document_fragment
           assert_instance_of(Nokogiri::HTML4::DocumentFragment, duplicate)
         end
 
+        describe "encoding" do
+          describe "#fragment" do
+            it "parses an encoded string" do
+              input = "<div>こんにちは！</div>".encode("EUC-JP")
+              fragment = Nokogiri::HTML4.fragment(input)
+              assert_equal("EUC-JP", fragment.document.encoding)
+              assert_equal("こんにちは！", fragment.content)
+            end
+
+            it "returns a string matching the passed encoding" do
+              input = "<div>hello world</div>"
+
+              fragment = Nokogiri::HTML4.fragment(input, "ISO-8859-1")
+              assert_equal("ISO-8859-1", fragment.document.encoding)
+              assert_equal("hello world", fragment.content)
+            end
+          end
+
+          describe "#parse" do
+            it "parses an encoded string" do
+              input = "<div>こんにちは！</div>".encode("EUC-JP")
+
+              fragment = Nokogiri::HTML4::DocumentFragment.parse(input)
+              assert_equal("EUC-JP", fragment.document.encoding)
+              assert_equal("こんにちは！", fragment.content)
+            end
+
+            it "returns a string matching the passed encoding" do
+              input = "<div>hello world</div>"
+
+              fragment = Nokogiri::HTML4::DocumentFragment.parse(input, "ISO-8859-1")
+              assert_equal("ISO-8859-1", fragment.document.encoding)
+              assert_equal("hello world", fragment.content)
+            end
+
+            it "respects encoding for empty strings" do
+              fragment = Nokogiri::HTML::DocumentFragment.parse("", "UTF-8")
+              assert_equal "UTF-8", fragment.to_html.encoding.to_s
+
+              fragment = Nokogiri::HTML::DocumentFragment.parse("", "US-ASCII")
+              assert_equal "US-ASCII", fragment.to_html.encoding.to_s
+
+              fragment = Nokogiri::HTML::DocumentFragment.parse("", "ISO-8859-1")
+              assert_equal "ISO-8859-1", fragment.to_html.encoding.to_s
+            end
+          end
+
+          describe "#to_html" do
+            it "serializes empty strings with the passed encoding" do
+              fragment = Nokogiri::HTML::DocumentFragment.parse("", "UTF-8")
+              assert_equal "ISO-8859-1", fragment.to_html(encoding: "ISO-8859-1").encoding.to_s
+              assert_equal "US-ASCII", fragment.to_html(encoding: "US-ASCII").encoding.to_s
+            end
+          end
+        end
+
         describe "parse options" do
           let(:html4_default) do
             Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::DEFAULT_HTML)