From 4fe30391d81ff4a954d2e4420c7cb22413d15f89 Mon Sep 17 00:00:00 2001 From: Adrien Rey-Jarthon Date: Sat, 2 Jul 2022 12:01:52 +0200 Subject: [PATCH] fix "invalid byte sequence in UTF-8" exception when unencoding URLs containing non UTF-8 characters --- lib/addressable/uri.rb | 12 +++--------- spec/addressable/uri_spec.rb | 4 ++++ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/lib/addressable/uri.rb b/lib/addressable/uri.rb index b91d513a..619ffb58 100644 --- a/lib/addressable/uri.rb +++ b/lib/addressable/uri.rb @@ -469,19 +469,13 @@ def self.unencode(uri, return_type=String, leave_encoded='') "Expected Class (String or Addressable::URI), " + "got #{return_type.inspect}" end - uri = uri.dup - # Seriously, only use UTF-8. I'm really not kidding! - uri.force_encoding("utf-8") - unless leave_encoded.empty? - leave_encoded = leave_encoded.dup.force_encoding("utf-8") - end - - result = uri.gsub(/%[0-9a-f]{2}/iu) do |sequence| + result = uri.gsub(/%[0-9a-f]{2}/i) do |sequence| c = sequence[1..3].to_i(16).chr - c.force_encoding("utf-8") + c.force_encoding(sequence.encoding) leave_encoded.include?(c) ? sequence : c end + result.force_encoding("utf-8") if return_type == String return result diff --git a/spec/addressable/uri_spec.rb b/spec/addressable/uri_spec.rb index 00baaacf..0ac76016 100644 --- a/spec/addressable/uri_spec.rb +++ b/spec/addressable/uri_spec.rb @@ -5993,6 +5993,10 @@ def to_str expect(Addressable::URI.unencode_component("ski=%BA%DAɫ")).to eq("ski=\xBA\xDAɫ") end + it "should not fail with UTF-8 incompatible string (ISO-8859-1 encoding in this example)" do + expect(Addressable::URI.unencode_component("/M%E9thode/non/authoris\xE9e?param=\xFC".b)).to eq("/M\xE9thode/non/authoris\xE9e?param=\xFC") + end + it "should result in correct percent encoded sequence as a URI" do expect(Addressable::URI.unencode( "/path?g%C3%BCnther", ::Addressable::URI