Permalink
Browse files

Move normalization logic out of URI::unencode

Instead of upcasing leave_encoded characters inside the unencode call,
leave them as they are and pass the list on to encode_component for
upcasing.

This encapsulation keeps unencode free of any normalization logic.

Also added some more test cases around leave_encoded handling.
  • Loading branch information...
1 parent 08d79da commit 37131bee398bc36ea99f3c42ee03f14260d10a56 @tps12 tps12 committed Dec 28, 2012
Showing with 71 additions and 8 deletions.
  1. +22 −8 lib/addressable/uri.rb
  2. +49 −0 spec/addressable/uri_spec.rb
View
@@ -315,6 +315,12 @@ def self.join(*uris)
# value is the reserved plus unreserved character classes specified in
# <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
#
+ # @param [Regexp] upcase_encoded
+ # A string of characters that may already be percent encoded, and whose
+ # encodings should be upcased. This allows normalization of percent
+ # encodings for characters not included in the
+ # <code>character_class</code>.
+ #
# @return [String] The encoded component.
#
# @example
@@ -327,7 +333,8 @@ def self.join(*uris)
# )
# => "simple%2Fexample"
def self.encode_component(component, character_class=
- CharacterClasses::RESERVED + CharacterClasses::UNRESERVED)
+ CharacterClasses::RESERVED + CharacterClasses::UNRESERVED,
+ upcase_encoded='')
return nil if component.nil?
begin
@@ -356,9 +363,15 @@ def self.encode_component(component, character_class=
component = component.dup
component.force_encoding(Encoding::ASCII_8BIT)
end
- return component.gsub(character_class) do |sequence|
+ component.gsub!(character_class) do |sequence|
(sequence.unpack('C*').map { |c| "%" + ("%02x" % c).upcase }).join
end
+ if upcase_encoded.length > 0
+ component.gsub!(/%(#{upcase_encoded.chars.map do |c|
+ c.unpack('C*').map { |c| '%02x' % c }.join
+ end.join('|')})/i) { |s| s.upcase }
+ end
+ return component
end
class << self
@@ -382,8 +395,7 @@ class << self
#
# @param [String] leave_encoded
# A string of characters to leave encoded. If a percent encoded character
- # is encountered then its encoded form will be upcased, but otherwise
- # will remain percent encoded.
+ # in this list is encountered then it will remain percent encoded.
#
# @return [String, Addressable::URI]
# The unencoded component or URI.
@@ -404,7 +416,7 @@ def self.unencode(uri, return_type=String, leave_encoded='')
end
result = uri.gsub(/%[0-9a-f]{2}/i) do |sequence|
c = sequence[1..3].to_i(16).chr
- leave_encoded.include?(c) ? sequence.upcase : c
+ leave_encoded.include?(c) ? sequence : c
end
result.force_encoding("utf-8") if result.respond_to?(:force_encoding)
if return_type == String
@@ -486,8 +498,9 @@ def self.normalize_component(component, character_class=
character_class << '%'
"|%(?!#{leave_encoded.chars.map do |c|
- c.unpack('C*').map { |c| ('%02x' % c).upcase }.join
- end.join('|')})"
+ seq = c.unpack('C*').map { |c| '%02x' % c }.join
+ [seq.upcase, seq.downcase]
+ end.flatten.join('|')})"
end
character_class = /[^#{character_class}]#{leave_re}/
@@ -502,7 +515,8 @@ def self.normalize_component(component, character_class=
begin
encoded = self.encode_component(
Addressable::IDNA.unicode_normalize_kc(unencoded),
- character_class
+ character_class,
+ leave_encoded
)
rescue ArgumentError
encoded = self.encode_component(unencoded)
@@ -3171,6 +3171,28 @@ def to_s
end
end
+describe Addressable::URI, "when parsed from " +
+ "'http://example.com/?v=%7E&w=%&x=%25&y=%2B&z=C%CC%A7'" do
+ before do
+ @uri = Addressable::URI.parse("http://example.com/?v=%7E&w=%&x=%25&y=%2B&z=C%CC%A7")
+ end
+
+ it "should have a normalized query of 'v=~&w=%25&x=%25&y=%2B&z=%C3%87'" do
+ @uri.normalized_query.should == "v=~&w=%25&x=%25&y=%2B&z=%C3%87"
+ end
+end
+
+describe Addressable::URI, "when parsed from " +
+ "'http://example.com/?v=%7E&w=%&x=%25&y=+&z=C%CC%A7'" do
+ before do
+ @uri = Addressable::URI.parse("http://example.com/?v=%7E&w=%&x=%25&y=+&z=C%CC%A7")
+ end
+
+ it "should have a normalized query of 'v=~&w=%25&x=%25&y=+&z=%C3%87'" do
+ @uri.normalized_query.should == "v=~&w=%25&x=%25&y=+&z=%C3%87"
+ end
+end
+
describe Addressable::URI, "when parsed from " +
"'http://example.com/sound%2bvision'" do
before do
@@ -4638,6 +4660,19 @@ def to_str
end
end
+describe Addressable::URI, "when normalizing a string but leaving some characters encoded" do
@sporkmonger

sporkmonger Mar 4, 2013

Owner

I'm not sure I understand these two tests.

+ it "should result in correct percent encoded sequence" do
+ Addressable::URI.normalize_component("%58X%59Y%5AZ", "0-9a-zXY", "Y").should ==
+ "XX%59Y%5A%5A"
+ end
+end
+
+describe Addressable::URI, "when encoding a string with existing encodings to upcase" do
+ it "should result in correct percent encoded sequence" do
+ Addressable::URI.encode_component("JK%4c", "0-9A-IKM-Za-z%", "L").should == "%4AK%4C"
+ end
+end
+
describe Addressable::URI, "when encoding a multibyte string" do
it "should result in correct percent encoded sequence" do
Addressable::URI.encode_component("günther").should == "g%C3%BCnther"
@@ -4683,6 +4718,20 @@ def to_str
end
end
+describe Addressable::URI, "when partially unencoding a string" do
+ it "should unencode all characters by default" do
+ Addressable::URI.unencode('%%25~%7e+%2b', String).should == '%%~~++'
+ end
+
+ it "should unencode characters not in leave_encoded" do
+ Addressable::URI.unencode('%%25~%7e+%2b', String, '~').should == '%%~%7e++'
+ end
+
+ it "should leave characters in leave_encoded alone" do
+ Addressable::URI.unencode('%%25~%7e+%2b', String, '%~+').should == '%%25~%7e+%2b'
+ end
+end
+
describe Addressable::URI, "when unencoding a bogus object" do
it "should raise a TypeError" do
(lambda do

6 comments on commit 37131be

Although this commit seems only to be refactoring, it changed the behaviour of encode_component method. encode_component before was returning a new object without modifying passed component. Now with two gsub replaced with gsub! it modifies passed object.

It's not a problem for Ruby >= 1.9 where string is cloned if it responds to :force_encoding but this caused me problem in Ruby < 1.9

Owner

sporkmonger replied Mar 4, 2013

I'm in the process of removing all usage of gsub! from the library in favor of gsub. I haven't investigated this bug yet, but most of the tests you've submitted look right. Please split the tests out into their own commit and rewrite the fix somehow without using gsub!.

Owner

sporkmonger replied Mar 4, 2013

Oh, nevermind, misread, thought this was a pull request. Hmm, yeah. I'll take a closer look and attempt to revise.

Contributor

tps12 replied Mar 5, 2013

Agh, good catch, sorry about that. You're right, no reason to be messing with the input string there at all.

Owner

sporkmonger replied Mar 31, 2013

This should be resolved by 64f82f2.

Please sign in to comment.