Permalink
Browse files

Make normalize ignore %2B in query strings

In a query string, '+' is reserved as a shorthand for space, so "real"
pluses encoded as %2b must remain percent-encoded (upcased to %2B) when the
query is normalized; a %2b in the path is still decoded to '+':

  http://example.com/one%2btwo/calc?q=1%2b2+2%2b3

is normalized as:

  http://example.com/one+two/calc?q=1%2B2+2%2B3

Previously this would have been normalized to:

  http://example.com/one+two/calc?q=1+2+2+3

making '+' ambiguous: a literal plus could no longer be told apart from an
encoded space.
  • Loading branch information...
tps12 committed Dec 18, 2012
1 parent dc1871b commit 08d79da02b35d943a1ead7e43369a008b31157af
Showing with 54 additions and 6 deletions.
  1. +35 −6 lib/addressable/uri.rb
  2. +19 −0 spec/addressable/uri_spec.rb
View
@@ -380,11 +380,16 @@ class << self
# <code>Addressable::URI</code>. All other values are invalid. Defaults
# to <code>String</code>.
#
+ # @param [String] leave_encoded
+ # A string of characters to leave encoded. If one of these characters is
+ # encountered in percent-encoded form, its encoding is upcased ("%2b"
+ # becomes "%2B") but it otherwise remains percent encoded.
+ #
# @return [String, Addressable::URI]
# The unencoded component or URI.
# The return type is determined by the <code>return_type</code>
# parameter.
- def self.unencode(uri, return_type=String)
+ def self.unencode(uri, return_type=String, leave_encoded='')
return nil if uri.nil?
begin
@@ -398,7 +403,8 @@ def self.unencode(uri, return_type=String)
"got #{return_type.inspect}"
end
result = uri.gsub(/%[0-9a-f]{2}/i) do |sequence|
- sequence[1..3].to_i(16).chr
+ c = sequence[1..3].to_i(16).chr
+ leave_encoded.include?(c) ? sequence.upcase : c
end
result.force_encoding("utf-8") if result.respond_to?(:force_encoding)
if return_type == String
@@ -433,6 +439,13 @@ class << self
# value is the reserved plus unreserved character classes specified in
# <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
#
+ # @param [String] leave_encoded
+ # When <code>character_class</code> is a <code>String</code> then
+ # <code>leave_encoded</code> is a string of characters that should remain
+ # percent encoded while normalizing the component; if they appear percent
+ # encoded in the original component, then they will be upcased ("%2f"
+ # normalized to "%2F") but otherwise left alone.
+ #
# @return [String] The normalized component.
#
# @example
@@ -447,8 +460,15 @@ class << self
# Addressable::URI::CharacterClasses::UNRESERVED
# )
# => "simple%2Fexample"
+ # Addressable::URI.normalize_component(
+ # "one%20two%2fthree%26four",
+ # "0-9a-zA-Z &/",
+ # "/"
+ # )
+ # => "one two%2Fthree&four"
def self.normalize_component(component, character_class=
- CharacterClasses::RESERVED + CharacterClasses::UNRESERVED)
+ CharacterClasses::RESERVED + CharacterClasses::UNRESERVED,
+ leave_encoded='')
return nil if component.nil?
begin
@@ -462,15 +482,23 @@ def self.normalize_component(component, character_class=
"Expected String or Regexp, got #{character_class.inspect}"
end
if character_class.kind_of?(String)
- character_class = /[^#{character_class}]/
+ leave_re = if leave_encoded.length > 0
+ character_class << '%'
+
+ "|%(?!#{leave_encoded.chars.map do |c|
+ c.unpack('C*').map { |c| ('%02x' % c).upcase }.join
+ end.join('|')})"
+ end
+
+ character_class = /[^#{character_class}]#{leave_re}/
end
if component.respond_to?(:force_encoding)
# We can't perform regexps on invalid UTF sequences, but
# here we need to, so switch to ASCII.
component = component.dup
component.force_encoding(Encoding::ASCII_8BIT)
end
- unencoded = self.unencode_component(component)
+ unencoded = self.unencode_component(component, String, leave_encoded)
begin
encoded = self.encode_component(
Addressable::IDNA.unicode_normalize_kc(unencoded),
@@ -1391,7 +1419,8 @@ def normalized_query
(self.query.split("&", -1).map do |pair|
Addressable::URI.normalize_component(
pair,
- Addressable::URI::CharacterClasses::QUERY.sub("\\&", "")
+ Addressable::URI::CharacterClasses::QUERY.sub("\\&", ""),
+ '+'
)
end).join("&")
end)
@@ -3146,6 +3146,10 @@ def to_s
it "should have query_values of {'q' => 'a b'}" do
@uri.query_values.should == {'q' => 'a b'}
end
+
+ it "should have a normalized query of 'q=a+b'" do
+ @uri.normalized_query.should == "q=a+b"
+ end
end
describe Addressable::URI, "when parsed from " +
@@ -3161,6 +3165,21 @@ def to_s
it "should have query_values of {'q' => 'a+b'}" do
@uri.query_values.should == {'q' => 'a+b'}
end
+
+ it "should have a normalized query of 'q=a%2Bb'" do
+ @uri.normalized_query.should == "q=a%2Bb"
+ end
+end
+
+describe Addressable::URI, "when parsed from " +
+ "'http://example.com/sound%2bvision'" do
+ before do
+ @uri = Addressable::URI.parse("http://example.com/sound%2bvision")
+ end
+
+ it "should have a normalized path of '/sound+vision'" do
+ @uri.normalized_path.should == '/sound+vision'
+ end
end
describe Addressable::URI, "when parsed from " +

0 comments on commit 08d79da

Please sign in to comment.