Skip to content

Commit

Permalink
Fixed normalization of percent-encoded paths.
Browse files Browse the repository at this point in the history
  • Loading branch information
sporkmonger committed Dec 24, 2009
1 parent d7a066c commit 48e0b24
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 52 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* added HTTP request URI methods
* better handling of Windows file paths
* validation_deferred boolean replaced with defer_validation block
* normalization of percent-encoded paths should now be correct

=== Addressable 2.1.1
* more type checking changes
Expand Down
4 changes: 2 additions & 2 deletions lib/addressable/template.rb
Original file line number Diff line number Diff line change
Expand Up @@ -800,9 +800,9 @@ def expand_list_operator(argument, variables, mapping, partial=false)
# @return [Array]
# A tuple of the operator, argument, variables, and mapping.
def parse_template_expansion(capture, mapping={})
operator, argument, variables = capture[1...-1].split("|")
operator, argument, variables = capture[1...-1].split("|", -1)
operator.gsub!(/^\-/, "")
variables = variables.split(",")
variables = variables.split(",", -1)
mapping = (variables.inject({}) do |accu, var|
varname, _, vardefault = var.scan(/^(.+?)(=(.*))?$/)[0]
accu[varname] = vardefault
Expand Down
134 changes: 86 additions & 48 deletions lib/addressable/uri.rb
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,71 @@ class << self
alias_method :unescape_component, :unencode
end


##
# Normalizes the encoding of a URI component.
#
# @param [String, #to_str] component The URI component to encode.
#
# @param [String, Regexp] character_class
#
# The characters which are not percent encoded. If a <tt>String</tt> is
# passed, the <tt>String</tt> must be formatted as a regular expression
# character class. (Do not include the surrounding square brackets.) For
# example, <tt>"b-zB-Z0-9"</tt> would cause everything but the letters 'b'
# through 'z' and the numbers '0' through '9' to be percent encoded. If a
# <tt>Regexp</tt> is passed, the value <tt>/[^b-zB-Z0-9]/</tt> would have
# the same effect. A set of useful <tt>String</tt> values may be found in
# the <tt>Addressable::URI::CharacterClasses</tt> module. The default
# value is the reserved plus unreserved character classes specified in <a
# href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
#
# @return [String] The normalized component.
#
# @example
# Addressable::URI.normalize_component("simpl%65/%65xampl%65", "b-zB-Z")
# => "simple%2Fex%61mple"
# Addressable::URI.normalize_component(
# "simpl%65/%65xampl%65", /[^b-zB-Z]/
# )
# => "simple%2Fex%61mple"
# Addressable::URI.normalize_component(
# "simpl%65/%65xampl%65",
# Addressable::URI::CharacterClasses::UNRESERVED
# )
# => "simple%2Fexample"
def self.normalize_component(component, character_class=
CharacterClasses::RESERVED + CharacterClasses::UNRESERVED)
return nil if component.nil?
if !component.respond_to?(:to_str)
raise TypeError, "Can't convert #{component.class} into String."
end
component = component.to_str
if ![String, Regexp].include?(character_class.class)
raise TypeError,
"Expected String or Regexp, got #{character_class.inspect}"
end
if character_class.kind_of?(String)
character_class = /[^#{character_class}]/
end
if component.respond_to?(:force_encoding)
# We can't perform regexps on invalid UTF sequences, but
# here we need to, so switch to ASCII.
component = component.dup
component.force_encoding(Encoding::ASCII_8BIT)
end
unencoded = self.unencode_component(component)
begin
encoded = self.encode_component(
Addressable::IDNA.unicode_normalize_kc(unencoded),
character_class
)
rescue ArgumentError
encoded = self.encode_component(unencoded)
end
return encoded
end

##
# Percent encodes any special characters in the URI.
#
Expand Down Expand Up @@ -561,10 +626,8 @@ def normalized_scheme
if self.scheme =~ /^\s*ssh\+svn\s*$/i
"svn+ssh"
else
Addressable::URI.encode_component(
Addressable::IDNA.unicode_normalize_kc(
Addressable::URI.unencode_component(
self.scheme.strip.downcase)),
Addressable::URI.normalize_component(
self.scheme.strip.downcase,
Addressable::URI::CharacterClasses::SCHEME
)
end
Expand Down Expand Up @@ -615,9 +678,8 @@ def normalized_user
(!self.password || self.password.strip == "")
nil
else
Addressable::URI.encode_component(
Addressable::IDNA.unicode_normalize_kc(
Addressable::URI.unencode_component(self.user.strip)),
Addressable::URI.normalize_component(
self.user.strip,
Addressable::URI::CharacterClasses::UNRESERVED
)
end
Expand Down Expand Up @@ -676,9 +738,8 @@ def normalized_password
(!self.user || self.user.strip == "")
nil
else
Addressable::URI.encode_component(
Addressable::IDNA.unicode_normalize_kc(
Addressable::URI.unencode_component(self.password.strip)),
Addressable::URI.normalize_component(
self.password.strip,
Addressable::URI::CharacterClasses::UNRESERVED
)
end
Expand Down Expand Up @@ -1044,19 +1105,14 @@ def path
# @return [String] The path component, normalized.
def normalized_path
@normalized_path ||= (begin
begin
result = Addressable::URI.encode_component(
Addressable::IDNA.unicode_normalize_kc(
Addressable::URI.unencode_component(self.path.strip)),
Addressable::URI::CharacterClasses::PATH
# String#split(delimiter, -1) keeps trailing empty fields, matching the
# stricter splitting behavior found in Python.
result = (self.path.strip.split("/", -1).map do |segment|
Addressable::URI.normalize_component(
segment,
Addressable::URI::CharacterClasses::PCHAR
)
rescue ArgumentError
# Likely a malformed UTF-8 character, skip unicode normalization
result = Addressable::URI.encode_component(
Addressable::URI.unencode_component(self.path.strip),
Addressable::URI::CharacterClasses::PATH
)
end
end).join("/")
result = self.class.normalize_path(result)
if result == "" &&
["http", "https", "ftp", "tftp"].include?(self.normalized_scheme)
Expand Down Expand Up @@ -1121,19 +1177,10 @@ def query
def normalized_query
@normalized_query ||= (begin
if self.query
begin
Addressable::URI.encode_component(
Addressable::IDNA.unicode_normalize_kc(
Addressable::URI.unencode_component(self.query.strip)),
Addressable::URI::CharacterClasses::QUERY
)
rescue ArgumentError
# Likely a malformed UTF-8 character, skip unicode normalization
Addressable::URI.encode_component(
Addressable::URI.unencode_component(self.query.strip),
Addressable::URI::CharacterClasses::QUERY
)
end
Addressable::URI.normalize_component(
self.query.strip,
Addressable::URI::CharacterClasses::QUERY
)
else
nil
end
Expand Down Expand Up @@ -1356,19 +1403,10 @@ def fragment
def normalized_fragment
@normalized_fragment ||= (begin
if self.fragment
begin
Addressable::URI.encode_component(
Addressable::IDNA.unicode_normalize_kc(
Addressable::URI.unencode_component(self.fragment.strip)),
Addressable::URI::CharacterClasses::FRAGMENT
)
rescue ArgumentError
# Likely a malformed UTF-8 character, skip unicode normalization
Addressable::URI.encode_component(
Addressable::URI.unencode_component(self.fragment.strip),
Addressable::URI::CharacterClasses::FRAGMENT
)
end
Addressable::URI.normalize_component(
self.fragment.strip,
Addressable::URI::CharacterClasses::FRAGMENT
)
else
nil
end
Expand Down
99 changes: 97 additions & 2 deletions spec/addressable/uri_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1116,6 +1116,34 @@ def to_s
end
end

# A percent-encoded slash ("%2F") inside a path segment is data, not a
# segment separator, so normalization must leave it encoded and the URI must
# stay distinct from the one with a literal slash.
describe Addressable::URI, "when parsed from " +
    "'http://example.com/path%2Fsegment/'" do
  before do
    @uri = Addressable::URI.parse("http://example.com/path%2Fsegment/")
  end

  it "should be considered to be in normal form" do
    normalized = @uri.normalize
    normalized.should be_eql(@uri)
  end

  it "should be equal to 'http://example.com/path%2Fsegment/'" do
    expected = Addressable::URI.parse("http://example.com/path%2Fsegment/")
    @uri.normalize.should be_eql(expected)
  end

  # Compares the un-normalized URI using ==.
  it "should not be equal to 'http://example.com/path/segment/'" do
    literal_slash = Addressable::URI.parse("http://example.com/path/segment/")
    @uri.should_not == literal_slash
  end

  # Compares the normalized URI using eql?.
  it "should not be equal to 'http://example.com/path/segment/'" do
    literal_slash = Addressable::URI.parse("http://example.com/path/segment/")
    @uri.normalize.should_not be_eql(literal_slash)
  end
end

describe Addressable::URI, "when parsed from " +
"'http://example.com/?%F6'" do
before do
Expand Down Expand Up @@ -3080,7 +3108,8 @@ def to_s
# Section 5.4.1 of RFC 3986
it "when joined with '#s' should resolve to http://a/b/c/d;p?q#s" do
(@uri + "#s").to_s.should == "http://a/b/c/d;p?q#s"
Addressable::URI.join(@uri.to_s, "#s").to_s.should == "http://a/b/c/d;p?q#s"
Addressable::URI.join(@uri.to_s, "#s").to_s.should ==
"http://a/b/c/d;p?q#s"
end

# Section 5.4.1 of RFC 3986
Expand Down Expand Up @@ -3177,7 +3206,8 @@ def to_s

it "when joined with '../.././../g' should resolve to http://a/g" do
(@uri + "../.././../g").to_s.should == "http://a/g"
Addressable::URI.join(@uri.to_s, "../.././../g").to_s.should == "http://a/g"
Addressable::URI.join(@uri.to_s, "../.././../g").to_s.should ==
"http://a/g"
end

# Section 5.4.2 of RFC 3986
Expand Down Expand Up @@ -3485,6 +3515,71 @@ def to_str
end
end

# Type handling of Addressable::URI.normalize_component: anything responding
# to to_str is accepted as the component, everything else raises a TypeError,
# and the character class must be a String or a Regexp.
describe Addressable::URI, "when normalizing a non-String object" do
  it "should correctly parse anything with a 'to_str' method" do
    # Make the expectation explicit: the call must complete without raising.
    (lambda do
      Addressable::URI.normalize_component(SuperString.new(42))
    end).should_not raise_error
  end

  it "should raise a TypeError for objects that cannot be converted" do
    # The class name in the message is taken from the object itself, so the
    # expectation holds whether the interpreter calls 42 a Fixnum or an
    # Integer.
    (lambda do
      Addressable::URI.normalize_component(42)
    end).should raise_error(
      TypeError, "Can't convert #{42.class} into String."
    )
  end

  it "should raise a TypeError for an invalid character class" do
    (lambda do
      Addressable::URI.normalize_component("component", 42)
    end).should raise_error(TypeError)
  end
end

# Normalizing a relative reference must keep "%2F" encoded inside its path
# segment instead of turning it into a segment separator.
describe Addressable::URI, "when normalizing a path with an encoded slash" do
  it "should result in correct percent encoded sequence" do
    normalized = Addressable::URI.parse("/path%2Fsegment/").normalize
    normalized.path.should == "/path%2Fsegment/"
  end
end

# Inputs that mix raw and percent-encoded characters must converge on the
# same normalized form under the default character class.
describe Addressable::URI, "when normalizing a partially encoded string" do
  # Raw space and raw '%' get encoded; the needlessly encoded '!' is decoded.
  it "should result in correct percent encoded sequence" do
    normalized = Addressable::URI.normalize_component("partially % encoded%21")
    normalized.should == "partially%20%25%20encoded!"
  end

  # An already encoded '%' stays "%25"; only the raw spaces change.
  it "should result in correct percent encoded sequence" do
    normalized = Addressable::URI.normalize_component("partially %25 encoded!")
    normalized.should == "partially%20%25%20encoded!"
  end
end

# "C%CC%A7" is 'C' followed by the UTF-8 bytes of U+0327 (combining cedilla);
# "%C3%87" is the UTF-8 encoding of the precomposed U+00C7. Both spellings
# must normalize to the precomposed form.
describe Addressable::URI, "when normalizing a unicode sequence" do
  # Decomposed input is composed during normalization.
  it "should result in correct percent encoded sequence" do
    normalized = Addressable::URI.normalize_component("/C%CC%A7")
    normalized.should == "/%C3%87"
  end

  # Precomposed input is already in normal form and is left unchanged.
  it "should result in correct percent encoded sequence" do
    normalized = Addressable::URI.normalize_component("/%C3%87")
    normalized.should == "/%C3%87"
  end
end

# A multibyte character must come out percent-encoded whether it arrives raw
# or already encoded.
describe Addressable::URI, "when normalizing a multibyte string" do
  # Raw multibyte input is percent encoded.
  it "should result in correct percent encoded sequence" do
    normalized = Addressable::URI.normalize_component("günther")
    normalized.should == "g%C3%BCnther"
  end

  # Already encoded input round-trips unchanged.
  it "should result in correct percent encoded sequence" do
    normalized = Addressable::URI.normalize_component("g%C3%BCnther")
    normalized.should == "g%C3%BCnther"
  end
end

describe Addressable::URI, "when encoding a multibyte string" do
it "should result in correct percent encoded sequence" do
Addressable::URI.encode_component("günther").should == "g%C3%BCnther"
Expand Down

0 comments on commit 48e0b24

Please sign in to comment.