Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Parse Content-Disposition (mostly) per RFC 2183. Issue #163

  • Loading branch information...
commit 987b2782aca1a21142fbc47a2da31cd182c7df32 1 parent 1269a92
@drbrain drbrain authored
View
4 Manifest.txt
@@ -45,6 +45,7 @@ lib/mechanize/http.rb
lib/mechanize/http/agent.rb
lib/mechanize/http/auth_challenge.rb
lib/mechanize/http/auth_realm.rb
+lib/mechanize/http/content_disposition_parser.rb
lib/mechanize/http/www_authenticate_parser.rb
lib/mechanize/monkey_patch.rb
lib/mechanize/page.rb
@@ -131,14 +132,17 @@ test/test_mechanize_form_field.rb
test/test_mechanize_form_file_upload.rb
test/test_mechanize_form_image_button.rb
test/test_mechanize_form_keygen.rb
+test/test_mechanize_form_multi_select_list.rb
test/test_mechanize_form_option.rb
test/test_mechanize_form_radio_button.rb
+test/test_mechanize_form_select_list.rb
test/test_mechanize_form_textarea.rb
test/test_mechanize_headers.rb
test/test_mechanize_history.rb
test/test_mechanize_http_agent.rb
test/test_mechanize_http_auth_challenge.rb
test/test_mechanize_http_auth_realm.rb
+test/test_mechanize_http_content_disposition_parser.rb
test/test_mechanize_http_www_authenticate_parser.rb
test/test_mechanize_link.rb
test/test_mechanize_page.rb
View
1  lib/mechanize.rb
@@ -1063,6 +1063,7 @@ def add_to_history(page)
require 'mechanize/http/agent'
require 'mechanize/http/auth_challenge'
require 'mechanize/http/auth_realm'
+require 'mechanize/http/content_disposition_parser'
require 'mechanize/http/www_authenticate_parser'
require 'mechanize/page'
require 'mechanize/monkey_patch'
View
188 lib/mechanize/http/content_disposition_parser.rb
@@ -0,0 +1,188 @@
+# coding: BINARY
+
+require 'strscan'
+require 'time'
+
+class Mechanize::HTTP
+ ContentDisposition = Struct.new :type, :filename, :creation_date,
+ :modification_date, :read_date, :size, :parameters
+end
+
+##
+# Parser for Content-Disposition headers that loosely follows RFC 2183.
+#
+# Beyond RFC 2183, this parser allows:
+#
+# * Missing disposition-type
+# * Multiple semicolons
+# * Whitespace around semicolons
+
+class Mechanize::HTTP::ContentDispositionParser
+
+ attr_accessor :scanner # :nodoc:
+
+ @parser = nil
+
+ ##
+ # Parses the disposition type and params in the +content_disposition+
+ # string. The "Content-Disposition:" must be removed.
+
+ def self.parse content_disposition
+ @parser ||= self.new
+ @parser.parse content_disposition
+ end
+
+ ##
+ # Creates a new parser Content-Disposition headers
+
+ def initialize
+ @scanner = nil
+ end
+
+ ##
+ # Parses the +content_disposition+ header. If +header+ is set to true the
+ # "Content-Disposition:" portion will be parsed
+
+ def parse content_disposition, header = false
+ return nil if content_disposition.empty?
+
+ @scanner = StringScanner.new content_disposition
+
+ if header then
+ return nil unless @scanner.scan(/Content-Disposition/i)
+ return nil unless @scanner.scan(/:/)
+ spaces
+ end
+
+ type = rfc_2045_token
+ @scanner.scan(/;+/)
+
+ if @scanner.peek(1) == '=' then
+ @scanner.pos = 0
+ type = nil
+ end
+
+ disposition = Mechanize::HTTP::ContentDisposition.new type
+
+ spaces
+
+ return nil unless parameters = parse_parameters
+
+ disposition.filename = parameters.delete 'filename'
+ disposition.creation_date = parameters.delete 'creation-date'
+ disposition.modification_date = parameters.delete 'modification-date'
+ disposition.read_date = parameters.delete 'read-date'
+ disposition.size = parameters.delete 'size'
+ disposition.parameters = parameters
+
+ disposition
+ end
+
+ ##
+ # Extracts disposition-parm and returns a Hash.
+
+ def parse_parameters
+ parameters = {}
+
+ while true do
+ return nil unless param = rfc_2045_token
+ param.downcase
+ return nil unless @scanner.scan(/=/)
+
+ value = case param
+ when /^filename$/ then
+ rfc_2045_value
+ when /^(creation|modification|read)-date$/ then
+ Time.rfc822 rfc_2045_quoted_string
+ when /^size$/ then
+ @scanner.scan(/\d+/).to_i(10)
+ else
+ rfc_2045_value
+ end
+
+ return nil unless value
+
+ parameters[param] = value
+
+ spaces
+
+ break if @scanner.eos? or not @scanner.scan(/;+/)
+
+ spaces
+ end
+
+ parameters
+ end
+
+ ##
+ # quoted-string = <"> *(qtext/quoted-pair) <">
+ # qtext = <any CHAR excepting <">, "\" & CR,
+ # and including linear-white-space
+ # quoted-pair = "\" CHAR
+ #
+ # Parses an RFC 2045 quoted-string
+
+ def rfc_2045_quoted_string
+ return nil unless @scanner.scan(/"/)
+
+ text = ''
+
+ while true do
+ chunk = @scanner.scan(/[\000-\014\016-\041\043-\133\135-\177]+/) # not \r "
+
+ if chunk then
+ text << chunk
+
+ if @scanner.peek(1) == '\\' then
+ @scanner.get_byte
+ return nil if @scanner.eos?
+ text << @scanner.get_byte
+ elsif @scanner.scan(/\r\n[\t ]+/) then
+ text << " "
+ end
+ else
+ if '"' == @scanner.peek(1) then
+ @scanner.get_byte
+ break
+ else
+ return nil
+ end
+ end
+ end
+
+ text
+ end
+
+ ##
+ # token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
+ #
+ # Parses an RFC 2045 token
+
+ def rfc_2045_token
+ @scanner.scan(/[^\000-\037\177()<>@,;:\\"\/\[\]?= ]+/)
+ end
+
+ ##
+ # value := token / quoted-string
+ #
+ # Parses an RFC 2045 value
+
+ def rfc_2045_value
+ if @scanner.peek(1) == '"' then
+ rfc_2045_quoted_string
+ else
+ rfc_2045_token
+ end
+ end
+
+ ##
+ # 1*SP
+ #
+ # Parses spaces
+
+ def spaces
+ @scanner.scan(/ +/)
+ end
+
+end
+
View
18 lib/mechanize/parser.rb
@@ -95,6 +95,8 @@ module Mechanize::Parser
# directory is given.
def extract_filename full_path = @full_path
+ handled = false
+
if @uri then
uri = @uri
uri += 'index.html' if uri.path.end_with? '/'
@@ -108,13 +110,17 @@ def extract_filename full_path = @full_path
# Set the filename
if disposition = @response['content-disposition'] then
- disposition.split(/;\s*/).each do |pair|
- k, v = pair.split(/=/, 2)
- filename = v if k && k.downcase == 'filename'
+ content_disposition =
+ Mechanize::HTTP::ContentDispositionParser.parse disposition
+
+ if content_disposition then
+ filename = content_disposition.filename
+ filename = filename.split(/[\\\/]/).last
+ handled = true
end
+ end
- filename = filename.split(/[\\\/]/).last
- elsif @uri then
+ if not handled and @uri then
filename << '.html' unless filename =~ /\./
filename << "?#{@uri.query}" if @uri.query
end
@@ -123,7 +129,7 @@ def extract_filename full_path = @full_path
filename = "_#{filename}"
end
- filename = filename.tr "\x00-\x1f<>:\"/\\|?*", '_'
+ filename = filename.tr "\x00-\x20<>:\"/\\|?*", '_'
@filename = if full_path then
File.join @uri.host, path, filename
View
138 test/test_mechanize_http_content_disposition_parser.rb
@@ -0,0 +1,138 @@
+require 'mechanize/test_case'
+
+class TestMechanizeHttpContentDispositionParser < Mechanize::TestCase
+
+ def setup
+ super
+
+ @parser = Mechanize::HTTP::ContentDispositionParser.new
+ end
+
+ def test_parse
+ now = Time.at Time.now.to_i
+
+ content_disposition = @parser.parse \
+ 'attachment;' \
+ 'filename=value;' \
+ "creation-date=\"#{now.rfc822}\";" \
+ "modification-date=\"#{(now + 1).rfc822}\";" \
+ "read-date=\"#{(now + 2).rfc822}\";" \
+ 'size=5;' \
+ 'arbitrary=value'
+
+ assert_equal 'attachment', content_disposition.type
+ assert_equal 'value', content_disposition.filename
+ assert_equal now, content_disposition.creation_date
+ assert_equal (now + 1), content_disposition.modification_date
+ assert_equal (now + 2), content_disposition.read_date
+ assert_equal 5, content_disposition.size
+ expected = { 'arbitrary' => 'value' }
+ assert_equal expected, content_disposition.parameters
+ end
+
+ def test_parse_header
+ now = Time.at Time.now.to_i
+
+ content_disposition = @parser.parse \
+ 'content-disposition: attachment;' \
+ 'filename=value;' \
+ "creation-date=\"#{now.rfc822}\";" \
+ "modification-date=\"#{(now + 1).rfc822}\";" \
+ "read-date=\"#{(now + 2).rfc822}\";" \
+ 'size=5;' \
+ 'arbitrary=value', true
+
+ assert_equal 'attachment', content_disposition.type
+ assert_equal 'value', content_disposition.filename
+ assert_equal now, content_disposition.creation_date
+ assert_equal (now + 1), content_disposition.modification_date
+ assert_equal (now + 2), content_disposition.read_date
+ assert_equal 5, content_disposition.size
+ expected = { 'arbitrary' => 'value' }
+ assert_equal expected, content_disposition.parameters
+ end
+
+ def test_parse_no_type
+ now = Time.at Time.now.to_i
+
+ content_disposition = @parser.parse \
+ 'filename=value'
+
+ assert_nil content_disposition.type
+ assert_equal 'value', content_disposition.filename
+ end
+
+ def test_parse_semicolons
+ now = Time.at Time.now.to_i
+
+ content_disposition = @parser.parse \
+ 'attachment;;filename=value'
+
+ assert_equal 'attachment', content_disposition.type
+ assert_equal 'value', content_disposition.filename
+ end
+
+ def test_rfc_2045_quoted_string
+ @parser.scanner = StringScanner.new '"text"'
+
+ string = @parser.rfc_2045_quoted_string
+
+ assert_equal 'text', string
+ end
+
+ def test_rfc_2045_quoted_string_bad
+ @parser.scanner = StringScanner.new '"text'
+
+ assert_nil @parser.rfc_2045_quoted_string
+ end
+
+ def test_rfc_2045_quoted_string_crlf
+ @parser.scanner = StringScanner.new "\"multiline\\\r\n\ttext\""
+
+ string = @parser.rfc_2045_quoted_string
+
+ assert_equal "multiline\r\n\ttext", string
+ end
+
+ def test_rfc_2045_quoted_string_escape
+ @parser.scanner = StringScanner.new "\"escape\\ text\""
+
+ string = @parser.rfc_2045_quoted_string
+
+ assert_equal 'escape text', string
+ end
+
+ def test_rfc_2045_quoted_string_escape_bad
+ @parser.scanner = StringScanner.new '"escape\\'
+
+ string = @parser.rfc_2045_quoted_string
+
+ assert_nil string
+ end
+
+ def test_rfc_2045_quoted_string_folded
+ @parser.scanner = StringScanner.new "\"multiline\r\n\ttext\""
+
+ string = @parser.rfc_2045_quoted_string
+
+ assert_equal 'multiline text', string
+ end
+
+ def test_rfc_2045_quoted_string_quote
+ @parser.scanner = StringScanner.new '"escaped \\" here"'
+
+ string = @parser.rfc_2045_quoted_string
+
+ assert_equal 'escaped " here', string
+ end
+
+ def test_rfc_2045_quoted_string_quote_end
+ @parser.scanner = StringScanner.new '"end \\""'
+
+ string = @parser.rfc_2045_quoted_string
+
+ assert_equal 'end "', string
+ end
+
+end
+
View
98 test/test_mechanize_parser.rb
@@ -30,29 +30,7 @@ def test_extract_filename_content_disposition
@parser.uri = URI 'http://example/foo'
@parser.response = {
- 'content-disposition' => 'attachment; filename=genome.jpeg; modification-date="Wed, 12 Feb 1997 16:29:51 -0500"'
- }
-
- assert_equal 'genome.jpeg', @parser.extract_filename
-
- @parser.response = {
- 'content-disposition' => 'filename=genome.jpeg; modification-date="Wed, 12 Feb 1997 16:29:51 -0500"'
- }
-
- assert_equal 'genome.jpeg', @parser.extract_filename
-
- @parser.response = {
- 'content-disposition' => 'filename=genome.jpeg'
- }
-
- assert_equal 'genome.jpeg', @parser.extract_filename
- end
-
- def test_extract_filename_content_disposition_bad
- @parser.uri = URI 'http://example/foo'
-
- @parser.response = {
- 'content-disposition' => 'attachment;; filename=genome.jpeg'
+ 'content-disposition' => 'attachment; filename=genome.jpeg'
}
assert_equal 'genome.jpeg', @parser.extract_filename
@@ -62,13 +40,13 @@ def test_extract_filename_content_disposition_path
@parser.uri = URI 'http://example'
@parser.response = {
- 'content-disposition' => 'attachment; filename=../genome.jpeg'
+ 'content-disposition' => 'attachment; filename="../genome.jpeg"'
}
assert_equal 'example/genome.jpeg', @parser.extract_filename(true)
@parser.response = {
- 'content-disposition' => 'attachment; filename=foo/genome.jpeg'
+ 'content-disposition' => 'attachment; filename="foo/genome.jpeg"'
}
assert_equal 'example/genome.jpeg', @parser.extract_filename(true)
@@ -78,13 +56,13 @@ def test_extract_filename_content_disposition_path_windows
@parser.uri = URI 'http://example'
@parser.response = {
- 'content-disposition' => 'attachment; filename=..\\genome.jpeg'
+ 'content-disposition' => 'attachment; filename="..\\\\genome.jpeg"'
}
assert_equal 'example/genome.jpeg', @parser.extract_filename(true)
@parser.response = {
- 'content-disposition' => 'attachment; filename=foo\\genome.jpeg'
+ 'content-disposition' => 'attachment; filename="foo\\\\genome.jpeg"'
}
assert_equal 'example/genome.jpeg', @parser.extract_filename(true)
@@ -94,37 +72,37 @@ def test_extract_filename_content_disposition_full_path
@parser.uri = URI 'http://example/foo'
@parser.response = {
- 'content-disposition' => 'attachment; filename=genome.jpeg; modification-date="Wed, 12 Feb 1997 16:29:51 -0500"'
+ 'content-disposition' => 'attachment; filename=genome.jpeg'
}
assert_equal 'example/genome.jpeg', @parser.extract_filename(true)
+ end
- @parser.response = {
- 'content-disposition' => 'filename=genome.jpeg; modification-date="Wed, 12 Feb 1997 16:29:51 -0500"'
- }
-
- assert_equal 'example/genome.jpeg', @parser.extract_filename(true)
+ def test_extract_filename_content_disposition_quoted
+ @parser.uri = URI 'http://example'
@parser.response = {
- 'content-disposition' => 'filename=genome.jpeg'
+ 'content-disposition' => 'attachment; filename="some \"file\""'
}
- assert_equal 'example/genome.jpeg', @parser.extract_filename(true)
+ assert_equal 'some__file_', @parser.extract_filename
end
def test_extract_filename_content_disposition_special
@parser.uri = URI 'http://example/foo'
@parser.response = {
- 'content-disposition' => 'attachment; filename=/\\<>:"|?*'
+ 'content-disposition' => 'attachment; filename="/\\\\<>:\\"|?*"'
}
assert_equal '_______', @parser.extract_filename
- chars = (0..31).map { |c| c.chr }.join
+ chars = (0..12).map { |c| c.chr }.join
+ chars += "\\\r"
+ chars += (14..31).map { |c| c.chr }.join
@parser.response = {
- 'content-disposition' => "attachment; filename=#{chars}"
+ 'content-disposition' => "attachment; filename=\"#{chars}\""
}
assert_equal '_' * 32, @parser.extract_filename
@@ -167,28 +145,6 @@ def test_extract_filename_content_disposition_windows_special
end
end
- def test_extract_filename_uri
- @parser.response = {}
- @parser.uri = URI 'http://example/foo'
-
- assert_equal 'foo.html', @parser.extract_filename
-
- @parser.uri += '/foo.jpg'
-
- assert_equal 'foo.jpg', @parser.extract_filename
- end
-
- def test_extract_filename_uri_full_path
- @parser.response = {}
- @parser.uri = URI 'http://example/foo'
-
- assert_equal 'example/foo.html', @parser.extract_filename(true)
-
- @parser.uri += '/foo.jpg'
-
- assert_equal 'example/foo.jpg', @parser.extract_filename(true)
- end
-
def test_extract_filename_host
@parser.response = {}
@parser.uri = URI 'http://example'
@@ -230,6 +186,28 @@ def test_extract_filename_special_character
assert_equal '_.html', @parser.extract_filename, 'asterisk'
end
+ def test_extract_filename_uri
+ @parser.response = {}
+ @parser.uri = URI 'http://example/foo'
+
+ assert_equal 'foo.html', @parser.extract_filename
+
+ @parser.uri += '/foo.jpg'
+
+ assert_equal 'foo.jpg', @parser.extract_filename
+ end
+
+ def test_extract_filename_uri_full_path
+ @parser.response = {}
+ @parser.uri = URI 'http://example/foo'
+
+ assert_equal 'example/foo.html', @parser.extract_filename(true)
+
+ @parser.uri += '/foo.jpg'
+
+ assert_equal 'example/foo.jpg', @parser.extract_filename(true)
+ end
+
def test_extract_filename_uri_query
@parser.response = {}
@parser.uri = URI 'http://example/?id=5'
Please sign in to comment.
Something went wrong with that request. Please try again.