Skip to content

Prevent parameters and uploaded filenames with invalid UTF-8 bytes from entering into rails. #3789

Closed
wants to merge 3 commits into from
View
21 actionpack/lib/action_dispatch/http/encoder.rb
@@ -0,0 +1,21 @@
+module ActionDispatch
+ module Encoder
+
+ # Beware that it modifies the parameter
+ def self.encode_to_internal(str)
+ str.force_encoding(Encoding::UTF_8)
+ if !str.valid_encoding?
+ replace_invalid_characters(str)
@dvandersluis
dvandersluis added a note Feb 9, 2012

I believe this can be simplified:

str.force_encoding("BINARY").encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")

The options given to encode will cause invalid or undefined characters to be replaced by ?s.

@benmmurphy
benmmurphy added a note Feb 9, 2012

this change works correctly for invalid characters:

ruby-1.9.2-p0 :003 > "search=foo\xC0\x8Abar".force_encoding("BINARY").encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
 => "search=foo??bar" 

but works incorrectly for valid utf-8 sequences:

ruby-1.9.2-p0 :005 > "£".encoding 
=> #<Encoding:UTF-8> 
ruby-1.9.2-p0 :004 > "£".force_encoding("BINARY").encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
=> "??" 
@dvandersluis
dvandersluis added a note Feb 9, 2012

Crap, I didn't realize that Encoding::BINARY was an alias for Encoding::ASCII_8BIT, which would make that make perfect sense. It's silly that this is such a process when with Iconv you could just convert using 'UTF-8//IGNORE'... but that's been deprecated in Ruby 1.9.3.

@dvandersluis
dvandersluis added a note Feb 9, 2012

How about:

 str = str.chars.map{ |c| c.force_encoding("UTF-8").valid_encoding? ? c : "?" }.join
@benmmurphy
benmmurphy added a note Feb 9, 2012

we would need to move the force_encoding out of the loop (it doesn't handle valid utf8 correctly) and for perf reasons we should keep the if the "if !str.valid_encoding?" statement if you were planning to remove that as well. the "if" statement makes it go much faster and saves a lot of object allocations. the difference between the loop and the map+join is noise but i think the if statement is worth having.

# encoding: utf-8
require "benchmark"

Encoding.default_internal = Encoding::UTF_8

def encode1(str)
  str.force_encoding(Encoding::UTF_8)
  if !str.valid_encoding?
    replace_invalid_characters(str)
  end
  str
end

def encode2(str)
  str.chars.map{ |c| c.force_encoding("UTF-8").valid_encoding? ? c : "?" }.join
end

def encode3(str)
  str.force_encoding(Encoding::UTF_8)
  if str.valid_encoding?
    str
  else
    str.chars.map{ |c| c.valid_encoding? ? c : "?" }.join
  end
end

def self.replace_invalid_characters(str)
  for i in (0...str.size)
    if !str[i].valid_encoding?
      str[i] = "?"
    end
  end
end

N = 10000
VALID = "foo£bar£car".encode(Encoding::UTF_8).force_encoding(Encoding::ASCII_8BIT)

puts encode1(VALID.dup)
puts encode2(VALID.dup)
puts encode3(VALID.dup)

Benchmark.bmbm do |x|
  x.report { for i in (1..N); encode1(VALID.dup); end }
  x.report { for i in (1..N); encode2(VALID.dup); end }
  x.report { for i in (1..N); encode3(VALID.dup); end }

end

INVALID = "£foo\xC0\x8Abar\xC0\x8Acar".force_encoding(Encoding::ASCII_8BIT)

puts encode1(INVALID.dup)
puts encode2(INVALID.dup)
puts encode3(INVALID.dup)

Benchmark.bmbm do |x|
  x.report { for i in (1..N); encode1(INVALID.dup); end }
  x.report { for i in (1..N); encode2(INVALID.dup); end }
  x.report { for i in (1..N); encode2(INVALID.dup); end }

end

foo£bar£car
foo??bar??car
foo£bar£car
Rehearsal ------------------------------------
   0.010000   0.000000   0.010000 (  0.009336)
   0.140000   0.000000   0.140000 (  0.148043)
   0.010000   0.000000   0.010000 (  0.009161)
--------------------------- total: 0.160000sec

       user     system      total        real
   0.010000   0.000000   0.010000 (  0.009209)
   0.140000   0.000000   0.140000 (  0.148545)
   0.010000   0.000000   0.010000 (  0.008394)
£foo??bar??car
??foo??bar??car
£foo??bar??car
Rehearsal ------------------------------------
   0.120000   0.000000   0.120000 (  0.117483)
   0.160000   0.000000   0.160000 (  0.180637)
   0.170000   0.010000   0.180000 (  0.165589)
--------------------------- total: 0.460000sec

       user     system      total        real
   0.120000   0.000000   0.120000 (  0.124591)
   0.160000   0.000000   0.160000 (  0.170945)
   0.160000   0.000000   0.160000 (  0.167029)
@tamird
tamird added a note Oct 20, 2013

disregard my last comment, setbyte doesn't work for obvious reasons. buffer or map-join are the way to go:

# encoding: utf-8
require "benchmark"

Encoding.default_internal = Encoding::UTF_8

def index(str)
  str.force_encoding(Encoding::UTF_8).tap do |s|
    unless s.valid_encoding?
      s.each_char.with_index do |c, i|
        s[i] = '?' unless c.valid_encoding?
      end
    end
  end
end

def map_join(str)
  if str.force_encoding(Encoding::UTF_8).valid_encoding?
    str
  else
    str.each_char.map do |c|
      if c.valid_encoding?
        c
      else
        '?'
      end
    end.join
  end
end

def buffer(str)
  if str.force_encoding(Encoding::UTF_8).valid_encoding?
    str
  else
    ''.tap do |buf|
      str.each_char do |c|
        buf.concat(c.valid_encoding? ? c : 63)
      end
    end
  end
end

N = 2000
VALID = "foo£bar£car".encode(Encoding::UTF_8).force_encoding(Encoding::ASCII_8BIT)

puts "VALID: #{VALID.dup}"
puts "index: #{index(VALID.dup)}"
puts "map-join: #{map_join(VALID.dup)}"
puts "buffer: #{buffer(VALID.dup)}"

Benchmark.bmbm do |x|
  x.report('index') { index(VALID.dup * N) }
  x.report('map-join') { map_join(VALID.dup * N) }
  x.report('buffer') { buffer(VALID.dup * N) }
end

INVALID = "£foo\xC0\x8Abar\xC0\x8Acar".force_encoding(Encoding::ASCII_8BIT)

puts "INVALID: #{INVALID.dup}"
puts "index: #{index(INVALID.dup)}"
puts "map-join: #{map_join(INVALID.dup)}"
puts "buffer: #{buffer(INVALID.dup)}"

Benchmark.bmbm do |x|
  x.report('index') { index(INVALID.dup * N) }
  x.report('map-join') { map_join(INVALID.dup * N) }
  x.report('buffer') { buffer(INVALID.dup * N) }
end

results: (showing only invalid case for brevity)
MRI 1.9.3:

INVALID: £foo�bar�car
index: £foo??bar??car
map-join: £foo??bar??car
buffer: £foo??bar??car
Rehearsal --------------------------------------------
index      0.920000   0.000000   0.920000 (  0.920054)
map-join   0.010000   0.000000   0.010000 (  0.014144)
buffer     0.020000   0.000000   0.020000 (  0.022648)
----------------------------------- total: 0.950000sec

               user     system      total        real
index      0.890000   0.000000   0.890000 (  0.898875)
map-join   0.010000   0.000000   0.010000 (  0.008975)
buffer     0.020000   0.000000   0.020000 (  0.016540)

MRI 2.0.0:

INVALID: £foo�bar�car
index: £foo??bar??car
map-join: £foo??bar??car
buffer: £foo??bar??car
Rehearsal --------------------------------------------
index      0.810000   0.000000   0.810000 (  0.815064)
map-join   0.010000   0.000000   0.010000 (  0.010653)
buffer     0.020000   0.000000   0.020000 (  0.017207)
----------------------------------- total: 0.840000sec

               user     system      total        real
index      0.770000   0.010000   0.780000 (  0.774781)
map-join   0.010000   0.000000   0.010000 (  0.008739)
buffer     0.020000   0.000000   0.020000 (  0.012607)

JRuby 1.7.5:

INVALID: £foo�bar�car
index: £foo??bar??car
map-join: £foo??bar??car
buffer: £foo??bar??car
Rehearsal --------------------------------------------
index      1.270000   0.010000   1.280000 (  1.192000)
map-join   0.050000   0.000000   0.050000 (  0.036000)
buffer     0.020000   0.000000   0.020000 (  0.017000)
----------------------------------- total: 1.350000sec

               user     system      total        real
index      1.150000   0.000000   1.150000 (  1.147000)
map-join   0.020000   0.000000   0.020000 (  0.018000)
buffer     0.020000   0.000000   0.020000 (  0.013000)

RBX 2.1.1:

INVALID: £foo�bar�car
index: £foo??bar??car
map-join: £foo??bar??car
buffer: £foo??bar??car
Rehearsal --------------------------------------------
index      7.347737   0.022950   7.370687 (  7.509184)
map-join   2.457528   0.011470   2.468998 (  2.544675)
buffer     2.469490   0.007916   2.477406 (  2.542838)
---------------------------------- total: 12.317091sec

               user     system      total        real
index      7.394374   0.025542   7.419916 (  7.813794)
map-join   2.404040   0.011272   2.415312 (  2.460207)
buffer     2.457256   0.010527   2.467783 (  2.650314)

So map-join is faster in MRI, buffer is faster in JRuby, and RBX is just terribly slow at this string manipulation :( cc @brixen

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ end
+ str.encode!
+ end
+
+ def self.replace_invalid_characters(str)
+ for i in (0...str.size)
+ if !str[i].valid_encoding?
+ str[i] = "?"
+ end
+ end
+ end
+ end
+end
View
6 actionpack/lib/action_dispatch/http/parameters.rb
@@ -50,12 +50,10 @@ def reset_parameters #:nodoc:
private
- # TODO: Validate that the characters are UTF-8. If they aren't,
- # you'll get a weird error down the road, but our form handling
- # should really prevent that from happening
+
def encode_params(params)
if params.is_a?(String)
- return params.force_encoding("UTF-8").encode!
+ return ActionDispatch::Encoder::encode_to_internal(params)
elsif !params.is_a?(Hash)
return params
end
View
1 actionpack/lib/action_dispatch/http/request.rb
@@ -2,6 +2,7 @@
require 'active_support/inflector'
require 'action_dispatch/http/headers'
+require 'action_dispatch/http/encoder'
require 'action_controller/metal/exceptions'
require 'rack/request'
require 'action_dispatch/http/cache'
View
6 actionpack/lib/action_dispatch/http/upload.rb
@@ -70,7 +70,11 @@ def eof?
def encode_filename(filename)
# Encode the filename in the utf8 encoding, unless it is nil
- filename.force_encoding("UTF-8").encode! if filename
+ if filename
+ ActionDispatch::Encoder::encode_to_internal(filename)
+ else
+ filename
+ end
end
end
View
11 actionpack/test/dispatch/request/multipart_params_parsing_test.rb
@@ -82,17 +82,6 @@ def teardown
assert_equal 19512, file.size
end
- test "parses mixed files" do
- params = parse_multipart('mixed_files')
- assert_equal %w(files foo), params.keys.sort
- assert_equal 'bar', params['foo']
-
- # Rack doesn't handle multipart/mixed for us.
- files = params['files']
- files.force_encoding('ASCII-8BIT')
- assert_equal 19756, files.size
- end
-
test "does not create tempfile if no file has been selected" do
params = parse_multipart('none')
assert_equal %w(submit-name), params.keys.sort
View
13 actionpack/test/dispatch/request_test.rb
@@ -762,6 +762,19 @@ def url_for(options = {})
path = request.original_fullpath
assert_equal "/foo?bar", path
+
+ end
+ test "invalid utf8 sequences in the query params are replaced with a question mark" do
+ request = stub_request('QUERY_STRING' => "search=%C0%8Afoo%C0%8Abar%C0%8A", 'rack.input' => "")
@nanaya
nanaya added a note Aug 25, 2012

there should be another test with 'QUERY_STRING' => "%C0%8Asea%C0%8Arch%C0%8A=foobar" (and the resulting param should just be discarded). As of current state, it'll raise error 500 (server-side error) while it should be either ignored or returning error 400 (client-side error).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ assert_equal "??foo??bar??", request.parameters[:search]
+ assert request.parameters[:search].valid_encoding?, "request parameter has valid encoding"
+ end
+
+ test "invalid utf8 sequences in the post input are replaced with a question mark" do
+ request = stub_request('QUERY_STRING' => "", 'rack.input' => StringIO.new("search=foo%C0%8Abar"),
+ 'REQUEST_METHOD' => "POST", 'CONTENT_TYPE' => "application/x-www-form-urlencoded")
+ assert_equal "foo??bar", request.parameters[:search]
+ assert request.parameters[:search].valid_encoding?, "request parameter has valid encoding"
end
test "if_none_match_etags none" do
View
6 actionpack/test/dispatch/uploaded_file_test.rb
@@ -18,6 +18,12 @@ def test_filename_should_be_in_utf_8
assert_equal "UTF-8", uf.original_filename.encoding.to_s
end
+ def test_filename_with_invalid_utf_8_are_replaced_with_a_question_mark
+ uf = Http::UploadedFile.new(:filename => "foo\xC0\x8Abar", :tempfile => Object.new)
+ assert uf.original_filename.valid_encoding?, "Uploaded file has a valid encoding"
+ assert_equal "foo??bar", uf.original_filename
+ end
+
def test_content_type
uf = Http::UploadedFile.new(:type => 'foo', :tempfile => Object.new)
assert_equal 'foo', uf.content_type
View
BIN actionpack/test/fixtures/multipart/mixed_files
Binary file not shown.
Something went wrong with that request. Please try again.