Permalink
Browse files

Add verify and clean methods to ActiveSupport::Multibyte.

When accepting character input from outside of your application you can't
blindly trust that all strings are properly encoded. With these methods
you can check incoming strings and clean them up if necessary.

Signed-off-by: Michael Koziarski <michael@koziarski.com>

Conflicts:

	activesupport/lib/active_support/multibyte.rb
  • Loading branch information...
NZKoz committed Aug 31, 2009
1 parent 5e6dab8 commit 9a73630d935e360f3dc896e50dd673afb97cf3b5
@@ -29,7 +29,35 @@ module Multibyte
#
# Example:
# ActiveSupport::Multibyte.proxy_class = CharsForUTF32
mattr_accessor :proxy_class
self.proxy_class = ActiveSupport::Multibyte::Chars
# Stores +klass+ as the proxy class; it is kept as-is in @proxy_class.
def self.proxy_class=(klass)
  @proxy_class = klass
end
# Returns the current proxy class
def self.proxy_class
  # Nothing assigned yet: fall back to the Chars proxy and remember it for later calls.
  @proxy_class = ActiveSupport::Multibyte::Chars unless @proxy_class
  @proxy_class
end
# Regular expressions that describe valid byte sequences for a character, keyed by
# encoding name. Every pattern is anchored with \A and \z, so it has to be matched
# against one character at a time, not against a whole string.
VALID_CHARACTER = {
  # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
  #
  # The \xed lead byte is listed separately from the other three-byte lead bytes so
  # that encoded UTF-16 surrogates (\xed\xa0\x80 through \xed\xbf\xbf) are rejected;
  # they are not valid UTF-8.
  'UTF-8' => /\A(?:
    [\x00-\x7f]                                             |
    [\xc2-\xdf]         [\x80-\xbf]                         |
    \xe0                [\xa0-\xbf] [\x80-\xbf]             |
    [\xe1-\xec\xee\xef] [\x80-\xbf] [\x80-\xbf]             |
    \xed                [\x80-\x9f] [\x80-\xbf]             |
    \xf0                [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
    [\xf1-\xf3]         [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
    \xf4                [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn,
  # Quick check for valid Shift-JIS characters, disregards the odd-even pairing
  #
  # The bracket expressions are written without spaces: the x flag does not strip
  # whitespace inside a character class, so a space there would be accepted as a byte.
  'Shift_JIS' => /\A(?:
    [\x00-\x7e\xa1-\xdf] |
    [\x81-\x9f\xe0-\xef] [\x40-\x7e\x80-\xfc])\z /xn
}
end
end
require 'active_support/multibyte/chars'
require 'active_support/multibyte/exceptions'
require 'active_support/multibyte/unicode_database'
require 'active_support/multibyte/utils'
@@ -74,16 +74,7 @@ def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
# Matches a run of one or more UNICODE_LEADERS_AND_TRAILERS codepoints anchored at the
# end of the string (\Z); rstrip removes whatever this matches.
UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
# Matches a run of one or more UNICODE_LEADERS_AND_TRAILERS codepoints anchored at the
# start of the string (\A); lstrip removes whatever this matches.
UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
# Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
# NOTE(review): this view shows two assignments to UTF8_PAT, the inline literal below
# and the VALID_CHARACTER lookup after it; presumably the literal is the version the
# commit removes - confirm against the real file.
UTF8_PAT = /\A(?:
[\x00-\x7f] |
[\xc2-\xdf] [\x80-\xbf] |
\xe0 [\xa0-\xbf] [\x80-\xbf] |
[\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
\xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
[\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
\xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
)*\z/xn
# NOTE(review): the literal above ends in `)*\z` and so matches zero or more characters,
# while VALID_CHARACTER['UTF-8'] ends in `)\z` and matches exactly one. Callers that
# match a whole string against UTF8_PAT would behave differently - verify call sites.
UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']
# Exposes the wrapped string through a reader; to_s is an alias for that reader.
attr_reader :wrapped_string
alias to_s wrapped_string
@@ -308,31 +299,31 @@ def center(integer, padstr=' ')
def rstrip
  # Drop whatever UNICODE_TRAILERS_PAT matches, then hand the result to chars.
  right_trimmed = @wrapped_string.gsub(UNICODE_TRAILERS_PAT, '')
  chars(right_trimmed)
end
# Strips entire range of Unicode whitespace from the left of the string.
def lstrip
  # Drop whatever UNICODE_LEADERS_PAT matches, then hand the result to chars.
  left_trimmed = @wrapped_string.gsub(UNICODE_LEADERS_PAT, '')
  chars(left_trimmed)
end
# Strips entire range of Unicode whitespace from the right and left of the string.
def strip
  # Right side first, then the left side of that result.
  right_trimmed = rstrip
  right_trimmed.lstrip
end
# Returns the number of codepoints in the string
def size
  # Count the entries u_unpack returns for the wrapped string.
  codepoints = self.class.u_unpack(@wrapped_string)
  codepoints.size
end
alias_method :length, :size
# Reverses all characters in the string.
#
# Example:
# 'Café'.mb_chars.reverse.to_s #=> 'éfaC'
def reverse
  # Reverse the list returned by u_unpack, pack it back with 'U*' and wrap it via chars.
  codepoints = self.class.u_unpack(@wrapped_string)
  chars(codepoints.reverse.pack('U*'))
end
# Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
# character.
#
@@ -647,7 +638,7 @@ def tidy_bytes(string)
string.split(//u).map do |c|
c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)
if !UTF8_PAT.match(c)
if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
n = c.unpack('C')[0]
n < 128 ? n.chr :
n < 160 ? [UCD.cp1252[n] || n].pack('U') :
@@ -0,0 +1,61 @@
# encoding: utf-8
module ActiveSupport #:nodoc:
  module Multibyte #:nodoc:
    if Kernel.const_defined?(:Encoding)
      # Returns a regular expression that matches valid characters in the current encoding.
      #
      # The lookup key is the name of Encoding.default_internal; nil is returned when
      # VALID_CHARACTER has no entry under that name.
      def self.valid_character
        VALID_CHARACTER[Encoding.default_internal.to_s]
      end
    else
      # Returns a regular expression that matches valid characters in the current encoding.
      #
      # Without an Encoding constant the current encoding is taken from $KCODE. Only
      # 'UTF8' and 'SJIS' are mapped; any other value yields nil.
      def self.valid_character
        case $KCODE
        when 'UTF8'
          VALID_CHARACTER['UTF-8']
        when 'SJIS'
          VALID_CHARACTER['Shift_JIS']
        end
      end
    end

    if 'string'.respond_to?(:valid_encoding?)
      # Verifies the encoding of a string. Returns true when it is valid, false otherwise.
      def self.verify(string)
        string.valid_encoding?
      end
    else
      # Verifies the encoding of a string. Returns true when it is valid, false otherwise.
      #
      # Fallback for strings without valid_encoding?: every character of the string has
      # to match the pattern for the current encoding. When there is no pattern for the
      # current encoding the string is considered valid.
      def self.verify(string)
        expression = valid_character
        return true unless expression

        # Look the pattern up once and reuse it for every character.
        string.split(//).all? { |c| expression.match(c) }
      end
    end

    # Verifies the encoding of the string and raises an exception when it's not valid.
    #
    # Raises EncodingError (resolved from within this module) for an invalid string and
    # returns nil otherwise.
    def self.verify!(string)
      raise EncodingError, "Found characters with invalid encoding" unless verify(string)
    end

    if 'string'.respond_to?(:force_encoding)
      # Removes all invalid characters from the string.
      #
      # Note: this method is a no-op in Ruby 1.9; the string is returned untouched.
      def self.clean(string)
        string
      end
    else
      # Removes all invalid characters from the string.
      #
      # Keeps only the characters that match the pattern for the current encoding and
      # joins them into a new string. When there is no pattern for the current encoding
      # the original string is returned as-is.
      def self.clean(string)
        expression = valid_character
        return string unless expression

        string.split(//).select { |c| expression.match(c) }.join
      end
    end
  end
end
@@ -0,0 +1,141 @@
# encoding: utf-8
require 'abstract_unit'
require 'multibyte_test_helpers'
# Tests for ActiveSupport::Multibyte.valid_character, verify, verify! and clean.
#
# Each test runs inside with_encoding, which selects the encoding that the methods
# under test consult (Encoding.default_internal when String responds to encoding,
# with_kcode otherwise).
class MultibyteUtilsTest < ActiveSupport::TestCase
  include MultibyteTestHelpers

  test "valid_character returns an expression for the current encoding" do
    with_encoding('None') do
      assert_nil ActiveSupport::Multibyte.valid_character
    end

    with_encoding('UTF8') do
      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character
    end

    with_encoding('SJIS') do
      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character
    end
  end

  test "verify verifies ASCII strings are properly encoded" do
    with_encoding('None') do
      examples.each do |example|
        assert ActiveSupport::Multibyte.verify(example)
      end
    end
  end

  test "verify verifies UTF-8 strings are properly encoded" do
    with_encoding('UTF8') do
      assert ActiveSupport::Multibyte.verify(example('valid UTF-8'))
      assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8'))
    end
  end

  test "verify verifies Shift-JIS strings are properly encoded" do
    with_encoding('SJIS') do
      assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS'))
      assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS'))
    end
  end

  test "verify! raises an exception when it finds an invalid character" do
    with_encoding('UTF8') do
      assert_raises(ActiveSupport::Multibyte::EncodingError) do
        ActiveSupport::Multibyte.verify!(example('invalid UTF-8'))
      end
    end
  end

  test "verify! doesn't raise an exception when the encoding is valid" do
    with_encoding('UTF8') do
      assert_nothing_raised do
        ActiveSupport::Multibyte.verify!(example('valid UTF-8'))
      end
    end
  end

  if RUBY_VERSION < '1.9'
    test "clean leaves ASCII strings intact" do
      with_encoding('None') do
        [
          'word', "\270\236\010\210\245"
        ].each do |string|
          assert_equal string, ActiveSupport::Multibyte.clean(string)
        end
      end
    end

    test "clean cleans invalid characters from UTF-8 encoded strings" do
      with_encoding('UTF8') do
        cleaned_utf8 = [8].pack('C*')
        assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8'))
        assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8'))
      end
    end

    test "clean cleans invalid characters from Shift-JIS encoded strings" do
      with_encoding('SJIS') do
        cleaned_sjis = [184, 0, 136, 165].pack('C*')
        assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS'))
        assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
      end
    end
  else
    test "clean is a no-op" do
      with_encoding('UTF8') do
        assert_equal example('invalid Shift-JIS'), ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
      end
    end
  end

  private

  # Raw byte fixtures, built with pack('C*') and keyed by a description of their content.
  STRINGS = {
    'valid ASCII' => [65, 83, 67, 73, 73].pack('C*'),
    'invalid ASCII' => [128].pack('C*'),
    'valid UTF-8' => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'),
    'invalid UTF-8' => [184, 158, 8, 136, 165].pack('C*'),
    'valid Shift-JIS' => [131, 122, 129, 91, 131, 128].pack('C*'),
    'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*')
  }

  if Kernel.const_defined?(:Encoding)
    # Returns a copy of the fixture tagged with the current internal encoding. The copy
    # keeps force_encoding from re-tagging the shared string stored in STRINGS.
    def example(key)
      STRINGS[key].dup.force_encoding(Encoding.default_internal)
    end

    # Returns copies of all fixtures tagged with the current internal encoding.
    def examples
      STRINGS.values.map { |s| s.dup.force_encoding(Encoding.default_internal) }
    end
  else
    # Returns the fixture stored under +key+.
    def example(key)
      STRINGS[key]
    end

    # Returns all fixtures.
    def examples
      STRINGS.values
    end
  end

  if 'string'.respond_to?(:encoding)
    # Runs the block with Encoding.default_internal set according to +enc+: 'UTF8' and
    # 'SJIS' select the matching encoding, anything else selects BINARY.
    #
    # The previous value is restored in an ensure clause so that a failing assertion
    # inside the block cannot leak the temporary encoding into later tests.
    def with_encoding(enc)
      before = Encoding.default_internal
      Encoding.default_internal =
        case enc
        when 'UTF8' then Encoding::UTF_8
        when 'SJIS' then Encoding::Shift_JIS
        else Encoding::BINARY
        end
      yield
    ensure
      Encoding.default_internal = before
    end
  else
    alias with_encoding with_kcode
  end
end

1 comment on commit 9a73630

@toothrot

This comment has been minimized.

Show comment
Hide comment
@toothrot

toothrot May 6, 2015

best commit!

toothrot commented on 9a73630 May 6, 2015

best commit!

Please sign in to comment.