Skip to content
Browse files

Add methods for string verification and encoding cleanup code.

Signed-off-by: Michael Koziarski <michael@koziarski.com>
  • Loading branch information...
1 parent 2b15854 commit e3db21fe4f54539be7fc212167553665970a955f @Manfred Manfred committed with NZKoz Sep 1, 2009
View
18 activesupport/lib/active_support/multibyte.rb
@@ -3,7 +3,25 @@ module Multibyte #:nodoc:
DEFAULT_NORMALIZATION_FORM = :kc
NORMALIZATIONS_FORMS = [:c, :kc, :d, :kd]
UNICODE_VERSION = '5.0.0'
+
+ # Regular expressions that describe valid byte sequences for a character
+ VALID_CHARACTER = {
+ # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
+ 'UTF-8' => /\A(?:
+ [\x00-\x7f] |
+ [\xc2-\xdf] [\x80-\xbf] |
+ \xe0 [\xa0-\xbf] [\x80-\xbf] |
+ [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
+ \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
+ [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
+ \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn,
+ # Quick check for valid Shift-JIS characters, disregards the odd-even pairing
+ 'Shift_JIS' => /\A(?:
+ [\x00-\x7e \xa1-\xdf] |
+ [\x81-\x9f \xe0-\xef] [\x40-\x7e \x80-\x9e \x9f-\xfc])\z /xn
+ }
end
end
require 'active_support/multibyte/chars'
+require 'active_support/multibyte/utils'
View
13 activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
@@ -100,16 +100,7 @@ class UTF8Handler
# between little and big endian. This is not an issue in utf-8, so it must be ignored.
UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM
- # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
- UTF8_PAT = /\A(?:
- [\x00-\x7f] |
- [\xc2-\xdf] [\x80-\xbf] |
- \xe0 [\xa0-\xbf] [\x80-\xbf] |
- [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
- \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
- [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
- \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
- )*\z/xn
+ UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']
# Returns a regular expression pattern that matches the passed Unicode codepoints
def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
@@ -357,7 +348,7 @@ def g_length(str)
# Replaces all the non-utf-8 bytes by their iso-8859-1 or cp1252 equivalent resulting in a valid utf-8 string
def tidy_bytes(str)
str.split(//u).map do |c|
- if !UTF8_PAT.match(c)
+ if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
n = c.unpack('C')[0]
n < 128 ? n.chr :
n < 160 ? [UCD.cp1252[n] || n].pack('U') :
View
39 activesupport/lib/active_support/multibyte/utils.rb
@@ -0,0 +1,39 @@
+module ActiveSupport #:nodoc:
+ module Multibyte #:nodoc:
+ # Returns a regular expression that matches valid characters in the current encoding
+ def self.valid_character
+ case $KCODE
+ when 'UTF8'
+ VALID_CHARACTER['UTF-8']
+ when 'SJIS'
+ VALID_CHARACTER['Shift_JIS']
+ end
+ end
+
+ # Verifies the encoding of a string
+ def self.verify(string)
+ if expression = valid_character
+ for c in string.split(//)
+ return false unless valid_character.match(c)
+ end
+ end
+ true
+ end
+
+ # Verifies the encoding of the string and raises an exception when it's not valid
+ def self.verify!(string)
+ raise ActiveSupport::Multibyte::Handlers::EncodingError.new("Found characters with invalid encoding") unless verify(string)
+ end
+
+ # Removes all invalid characters from the string
+ def self.clean(string)
+ if expression = valid_character
+ stripped = []; for c in string.split(//)
+ stripped << c if valid_character.match(c)
+ end; stripped.join
+ else
+ string
+ end
+ end
+ end
+end
View
106 activesupport/test/multibyte_utils_test.rb
@@ -0,0 +1,106 @@
+require 'abstract_unit'
+
+class MultibyteUtilsTest < Test::Unit::TestCase
+
+ def test_valid_character_returns_an_expression_for_the_current_encoding
+ with_kcode('None') do
+ assert_nil ActiveSupport::Multibyte.valid_character
+ end
+ with_kcode('UTF8') do
+ assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character
+ end
+ with_kcode('SJIS') do
+ assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character
+ end
+ end
+
+ def test_verify_verifies_ASCII_strings_are_properly_encoded
+ with_kcode('None') do
+ examples.each do |example|
+ assert ActiveSupport::Multibyte.verify(example)
+ end
+ end
+ end
+
+ def test_verify_verifies_UTF_8_strings_are_properly_encoded
+ with_kcode('UTF8') do
+ assert ActiveSupport::Multibyte.verify(example('valid UTF-8'))
+ assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8'))
+ end
+ end
+
+ def test_verify_verifies_Shift_JIS_strings_are_properly_encoded
+ with_kcode('SJIS') do
+ assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS'))
+ assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS'))
+ end
+ end
+
+ def test_verify_bang_raises_an_exception_when_it_finds_an_invalid_character
+ with_kcode('UTF8') do
+ assert_raises(ActiveSupport::Multibyte::Handlers::EncodingError) do
+ ActiveSupport::Multibyte.verify!(example('invalid UTF-8'))
+ end
+ end
+ end
+
+ def test_verify_bang_doesnt_raise_an_exception_when_the_encoding_is_valid
+ with_kcode('UTF8') do
+ assert_nothing_raised do
+ ActiveSupport::Multibyte.verify!(example('valid UTF-8'))
+ end
+ end
+ end
+
+ def test_clean_leaves_ASCII_strings_intact
+ with_kcode('None') do
+ [
+ 'word', "\270\236\010\210\245"
+ ].each do |string|
+ assert_equal string, ActiveSupport::Multibyte.clean(string)
+ end
+ end
+ end
+
+ def test_clean_cleans_invalid_characters_from_UTF_8_encoded_strings
+ with_kcode('UTF8') do
+ cleaned_utf8 = [8].pack('C*')
+ assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8'))
+ assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8'))
+ end
+ end
+
+ def test_clean_cleans_invalid_characters_from_Shift_JIS_encoded_strings
+ with_kcode('SJIS') do
+ cleaned_sjis = [184, 0, 136, 165].pack('C*')
+ assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS'))
+ assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
+ end
+ end
+
+ private
+
+ STRINGS = {
+ 'valid ASCII' => [65, 83, 67, 73, 73].pack('C*'),
+ 'invalid ASCII' => [128].pack('C*'),
+ 'valid UTF-8' => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'),
+ 'invalid UTF-8' => [184, 158, 8, 136, 165].pack('C*'),
+ 'valid Shift-JIS' => [131, 122, 129, 91, 131, 128].pack('C*'),
+ 'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*')
+ }
+
+ def example(key)
+ STRINGS[key]
+ end
+
+ def examples
+ STRINGS.values
+ end
+
+ def with_kcode(code)
+ before = $KCODE
+ $KCODE = code
+ yield
+ $KCODE = before
+ end
+end

0 comments on commit e3db21f

Please sign in to comment.
Something went wrong with that request. Please try again.