Skip to content

Commit

Permalink
Add ActiveSupport::Multibyte::Chars#limit.
Browse files Browse the repository at this point in the history
The limit method limits the number of bytes in a string. Useful when the
storage space of the string is limited, for instance in a database column
definition.

Sharpen up the implementation of translate offset.

[#3192 state:committed]
  • Loading branch information
Manfred committed Nov 4, 2009
1 parent a3d5274 commit 935bd0f
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 14 deletions.
29 changes: 15 additions & 14 deletions activesupport/lib/active_support/multibyte/chars.rb
Expand Up @@ -363,6 +363,16 @@ def slice!(*args)
slice slice
end end


# Limit the byte size of the string to a number of bytes without breaking characters. Usable
# when the storage for a string is limited for some reason.
#
# The byte limit is converted to a character count with translate_offset and the string is
# sliced from its start up to, but not including, that character index.
#
# Example (each character below takes three bytes in UTF-8, so seven bytes hold two whole
# characters):
# s = 'こんにちは'
# s.mb_chars.limit(7) #=> "こん"
def limit(limit)
slice(0...translate_offset(limit))
end

# Returns the codepoint of the first character in the string. # Returns the codepoint of the first character in the string.
# #
# Example: # Example:
Expand Down Expand Up @@ -651,24 +661,15 @@ def tidy_bytes(string)
end end


protected protected

def translate_offset(byte_offset) #:nodoc: def translate_offset(byte_offset) #:nodoc:
return nil if byte_offset.nil? return nil if byte_offset.nil?
return 0 if @wrapped_string == '' return 0 if @wrapped_string == ''
chunk = @wrapped_string[0..byte_offset]
begin begin
begin @wrapped_string[0...byte_offset].unpack('U*').length
chunk.unpack('U*').length - 1 rescue ArgumentError => e
rescue ArgumentError => e byte_offset -= 1
chunk = @wrapped_string[0..(byte_offset+=1)] retry
# Stop retrying at the end of the string
raise e unless byte_offset < chunk.length
# We damaged a character, retry
retry
end
# Catch the ArgumentError so we can throw our own
rescue ArgumentError
raise EncodingError, 'malformed UTF-8 character'
end end
end end


Expand Down
59 changes: 59 additions & 0 deletions activesupport/test/multibyte_chars_test.rb
Expand Up @@ -169,6 +169,7 @@ def test_string_methods_are_chainable
assert chars('').strip.kind_of?(ActiveSupport::Multibyte.proxy_class) assert chars('').strip.kind_of?(ActiveSupport::Multibyte.proxy_class)
assert chars('').reverse.kind_of?(ActiveSupport::Multibyte.proxy_class) assert chars('').reverse.kind_of?(ActiveSupport::Multibyte.proxy_class)
assert chars(' ').slice(0).kind_of?(ActiveSupport::Multibyte.proxy_class) assert chars(' ').slice(0).kind_of?(ActiveSupport::Multibyte.proxy_class)
assert chars('').limit(0).kind_of?(ActiveSupport::Multibyte.proxy_class)
assert chars('').upcase.kind_of?(ActiveSupport::Multibyte.proxy_class) assert chars('').upcase.kind_of?(ActiveSupport::Multibyte.proxy_class)
assert chars('').downcase.kind_of?(ActiveSupport::Multibyte.proxy_class) assert chars('').downcase.kind_of?(ActiveSupport::Multibyte.proxy_class)
assert chars('').capitalize.kind_of?(ActiveSupport::Multibyte.proxy_class) assert chars('').capitalize.kind_of?(ActiveSupport::Multibyte.proxy_class)
Expand Down Expand Up @@ -196,7 +197,9 @@ def test_sortability
def test_should_return_character_offset_for_regexp_matches def test_should_return_character_offset_for_regexp_matches
assert_nil(@chars =~ /wrong/u) assert_nil(@chars =~ /wrong/u)
assert_equal 0, (@chars =~ /こ/u) assert_equal 0, (@chars =~ /こ/u)
assert_equal 0, (@chars =~ /こに/u)
assert_equal 1, (@chars =~ /に/u) assert_equal 1, (@chars =~ /に/u)
assert_equal 2, (@chars =~ /ち/u)
assert_equal 3, (@chars =~ /わ/u) assert_equal 3, (@chars =~ /わ/u)
end end


Expand Down Expand Up @@ -493,6 +496,44 @@ def test_capitalize_should_be_unicode_aware
end end
end end


# A blank string has nothing to cut, so every byte limit must hand back a blank string.
def test_limit_should_not_break_on_blank_strings
  blank = ''.mb_chars

  [0, 1].each do |byte_limit|
    assert_equal '', blank.limit(byte_limit)
  end
end

# Checks that limit cuts a multibyte string on character boundaries for a range of byte limits.
def test_limit_should_work_on_a_multibyte_string
  chars = UNICODE_STRING.mb_chars
  # limit takes a number of bytes. String#length counts characters on Ruby 1.9 and later, so
  # use the byte size where it is available and only fall back to length on older Rubies.
  bytesize = UNICODE_STRING.respond_to?(:bytesize) ? UNICODE_STRING.bytesize : UNICODE_STRING.length

  assert_equal UNICODE_STRING, chars.limit(bytesize)
  assert_equal '', chars.limit(0)
  assert_equal '', chars.limit(1)
  assert_equal 'こ', chars.limit(3)
  assert_equal 'こに', chars.limit(6)
  assert_equal 'こに', chars.limit(8)
  assert_equal 'こにち', chars.limit(9)
  assert_equal 'こにちわ', chars.limit(50)
end

# Checks limit against an ASCII string, pairing each byte limit with the prefix it must return.
def test_limit_should_work_on_an_ascii_string
  ascii = ASCII_STRING.mb_chars

  [
    [ASCII_STRING.length, ASCII_STRING],
    [0, ''],
    [1, 'o'],
    [2, 'oh'],
    [4, 'ohay'],
    [50, 'ohayo']
  ].each do |byte_limit, expected|
    assert_equal expected, ascii.limit(byte_limit)
  end
end

# For every byte limit from 1 up to the full byte size of the string, the limited string must
# not occupy more bytes than the limit.
def test_limit_should_keep_under_the_specified_byte_limit
  chars = UNICODE_STRING.mb_chars
  # Measure bytes, not characters: String#length counts characters on Ruby 1.9 and later, which
  # would make this check pass without ever verifying the byte limit.
  byte_size = lambda { |string| string.respond_to?(:bytesize) ? string.bytesize : string.length }

  (1..byte_size.call(UNICODE_STRING)).each do |limit|
    assert byte_size.call(chars.limit(limit).to_s) <= limit
  end
end

def test_composition_exclusion_is_set_up_properly def test_composition_exclusion_is_set_up_properly
# Normalization of DEVANAGARI LETTER QA breaks when composition exclusion isn't used correctly # Normalization of DEVANAGARI LETTER QA breaks when composition exclusion isn't used correctly
qa = [0x915, 0x93c].pack('U*') qa = [0x915, 0x93c].pack('U*')
Expand Down Expand Up @@ -603,3 +644,21 @@ def string_from_classes(classes)
end.pack('U*') end.pack('U*')
end end
end end

# Tests Chars internals that are only reachable through send.
class MultibyteInternalsTest < ActiveSupport::TestCase
  include MultibyteTestHelpers

  # NOTE(review): the test name reads "character offset to a byte offset", but the pairs below
  # are passed to translate_offset as byte offsets and compared against character offsets.
  test "Chars translates a character offset to a byte offset" do
    proxy = "Puisque c'était son erreur, il m'a aidé".mb_chars
    # Each pair is [byte offset, expected character offset].
    expectations = [[0, 0], [3, 3], [12, 11], [14, 13], [41, 39]]

    expectations.each do |pair|
      byte_offset, character_offset = pair
      actual = proxy.send(:translate_offset, byte_offset)
      message = "Expected byte offset #{byte_offset} to translate to #{character_offset}"
      assert_equal character_offset, actual, message
    end
  end
end

0 comments on commit 935bd0f

Please sign in to comment.