Skip to content
This repository
Browse code

Make tidy_bytes work on 1.9 and improve its performance. [#4350 state…

…:resolved]

Signed-off-by: Jeremy Kemper <jeremy@bitsweat.net>
  • Loading branch information...
commit e416f1d0ab71e3e720e147e5d0e7f6e8b36516a5 1 parent ad22017
Norman Clarke norman authored jeremy committed
5 activesupport/CHANGELOG
... ... @@ -1,3 +1,8 @@
  1 +*Rails 3.0.0 [beta 3] (pending)*
  2 +
  3 +* Speed up and add Ruby 1.9 support for ActiveSupport::Multibyte::Chars#tidy_bytes. #4350 [Norman Clarke]
  4 +
  5 +
1 6 *Rails 3.0.0 [beta 2] (April 1st, 2010)*
2 7
3 8 * Reduced load time by deferring configuration of classes using
85 activesupport/lib/active_support/multibyte/chars.rb
@@ -19,7 +19,7 @@ module Multibyte #:nodoc:
19 19 # bad.explicit_checking_method "T".mb_chars.downcase.to_s
20 20 #
21 21 # The default Chars implementation assumes that the encoding of the string is UTF-8, if you want to handle different
22   - # encodings you can write your own multibyte string handler and configure it through
  22 + # encodings you can write your own multibyte string handler and configure it through
23 23 # ActiveSupport::Multibyte.proxy_class.
24 24 #
25 25 # class CharsForUTF32
@@ -458,8 +458,10 @@ def g_length
458 458 end
459 459
460 460 # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
461   - def tidy_bytes
462   - chars(self.class.tidy_bytes(@wrapped_string))
  461 + #
  462 + # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
  463 + def tidy_bytes(force = false)
  464 + chars(self.class.tidy_bytes(@wrapped_string, force))
463 465 end
464 466
465 467 %w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method|
@@ -528,7 +530,7 @@ def g_unpack(string)
528 530 unpacked << codepoints[marker..pos-1]
529 531 marker = pos
530 532 end
531   - end
  533 + end
532 534 unpacked
533 535 end
534 536
@@ -644,33 +646,80 @@ def compose_codepoints(codepoints)
644 646 codepoints
645 647 end
646 648
  649 + def tidy_byte(byte)
  650 + if byte < 160
  651 + [UCD.cp1252[byte] || byte].pack("U").unpack("C*")
  652 + elsif byte < 192
  653 + [194, byte]
  654 + else
  655 + [195, byte - 64]
  656 + end
  657 + end
  658 + private :tidy_byte
  659 +
647 660 # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
648   - def tidy_bytes(string)
649   - string.split(//u).map do |c|
650   - c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)
651   -
652   - if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
653   - n = c.unpack('C')[0]
654   - n < 128 ? n.chr :
655   - n < 160 ? [UCD.cp1252[n] || n].pack('U') :
656   - n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
  661 + #
  662 + # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP-1252 or ISO-8859-1.
  663 + def tidy_bytes(string, force = false)
  664 + if force
  665 + return string.unpack("C*").map do |b|
  666 + tidy_byte(b)
  667 + end.flatten.compact.pack("C*").unpack("U*").pack("U*")
  668 + end
  669 +
  670 + bytes = string.unpack("C*")
  671 + conts_expected = 0
  672 + last_lead = 0
  673 +
  674 + bytes.each_index do |i|
  675 +
  676 + byte = bytes[i]
  677 + is_ascii = byte < 128
  678 + is_cont = byte > 127 && byte < 192
  679 + is_lead = byte > 191 && byte < 245
  680 + is_unused = byte > 240
  681 + is_restricted = byte > 244
  682 +
  683 + # Impossible or highly unlikely byte? Clean it.
  684 + if is_unused || is_restricted
  685 + bytes[i] = tidy_byte(byte)
  686 + elsif is_cont
  687 + # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
  688 + conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
657 689 else
658   - c
  690 + if conts_expected > 0
  691 + # Expected continuation, but got ASCII or leading? Clean backwards up to
  692 + # the leading byte.
  693 + (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
  694 + conts_expected = 0
  695 + end
  696 + if is_lead
  697 + # Final byte is leading? Clean it.
  698 + if i == bytes.length - 1
  699 + bytes[i] = tidy_byte(bytes.last)
  700 + else
  701 + # Valid leading byte? Expect continuations determined by position of
  702 + # first zero bit, with max of 3.
  703 + conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
  704 + last_lead = i
  705 + end
  706 + end
659 707 end
660   - end.join
  708 + end
  709 + bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
661 710 end
662 711 end
663 712
664 713 protected
665   -
  714 +
666 715 def translate_offset(byte_offset) #:nodoc:
667 716 return nil if byte_offset.nil?
668 717 return 0 if @wrapped_string == ''
669   -
  718 +
670 719 if @wrapped_string.respond_to?(:force_encoding)
671 720 @wrapped_string = @wrapped_string.dup.force_encoding(Encoding::ASCII_8BIT)
672 721 end
673   -
  722 +
674 723 begin
675 724 @wrapped_string[0...byte_offset].unpack('U*').length
676 725 rescue ArgumentError => e
73 activesupport/test/multibyte_chars_test.rb
@@ -107,7 +107,7 @@ def setup
107 107 # Ruby 1.9 only supports basic whitespace
108 108 @whitespace = "\n\t ".force_encoding(Encoding::UTF_8)
109 109 end
110   -
  110 +
111 111 @byte_order_mark = [65279].pack('U')
112 112 end
113 113
@@ -468,14 +468,6 @@ def test_acts_like_string
468 468 class MultibyteCharsExtrasTest < Test::Unit::TestCase
469 469 include MultibyteTestHelpers
470 470
471   - if RUBY_VERSION >= '1.9'
472   - def test_tidy_bytes_is_broken_on_1_9_0
473   - assert_raise(ArgumentError) do
474   - assert_equal_codepoints [0xfffd].pack('U'), chars("\xef\xbf\xbd").tidy_bytes
475   - end
476   - end
477   - end
478   -
479 471 def test_upcase_should_be_unicode_aware
480 472 assert_equal "АБВГД\0F", chars("аБвгд\0f").upcase
481 473 assert_equal 'こにちわ', chars('こにちわ').upcase
@@ -504,7 +496,7 @@ def test_limit_should_not_break_on_blank_strings
504 496 def test_limit_should_work_on_a_multibyte_string
505 497 example = chars(UNICODE_STRING)
506 498 bytesize = UNICODE_STRING.respond_to?(:bytesize) ? UNICODE_STRING.bytesize : UNICODE_STRING.size
507   -
  499 +
508 500 assert_equal UNICODE_STRING, example.limit(bytesize)
509 501 assert_equal '', example.limit(0)
510 502 assert_equal '', example.limit(1)
@@ -531,7 +523,7 @@ def test_limit_should_keep_under_the_specified_byte_limit
531 523 assert example.limit(limit).to_s.length <= limit
532 524 end
533 525 end
534   -
  526 +
535 527 def test_composition_exclusion_is_set_up_properly
536 528 # Normalization of DEVANAGARI LETTER QA breaks when composition exclusion isn't used correctly
537 529 qa = [0x915, 0x93c].pack('U*')
@@ -607,28 +599,57 @@ def test_should_compute_grapheme_length
607 599 end
608 600
609 601 def test_tidy_bytes_should_tidy_bytes
  602 +
  603 + single_byte_cases = {
  604 + "\x21" => "!", # Valid ASCII byte, low
  605 + "\x41" => "A", # Valid ASCII byte, mid
  606 + "\x7E" => "~", # Valid ASCII byte, high
  607 + "\x80" => "€", # Continuation byte, low (cp1252)
  608 + "\x94" => "”", # Continuation byte, mid (cp1252)
  609 + "\x9F" => "Ÿ", # Continuation byte, high (cp1252)
  610 + "\xC0" => "À", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
  611 + "\xC1" => "Á", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
  612 + "\xC2" => "Â", # Start of 2-byte sequence, low
  613 + "\xC8" => "È", # Start of 2-byte sequence, mid
  614 + "\xDF" => "ß", # Start of 2-byte sequence, high
  615 + "\xE0" => "à", # Start of 3-byte sequence, low
  616 + "\xE8" => "è", # Start of 3-byte sequence, mid
  617 + "\xEF" => "ï", # Start of 3-byte sequence, high
  618 + "\xF0" => "ð", # Start of 4-byte sequence
  619 + "\xF1" => "ñ", # Unused byte
  620 + "\xFF" => "ÿ", # Restricted byte
  621 + "\x00" => "\x00" # null char
  622 + }
  623 +
  624 + single_byte_cases.each do |bad, good|
  625 + assert_equal good, chars(bad).tidy_bytes.to_s
  626 + assert_equal "#{good}#{good}", chars("#{bad}#{bad}").tidy_bytes
  627 + assert_equal "#{good}#{good}#{good}", chars("#{bad}#{bad}#{bad}").tidy_bytes
  628 + assert_equal "#{good}a", chars("#{bad}a").tidy_bytes
  629 + assert_equal "#{good}á", chars("#{bad}á").tidy_bytes
  630 + assert_equal "a#{good}a", chars("a#{bad}a").tidy_bytes
  631 + assert_equal "á#{good}á", chars("á#{bad}á").tidy_bytes
  632 + assert_equal "a#{good}", chars("a#{bad}").tidy_bytes
  633 + assert_equal "á#{good}", chars("á#{bad}").tidy_bytes
  634 + end
  635 +
610 636 byte_string = "\270\236\010\210\245"
611 637 tidy_string = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*')
612   - ascii_padding = 'aa'
613   - utf8_padding = 'éé'
614   -
615 638 assert_equal_codepoints tidy_string, chars(byte_string).tidy_bytes
616   -
617   - assert_equal_codepoints ascii_padding.dup.insert(1, tidy_string),
618   - chars(ascii_padding.dup.insert(1, byte_string)).tidy_bytes
619   - assert_equal_codepoints utf8_padding.dup.insert(2, tidy_string),
620   - chars(utf8_padding.dup.insert(2, byte_string)).tidy_bytes
621 639 assert_nothing_raised { chars(byte_string).tidy_bytes.to_s.unpack('U*') }
622 640
623   - assert_equal_codepoints "\xC3\xA7", chars("\xE7").tidy_bytes # iso_8859_1: small c cedilla
624   - assert_equal_codepoints "\xE2\x80\x9C", chars("\x93").tidy_bytes # win_1252: left smart quote
625   - assert_equal_codepoints "\xE2\x82\xAC", chars("\x80").tidy_bytes # win_1252: euro
626   - assert_equal_codepoints "\x00", chars("\x00").tidy_bytes # null char
627   - assert_equal_codepoints [0xfffd].pack('U'), chars("\xef\xbf\xbd").tidy_bytes # invalid char
628   - rescue ArgumentError => e
629   - raise e if RUBY_VERSION < '1.9'
  641 + # UTF-8 leading byte followed by too few continuation bytes
  642 + assert_equal_codepoints "\xc3\xb0\xc2\xa5\xc2\xa4\x21", chars("\xf0\xa5\xa4\x21").tidy_bytes
  643 + end
  644 +
  645 + def test_tidy_bytes_should_forcibly_tidy_bytes_if_specified
  646 + byte_string = "\xF0\xA5\xA4\xA4" # valid as both CP-1252 and UTF-8, but with different interpretations.
  647 + assert_not_equal "𥤤", chars(byte_string).tidy_bytes
  648 + # Forcible conversion to UTF-8
  649 + assert_equal "𥤤", chars(byte_string).tidy_bytes(true)
630 650 end
631 651
  652 +
632 653 private
633 654
634 655 def string_from_classes(classes)

0 comments on commit e416f1d

Please sign in to comment.
Something went wrong with that request. Please try again.