Skip to content
This repository
Browse code

Use multibyte proxy class on 1.9, refactor Unicode.

Makes String#mb_chars on Ruby 1.9 return an instance of ActiveSupport::Multibyte::Chars to work around 1.9's lack of Unicode case folding.

Refactors class methods from ActiveSupport::Multibyte::Chars into new Unicode module, adding other related functionality for consistency.

[#4594 state:resolved]

Signed-off-by: Jeremy Kemper <jeremy@bitsweat.net>
  • Loading branch information...
commit f3abc8ac36055afed9fcc902c33ee146e066d17a 1 parent ad4be3d
Norman Clarke authored May 10, 2010 jeremy committed May 21, 2010
4  activesupport/CHANGELOG
... ...
@@ -1,8 +1,10 @@
1 1
 *Rails 3.0.0 [beta 4/release candidate] (unreleased)*
2 2
 
  3
+* Ruby 1.9: support UTF-8 case folding.  #4595 [Norman Clarke]
  4
+
3 5
 * Renames Array#rand -> Array#random_element. [Santiago Pastorino, Rizwan Reza]
4 6
 
5  
-* 1.9 compat: Renames last_(month|year) to prev_(month|year) in Date and Time. [fxn]
  7
+* Ruby 1.9: Renames last_(month|year) to prev_(month|year) in Date and Time. [fxn]
6 8
 
7 9
 * Aliases Date#sunday to Date#end_of_week. [fxn]
8 10
 
205  activesupport/bin/generate_tables
@@ -11,126 +11,129 @@ require 'tmpdir'
11 11
 
12 12
 module ActiveSupport
13 13
   module Multibyte
14  
-    class UnicodeDatabase
15  
-      def load; end
16  
-    end
17  
-    
18  
-    class UnicodeDatabaseGenerator
19  
-      BASE_URI = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd/"
20  
-      SOURCES = {
21  
-        :codepoints => BASE_URI + 'UnicodeData.txt',
22  
-        :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
23  
-        :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
24  
-        :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
25  
-      }
26  
-
27  
-      def initialize
28  
-        @ucd = UnicodeDatabase.new
29  
-
30  
-        default = Codepoint.new
31  
-        default.combining_class = 0
32  
-        default.uppercase_mapping = 0
33  
-        default.lowercase_mapping = 0
34  
-        @ucd.codepoints = Hash.new(default)
35  
-      end
  14
+    module Unicode
36 15
 
37  
-      def parse_codepoints(line)
38  
-        codepoint = Codepoint.new
39  
-        raise "Could not parse input." unless line =~ /^
40  
-          ([0-9A-F]+);        # code
41  
-          ([^;]+);            # name
42  
-          ([A-Z]+);           # general category
43  
-          ([0-9]+);           # canonical combining class
44  
-          ([A-Z]+);           # bidi class
45  
-          (<([A-Z]*)>)?       # decomposition type
46  
-          ((\ ?[0-9A-F]+)*);  # decomposition mapping
47  
-          ([0-9]*);           # decimal digit
48  
-          ([0-9]*);           # digit
49  
-          ([^;]*);            # numeric
50  
-          ([YN]*);            # bidi mirrored
51  
-          ([^;]*);            # unicode 1.0 name
52  
-          ([^;]*);            # iso comment
53  
-          ([0-9A-F]*);        # simple uppercase mapping
54  
-          ([0-9A-F]*);        # simple lowercase mapping
55  
-          ([0-9A-F]*)$/ix     # simple titlecase mapping
56  
-        codepoint.code              = $1.hex
57  
-        #codepoint.name              = $2
58  
-        #codepoint.category          = $3
59  
-        codepoint.combining_class   = Integer($4)
60  
-        #codepoint.bidi_class        = $5
61  
-        codepoint.decomp_type       = $7
62  
-        codepoint.decomp_mapping    = ($8=='') ? nil : $8.split.collect { |element| element.hex }
63  
-        #codepoint.bidi_mirrored     = ($13=='Y') ? true : false
64  
-        codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
65  
-        codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
66  
-        #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
67  
-        @ucd.codepoints[codepoint.code] = codepoint
  16
+      class UnicodeDatabase
  17
+        def load; end
68 18
       end
69 19
 
70  
-      def parse_grapheme_break_property(line)
71  
-        if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
72  
-          type = $2.downcase.intern
73  
-          @ucd.boundary[type] ||= []
74  
-          if $1.include? '..'
75  
-            parts = $1.split '..'
76  
-            @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
77  
-          else
78  
-            @ucd.boundary[type] << $1.hex
  20
+      class DatabaseGenerator
  21
+        BASE_URI = "http://www.unicode.org/Public/#{UNICODE_VERSION}/ucd/"
  22
+        SOURCES = {
  23
+          :codepoints => BASE_URI + 'UnicodeData.txt',
  24
+          :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
  25
+          :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
  26
+          :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
  27
+        }
  28
+
  29
+        def initialize
  30
+          @ucd = Unicode::UnicodeDatabase.new
  31
+
  32
+          default = Codepoint.new
  33
+          default.combining_class = 0
  34
+          default.uppercase_mapping = 0
  35
+          default.lowercase_mapping = 0
  36
+          @ucd.codepoints = Hash.new(default)
  37
+        end
  38
+
  39
+        def parse_codepoints(line)
  40
+          codepoint = Codepoint.new
  41
+          raise "Could not parse input." unless line =~ /^
  42
+            ([0-9A-F]+);        # code
  43
+            ([^;]+);            # name
  44
+            ([A-Z]+);           # general category
  45
+            ([0-9]+);           # canonical combining class
  46
+            ([A-Z]+);           # bidi class
  47
+            (<([A-Z]*)>)?       # decomposition type
  48
+            ((\ ?[0-9A-F]+)*);  # decomposition mapping
  49
+            ([0-9]*);           # decimal digit
  50
+            ([0-9]*);           # digit
  51
+            ([^;]*);            # numeric
  52
+            ([YN]*);            # bidi mirrored
  53
+            ([^;]*);            # unicode 1.0 name
  54
+            ([^;]*);            # iso comment
  55
+            ([0-9A-F]*);        # simple uppercase mapping
  56
+            ([0-9A-F]*);        # simple lowercase mapping
  57
+            ([0-9A-F]*)$/ix     # simple titlecase mapping
  58
+          codepoint.code              = $1.hex
  59
+          #codepoint.name              = $2
  60
+          #codepoint.category          = $3
  61
+          codepoint.combining_class   = Integer($4)
  62
+          #codepoint.bidi_class        = $5
  63
+          codepoint.decomp_type       = $7
  64
+          codepoint.decomp_mapping    = ($8=='') ? nil : $8.split.collect { |element| element.hex }
  65
+          #codepoint.bidi_mirrored     = ($13=='Y') ? true : false
  66
+          codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
  67
+          codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
  68
+          #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
  69
+          @ucd.codepoints[codepoint.code] = codepoint
  70
+        end
  71
+
  72
+        def parse_grapheme_break_property(line)
  73
+          if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
  74
+            type = $2.downcase.intern
  75
+            @ucd.boundary[type] ||= []
  76
+            if $1.include? '..'
  77
+              parts = $1.split '..'
  78
+              @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
  79
+            else
  80
+              @ucd.boundary[type] << $1.hex
  81
+            end
79 82
           end
80 83
         end
81  
-      end
82 84
 
83  
-      def parse_composition_exclusion(line)
84  
-        if line =~ /^([0-9A-F]+)/i
85  
-          @ucd.composition_exclusion << $1.hex
  85
+        def parse_composition_exclusion(line)
  86
+          if line =~ /^([0-9A-F]+)/i
  87
+            @ucd.composition_exclusion << $1.hex
  88
+          end
86 89
         end
87  
-      end
88 90
 
89  
-      def parse_cp1252(line)
90  
-        if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
91  
-          @ucd.cp1252[$1.hex] = $2.hex
  91
+        def parse_cp1252(line)
  92
+          if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
  93
+            @ucd.cp1252[$1.hex] = $2.hex
  94
+          end
92 95
         end
93  
-      end
94 96
 
95  
-      def create_composition_map
96  
-        @ucd.codepoints.each do |_, cp|
97  
-          if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
98  
-            @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
99  
-            @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
  97
+        def create_composition_map
  98
+          @ucd.codepoints.each do |_, cp|
  99
+            if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
  100
+              @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
  101
+              @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
  102
+            end
100 103
           end
101 104
         end
102  
-      end
103 105
 
104  
-      def normalize_boundary_map
105  
-        @ucd.boundary.each do |k,v|
106  
-          if [:lf, :cr].include? k
107  
-            @ucd.boundary[k] = v[0]
  106
+        def normalize_boundary_map
  107
+          @ucd.boundary.each do |k,v|
  108
+            if [:lf, :cr].include? k
  109
+              @ucd.boundary[k] = v[0]
  110
+            end
108 111
           end
109 112
         end
110  
-      end
111 113
 
112  
-      def parse
113  
-        SOURCES.each do |type, url|
114  
-          filename =  File.join(Dir.tmpdir, "#{url.split('/').last}")
115  
-          unless File.exist?(filename)
116  
-            $stderr.puts "Downloading #{url.split('/').last}"
117  
-            File.open(filename, 'wb') do |target|
118  
-              open(url) do |source|
119  
-                source.each_line { |line| target.write line }
  114
+        def parse
  115
+          SOURCES.each do |type, url|
  116
+            filename =  File.join(Dir.tmpdir, "#{url.split('/').last}")
  117
+            unless File.exist?(filename)
  118
+              $stderr.puts "Downloading #{url.split('/').last}"
  119
+              File.open(filename, 'wb') do |target|
  120
+                open(url) do |source|
  121
+                  source.each_line { |line| target.write line }
  122
+                end
120 123
               end
121 124
             end
  125
+            File.open(filename) do |file|
  126
+              file.each_line { |line| send "parse_#{type}".intern, line }
  127
+            end
122 128
           end
123  
-          File.open(filename) do |file|
124  
-            file.each_line { |line| send "parse_#{type}".intern, line }
125  
-          end
  129
+          create_composition_map
  130
+          normalize_boundary_map
126 131
         end
127  
-        create_composition_map
128  
-        normalize_boundary_map
129  
-      end
130 132
 
131  
-      def dump_to(filename)
132  
-        File.open(filename, 'wb') do |f|
133  
-          f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
  133
+        def dump_to(filename)
  134
+          File.open(filename, 'wb') do |f|
  135
+            f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
  136
+          end
134 137
         end
135 138
       end
136 139
     end
@@ -138,8 +141,8 @@ module ActiveSupport
138 141
 end
139 142
 
140 143
 if __FILE__ == $0
141  
-  filename = ActiveSupport::Multibyte::UnicodeDatabase.filename
142  
-  generator = ActiveSupport::Multibyte::UnicodeDatabaseGenerator.new
  144
+  filename = ActiveSupport::Multibyte::Unicode::UnicodeDatabase.filename
  145
+  generator = ActiveSupport::Multibyte::Unicode::DatabaseGenerator.new
143 146
   generator.parse
144 147
   print "Writing to: #{filename}"
145 148
   generator.dump_to filename
30  activesupport/lib/active_support/core_ext/string/multibyte.rb
@@ -2,7 +2,7 @@
2 2
 require 'active_support/multibyte'
3 3
 
4 4
 class String
5  
-  unless '1.9'.respond_to?(:force_encoding)
  5
+  if '1.9'.respond_to?(:force_encoding)
6 6
     # == Multibyte proxy
7 7
     #
8 8
     # +mb_chars+ is a multibyte safe proxy for string methods.
@@ -37,23 +37,13 @@ class String
37 37
     # For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars. For
38 38
     # information about how to change the default Multibyte behaviour see ActiveSupport::Multibyte.
39 39
     def mb_chars
40  
-      if ActiveSupport::Multibyte.proxy_class.wants?(self)
  40
+      if ActiveSupport::Multibyte.proxy_class.consumes?(self)
41 41
         ActiveSupport::Multibyte.proxy_class.new(self)
42 42
       else
43 43
         self
44 44
       end
45 45
     end
46  
-    
47  
-    # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
48  
-    # them), returns false otherwise.
49  
-    def is_utf8?
50  
-      ActiveSupport::Multibyte::Chars.consumes?(self)
51  
-    end
52  
-  else
53  
-    def mb_chars #:nodoc
54  
-      self
55  
-    end
56  
-    
  46
+
57 47
     def is_utf8? #:nodoc
58 48
       case encoding
59 49
       when Encoding::UTF_8
@@ -64,5 +54,19 @@ def is_utf8? #:nodoc
64 54
         false
65 55
       end
66 56
     end
  57
+  else
  58
+    def mb_chars
  59
+      if ActiveSupport::Multibyte.proxy_class.wants?(self)
  60
+        ActiveSupport::Multibyte.proxy_class.new(self)
  61
+      else
  62
+        self
  63
+      end
  64
+    end
  65
+
  66
+    # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
  67
+    # them), returns false otherwise.
  68
+    def is_utf8?
  69
+      ActiveSupport::Multibyte::Chars.consumes?(self)
  70
+    end
67 71
   end
68 72
 end
5  activesupport/lib/active_support/inflector/transliterate.rb
@@ -58,8 +58,9 @@ module Inflector
58 58
     #   transliterate("Jürgen")
59 59
     #   # => "Juergen"
60 60
     def transliterate(string, replacement = "?")
61  
-      I18n.transliterate(Multibyte::Chars.normalize(
62  
-        Multibyte::Chars.tidy_bytes(string), :c), :replacement => replacement)
  61
+      I18n.transliterate(ActiveSupport::Multibyte::Unicode.normalize(
  62
+        ActiveSupport::Multibyte::Unicode.tidy_bytes(string), :c),
  63
+          :replacement => replacement)
63 64
     end
64 65
 
65 66
     # Replaces special characters in a string so that it may be used as part of a 'pretty' URL.
20  activesupport/lib/active_support/multibyte.rb
... ...
@@ -1,30 +1,12 @@
1 1
 # encoding: utf-8
2  
-
3 2
 require 'active_support/core_ext/module/attribute_accessors'
4 3
 
5 4
 module ActiveSupport #:nodoc:
6 5
   module Multibyte
7 6
     autoload :EncodingError, 'active_support/multibyte/exceptions'
8 7
     autoload :Chars, 'active_support/multibyte/chars'
9  
-    autoload :UnicodeDatabase, 'active_support/multibyte/unicode_database'
10  
-    autoload :Codepoint, 'active_support/multibyte/unicode_database'
11  
-    autoload :UCD, 'active_support/multibyte/unicode_database'
  8
+    autoload :Unicode, 'active_support/multibyte/unicode'
12 9
     
13  
-    # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
14  
-    # information about normalization.
15  
-    NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
16  
-
17  
-    # The Unicode version that is supported by the implementation
18  
-    UNICODE_VERSION = '5.1.0'
19  
-
20  
-    # The default normalization used for operations that require normalization. It can be set to any of the
21  
-    # normalizations in NORMALIZATION_FORMS.
22  
-    #
23  
-    # Example:
24  
-    #   ActiveSupport::Multibyte.default_normalization_form = :c
25  
-    mattr_accessor :default_normalization_form
26  
-    self.default_normalization_form = :kc
27  
-
28 10
     # The proxy class returned when calling mb_chars. You can use this accessor to configure your own proxy
29 11
     # class so you can support other encodings. See the ActiveSupport::Multibyte::Chars implementation for
30 12
     # an example how to do this.
577  activesupport/lib/active_support/multibyte/chars.rb
@@ -34,52 +34,12 @@ module Multibyte #:nodoc:
34 34
     #
35 35
     #   ActiveSupport::Multibyte.proxy_class = CharsForUTF32
36 36
     class Chars
37  
-      # Hangul character boundaries and properties
38  
-      HANGUL_SBASE = 0xAC00
39  
-      HANGUL_LBASE = 0x1100
40  
-      HANGUL_VBASE = 0x1161
41  
-      HANGUL_TBASE = 0x11A7
42  
-      HANGUL_LCOUNT = 19
43  
-      HANGUL_VCOUNT = 21
44  
-      HANGUL_TCOUNT = 28
45  
-      HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT
46  
-      HANGUL_SCOUNT = 11172
47  
-      HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT
48  
-      HANGUL_JAMO_FIRST = 0x1100
49  
-      HANGUL_JAMO_LAST = 0x11FF
50  
-
51  
-      # All the unicode whitespace
52  
-      UNICODE_WHITESPACE = [
53  
-        (0x0009..0x000D).to_a, # White_Space # Cc   [5] <control-0009>..<control-000D>
54  
-        0x0020,                # White_Space # Zs       SPACE
55  
-        0x0085,                # White_Space # Cc       <control-0085>
56  
-        0x00A0,                # White_Space # Zs       NO-BREAK SPACE
57  
-        0x1680,                # White_Space # Zs       OGHAM SPACE MARK
58  
-        0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
59  
-        (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
60  
-        0x2028,                # White_Space # Zl       LINE SEPARATOR
61  
-        0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
62  
-        0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
63  
-        0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
64  
-        0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
65  
-      ].flatten.freeze
66  
-
67  
-      # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish
68  
-      # between little and big endian. This is not an issue in utf-8, so it must be ignored.
69  
-      UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM
70  
-
71  
-      # Returns a regular expression pattern that matches the passed Unicode codepoints
72  
-      def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
73  
-        array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|')
74  
-      end
75  
-      UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/u
76  
-      UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/u
77 37
 
78 38
       attr_reader :wrapped_string
79 39
       alias to_s wrapped_string
80 40
       alias to_str wrapped_string
81 41
 
82  
-      if '1.9'.respond_to?(:force_encoding)
  42
+      if RUBY_VERSION >= "1.9"
83 43
         # Creates a new Chars instance by wrapping _string_.
84 44
         def initialize(string)
85 45
           @wrapped_string = string
@@ -113,12 +73,6 @@ def acts_like_string?
113 73
         true
114 74
       end
115 75
 
116  
-      # Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns
117  
-      # +false+ otherwise.
118  
-      def self.wants?(string)
119  
-        $KCODE == 'UTF8' && consumes?(string)
120  
-      end
121  
-
122 76
       # Returns +true+ when the proxy class can handle the string. Returns +false+ otherwise.
123 77
       def self.consumes?(string)
124 78
         # Unpack is a little bit faster than regular expressions.
@@ -130,89 +84,131 @@ def self.consumes?(string)
130 84
 
131 85
       include Comparable
132 86
 
133  
-      # Returns <tt>-1</tt>, <tt>0</tt> or <tt>+1</tt> depending on whether the Chars object is to be sorted before,
134  
-      # equal or after the object on the right side of the operation. It accepts any object that implements +to_s+.
135  
-      # See <tt>String#<=></tt> for more details.
136  
-      #
137  
-      # Example:
138  
-      #   'é'.mb_chars <=> 'ü'.mb_chars #=> -1
139  
-      def <=>(other)
140  
-        @wrapped_string <=> other.to_s
141  
-      end
  87
+      if RUBY_VERSION < "1.9"
  88
+        # Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns
  89
+        # +false+ otherwise.
  90
+        def self.wants?(string)
  91
+          $KCODE == 'UTF8' && consumes?(string)
  92
+        end
142 93
 
143  
-      # Returns a new Chars object containing the _other_ object concatenated to the string.
144  
-      #
145  
-      # Example:
146  
-      #   ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl"
147  
-      def +(other)
148  
-        self << other
149  
-      end
  94
+        # Returns <tt>-1</tt>, <tt>0</tt> or <tt>+1</tt> depending on whether the Chars object is to be sorted before,
  95
+        # equal or after the object on the right side of the operation. It accepts any object that implements +to_s+.
  96
+        # See <tt>String#<=></tt> for more details.
  97
+        #
  98
+        # Example:
  99
+        #   'é'.mb_chars <=> 'ü'.mb_chars #=> -1
  100
+        def <=>(other)
  101
+          @wrapped_string <=> other.to_s
  102
+        end
150 103
 
151  
-      # Like <tt>String#=~</tt> only it returns the character offset (in codepoints) instead of the byte offset.
152  
-      #
153  
-      # Example:
154  
-      #   'Café périferôl'.mb_chars =~ /ô/ #=> 12
155  
-      def =~(other)
156  
-        translate_offset(@wrapped_string =~ other)
157  
-      end
  104
+        # Returns a new Chars object containing the _other_ object concatenated to the string.
  105
+        #
  106
+        # Example:
  107
+        #   ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl"
  108
+        def +(other)
  109
+          self << other
  110
+        end
158 111
 
159  
-      # Works just like <tt>String#split</tt>, with the exception that the items in the resulting list are Chars
160  
-      # instances instead of String. This makes chaining methods easier.
161  
-      #
162  
-      # Example:
163  
-      #   'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"]
164  
-      def split(*args)
165  
-        @wrapped_string.split(*args).map { |i| i.mb_chars }
166  
-      end
  112
+        # Like <tt>String#=~</tt> only it returns the character offset (in codepoints) instead of the byte offset.
  113
+        #
  114
+        # Example:
  115
+        #   'Café périferôl'.mb_chars =~ /ô/ #=> 12
  116
+        def =~(other)
  117
+          translate_offset(@wrapped_string =~ other)
  118
+        end
167 119
 
168  
-      # Inserts the passed string at specified codepoint offsets.
169  
-      #
170  
-      # Example:
171  
-      #   'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl"
172  
-      def insert(offset, fragment)
173  
-        unpacked = self.class.u_unpack(@wrapped_string)
174  
-        unless offset > unpacked.length
175  
-          @wrapped_string.replace(
176  
-            self.class.u_unpack(@wrapped_string).insert(offset, *self.class.u_unpack(fragment)).pack('U*')
177  
-          )
178  
-        else
179  
-          raise IndexError, "index #{offset} out of string"
  120
+        # Inserts the passed string at specified codepoint offsets.
  121
+        #
  122
+        # Example:
  123
+        #   'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl"
  124
+        def insert(offset, fragment)
  125
+          unpacked = Unicode.u_unpack(@wrapped_string)
  126
+          unless offset > unpacked.length
  127
+            @wrapped_string.replace(
  128
+              Unicode.u_unpack(@wrapped_string).insert(offset, *Unicode.u_unpack(fragment)).pack('U*')
  129
+            )
  130
+          else
  131
+            raise IndexError, "index #{offset} out of string"
  132
+          end
  133
+          self
180 134
         end
181  
-        self
182  
-      end
183 135
 
184  
-      # Returns +true+ if contained string contains _other_. Returns +false+ otherwise.
185  
-      #
186  
-      # Example:
187  
-      #   'Café'.mb_chars.include?('é') #=> true
188  
-      def include?(other)
189  
-        # We have to redefine this method because Enumerable defines it.
190  
-        @wrapped_string.include?(other)
191  
-      end
  136
+        # Returns +true+ if contained string contains _other_. Returns +false+ otherwise.
  137
+        #
  138
+        # Example:
  139
+        #   'Café'.mb_chars.include?('é') #=> true
  140
+        def include?(other)
  141
+          # We have to redefine this method because Enumerable defines it.
  142
+          @wrapped_string.include?(other)
  143
+        end
192 144
 
193  
-      # Returns the position _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found.
194  
-      #
195  
-      # Example:
196  
-      #   'Café périferôl'.mb_chars.index('ô') #=> 12
197  
-      #   'Café périferôl'.mb_chars.index(/\w/u) #=> 0
198  
-      def index(needle, offset=0)
199  
-        wrapped_offset = first(offset).wrapped_string.length
200  
-        index = @wrapped_string.index(needle, wrapped_offset)
201  
-        index ? (self.class.u_unpack(@wrapped_string.slice(0...index)).size) : nil
  145
+        # Returns the position of _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found.
  146
+        #
  147
+        # Example:
  148
+        #   'Café périferôl'.mb_chars.index('ô') #=> 12
  149
+        #   'Café périferôl'.mb_chars.index(/\w/u) #=> 0
  150
+        def index(needle, offset=0)
  151
+          wrapped_offset = first(offset).wrapped_string.length
  152
+          index = @wrapped_string.index(needle, wrapped_offset)
  153
+          index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil
  154
+        end
  155
+
  156
+        # Returns the position _needle_ in the string, counting in
  157
+        # codepoints, searching backward from _offset_ or the end of the
  158
+        # string. Returns +nil+ if _needle_ isn't found.
  159
+        #
  160
+        # Example:
  161
+        #   'Café périferôl'.mb_chars.rindex('é') #=> 6
  162
+        #   'Café périferôl'.mb_chars.rindex(/\w/u) #=> 13
  163
+        def rindex(needle, offset=nil)
  164
+          offset ||= length
  165
+          wrapped_offset = first(offset).wrapped_string.length
  166
+          index = @wrapped_string.rindex(needle, wrapped_offset)
  167
+          index ? (Unicode.u_unpack(@wrapped_string.slice(0...index)).size) : nil
  168
+        end
  169
+
  170
+        # Returns the number of codepoints in the string
  171
+        def size
  172
+          Unicode.u_unpack(@wrapped_string).size
  173
+        end
  174
+        alias_method :length, :size
  175
+
  176
+        # Strips entire range of Unicode whitespace from the right of the string.
  177
+        def rstrip
  178
+          chars(@wrapped_string.gsub(Unicode::TRAILERS_PAT, ''))
  179
+        end
  180
+
  181
+        # Strips entire range of Unicode whitespace from the left of the string.
  182
+        def lstrip
  183
+          chars(@wrapped_string.gsub(Unicode::LEADERS_PAT, ''))
  184
+        end
  185
+
  186
+        # Strips entire range of Unicode whitespace from the right and left of the string.
  187
+        def strip
  188
+          rstrip.lstrip
  189
+        end
  190
+
  191
+        # Returns the codepoint of the first character in the string.
  192
+        #
  193
+        # Example:
  194
+        #   'こんにちは'.mb_chars.ord #=> 12371
  195
+        def ord
  196
+          Unicode.u_unpack(@wrapped_string)[0]
  197
+        end
  198
+
  199
+      else
  200
+        def =~(other)
  201
+          @wrapped_string =~ other
  202
+        end
202 203
       end
203 204
 
204  
-      # Returns the position _needle_ in the string, counting in
205  
-      # codepoints, searching backward from _offset_ or the end of the
206  
-      # string. Returns +nil+ if _needle_ isn't found.
  205
+      # Works just like <tt>String#split</tt>, with the exception that the items in the resulting list are Chars
  206
+      # instances instead of String. This makes chaining methods easier.
207 207
       #
208 208
       # Example:
209  
-      #   'Café périferôl'.mb_chars.rindex('é') #=> 6
210  
-      #   'Café périferôl'.mb_chars.rindex(/\w/u) #=> 13
211  
-      def rindex(needle, offset=nil)
212  
-        offset ||= length
213  
-        wrapped_offset = first(offset).wrapped_string.length
214  
-        index = @wrapped_string.rindex(needle, wrapped_offset)
215  
-        index ? (self.class.u_unpack(@wrapped_string.slice(0...index)).size) : nil
  209
+      #   'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"]
  210
+      def split(*args)
  211
+        @wrapped_string.split(*args).map { |i| i.mb_chars }
216 212
       end
217 213
 
218 214
       # Like <tt>String#[]=</tt>, except instead of byte offsets you specify character offsets.
@@ -234,7 +230,7 @@ def []=(*args)
234 230
         if args.first.is_a?(Regexp)
235 231
           @wrapped_string[*args] = replace_by
236 232
         else
237  
-          result = self.class.u_unpack(@wrapped_string)
  233
+          result = Unicode.u_unpack(@wrapped_string)
238 234
           if args[0].is_a?(Fixnum)
239 235
             raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length
240 236
             min = args[0]
@@ -247,10 +243,10 @@ def []=(*args)
247 243
           else
248 244
             needle = args[0].to_s
249 245
             min = index(needle)
250  
-            max = min + self.class.u_unpack(needle).length - 1
  246
+            max = min + Unicode.u_unpack(needle).length - 1
251 247
             range = Range.new(min, max)
252 248
           end
253  
-          result[range] = self.class.u_unpack(replace_by)
  249
+          result[range] = Unicode.u_unpack(replace_by)
254 250
           @wrapped_string.replace(result.pack('U*'))
255 251
         end
256 252
       end
@@ -294,33 +290,13 @@ def center(integer, padstr=' ')
294 290
         justify(integer, :center, padstr)
295 291
       end
296 292
 
297  
-      # Strips entire range of Unicode whitespace from the right of the string.
298  
-      def rstrip
299  
-        chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, ''))
300  
-      end
301  
-
302  
-      # Strips entire range of Unicode whitespace from the left of the string.
303  
-      def lstrip
304  
-        chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, ''))
305  
-      end
306  
-
307  
-      # Strips entire range of Unicode whitespace from the right and left of the string.
308  
-      def strip
309  
-        rstrip.lstrip
310  
-      end
311  
-
312  
-      # Returns the number of codepoints in the string
313  
-      def size
314  
-        self.class.u_unpack(@wrapped_string).size
315  
-      end
316  
-      alias_method :length, :size
317 293
 
318 294
       # Reverses all characters in the string.
319 295
       #
320 296
       # Example:
321 297
       #   'Café'.mb_chars.reverse.to_s #=> 'éfaC'
322 298
       def reverse
323  
-        chars(self.class.g_unpack(@wrapped_string).reverse.flatten.pack('U*'))
  299
+        chars(Unicode.g_unpack(@wrapped_string).reverse.flatten.pack('U*'))
324 300
       end
325 301
 
326 302
       # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
@@ -336,15 +312,15 @@ def slice(*args)
336 312
         elsif (args.size == 2 && !args[1].is_a?(Numeric))
337 313
           raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
338 314
         elsif args[0].kind_of? Range
339  
-          cps = self.class.u_unpack(@wrapped_string).slice(*args)
  315
+          cps = Unicode.u_unpack(@wrapped_string).slice(*args)
340 316
           result = cps.nil? ? nil : cps.pack('U*')
341 317
         elsif args[0].kind_of? Regexp
342 318
           result = @wrapped_string.slice(*args)
343 319
         elsif args.size == 1 && args[0].kind_of?(Numeric)
344  
-          character = self.class.u_unpack(@wrapped_string)[args[0]]
  320
+          character = Unicode.u_unpack(@wrapped_string)[args[0]]
345 321
           result = character.nil? ? nil : [character].pack('U')
346 322
         else
347  
-          result = self.class.u_unpack(@wrapped_string).slice(*args).pack('U*')
  323
+          result = Unicode.u_unpack(@wrapped_string).slice(*args).pack('U*')
348 324
         end
349 325
         result.nil? ? nil : chars(result)
350 326
       end
@@ -372,20 +348,12 @@ def limit(limit)
372 348
         slice(0...translate_offset(limit))
373 349
       end
374 350
 
375  
-      # Returns the codepoint of the first character in the string.
376  
-      #
377  
-      # Example:
378  
-      #   'こんにちは'.mb_chars.ord #=> 12371
379  
-      def ord
380  
-        self.class.u_unpack(@wrapped_string)[0]
381  
-      end
382  
-
383 351
       # Convert characters in the string to uppercase.
384 352
       #
385 353
       # Example:
386 354
       #   'Laurent, où sont les tests ?'.mb_chars.upcase.to_s #=> "LAURENT, OÙ SONT LES TESTS ?"
387 355
       def upcase
388  
-        apply_mapping :uppercase_mapping
  356
+        chars(Unicode.apply_mapping @wrapped_string, :uppercase_mapping)
389 357
       end
390 358
 
391 359
       # Convert characters in the string to lowercase.
@@ -393,7 +361,7 @@ def upcase
393 361
       # Example:
394 362
       #   'VĚDA A VÝZKUM'.mb_chars.downcase.to_s #=> "věda a výzkum"
395 363
       def downcase
396  
-        apply_mapping :lowercase_mapping
  364
+        chars(Unicode.apply_mapping @wrapped_string, :lowercase_mapping)
397 365
       end
398 366
 
399 367
       # Converts the first character to uppercase and the remainder to lowercase.
@@ -409,9 +377,9 @@ def capitalize
409 377
       #
410 378
       # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
411 379
       #   <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
412  
-      #   ActiveSupport::Multibyte.default_normalization_form
413  
-      def normalize(form=ActiveSupport::Multibyte.default_normalization_form)
414  
-        chars(self.class.normalize(@wrapped_string, form))
  380
+      #   ActiveSupport::Multibyte::Unicode.default_normalization_form
  381
+      def normalize(form = nil)
  382
+        chars(Unicode.normalize(@wrapped_string, form))
415 383
       end
416 384
 
417 385
       # Performs canonical decomposition on all the characters.
@@ -420,7 +388,7 @@ def normalize(form=ActiveSupport::Multibyte.default_normalization_form)
420 388
       #   'é'.length #=> 2
421 389
       #   'é'.mb_chars.decompose.to_s.length #=> 3
422 390
       def decompose
423  
-        chars(self.class.decompose_codepoints(:canonical, self.class.u_unpack(@wrapped_string)).pack('U*'))
  391
+        chars(Unicode.decompose_codepoints(:canonical, Unicode.u_unpack(@wrapped_string)).pack('U*'))
424 392
       end
425 393
 
426 394
       # Performs composition on all the characters.
@@ -429,7 +397,7 @@ def decompose
429 397
       #   'é'.length #=> 3
430 398
       #   'é'.mb_chars.compose.to_s.length #=> 2
431 399
       def compose
432  
-        chars(self.class.compose_codepoints(self.class.u_unpack(@wrapped_string)).pack('U*'))
  400
+        chars(Unicode.compose_codepoints(Unicode.u_unpack(@wrapped_string)).pack('U*'))
433 401
       end
434 402
 
435 403
       # Returns the number of grapheme clusters in the string.
@@ -438,14 +406,14 @@ def compose
438 406
       #   'क्षि'.mb_chars.length #=> 4
439 407
       #   'क्षि'.mb_chars.g_length #=> 3
440 408
       def g_length
441  
-        self.class.g_unpack(@wrapped_string).length
  409
+        Unicode.g_unpack(@wrapped_string).length
442 410
       end
443 411
 
444 412
       # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
445 413
       #
446 414
       # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
447 415
       def tidy_bytes(force = false)
448  
-        chars(self.class.tidy_bytes(@wrapped_string, force))
  416
+        chars(Unicode.tidy_bytes(@wrapped_string, force))
449 417
       end
450 418
 
451 419
       %w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method|
@@ -459,266 +427,6 @@ def tidy_bytes(force = false)
459 427
         end
460 428
       end
461 429
 
462  
-      class << self
463  
-
464  
-        # Unpack the string at codepoint boundaries. Raises an EncodingError when the encoding of the string isn't
465  
-        # valid UTF-8.
466  
-        #
467  
-        # Example:
468  
-        #   Chars.u_unpack('Café') #=> [67, 97, 102, 233]
469  
-        def u_unpack(string)
470  
-          begin
471  
-            string.unpack 'U*'
472  
-          rescue ArgumentError
473  
-            raise EncodingError, 'malformed UTF-8 character'
474  
-          end
475  
-        end
476  
-
477  
-        # Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified
478  
-        # character class and +false+ otherwise. Valid character classes are: <tt>:cr</tt>, <tt>:lf</tt>, <tt>:l</tt>,
479  
-        # <tt>:v</tt>, <tt>:lv</tt>, <tt>:lvt</tt> and <tt>:t</tt>.
480  
-        #
481  
-        # Primarily used by the grapheme cluster support.
482  
-        def in_char_class?(codepoint, classes)
483  
-          classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false
484  
-        end
485  
-
486  
-        # Unpack the string at grapheme boundaries. Returns a list of character lists.
487  
-        #
488  
-        # Example:
489  
-        #   Chars.g_unpack('क्षि') #=> [[2325, 2381], [2359], [2367]]
490  
-        #   Chars.g_unpack('Café') #=> [[67], [97], [102], [233]]
491  
-        def g_unpack(string)
492  
-          codepoints = u_unpack(string)
493  
-          unpacked = []
494  
-          pos = 0
495  
-          marker = 0
496  
-          eoc = codepoints.length
497  
-          while(pos < eoc)
498  
-            pos += 1
499  
-            previous = codepoints[pos-1]
500  
-            current = codepoints[pos]
501  
-            if (
502  
-                # CR X LF
503  
-                one = ( previous == UCD.boundary[:cr] and current == UCD.boundary[:lf] ) or
504  
-                # L X (L|V|LV|LVT)
505  
-                two = ( UCD.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or
506  
-                # (LV|V) X (V|T)
507  
-                three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or
508  
-                # (LVT|T) X (T)
509  
-                four = ( in_char_class?(previous, [:lvt,:t]) and UCD.boundary[:t] === current ) or
510  
-                # X Extend
511  
-                five = (UCD.boundary[:extend] === current)
512  
-              )
513  
-            else
514  
-              unpacked << codepoints[marker..pos-1]
515  
-              marker = pos
516  
-            end
517  
-          end
518  
-          unpacked
519  
-        end
520  
-
521  
-        # Reverse operation of g_unpack.
522  
-        #
523  
-        # Example:
524  
-        #   Chars.g_pack(Chars.g_unpack('क्षि')) #=> 'क्षि'
525  
-        def g_pack(unpacked)
526  
-          (unpacked.flatten).pack('U*')
527  
-        end
528  
-
529  
-        def padding(padsize, padstr=' ') #:nodoc:
530  
-          if padsize != 0
531  
-            new(padstr * ((padsize / u_unpack(padstr).size) + 1)).slice(0, padsize)
532  
-          else
533  
-            ''
534  
-          end
535  
-        end
536  
-
537  
-        # Re-order codepoints so the string becomes canonical.
538  
-        def reorder_characters(codepoints)
539  
-          length = codepoints.length- 1
540  
-          pos = 0
541  
-          while pos < length do
542  
-            cp1, cp2 = UCD.codepoints[codepoints[pos]], UCD.codepoints[codepoints[pos+1]]
543  
-            if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0)
544  
-              codepoints[pos..pos+1] = cp2.code, cp1.code
545  
-              pos += (pos > 0 ? -1 : 1)
546  
-            else
547  
-              pos += 1
548  
-            end
549  
-          end
550  
-          codepoints
551  
-        end
552  
-
553  
-        # Decompose composed characters to the decomposed form.
554  
-        def decompose_codepoints(type, codepoints)
555  
-          codepoints.inject([]) do |decomposed, cp|
556  
-            # if it's a hangul syllable starter character
557  
-            if HANGUL_SBASE <= cp and cp < HANGUL_SLAST
558  
-              sindex = cp - HANGUL_SBASE
559  
-              ncp = [] # new codepoints
560  
-              ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT
561  
-              ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
562  
-              tindex = sindex % HANGUL_TCOUNT
563  
-              ncp << (HANGUL_TBASE + tindex) unless tindex == 0
564  
-              decomposed.concat ncp
565  
-            # if the codepoint is decomposable with the current decomposition type
566  
-            elsif (ncp = UCD.codepoints[cp].decomp_mapping) and (!UCD.codepoints[cp].decomp_type || type == :compatability)
567  
-              decomposed.concat decompose_codepoints(type, ncp.dup)
568  
-            else
569  
-              decomposed << cp
570  
-            end
571  
-          end
572  
-        end
573  
-
574  
-        # Compose decomposed characters to the composed form.
575  
-        def compose_codepoints(codepoints)
576  
-          pos = 0
577  
-          eoa = codepoints.length - 1
578  
-          starter_pos = 0
579  
-          starter_char = codepoints[0]
580  
-          previous_combining_class = -1
581  
-          while pos < eoa
582  
-            pos += 1
583  
-            lindex = starter_char - HANGUL_LBASE
584  
-            # -- Hangul
585  
-            if 0 <= lindex and lindex < HANGUL_LCOUNT
586  
-              vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
587  
-              if 0 <= vindex and vindex < HANGUL_VCOUNT
588  
-                tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
589  
-                if 0 <= tindex and tindex < HANGUL_TCOUNT
590  
-                  j = starter_pos + 2
591  
-                  eoa -= 2
592  
-                else
593  
-                  tindex = 0
594  
-                  j = starter_pos + 1
595  
-                  eoa -= 1
596  
-                end
597  
-                codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
598  
-              end
599  
-              starter_pos += 1
600  
-              starter_char = codepoints[starter_pos]
601  
-            # -- Other characters
602  
-            else
603  
-              current_char = codepoints[pos]
604  
-              current = UCD.codepoints[current_char]
605  
-              if current.combining_class > previous_combining_class
606  
-                if ref = UCD.composition_map[starter_char]
607  
-                  composition = ref[current_char]
608  
-                else
609  
-                  composition = nil
610  
-                end
611  
-                unless composition.nil?
612  
-                  codepoints[starter_pos] = composition
613  
-                  starter_char = composition
614  
-                  codepoints.delete_at pos
615  
-                  eoa -= 1
616  
-                  pos -= 1
617  
-                  previous_combining_class = -1
618  
-                else
619  
-                  previous_combining_class = current.combining_class
620  
-                end
621  
-              else
622  
-                previous_combining_class = current.combining_class
623  
-              end
624  
-              if current.combining_class == 0
625  
-                starter_pos = pos
626  
-                starter_char = codepoints[pos]
627  
-              end
628  
-            end
629  
-          end
630  
-          codepoints
631  
-        end
632  
-
633  
-        def tidy_byte(byte)
634  
-          if byte < 160
635  
-            [UCD.cp1252[byte] || byte].pack("U").unpack("C*")
636  
-          elsif byte < 192
637  
-            [194, byte]
638  
-          else
639  
-            [195, byte - 64]
640  
-          end
641  
-        end
642  
-        private :tidy_byte
643  
-
644  
-        # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
645  
-        #
646  
-        # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
647  
-        def tidy_bytes(string, force = false)
648  
-          if force
649  
-            return string.unpack("C*").map do |b|
650  
-              tidy_byte(b)
651  
-            end.flatten.compact.pack("C*").unpack("U*").pack("U*")
652  
-          end
653  
-
654  
-          bytes = string.unpack("C*")
655  
-          conts_expected = 0
656  
-          last_lead = 0
657  
-
658  
-          bytes.each_index do |i|
659  
-
660  
-            byte          = bytes[i]
661  
-            is_ascii      = byte < 128
662  
-            is_cont       = byte > 127 && byte < 192
663  
-            is_lead       = byte > 191 && byte < 245
664  
-            is_unused     = byte > 240
665  
-            is_restricted = byte > 244
666  
-
667  
-            # Impossible or highly unlikely byte? Clean it.
668  
-            if is_unused || is_restricted
669  
-              bytes[i] = tidy_byte(byte)
670  
-            elsif is_cont
671  
-              # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
672  
-              conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
673  
-            else
674  
-              if conts_expected > 0
675  
-                # Expected continuation, but got ASCII or leading? Clean backwards up to
676  
-                # the leading byte.
677  
-                (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
678  
-                conts_expected = 0
679  
-              end
680  
-              if is_lead
681  
-                # Final byte is leading? Clean it.
682  
-                if i == bytes.length - 1
683  
-                  bytes[i] = tidy_byte(bytes.last)
684  
-                else
685  
-                  # Valid leading byte? Expect continuations determined by position of
686  
-                  # first zero bit, with max of 3.
687  
-                  conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
688  
-                  last_lead = i
689  
-                end
690  
-              end
691  
-            end
692  
-          end
693  
-          bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
694  
-        end
695  
-
696  
-        # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
697  
-        # passing strings to databases and validations.
698  
-        #
699  
-        # * <tt>string</tt> - The string to perform normalization on.
700  
-        # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
701  
-        #   <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
702  
-        #   ActiveSupport::Multibyte.default_normali