Simplify ActiveSupport::Multibyte and make it run on Ruby 1.9.

* Unicode methods are now defined directly on Chars instead of a handler
* Updated Unicode database to Unicode 5.1.0
* Improved documentation
rails · Sep 21, 2008 · 22f75d5 · 22f75d5
1 parent 5f83e18
commit 22f75d5
Show file tree

Hide file tree

Showing 18 changed files with 1,562 additions and 1,550 deletions.
diff --git a/activesupport/bin/generate_tables b/activesupport/bin/generate_tables
@@ -0,0 +1,147 @@
+#!/usr/bin/env ruby
+
+begin
+  $:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib'))
+  require 'active_support'
+rescue IOError
+end
+
+require 'open-uri'
+require 'tmpdir'
+
+module ActiveSupport
+  module Multibyte
+    # Defines (or reopens) UnicodeDatabase so that +load+ is a no-op while this script runs.
+    # NOTE(review): presumably this stops the library from reading the table file that this
+    # script is about to generate — confirm against active_support/multibyte/unicode_database.
+    class UnicodeDatabase
+      def load; end
+    end
+
+    class UnicodeDatabaseGenerator
+      # Base URL of the Unicode data files for the version named by
+      # ActiveSupport::Multibyte::UNICODE_VERSION.
+      BASE_URI = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd/"
+      # Maps each source type to the URL of its data file. #parse sends every line of a file
+      # to the method named parse_<type>, so each key here needs a matching parser method.
+      SOURCES = {
+        :codepoints => BASE_URI + 'UnicodeData.txt',
+        :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
+        :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
+        :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
+      }
+
+      def initialize
+        @ucd = UnicodeDatabase.new
+
+        default = Codepoint.new
+        default.combining_class = 0
+        default.uppercase_mapping = 0
+        default.lowercase_mapping = 0
+        @ucd.codepoints = Hash.new(default)
+      end
+
+      def parse_codepoints(line)
+        codepoint = Codepoint.new
+        raise "Could not parse input." unless line =~ /^
+          ([0-9A-F]+);        # code
+          ([^;]+);            # name
+          ([A-Z]+);           # general category
+          ([0-9]+);           # canonical combining class
+          ([A-Z]+);           # bidi class
+          (<([A-Z]*)>)?       # decomposition type
+          ((\ ?[0-9A-F]+)*);  # decompomposition mapping
+          ([0-9]*);           # decimal digit
+          ([0-9]*);           # digit
+          ([^;]*);            # numeric
+          ([YN]*);            # bidi mirrored
+          ([^;]*);            # unicode 1.0 name
+          ([^;]*);            # iso comment
+          ([0-9A-F]*);        # simple uppercase mapping
+          ([0-9A-F]*);        # simple lowercase mapping
+          ([0-9A-F]*)$/ix     # simple titlecase mapping
+        codepoint.code              = $1.hex
+        #codepoint.name              = $2
+        #codepoint.category          = $3
+        codepoint.combining_class   = Integer($4)
+        #codepoint.bidi_class        = $5
+        codepoint.decomp_type       = $7
+        codepoint.decomp_mapping    = ($8=='') ? nil : $8.split.collect { |element| element.hex }
+        #codepoint.bidi_mirrored     = ($13=='Y') ? true : false
+        codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
+        codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
+        #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
+        @ucd.codepoints[codepoint.code] = codepoint
+      end
+
+      def parse_grapheme_break_property(line)
+        if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
+          type = $2.downcase.intern
+          @ucd.boundary[type] ||= []
+          if $1.include? '..'
+            parts = $1.split '..'
+            @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
+          else
+            @ucd.boundary[type] << $1.hex
+          end
+        end
+      end
+
+      def parse_composition_exclusion(line)
+        if line =~ /^([0-9A-F]+)/i
+          @ucd.composition_exclusion << $1.hex
+        end
+      end
+
+      def parse_cp1252(line)
+        if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
+          @ucd.cp1252[$1.hex] = $2.hex
+        end
+      end
+
+      def create_composition_map
+        @ucd.codepoints.each do |_, cp|
+          if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
+            @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
+            @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
+          end
+        end
+      end
+
+      def normalize_boundary_map
+        @ucd.boundary.each do |k,v|
+          if [:lf, :cr].include? k
+            @ucd.boundary[k] = v[0]
+          end
+        end
+      end
+
+      def parse
+        SOURCES.each do |type, url|
+          filename =  File.join(Dir.tmpdir, "#{url.split('/').last}")
+          unless File.exist?(filename)
+            $stderr.puts "Downloading #{url.split('/').last}"
+            File.open(filename, 'wb') do |target|
+              open(url) do |source|
+                source.each_line { |line| target.write line }
+              end
+            end
+          end
+          File.open(filename) do |file|
+            file.each_line { |line| send "parse_#{type}".intern, line }
+          end
+        end
+        create_composition_map
+        normalize_boundary_map
+      end
+
+      def dump_to(filename)
+        File.open(filename, 'wb') do |f|
+          f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
+        end
+      end
+    end
+  end
+end
+
+if __FILE__ == $0
+  filename = ActiveSupport::Multibyte::UnicodeDatabase.filename
+  generator = ActiveSupport::Multibyte::UnicodeDatabaseGenerator.new
+  generator.parse
+  print "Writing to: #{filename}"
+  generator.dump_to filename
+  puts " (#{File.size(filename)} bytes)"
+end
diff --git a/activesupport/lib/active_support/core_ext/string.rb b/activesupport/lib/active_support/core_ext/string.rb
@@ -1,9 +1,11 @@
+# encoding: utf-8
+
 require 'active_support/core_ext/string/inflections'
 require 'active_support/core_ext/string/conversions'
 require 'active_support/core_ext/string/access'
 require 'active_support/core_ext/string/starts_ends_with'
 require 'active_support/core_ext/string/iterators'
-require 'active_support/core_ext/string/unicode'
+require 'active_support/core_ext/string/multibyte'
 require 'active_support/core_ext/string/xchar'
 require 'active_support/core_ext/string/filters'
 require 'active_support/core_ext/string/behavior'
@@ -15,6 +17,6 @@ class String #:nodoc:
   include ActiveSupport::CoreExtensions::String::Inflections
   include ActiveSupport::CoreExtensions::String::StartsEndsWith
   include ActiveSupport::CoreExtensions::String::Iterators
-  include ActiveSupport::CoreExtensions::String::Unicode
   include ActiveSupport::CoreExtensions::String::Behavior
+  include ActiveSupport::CoreExtensions::String::Multibyte
 end
diff --git a/activesupport/lib/active_support/core_ext/string/multibyte.rb b/activesupport/lib/active_support/core_ext/string/multibyte.rb
@@ -0,0 +1,81 @@
+# encoding: utf-8
+
+module ActiveSupport #:nodoc:
+  module CoreExtensions #:nodoc:
+    module String #:nodoc:
+      # Implements multibyte methods for easier access to multibyte characters in a String instance.
+      module Multibyte
+        unless '1.9'.respond_to?(:force_encoding)
+          # +mb_chars+ is a multibyte safe proxy method for string methods.
+          #
+          # In Ruby 1.8 and older it creates and returns an instance of the ActiveSupport::Multibyte::Chars class which
+          # encapsulates the original string. Unicode-safe versions of all the String methods are defined on this proxy
+          # class. If the proxy class doesn't respond to a certain method, it's forwarded to the encapsulated string.
+          #
+          #   name = 'Claus Müller'
+          #   name.reverse  #=> "rell??M sualC"
+          #   name.length   #=> 13
+          #
+          #   name.mb_chars.reverse.to_s   #=> "rellüM sualC"
+          #   name.mb_chars.length         #=> 12
+          #
+          # In Ruby 1.9 and newer +mb_chars+ returns +self+ because String is (mostly) encoding aware so we don't need
+          # a proxy class any more. This means that +mb_chars+ makes it easier to write code that runs on multiple Ruby
+          # versions.
+          #
+          # == Method chaining 
+          #
+          # All the methods on the Chars proxy which normally return a string will return a Chars object. This allows
+          # method chaining on the result of any of these methods.
+          #
+          #   name.mb_chars.reverse.length #=> 12
+          #
+          # == Interoperability and configuration
+          #
+          # The Chars object tries to be as interchangeable with String objects as possible: sorting and comparing between
+          # String and Chars work as expected. The bang! methods change the internal string representation in the Chars
+          # object. Interoperability problems can be resolved easily with a +to_s+ call.
+          #
+          # For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars. For
+          # information about how to change the default Multibyte behaviour, see ActiveSupport::Multibyte.
+          def mb_chars
+            if ActiveSupport::Multibyte.proxy_class.wants?(self)
+              ActiveSupport::Multibyte.proxy_class.new(self)
+            else
+              self
+            end
+          end
+
+          # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
+          # them), returns false otherwise. The check is delegated to ActiveSupport::Multibyte::Chars.consumes?.
+          # NOTE(review): this always asks Chars, whereas +mb_chars+ asks the configurable proxy_class — confirm that
+          # is intended when a custom proxy class is installed.
+          def is_utf8?
+            ActiveSupport::Multibyte::Chars.consumes?(self)
+          end
+
+          # Provide +chars+ as an alias for +mb_chars+ only when String does not already respond to :chars.
+          unless '1.8.7 and later'.respond_to?(:chars)
+            alias chars mb_chars
+          end
+        else
+          # In Ruby 1.9 and newer +mb_chars+ returns +self+. In Ruby 1.8 and older +mb_chars+ creates and returns a
+          # Unicode-safe proxy for string operations. Having both makes it easier to write code that runs on multiple
+          # Ruby versions.
+          def mb_chars
+            self
+          end
+
+          # Returns true if the string has valid UTF-8 encoding.
+          def is_utf8?
+            case encoding
+            when Encoding::UTF_8
+              valid_encoding?
+            when Encoding::ASCII_8BIT, Encoding::US_ASCII
+              dup.force_encoding(Encoding::UTF_8).valid_encoding?
+            else
+              false
+            end
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/activesupport/lib/active_support/core_ext/string/unicode.rb b/activesupport/lib/active_support/core_ext/string/unicode.rb
diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb
@@ -1,9 +1,33 @@
-module ActiveSupport
+# encoding: utf-8
+
+require 'active_support/multibyte/chars'
+require 'active_support/multibyte/exceptions'
+require 'active_support/multibyte/unicode_database'
+
+module ActiveSupport #:nodoc:
   module Multibyte #:nodoc:
-    DEFAULT_NORMALIZATION_FORM = :kc
+    # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
+    # information about normalization.
     NORMALIZATIONS_FORMS = [:c, :kc, :d, :kd]
-    UNICODE_VERSION = '5.0.0'
-  end
-end
 
-require 'active_support/multibyte/chars'
+    # The Unicode version that is supported by the implementation
+    UNICODE_VERSION = '5.1.0'
+
+    # The default normalization used for operations that require normalization. It can be set to any of the
+    # normalizations in NORMALIZATIONS_FORMS.
+    #
+    # Example:
+    #   ActiveSupport::Multibyte.default_normalization_form = :c
+    mattr_accessor :default_normalization_form
+    self.default_normalization_form = :kc
+
+    # The proxy class returned when calling mb_chars. You can use this accessor to configure your own proxy
+    # class so you can support other encodings. See the ActiveSupport::Multibyte::Chars implementation for
+    # an example of how to do this.
+    #
+    # Example:
+    #   ActiveSupport::Multibyte.proxy_class = CharsForUTF32
+    mattr_accessor :proxy_class
+    self.proxy_class = ActiveSupport::Multibyte::Chars
+  end
+end