Permalink
Browse files

Simplify ActiveSupport::Multibyte and make it run on Ruby 1.9.

* Unicode methods are now defined directly on Chars instead of a handler
* Updated Unicode database to Unicode 5.1.0
* Improved documentation
  • Loading branch information...
1 parent 5f83e18 commit 22f75d539dca7b6f33cbf86e4e9d1944bb22731f @Manfred Manfred committed Sep 21, 2008
@@ -0,0 +1,147 @@
+#!/usr/bin/env ruby
+
+begin # put the sibling lib directory first on the load path, then load ActiveSupport from it
+ $:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib'))
+ require 'active_support'
+rescue IOError # NOTE(review): a failed require raises LoadError, which this clause does not catch; confirm IOError is what was intended
+end
+
+require 'open-uri'
+require 'tmpdir'
+
+module ActiveSupport
+ module Multibyte
+ class UnicodeDatabase
+ def load; end
+ end
+
+ class UnicodeDatabaseGenerator
+ BASE_URI = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd/"
+ SOURCES = {
+ :codepoints => BASE_URI + 'UnicodeData.txt',
+ :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
+ :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
+ :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
+ }
+
+ def initialize
+ @ucd = UnicodeDatabase.new
+
+ default = Codepoint.new
+ default.combining_class = 0
+ default.uppercase_mapping = 0
+ default.lowercase_mapping = 0
+ @ucd.codepoints = Hash.new(default)
+ end
+
+ def parse_codepoints(line)
+ codepoint = Codepoint.new
+ raise "Could not parse input." unless line =~ /^
+ ([0-9A-F]+); # code
+ ([^;]+); # name
+ ([A-Z]+); # general category
+ ([0-9]+); # canonical combining class
+ ([A-Z]+); # bidi class
+ (<([A-Z]*)>)? # decomposition type
+ ((\ ?[0-9A-F]+)*); # decompomposition mapping
+ ([0-9]*); # decimal digit
+ ([0-9]*); # digit
+ ([^;]*); # numeric
+ ([YN]*); # bidi mirrored
+ ([^;]*); # unicode 1.0 name
+ ([^;]*); # iso comment
+ ([0-9A-F]*); # simple uppercase mapping
+ ([0-9A-F]*); # simple lowercase mapping
+ ([0-9A-F]*)$/ix # simple titlecase mapping
+ codepoint.code = $1.hex
+ #codepoint.name = $2
+ #codepoint.category = $3
+ codepoint.combining_class = Integer($4)
+ #codepoint.bidi_class = $5
+ codepoint.decomp_type = $7
+ codepoint.decomp_mapping = ($8=='') ? nil : $8.split.collect { |element| element.hex }
+ #codepoint.bidi_mirrored = ($13=='Y') ? true : false
+ codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
+ codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
+ #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
+ @ucd.codepoints[codepoint.code] = codepoint
+ end
+
+ def parse_grapheme_break_property(line)
+ if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
+ type = $2.downcase.intern
+ @ucd.boundary[type] ||= []
+ if $1.include? '..'
+ parts = $1.split '..'
+ @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
+ else
+ @ucd.boundary[type] << $1.hex
+ end
+ end
+ end
+
+ def parse_composition_exclusion(line)
+ if line =~ /^([0-9A-F]+)/i
+ @ucd.composition_exclusion << $1.hex
+ end
+ end
+
+ def parse_cp1252(line)
+ if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
+ @ucd.cp1252[$1.hex] = $2.hex
+ end
+ end
+
+ def create_composition_map
+ @ucd.codepoints.each do |_, cp|
+ if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
+ @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
+ @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
+ end
+ end
+ end
+
+ def normalize_boundary_map
+ @ucd.boundary.each do |k,v|
+ if [:lf, :cr].include? k
+ @ucd.boundary[k] = v[0]
+ end
+ end
+ end
+
+ def parse
+ SOURCES.each do |type, url|
+ filename = File.join(Dir.tmpdir, "#{url.split('/').last}")
+ unless File.exist?(filename)
+ $stderr.puts "Downloading #{url.split('/').last}"
+ File.open(filename, 'wb') do |target|
+ open(url) do |source|
+ source.each_line { |line| target.write line }
+ end
+ end
+ end
+ File.open(filename) do |file|
+ file.each_line { |line| send "parse_#{type}".intern, line }
+ end
+ end
+ create_composition_map
+ normalize_boundary_map
+ end
+
+ def dump_to(filename)
+ File.open(filename, 'wb') do |f|
+ f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
+ end
+ end
+ end
+ end
+end
+
+if __FILE__ == $0
+ filename = ActiveSupport::Multibyte::UnicodeDatabase.filename
+ generator = ActiveSupport::Multibyte::UnicodeDatabaseGenerator.new
+ generator.parse
+ print "Writing to: #{filename}"
+ generator.dump_to filename
+ puts " (#{File.size(filename)} bytes)"
+end
@@ -1,9 +1,11 @@
+# encoding: utf-8
+
require 'active_support/core_ext/string/inflections'
require 'active_support/core_ext/string/conversions'
require 'active_support/core_ext/string/access'
require 'active_support/core_ext/string/starts_ends_with'
require 'active_support/core_ext/string/iterators'
-require 'active_support/core_ext/string/unicode'
+require 'active_support/core_ext/string/multibyte'
require 'active_support/core_ext/string/xchar'
require 'active_support/core_ext/string/filters'
require 'active_support/core_ext/string/behavior'
@@ -15,6 +17,6 @@ class String #:nodoc:
include ActiveSupport::CoreExtensions::String::Inflections
include ActiveSupport::CoreExtensions::String::StartsEndsWith
include ActiveSupport::CoreExtensions::String::Iterators
- include ActiveSupport::CoreExtensions::String::Unicode
include ActiveSupport::CoreExtensions::String::Behavior
+ include ActiveSupport::CoreExtensions::String::Multibyte
end
@@ -0,0 +1,81 @@
+# encoding: utf-8
+
+module ActiveSupport #:nodoc:
+ module CoreExtensions #:nodoc:
+ module String #:nodoc:
+ # Implements multibyte methods for easier access to multibyte characters in a String instance.
+ module Multibyte
+ unless '1.9'.respond_to?(:force_encoding)
+ # +mb_chars+ is a multibyte safe proxy method for string methods.
+ #
+ # In Ruby 1.8 and older it creates and returns an instance of the ActiveSupport::Multibyte::Chars class which
+ # encapsulates the original string. A Unicode safe version of all the String methods are defined on this proxy
+ # class. If the proxy class doesn't respond to a certain method, it's forwarded to the encapsuled string.
+ #
+ # name = 'Claus Müller'
+ # name.reverse #=> "rell??M sualC"
+ # name.length #=> 13
+ #
+ # name.mb_chars.reverse.to_s #=> "rellüM sualC"
+ # name.mb_chars.length #=> 12
+ #
+ # In Ruby 1.9 and newer +mb_chars+ returns +self+ because String is (mostly) encoding aware so we don't need
+ # a proxy class any more. This means that +mb_chars+ makes it easier to write code that runs on multiple Ruby
+ # versions.
+ #
+ # == Method chaining
+ #
+ # All the methods on the Chars proxy which normally return a string will return a Chars object. This allows
+ # method chaining on the result of any of these methods.
+ #
+ # name.mb_chars.reverse.length #=> 12
+ #
+ # == Interoperability and configuration
+ #
+ # The Char object tries to be as interchangeable with String objects as possible: sorting and comparing between
+ # String and Char work like expected. The bang! methods change the internal string representation in the Chars
+ # object. Interoperability problems can be resolved easily with a +to_s+ call.
+ #
+ # For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars. For
+ # information about how to change the default Multibyte behaviour, see ActiveSupport::Multibyte.
+ def mb_chars
+ if ActiveSupport::Multibyte.proxy_class.wants?(self)
+ ActiveSupport::Multibyte.proxy_class.new(self)
+ else
+ self
+ end
+ end
+
+ # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
+ # them), returns false otherwise. The decision is delegated to ActiveSupport::Multibyte::Chars.consumes?.
+ def is_utf8?
+ ActiveSupport::Multibyte::Chars.consumes?(self)
+ end
+
+ unless '1.8.7 and later'.respond_to?(:chars) # only when String has no chars method of its own
+ alias chars mb_chars # make chars another name for mb_chars
+ end
+ else
+ # In Ruby 1.9 and newer +mb_chars+ returns self. In Ruby 1.8 and older +mb_chars+ creates and returns a
+ # Unicode-safe proxy for string operations; this makes it easier to write code that runs on multiple Ruby
+ # versions.
+ def mb_chars
+ self
+ end
+
+ # Returns true if the string has valid UTF-8 encoding.
+ def is_utf8?
+ case encoding
+ when Encoding::UTF_8
+ valid_encoding?
+ when Encoding::ASCII_8BIT, Encoding::US_ASCII
+ dup.force_encoding(Encoding::UTF_8).valid_encoding?
+ else
+ false
+ end
+ end
+ end
+ end
+ end
+ end
+end
@@ -1,66 +0,0 @@
-module ActiveSupport #:nodoc:
- module CoreExtensions #:nodoc:
- module String #:nodoc:
- # Define methods for handling unicode data.
- module Unicode
- def self.append_features(base)
- if '1.8.7 and later'.respond_to?(:chars)
- base.class_eval { remove_method :chars }
- end
- super
- end
-
- unless '1.9'.respond_to?(:force_encoding)
- # +chars+ is a Unicode safe proxy for string methods. It creates and returns an instance of the
- # ActiveSupport::Multibyte::Chars class which encapsulates the original string. A Unicode safe version of all
- # the String methods are defined on this proxy class. Undefined methods are forwarded to String, so all of the
- # string overrides can also be called through the +chars+ proxy.
- #
- # name = 'Claus Müller'
- # name.reverse # => "rell??M sualC"
- # name.length # => 13
- #
- # name.chars.reverse.to_s # => "rellüM sualC"
- # name.chars.length # => 12
- #
- #
- # All the methods on the chars proxy which normally return a string will return a Chars object. This allows
- # method chaining on the result of any of these methods.
- #
- # name.chars.reverse.length # => 12
- #
- # The Char object tries to be as interchangeable with String objects as possible: sorting and comparing between
- # String and Char work like expected. The bang! methods change the internal string representation in the Chars
- # object. Interoperability problems can be resolved easily with a +to_s+ call.
- #
- # For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars and
- # ActiveSupport::Multibyte::Handlers::UTF8Handler.
- def chars
- ActiveSupport::Multibyte::Chars.new(self)
- end
-
- # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
- # them), returns false otherwise.
- def is_utf8?
- ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(self)
- end
- else
- def chars #:nodoc:
- self
- end
-
- def is_utf8? #:nodoc:
- case encoding
- when Encoding::UTF_8
- valid_encoding?
- when Encoding::ASCII_8BIT
- dup.force_encoding('UTF-8').valid_encoding?
- else
- false
- end
- end
- end
- end
- end
- end
-end
@@ -1,9 +1,33 @@
-module ActiveSupport
+# encoding: utf-8
+
+require 'active_support/multibyte/chars'
+require 'active_support/multibyte/exceptions'
+require 'active_support/multibyte/unicode_database'
+
+module ActiveSupport #:nodoc:
module Multibyte #:nodoc:
- DEFAULT_NORMALIZATION_FORM = :kc
+ # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
+ # information about normalization.
NORMALIZATIONS_FORMS = [:c, :kc, :d, :kd]
- UNICODE_VERSION = '5.0.0'
- end
-end
-require 'active_support/multibyte/chars'
+ # The Unicode version that is supported by the implementation
+ UNICODE_VERSION = '5.1.0'
+
+ # The default normalization used for operations that require normalization. It can be set to any of the
+ # normalizations in NORMALIZATIONS_FORMS.
+ #
+ # Example:
+ # ActiveSupport::Multibyte.default_normalization_form = :c
+ mattr_accessor :default_normalization_form
+ self.default_normalization_form = :kc
+
+ # The proxy class returned when calling mb_chars. You can use this accessor to configure your own proxy
+ # class so you can support other encodings. See the ActiveSupport::Multibyte::Chars implementation for
+ # an example of how to do this.
+ #
+ # Example:
+ # ActiveSupport::Multibyte.proxy_class = CharsForUTF32
+ mattr_accessor :proxy_class
+ self.proxy_class = ActiveSupport::Multibyte::Chars
+ end
+end
Oops, something went wrong.

0 comments on commit 22f75d5

Please sign in to comment.