Permalink
Browse files

Use multibyte proxy class on 1.9, refactor Unicode.

Makes String#mb_chars on Ruby 1.9 return an instance of ActiveSupport::Multibyte::Chars to work around 1.9's lack of Unicode case folding.

Refactors class methods from ActiveSupport::Multibyte::Chars into new Unicode module, adding other related functionality for consistency.

[#4594 state:resolved]

Signed-off-by: Jeremy Kemper <jeremy@bitsweat.net>
  • Loading branch information...
1 parent ad4be3d commit f3abc8ac36055afed9fcc902c33ee146e066d17a @norman norman committed with jeremy May 10, 2010
@@ -1,8 +1,10 @@
*Rails 3.0.0 [beta 4/release candidate] (unreleased)*
+* Ruby 1.9: support UTF-8 case folding. #4595 [Norman Clarke]
+
* Renames Array#rand -> Array#random_element. [Santiago Pastorino, Rizwan Reza]
-* 1.9 compat: Renames last_(month|year) to prev_(month|year) in Date and Time. [fxn]
+* Ruby 1.9: Renames last_(month|year) to prev_(month|year) in Date and Time. [fxn]
* Aliases Date#sunday to Date#end_of_week. [fxn]
@@ -11,135 +11,138 @@ require 'tmpdir'
module ActiveSupport
module Multibyte
- class UnicodeDatabase
- def load; end
- end
-
- class UnicodeDatabaseGenerator
- BASE_URI = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd/"
- SOURCES = {
- :codepoints => BASE_URI + 'UnicodeData.txt',
- :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
- :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
- :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
- }
-
- def initialize
- @ucd = UnicodeDatabase.new
-
- default = Codepoint.new
- default.combining_class = 0
- default.uppercase_mapping = 0
- default.lowercase_mapping = 0
- @ucd.codepoints = Hash.new(default)
- end
+ module Unicode
- def parse_codepoints(line)
- codepoint = Codepoint.new
- raise "Could not parse input." unless line =~ /^
- ([0-9A-F]+); # code
- ([^;]+); # name
- ([A-Z]+); # general category
- ([0-9]+); # canonical combining class
- ([A-Z]+); # bidi class
- (<([A-Z]*)>)? # decomposition type
- ((\ ?[0-9A-F]+)*); # decompomposition mapping
- ([0-9]*); # decimal digit
- ([0-9]*); # digit
- ([^;]*); # numeric
- ([YN]*); # bidi mirrored
- ([^;]*); # unicode 1.0 name
- ([^;]*); # iso comment
- ([0-9A-F]*); # simple uppercase mapping
- ([0-9A-F]*); # simple lowercase mapping
- ([0-9A-F]*)$/ix # simple titlecase mapping
- codepoint.code = $1.hex
- #codepoint.name = $2
- #codepoint.category = $3
- codepoint.combining_class = Integer($4)
- #codepoint.bidi_class = $5
- codepoint.decomp_type = $7
- codepoint.decomp_mapping = ($8=='') ? nil : $8.split.collect { |element| element.hex }
- #codepoint.bidi_mirrored = ($13=='Y') ? true : false
- codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
- codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
- #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
- @ucd.codepoints[codepoint.code] = codepoint
+ class UnicodeDatabase
+ def load; end
end
- def parse_grapheme_break_property(line)
- if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
- type = $2.downcase.intern
- @ucd.boundary[type] ||= []
- if $1.include? '..'
- parts = $1.split '..'
- @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
- else
- @ucd.boundary[type] << $1.hex
+ class DatabaseGenerator
+ BASE_URI = "http://www.unicode.org/Public/#{UNICODE_VERSION}/ucd/"
+ SOURCES = {
+ :codepoints => BASE_URI + 'UnicodeData.txt',
+ :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
+ :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
+ :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
+ }
+
+ def initialize
+ @ucd = Unicode::UnicodeDatabase.new
+
+ default = Codepoint.new
+ default.combining_class = 0
+ default.uppercase_mapping = 0
+ default.lowercase_mapping = 0
+ @ucd.codepoints = Hash.new(default)
+ end
+
+ def parse_codepoints(line)
+ codepoint = Codepoint.new
+ raise "Could not parse input." unless line =~ /^
+ ([0-9A-F]+); # code
+ ([^;]+); # name
+ ([A-Z]+); # general category
+ ([0-9]+); # canonical combining class
+ ([A-Z]+); # bidi class
+ (<([A-Z]*)>)? # decomposition type
+ ((\ ?[0-9A-F]+)*); # decomposition mapping
+ ([0-9]*); # decimal digit
+ ([0-9]*); # digit
+ ([^;]*); # numeric
+ ([YN]*); # bidi mirrored
+ ([^;]*); # unicode 1.0 name
+ ([^;]*); # iso comment
+ ([0-9A-F]*); # simple uppercase mapping
+ ([0-9A-F]*); # simple lowercase mapping
+ ([0-9A-F]*)$/ix # simple titlecase mapping
+ codepoint.code = $1.hex
+ #codepoint.name = $2
+ #codepoint.category = $3
+ codepoint.combining_class = Integer($4)
+ #codepoint.bidi_class = $5
+ codepoint.decomp_type = $7
+ codepoint.decomp_mapping = ($8=='') ? nil : $8.split.collect { |element| element.hex }
+ #codepoint.bidi_mirrored = ($13=='Y') ? true : false
+ codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
+ codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
+ #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
+ @ucd.codepoints[codepoint.code] = codepoint
+ end
+
+ def parse_grapheme_break_property(line)
+ if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
+ type = $2.downcase.intern
+ @ucd.boundary[type] ||= []
+ if $1.include? '..'
+ parts = $1.split '..'
+ @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
+ else
+ @ucd.boundary[type] << $1.hex
+ end
end
end
- end
- def parse_composition_exclusion(line)
- if line =~ /^([0-9A-F]+)/i
- @ucd.composition_exclusion << $1.hex
+ def parse_composition_exclusion(line)
+ if line =~ /^([0-9A-F]+)/i
+ @ucd.composition_exclusion << $1.hex
+ end
end
- end
- def parse_cp1252(line)
- if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
- @ucd.cp1252[$1.hex] = $2.hex
+ def parse_cp1252(line)
+ if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
+ @ucd.cp1252[$1.hex] = $2.hex
+ end
end
- end
- def create_composition_map
- @ucd.codepoints.each do |_, cp|
- if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
- @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
- @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
+ def create_composition_map
+ @ucd.codepoints.each do |_, cp|
+ if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
+ @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
+ @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
+ end
end
end
- end
- def normalize_boundary_map
- @ucd.boundary.each do |k,v|
- if [:lf, :cr].include? k
- @ucd.boundary[k] = v[0]
+ def normalize_boundary_map
+ @ucd.boundary.each do |k,v|
+ if [:lf, :cr].include? k
+ @ucd.boundary[k] = v[0]
+ end
end
end
- end
- def parse
- SOURCES.each do |type, url|
- filename = File.join(Dir.tmpdir, "#{url.split('/').last}")
- unless File.exist?(filename)
- $stderr.puts "Downloading #{url.split('/').last}"
- File.open(filename, 'wb') do |target|
- open(url) do |source|
- source.each_line { |line| target.write line }
+ def parse
+ SOURCES.each do |type, url|
+ filename = File.join(Dir.tmpdir, "#{url.split('/').last}")
+ unless File.exist?(filename)
+ $stderr.puts "Downloading #{url.split('/').last}"
+ File.open(filename, 'wb') do |target|
+ open(url) do |source|
+ source.each_line { |line| target.write line }
+ end
end
end
+ File.open(filename) do |file|
+ file.each_line { |line| send "parse_#{type}".intern, line }
+ end
end
- File.open(filename) do |file|
- file.each_line { |line| send "parse_#{type}".intern, line }
- end
+ create_composition_map
+ normalize_boundary_map
end
- create_composition_map
- normalize_boundary_map
- end
- def dump_to(filename)
- File.open(filename, 'wb') do |f|
- f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
+ def dump_to(filename)
+ File.open(filename, 'wb') do |f|
+ f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
+ end
end
end
end
end
end
if __FILE__ == $0
- filename = ActiveSupport::Multibyte::UnicodeDatabase.filename
- generator = ActiveSupport::Multibyte::UnicodeDatabaseGenerator.new
+ filename = ActiveSupport::Multibyte::Unicode::UnicodeDatabase.filename
+ generator = ActiveSupport::Multibyte::Unicode::DatabaseGenerator.new
generator.parse
print "Writing to: #{filename}"
generator.dump_to filename
@@ -2,7 +2,7 @@
require 'active_support/multibyte'
class String
- unless '1.9'.respond_to?(:force_encoding)
+ if '1.9'.respond_to?(:force_encoding)
# == Multibyte proxy
#
# +mb_chars+ is a multibyte safe proxy for string methods.
@@ -37,23 +37,13 @@ class String
# For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars. For
# information about how to change the default Multibyte behaviour see ActiveSupport::Multibyte.
def mb_chars
- if ActiveSupport::Multibyte.proxy_class.wants?(self)
+ if ActiveSupport::Multibyte.proxy_class.consumes?(self)
ActiveSupport::Multibyte.proxy_class.new(self)
else
self
end
end
-
- # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
- # them), returns false otherwise.
- def is_utf8?
- ActiveSupport::Multibyte::Chars.consumes?(self)
- end
- else
- def mb_chars #:nodoc
- self
- end
-
+
def is_utf8? #:nodoc:
case encoding
when Encoding::UTF_8
@@ -64,5 +54,19 @@ def is_utf8? #:nodoc
false
end
end
+ else
+ def mb_chars
+ if ActiveSupport::Multibyte.proxy_class.wants?(self)
+ ActiveSupport::Multibyte.proxy_class.new(self)
+ else
+ self
+ end
+ end
+
+ # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
+ # them), returns false otherwise.
+ def is_utf8?
+ ActiveSupport::Multibyte::Chars.consumes?(self)
+ end
end
end
@@ -58,8 +58,9 @@ module Inflector
# transliterate("Jürgen")
# # => "Juergen"
def transliterate(string, replacement = "?")
- I18n.transliterate(Multibyte::Chars.normalize(
- Multibyte::Chars.tidy_bytes(string), :c), :replacement => replacement)
+ I18n.transliterate(ActiveSupport::Multibyte::Unicode.normalize(
+ ActiveSupport::Multibyte::Unicode.tidy_bytes(string), :c),
+ :replacement => replacement)
end
# Replaces special characters in a string so that it may be used as part of a 'pretty' URL.
@@ -1,30 +1,12 @@
# encoding: utf-8
-
require 'active_support/core_ext/module/attribute_accessors'
module ActiveSupport #:nodoc:
module Multibyte
autoload :EncodingError, 'active_support/multibyte/exceptions'
autoload :Chars, 'active_support/multibyte/chars'
- autoload :UnicodeDatabase, 'active_support/multibyte/unicode_database'
- autoload :Codepoint, 'active_support/multibyte/unicode_database'
- autoload :UCD, 'active_support/multibyte/unicode_database'
+ autoload :Unicode, 'active_support/multibyte/unicode'
- # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
- # information about normalization.
- NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
-
- # The Unicode version that is supported by the implementation
- UNICODE_VERSION = '5.1.0'
-
- # The default normalization used for operations that require normalization. It can be set to any of the
- # normalizations in NORMALIZATION_FORMS.
- #
- # Example:
- # ActiveSupport::Multibyte.default_normalization_form = :c
- mattr_accessor :default_normalization_form
- self.default_normalization_form = :kc
-
# The proxy class returned when calling mb_chars. You can use this accessor to configure your own proxy
# class so you can support other encodings. See the ActiveSupport::Multibyte::Chars implementation for
# an example of how to do this.
Oops, something went wrong.

0 comments on commit f3abc8a

Please sign in to comment.