Skip to content

Commit

Permalink
Simplify ActiveSupport::Multibyte and make it run on Ruby 1.9.
Browse files Browse the repository at this point in the history
* Unicode methods are now defined directly on Chars instead of a handler
* Updated Unicode database to Unicode 5.1.0
* Improved documentation
  • Loading branch information
Manfred committed Sep 21, 2008
1 parent 5f83e18 commit 22f75d5
Show file tree
Hide file tree
Showing 18 changed files with 1,562 additions and 1,550 deletions.
147 changes: 147 additions & 0 deletions activesupport/bin/generate_tables
@@ -0,0 +1,147 @@
#!/usr/bin/env ruby

# Put the lib directory next to this script's directory at the front of the
# load path, then load ActiveSupport from it.
begin
  lib_dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
  $LOAD_PATH.unshift(lib_dir)
  require 'active_support'
rescue IOError
  # Deliberately ignored.
  # NOTE(review): a library that cannot be found raises LoadError, which is
  # not an IOError, so this rescue does not cover a missing active_support.
  # Confirm which failure it was meant to tolerate.
end

require 'open-uri'
require 'tmpdir'

module ActiveSupport
module Multibyte
# Reopens UnicodeDatabase and replaces #load with a method that does nothing.
# Presumably this keeps the generator from reading an existing data file
# while it builds a fresh one; confirm against the real UnicodeDatabase.
class UnicodeDatabase
  # Intentionally empty; always returns nil.
  def load
  end
end

class UnicodeDatabaseGenerator
# Directory on unicode.org holding the character database files for the
# Unicode version named by ActiveSupport::Multibyte::UNICODE_VERSION.
BASE_URI = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd/"

# Source file URL for each table type. The key doubles as the suffix of the
# parse_<type> method that #parse sends every line of the file to.
SOURCES = {
  :codepoints              => "#{BASE_URI}UnicodeData.txt",
  :composition_exclusion   => "#{BASE_URI}CompositionExclusions.txt",
  :grapheme_break_property => "#{BASE_URI}auxiliary/GraphemeBreakProperty.txt",
  :cp1252                  => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
}

# Creates the UnicodeDatabase the generator fills in and gives its codepoint
# table a fallback entry for codes that are never parsed.
def initialize
  @ucd = UnicodeDatabase.new

  fallback = Codepoint.new
  fallback.combining_class   = 0
  fallback.uppercase_mapping = 0
  fallback.lowercase_mapping = 0

  # Hash.new(obj) returns this one shared object for every missing key
  # without storing it, so unknown codes read as combining class 0 with no
  # case mappings.
  @ucd.codepoints = Hash.new(fallback)
end

# Parses one record of UnicodeData.txt and stores it in @ucd.codepoints under
# its integer code.
#
# Only the code, canonical combining class, decomposition type, decomposition
# mapping and the simple upper/lowercase mappings are kept. The name, general
# category, bidi class, bidi mirrored flag and titlecase mapping are matched
# by the pattern but not stored.
#
# line - one semicolon-separated record (a trailing newline is fine).
#
# Returns the stored Codepoint.
# Raises RuntimeError ("Could not parse input.") when the line does not match.
def parse_codepoints(line)
  codepoint = Codepoint.new
  raise "Could not parse input." unless line =~ /^
    ([0-9A-F]+);        # code
    ([^;]+);            # name
    ([A-Z]+);           # general category
    ([0-9]+);           # canonical combining class
    ([A-Z]+);           # bidi class
    (<([A-Z]*)>)?       # decomposition type
    ((\ ?[0-9A-F]+)*);  # decomposition mapping
    ([0-9]*);           # decimal digit
    ([0-9]*);           # digit
    ([^;]*);            # numeric
    ([YN]*);            # bidi mirrored
    ([^;]*);            # unicode 1.0 name
    ([^;]*);            # iso comment
    ([0-9A-F]*);        # simple uppercase mapping
    ([0-9A-F]*);        # simple lowercase mapping
    ([0-9A-F]*)$/ix     # simple titlecase mapping

  codepoint.code = $1.hex
  # The combining class is a decimal number. Integer() would honour radix
  # prefixes and read a zero-padded value such as "010" as octal, so convert
  # with to_i, which is always base 10. The pattern guarantees digits only.
  codepoint.combining_class = $4.to_i
  # $7 is the text between the angle brackets, or nil for a canonical mapping.
  codepoint.decomp_type = $7
  # $8 may start with a space when a decomposition type precedes it; split
  # discards it.
  codepoint.decomp_mapping = $8.empty? ? nil : $8.split.map { |element| element.hex }
  # An absent case mapping is stored as 0 rather than nil.
  codepoint.uppercase_mapping = $16.empty? ? 0 : $16.hex
  codepoint.lowercase_mapping = $17.empty? ? 0 : $17.hex
  @ucd.codepoints[codepoint.code] = codepoint
end

# Parses one line of GraphemeBreakProperty.txt and appends its codepoint, or
# its codepoint range, to the list stored in @ucd.boundary under the
# downcased property name (as a Symbol).
#
# line - a line of the form "<code or first..last> ; <Property> # ...".
#
# Returns the list that was appended to, or nil when the line does not match
# (comment and blank lines, for example).
def parse_grapheme_break_property(line)
  return unless line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/

  codepoints = $1
  type = $2.downcase.intern
  members = (@ucd.boundary[type] ||= [])
  if codepoints.include?('..')
    # "first..last" becomes an inclusive Range of integers.
    bounds = codepoints.split('..')
    members << (bounds[0].hex..bounds[1].hex)
  else
    members << codepoints.hex
  end
end

# Parses one line of CompositionExclusions.txt. When the line starts with a
# hexadecimal code, that code is appended to @ucd.composition_exclusion as an
# integer.
#
# Returns the exclusion list after appending, or nil for any other line.
def parse_composition_exclusion(line)
  return unless line =~ /^([0-9A-F]+)/i

  @ucd.composition_exclusion << Regexp.last_match(1).hex
end

# Parses one line of the CP1252 mapping table. A line that starts with two
# hexadecimal numbers separated by a single whitespace character records
# first => second in @ucd.cp1252.
#
# Returns the stored target value, or nil when the line does not match.
def parse_cp1252(line)
  return unless line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i

  # String#hex accepts the "0x" prefix used by the table.
  byte, codepoint = $~.captures.map { |field| field.hex }
  @ucd.cp1252[byte] = codepoint
end

# Fills @ucd.composition_map from the parsed codepoints. An entry
# map[first][second] = code is added for every codepoint that
#
# * has combining class 0,
# * has a decomposition mapping without a decomposition type,
# * decomposes into exactly two codepoints, the first of which also has
#   combining class 0, and
# * is not listed in @ucd.composition_exclusion.
#
# Returns @ucd.codepoints (the result of #each).
def create_composition_map
  @ucd.codepoints.each do |_, cp|
    next if cp.nil?
    next unless cp.combining_class == 0
    next unless cp.decomp_type.nil?
    next if cp.decomp_mapping.nil?
    next unless cp.decomp_mapping.length == 2

    first, second = cp.decomp_mapping
    # The lookup relies on whatever @ucd.codepoints returns for a code that
    # was never parsed.
    next unless @ucd.codepoints[first].combining_class == 0
    next if @ucd.composition_exclusion.include?(cp.code)

    (@ucd.composition_map[first] ||= {})[second] = cp.code
  end
end

# Replaces the :lf and :cr entries of @ucd.boundary with the first element of
# their lists, leaving every other entry as it is.
#
# Returns @ucd.boundary (the result of #each).
def normalize_boundary_map
  @ucd.boundary.each do |type, members|
    case type
    when :lf, :cr
      # Only values of existing keys are reassigned, so the hash is safe to
      # update while it is being iterated.
      @ucd.boundary[type] = members[0]
    end
  end
end

# Fetches (unless already cached) and parses every file listed in SOURCES,
# then builds the composition map and normalizes the boundary map.
#
# Each source is cached in Dir.tmpdir under the last path segment of its URL
# and every line of it is sent to the parse_<type> method named after the
# SOURCES key.
#
# Returns the result of normalize_boundary_map. Errors raised while
# downloading or parsing propagate to the caller.
def parse
  SOURCES.each do |type, url|
    basename = url.split('/').last
    filename = File.join(Dir.tmpdir, basename)
    unless File.exist?(filename)
      $stderr.puts "Downloading #{basename}"
      fetch_source(url, filename)
    end
    parser = "parse_#{type}".intern
    File.open(filename) do |file|
      file.each_line { |line| send parser, line }
    end
  end
  create_composition_map
  normalize_boundary_map
end

# Copies the resource at +url+ into +filename+ line by line.
#
# The cache file is created before the download starts, so a failed or
# interrupted transfer would otherwise leave an empty or truncated file that
# every later run silently reuses. The partial file is removed whenever the
# copy does not finish; the original error still propagates.
#
# NOTE(review): opening a URL through Kernel#open depends on open-uri
# patching it, which newer Ruby versions no longer do (URI.open replaces it).
# Confirm the Ruby versions this script must run on before changing it.
def fetch_source(url, filename)
  finished = false
  File.open(filename, 'wb') do |target|
    open(url) do |source|
      source.each_line { |line| target.write line }
    end
  end
  finished = true
ensure
  File.delete(filename) if !finished && File.exist?(filename)
end
private :fetch_source

# Writes the generated tables to +filename+ as one marshalled Array, in the
# order: codepoints, composition exclusions, composition map, boundary map,
# CP1252 map. The file is opened in binary mode and overwritten.
#
# Returns the result of IO#write.
def dump_to(filename)
  File.open(filename, 'wb') do |f|
    tables = [
      @ucd.codepoints,
      @ucd.composition_exclusion,
      @ucd.composition_map,
      @ucd.boundary,
      @ucd.cp1252
    ]
    f.write(Marshal.dump(tables))
  end
end
end
end
end

# Runs only when this file is executed directly: regenerate the tables and
# write them to the path reported by UnicodeDatabase.filename, printing the
# destination and the resulting file size.
if $PROGRAM_NAME == __FILE__
  target = ActiveSupport::Multibyte::UnicodeDatabase.filename
  builder = ActiveSupport::Multibyte::UnicodeDatabaseGenerator.new
  builder.parse
  # print (no newline) so the size lands on the same line once the dump is done.
  print "Writing to: #{target}"
  builder.dump_to(target)
  puts " (#{File.size(target)} bytes)"
end
6 changes: 4 additions & 2 deletions activesupport/lib/active_support/core_ext/string.rb
@@ -1,9 +1,11 @@
# encoding: utf-8

require 'active_support/core_ext/string/inflections' require 'active_support/core_ext/string/inflections'
require 'active_support/core_ext/string/conversions' require 'active_support/core_ext/string/conversions'
require 'active_support/core_ext/string/access' require 'active_support/core_ext/string/access'
require 'active_support/core_ext/string/starts_ends_with' require 'active_support/core_ext/string/starts_ends_with'
require 'active_support/core_ext/string/iterators' require 'active_support/core_ext/string/iterators'
require 'active_support/core_ext/string/unicode' require 'active_support/core_ext/string/multibyte'
require 'active_support/core_ext/string/xchar' require 'active_support/core_ext/string/xchar'
require 'active_support/core_ext/string/filters' require 'active_support/core_ext/string/filters'
require 'active_support/core_ext/string/behavior' require 'active_support/core_ext/string/behavior'
Expand All @@ -15,6 +17,6 @@ class String #:nodoc:
include ActiveSupport::CoreExtensions::String::Inflections include ActiveSupport::CoreExtensions::String::Inflections
include ActiveSupport::CoreExtensions::String::StartsEndsWith include ActiveSupport::CoreExtensions::String::StartsEndsWith
include ActiveSupport::CoreExtensions::String::Iterators include ActiveSupport::CoreExtensions::String::Iterators
include ActiveSupport::CoreExtensions::String::Unicode
include ActiveSupport::CoreExtensions::String::Behavior include ActiveSupport::CoreExtensions::String::Behavior
include ActiveSupport::CoreExtensions::String::Multibyte
end end
81 changes: 81 additions & 0 deletions activesupport/lib/active_support/core_ext/string/multibyte.rb
@@ -0,0 +1,81 @@
# encoding: utf-8

module ActiveSupport #:nodoc:
module CoreExtensions #:nodoc:
module String #:nodoc:
# Implements multibyte methods for easier access to multibyte characters in a String instance.
module Multibyte
unless '1.9'.respond_to?(:force_encoding)
# +mb_chars+ is a multibyte safe proxy method for string methods.
#
# In Ruby 1.8 and older it creates and returns an instance of the ActiveSupport::Multibyte::Chars class which
# encapsulates the original string. Unicode safe versions of all the String methods are defined on this proxy
# class. If the proxy class doesn't respond to a certain method, it's forwarded to the encapsulated string.
#
# name = 'Claus Müller'
# name.reverse #=> "rell??M sualC"
# name.length #=> 13
#
# name.mb_chars.reverse.to_s #=> "rellüM sualC"
# name.mb_chars.length #=> 12
#
# In Ruby 1.9 and newer +mb_chars+ returns +self+ because String is (mostly) encoding aware so we don't need
# a proxy class any more. This means that +mb_chars+ makes it easier to write code that runs on multiple Ruby
# versions.
#
# == Method chaining
#
# All the methods on the Chars proxy which normally return a string will return a Chars object. This allows
# method chaining on the result of any of these methods.
#
# name.mb_chars.reverse.length #=> 12
#
# == Interoperability and configuration
#
# The Chars object tries to be as interchangeable with String objects as possible: sorting and comparing between
# String and Chars work as expected. The bang! methods change the internal string representation in the Chars
# object. Interoperability problems can be resolved easily with a +to_s+ call.
#
# For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars. For
# information about how to change the default Multibyte behaviour, see ActiveSupport::Multibyte.
def mb_chars
  # Ask the configured proxy class whether it wants this string; wrap the
  # string when it does, otherwise hand back the string itself.
  proxy = ActiveSupport::Multibyte.proxy_class
  proxy.wants?(self) ? proxy.new(self) : self
end

# Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
# them), returns false otherwise.
#
# The check itself is delegated to ActiveSupport::Multibyte::Chars.consumes?, which is called with this string.
def is_utf8?
ActiveSupport::Multibyte::Chars.consumes?(self)
end

# Make +chars+ an alias of +mb_chars+ only when String does not already respond to +chars+, so an existing
# String#chars is never replaced. Judging by the probe string, that means Ruby versions before 1.8.7.
unless '1.8.7 and later'.respond_to?(:chars)
alias chars mb_chars
end
else
# In Ruby 1.9 and newer +mb_chars+ returns +self+. In Ruby 1.8 and older +mb_chars+ creates and returns a
# Unicode safe proxy for string operations. Having the method on both makes it easier to write code that runs
# on multiple Ruby versions.
def mb_chars
self
end

# Returns true if the string holds validly encoded UTF-8, false otherwise.
#
# * A string tagged as UTF-8 is validated as it is.
# * A string tagged as ASCII-8BIT or US-ASCII is validated by re-tagging a
#   copy as UTF-8; the receiver itself is left untouched.
# * A string in any other encoding is never considered UTF-8.
def is_utf8?
  tagged = encoding
  if tagged == Encoding::UTF_8
    valid_encoding?
  elsif tagged == Encoding::ASCII_8BIT || tagged == Encoding::US_ASCII
    dup.force_encoding(Encoding::UTF_8).valid_encoding?
  else
    false
  end
end
end
end
end
end
end
66 changes: 0 additions & 66 deletions activesupport/lib/active_support/core_ext/string/unicode.rb

This file was deleted.

36 changes: 30 additions & 6 deletions activesupport/lib/active_support/multibyte.rb
@@ -1,9 +1,33 @@
module ActiveSupport # encoding: utf-8

require 'active_support/multibyte/chars'
require 'active_support/multibyte/exceptions'
require 'active_support/multibyte/unicode_database'

module ActiveSupport #:nodoc:
module Multibyte #:nodoc: module Multibyte #:nodoc:
DEFAULT_NORMALIZATION_FORM = :kc # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
# information about normalization.
NORMALIZATIONS_FORMS = [:c, :kc, :d, :kd] NORMALIZATIONS_FORMS = [:c, :kc, :d, :kd]
UNICODE_VERSION = '5.0.0'
end
end


require 'active_support/multibyte/chars' # The Unicode version that is supported by the implementation
UNICODE_VERSION = '5.1.0'

# The default normalization used for operations that require normalization. It can be set to any of the
# normalizations in NORMALIZATIONS_FORMS.
#
# Example:
# ActiveSupport::Multibyte.default_normalization_form = :c
mattr_accessor :default_normalization_form
self.default_normalization_form = :kc

# The proxy class returned when calling mb_chars. You can use this accessor to configure your own proxy
# class so you can support other encodings. See the ActiveSupport::Multibyte::Chars implementation for
# an example of how to do this.
#
# Example:
# ActiveSupport::Multibyte.proxy_class = CharsForUTF32
mattr_accessor :proxy_class
self.proxy_class = ActiveSupport::Multibyte::Chars
end
end

0 comments on commit 22f75d5

Please sign in to comment.