Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Simplify ActiveSupport::Multibyte and make it run on Ruby 1.9.
* Unicode methods are now defined directly on Chars instead of a handler * Updated Unicode database to Unicode 5.1.0 * Improved documentation
- Loading branch information
Showing
18 changed files
with
1,562 additions
and
1,550 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,147 @@ | |||
#!/usr/bin/env ruby | |||
|
|||
begin | |||
$:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib')) | |||
require 'active_support' | |||
rescue IOError | |||
end | |||
|
|||
require 'open-uri' | |||
require 'tmpdir' | |||
|
|||
module ActiveSupport | |||
module Multibyte | |||
# Reopens UnicodeDatabase and replaces +load+ with a no-op for the duration of this script.
# NOTE(review): presumably this keeps the generator from reading the data file it is about to
# regenerate — confirm against active_support/multibyte/unicode_database.
class UnicodeDatabase
  def load
  end
end
|
|||
class UnicodeDatabaseGenerator | |||
# Directory of the Unicode Character Database files for the supported Unicode version.
BASE_URI = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd/".freeze

# Maps each source type to the URL of its data file. The type doubles as the suffix of the
# +parse_<type>+ method that consumes the file line by line (see #parse).
SOURCES = {
  :codepoints => BASE_URI + 'UnicodeData.txt',
  :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
  :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
  :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
}.freeze
|
|||
# Creates an empty UnicodeDatabase whose codepoint table answers lookups of unknown codepoints
# with a default Codepoint: combining class 0 and upper/lowercase mappings of 0.
def initialize
  @ucd = UnicodeDatabase.new

  fallback = Codepoint.new
  fallback.combining_class   = 0
  fallback.uppercase_mapping = 0
  fallback.lowercase_mapping = 0
  # A single shared default object is deliberate here: the generator code in this file only
  # ever reads from it (see create_composition_map), it never mutates the looked-up default.
  @ucd.codepoints = Hash.new(fallback)
end
|
|||
# Parses one line of UnicodeData.txt and stores the resulting Codepoint in @ucd.codepoints,
# keyed by its integer code.
#
# Only the code, canonical combining class, decomposition type, decomposition mapping and the
# simple upper/lowercase mappings are kept; the other fields (name, general category, bidi
# class, bidi mirrored, titlecase mapping, ...) are matched but not stored.
#
# Returns the stored Codepoint. Raises a RuntimeError when the line does not have the expected
# field layout.
def parse_codepoints(line)
  fields = line.match(/^
    ([0-9A-F]+);        # code
    ([^;]+);            # name
    ([A-Z]+);           # general category
    ([0-9]+);           # canonical combining class
    ([A-Z]+);           # bidi class
    (<([A-Z]*)>)?       # decomposition type
    ((\ ?[0-9A-F]+)*);  # decomposition mapping
    ([0-9]*);           # decimal digit
    ([0-9]*);           # digit
    ([^;]*);            # numeric
    ([YN]*);            # bidi mirrored
    ([^;]*);            # unicode 1.0 name
    ([^;]*);            # iso comment
    ([0-9A-F]*);        # simple uppercase mapping
    ([0-9A-F]*);        # simple lowercase mapping
    ([0-9A-F]*)$/ix)    # simple titlecase mapping
  raise "Could not parse input." unless fields

  codepoint = Codepoint.new
  codepoint.code = fields[1].hex
  # The pattern guarantees decimal digits only; to_i reads them as base 10, whereas Integer()
  # would interpret a leading zero as an octal prefix.
  codepoint.combining_class = fields[4].to_i
  codepoint.decomp_type = fields[7]
  codepoint.decomp_mapping = fields[8].empty? ? nil : fields[8].split.map { |element| element.hex }
  # A missing case mapping is recorded as 0 rather than nil.
  codepoint.uppercase_mapping = fields[16].empty? ? 0 : fields[16].hex
  codepoint.lowercase_mapping = fields[17].empty? ? 0 : fields[17].hex
  @ucd.codepoints[codepoint.code] = codepoint
end
|
|||
# Parses one line of GraphemeBreakProperty.txt. The codepoint — or, for an "XXXX..YYYY" entry,
# the inclusive Range of codepoints — is appended to @ucd.boundary under the downcased property
# name (as a Symbol). Lines that do not match the entry pattern are ignored.
def parse_grapheme_break_property(line)
  entry = line.match(/^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/)
  return unless entry

  codepoints = entry[1]
  property = entry[2].downcase.intern
  members = (@ucd.boundary[property] ||= [])
  if codepoints.include?('..')
    first, last = codepoints.split('..')
    members << (first.hex..last.hex)
  else
    members << codepoints.hex
  end
end
|
|||
# Parses one line of CompositionExclusions.txt: when the line starts with a hexadecimal
# codepoint, that codepoint is appended to @ucd.composition_exclusion. Other lines are ignored.
def parse_composition_exclusion(line)
  excluded = line[/^([0-9A-F]+)/i, 1]
  @ucd.composition_exclusion << excluded.hex if excluded
end
|
|||
# Parses one line of the CP1252 mapping table: two hexadecimal numbers (an optional "0x"
# prefix is accepted) separated by a single whitespace character. The first is stored as the
# key and the second as the value in @ucd.cp1252. Lines without such a pair are ignored.
def parse_cp1252(line)
  mapping = line.match(/^([0-9A-Fx]+)\s([0-9A-Fx]+)/i)
  @ucd.cp1252[mapping[1].hex] = mapping[2].hex if mapping
end
|
|||
# Fills @ucd.composition_map from the parsed codepoints. An entry
# composition_map[first][second] = code is added for every codepoint that
#   * has combining class 0,
#   * has a decomposition mapping of exactly two codepoints and no decomposition type,
#   * decomposes into a first codepoint whose combining class is 0, and
#   * is not listed in @ucd.composition_exclusion.
def create_composition_map
  @ucd.codepoints.each do |_, cp|
    next if cp.nil? || cp.combining_class != 0 || !cp.decomp_type.nil?

    mapping = cp.decomp_mapping
    next if mapping.nil? || mapping.length != 2
    # Codepoints that were never parsed resolve to the table's default entry here.
    next unless @ucd.codepoints[mapping[0]].combining_class == 0
    next if @ucd.composition_exclusion.include?(cp.code)

    (@ucd.composition_map[mapping[0]] ||= {})[mapping[1]] = cp.code
  end
end
|
|||
# Replaces the :lf and :cr entries of @ucd.boundary with their first element; every other
# entry is left as it is. Returns @ucd.boundary.
def normalize_boundary_map
  @ucd.boundary.each do |name, members|
    next unless name == :lf || name == :cr

    # Only the value of an existing key is reassigned, so the hash is not restructured
    # while it is being iterated.
    @ucd.boundary[name] = members[0]
  end
end
|
|||
# Builds the complete database. For every entry in SOURCES the file is downloaded into
# Dir.tmpdir unless a copy is already there, and each of its lines is passed to the matching
# +parse_<type>+ method. Afterwards the composition map is derived and the boundary map is
# normalized.
def parse
  SOURCES.each do |type, url|
    basename = url.split('/').last
    filename = File.join(Dir.tmpdir, basename)
    unless File.exist?(filename)
      $stderr.puts "Downloading #{basename}"
      complete = false
      begin
        File.open(filename, 'wb') do |target|
          # NOTE(review): relies on open-uri hooking Kernel#open for URLs, which Ruby 3.0
          # removed; switch to URI.open once Ruby 1.8/1.9 no longer need to be supported.
          open(url) do |source|
            source.each_line { |line| target.write line }
          end
        end
        complete = true
      ensure
        # A failed or interrupted download must not leave a truncated file behind: it would
        # pass the File.exist? check on the next run and be parsed as if it were complete.
        File.delete(filename) if !complete && File.exist?(filename)
      end
    end
    File.open(filename) do |file|
      file.each_line { |line| send "parse_#{type}".intern, line }
    end
  end
  create_composition_map
  normalize_boundary_map
end
|
|||
# Writes the generated tables to +filename+ as one Marshal-dumped Array, in the order
# codepoints, composition_exclusion, composition_map, boundary, cp1252.
def dump_to(filename)
  tables = [@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252]
  # Serialize before opening the target so a failing dump cannot truncate an existing file.
  payload = Marshal.dump(tables)
  File.open(filename, 'wb') { |f| f.write payload }
end
end | |||
end | |||
end | |||
|
|||
# Script entry point: regenerate the tables and write them to the file named by
# UnicodeDatabase.filename, reporting the resulting size.
if __FILE__ == $0
  filename = ActiveSupport::Multibyte::UnicodeDatabase.filename
  generator = ActiveSupport::Multibyte::UnicodeDatabaseGenerator.new
  generator.parse
  print "Writing to: #{filename}"
  generator.dump_to filename
  puts " (#{File.size(filename)} bytes)"
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
81 changes: 81 additions & 0 deletions
81
activesupport/lib/active_support/core_ext/string/multibyte.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,81 @@ | |||
# encoding: utf-8 | |||
|
|||
module ActiveSupport #:nodoc: | |||
module CoreExtensions #:nodoc: | |||
module String #:nodoc: | |||
# Implements multibyte methods for easier access to multibyte characters in a String instance. | |||
module Multibyte | |||
unless '1.9'.respond_to?(:force_encoding) | |||
# +mb_chars+ is a multibyte safe proxy method for string methods. | |||
# | |||
# In Ruby 1.8 and older it creates and returns an instance of the ActiveSupport::Multibyte::Chars class which | |||
# encapsulates the original string. Unicode-safe versions of all the String methods are defined on this proxy | |||
# class. If the proxy class doesn't respond to a certain method, it's forwarded to the encapsulated string. | |||
# | |||
# name = 'Claus Müller' | |||
# name.reverse #=> "rell??M sualC" | |||
# name.length #=> 13 | |||
# | |||
# name.mb_chars.reverse.to_s #=> "rellüM sualC" | |||
# name.mb_chars.length #=> 12 | |||
# | |||
# In Ruby 1.9 and newer +mb_chars+ returns +self+ because String is (mostly) encoding aware so we don't need | |||
# a proxy class any more. This means that +mb_chars+ makes it easier to write code that runs on multiple Ruby | |||
# versions. | |||
# | |||
# == Method chaining | |||
# | |||
# All the methods on the Chars proxy which normally return a string will return a Chars object. This allows | |||
# method chaining on the result of any of these methods. | |||
# | |||
# name.mb_chars.reverse.length #=> 12 | |||
# | |||
# == Interoperability and configuration | |||
# | |||
# The Chars object tries to be as interchangeable with String objects as possible: sorting and comparing between | |||
# String and Chars work as expected. The bang! methods change the internal string representation in the Chars | |||
# object. Interoperability problems can be resolved easily with a +to_s+ call. | |||
# | |||
# For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars. For | |||
# information about how to change the default Multibyte behaviour, see ActiveSupport::Multibyte. | |||
def mb_chars
  # Wrap the string only when the configured proxy class accepts it; otherwise hand back
  # the plain string.
  proxy = ActiveSupport::Multibyte.proxy_class
  proxy.wants?(self) ? proxy.new(self) : self
end
|
|||
# Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
# them), returns false otherwise. The decision is delegated to ActiveSupport::Multibyte::Chars.consumes?.
def is_utf8?
  ActiveSupport::Multibyte::Chars.consumes?(self)
end
|
|||
# Strings that already respond to +chars+ keep their own implementation; only where it is
# missing does +chars+ become an alias for +mb_chars+.
unless '1.8.7 and later'.respond_to?(:chars)
  alias chars mb_chars
end
else | |||
# In Ruby 1.9 and newer +mb_chars+ returns self. In Ruby 1.8 and older +mb_chars+ creates and returns a
# Unicode-safe proxy for string operations. This makes it easier to write code that runs on multiple Ruby
# versions.
def mb_chars
  self
end
|
|||
# Returns true if the string has valid UTF-8 encoding.
#
# Strings tagged as UTF-8 are validated directly. Strings tagged as ASCII-8BIT or US-ASCII are
# validated as if their bytes were UTF-8. Strings in any other encoding return false.
def is_utf8?
  case encoding
  when Encoding::UTF_8
    valid_encoding?
  when Encoding::ASCII_8BIT, Encoding::US_ASCII
    # Retag a copy so the receiver keeps its original encoding.
    dup.force_encoding(Encoding::UTF_8).valid_encoding?
  else
    false
  end
end
end | |||
end | |||
end | |||
end | |||
end |
66 changes: 0 additions & 66 deletions
66
activesupport/lib/active_support/core_ext/string/unicode.rb
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -1,9 +1,33 @@ | |||
module ActiveSupport | # encoding: utf-8 | ||
|
|||
require 'active_support/multibyte/chars' | |||
require 'active_support/multibyte/exceptions' | |||
require 'active_support/multibyte/unicode_database' | |||
|
|||
module ActiveSupport #:nodoc: | |||
module Multibyte #:nodoc: | module Multibyte #:nodoc: | ||
DEFAULT_NORMALIZATION_FORM = :kc | # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more | ||
# information about normalization. | |||
NORMALIZATIONS_FORMS = [:c, :kc, :d, :kd] | NORMALIZATIONS_FORMS = [:c, :kc, :d, :kd] | ||
UNICODE_VERSION = '5.0.0' | |||
end | |||
end | |||
|
|
||
require 'active_support/multibyte/chars' | # The Unicode version that is supported by the implementation | ||
UNICODE_VERSION = '5.1.0' | |||
|
|||
# The default normalization used for operations that require normalization. It can be set to any of the | |||
# normalizations in NORMALIZATIONS_FORMS. | |||
# | |||
# Example: | |||
# ActiveSupport::Multibyte.default_normalization_form = :c | |||
mattr_accessor :default_normalization_form | |||
self.default_normalization_form = :kc | |||
|
|||
# The proxy class returned when calling mb_chars. You can use this accessor to configure your own proxy | |||
# class so you can support other encodings. See the ActiveSupport::Multibyte::Chars implementation for | |||
# an example how to do this. | |||
# | |||
# Example: | |||
# ActiveSupport::Multibyte.proxy_class = CharsForUTF32 | |||
mattr_accessor :proxy_class | |||
self.proxy_class = ActiveSupport::Multibyte::Chars | |||
end | |||
end |
Oops, something went wrong.