Skip to content

Commit

Permalink
Version '0.17.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
stevendaniels committed Dec 31, 2014
2 parents 4fafea5 + 3e64a2e commit ba8ce4e
Show file tree
Hide file tree
Showing 13 changed files with 194 additions and 114 deletions.
10 changes: 5 additions & 5 deletions lib/zhongwen_tools/regex.rb
Expand Up @@ -33,11 +33,11 @@ def self.lowercase_letters
end

def self.zh
/[\u2E80-\u2E99]|[\u2E9B-\u2EF3]|[\u2F00-\u2FD5]|[\u3005|\u3007]|[\u3021-\u3029]|[\u3038-\u303B]|[\u3400-\u4DB5]|[\u4E00-\u9FCC]|[\uF900-\uFA6D]|[\uFA70-\uFAD9]/
/\p{Han}/
end

def self.punc
/[\u0021-\u0023]|[\u0025-\u002A]|[\u002C-\u002F]|[\u003A\u003B\u003F\u0040]|[\u005B-\u005D\u005F\u007B\u007D\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387]/
/\p{Punct}/
end

def self.zh_punc
Expand Down Expand Up @@ -74,7 +74,7 @@ def self.zh_number_multiple
#
# Returns a Regex.
def self.bopomofo
/[ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩ]/
/\p{Bopomofo}/
end

private
Expand All @@ -86,6 +86,7 @@ def self.pyn_regexes
{
nl_regex: /([nN]eng?|[lnLN](a(i|ng?|o)?|e(i|ng)?|i(ang|a[on]?|e|ng?|u)?|o(ng?|u)|u(o|i|an?|n)?|ve?))/,
bpm_regex: /([mM]iu|[pmPM]ou|[bpmBPM](o|e(i|ng?)?|a(ng?|i|o)?|i(e|ng?|a[no])?|u))/,
y_regex: /[yY](a(o|ng?)?|e|i(n|ng)?|o(u|ng)?|u(e|a?n)?)/,
f_regex: /([fF](ou?|[ae](ng?|i)?|u))/,
dt_regex: /([dD](e(i|ng?)|i(a[on]?|u))|[dtDT](a(i|ng?|o)?|e(i|ng)?|i(a[on]?|e|ng|u)?|o(ng?|u)|u(o|i|an?|n)?))/,
gkh_regex: /([ghkGHK](a(i|ng?|o)?|e(i|ng?)?|o(u|ng)|u(a(i|ng?)?|i|n|o)?))/,
Expand All @@ -94,8 +95,7 @@ def self.pyn_regexes
r_regex: /([rR]([ae]ng?|i|e|ao|ou|ong|u[oin]|ua?n?))/,
jqx_regex: /([jqxJQX](i(a(o|ng?)?|[eu]|ong|ng?)?|u(e|a?n)?))/,
aeo_regex: /(([aA](i|o|ng?)?|[oO]u?|[eE](i|ng?|r)?))/,
w_regex: /([wW](a(i|ng?)?|o|e(i|ng?)?|u))/,
y_regex: /[yY](a(o|ng?)?|e|in?g?|o(u|ng)?|u(e|a?n)?)/
w_regex: /([wW](a(i|ng?)?|o|e(i|ng?)?|u))/
}
end

Expand Down
122 changes: 29 additions & 93 deletions lib/zhongwen_tools/romanization.rb
@@ -1,6 +1,11 @@
# encoding: utf-8
require 'zhongwen_tools/romanization/pinyin'
require 'zhongwen_tools/romanization/pinyin_table'
require 'zhongwen_tools/romanization/zhuyin_fuhao'
require 'zhongwen_tools/romanization/tongyong_pinyin'
require 'zhongwen_tools/romanization/wade_giles'
require 'zhongwen_tools/romanization/yale'
require 'zhongwen_tools/romanization/mps2'
require 'zhongwen_tools/romanization/romanization_table'

# NOTE: Creates several dynamic Modules and their associated methods.
Expand Down Expand Up @@ -29,7 +34,12 @@ def self.convert(str, to, from)
# belongs to another romanization system p a romanization
# system, use the romanization modules specific function.
#
# str - a String to test.
# Zhuyin Fuhao, Tongyong Pinyin, Wade Giles, MPS2 or Yale.
# http://en.wikipedia.org/wiki/Tongyong_Pinyin
# http://pinyin.info/romanization/tongyong/
# http://en.wikipedia.org/wiki/Wade%E2%80%93Giles
# http://en.wikipedia.org/wiki/Bopomofo
# http://pinyin.info/romanization/bopomofo/index.html
#
# str - a String to test.
#
# Examples
# romanization?('hao3') #=> :pyn
Expand All @@ -56,7 +66,8 @@ def self.romanization?(str)
end
end

def split(str, type = nil)
def self.split(str, type = nil)
# should probably yield
type ||= romanization?(str)

if type == :py
Expand All @@ -67,6 +78,22 @@ def split(str, type = nil)

private

# Internal: Checks whether a String is made up entirely of matches of regex.
#
# The String is first normalized: downcased, then stripped of everything
# matched by ZhongwenTools::Regex.punc, then of the digits 1-5, whitespace,
# hyphens and apostrophes.
#
# str   - the String to test.
# regex - the Regexp whose matches must account for the whole normalized String.
#
# Returns true when the joined matches equal the normalized String, else false.
def self.detect_romanization(str, regex)
  # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
  without_punctuation = str.downcase.gsub(ZhongwenTools::Regex.punc, '')
  bare = without_punctuation.gsub(/[1-5\s\-']/, '')

  bare == bare.scan(regex).join
end

# Internal: Splits a romanized String into the pieces matched by regex.
#
# str   - the String to split.
# regex - the Regexp to scan with; the first capture of every match is kept.
#
# Each piece is stripped of surrounding whitespace and of hyphens; empty
# pieces are dropped.
#
# Returns an Array of Strings.
def self.split_romanization(str, regex)
  # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
  str.scan(regex)
     .map { |captures| captures[0].strip.delete('-') }
     .reject(&:empty?)
end

def self.convert_romanization(str, from, to)
# NOTE: extract/refactor tokens cause tests to fail.
if from == :pyn
Expand Down Expand Up @@ -104,7 +131,6 @@ def self.find_token_replacement(token, str, to, from)
replace = token_replacement(token, from).fetch(to){ search }
replace = fix_capitalization(str, token, replace)


[search, replace]
end

Expand All @@ -127,82 +153,6 @@ def self.token_replacement(token, from = nil)
result || {}
end


# <module_name>::<romanization_type>?(str)
#
# Public: Checks if a String is a romanization:
# Zhuyin Fuhao, Tongyong Pinyin, Wade Giles, MPS2 or Yale.
# http://en.wikipedia.org/wiki/Tongyong_Pinyin
# http://pinyin.info/romanization/tongyong/
# http://en.wikipedia.org/wiki/Wade%E2%80%93Giles
# http://en.wikipedia.org/wiki/Bopomofo
# http://pinyin.info/romanization/bopomofo/index.html
#
# str - a String. Optional if the object calling the method is a String.
#
# Examples
#
# typy?('chuei niou') #=> true
# wg?('Mao2 Tse2 Tung1') #=> true
# bpmf?('ㄊㄥ') #=> true
#
# Returns a boolean.
def self.create_detect_method(romanization_module, name)
  romanization_module.define_singleton_method("#{name}?") do |str|
    # NOTE: this used to pick ZhongwenTools::Regex.bopomofo when
    # `romanization_module == :ZhuyinFuhao`, but romanization_module is the
    # receiver of define_singleton_method (a Module), so comparing it with a
    # Symbol was never true. The detect_regex lookup below is what always ran.
    regex = ZhongwenTools::Romanization.detect_regex(name.to_sym)

    # Normalize: lower-case, then drop punctuation, tone digits, whitespace,
    # hyphens and apostrophes.
    normalized_str = str.downcase.gsub(ZhongwenTools::Regex.punc, '').gsub(/[1-5\s\-']/, '')
    # TODO: ignore tonal marks from other systems wade giles, tongyong etc.

    # The string counts as this romanization when the matches cover all of it.
    normalized_str.scan(regex).join == normalized_str
  end
end

# <module_name>::to_<romanization_type>(str)
# Public: Converts to the given romanization from pyn (pinyin using numbers instead of tone marks).
#
# str = a String to be converted
#
# Examples:
#
#
#
# ZhongwenTools::Romanization::ZhuyinFuhao.to_zyfh('Mao2 Ze2-dong1') # => 'ㄇㄠ2 ㄗㄜ2ㄉㄨㄥ1'
#
# Returns a String.
def self.create_convert_method(romanization_module, romanization_name, name)
  converter_name = "to_#{ name }"

  romanization_module.define_singleton_method(converter_name) do |*args|
    text = args[0]
    # Fall back to detection when the caller gives no source romanization.
    source = args[1] || ZhongwenTools::Romanization.romanization?(text)

    ZhongwenTools::Romanization.convert(text, romanization_name, source.to_sym)
  end
end

# <module_name>::split(str)
# Public: splits the romanization's string.
#
# str - a String to be split
#
# Examples
#
#
# split('zhong1guo2')
# # => ['zhong1', 'guo2']
#
# Returns an Array of Strings.
def self.create_split_method(romanization_module, name)
  # NOTE: this used to build a bopomofo character class when
  # `romanization_module == :ZhuyinFuhao`, but romanization_module is the
  # receiver of define_singleton_method (a Module), so comparing it with a
  # Symbol was never true. The detect_regex pattern below is what always ran.
  regex = /(#{ZhongwenTools::Romanization.detect_regex(name.to_sym)}*)/

  romanization_module.define_singleton_method("split") do |str|
    # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
    results = str.scan(regex).map do |arr|
      # arr[0] is the outer capture; trim it and drop joining hyphens.
      arr[0].strip.gsub('-', '')
    end

    # The starred pattern also yields empty matches; discard them.
    results.flatten - ['']
  end
end

# Internal: Produces a Regexp for a romanization type.
#
# type - a Symbol for the romanization type.
Expand Down Expand Up @@ -252,19 +202,5 @@ def self.hyphenated?(str)
TongyongPinyin: %w(typy tongyong tongyong_pinyin),
MPS2: ['mps2']
}

# Builds the romanization modules: for every entry of RomanizationTypes, make
# sure the module constant exists and attach its conversion, detection and
# split methods.
RomanizationTypes.each do |module_name, names|
# Create the module constant only when it is not defined yet ...
romanization_module = self.const_set(module_name, Module.new) unless self.const_defined?(module_name)
# ... otherwise the assignment above left nil, so fetch the existing constant.
romanization_module ||= self.const_get(module_name)

# The first listed name is the symbol passed on as the romanization name.
romanization_name = names.first.to_sym

# One to_<name> converter per listed name, all targeting romanization_name.
names.each do |name|
create_convert_method(romanization_module, romanization_name, name)
end

# The <romanization_name>? predicate and split are defined once per module.
create_detect_method(romanization_module, romanization_name)
create_split_method(romanization_module, romanization_name)
end
end
end
22 changes: 22 additions & 0 deletions lib/zhongwen_tools/romanization/mps2.rb
@@ -0,0 +1,22 @@
module ZhongwenTools
  module Romanization
    # MPS2 conversion, detection and splitting. Every method delegates to the
    # shared helpers on ZhongwenTools::Romanization.
    module MPS2
      class << self
        # Public: Converts a String to MPS2.
        #
        # args - the String to convert, optionally followed by the source
        #        romanization type. When the type is omitted it is looked up
        #        with Romanization.romanization?.
        #
        # Returns the result of Romanization.convert for target :mps2.
        def to_mps2(*args)
          text, source = args
          source ||= ZhongwenTools::Romanization.romanization?(text)

          ZhongwenTools::Romanization.convert(text, :mps2, source.to_sym)
        end

        # Public: Checks a String against the :mps2 detection regex.
        #
        # str - the String to test.
        #
        # Returns the result of Romanization.detect_romanization.
        def mps2?(str)
          ZhongwenTools::Romanization.detect_romanization(
            str,
            ZhongwenTools::Romanization.detect_regex(:mps2)
          )
        end

        # Public: Splits a String using the :mps2 detection regex, wrapped in
        # a capture group and repeated zero or more times.
        #
        # str - the String to split.
        #
        # Returns the result of Romanization.split_romanization.
        def split(str)
          syllable = ZhongwenTools::Romanization.detect_regex(:mps2)

          ZhongwenTools::Romanization.split_romanization(str, /(#{ syllable }*)/)
        end
      end
    end
  end
end
25 changes: 12 additions & 13 deletions lib/zhongwen_tools/romanization/pinyin.rb
Expand Up @@ -5,7 +5,6 @@

module ZhongwenTools
module Romanization

def self.convert_to_py(str, from)
str = convert_romanization(str, from, :pyn) if from != :pyn
ZhongwenTools::Romanization::Pinyin.convert_pyn_to_pinyin(str)
Expand All @@ -31,7 +30,7 @@ module Pinyin
str, from = args
from ||= ZhongwenTools::Romanization.romanization? str

#_convert_romanization str, _set_type(type.to_sym), _set_type(from)
# _convert_romanization str, _set_type(type.to_sym), _set_type(from)
ZhongwenTools::Romanization.convert str, py_type(romanization), (py_type(from) || from)
end
end
Expand All @@ -40,7 +39,7 @@ def self.split_pyn(str)
# FIXME: ignore punctuation
regex = str[/[1-5]/].nil? ? /(#{ZhongwenTools::Regex.pinyin_toneless})/ : /(#{ZhongwenTools::Regex.pyn}|#{ZhongwenTools::Regex.pinyin_toneless})/

str.scan(regex).map{ |arr| arr[0].strip.gsub('-','') }.flatten
str.scan(regex).map{ |arr| arr[0].strip.gsub('-', '') }.flatten
end

def self.split_py(str)
Expand All @@ -49,7 +48,9 @@ def self.split_py(str)
results = words.map do |word|
word, is_capitalized = normalize_pinyin(word)
# NOTE: Special Case "fǎnguāng" should be "fǎn" + "guāng"
# Special Case "yìnián" should be "yì" + "nián"
word = word.gsub('ngu', 'n-gu')
.gsub(/([#{ ZhongwenTools::Regex.only_tones }])(ni[#{ ZhongwenTools::Regex.py_tones['a'] }])/){ "#{ $1 }-#{ $2 }" }
result = word.split(/['\-]/).flatten.map do |x|
find_py(x)
end
Expand Down Expand Up @@ -89,7 +90,7 @@ def self.py?(str)
# Returns Boolean.
def self.pyn?(str)
# FIXME: use strip_punctuation method
normalized_str = ZhongwenTools::Caps.downcase(str.gsub(ZhongwenTools::Regex.punc,'').gsub(/[\s\-]/,''))
normalized_str = ZhongwenTools::Caps.downcase(str.gsub(ZhongwenTools::Regex.punc, '').gsub(/[\s\-]/, ''))
pyn_arr = split_pyn(normalized_str).map{ |p| p }

pyn_matches_properly?(pyn_arr, normalized_str) &&
Expand Down Expand Up @@ -126,7 +127,6 @@ def self.py_type(romanization)
{ pyn: :pyn, py: :py, pinyin: :py }[romanization]
end


def self.normalize_pinyin(pinyin)
[ZhongwenTools::Caps.downcase(pinyin), capitalized?(pinyin)]
end
Expand Down Expand Up @@ -180,9 +180,9 @@ def self.current_pyn(pyn, pinyin_arr)
replace = pinyin_replacement(pinyin)
match = pinyin
if replacements.size > 0
pyn = pyn.sub(/(#{replacements.join('.*')}.*)#{match}/){ $1 + replace }
pyn = pyn.sub(/(#{ replacements.join('.*') }.*)#{ match }/){ $1 + replace }
else
pyn = pyn.sub(/#{match}/){ "#{$1}#{replace}"}
pyn = pyn.sub(/#{match}/){ "#{ $1 }#{ replace }" }
end
replacements << replace
end
Expand All @@ -195,20 +195,19 @@ def self.pinyin_replacement(py)
py.include? x
end
match = select_pinyin_match(matches)
replace = PYN_PY.find{|k,v| k if v == match}[0]
replace = PYN_PY.find{ |k, v| k if v == match }[0]

py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){$1 + $3 + $2}
py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){ $1 + $3 + $2 }
end

def self.select_pinyin_match(matches)
# Take the longest pinyin match. Use bytes because 'è' is preferred over 'n' or 'r' or 'm'.
match = matches.sort{|x,y| x.bytes.to_a.length <=> y.bytes.to_a.length}[-1]
match = matches.sort{ |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]

# Edge case.. en/eng pyn -> py conversion is one way only.
match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
end


# Internal: Replaces numbered pinyin with actual pinyin. Pinyin separated with hyphens are combined as one word.
#
# str - A String to replace with actual pinyin
Expand All @@ -229,8 +228,8 @@ def self.convert_pyn_to_pinyin(str)
# And finally, correct those apostrophes at the very end.
# It's like magic.
str.gsub(regex) do
($3.nil? ? "#{PYN_PY[$1]}" : ($2 == '' && ['a','e','o'].include?($3[0,1]))? "'#{PYN_PY["#{$3}#{$6}"]}#{$4}#{$5}" : "#{$2}#{PYN_PY["#{$3}#{$6}"]}#{$4}#{$5}") + (($7.to_s.length > 1) ? '-' : '')
end.gsub("-'","-").sub(/^'/,'')
($3.nil? ? "#{ PYN_PY[$1] }" : ($2 == '' && %w(a e o).include?($3[0,1]))? "'#{ PYN_PY["#{ $3 }#{ $6 }"]}#{ $4 }#{ $5 }" : "#{ $2 }#{ PYN_PY["#{ $3 }#{ $6 }"] }#{ $4 }#{ $5 }") + (($7.to_s.length > 1) ? '-' : '')
end.gsub("-'", '-').sub(/^'/, '')
end
end
end
Expand Down
29 changes: 29 additions & 0 deletions lib/zhongwen_tools/romanization/tongyong_pinyin.rb
@@ -0,0 +1,29 @@
module ZhongwenTools
  module Romanization
    # Tongyong Pinyin conversion, detection and splitting. Every method
    # delegates to the shared helpers on ZhongwenTools::Romanization.
    module TongyongPinyin
      class << self
        # Public: Converts a String to Tongyong Pinyin.
        #
        # args - the String to convert, optionally followed by the source
        #        romanization type. When the type is omitted it is looked up
        #        with Romanization.romanization?.
        #
        # Returns the result of Romanization.convert for target :typy.
        def to_typy(*args)
          text, source = args
          source ||= ZhongwenTools::Romanization.romanization?(text)

          ZhongwenTools::Romanization.convert(text, :typy, source.to_sym)
        end

        # Public: Checks a String against the :typy detection regex.
        #
        # str - the String to test.
        #
        # Returns the result of Romanization.detect_romanization.
        def typy?(str)
          ZhongwenTools::Romanization.detect_romanization(
            str,
            ZhongwenTools::Romanization.detect_regex(:typy)
          )
        end

        # Public: Splits a String using the :typy detection regex, wrapped in
        # a capture group and repeated zero or more times.
        #
        # str - the String to split.
        #
        # Returns the result of Romanization.split_romanization.
        def split(str)
          syllable = ZhongwenTools::Romanization.detect_regex(:typy)

          ZhongwenTools::Romanization.split_romanization(str, /(#{ syllable }*)/)
        end

        # Long-form aliases: to_tongyong, tongyong?, to_tongyong_pinyin,
        # tongyong_pinyin?.
        %w(tongyong tongyong_pinyin).each do |long_name|
          alias_method "to_#{ long_name }", :to_typy
          alias_method "#{ long_name }?", :typy?
        end
      end
    end
  end
end
29 changes: 29 additions & 0 deletions lib/zhongwen_tools/romanization/wade_giles.rb
@@ -0,0 +1,29 @@
module ZhongwenTools
  module Romanization
    # Wade-Giles conversion, detection and splitting. Every method delegates
    # to the shared helpers on ZhongwenTools::Romanization.
    module WadeGiles
      class << self
        # Public: Converts a String to Wade-Giles.
        #
        # args - the String to convert, optionally followed by the source
        #        romanization type. When the type is omitted it is looked up
        #        with Romanization.romanization?.
        #
        # Returns the result of Romanization.convert for target :wg.
        def to_wg(*args)
          text, source = args
          source ||= ZhongwenTools::Romanization.romanization?(text)

          ZhongwenTools::Romanization.convert(text, :wg, source.to_sym)
        end

        # Public: Checks a String against the :wg detection regex.
        #
        # str - the String to test.
        #
        # Returns the result of Romanization.detect_romanization.
        def wg?(str)
          ZhongwenTools::Romanization.detect_romanization(
            str,
            ZhongwenTools::Romanization.detect_regex(:wg)
          )
        end

        # Public: Splits a String using the :wg detection regex, wrapped in a
        # capture group and repeated zero or more times.
        #
        # str - the String to split.
        #
        # Returns the result of Romanization.split_romanization.
        def split(str)
          syllable = ZhongwenTools::Romanization.detect_regex(:wg)

          ZhongwenTools::Romanization.split_romanization(str, /(#{ syllable }*)/)
        end

        # Long-form aliases: to_wade_giles, wade_giles?, to_wadegiles,
        # wadegiles?.
        %w(wade_giles wadegiles).each do |long_name|
          alias_method "to_#{ long_name }", :to_wg
          alias_method "#{ long_name }?", :wg?
        end
      end
    end
  end
end

0 comments on commit ba8ce4e

Please sign in to comment.