Version '0.17.0'

stevendaniels · Dec 31, 2014 · ba8ce4e · ba8ce4e
2 parents 4fafea5 + 3e64a2e
commit ba8ce4e
Show file tree

Hide file tree

Showing 13 changed files with 194 additions and 114 deletions.
diff --git a/lib/zhongwen_tools/regex.rb b/lib/zhongwen_tools/regex.rb
@@ -33,11 +33,11 @@ def self.lowercase_letters
     end
 
     def self.zh
-      /[\u2E80-\u2E99]|[\u2E9B-\u2EF3]|[\u2F00-\u2FD5]|[\u3005|\u3007]|[\u3021-\u3029]|[\u3038-\u303B]|[\u3400-\u4DB5]|[\u4E00-\u9FCC]|[\uF900-\uFA6D]|[\uFA70-\uFAD9]/
+      /\p{Han}/
     end
 
     def self.punc
-      /[\u0021-\u0023]|[\u0025-\u002A]|[\u002C-\u002F]|[\u003A\u003B\u003F\u0040]|[\u005B-\u005D\u005F\u007B\u007D\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387]/
+      /\p{Punct}/
     end
 
     def self.zh_punc
@@ -74,7 +74,7 @@ def self.zh_number_multiple
     #
     # Returns a Regex.
     def self.bopomofo
-      /[ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩ]/
+      /\p{Bopomofo}/
     end
 
     private
@@ -86,6 +86,7 @@ def self.pyn_regexes
       {
         nl_regex: /([nN]eng?|[lnLN](a(i|ng?|o)?|e(i|ng)?|i(ang|a[on]?|e|ng?|u)?|o(ng?|u)|u(o|i|an?|n)?|ve?))/,
         bpm_regex: /([mM]iu|[pmPM]ou|[bpmBPM](o|e(i|ng?)?|a(ng?|i|o)?|i(e|ng?|a[no])?|u))/,
+        y_regex: /[yY](a(o|ng?)?|e|i(n|ng)?|o(u|ng)?|u(e|a?n)?)/,
         f_regex: /([fF](ou?|[ae](ng?|i)?|u))/,
         dt_regex: /([dD](e(i|ng?)|i(a[on]?|u))|[dtDT](a(i|ng?|o)?|e(i|ng)?|i(a[on]?|e|ng|u)?|o(ng?|u)|u(o|i|an?|n)?))/,
         gkh_regex: /([ghkGHK](a(i|ng?|o)?|e(i|ng?)?|o(u|ng)|u(a(i|ng?)?|i|n|o)?))/,
@@ -94,8 +95,7 @@ def self.pyn_regexes
         r_regex: /([rR]([ae]ng?|i|e|ao|ou|ong|u[oin]|ua?n?))/,
         jqx_regex: /([jqxJQX](i(a(o|ng?)?|[eu]|ong|ng?)?|u(e|a?n)?))/,
         aeo_regex: /(([aA](i|o|ng?)?|[oO]u?|[eE](i|ng?|r)?))/,
-        w_regex: /([wW](a(i|ng?)?|o|e(i|ng?)?|u))/,
-        y_regex: /[yY](a(o|ng?)?|e|in?g?|o(u|ng)?|u(e|a?n)?)/
+        w_regex: /([wW](a(i|ng?)?|o|e(i|ng?)?|u))/
       }
     end
 

diff --git a/lib/zhongwen_tools/romanization.rb b/lib/zhongwen_tools/romanization.rb
@@ -1,6 +1,11 @@
 # encoding: utf-8
 require 'zhongwen_tools/romanization/pinyin'
 require 'zhongwen_tools/romanization/pinyin_table'
+require 'zhongwen_tools/romanization/zhuyin_fuhao'
+require 'zhongwen_tools/romanization/tongyong_pinyin'
+require 'zhongwen_tools/romanization/wade_giles'
+require 'zhongwen_tools/romanization/yale'
+require 'zhongwen_tools/romanization/mps2'
 require 'zhongwen_tools/romanization/romanization_table'
 
 # NOTE: Creates several dynamic Modules and their associated methods.
@@ -29,7 +34,12 @@ def self.convert(str, to, from)
     #         belongs to another romanization system p a romanization
     #         system, use the romanization modules specific function.
     #
-    # str - a String to test.
+    #         Zhuyin Fuhao, Tongyong Pinyin, Wade Giles, MPS2 or Yale.
+    #         http://en.wikipedia.org/wiki/Tongyong_Pinyin
+    #         http://pinyin.info/romanization/tongyong/
+    #         http://en.wikipedia.org/wiki/Wade%E2%80%93Giles
+    #         http://en.wikipedia.org/wiki/Bopomofo
+    #         http://pinyin.info/romanization/bopomofo/index.html  # str - a String to test.
     #
     # Examples
     #    romanization?('hao3') #=> :pyn
@@ -56,7 +66,8 @@ def self.romanization?(str)
       end
     end
 
-    def split(str, type = nil)
+    def self.split(str, type = nil)
+      # should probably yield
       type ||= romanization?(str)
 
       if type == :py
@@ -67,6 +78,22 @@ def split(str, type = nil)
 
     private
 
+    def self.detect_romanization(str, regex) # Internal: true when the normalized str is exactly the join of its regex matches. NOTE: `private` above does not apply to `def self.` methods, so the romanization submodules can call this.
+      normalized_str = str.downcase.gsub(ZhongwenTools::Regex.punc, '').gsub(/[1-5\s\-']/, '') # lowercase; drop punctuation, digits 1-5 (presumably tone numbers), whitespace, hyphens and apostrophes
+      # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
+
+      normalized_str.scan(regex).join == normalized_str # any leftover character the regex did not match makes this false
+    end
+
+    def self.split_romanization(str, regex) # Internal: splits str into the non-empty pieces captured by regex; returns an Array of Strings. regex must contain at least one capture group.
+      # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
+      results = str.scan(regex).map do |arr|
+        arr[0].strip.gsub('-','') # first capture group of each match, trimmed, with hyphens removed
+      end
+
+      results.flatten - [''] # drop empty pieces (the callers in this change pass /(...*)/, which can also match the empty string)
+    end
+
     def self.convert_romanization(str, from, to)
         # NOTE: extract/refactor tokens cause tests to fail.
         if from == :pyn
@@ -104,7 +131,6 @@ def self.find_token_replacement(token, str, to, from)
       replace = token_replacement(token, from).fetch(to){ search }
       replace = fix_capitalization(str, token, replace)
 
-
       [search, replace]
     end
 
@@ -127,82 +153,6 @@ def self.token_replacement(token, from = nil)
       result || {}
     end
 
-
-    # <module_name>::<romanization_type>?(str)
-    #
-    # Public: Checks if a String is a romanization:
-    #         Zhuyin Fuhao, Tongyong Pinyin, Wade Giles, MSP2 or Yale.
-    #         http://en.wikipedia.org/wiki/Tongyong_Pinyin
-    #         http://pinyin.info/romanization/tongyong/
-    #         http://en.wikipedia.org/wiki/Wade%E2%80%93Giles
-    #         http://en.wikipedia.org/wiki/Bopomofo
-    #         http://pinyin.info/romanization/bopomofo/index.html
-    #
-    # str - a String. Optional if the object calling the method is a String.
-    #
-    # Examples
-    #
-    #   typy?('chuei niou')     #=> true
-    #   wg?('Mao2 Tse2 Tung1')  #=> true
-    #   bpmf?('ㄊㄥ')           #=> true
-    #
-    # Returns a boolean.
-    def self.create_detect_method(romanization_module, name)
-      romanization_module.define_singleton_method("#{name}?") do |str|
-
-        regex = romanization_module == :ZhuyinFuhao ? ZhongwenTools::Regex.bopomofo : ZhongwenTools::Romanization.detect_regex(name.to_sym)
-        normalized_str = str.downcase.gsub(ZhongwenTools::Regex.punc,'').gsub(/[1-5\s\-']/,'')
-        #TODO: ignore tonal marks from other systems wade giles, tongyong etc.
-        normalized_str.scan(regex).join == normalized_str
-      end
-    end
-
-    # <module_name>::to_<romanization_type>(str)
-    # Public: Converts to the given romanization from pyn (pinyin using numbers instead of tone marks.
-    #
-    # str = a String to be converted
-    #
-    # Examples:
-    #
-    #
-    #
-    #   ZhongwenTools::Romanization::ZhuyinFuhao.to_zyfh('Mao2 Ze2-dong1') # => 'ㄇㄠ2 ㄗㄜ2ㄉㄨㄥ1'
-    #
-    # Returns a String.
-    def self.create_convert_method(romanization_module, romanization_name, name)
-      romanization_module.define_singleton_method("to_#{ name }") do |*args|
-        str, from = args
-        from ||= ZhongwenTools::Romanization.romanization?(str)
-
-        ZhongwenTools::Romanization.convert str, romanization_name, from.to_sym
-      end
-    end
-
-    # <module_name>::split(str)
-    # Public: splits the romanization's string.
-    #
-    # str - a String to be split
-    #
-    # Examples
-    #
-    #
-    #   split('zhong1guo2')
-    #   # => ['zhong1', 'guo2']
-    #
-    # Returns an Array of Strings.
-    def self.create_split_method(romanization_module, name)
-      regex = romanization_module == :ZhuyinFuhao ? /([#{ZhongwenTools::Regex.bopomofo}]*)/ : /(#{ZhongwenTools::Romanization.detect_regex(name.to_sym)}*)/
-
-      romanization_module.define_singleton_method("split") do |str|
-        # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
-        results = str.scan(regex).map do |arr|
-          arr[0].strip.gsub('-','')
-        end
-
-        results.flatten - ['']
-      end
-    end
-
     # Internal: Produces a Regexp for a romanization type.
     #
     # type - a Symbol for the romanization type.
@@ -252,19 +202,5 @@ def self.hyphenated?(str)
       TongyongPinyin: %w(typy tongyong tongyong_pinyin),
       MPS2: ['mps2']
     }
-
-    RomanizationTypes.each do |module_name, names|
-      romanization_module = self.const_set(module_name, Module.new) unless self.const_defined?(module_name)
-      romanization_module ||= self.const_get(module_name)
-
-      romanization_name = names.first.to_sym
-
-      names.each do |name|
-        create_convert_method(romanization_module, romanization_name, name)
-      end
-
-      create_detect_method(romanization_module, romanization_name)
-      create_split_method(romanization_module, romanization_name)
-    end
   end
 end
diff --git a/lib/zhongwen_tools/romanization/mps2.rb b/lib/zhongwen_tools/romanization/mps2.rb
@@ -0,0 +1,22 @@
+module ZhongwenTools
+  module Romanization
+    module MPS2 # MPS2 romanization: conversion, detection and splitting entry points.
+      def self.to_mps2(*args) # Converts a String to MPS2. args are (str, from = nil); from names the source romanization (String or Symbol).
+        str, from = args
+        from ||= ZhongwenTools::Romanization.romanization?(str) # no source given: detect it from str
+
+        ZhongwenTools::Romanization.convert str, :mps2, from.to_sym
+      end
+
+      def self.mps2?(str) # Returns whether str is detected as MPS2, using the :mps2 detect regex.
+        regex = ZhongwenTools::Romanization.detect_regex(:mps2)
+        ZhongwenTools::Romanization.detect_romanization(str, regex)
+      end
+
+      def self.split(str) # Splits str into the pieces matched by the :mps2 detect regex; returns what split_romanization returns.
+        regex = /(#{ ZhongwenTools::Romanization.detect_regex(:mps2) }*)/ # outer group so that scan yields the whole run as the first capture
+        ZhongwenTools::Romanization.split_romanization(str, regex)
+      end
+    end
+  end
+end
diff --git a/lib/zhongwen_tools/romanization/pinyin.rb b/lib/zhongwen_tools/romanization/pinyin.rb
@@ -5,7 +5,6 @@
 
 module ZhongwenTools
   module Romanization
-
     def self.convert_to_py(str, from)
       str =  convert_romanization(str, from, :pyn) if from != :pyn
       ZhongwenTools::Romanization::Pinyin.convert_pyn_to_pinyin(str)
@@ -31,7 +30,7 @@ module Pinyin
           str, from = args
           from ||= ZhongwenTools::Romanization.romanization? str
 
-          #_convert_romanization str, _set_type(type.to_sym), _set_type(from)
+          # _convert_romanization str, _set_type(type.to_sym), _set_type(from)
           ZhongwenTools::Romanization.convert str, py_type(romanization), (py_type(from) || from)
         end
       end
@@ -40,7 +39,7 @@ def self.split_pyn(str)
         # FIXME: ignore punctuation
         regex = str[/[1-5]/].nil? ?  /(#{ZhongwenTools::Regex.pinyin_toneless})/ : /(#{ZhongwenTools::Regex.pyn}|#{ZhongwenTools::Regex.pinyin_toneless})/
 
-        str.scan(regex).map{ |arr| arr[0].strip.gsub('-','') }.flatten
+        str.scan(regex).map{ |arr| arr[0].strip.gsub('-', '') }.flatten
       end
 
       def self.split_py(str)
@@ -49,7 +48,9 @@ def self.split_py(str)
         results = words.map do |word|
           word, is_capitalized = normalize_pinyin(word)
           # NOTE: Special Case "fǎnguāng" should be "fǎn" + "guāng"
+          #       Special Case "yìnián" should be "yì" + "nián"
           word = word.gsub('ngu', 'n-gu')
+            .gsub(/([#{ ZhongwenTools::Regex.only_tones }])(ni[#{ ZhongwenTools::Regex.py_tones['a'] }])/){ "#{ $1 }-#{ $2 }" }
           result = word.split(/['\-]/).flatten.map do |x|
             find_py(x)
           end
@@ -89,7 +90,7 @@ def self.py?(str)
       # Returns Boolean.
       def self.pyn?(str)
         # FIXME: use strip_punctuation method
-        normalized_str = ZhongwenTools::Caps.downcase(str.gsub(ZhongwenTools::Regex.punc,'').gsub(/[\s\-]/,''))
+        normalized_str = ZhongwenTools::Caps.downcase(str.gsub(ZhongwenTools::Regex.punc, '').gsub(/[\s\-]/, ''))
         pyn_arr = split_pyn(normalized_str).map{ |p| p }
 
         pyn_matches_properly?(pyn_arr, normalized_str) &&
@@ -126,7 +127,6 @@ def self.py_type(romanization)
         { pyn: :pyn, py: :py, pinyin: :py }[romanization]
       end
 
-
       def self.normalize_pinyin(pinyin)
         [ZhongwenTools::Caps.downcase(pinyin), capitalized?(pinyin)]
       end
@@ -180,9 +180,9 @@ def self.current_pyn(pyn, pinyin_arr)
           replace =  pinyin_replacement(pinyin)
           match = pinyin
           if replacements.size > 0
-            pyn = pyn.sub(/(#{replacements.join('.*')}.*)#{match}/){ $1 + replace }
+            pyn = pyn.sub(/(#{ replacements.join('.*') }.*)#{ match }/){ $1 + replace }
           else
-            pyn = pyn.sub(/#{match}/){ "#{$1}#{replace}"}
+            pyn = pyn.sub(/#{match}/){ "#{ $1 }#{ replace }" }
           end
           replacements << replace
         end
@@ -195,20 +195,19 @@ def self.pinyin_replacement(py)
           py.include? x
         end
         match = select_pinyin_match(matches)
-        replace = PYN_PY.find{|k,v| k if v == match}[0]
+        replace = PYN_PY.find{ |k, v| k if v == match }[0]
 
-        py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){$1 + $3 + $2}
+        py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){ $1 + $3 + $2 }
       end
 
       def self.select_pinyin_match(matches)
         # take the longest pinyin match. Use bytes because 'è' is prefered over 'n' or 'r' or 'm'
-        match = matches.sort{|x,y| x.bytes.to_a.length <=> y.bytes.to_a.length}[-1]
+        match = matches.sort{ |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]
 
         # Edge case.. en/eng pyn -> py conversion is one way only.
         match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
       end
 
-
       #  Internal: Replaces numbered pinyin with actual pinyin. Pinyin separated with hyphens are combined as one word.
       #
       #  str - A String to replace with actual pinyin
@@ -229,8 +228,8 @@ def self.convert_pyn_to_pinyin(str)
         #              And finally, correct those apostrophes at the very end.
         #              It's like magic.
         str.gsub(regex) do
-          ($3.nil? ? "#{PYN_PY[$1]}" : ($2 == '' && ['a','e','o'].include?($3[0,1]))? "'#{PYN_PY["#{$3}#{$6}"]}#{$4}#{$5}" : "#{$2}#{PYN_PY["#{$3}#{$6}"]}#{$4}#{$5}") + (($7.to_s.length > 1) ? '-' : '')
-        end.gsub("-'","-").sub(/^'/,'')
+          ($3.nil? ? "#{ PYN_PY[$1] }" : ($2 == '' && %w(a e o).include?($3[0,1]))? "'#{ PYN_PY["#{ $3 }#{ $6 }"]}#{ $4 }#{ $5 }" : "#{ $2 }#{ PYN_PY["#{ $3 }#{ $6 }"] }#{ $4 }#{ $5 }") + (($7.to_s.length > 1) ? '-' : '')
+        end.gsub("-'", '-').sub(/^'/, '')
       end
     end
   end

diff --git a/lib/zhongwen_tools/romanization/tongyong_pinyin.rb b/lib/zhongwen_tools/romanization/tongyong_pinyin.rb
@@ -0,0 +1,29 @@
+module ZhongwenTools
+  module Romanization
+    module TongyongPinyin # Tongyong Pinyin romanization: conversion, detection and splitting entry points.
+      def self.to_typy(*args) # Converts a String to Tongyong Pinyin. args are (str, from = nil); from names the source romanization (String or Symbol).
+        str, from = args
+        from ||= ZhongwenTools::Romanization.romanization?(str) # no source given: detect it from str
+
+        ZhongwenTools::Romanization.convert str, :typy, from.to_sym
+      end
+
+      def self.typy?(str) # Returns whether str is detected as Tongyong Pinyin, using the :typy detect regex.
+        regex = ZhongwenTools::Romanization.detect_regex(:typy)
+        ZhongwenTools::Romanization.detect_romanization(str, regex)
+      end
+
+      def self.split(str) # Splits str into the pieces matched by the :typy detect regex; returns what split_romanization returns.
+        regex = /(#{ ZhongwenTools::Romanization.detect_regex(:typy) }*)/ # outer group so that scan yields the whole run as the first capture
+        ZhongwenTools::Romanization.split_romanization(str, regex)
+      end
+
+      class << self # aliases must be declared on the singleton class because the methods above are module-level (def self.)
+        [:tongyong, :tongyong_pinyin].each do |m|
+          alias_method "to_#{ m }".to_sym, :to_typy # to_tongyong, to_tongyong_pinyin
+          alias_method "#{ m }?", :typy? # tongyong?, tongyong_pinyin?
+        end
+      end
+    end
+  end
+end
diff --git a/lib/zhongwen_tools/romanization/wade_giles.rb b/lib/zhongwen_tools/romanization/wade_giles.rb
@@ -0,0 +1,29 @@
+module ZhongwenTools
+  module Romanization
+    module WadeGiles # Wade-Giles romanization: conversion, detection and splitting entry points.
+      def self.to_wg(*args) # Converts a String to Wade-Giles. args are (str, from = nil); from names the source romanization (String or Symbol).
+        str, from = args
+        from ||= ZhongwenTools::Romanization.romanization?(str) # no source given: detect it from str
+
+        ZhongwenTools::Romanization.convert str, :wg, from.to_sym
+      end
+
+      def self.wg?(str) # Returns whether str is detected as Wade-Giles, using the :wg detect regex.
+        regex = ZhongwenTools::Romanization.detect_regex(:wg)
+        ZhongwenTools::Romanization.detect_romanization(str, regex)
+      end
+
+      def self.split(str) # Splits str into the pieces matched by the :wg detect regex; returns what split_romanization returns.
+        regex = /(#{ ZhongwenTools::Romanization.detect_regex(:wg) }*)/ # outer group so that scan yields the whole run as the first capture
+        ZhongwenTools::Romanization.split_romanization(str, regex)
+      end
+
+      class << self # aliases must be declared on the singleton class because the methods above are module-level (def self.)
+        [:wade_giles, :wadegiles].each do |m|
+          alias_method "to_#{ m }".to_sym, :to_wg # to_wade_giles, to_wadegiles
+          alias_method "#{ m }?", :wg? # wade_giles?, wadegiles?
+        end
+      end
+    end
+  end
+end