Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

big refactor of string_extensions conversions that in hindsight i should have split apart cleaner
  • Loading branch information...
commit 430efc1cc54bf666c230087e9b04958a1d148642 1 parent afc34c9
@rsl authored
View
10 lib/stringex/localization.rb
@@ -1,5 +1,6 @@
# encoding: UTF-8
+require 'stringex/localization/converter'
require 'stringex/localization/default_conversions'
module Stringex
@@ -86,8 +87,11 @@ def reset!
@backend = @translations = @locale = @default_locale = nil
end
- def currencies_supported_regex
- Regexp.new DefaultConversions::CURRENCIES_SUPPORTED.map{|x| Regexp.escape(x)}.join('|')
+ def convert(string, options = {}, &block)
+ converter = Converter.new(string)
+ converter.instance_exec &block
+ converter.smart_strip!
+ converter.string
end
private
@@ -106,4 +110,4 @@ def default_conversion(scope, key)
end
end
end
-end
+end
View
148 lib/stringex/localization/conversion_expressions.rb
@@ -0,0 +1,148 @@
+# encoding: UTF-8
+
+module Stringex
+ module Localization
+ module ConversionExpressions
+ ABBREVIATION = /(\s|^)([[:alpha:]](\.[[:alpha:]])+(\.?)[[:alpha:]]*(\s|$))/
+
+ ACCENTED_HTML_ENTITY = /&([A-Za-z])(grave|acute|circ|tilde|uml|ring|cedil|slash);/
+
+ APOSTROPHE = /(^|[[:alpha:]])'|`([[:alpha:]]|$)/
+
+ CHARACTERS = {
+ :and => /\s*&\s*/,
+ :at => /\s*@\s*/,
+ :degrees => /\s*°\s*/,
+ :divide => /\s*÷\s*/,
+ :dot => /(\S|^)\.(\S)/,
+ :ellipsis => /\s*\.{3,}\s*/,
+ :equals => /\s*=\s*/,
+ :number => /\s*#/,
+ :percent => /\s*%\s*/,
+ :plus => /\s*\+\s*/,
+ :slash => /\s*(\\|\/|／)\s*/,
+ :star => /\s*\*\s*/,
+ }
+
+ # Things that just get converted to spaces
+ CLEANUP_CHARACTERS = /[\.,:;()\[\]\/\?!\^'ʼ"_\|]/
+ CLEANUP_HTML_ENTITIES = /&[^;]+;/
+
+ CURRENCIES_SUPPORTED_SIMPLE = {
+ :dollars => /\$/,
+ :euros => /€/,
+ :pounds => /£/,
+ :yen => /¥/,
+ }
+ CURRENCIES_SUPPORTED_COMPLEX = {
+ :dollars => :dollars_cents,
+ :euros => :euros_cents,
+ :pounds => :pounds_pence,
+ }
+ CURRENCIES_SUPPORTED = Regexp.new(CURRENCIES_SUPPORTED_SIMPLE.values.join('|'))
+ CURRENCIES_SIMPLE = CURRENCIES_SUPPORTED_SIMPLE.inject({}) do |hash, content|
+ key, expression = content
+ hash[key] = /(?:\s|^)#{expression}(\d*)(?:\s|$)/u
+ hash
+ end
+ CURRENCIES_COMPLEX = CURRENCIES_SUPPORTED_SIMPLE.inject({}) do |hash, content|
+ key, expression = content
+ # Do we really need to not worry about complex currencies if there are none for the currency?
+ complex_key = CURRENCIES_SUPPORTED_COMPLEX[key]
+ if complex_key
+ hash[complex_key] = /(?:\s|^)#{expression}(\d+)\.(\d+)(?:\s|$)/u
+ end
+ hash
+ end
+ CURRENCIES = CURRENCIES_SIMPLE.merge(CURRENCIES_COMPLEX)
+
+ HTML_ENTITIES = Proc.new(){
+ base = {
+ :amp => %w{#38 amp},
+ :cent => %w{#162 cent},
+ :copy => %w{#169 copy},
+ :deg => %w{#176 deg},
+ :divide => %w{#247 divide},
+ :double_quote => %w{#34 #822[012] quot ldquo rdquo dbquo},
+ :ellipsis => %w{#8230 hellip},
+ :en_dash => %w{#8211 ndash},
+ :em_dash => %w{#8212 emdash},
+ :frac14 => %w{#188 frac14},
+ :frac12 => %w{#189 frac12},
+ :frac34 => %w{#190 frac34},
+ :gt => %w{#62 gt},
+ :lt => %w{#60 lt},
+ :nbsp => %w{#160 nbsp},
+ :pound => %w{#163 pound},
+ :reg => %w{#174 reg},
+ :single_quote => %w{#39 #821[678] apos lsquo rsquo sbquo},
+ :times => %w{#215 times},
+ :trade => %w{#8482 trade},
+ :yen => %w{#165 yen},
+ }
+ base.inject({}) do |hash, content|
+ key, expression = content
+ hash[key] = /&(#{expression.join('|')});/
+ hash
+ end
+ }.call
+
+ HTML_TAG = Proc.new(){
+ name = /[\w:_-]+/
+ value = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
+ attr = /(#{name}(\s*=\s*#{value})?)/
+ /<[!\/?\[]?(#{name}|--)(\s+(#{attr}(\s+#{attr})*))?\s*([!\/?\]]+|--)?>/
+ }.call
+
+ SMART_PUNCTUATION = {
+ /(“|”|\302\223|\302\224|\303\222|\303\223)/ => '"',
+ /(‘|’|\302\221|\302\222|\303\225)/ => "'",
+ /…/ => "...",
+ }
+
+ # Ordered by denominator then numerator of the value
+ VULGAR_FRACTIONS = {
+ :half => /(&#189;|&frac12;|½)/,
+ :one_third => /(&#8531;|⅓)/,
+ :two_thirds => /(&#8532;|⅔)/,
+ :one_fourth => /(&#188;|&frac14;|¼)/,
+ :three_fourths => /(&#190;|&frac34;|¾)/,
+ :one_fifth => /(&#8533;|⅕)/,
+ :two_fifths => /(&#8534;|⅖)/,
+ :three_fifths => /(&#8535;|⅗)/,
+ :four_fifths => /(&#8536;|⅘)/,
+ :one_sixth => /(&#8537;|⅙)/,
+ :five_sixths => /(&#8538;|⅚)/,
+ :one_eighth => /(&#8539;|⅛)/,
+ :three_eighths => /(&#8540;|⅜)/,
+ :five_eighths => /(&#8541;|⅝)/,
+ :seven_eighths => /(&#8542;|⅞)/,
+ }
+
+ WHITESPACE = /\s+/
+
+ class << self
+ %w{
+ abbreviation
+ accented_html_entity
+ apostrophe
+ characters
+ cleanup_characters
+ cleanup_html_entities
+ currencies
+ currencies_simple
+ currencies_complex
+ html_entities
+ html_tag
+ smart_punctuation
+ vulgar_fractions
+ whitespace
+ }.each do |conversion_type|
+ define_method conversion_type do
+ const_get conversion_type.upcase
+ end
+ end
+ end
+ end
+ end
+end
View
120 lib/stringex/localization/converter.rb
@@ -0,0 +1,120 @@
+# encoding: UTF-8
+
+require 'stringex/localization/conversion_expressions'
+
+module Stringex
+ module Localization
+ class Converter
+ include ConversionExpressions
+
+ attr_reader :ending_whitespace, :options, :starting_whitespace, :string
+
+ def initialize(string, options = {})
+ @string = string.dup
+ @options = Stringex::Configuration::StringExtensions.default_settings.merge(options)
+ string =~ /^(\s+)/
+ @starting_whitespace = $1 unless $1 == ''
+ string =~ /(\s+)$/
+ @ending_whitespace = $1 unless $1 == ''
+ end
+
+ def cleanup_accented_html_entities!
+ string.gsub! expressions.accented_html_entity, '\1'
+ end
+
+ def cleanup_characters!
+ string.gsub! expressions.cleanup_characters, ' '
+ end
+
+ def cleanup_html_entities!
+ string.gsub! expressions.cleanup_html_entities, ''
+ end
+
+ def cleanup_smart_punctuation!
+ expressions.smart_punctuation.each do |expression, replacement|
+ string.gsub! expression, replacement
+ end
+ end
+
+ def smart_strip!
+ string.strip!
+ @string = "#{starting_whitespace}#{string}#{ending_whitespace}"
+ end
+
+ def strip!
+ string.strip!
+ end
+
+ def strip_html_tags!
+ string.gsub! expressions.html_tag, ''
+ end
+
+ def translate!(*conversions)
+ conversions.each do |conversion|
+ send conversion
+ end
+ end
+
+ protected
+
+ def abbreviations
+ string.gsub! expressions.abbreviation do |x|
+ x.gsub '.', ''
+ end
+ end
+
+ def apostrophes
+ string.gsub! expressions.apostrophe, '\1\2'
+ end
+
+ def characters
+ expressions.characters.each do |key, expression|
+ next if key == :slash && options[:allow_slash]
+ replacement = translate(key)
+ replacement = " #{replacement} " unless key == :dot
+ string.gsub! expression, replacement
+ end
+ end
+
+ def currencies
+ if has_currencies?
+ [:currencies_complex, :currencies_simple].each do |type|
+ expressions.send(type).each do |key, expression|
+ string.gsub! expression, " #{translate(key, :currencies)} "
+ end
+ end
+ end
+ end
+
+ def ellipses
+ string.gsub! expressions.characters[:ellipsis], " #{translate(:ellipsis)} "
+ end
+
+ def html_entities
+ expressions.html_entities.each do |key, expression|
+ string.gsub! expression, translate(key, :html_entities)
+ end
+ end
+
+ def vulgar_fractions
+ expressions.vulgar_fractions.each do |key, expression|
+ string.gsub! expression, translate(key, :vulgar_fractions)
+ end
+ end
+
+ private
+
+ def expressions
+ ConversionExpressions
+ end
+
+ def has_currencies?
+ string =~ CURRENCIES_SUPPORTED
+ end
+
+ def translate(key, scope = :characters)
+ Localization.translate scope, key
+ end
+ end
+ end
+end
View
23 lib/stringex/localization/default_conversions.rb
@@ -18,17 +18,18 @@ module DefaultConversions
:star => "star",
}
- CURRENCIES = {
+ CURRENCIES_SIMPLE = {
:dollars => '\1 dollars',
- :dollars_cents => '\1 dollars \2 cents',
:euros => '\1 euros',
- :euros_cents => '\1 euros \2 cents',
:pounds => '\1 pounds',
- :pounds_pence => '\1 pounds \2 pence',
:yen => '\1 yen',
}
-
- CURRENCIES_SUPPORTED = %w{$ £ € ¥}
+ CURRENCIES_COMPLEX = {
+ :dollars_cents => '\1 dollars \2 cents',
+ :euros_cents => '\1 euros \2 cents',
+ :pounds_pence => '\1 pounds \2 pence',
+ }
+ CURRENCIES = CURRENCIES_SIMPLE.merge(CURRENCIES_COMPLEX)
HTML_ENTITIES = {
:amp => "and",
@@ -36,7 +37,7 @@ module DefaultConversions
:copy => "(c)",
:deg => " degrees ",
:divide => "divide",
- :double_quote => "\"",
+ :double_quote => '"',
:ellipsis => "...",
:en_dash => "-",
:em_dash => "--",
@@ -46,21 +47,23 @@ module DefaultConversions
:gt => ">",
:lt => "<",
:nbsp => " ",
- :pound => " pound",
+ :pound => " pounds",
:reg => "(r)",
:single_quote => "'",
:times => "x",
:trade => "(tm)",
+ :yen => " yen"
}
TRANSLITERATIONS = {}
+ # Ordered by denominator then numerator of the value
VULGAR_FRACTIONS = {
- :one_fourth => "one fourth",
:half => "half",
- :three_fourths => "three fourths",
:one_third => "one third",
:two_thirds => "two thirds",
+ :one_fourth => "one fourth",
+ :three_fourths => "three fourths",
:one_fifth => "one fifth",
:two_fifths => "two fifths",
:three_fifths => "three fifths",
View
151 lib/stringex/string_extensions.rb
@@ -24,7 +24,9 @@ def collapse(character = " ")
# Note: This does not do any conversion of Unicode/ASCII accented-characters. For that
# functionality please use <tt>to_ascii</tt>.
def convert_accented_html_entities
- gsub(/&([A-Za-z])(grave|acute|circ|tilde|uml|ring|cedil|slash);/, '\1').strip
+ stringex_convert do
+ cleanup_accented_html_entities!
+ end
end
# Converts various common plaintext characters to a more URI-friendly representation.
@@ -50,59 +52,10 @@ def convert_accented_html_entities
# you should run any methods which convert HTML entities (convert_accented_html_entities and convert_miscellaneous_html_entities)
# before running this method.
def convert_miscellaneous_characters(options = {})
- options = stringex_default_options.merge(options)
-
- dummy = dup.gsub(/\s*\.{3,}\s*/, " #{stringex_translate_character(:ellipsis)} ") # Catch ellipses before single dot rule!
-
- if dummy =~ Localization.currencies_supported_regex
- # Special rules for money
- # Complex currency first
- {
- /(?:\s|^)\$(\d+)\.(\d+)(?:\s|$)/ => :dollars_cents,
- /(?:\s|^)£(\d+)\.(\d+)(?:\s|$)/u => :pounds_pence,
- /(?:\s|^)€(\d+)\.(\d+)(?:\s|$)/u => :euros_cents,
- }.each do |found, key|
- replaced = stringex_translate_currency(key)
- dummy.gsub!(found, " #{replaced} ")
- end
- # Simple currency last
- {
- /(?:\s|^)\$(\d*)(?:\s|$)/ => :dollars,
- /(?:\s|^)£(\d*)(?:\s|$)/u => :pounds,
- /(?:\s|^)€(\d*)(?:\s|$)/u => :euros,
- /(?:\s|^)¥(\d*)(?:\s|$)/u => :yen,
- }.each do |found, key|
- replaced = stringex_translate_currency(key)
- dummy.gsub!(found, " #{replaced} ")
- end
+ stringex_convert do
+ translate! :ellipses, :currencies, :abbreviations, :characters, :apostrophes
+ cleanup_characters!
end
-
- # Special rules for abbreviations
- dummy.gsub!(/(\s|^)([[:alpha:]](\.[[:alpha:]])+(\.?)[[:alpha:]]*(\s|$))/) do |x|
- x.gsub(".", "")
- end
-
- # Back to normal rules
- misc_characters =
- {
- /\s*&\s*/ => :and,
- /\s*#/ => :number,
- /\s*@\s*/ => :at,
- /(\S|^)\.(\S)/ => :dot,
- /\s*\*\s*/ => :star,
- /\s*%\s*/ => :percent,
- /\s*=\s*/ => :equals,
- /\s*\+\s*/ => :plus,
- /\s*÷\s*/ => :divide,
- /\s*°\s*/ => :degrees
- }
- misc_characters[/\s*(\\|\/|／)\s*/] = :slash unless options[:allow_slash]
- misc_characters.each do |found, key|
- replaced = stringex_translate_character(key)
- replaced = " #{replaced} " unless key == :dot
- dummy.gsub!(found, replaced)
- end
- dummy = dummy.gsub(/(^|[[:alpha:]])'|`([[:alpha:]]|$)/, '\1\2').gsub(/[\.,:;()\[\]\/\?!\^'ʼ"_\|]/, " ").strip
end
# Converts HTML entities (taken from common Textile/RedCloth formattings) into plain text formats.
@@ -110,74 +63,25 @@ def convert_miscellaneous_characters(options = {})
# Note: This isn't an attempt at complete conversion of HTML entities, just those most likely
# to be generated by Textile.
def convert_miscellaneous_html_entities
- dummy = dup
- {
- "#822[01]" => :double_quote,
- "#821[67]" => :single_quote,
- "#8230" => :ellipsis,
- "#8211" => :en_dash,
- "#8212" => :em_dash,
- "#215" => :times,
- "gt" => :gt,
- "lt" => :lt,
- "(#8482|trade)" => :trade,
- "(#174|reg)" => :reg,
- "(#169|copy)" => :copy,
- "(#38|amp)" => :amp,
- "nbsp" => :nbsp,
- "(#162|cent)" => :cent,
- "(#163|pound)" => :pound,
- "(#188|frac14)" => :frac14,
- "(#189|frac12)" => :frac12,
- "(#190|frac34)" => :frac34,
- "(#247|divide)" => :divide,
- "(#176|deg)" => :deg
- }.each do |textiled, key|
- normal = stringex_translate_html_entity(key)
- dummy.gsub!(/&#{textiled};/, normal)
+ stringex_convert do
+ translate! :html_entities
+ cleanup_html_entities!
end
- dummy.gsub(/&[^;]+;/, "").strip
end
# Converts MS Word 'smart punctuation' to ASCII
#
def convert_smart_punctuation
- dummy = dup
- {
-
- "(“|”|\302\223|\302\224|\303\222|\303\223)" => '"',
- "(‘|’|\302\221|\302\222|\303\225)" => "'",
- "…" => "...",
- }.each do |smart, normal|
- dummy.gsub!(/#{smart}/, normal)
+ stringex_convert do
+ cleanup_smart_punctuation!
end
- dummy.strip
end
# Converts vulgar fractions from supported HTML entities and Unicode to plain text formats.
def convert_vulgar_fractions
- dummy = dup
- {
- "(&#188;|&frac14;|¼)" => :one_fourth,
- "(&#189;|&frac12;|½)" => :half,
- "(&#190;|&frac34;|¾)" => :three_fourths,
- "(&#8531;|⅓)" => :one_third,
- "(&#8532;|⅔)" => :two_thirds,
- "(&#8533;|⅕)" => :one_fifth,
- "(&#8534;|⅖)" => :two_fifths,
- "(&#8535;|⅗)" => :three_fifths,
- "(&#8536;|⅘)" => :four_fifths,
- "(&#8537;|⅙)" => :one_sixth,
- "(&#8538;|⅚)" => :five_sixths,
- "(&#8539;|⅛)" => :one_eighth,
- "(&#8540;|⅜)" => :three_eighths,
- "(&#8541;|⅝)" => :five_eighths,
- "(&#8542;|⅞)" => :seven_eighths
- }.each do |textiled, key|
- normal = stringex_translate_vulgar_fraction(key)
- dummy.gsub!(/#{textiled}/, normal)
+ stringex_convert do
+ translate! :vulgar_fractions
end
- dummy
end
# Returns the string limited in size to the value of limit.
@@ -213,11 +117,10 @@ def replace_whitespace(replacement = " ")
# Removes HTML tags from text.
# NOTE: This code is simplified from Tobias Luettke's regular expression in Typo[http://typosphere.org].
def strip_html_tags(leave_whitespace = false)
- name = /[\w:_-]+/
- value = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
- attr = /(#{name}(\s*=\s*#{value})?)/
- rx = /<[!\/?\[]?(#{name}|--)(\s+(#{attr}(\s+#{attr})*))?\s*([!\/?\]]+|--)?>/
- (leave_whitespace) ? gsub(rx, "").strip : gsub(rx, "").gsub(/\s+/, " ").strip
+ string = stringex_convert do
+ strip_html_tags!
+ end
+ leave_whitespace ? string : string.replace_whitespace(' ')
end
# Returns the string converted (via Textile/RedCloth) to HTML format
@@ -262,24 +165,12 @@ def to_url(options = {})
private
- def stringex_default_options
- Stringex::Configuration::StringExtensions.default_settings
- end
-
- def stringex_translate_character(key)
- Localization.translate(:characters, key)
+ def stringex_convert(options = {}, &block)
+ Localization.convert self, options, &block
end
- def stringex_translate_currency(key)
- Localization.translate(:currencies, key)
- end
-
- def stringex_translate_html_entity(key)
- Localization.translate(:html_entities, key)
- end
-
- def stringex_translate_vulgar_fraction(key)
- Localization.translate(:vulgar_fractions, key)
+ def stringex_default_options
+ Stringex::Configuration::StringExtensions.default_settings
end
end
View
12 test/i18n/default_localization_test.rb
@@ -6,7 +6,6 @@
class DefaultLocalizationTest < Test::Unit::TestCase
def setup
Stringex::Localization.reset!
-
Stringex::Localization.backend = :internal
end
@@ -23,7 +22,11 @@ def test_convert_miscellaneous_characters
"Food+Drink" => "Food plus Drink",
"this & that #2 @ bla.bla for $3" => "this and that number 2 at bla dot bla for 3 dollars",
"three + four ÷ 40 ° fahrenheit... end" => "three plus four divide 40 degrees fahrenheit dot dot dot end",
- "£4 but ¥5 * 100% = two" => "4 pounds but 5 yen star 100 percent equals two"
+ "£4 but ¥5 * 100% = two" => "4 pounds but 5 yen star 100 percent equals two",
+ "N.A.S.A. is cool" => "NASA is cool",
+ "That's not fair" => "Thats not fair",
+ "That`s not fair either" => "Thats not fair either",
+ " whitespace maintained " => " whitespace maintained ",
}.each do |misc, plain|
assert_equal plain, misc.convert_miscellaneous_characters
end
@@ -35,8 +38,9 @@ def test_convert_miscellaneous_html_entities
"Tea &amp; Sympathy" => "Tea and Sympathy",
"To be continued&#8230;" => "To be continued...",
"Foo&nbsp;Bar" => "Foo Bar",
- "100&#163;" => "100 pound",
- "35&deg;" => "35 degrees"
+ "100&#163;" => "100 pounds",
+ "35&deg;" => "35 degrees",
+ " whitespace maintained " => " whitespace maintained ",
}.each do |entitied, plain|
assert_equal plain, entitied.convert_miscellaneous_html_entities
end
Please sign in to comment.
Something went wrong with that request. Please try again.