Skip to content
Permalink
Browse files

Use String#scrub when available to tidy bytes

  • Loading branch information...
norman committed Dec 26, 2013
1 parent 8003c54 commit ab195841ddc7302ca6e6fc4a5962bc5ab3b8c09b
Showing with 35 additions and 35 deletions.
  1. +35 −35 activesupport/lib/active_support/multibyte/unicode.rb
@@ -212,37 +212,43 @@ def compose(codepoints)
codepoints
end

# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
# resulting in a valid UTF-8 string.
#
# Passing +true+ will forcibly tidy all bytes, assuming that the string's
# encoding is entirely CP1252 or ISO-8859-1.
def tidy_bytes(string, force = false)
return string if string.empty?

if force
return string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
# Ruby >= 2.1 has String#scrub, which is faster than the workaround used for < 2.1.
if RUBY_VERSION >= '2.1'
# Produces a valid UTF-8 string by swapping stray ISO-8859-1 / CP1252
# bytes for their UTF-8 equivalents.
#
# An empty +string+ is handed back untouched. When +force+ is +true+ the
# entire string is assumed to be CP1252 / ISO-8859-1 and is recoded in one
# pass; otherwise only the byte runs that String#scrub reports as invalid
# are recoded and the rest of the string is left as it is.
def tidy_bytes(string, force = false)
  if string.empty?
    string
  elsif force
    recode_windows1252_chars(string)
  else
    string.scrub { |invalid_bytes| recode_windows1252_chars(invalid_bytes) }
  end
end
else
def tidy_bytes(string, force = false)
return string if string.empty?
return recode_windows1252_chars(string) if force

# We can't transcode to the same format, so we choose a nearly-identical encoding.
# We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
# CP1252 when we get errors. The final string will be 'converted' back to UTF-8
# before returning.
reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_8_MAC)

source = string.dup
out = ''.force_encoding(Encoding::UTF_8_MAC)

loop do
reader.primitive_convert(source, out)
_, _, _, error_bytes, _ = reader.primitive_errinfo
break if error_bytes.nil?
out << error_bytes.encode(Encoding::UTF_8_MAC, Encoding::Windows_1252, invalid: :replace, undef: :replace)
end

# We can't transcode to the same format, so we choose a nearly-identical encoding.
# We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
# CP1252 when we get errors. The final string will be 'converted' back to UTF-8
# before returning.
reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_8_MAC)

source = string.dup
out = ''.force_encoding(Encoding::UTF_8_MAC)
reader.finish

loop do
reader.primitive_convert(source, out)
_, _, _, error_bytes, _ = reader.primitive_errinfo
break if error_bytes.nil?
out << error_bytes.encode(Encoding::UTF_8_MAC, Encoding::Windows_1252, invalid: :replace, undef: :replace)
out.encode!(Encoding::UTF_8)
end

reader.finish

out.encode!(Encoding::UTF_8)
end

# Returns the KC normalization of the string by default. NFKC is
@@ -371,14 +377,8 @@ def apply_mapping(string, mapping) #:nodoc:
end.pack('U*')
end

def tidy_byte(byte)
if byte < 160
[database.cp1252[byte] || byte].pack("U").unpack("C*")
elsif byte < 192
[194, byte]
else
[195, byte - 64]
end
# Reads the bytes of +string+ as Windows-1252 and returns them transcoded
# to UTF-8. Invalid or undefined bytes are replaced rather than raising.
def recode_windows1252_chars(string)
  replacement_policy = { invalid: :replace, undef: :replace }
  string.encode(Encoding::UTF_8, Encoding::Windows_1252, **replacement_policy)
end

def database

0 comments on commit ab19584

Please sign in to comment.
You can’t perform that action at this time.