Skip to content

Commit

Permalink
Improve ActiveSupport::Inflector.transliterate performance
Browse files Browse the repository at this point in the history
resolves #46569

```ruby

# Self-contained benchmark script: dependencies are declared inline so the
# file can be run directly with `ruby` and no separate Gemfile.
require "bundler/inline"

# Passing `true` asks Bundler to install any gems that are missing before
# the script continues.
gemfile(true) do
  source "https://rubygems.org"

  # Expands the `github:` shorthand used below into a full HTTPS clone URL.
  git_source(:github) { |repo| "https://github.com/#{repo}.git" }

  # Benchmark against the current `main` branch of Rails rather than a release.
  gem "rails", github: "rails/rails", branch: "main"
  gem "benchmark-ips"
end

# Only the pieces of Active Support needed for transliteration are loaded.
require "active_support"
require "active_support/inflector/transliterate"

module ActiveSupport::Inflector
  # Benchmark candidate for +transliterate+ that short-circuits ASCII-only
  # input before any tidy_bytes / normalization / I18n work is done.
  #
  # string      - The String to transliterate.
  # replacement - Forwarded to I18n.transliterate as +replacement:+
  #               (defaults to "?").
  # locale      - Forwarded to I18n.transliterate as +locale:+.
  #
  # Returns the transliterated String, in the encoding of the input. For
  # ASCII-only input this is an unfrozen copy of the input.
  #
  # Raises ArgumentError when +string+ is not a String, or when its encoding
  # is not listed in ALLOWED_ENCODINGS_FOR_TRANSLITERATE.
  def transliterate_fast(string, replacement = "?", locale: nil)
    raise ArgumentError, "Can only transliterate strings. Received #{string.class.name}" unless string.is_a?(String)
    raise ArgumentError, "Cannot transliterate strings with #{string.encoding} encoding" unless ALLOWED_ENCODINGS_FOR_TRANSLITERATE.include?(string.encoding)

    # Fast path: nothing to transliterate. Return a copy rather than the
    # argument itself so the result is never frozen and never aliases the
    # caller's string (callers may mutate the returned value in place).
    return string.dup if string.ascii_only?

    # The steps below mutate the string in place (force_encoding / encode!),
    # so work on a copy when the input is frozen.
    string = string.dup if string.frozen?
    input_encoding = string.encoding

    # US-ASCII is a subset of UTF-8 so we'll force encoding as UTF-8 if
    # US-ASCII is given. This way we can let tidy_bytes handle the string
    # in the same way as we do for UTF-8
    string.force_encoding(Encoding::UTF_8) if string.encoding == Encoding::US_ASCII

    # GB18030 is Unicode compatible but is not a direct mapping so needs to be
    # transcoded. Using invalid/undef :replace will result in loss of data in
    # the event of invalid characters, but since tidy_bytes will replace
    # invalid/undef with a "?" we're safe to do the same beforehand
    string.encode!(Encoding::UTF_8, invalid: :replace, undef: :replace) if string.encoding == Encoding::GB18030

    transliterated = I18n.transliterate(
      ActiveSupport::Multibyte::Unicode.tidy_bytes(string).unicode_normalize(:nfc),
      replacement: replacement,
      locale: locale
    )

    # Restore the string encoding of the input if it was not UTF-8.
    # Apply invalid/undef :replace as tidy_bytes does
    transliterated.encode!(input_encoding, invalid: :replace, undef: :replace) if input_encoding != transliterated.encoding

    transliterated
  end
end

# Benchmark inputs, keyed by the label printed in each section banner.
# They cover the ASCII-only fast path (empty, single space, plain ASCII,
# US-ASCII encoded, long ASCII) and inputs containing non-ASCII characters
# (French, Chinese) that still need the full transliteration pipeline.
#
# Only the Hash itself is frozen; the String values are left as they are so
# the benchmark exercises the same inputs as before.
SCENARIOS = {
  "Empty"                         => "",
  "Single Space"                  => " ",
  "ASCII string"                  => "This is a normal ASCII String.",
  "US ASCII encoded String"       => String.new("This is a normal ASCII String.", encoding: Encoding::US_ASCII),
  "Very Long String"              => "Very Long String :)" * 100,
  "Very Long french String"       => "Very Long Stringé :)" * 100,
  "French string"                 => "Ceci est une chaîne de test pour la méthode de translittération.",
  "UTF-8 encoded Chinese string"  => String.new("這是音譯方法的測試字符串", encoding: Encoding::UTF_8)
}.freeze

# For every scenario, print an 80-column "=" banner carrying its label
# (with a blank line on either side), then compare the stock implementation
# against the fast-path candidate on that input.
SCENARIOS.each do |label, input|
  banner = " #{label} ".center(80, "=")

  puts
  puts banner
  puts

  Benchmark.ips do |bench|
    bench.report("transliterate") do
      ActiveSupport::Inflector.transliterate(input)
    end

    bench.report("transliterate_fast") do
      ActiveSupport::Inflector.transliterate_fast(input)
    end

    bench.compare!
  end
end
```

```txt
==================================== Empty =====================================

Warming up --------------------------------------
       transliterate    65.500k i/100ms
  transliterate_fast   687.485k i/100ms
Calculating -------------------------------------
       transliterate    657.632k (± 0.8%) i/s -      3.340M in   5.079936s
  transliterate_fast      6.869M (± 1.6%) i/s -     34.374M in   5.005813s

Comparison:
  transliterate_fast:  6868816.3 i/s
       transliterate:   657631.8 i/s - 10.44x  (± 0.00) slower

================================= Single Space =================================

Warming up --------------------------------------
       transliterate    62.732k i/100ms
  transliterate_fast   678.223k i/100ms
Calculating -------------------------------------
       transliterate    628.475k (± 0.8%) i/s -      3.199M in   5.090978s
  transliterate_fast      6.799M (± 0.2%) i/s -     34.589M in   5.087534s

Comparison:
  transliterate_fast:  6798890.3 i/s
       transliterate:   628475.2 i/s - 10.82x  (± 0.00) slower

================================= ASCII string =================================

Warming up --------------------------------------
       transliterate    32.095k i/100ms
  transliterate_fast   687.396k i/100ms
Calculating -------------------------------------
       transliterate    319.529k (± 0.8%) i/s -      1.605M in   5.022547s
  transliterate_fast      6.879M (± 0.3%) i/s -     35.057M in   5.096224s

Comparison:
  transliterate_fast:  6879113.6 i/s
       transliterate:   319528.9 i/s - 21.53x  (± 0.00) slower

=========================== US ASCII encoded String ============================

Warming up --------------------------------------
       transliterate    33.027k i/100ms
  transliterate_fast   688.354k i/100ms
Calculating -------------------------------------
       transliterate    330.268k (± 0.9%) i/s -      1.651M in   5.000445s
  transliterate_fast      6.861M (± 0.8%) i/s -     34.418M in   5.016963s

Comparison:
  transliterate_fast:  6860726.0 i/s
       transliterate:   330267.8 i/s - 20.77x  (± 0.00) slower

=============================== Very Long String ===============================

Warming up --------------------------------------
       transliterate   985.000  i/100ms
  transliterate_fast   672.674k i/100ms
Calculating -------------------------------------
       transliterate      9.899k (± 0.5%) i/s -     50.235k in   5.074820s
  transliterate_fast      6.729M (± 0.7%) i/s -     34.306M in   5.098807s

Comparison:
  transliterate_fast:  6728668.4 i/s
       transliterate:     9899.2 i/s - 679.72x  (± 0.00) slower

=========================== Very Long french String ============================

Warming up --------------------------------------
       transliterate   671.000  i/100ms
  transliterate_fast   671.000  i/100ms
Calculating -------------------------------------
       transliterate      6.635k (± 1.9%) i/s -     33.550k in   5.058424s
  transliterate_fast      6.622k (± 1.7%) i/s -     33.550k in   5.068289s

Comparison:
       transliterate:     6634.9 i/s
  transliterate_fast:     6621.7 i/s - same-ish: difference falls within error

================================ French string =================================

Warming up --------------------------------------
       transliterate    14.726k i/100ms
  transliterate_fast    14.679k i/100ms
Calculating -------------------------------------
       transliterate    145.933k (± 1.5%) i/s -    736.300k in   5.046537s
  transliterate_fast    146.753k (± 1.2%) i/s -    733.950k in   5.001937s

Comparison:
  transliterate_fast:   146752.8 i/s
       transliterate:   145933.1 i/s - same-ish: difference falls within error

========================= UTF-8 encoded Chinese string =========================

Warming up --------------------------------------
       transliterate    13.905k i/100ms
  transliterate_fast    14.093k i/100ms
Calculating -------------------------------------
       transliterate    141.222k (± 1.9%) i/s -    709.155k in   5.023366s
  transliterate_fast    140.510k (± 1.7%) i/s -    704.650k in   5.016400s

Comparison:
       transliterate:   141221.9 i/s
  transliterate_fast:   140510.4 i/s - same-ish: difference falls within error
```
  • Loading branch information
SeanLF committed Nov 28, 2022
1 parent fae0bae commit 8194f8e
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion activesupport/lib/active_support/inflector/transliterate.rb
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@ module Inflector
# Transliteration is restricted to UTF-8, US-ASCII, and GB18030 strings.
# Other encodings will raise an ArgumentError.
def transliterate(string, replacement = "?", locale: nil)
string = string.dup if string.frozen?
raise ArgumentError, "Can only transliterate strings. Received #{string.class.name}" unless string.is_a?(String)
raise ArgumentError, "Cannot transliterate strings with #{string.encoding} encoding" unless ALLOWED_ENCODINGS_FOR_TRANSLITERATE.include?(string.encoding)

string = string.dup if string.frozen?
return string if string.ascii_only?

input_encoding = string.encoding

# US-ASCII is a subset of UTF-8 so we'll force encoding as UTF-8 if
Expand Down

0 comments on commit 8194f8e

Please sign in to comment.