From a1fb7dec9fac2fed6e93653243cecbd01e78e9e1 Mon Sep 17 00:00:00 2001 From: Adrien Rey-Jarthon Date: Tue, 11 Apr 2023 19:23:16 +0200 Subject: [PATCH] Rename IDNA backend implementations and refactor loading mechanism --- .github/workflows/test.yml | 2 +- README.md | 32 +++++++++++++++- benchmark/idna.rb | 37 +++++++++--------- lib/addressable/idna.rb | 32 +++++++++++++++- lib/addressable/idna/libidn1.rb | 60 +++++++++++++++++++++++++++++ lib/addressable/idna/libidn2.rb | 59 +++++++++++++++++++++++++++++ lib/addressable/idna/native.rb | 62 ++---------------------------- lib/addressable/idna/native2.rb | 67 --------------------------------- lib/addressable/idna/pure.rb | 6 +-- spec/addressable/idna_spec.rb | 17 ++++----- tasks/profile.rake | 12 +++--- 11 files changed, 218 insertions(+), 168 deletions(-) create mode 100644 lib/addressable/idna/libidn1.rb create mode 100644 lib/addressable/idna/libidn2.rb delete mode 100644 lib/addressable/idna/native2.rb diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2d4f5beb..08d804a5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,7 +13,7 @@ jobs: fail-fast: false matrix: ruby: [2.7] - idna_mode: [native2, native, pure] + idna_mode: [libidn2, libidn1, pure] os: [ubuntu-20.04] env: IDNA_MODE: ${{ matrix.idna_mode }} diff --git a/README.md b/README.md index 9892f615..bd981761 100644 --- a/README.md +++ b/README.md @@ -94,8 +94,21 @@ template.extract(uri) $ gem install addressable ``` -You may optionally turn on native IDN support by installing libidn and the -idn gem: +# IDNA support (unicode hostnames) + +Three IDNA implementations are available, the first one available is used: +- A `libidn2` wrapper (if `libidn2` is installed), supporting IDNA2008+UTS#46. +- A `libidn1` wrapper (if `libidn` and the `idn` gem are installed), supporting IDNA2003. +- A pure ruby implementation (slower), [almost](https://github.com/sporkmonger/addressable/issues/491) supporting IDNA2008. 
+ +To install `libidn2`: + +```console +$ sudo apt-get install libidn2-dev # Debian/Ubuntu +$ brew install libidn2 # OS X +``` + +To install the legacy `libidn1` and the `idn` gem (also add it to your Gemfile): ```console $ sudo apt-get install libidn11-dev # Debian/Ubuntu @@ -103,6 +116,21 @@ $ brew install libidn # OS X $ gem install idn-ruby ``` +You can check which implementation is active with: + +```ruby +puts Addressable::IDNA.backend.name +``` + +Finally, if you want to force a different IDNA implementation, you can do so like this (after addressable is required): + +```ruby +require "addressable/idna/pure.rb" +Addressable::IDNA.backend = Addressable::IDNA::Pure +require "addressable/idna/libidn1" +Addressable::IDNA.backend = Addressable::IDNA::Libidn1 +``` + # Semantic Versioning This project uses [Semantic Versioning](https://semver.org/). You can (and should) specify your diff --git a/benchmark/idna.rb b/benchmark/idna.rb index 69978299..0cadc11a 100644 --- a/benchmark/idna.rb +++ b/benchmark/idna.rb @@ -2,32 +2,30 @@ # frozen_string_literal: true. 
require "benchmark" +require "addressable/idna/libidn2" +require "addressable/idna/libidn1" +require "addressable/idna/pure" value = "fiᆵリ宠퐱卄.com" expected = "xn--fi-w1k207vk59a3qk9w9r.com" N = 100_000 +fail "pure ruby does not match" unless expected == Addressable::IDNA::Pure.to_ascii(value) +fail "libidn does not match" unless expected == Addressable::IDNA::Libidn1.to_ascii(value) +fail "addressable does not match" unless expected == Addressable::IDNA::Libidn2.to_ascii(value) + Benchmark.bmbm do |x| - x.report("pure") { - load "lib/addressable/idna/pure.rb" - fail "pure ruby does not match" unless expected == Addressable::IDNA.to_ascii(value) - N.times { Addressable::IDNA.to_unicode(Addressable::IDNA.to_ascii(value)) } - Addressable.send(:remove_const, :IDNA) - } + x.report("pure") { N.times { + Addressable::IDNA::Pure.to_unicode(Addressable::IDNA::Pure.to_ascii(value)) + } } - x.report("libidn") { - load "lib/addressable/idna/native.rb" - fail "libidn does not match" unless expected == Addressable::IDNA.to_ascii(value) - N.times { Addressable::IDNA.to_unicode(Addressable::IDNA.to_ascii(value)) } - Addressable.send(:remove_const, :IDNA) - } + x.report("libidn") { N.times { + Addressable::IDNA::Libidn1.to_unicode(Addressable::IDNA::Libidn1.to_ascii(value)) + } } - x.report("libidn2") { - load "lib/addressable/idna/native2.rb" - fail "addressable does not match" unless expected == Addressable::IDNA.to_ascii(value) - N.times { Addressable::IDNA.to_unicode(Addressable::IDNA.to_ascii(value)) } - Addressable.send(:remove_const, :IDNA) - } + x.report("libidn2") { N.times { + Addressable::IDNA::Libidn2.to_unicode(Addressable::IDNA::Libidn2.to_ascii(value)) + } } end # > ruby benchmark/idna.rb @@ -43,10 +41,9 @@ # libidn2 0.764782 0.000000 0.764782 ( 0.764863) puts "\nMemory leak test for libidn2 (memory should stabilize quickly):" -load "lib/addressable/idna/native2.rb" GC.disable # Only run GC when manually called 10.times do - N.times { 
Addressable::IDNA.to_unicode(Addressable::IDNA.to_ascii(value)) } + N.times { Addressable::IDNA::Libidn2.to_unicode(Addressable::IDNA::Libidn2.to_ascii(value)) } GC.start # Run a major GC pid, size = `ps ax -o pid,rss | grep -E "^[[:space:]]*#{$$}"`.strip.split.map(&:to_i) puts " Memory: #{size/1024}MB" # show process memory diff --git a/lib/addressable/idna.rb b/lib/addressable/idna.rb index d66b5db2..db7f8c2e 100644 --- a/lib/addressable/idna.rb +++ b/lib/addressable/idna.rb @@ -16,15 +16,43 @@ # limitations under the License. #++ +module Addressable + module IDNA + class << self + attr_accessor :backend + + # public interface implemented by all backends + def to_ascii(value) + backend.to_ascii(value) + end + + def to_unicode(value) + backend.to_unicode(value) + end + + # @deprecated Use {String#unicode_normalize(:nfkc)} instead + def unicode_normalize_kc(value) + value.to_s.unicode_normalize(:nfkc) + end + + extend Gem::Deprecate + deprecate :unicode_normalize_kc, "String#unicode_normalize(:nfkc)", 2023, 4 + end + end +end + begin - require "addressable/idna/native2" + require "addressable/idna/libidn2" + Addressable::IDNA.backend = Addressable::IDNA::Libidn2 rescue LoadError # libidn2 or the ffi gem was not available, fall back on libidn1 begin - require "addressable/idna/native" + require "addressable/idna/libidn1" + Addressable::IDNA.backend = Addressable::IDNA::Libidn1 rescue LoadError # libidn or the idn gem was not available, fall back on a pure-Ruby # implementation... 
require "addressable/idna/pure" + Addressable::IDNA.backend = Addressable::IDNA::Pure end end \ No newline at end of file diff --git a/lib/addressable/idna/libidn1.rb b/lib/addressable/idna/libidn1.rb new file mode 100644 index 00000000..0ca5e66e --- /dev/null +++ b/lib/addressable/idna/libidn1.rb @@ -0,0 +1,60 @@ +# frozen_string_literal: true + +#-- +# Copyright (C) Bob Aman +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#++ + +# libidn1 implementing IDNA2003 +require "idn" + +module Addressable + module IDNA + module Libidn1 + class << self + # @deprecated Use {String#unicode_normalize(:nfkc)} instead + def unicode_normalize_kc(value) + value.to_s.unicode_normalize(:nfkc) + end + + extend Gem::Deprecate + deprecate :unicode_normalize_kc, "String#unicode_normalize(:nfkc)", 2023, 4 + end + + def self.to_ascii(value) + value.to_s.split('.', -1).map do |segment| + if segment.size > 0 && segment.size < 64 + IDN::Idna.toASCII(segment, IDN::Idna::ALLOW_UNASSIGNED) + elsif segment.size >= 64 + segment + else + '' + end + end.join('.') + end + + def self.to_unicode(value) + value.to_s.split('.', -1).map do |segment| + if segment.size > 0 && segment.size < 64 + IDN::Idna.toUnicode(segment, IDN::Idna::ALLOW_UNASSIGNED) + elsif segment.size >= 64 + segment + else + '' + end + end.join('.') + end + end + end +end diff --git a/lib/addressable/idna/libidn2.rb b/lib/addressable/idna/libidn2.rb new file mode 100644 index 00000000..a63df507 --- /dev/null +++ 
b/lib/addressable/idna/libidn2.rb @@ -0,0 +1,59 @@ +# frozen_string_literal: true + +#-- +# Copyright (C) Bob Aman +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#++ + +# libidn2 implementing IDNA2008+TR46 +require "ffi" + +module Addressable + module IDNA + module Libidn2 + extend FFI::Library + + ffi_lib ["idn2", "libidn2.0", "libidn2.so.0"] + + attach_function :idn2_to_ascii_8z, %i[string pointer int], :int + attach_function :idn2_to_unicode_8z8z, %i[string pointer int], :int + attach_function :idn2_strerror, [:int], :string + attach_function :idn2_free, [:pointer], :void + + IDN2_TRANSITIONAL = 4 + IDN2_NONTRANSITIONAL = 8 + + def self.to_ascii(value) + return value if value.ascii_only? 
+ pointer = FFI::MemoryPointer.new(:pointer) + res = idn2_to_ascii_8z(value, pointer, IDN2_NONTRANSITIONAL) + # Fallback to Transitional mode in case of disallowed character + res = idn2_to_ascii_8z(value, pointer, IDN2_TRANSITIONAL) if res != 0 + raise "libidn2 failed to convert \"#{value}\" to ascii (#{idn2_strerror(res)})" if res != 0 + result = pointer.read_pointer.read_string + idn2_free(pointer.read_pointer) + result + end + + def self.to_unicode(value) + pointer = FFI::MemoryPointer.new(:pointer) + res = idn2_to_unicode_8z8z(value, pointer, IDN2_NONTRANSITIONAL) + return value if res != 0 + result = pointer.read_pointer.read_string + idn2_free(pointer.read_pointer) + result.force_encoding('UTF-8') + end + end + end +end diff --git a/lib/addressable/idna/native.rb b/lib/addressable/idna/native.rb index 1ebdf720..88ef5fde 100644 --- a/lib/addressable/idna/native.rb +++ b/lib/addressable/idna/native.rb @@ -1,58 +1,4 @@ -# frozen_string_literal: true - -#-- -# Copyright (C) Bob Aman -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#++ - -# libidn1 implementing IDNA2003 -require "idn" - -module Addressable - module IDNA - class << self - # @deprecated Use {String#unicode_normalize(:nfkc)} instead - def unicode_normalize_kc(value) - value.to_s.unicode_normalize(:nfkc) - end - - extend Gem::Deprecate - deprecate :unicode_normalize_kc, "String#unicode_normalize(:nfkc)", 2023, 4 - end - - def self.to_ascii(value) - value.to_s.split('.', -1).map do |segment| - if segment.size > 0 && segment.size < 64 - IDN::Idna.toASCII(segment, IDN::Idna::ALLOW_UNASSIGNED) - elsif segment.size >= 64 - segment - else - '' - end - end.join('.') - end - - def self.to_unicode(value) - value.to_s.split('.', -1).map do |segment| - if segment.size > 0 && segment.size < 64 - IDN::Idna.toUnicode(segment, IDN::Idna::ALLOW_UNASSIGNED) - elsif segment.size >= 64 - segment - else - '' - end - end.join('.') - end - end -end +# Deprecated, for backward compatibility only +require "addressable/idna/libidn1" +Addressable::IDNA.backend = Addressable::IDNA::Libidn1 +warn "NOTE: loading 'addressable/idna/native' is deprecated; use 'addressable/idna/libidn1' instead and set `Addressable::IDNA.backend = Addressable::IDNA::Libidn1` to force libidn1." \ No newline at end of file diff --git a/lib/addressable/idna/native2.rb b/lib/addressable/idna/native2.rb deleted file mode 100644 index 9bb59e46..00000000 --- a/lib/addressable/idna/native2.rb +++ /dev/null @@ -1,67 +0,0 @@ -# frozen_string_literal: true - -#-- -# Copyright (C) Bob Aman -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -#++ - -# libidn2 implementing IDNA2008+TR46 -require "ffi" - -module Addressable - module IDNA - extend FFI::Library - - ffi_lib ["idn2", "libidn2.0", "libidn2.so.0"] - - attach_function :idn2_to_ascii_8z, %i[string pointer int], :int - attach_function :idn2_to_unicode_8z8z, %i[string pointer int], :int - attach_function :idn2_strerror, [:int], :string - attach_function :idn2_free, [:pointer], :void - - IDN2_TRANSITIONAL = 4 - IDN2_NONTRANSITIONAL = 8 - - class << self - # @deprecated Use {String#unicode_normalize(:nfkc)} instead - def unicode_normalize_kc(value) - value.to_s.unicode_normalize(:nfkc) - end - - extend Gem::Deprecate - deprecate :unicode_normalize_kc, "String#unicode_normalize(:nfkc)", 2023, 4 - end - - def self.to_ascii(value) - return value if value.ascii_only? - pointer = FFI::MemoryPointer.new(:pointer) - res = idn2_to_ascii_8z(value, pointer, IDN2_NONTRANSITIONAL) - # Fallback to Transitional mode in case of disallowed character - res = idn2_to_ascii_8z(value, pointer, IDN2_TRANSITIONAL) if res != 0 - raise "libidn2 failed to convert \"#{value}\" to ascii (#{idn2_strerror(res)})" if res != 0 - result = pointer.read_pointer.read_string - idn2_free(pointer.read_pointer) - result - end - - def self.to_unicode(value) - pointer = FFI::MemoryPointer.new(:pointer) - res = idn2_to_unicode_8z8z(value, pointer, IDN2_NONTRANSITIONAL) - return value if res != 0 - result = pointer.read_pointer.read_string - idn2_free(pointer.read_pointer) - result.force_encoding('UTF-8') - end - end -end diff --git a/lib/addressable/idna/pure.rb b/lib/addressable/idna/pure.rb index 3d6ffbad..78696022 100644 --- a/lib/addressable/idna/pure.rb +++ b/lib/addressable/idna/pure.rb @@ -17,8 +17,8 @@ #++ -module Addressable - module IDNA +module Addressable::IDNA + module Pure # This module is loosely based on idn_actionmailer by Mick Staugaard, # the unicode library by Yoshida 
Masato, and the punycode implementation # by Kazuhiro Nishiyama. Most of the code was copied verbatim, but @@ -97,7 +97,7 @@ def self.to_unicode(input) if part =~ /^#{ACE_PREFIX}(.+)/ begin punycode_decode(part[/^#{ACE_PREFIX}(.+)/, 1]) - rescue Addressable::IDNA::PunycodeBadInput + rescue Addressable::IDNA::Pure::PunycodeBadInput # toUnicode is explicitly defined as never-fails by the spec part end diff --git a/spec/addressable/idna_spec.rb b/spec/addressable/idna_spec.rb index 9e2fb7d6..f8a8cec3 100644 --- a/spec/addressable/idna_spec.rb +++ b/spec/addressable/idna_spec.rb @@ -259,8 +259,8 @@ describe Addressable::IDNA, "when using the pure-Ruby implementation" do before :all do - Addressable.send(:remove_const, :IDNA) - load "addressable/idna/pure.rb" + require "addressable/idna/pure" + Addressable::IDNA.backend = Addressable::IDNA::Pure end it_should_behave_like "converting from unicode to ASCII" @@ -275,8 +275,9 @@ it "should not blow up inside fibers" do f = Fiber.new do - Addressable.send(:remove_const, :IDNA) + Addressable::IDNA.send(:remove_const, :Pure) load "addressable/idna/pure.rb" + Addressable::IDNA.backend = Addressable::IDNA::Pure end f.resume end @@ -287,12 +288,11 @@ end begin - require "idn" + require "addressable/idna/libidn1" describe Addressable::IDNA, "when using the libidn1 native implementation (idn gem)" do before :all do - Addressable.send(:remove_const, :IDNA) - load "addressable/idna/native.rb" + Addressable::IDNA.backend = Addressable::IDNA::Libidn1 end it_should_behave_like "converting from unicode to ASCII" @@ -310,12 +310,11 @@ end begin - require "addressable/idna/native2.rb" + require "addressable/idna/libidn2" describe Addressable::IDNA, "when using the libidn2 native implementation (ffi)" do before :all do - Addressable.send(:remove_const, :IDNA) - load "addressable/idna/native2.rb" + Addressable::IDNA.backend = Addressable::IDNA::Libidn2 end it_should_behave_like "converting from unicode to ASCII" diff --git 
a/tasks/profile.rake b/tasks/profile.rake index 29bc5459..1ec75e0a 100644 --- a/tasks/profile.rake +++ b/tasks/profile.rake @@ -39,11 +39,11 @@ namespace :profile do require "memory_profiler" require "addressable/uri" if ENV["IDNA_MODE"] == "pure" - Addressable.send(:remove_const, :IDNA) - load "addressable/idna/pure.rb" - elsif ENV["IDNA_MODE"] == "native" - Addressable.send(:remove_const, :IDNA) - load "addressable/idna/native.rb" + require "addressable/idna/pure" + Addressable::IDNA.backend = Addressable::IDNA::Pure + elsif ENV["IDNA_MODE"] == "libidn1" + require "addressable/idna/libidn1" + Addressable::IDNA.backend = Addressable::IDNA::Libidn1 end start_at = Time.now.to_f @@ -56,7 +56,6 @@ namespace :profile do end end_at = Time.now.to_f print_options = { scale_bytes: true, normalize_paths: true } - puts "\n\n" if ENV["CI"] report.pretty_print(**print_options) @@ -67,6 +66,7 @@ namespace :profile do puts "Total allocated: #{t_allocated} (#{report.total_allocated} objects)" puts "Total retained: #{t_retained} (#{report.total_retained} objects)" puts "Took #{end_at - start_at} seconds" + puts "IDNA backend: #{Addressable::IDNA.backend.name}" FileUtils.mkdir_p("tmp") report.pretty_print(to_file: "tmp/memprof.txt", **print_options)