diff --git a/app/models/name.rb b/app/models/name.rb index 04d605fc..d2ca2ee5 100644 --- a/app/models/name.rb +++ b/app/models/name.rb @@ -141,6 +141,7 @@ class Name < ApplicationRecord include Name::Inferences include Name::Network include Name::Wiki + include Name::FuzzySearch attr_accessor :only_display attr_accessor :nomenclatural_type_entry @@ -191,57 +192,6 @@ def all_public where(status: public_status, redirect: nil) end - ## - # Performs a fuzzy search for names similar to +query+. The search - # parameters are: - # - method: One of +:similarity+ (default) or +:levenshtein+ - # - threshold: Limit to find matches, - # by default 0.7 (similarity) or 2 (levenshtein) - # - limit: Maximum number of results to return - # - selection: Pre-selection of names included in the search. One of: - # - all_valid: (default) All validly published names - # - all_public: All publicly visible names - # - valid_genera: All validly published genus names - # - public_genera: All publicly visible genus names - # - An ActiveRecord query on the +names+ table - def fuzzy_match( - query, method: :similarity, threshold: nil, limit: 10, - selection: :all_valid - ) - return unless ActiveRecord::Base.connection.adapter_name == 'PostgreSQL' - - case selection - when :all_valid - selection = all_valid - when :all_public - selection = all_public - when :valid_genera - selection = all_valid.where(rank: :genus) - when :public_genera - selection = all_public.where(rank: :genus) - end - - clean_query = ActiveRecord::Base.connection.quote(query) - case method.to_sym - when :similarity - threshold ||= 0.7 - selection - .select("id, name, similarity(name, #{clean_query}) AS score") - .where('similarity(name, ?) > ?', query, threshold) - .order('score DESC') - .limit(limit) - when :levenshtein - threshold ||= 2 - selection - .select("id, name, levenshtein(name, #{clean_query}) AS score") - .where('levenshtein(name, ?) 
<= ?', query, threshold) - .order('score ASC') - .limit(limit) - else - raise ArgumentError, "Unsupported fuzzy match method: #{method}" - end - end - # ============ --- CLASS > ETYMOLOGY --- ============ def etymology_particles @@ -465,7 +415,7 @@ def abbr_base_name # which is only possible (under the SeqCode) for genera and species but rank # is not evaluated. Use +is_type_species?+ to test for the rank first and # avoid unnecessary database queries - # + # # TODO - Performance issue # This loads the full parent, which causes N+1 issues when simply rendering # a list of names @@ -683,7 +633,7 @@ def is_variant?(alt_spelling) ## # This method always return +nil+ for names that are not at (inferred) rank # of genus or species - # + # # Find names similar to the current one (using the canonical spelling # from +base_name+) with Levenshtein ≤ 3, considering a search space # defined by the taxonomic rank and +among+: @@ -716,7 +666,7 @@ def similar_names(among = :valid) end selection = selection.where.not(id: id) - self.class.fuzzy_match( + self.class.fuzzy_search( base_name, method: :levenshtein, selection: selection ) end @@ -858,7 +808,7 @@ def can_edit?(user) return true if user.curator? return true if draft? && user?(user) false - end + end def can_edit_validated?(user) return false if only_display @@ -1110,7 +1060,7 @@ def update_type_genome(new_accession, new_database = nil) ## # Returns the expected type of type as the String representation of the # expected class - # + # # Note that this differs from +expected_type_rank+ in that the current # function uses +inferred_rank+ regardless of defined +rank+ def expected_type_type diff --git a/app/models/name/fuzzy_search.rb b/app/models/name/fuzzy_search.rb new file mode 100644 index 00000000..aac8ebc8 --- /dev/null +++ b/app/models/name/fuzzy_search.rb @@ -0,0 +1,79 @@ +module Name::FuzzySearch + extend ActiveSupport::Concern + + class_methods do + # Performs a fuzzy search for names similar to the given query. 
+    # @param query [String] Text compared against the name column.
+    # @param method [Symbol, String] :similarity or :levenshtein (ArgumentError otherwise).
+    # @param threshold [Numeric, nil] Cutoff: similarity > threshold (default 0.7) or levenshtein <= threshold (default 2).
+    # @param limit [Integer] The maximum number of results to return.
+    # @param selection [Symbol, ActiveRecord::Relation] The selection of names to search.
+    #   Can be :all_valid, :all_public, :valid_genera, :public_genera, or a custom query.
+    # @return [ActiveRecord::Relation, nil] Matches ordered by score, or nil unless the adapter is PostgreSQL.
+    def fuzzy_search(
+      query,
+      method: :similarity,
+      threshold: nil,
+      limit: 10,
+      selection: :all_valid
+    )
+      return unless ActiveRecord::Base.connection.adapter_name == 'PostgreSQL'
+
+      selection = resolve_selection(selection)
+
+      case method.to_sym
+      when :similarity
+        fuzzy_similarity_search(selection, query, threshold || 0.7, limit)
+      when :levenshtein
+        fuzzy_levenshtein_search(selection, query, threshold || 2, limit)
+      else
+        raise ArgumentError, "Unsupported fuzzy match method: #{method}"
+      end
+    end
+
+    private
+
+    def resolve_selection(selection)
+      case selection
+      when :all_valid
+        all_valid
+      when :all_public
+        all_public
+      when :valid_genera
+        all_valid.where(rank: :genus)
+      when :public_genera
+        all_public.where(rank: :genus)
+      when ActiveRecord::Relation
+        selection
+      else
+        raise ArgumentError, "Unsupported selection: #{selection.inspect}"
+      end
+    end
+
+    def fuzzy_similarity_search(selection, query, threshold, limit)
+      selection
+        .select(
+          sanitize_sql_array([
+            'id, name, similarity(name, ?) AS score',
+            query
+          ])
+        )
+        .where('similarity(name, ?) > ?', query, threshold)
+        .order('score DESC')
+        .limit(limit)
+    end
+
+    def fuzzy_levenshtein_search(selection, query, threshold, limit)
+      selection
+        .select(
+          sanitize_sql_array([
+            'id, name, levenshtein(name, ?) AS score',
+            query
+          ])
+        )
+        .where('levenshtein(name, ?) 
<= ?', query, threshold) + .order('score ASC') + .limit(limit) + end + end +end diff --git a/app/services/name/fuzzy_search.rb b/app/services/name/fuzzy_search.rb deleted file mode 100644 index a731ff13..00000000 --- a/app/services/name/fuzzy_search.rb +++ /dev/null @@ -1,73 +0,0 @@ -# frozen_string_literal: true - -module Services - module Name - # Service object to handle fuzzy search for names. - # This encapsulates the logic for finding similar names using - # PostgreSQL's similarity functions. - class FuzzySearch - # Performs a fuzzy search for names similar to the given query. - # @param query [String] The search query. - # @param method [Symbol] The search method (:similarity or :levenshtein). - # @param threshold [Float, Integer] The threshold for matching. - # @param limit [Integer] The maximum number of results to return. - # @param selection [Symbol, ActiveRecord::Relation] The selection of names to search. - # Can be :all_valid, :all_public, :valid_genera, :public_genera, or a custom query. - # @return [ActiveRecord::Relation] The matching names. 
- def self.call( - query, - method: :similarity, - threshold: nil, - limit: 10, - selection: :all_valid - ) - return unless ActiveRecord::Base.connection.adapter_name == 'PostgreSQL' - - selection = resolve_selection(selection) - clean_query = ActiveRecord::Base.connection.quote(query) - - case method.to_sym - when :similarity - perform_similarity_search(selection, clean_query, threshold || 0.7, limit) - when :levenshtein - perform_levenshtein_search(selection, clean_query, threshold || 2, limit) - else - raise ArgumentError, "Unsupported fuzzy match method: #{method}" - end - end - - private_class_method def self.resolve_selection(selection) - case selection - when :all_valid - ::Name.all_valid - when :all_public - ::Name.all_public - when :valid_genera - ::Name.all_valid.where(rank: :genus) - when :public_genera - ::Name.all_public.where(rank: :genus) - when ActiveRecord::Relation - selection - else - raise ArgumentError, "Unsupported selection: #{selection}" - end - end - - private_class_method def self.perform_similarity_search(selection, query, threshold, limit) - selection - .select("id, name, similarity(name, #{query}) AS score") - .where('similarity(name, ?) > ?', query, threshold) - .order('score DESC') - .limit(limit) - end - - private_class_method def self.perform_levenshtein_search(selection, query, threshold, limit) - selection - .select("id, name, levenshtein(name, #{query}) AS score") - .where('levenshtein(name, ?) <= ?', query, threshold) - .order('score ASC') - .limit(limit) - end - end - end -end diff --git a/lib/tasks/fuzzy_match.rake b/lib/tasks/fuzzy_match.rake index fedddb97..30d61918 100644 --- a/lib/tasks/fuzzy_match.rake +++ b/lib/tasks/fuzzy_match.rake @@ -22,7 +22,7 @@ namespace :name do puts "Searching for fuzzy matches to: '#{query}'\n\n" - matches = Name.fuzzy_match(query, method: method) + matches = Name.fuzzy_search(query, method: method) if matches.any? 
matches.each do |match| puts "- Match: #{match.name} (Score: #{match.score})" @@ -32,4 +32,3 @@ namespace :name do end end end - diff --git a/test/fixtures/names.yml b/test/fixtures/names.yml index 258c4d6c..bd79188b 100644 --- a/test/fixtures/names.yml +++ b/test/fixtures/names.yml @@ -2,3 +2,28 @@ unregistered: name: MyString + +escherichia: + name: Escherichia + rank: genus + status: 15 # SeqCode + +escherichia_coli: + name: Escherichia coli + rank: species + status: 15 # SeqCode + +escherichia_colie: + name: Escherichia colie + rank: species + status: 5 # draft + +bacillus: + name: Bacillus + rank: genus + status: 15 # SeqCode + +bacillus_subtilis: + name: Bacillus subtilis + rank: species + status: 20 # ICNP diff --git a/test/models/name/fuzzy_search_test.rb b/test/models/name/fuzzy_search_test.rb new file mode 100644 index 00000000..15634dda --- /dev/null +++ b/test/models/name/fuzzy_search_test.rb @@ -0,0 +1,53 @@ +require 'test_helper' + +class Name::FuzzySearchTest < ActiveSupport::TestCase + test 'finds similar names by similarity' do + matches = Name.fuzzy_search('Escherichia coli').to_a + + assert_equal names(:escherichia_coli), matches.first + assert_equal 1.0, matches.first.score + end + + test 'uses all valid names by default' do + matches = Name.fuzzy_search('Escherichia colie', threshold: 0.5).to_a + + assert_includes matches, names(:escherichia_coli) + assert_not_includes matches, names(:escherichia_colie) + end + + test 'supports levenshtein searches' do + matches = Name.fuzzy_search( + 'Bacillus subtiliss', method: :levenshtein, threshold: 1 + ).to_a + + assert_equal [names(:bacillus_subtilis)], matches + assert_equal 1, matches.first.score + end + + test 'supports genus selections' do + matches = Name.fuzzy_search( + 'Bacilus', + method: :levenshtein, + threshold: 2, + selection: :valid_genera + ).to_a + + assert_equal [names(:bacillus)], matches + end + + test 'limits matches' do + matches = Name.fuzzy_search( + 'Escherichia', + threshold: 0, 
+ limit: 1 + ).to_a + + assert_equal 1, matches.size + end + + test 'raises for unsupported methods' do + assert_raises(ArgumentError) do + Name.fuzzy_search('Escherichia coli', method: :unknown).to_a + end + end +end