Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 6 additions & 56 deletions app/models/name.rb
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ class Name < ApplicationRecord
include Name::Inferences
include Name::Network
include Name::Wiki
include Name::FuzzySearch

attr_accessor :only_display
attr_accessor :nomenclatural_type_entry
Expand Down Expand Up @@ -191,57 +192,6 @@ def all_public
where(status: public_status, redirect: nil)
end

##
# Performs a fuzzy search for names similar to +query+. The search
# parameters are:
# - method: One of +:similarity+ (default) or +:levenshtein+
# - threshold: Limit to find matches,
# by default 0.7 (similarity) or 2 (levenshtein)
# - limit: Maximum number of results to return
# - selection: Pre-selection of names included in the search. One of:
# - all_valid: (default) All validly published names
# - all_public: All publicly visible names
# - valid_genera: All validly published genus names
# - public_genera: All publicly visible genus names
# - An ActiveRecord query on the +names+ table
def fuzzy_match(
  query, method: :similarity, threshold: nil, limit: 10,
  selection: :all_valid
)
  # Only PostgreSQL is queried; on any other adapter the caller gets nil
  return if ActiveRecord::Base.connection.adapter_name != 'PostgreSQL'

  # Expand the shorthand symbols; any other value is used exactly as given
  scope =
    case selection
    when :all_valid then all_valid
    when :all_public then all_public
    when :valid_genera then all_valid.where(rank: :genus)
    when :public_genera then all_public.where(rank: :genus)
    else selection
    end

  # Quoted up front so the literal can be embedded in the SELECT list below
  clean_query = ActiveRecord::Base.connection.quote(query)

  case method.to_sym
  when :similarity
    # Higher score means closer match: keep scores above the cutoff
    cutoff = threshold || 0.7
    scored = scope.select("id, name, similarity(name, #{clean_query}) AS score")
    scored.where('similarity(name, ?) > ?', query, cutoff)
          .order('score DESC')
          .limit(limit)
  when :levenshtein
    # Lower score means closer match: keep distances up to the cutoff
    cutoff = threshold || 2
    scored = scope.select("id, name, levenshtein(name, #{clean_query}) AS score")
    scored.where('levenshtein(name, ?) <= ?', query, cutoff)
          .order('score ASC')
          .limit(limit)
  else
    raise ArgumentError, "Unsupported fuzzy match method: #{method}"
  end
end

# ============ --- CLASS > ETYMOLOGY --- ============

def etymology_particles
Expand Down Expand Up @@ -465,7 +415,7 @@ def abbr_base_name
# which is only possible (under the SeqCode) for genera and species but rank
# is not evaluated. Use +is_type_species?+ to test for the rank first and
# avoid unnecessary database queries
#
#
# TODO - Performance issue
# This loads the full parent, which causes N+1 issues when simply rendering
# a list of names
Expand Down Expand Up @@ -683,7 +633,7 @@ def is_variant?(alt_spelling)
##
# This method always returns +nil+ for names that are not at (inferred) rank
# of genus or species
#
#
# Find names similar to the current one (using the canonical spelling
# from +base_name+) with Levenshtein ≤ 3, considering a search space
# defined by the taxonomic rank and +among+:
Expand Down Expand Up @@ -716,7 +666,7 @@ def similar_names(among = :valid)
end

selection = selection.where.not(id: id)
self.class.fuzzy_match(
self.class.fuzzy_search(
base_name, method: :levenshtein, selection: selection
)
end
Expand Down Expand Up @@ -858,7 +808,7 @@ def can_edit?(user)
return true if user.curator?
return true if draft? && user?(user)
false
end
end

def can_edit_validated?(user)
return false if only_display
Expand Down Expand Up @@ -1110,7 +1060,7 @@ def update_type_genome(new_accession, new_database = nil)
##
# Returns the expected type of type as the String representation of the
# expected class
#
#
# Note that this differs from +expected_type_rank+ in that the current
# function uses +inferred_rank+ regardless of defined +rank+
def expected_type_type
Expand Down
79 changes: 79 additions & 0 deletions app/models/name/fuzzy_search.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
##
# Class-level fuzzy search over the +name+ column.
#
# The generated SQL calls the +similarity()+ and +levenshtein()+ database
# functions. NOTE(review): these are presumably provided by the PostgreSQL
# pg_trgm and fuzzystrmatch extensions; confirm both are enabled.
module Name::FuzzySearch
  extend ActiveSupport::Concern

  class_methods do
    # Performs a fuzzy search for names similar to the given query.
    #
    # @param query [String] The search query.
    # @param method [Symbol, String] The search method (:similarity or
    #   :levenshtein).
    # @param threshold [Float, Integer, nil] The threshold for matching.
    #   Defaults to 0.7 (similarity, exclusive lower bound on the score) or
    #   2 (levenshtein, inclusive upper bound on the distance).
    # @param limit [Integer] The maximum number of results to return.
    # @param selection [Symbol, ActiveRecord::Relation] The names to search:
    #   :all_valid, :all_public, :valid_genera, :public_genera, or a custom
    #   relation.
    # @return [ActiveRecord::Relation, nil] The matching names, selecting only
    #   +id+, +name+ and the computed +score+, best match first; +nil+ when
    #   the connection adapter is not PostgreSQL.
    # @raise [ArgumentError] If +method+ or +selection+ is not supported.
    def fuzzy_search(
      query,
      method: :similarity,
      threshold: nil,
      limit: 10,
      selection: :all_valid
    )
      return unless ActiveRecord::Base.connection.adapter_name == 'PostgreSQL'

      # Resolved before dispatching so that an invalid selection is reported
      # even when the method is also invalid
      scope = resolve_selection(selection)

      case method.to_sym
      when :similarity
        fuzzy_similarity_search(scope, query, threshold || 0.7, limit)
      when :levenshtein
        fuzzy_levenshtein_search(scope, query, threshold || 2, limit)
      else
        raise ArgumentError, "Unsupported fuzzy match method: #{method}"
      end
    end

    private

    # Translates a selection shorthand into a relation. The scopes are built
    # on the receiving class (implicit +self+) rather than on a hard-coded
    # constant, so the concern follows whichever model extends it.
    def resolve_selection(selection)
      case selection
      when :all_valid
        all_valid
      when :all_public
        all_public
      when :valid_genera
        all_valid.where(rank: :genus)
      when :public_genera
        all_public.where(rank: :genus)
      when ActiveRecord::Relation
        selection
      else
        raise ArgumentError, "Unsupported selection: #{selection}"
      end
    end

    # Names whose +similarity()+ to +query+ is strictly above +threshold+,
    # highest score first.
    def fuzzy_similarity_search(selection, query, threshold, limit)
      selection
        .select(
          # The query is bound through sanitize_sql_array, not interpolated
          sanitize_sql_array(
            ['id, name, similarity(name, ?) AS score', query]
          )
        )
        .where('similarity(name, ?) > ?', query, threshold)
        .order('score DESC')
        .limit(limit)
    end

    # Names whose +levenshtein()+ distance to +query+ is at most +threshold+,
    # smallest distance first.
    def fuzzy_levenshtein_search(selection, query, threshold, limit)
      selection
        .select(
          # The query is bound through sanitize_sql_array, not interpolated
          sanitize_sql_array(
            ['id, name, levenshtein(name, ?) AS score', query]
          )
        )
        .where('levenshtein(name, ?) <= ?', query, threshold)
        .order('score ASC')
        .limit(limit)
    end
  end
end
73 changes: 0 additions & 73 deletions app/services/name/fuzzy_search.rb

This file was deleted.

3 changes: 1 addition & 2 deletions lib/tasks/fuzzy_match.rake
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ namespace :name do

puts "Searching for fuzzy matches to: '#{query}'\n\n"

matches = Name.fuzzy_match(query, method: method)
matches = Name.fuzzy_search(query, method: method)
if matches.any?
matches.each do |match|
puts "- Match: #{match.name} (Score: #{match.score})"
Expand All @@ -32,4 +32,3 @@ namespace :name do
end
end
end

25 changes: 25 additions & 0 deletions test/fixtures/names.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,28 @@

unregistered:
name: MyString

escherichia:
name: Escherichia
rank: genus
status: 15 # SeqCode

escherichia_coli:
name: Escherichia coli
rank: species
status: 15 # SeqCode

escherichia_colie:
name: Escherichia colie
rank: species
status: 5 # draft

bacillus:
name: Bacillus
rank: genus
status: 15 # SeqCode

bacillus_subtilis:
name: Bacillus subtilis
rank: species
status: 20 # ICNP
53 changes: 53 additions & 0 deletions test/models/name/fuzzy_search_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
require 'test_helper'

# Exercises Name.fuzzy_search against the +names+ fixtures.
#
# Name.fuzzy_search returns +nil+ unless the connection adapter is
# PostgreSQL, so the whole suite is skipped on any other adapter instead of
# failing on +nil+ results.
class Name::FuzzySearchTest < ActiveSupport::TestCase
  setup do
    adapter = ActiveRecord::Base.connection.adapter_name
    unless adapter == 'PostgreSQL'
      skip "fuzzy_search requires PostgreSQL (current adapter: #{adapter})"
    end
  end

  test 'finds similar names by similarity' do
    matches = Name.fuzzy_search('Escherichia coli').to_a

    assert_equal names(:escherichia_coli), matches.first
    # The score is a database float: compare with a tolerance
    assert_in_delta 1.0, matches.first.score, 1e-6
  end

  test 'uses all valid names by default' do
    matches = Name.fuzzy_search('Escherichia colie', threshold: 0.5).to_a

    assert_includes matches, names(:escherichia_coli)
    # The exact-spelling fixture is a draft, so it must be filtered out
    assert_not_includes matches, names(:escherichia_colie)
  end

  test 'supports levenshtein searches' do
    matches = Name.fuzzy_search(
      'Bacillus subtiliss', method: :levenshtein, threshold: 1
    ).to_a

    assert_equal [names(:bacillus_subtilis)], matches
    assert_equal 1, matches.first.score
  end

  test 'supports genus selections' do
    matches = Name.fuzzy_search(
      'Bacilus',
      method: :levenshtein,
      threshold: 2,
      selection: :valid_genera
    ).to_a

    assert_equal [names(:bacillus)], matches
  end

  test 'limits matches' do
    matches = Name.fuzzy_search(
      'Escherichia',
      threshold: 0,
      limit: 1
    ).to_a

    assert_equal 1, matches.size
  end

  test 'raises for unsupported methods' do
    assert_raises(ArgumentError) do
      Name.fuzzy_search('Escherichia coli', method: :unknown).to_a
    end
  end

  test 'raises for unsupported selections' do
    assert_raises(ArgumentError) do
      Name.fuzzy_search('Escherichia coli', selection: :unknown)
    end
  end
end
Loading