From e6384cdf047ceddf08b5287cad550cb01d5db92e Mon Sep 17 00:00:00 2001
From: Dana Sherson
Date: Sat, 15 Aug 2020 12:23:51 +1000
Subject: [PATCH] Wordlist set

---
 .spellr_wordlists/english.txt   |  1 +
 lib/spellr/check.rb             | 11 ++--
 lib/spellr/check_interactive.rb | 12 ++---
 lib/spellr/config.rb            |  6 +--
 lib/spellr/line_tokenizer.rb    |  4 +-
 lib/spellr/tokenizer.rb         |  4 +-
 lib/spellr/wordlist.rb          |  2 +
 lib/spellr/wordlist_set.rb      | 89 +++++++++++++++++++++++++++++++++
 spellr.gemspec                  |  2 +
 9 files changed, 109 insertions(+), 22 deletions(-)
 create mode 100644 lib/spellr/wordlist_set.rb

diff --git a/.spellr_wordlists/english.txt b/.spellr_wordlists/english.txt
index dfb0f3e..5763510 100644
--- a/.spellr_wordlists/english.txt
+++ b/.spellr_wordlists/english.txt
@@ -23,6 +23,7 @@ css
 ctrl
 customisations
 cyclomatic
+damerau
 def
 desaturate
 dict
diff --git a/lib/spellr/check.rb b/lib/spellr/check.rb
index 83321d9..62615e2 100644
--- a/lib/spellr/check.rb
+++ b/lib/spellr/check.rb
@@ -3,6 +3,7 @@
 require_relative '../spellr'
 require_relative 'tokenizer'
 require_relative 'string_format'
+require_relative 'wordlist_set'
 
 module Spellr
   class Check
@@ -38,18 +39,12 @@ def check_and_count_file(file)
       reporter.warn "Skipped unreadable file: #{aqua file.relative_path}"
     end
 
-    def check_file(file, start_at = nil, found_word_proc = wordlist_proc_for(file))
+    def check_file(file, start_at = nil, wordlist_set = ::Spellr::WordlistSet.for_file(file))
       Spellr::Tokenizer.new(file, start_at: start_at)
-        .each_token(skip_term_proc: found_word_proc) do |token|
+        .each_token(skip_if_included: wordlist_set) do |token|
         reporter.call(token)
         reporter.output.exit_code = 1
       end
     end
-
-    def wordlist_proc_for(file)
-      wordlists = Spellr.config.wordlists_for(file).sort_by(&:length).reverse
-
-      ->(term) { wordlists.any? { |w| w.include?(term) } }
-    end
   end
 end
diff --git a/lib/spellr/check_interactive.rb b/lib/spellr/check_interactive.rb
index 78d8589..30c482a 100644
--- a/lib/spellr/check_interactive.rb
+++ b/lib/spellr/check_interactive.rb
@@ -7,18 +7,18 @@ module Spellr
   class CheckInteractive < Check
     private
 
-    def check_file_from_restart(file, restart_token, wordlist_proc)
+    def check_file_from_restart(file, restart_token, wordlist_set)
       # new wordlist cache when adding a word
-      wordlist_proc = wordlist_proc_for(file) unless restart_token.replacement
-      check_file(file, restart_token.location, wordlist_proc)
+      wordlist_set = Spellr::WordlistSet.for_file(file) unless restart_token.replacement
+      check_file(file, restart_token.location, wordlist_set)
     end
 
-    def check_file(file, start_at = nil, wordlist_proc = wordlist_proc_for(file))
+    def check_file(file, start_at = nil, wordlist_set = Spellr::WordlistSet.for_file(file))
       restart_token = catch(:check_file_from) do
-        super(file, start_at, wordlist_proc)
+        super(file, start_at, wordlist_set)
         nil
       end
-      check_file_from_restart(file, restart_token, wordlist_proc) if restart_token
+      check_file_from_restart(file, restart_token, wordlist_set) if restart_token
     end
   end
 end
diff --git a/lib/spellr/config.rb b/lib/spellr/config.rb
index cbe4978..3f309d5 100644
--- a/lib/spellr/config.rb
+++ b/lib/spellr/config.rb
@@ -5,6 +5,7 @@
 require_relative 'language'
 require_relative 'config_validator'
 require_relative 'output'
+require_relative 'wordlist_set'
 
 require 'pathname'
 
@@ -51,10 +52,6 @@ def languages_for(file)
       languages.select { |l| l.matches?(file) }
     end
 
-    def wordlists_for(file)
-      languages_for(file).flat_map(&:wordlists)
-    end
-
     def config_file=(value)
       reset!
       @config = ConfigLoader.new(value)
@@ -86,6 +83,7 @@ def reset! # rubocop:disable Metrics/MethodLength
       remove_instance_variable(:@word_minimum_length) if defined?(@word_minimum_length)
       remove_instance_variable(:@key_heuristic_weight) if defined?(@key_heuristic_weight)
       remove_instance_variable(:@key_minimum_length) if defined?(@key_minimum_length)
+      ::Spellr::WordlistSet.clear_cache
     end
 
     private
diff --git a/lib/spellr/line_tokenizer.rb b/lib/spellr/line_tokenizer.rb
index 8cfad02..ec47fb7 100644
--- a/lib/spellr/line_tokenizer.rb
+++ b/lib/spellr/line_tokenizer.rb
@@ -37,11 +37,11 @@ def each_term
       end
     end
 
-    def each_token(skip_term_proc: nil) # rubocop:disable Metrics/MethodLength
+    def each_token(skip_if_included: nil) # rubocop:disable Metrics/MethodLength
       until eos?
         term = next_term
         next unless term
-        next if @disabled || skip_term_proc&.call(term)
+        next if @disabled || skip_if_included&.include?(term)
 
         yield Token.new(term, line: line, location: column_location(term))
       end
diff --git a/lib/spellr/tokenizer.rb b/lib/spellr/tokenizer.rb
index e0a0f10..8af1770 100644
--- a/lib/spellr/tokenizer.rb
+++ b/lib/spellr/tokenizer.rb
@@ -34,9 +34,9 @@ def each_term(&block)
       file.close
     end
 
-    def each_token(skip_term_proc: nil) # rubocop:disable Metrics/MethodLength
+    def each_token(skip_if_included: nil) # rubocop:disable Metrics/MethodLength
       each_line_with_stats do |line, line_number, char_offset, byte_offset|
-        prepare_tokenizer_for_line(line)&.each_token(skip_term_proc: skip_term_proc) do |token|
+        prepare_tokenizer_for_line(line)&.each_token(skip_if_included: skip_if_included) do |token|
           token.line = prepare_line(line, line_number, char_offset, byte_offset)
 
           yield token
diff --git a/lib/spellr/wordlist.rb b/lib/spellr/wordlist.rb
index 97f8102..f0146cb 100644
--- a/lib/spellr/wordlist.rb
+++ b/lib/spellr/wordlist.rb
@@ -3,6 +3,7 @@
 require 'pathname'
 require_relative '../spellr'
 require_relative 'token' # for spellr_normalize
+require_relative 'wordlist_set'
 
 module Spellr
   class Wordlist
@@ -72,6 +73,7 @@ def touch
       @path.dirname.mkpath
       @path.write('')
       clear_cache
+      ::Spellr::WordlistSet.clear_cache
     end
 
     def length
diff --git a/lib/spellr/wordlist_set.rb b/lib/spellr/wordlist_set.rb
new file mode 100644
index 0000000..9413823
--- /dev/null
+++ b/lib/spellr/wordlist_set.rb
@@ -0,0 +1,89 @@
+# frozen_string_literal: true
+
+module Spellr
+  class WordlistSet
+    def self.for_file(file)
+      languages_for_file = Spellr.config.languages_for(file)
+      cache.fetch(languages_for_file) do
+        cache[languages_for_file] = new(languages_for_file)
+      end
+    end
+
+    def self.cache
+      @cache ||= {}
+    end
+
+    def self.clear_cache
+      @cache = nil # reset the ivar that .cache memoizes, so the next lookup rebuilds the sets
+    end
+
+    def initialize(languages)
+      @wordlists = languages.flat_map(&:wordlists)
+      @wordlists.sort_by!(&:length)
+      @wordlists.reverse! # largest #length first
+    end
+
+    def include?(term)
+      @wordlists.any? { |w| w.include?(term) }
+    end
+
+    # this is the same correction algorithm as ruby's DidYouMean::SpellChecker.correct
+    # but with early returns and using gems with c extensions
+    Suggestion = Struct.new(:word, :jw, :dl)
+    def suggestions_unsorted(input) # rubocop:disable Metrics/MethodLength
+      require 'jaro_winkler'
+      require 'damerau-levenshtein'
+
+      input = input.spellr_normalize
+      threshold = 0.77 # words with a lower JaroWinkler.distance result are skipped
+      suggestions = []
+
+      @wordlists.each do |wordlist|
+        wordlist.words.each do |word|
+          jw = JaroWinkler.distance(word, input)
+          next unless jw >= threshold
+
+          dl = DamerauLevenshtein.distance(word, input)
+          suggestions << Suggestion.new(word, jw, dl)
+        end
+      end
+
+      suggestions.sort_by!(&:jw)
+      suggestions.reverse!
+    end
+
+    def suggestions(input) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
+      input = input.spellr_normalize
+      suggestions = suggestions_unsorted(input)
+      # correct mistypes
+      threshold = (input.length * 0.25).ceil
+      corrections = suggestions.select { |suggestion| suggestion.dl <= threshold }
+
+      return corrections unless corrections.empty?
+
+      # Correct misspells
+      suggestions.select do |suggestion|
+        length = input.length < suggestion.word.length ? input.length : suggestion.word.length
+
+        suggestion.dl < length
+      end
+    end
+
+    def suggestion(input) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
+      input = input.spellr_normalize
+      suggestions = suggestions_unsorted(input)
+      # correct mistypes
+      threshold = (input.length * 0.25).ceil
+      correction = suggestions.find { |suggestion| suggestion.dl <= threshold }
+
+      return correction.word if correction
+
+      # Correct misspells
+      suggestions.find do |suggestion|
+        length = input.length < suggestion.word.length ? input.length : suggestion.word.length
+
+        suggestion.dl < length
+      end&.word
+    end
+  end
+end
diff --git a/spellr.gemspec b/spellr.gemspec
index de499ab..5c54981 100644
--- a/spellr.gemspec
+++ b/spellr.gemspec
@@ -47,6 +47,8 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'tty_string', '>= 0.2.1'
   spec.add_development_dependency 'webmock', '~> 3.8'
 
+  spec.add_dependency 'damerau-levenshtein'
   spec.add_dependency 'fast_ignore', '>= 0.11.0'
+  spec.add_dependency 'jaro_winkler'
   spec.add_dependency 'parallel', '~> 1.0'
 end