Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wordlist set #69

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .spellr_wordlists/english.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ css
ctrl
customisations
cyclomatic
damerau
def
desaturate
dict
Expand Down
11 changes: 3 additions & 8 deletions lib/spellr/check.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
require_relative '../spellr'
require_relative 'tokenizer'
require_relative 'string_format'
require_relative 'wordlist_set'

module Spellr
class Check
Expand Down Expand Up @@ -38,18 +39,12 @@ def check_and_count_file(file)
reporter.warn "Skipped unreadable file: #{aqua file.relative_path}"
end

def check_file(file, start_at = nil, found_word_proc = wordlist_proc_for(file))
def check_file(file, start_at = nil, wordlist_set = ::Spellr::WordlistSet.for_file(file))
Spellr::Tokenizer.new(file, start_at: start_at)
.each_token(skip_term_proc: found_word_proc) do |token|
.each_token(skip_if_included: wordlist_set) do |token|
reporter.call(token)
reporter.output.exit_code = 1
end
end

def wordlist_proc_for(file)
wordlists = Spellr.config.wordlists_for(file).sort_by(&:length).reverse

->(term) { wordlists.any? { |w| w.include?(term) } }
end
end
end
12 changes: 6 additions & 6 deletions lib/spellr/check_interactive.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,18 @@ module Spellr
class CheckInteractive < Check
private

def check_file_from_restart(file, restart_token, wordlist_proc)
def check_file_from_restart(file, restart_token, wordlist_set)
# new wordlist cache when adding a word
wordlist_proc = wordlist_proc_for(file) unless restart_token.replacement
check_file(file, restart_token.location, wordlist_proc)
wordlist_set = Spellr::WordlistSet.for_file(file) unless restart_token.replacement
check_file(file, restart_token.location, wordlist_set)
end

def check_file(file, start_at = nil, wordlist_proc = wordlist_proc_for(file))
def check_file(file, start_at = nil, wordlist_set = Spellr::WordlistSet.for_file(file))
restart_token = catch(:check_file_from) do
super(file, start_at, wordlist_proc)
super(file, start_at, wordlist_set)
nil
end
check_file_from_restart(file, restart_token, wordlist_proc) if restart_token
check_file_from_restart(file, restart_token, wordlist_set) if restart_token
end
end
end
6 changes: 2 additions & 4 deletions lib/spellr/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
require_relative 'language'
require_relative 'config_validator'
require_relative 'output'
require_relative 'wordlist_set'

require 'pathname'

Expand Down Expand Up @@ -51,10 +52,6 @@ def languages_for(file)
languages.select { |l| l.matches?(file) }
end

def wordlists_for(file)
languages_for(file).flat_map(&:wordlists)
end

def config_file=(value)
reset!
@config = ConfigLoader.new(value)
Expand Down Expand Up @@ -86,6 +83,7 @@ def reset! # rubocop:disable Metrics/MethodLength
remove_instance_variable(:@word_minimum_length) if defined?(@word_minimum_length)
remove_instance_variable(:@key_heuristic_weight) if defined?(@key_heuristic_weight)
remove_instance_variable(:@key_minimum_length) if defined?(@key_minimum_length)
::Spellr::WordlistSet.clear_cache
end

private
Expand Down
4 changes: 2 additions & 2 deletions lib/spellr/line_tokenizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ def each_term
end
end

def each_token(skip_term_proc: nil) # rubocop:disable Metrics/MethodLength
def each_token(skip_if_included: nil) # rubocop:disable Metrics/MethodLength
until eos?
term = next_term
next unless term
next if @disabled || skip_term_proc&.call(term)
next if @disabled || skip_if_included&.include?(term)

yield Token.new(term, line: line, location: column_location(term))
end
Expand Down
4 changes: 2 additions & 2 deletions lib/spellr/tokenizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ def each_term(&block)
file.close
end

def each_token(skip_term_proc: nil) # rubocop:disable Metrics/MethodLength
def each_token(skip_if_included: nil) # rubocop:disable Metrics/MethodLength
each_line_with_stats do |line, line_number, char_offset, byte_offset|
prepare_tokenizer_for_line(line)&.each_token(skip_term_proc: skip_term_proc) do |token|
prepare_tokenizer_for_line(line)&.each_token(skip_if_included: skip_if_included) do |token|
token.line = prepare_line(line, line_number, char_offset, byte_offset)

yield token
Expand Down
2 changes: 2 additions & 0 deletions lib/spellr/wordlist.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
require 'pathname'
require_relative '../spellr'
require_relative 'token' # for spellr_normalize
require_relative 'wordlist_set'

module Spellr
class Wordlist
Expand Down Expand Up @@ -72,6 +73,7 @@ def touch
@path.dirname.mkpath
@path.write('')
clear_cache
::Spellr::WordlistSet.clear_cache
end

def length
Expand Down
89 changes: 89 additions & 0 deletions lib/spellr/wordlist_set.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# frozen_string_literal: true

module Spellr
  # The wordlists of a group of languages, queried as a single unit.
  #
  # Instances are memoized per list of languages by .for_file; callers that
  # change the configuration or create a wordlist are expected to call
  # .clear_cache so the next lookup rebuilds the set.
  class WordlistSet
    # A candidate correction: the wordlist word together with its
    # JaroWinkler.distance (jw) and DamerauLevenshtein.distance (dl) to the input.
    Suggestion = Struct.new(:word, :jw, :dl)

    # Returns the WordlistSet for the languages that match +file+, building and
    # memoizing it the first time that particular list of languages is seen.
    #
    # @param file the file being checked, passed to Spellr.config.languages_for
    # @return [WordlistSet]
    def self.for_file(file)
      languages_for_file = Spellr.config.languages_for(file)
      cache[languages_for_file] ||= new(languages_for_file)
    end

    # The memo used by .for_file, keyed by the list of languages.
    #
    # @return [Hash]
    def self.cache
      @cache ||= {}
    end

    # Drops every memoized set.
    #
    # This must reset the same instance variable that .cache memoizes into
    # (@cache); resetting any other variable leaves stale sets in place.
    #
    # @return [nil]
    def self.clear_cache
      @cache = nil
    end

    # @param languages [Array] objects responding to #wordlists
    def initialize(languages)
      @wordlists = languages.flat_map(&:wordlists)
      # Longest wordlist first; presumably so the list most likely to contain
      # a term is consulted first by #include? -- TODO confirm intent.
      @wordlists.sort_by!(&:length)
      @wordlists.reverse!
    end

    # @param term the term to look up
    # @return [Boolean] true when any wordlist in the set includes +term+
    def include?(term)
      @wordlists.any? { |wordlist| wordlist.include?(term) }
    end

    # this is the same correction algorithm as ruby's DidYouMean::SpellChecker.correct
    # but with early returns and using gems with c extensions
    #
    # Collects every word whose Jaro-Winkler score against the normalized input
    # is at least 0.77, ordered by that score from highest to lowest.
    #
    # @param input [String] the term to find candidates for
    # @return [Array<Suggestion>]
    def suggestions_unsorted(input) # rubocop:disable Metrics/MethodLength
      # Deliberately lazy: the C-extension gems are only loaded when
      # suggestions are actually requested.
      require 'jaro_winkler'
      require 'damerau-levenshtein'

      input = input.spellr_normalize
      threshold = 0.77
      suggestions = []

      @wordlists.each do |wordlist|
        wordlist.words.each do |word|
          jw = JaroWinkler.distance(word, input)
          next unless jw >= threshold

          # The edit distance is only computed for words that passed the jw filter.
          dl = DamerauLevenshtein.distance(word, input)
          suggestions << Suggestion.new(word, jw, dl)
        end
      end

      suggestions.sort_by!(&:jw)
      suggestions.reverse!
    end

    # All plausible corrections for +input+.
    #
    # Candidates within the mistype threshold are preferred; only when there
    # are none does it fall back to the looser misspelling rule.
    #
    # @param input [String]
    # @return [Array<Suggestion>]
    def suggestions(input)
      # Normalized here as well because the threshold depends on the
      # normalized length.
      input = input.spellr_normalize
      candidates = suggestions_unsorted(input)

      # correct mistypes
      threshold = mistype_threshold(input)
      corrections = candidates.select { |candidate| candidate.dl <= threshold }
      return corrections unless corrections.empty?

      # Correct misspells
      candidates.select { |candidate| misspelling_of?(candidate, input) }
    end

    # The single best correction for +input+, using the same rules as
    # #suggestions but stopping at the first match.
    #
    # @param input [String]
    # @return [String, nil] the suggested word, or nil when nothing qualifies
    def suggestion(input)
      input = input.spellr_normalize
      candidates = suggestions_unsorted(input)

      # correct mistypes
      threshold = mistype_threshold(input)
      correction = candidates.find { |candidate| candidate.dl <= threshold }
      return correction.word if correction

      # Correct misspells
      candidates.find { |candidate| misspelling_of?(candidate, input) }&.word
    end

    private

    # Largest edit distance still treated as a mistype: a quarter of the
    # input length, rounded up.
    def mistype_threshold(input)
      (input.length * 0.25).ceil
    end

    # True when the candidate's edit distance is smaller than the shorter of
    # the two words.
    def misspelling_of?(candidate, input)
      shorter_length = [input.length, candidate.word.length].min

      candidate.dl < shorter_length
    end
  end
end
2 changes: 2 additions & 0 deletions spellr.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ Gem::Specification.new do |spec|
spec.add_development_dependency 'tty_string', '>= 0.2.1'
spec.add_development_dependency 'webmock', '~> 3.8'

spec.add_dependency 'damerau-levenshtein'
spec.add_dependency 'fast_ignore', '>= 0.11.0'
spec.add_dependency 'jaro_winkler'
spec.add_dependency 'parallel', '~> 1.0'
end