In [1]:
from gensim.models import KeyedVectors

from spellchecking.fasttext import SpellCheckerFastTextSimilarity
from utils.database import read_vocabulary
from utils.tester import test_on_datasets



In [2]:
from os.path import join

from utils.database import read_vocabulary
from utils.tester import test_on_datasets
from utils.paths import PATH_PROJECT

# Input locations, all resolved relative to the project root:
# the word-vector file, the text the vocabulary is read from, and the test sets.
PATH_MODEL = join(PATH_PROJECT, 'models/wiki-news-300d-1M-subword.vec')
PATH_VOCABULARY = join(PATH_PROJECT, 'data/big.txt')
PATHS_DATASETS = [
    join(PATH_PROJECT, suffix)
    for suffix in ('data/spell-testset1.txt', 'data/spell-testset2.txt')
]

vocabulary = read_vocabulary(PATH_VOCABULARY)

In [3]:
%%time
# Load the pretrained word vectors stored in word2vec format at PATH_MODEL.
# `limit` caps how many vectors are read from the file (see gensim's
# `load_word2vec_format`); the recorded wall time shows this cell is slow.
model = KeyedVectors.load_word2vec_format(PATH_MODEL, limit=999999)

Wall time: 4min 26s


In [16]:
"""Based on the article by Peter Norvig:
http://norvig.com/spell-correct.html
"""

from string import ascii_lowercase as LETTERS_LOWERCASE


class SpellCheckerEditsWithFastTextSimilarity:
    """Edit-based spell checker (after Peter Norvig) re-ranked by embedding similarity.

    Candidates are vocabulary words reachable from the input by one edit
    (delete, transpose, replace, insert), or by two edits when no one-edit
    candidate exists. If the input word is known to the embedding model, each
    candidate is scored by ``frequency(candidate) * similarity(word, candidate)``;
    otherwise candidates are ranked by frequency alone.

    NOTE(review): ``model.similarity`` is presumably a cosine similarity and may
    therefore be negative; a negative value inverts the frequency weighting for
    that candidate. Confirm this is acceptable for the models in use.
    """

    def __init__(self, vocabulary, model):
        """Store relative word frequencies and the embedding model.

        :param vocabulary: mapping ``word -> occurrence count``.
        :param model: word-vector model exposing ``similarity(w1, w2)`` and a
            word index as ``key_to_index`` (gensim >= 4) or ``vocab`` (gensim < 4).
        """
        # Normalise by the total number of occurrences (not by the number of
        # distinct words) so the stored values are relative frequencies.
        # `or 1` avoids a ZeroDivisionError when every count is zero.
        total = sum(vocabulary[word] for word in vocabulary) or 1
        self._vocabulary = {word: vocabulary[word] / total for word in vocabulary}
        self._model = model

    def _is_in_model(self, word):
        """Return True if ``word`` has a vector in the embedding model."""
        # gensim 4 removed `KeyedVectors.vocab` in favour of `key_to_index`;
        # prefer the new attribute and fall back to the old one.
        index = getattr(self._model, 'key_to_index', None)
        if index is None:
            index = self._model.vocab
        return word in index

    def filter_words_by_vocabulary(self, words):
        """Return the set of ``words`` that are present in the vocabulary."""
        return {word for word in words if word in self._vocabulary}

    def generate_spelling_candidates(self, word):
        """Return vocabulary words within one edit of ``word``, else within two.

        The result may be empty when nothing in the vocabulary is that close.
        """
        edits = self.generate_edits(word)
        result = self.filter_words_by_vocabulary(edits)
        if not result:
            # Nothing one edit away: widen the search to edits of edits.
            # Generated lazily because this expansion is large.
            edits = (edit2 for edit in edits for edit2 in self.generate_edits(edit))
            result = self.filter_words_by_vocabulary(edits)
        return result

    def generate_edits(self, word):
        """Return every string one edit away from ``word``.

        Only lowercase ASCII letters are used for replacements and insertions.
        The result includes ``word`` itself (a letter replaced by itself).
        """
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = {prefix + suffix[1:] for prefix, suffix in splits if suffix}
        transposes = {prefix + suffix[1] + suffix[0] + suffix[2:]
                      for prefix, suffix in splits if len(suffix) > 1}
        # `if suffix`: there is no letter to replace at the end-of-word split;
        # without the guard this would only duplicate the end-of-word inserts.
        replaces = {prefix + letter + suffix[1:]
                    for prefix, suffix in splits if suffix
                    for letter in LETTERS_LOWERCASE}
        inserts = {prefix + letter + suffix
                   for prefix, suffix in splits
                   for letter in LETTERS_LOWERCASE}
        return deletes | transposes | replaces | inserts

    def get_word_similarity(self, word_base, word_similar):
        """Return the model similarity of the two words, or 0 if either is unknown."""
        if not (self._is_in_model(word_base) and self._is_in_model(word_similar)):
            return 0
        return self._model.similarity(word_base, word_similar)

    def correct(self, word):
        """Return the best correction for ``word``, or ``word`` itself if none is found."""
        candidates = self.generate_spelling_candidates(word)

        def score(candidate):
            return self._vocabulary[candidate] * self.get_word_similarity(word, candidate)

        result = None
        if self._is_in_model(word):
            result = max(candidates, key=score, default=None)

        if result is None:
            # Word unknown to the model, or no candidates at all:
            # rank by frequency only and fall back to the input word.
            result = max(candidates, key=self._vocabulary.__getitem__, default=word)
        return result

In [17]:
# Build the checker from the vocabulary read earlier and the loaded word vectors.
spellchecker = SpellCheckerEditsWithFastTextSimilarity(vocabulary, model)

In [18]:
# Quick check on a single word (presumably a misspelling of 'unique').
# NOTE(review): the recorded output is 'unite', so the frequency * similarity
# score does not select 'unique' here — worth investigating.
spellchecker.correct('uniqe')

'unite'

In [19]:
from utils.tester import test_on_datasets

# Evaluate the checker on every file in PATHS_DATASETS; per the recorded output,
# the helper reports word count, accuracy and elapsed time for each dataset.
test_on_datasets(spellchecker, PATHS_DATASETS)

base: D:\My Docs\Studying\mipt-materials\5c1t\Machine Learning (ABBYY)\task1-spellcheck\data/spell-testset1.txt
words count: 270
accuracy: 0.762962962962963
time: 7.6815879344940186
-----
base: D:\My Docs\Studying\mipt-materials\5c1t\Machine Learning (ABBYY)\task1-spellcheck\data/spell-testset2.txt
words count: 400
accuracy: 0.69
time: 13.614521265029907
-----
