In [6]:
import re
from collections import Counter

In [7]:
# function to tokenise words
def words(document):
    """Lower-case `document` and split it into word tokens.

    A token is a maximal run of word characters (letters, digits, underscore);
    everything else, such as punctuation and whitespace, acts as a separator.
    Returns the tokens as a list of strings, in document order.
    """
    lowered = document.lower()
    return [match.group() for match in re.finditer(r'\w+', lowered)]

In [8]:
# create a frequency table of all the words of the document
# (the context manager closes the corpus file as soon as it has been read,
# instead of leaving the handle open until garbage collection)
with open('big.txt') as corpus_file:
    all_words = Counter(words(corpus_file.read()))

In [9]:
# check frequency of a random word, say, 'chair'
# (all_words is a Counter, so a word that never occurs yields 0 rather than a KeyError)
all_words['chair']

135

In [10]:
# look at top 10 frequent words
# (returned as (word, count) pairs, most frequent first)
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [11]:
def edits_one(word):
    """Return the set of strings produced by applying a single edit to `word`.

    The edits are: deleting one character, inserting one lowercase letter at any
    position, replacing one character with a lowercase letter, and swapping two
    adjacent characters. Because a character may be replaced by itself, the
    result contains `word` unchanged whenever `word` is non-empty.
    """
    alphabets = 'abcdefghijklmnopqrstuvwxyz'
    size = len(word)
    candidates = set()
    # deletions: drop the character at position i
    candidates.update(word[:i] + word[i + 1:] for i in range(size))
    # insertions: a letter may go before any character or after the last one
    candidates.update(word[:i] + letter + word[i:]
                      for i in range(size + 1) for letter in alphabets)
    # replacements: overwrite the character at position i
    candidates.update(word[:i] + letter + word[i + 1:]
                      for i in range(size) for letter in alphabets)
    # transpositions: swap the characters at positions i and i + 1
    candidates.update(word[:i] + word[i + 1] + word[i] + word[i + 2:]
                      for i in range(size - 1))
    return candidates

In [12]:
def edits_two(word):
    """Lazily generate every string obtained by applying two successive edits to `word`.

    The first round of edits is computed immediately; the second round is
    produced on demand as the generator is consumed. The same string can be
    yielded more than once, so wrap the result in `set()` when distinct
    candidates are needed.
    """
    first_round = edits_one(word)
    return (second for first in first_round for second in edits_one(first))

In [13]:
def known(words):
    "Filter `words` down to the set of those present as keys in the `all_words` frequency table."
    return {candidate for candidate in words if candidate in all_words}

In [14]:
def possible_corrections(word):
    """Return candidate corrections for `word`, preferring the closest matches.

    Tried in order, stopping at the first non-empty result:
    the word itself if it is known, known words one edit away, known words
    two edits away. If nothing is known, the word is returned unchanged in a
    one-element list.
    """
    exact = known([word])
    if exact:
        return exact
    one_away = known(edits_one(word))
    if one_away:
        return one_away
    # the two-edit search is only reached when the cheaper lookups found nothing
    two_away = known(edits_two(word))
    if two_away:
        return two_away
    return [word]

In [15]:
def prob(word, N=sum(all_words.values())):
    """Relative frequency of `word`: its count in `all_words` divided by `N`.

    `N` defaults to the total number of tokens in `all_words`, computed once
    when this function is defined.
    """
    occurrences = all_words[word]
    return occurrences / N

In [16]:
# edits_one already returns a set, so compute it once and reuse it for both
# the count of distinct candidates and the dump of the candidates themselves
monney_edits = edits_one("monney")
print(len(monney_edits))
print(monney_edits)

336
{'monneyp', 'monnye', 'monneyh', 'monner', 'monnney', 'ronney', 'monnety', 'moeney', 'monkey', 'monne', 'monneyb', 'imonney', 'monnqy', 'mmonney', 'monvey', 'monbney', 'monneoy', 'monnpey', 'monnzey', 'mfonney', 'moenney', 'eonney', 'monfey', 'monnel', 'monqey', 'monnty', 'mkonney', 'monnezy', 'mownney', 'mxonney', 'moncey', 'monneyt', 'qonney', 'dmonney', 'monaney', 'ymonney', 'mohnney', 'monnley', 'monbey', 'monley', 'jonney', 'gmonney', 'monnfey', 'monnoey', 'mgnney', 'monnee', 'mobnney', 'mqnney', 'monnaey', 'moneny', 'monnjy', 'moqney', 'monneyg', 'monzney', 'zonney', 'monoey', 'monnep', 'monxey', 'mononey', 'smonney', 'monnry', 'monnew', 'monnmey', 'monnwey', 'monlney', 'mpnney', 'monwey', 'monnedy', 'mznney', 'monnec', 'monnkey', 'monneym', 'monzey', 'monneyj', 'mofney', 'moaney', 'mvonney', 'monnpy', 'monhey', 'monnjey', 'monnem', 'lonney', 'monneye', 'momney', 'moynney', 'monnhy', 'monxney', 'monniy', 'monnhey', 'monneyo', 'monvney', 'monneu', 'bonney', 'moxney', 'moonney'

In [17]:
# dump every single-edit candidate of the misspelling 'emfasize'
# (the set also contains 'emfasize' itself, from replacing a letter with the same letter)
print(edits_one('emfasize'))

{'emfasijze', 'semfasize', 'effasize', 'emfaaize', 'elfasize', 'emfasie', 'emfasgize', 'emfausize', 'emfasuize', 'emfasirze', 'emfarsize', 'emfatsize', 'emfaqize', 'emfzsize', 'emfasizl', 'emfiasize', 'emfasizc', 'emfbsize', 'xemfasize', 'fmfasize', 'emfasiye', 'emfasrze', 'emtfasize', 'emfasizse', 'emrfasize', 'emfasizu', 'enfasize', 'emfasigze', 'emfasizel', 'emfysize', 'hemfasize', 'emfasioe', 'emfassize', 'emfasizen', 'emfasizeq', 'zemfasize', 'emfaisze', 'emfaoize', 'emfasisze', 'emfasizxe', 'evfasize', 'emfauize', 'eimfasize', 'emfrsize', 'emfasizle', 'emfastize', 'ejfasize', 'emfpsize', 'emqfasize', 'emfwasize', 'emfastze', 'emfjasize', 'emfayize', 'emuasize', 'tmfasize', 'remfasize', 'emfasizeu', 'emwfasize', 'emfaswize', 'emfavsize', 'emfasizw', 'emfisize', 'memfasize', 'emfasizue', 'eifasize', 'emfaskze', 'ermfasize', 'emfasvize', 'emfasizes', 'emfasize', 'wemfasize', 'emfsasize', 'eufasize', 'emfasire', 'emmasize', 'eymfasize', 'emfasizex', 'eamfasize', 'emfasizez', 'emfdasi

In [19]:
# number of distinct single-edit candidates for 'emfasize'
print(len(edits_one('emfasize')))

442


In [None]:
# keep only the single-edit candidates of "monney" that occur in the corpus vocabulary (all_words)
print(known(edits_one("monney")))

In [None]:
# Let's look at words that are two edits away
# (materialise the generator once: the two-edit search is expensive, and both
# prints below need the same candidates)
monney_two_edits = set(edits_two("monney"))
print(len(monney_two_edits))
# corpus words among the two-edit candidates; the one-edit case is already
# printed by an earlier cell, so this cell must use the two-edit candidates
print(known(monney_two_edits))

In [23]:
# corpus words reachable from "emfasize" by two successive edits
print(known(edits_two("emfasize")))


{'emphasize'}


In [None]:
# Let's look at possible corrections of a word
# (the word itself if known, else known one-edit candidates, else known
# two-edit candidates, else the word unchanged)
print(possible_corrections("monney"))

In [24]:
# Let's look at possible corrections of a word
# (this result comes from the two-edit fallback: 'emphasize' is one character
# longer than 'emfasize' and also differs in a letter (f -> p), so it cannot be
# reached with a single edit)
print(possible_corrections("emfasize"))

{'emphasize'}


In [None]:
# Let's look at probability of a word
# (relative frequency: occurrences of the word in all_words / total token count)
print(prob("money"))
print(prob("monkey"))

In [None]:
def spell_check(word):
    """Return a message with the most probable spelling correction for `word`.

    The best candidate is the one from `possible_corrections(word)` with the
    highest `prob`. If it equals `word`, the result is "Correct spelling.";
    otherwise it is a "Did you mean ...?" suggestion naming the candidate.
    """
    best_guess = max(possible_corrections(word), key=prob)
    if best_guess == word:
        return "Correct spelling."
    return "Did you mean " + best_guess + "?"

In [None]:
# test spell check
# (spell_check returns its message rather than printing it, hence the print here)
print(spell_check("monney"))

In [27]:
# NOTE(review): `rectify` is imported from the `spell_corrector` module, which is
# not shown in this notebook — presumably a packaged version of the corrector
# built above; confirm. Per the recorded output it maps "remeber" to "remember".
from spell_corrector import rectify
correct = rectify("remeber")
print(correct)

remember
