In [1]:
import re
from collections import Counter

In [2]:
# Tokeniser used to build the word-frequency table
def words(document):
    """Lower-case `document` and return its word tokens as a list.

    A token is a maximal run of regex "word" characters (letters, digits
    and underscore); everything else acts as a separator, so an apostrophe
    splits "it's" into "it" and "s".
    """
    token_pattern = re.compile(r'\w+')
    lowered = document.lower()
    return token_pattern.findall(lowered)

In [3]:
# Create a frequency table of all the words of the document.
# The file is read inside a `with` block so the handle is closed as soon as
# the text has been loaded instead of being left to the garbage collector.
with open('big.txt') as corpus_file:
    all_words = Counter(words(corpus_file.read()))

In [4]:
# check frequency of a random word, say, 'chair'
all_words['chair']

135

In [5]:
# look at top 10 frequent words
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [15]:
def edits_one(word):
    """Return the set of strings one simple edit away from `word`.

    The word is cut at every position (including both ends) and, for each
    cut, four kinds of edit are generated using the lower-case letters a-z:
    deleting the character after the cut, inserting a letter at the cut,
    replacing the character after the cut, and swapping the two characters
    after the cut. Replacing a character with itself is included, so a
    non-empty `word` is a member of its own result.
    """
    alphabets = 'abcdefghijklmnopqrstuvwxyz'
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion works at every cut, even the one past the last character.
        for letter in alphabets:
            candidates.add(head + letter + tail)
        if not tail:
            # Nothing after the cut: no character to delete, replace or swap.
            continue
        candidates.add(head + tail[1:])  # deletion
        for letter in alphabets:
            candidates.add(head + letter + tail[1:])  # replacement
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])  # transposition
    return candidates

In [16]:
def edits_two(word):
    """Lazily produce every string reachable from `word` by two edits.

    Each result of `edits_one(word)` is edited once more. The returned
    generator can yield the same string many times; wrap it in `set()`
    when distinct values are needed.
    """
    first_round = edits_one(word)
    return (twice_edited
            for once_edited in first_round
            for twice_edited in edits_one(once_edited))

In [24]:
def known(words):
    """Return, as a set, the members of `words` found in `all_words`.

    `words` may be any iterable of strings; duplicates collapse because the
    result is a set, and an empty set means none of them were recognised.
    """
    return {candidate for candidate in words if candidate in all_words}

In [25]:
def possible_corrections(word):
    """Return candidate spellings for `word`, nearest candidates first.

    The tiers are tried in order and the first non-empty one wins:
    the word itself if known, known words one edit away, known words two
    edits away. If all are empty the fallback is the single-item list
    `[word]`, so the result is a set for the first three tiers and a list
    for the fallback.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit_away = known(edits_one(word))
    if one_edit_away:
        return one_edit_away

    # Only explored when the cheaper tiers found nothing.
    two_edits_away = known(edits_two(word))
    if two_edits_away:
        return two_edits_away

    return [word]

In [26]:
def prob(word, N=sum(all_words.values())):
    """Relative frequency of `word`: its count in `all_words` divided by `N`.

    `N` defaults to the total number of tokens in `all_words`. That default
    is evaluated once, when this `def` statement runs, not on every call.
    """
    occurrences = all_words[word]
    return occurrences / N

In [27]:
print(len(set(edits_one("monney"))))
print(edits_one("monney"))

336
{'monneiy', 'monneys', 'monbey', 'moqnney', 'money', 'monnezy', 'dmonney', 'monnec', 'monneq', 'monneoy', 'moynney', 'moniney', 'fonney', 'monnkey', 'mondey', 'moncney', 'vonney', 'monlney', 'monwney', 'moeney', 'molney', 'monnehy', 'monneyw', 'zmonney', 'monnuey', 'mponney', 'moznney', 'monneye', 'monneyy', 'monneyx', 'nmonney', 'monneu', 'monneo', 'wonney', 'monnen', 'monneyf', 'montey', 'mnnney', 'mcnney', 'mognney', 'moiney', 'mlnney', 'mfnney', 'mouney', 'mooney', 'xonney', 'monnew', 'modnney', 'oonney', 'mvnney', 'monnejy', 'mornney', 'monnepy', 'monnewy', 'qmonney', 'monrney', 'monnhy', 'monndy', 'monnaey', 'rmonney', 'monnegy', 'monneyc', 'monnmy', 'munney', 'lmonney', 'monhney', 'msnney', 'mowney', 'fmonney', 'monnky', 'monneqy', 'monneny', 'mosney', 'mnney', 'mlonney', 'mokney', 'monnevy', 'lonney', 'monneh', 'gmonney', 'moknney', 'mzonney', 'monnbey', 'mnonney', 'monnedy', 'monqey', 'monbney', 'monnyey', 'monpney', 'monneyp', 'mmnney', 'qonney', 'mdnney', 'monrey', 'monn

In [28]:
print(known(edits_one("monney")))

{'money', 'monkey'}


In [29]:
# Let's look at words that are two edits away
print(len(set(edits_two("monney"))))
print(known(edits_two("monney")))

51013
{'honey', 'manner', 'monkeys', 'money', 'donkey', 'monday', 'donne', 'moaned', 'bonnet', 'tonne', 'monkey', 'olney', 'donned', 'bonne', 'moines', 'convey', 'morley', 'manned', 'motley', 'bonny', 'monger'}


In [30]:
# Let's look at possible corrections of a word
print(possible_corrections("monney"))

{'money', 'monkey'}


In [35]:
# Let's look at probability of a word
print(prob("money"))
print(prob("monkey"))

0.0002922233626303688
5.378344097491451e-06


In [36]:
def spell_check(word):
    """Return a message naming the most probable spelling correction for `word`.

    The candidates come from `possible_corrections` and the one with the
    highest `prob` is chosen. The result is the string
    "Did you mean <candidate>?" when the best candidate differs from the
    (lower-cased) input, otherwise "Correct spelling.".

    NOTE(review): when no known word is within two edits,
    `possible_corrections` falls back to the input itself, so an unknown
    word with no candidates is also reported as "Correct spelling.".
    """
    # The frequency table holds lower-cased tokens only, so compare in lower
    # case; otherwise a correctly spelled "Money" is flagged as a misspelling.
    normalised = word.lower()
    correct_word = max(possible_corrections(normalised), key=prob)
    if correct_word != normalised:
        return "Did you mean " + correct_word + "?"
    return "Correct spelling."

In [37]:
# test spell check
print(spell_check("monney"))

Did you mean money?


In [38]:
print(spell_check("monny"))

Did you mean money?


In [6]:
# Step-by-step walkthrough of the one-edit generation, using 'san'.
# Every way of cutting the word into a (left, right) pair, including the
# cuts before the first and after the last character.
word = 'san'
splits = [(word[:cut], word[cut:]) for cut in range(len(word) + 1)]
splits


[('', 'san'), ('s', 'an'), ('sa', 'n'), ('san', '')]

In [7]:
# Deletion: drop the first character of the right-hand piece of each split
# (splits whose right-hand piece is empty have nothing to drop).
deletes = [head + tail[1:] for head, tail in splits if tail]
deletes

['an', 'sn', 'sa']

In [8]:
# Insertion: put each lower-case letter between the two pieces of every split.
alphabets = 'abcdefghijklmnopqrstuvwxyz'
inserts = [head + letter + tail for head, tail in splits for letter in alphabets]
len(inserts)

104

In [9]:
inserts

['asan',
 'bsan',
 'csan',
 'dsan',
 'esan',
 'fsan',
 'gsan',
 'hsan',
 'isan',
 'jsan',
 'ksan',
 'lsan',
 'msan',
 'nsan',
 'osan',
 'psan',
 'qsan',
 'rsan',
 'ssan',
 'tsan',
 'usan',
 'vsan',
 'wsan',
 'xsan',
 'ysan',
 'zsan',
 'saan',
 'sban',
 'scan',
 'sdan',
 'sean',
 'sfan',
 'sgan',
 'shan',
 'sian',
 'sjan',
 'skan',
 'slan',
 'sman',
 'snan',
 'soan',
 'span',
 'sqan',
 'sran',
 'ssan',
 'stan',
 'suan',
 'svan',
 'swan',
 'sxan',
 'syan',
 'szan',
 'saan',
 'sabn',
 'sacn',
 'sadn',
 'saen',
 'safn',
 'sagn',
 'sahn',
 'sain',
 'sajn',
 'sakn',
 'saln',
 'samn',
 'sann',
 'saon',
 'sapn',
 'saqn',
 'sarn',
 'sasn',
 'satn',
 'saun',
 'savn',
 'sawn',
 'saxn',
 'sayn',
 'sazn',
 'sana',
 'sanb',
 'sanc',
 'sand',
 'sane',
 'sanf',
 'sang',
 'sanh',
 'sani',
 'sanj',
 'sank',
 'sanl',
 'sanm',
 'sann',
 'sano',
 'sanp',
 'sanq',
 'sanr',
 'sans',
 'sant',
 'sanu',
 'sanv',
 'sanw',
 'sanx',
 'sany',
 'sanz']

In [10]:
# Replacement: swap the first character of the right-hand piece for each
# lower-case letter (this also regenerates the original word).
replaces = [head + letter + tail[1:] for head, tail in splits if tail for letter in alphabets]
len(replaces)

78

In [11]:
replaces

['aan',
 'ban',
 'can',
 'dan',
 'ean',
 'fan',
 'gan',
 'han',
 'ian',
 'jan',
 'kan',
 'lan',
 'man',
 'nan',
 'oan',
 'pan',
 'qan',
 'ran',
 'san',
 'tan',
 'uan',
 'van',
 'wan',
 'xan',
 'yan',
 'zan',
 'san',
 'sbn',
 'scn',
 'sdn',
 'sen',
 'sfn',
 'sgn',
 'shn',
 'sin',
 'sjn',
 'skn',
 'sln',
 'smn',
 'snn',
 'son',
 'spn',
 'sqn',
 'srn',
 'ssn',
 'stn',
 'sun',
 'svn',
 'swn',
 'sxn',
 'syn',
 'szn',
 'saa',
 'sab',
 'sac',
 'sad',
 'sae',
 'saf',
 'sag',
 'sah',
 'sai',
 'saj',
 'sak',
 'sal',
 'sam',
 'san',
 'sao',
 'sap',
 'saq',
 'sar',
 'sas',
 'sat',
 'sau',
 'sav',
 'saw',
 'sax',
 'say',
 'saz']

In [12]:
# Transposition: exchange the first two characters of the right-hand piece,
# which needs at least two characters after the cut.
transposes = [head + tail[1] + tail[0] + tail[2:] for head, tail in splits if len(tail) > 1]
transposes

['asn', 'sna']

In [13]:
# Merge the four edit lists; the set removes strings produced more than once.
final = {*deletes, *inserts, *replaces, *transposes}
final

{'aan',
 'an',
 'asan',
 'asn',
 'ban',
 'bsan',
 'can',
 'csan',
 'dan',
 'dsan',
 'ean',
 'esan',
 'fan',
 'fsan',
 'gan',
 'gsan',
 'han',
 'hsan',
 'ian',
 'isan',
 'jan',
 'jsan',
 'kan',
 'ksan',
 'lan',
 'lsan',
 'man',
 'msan',
 'nan',
 'nsan',
 'oan',
 'osan',
 'pan',
 'psan',
 'qan',
 'qsan',
 'ran',
 'rsan',
 'sa',
 'saa',
 'saan',
 'sab',
 'sabn',
 'sac',
 'sacn',
 'sad',
 'sadn',
 'sae',
 'saen',
 'saf',
 'safn',
 'sag',
 'sagn',
 'sah',
 'sahn',
 'sai',
 'sain',
 'saj',
 'sajn',
 'sak',
 'sakn',
 'sal',
 'saln',
 'sam',
 'samn',
 'san',
 'sana',
 'sanb',
 'sanc',
 'sand',
 'sane',
 'sanf',
 'sang',
 'sanh',
 'sani',
 'sanj',
 'sank',
 'sanl',
 'sanm',
 'sann',
 'sano',
 'sanp',
 'sanq',
 'sanr',
 'sans',
 'sant',
 'sanu',
 'sanv',
 'sanw',
 'sanx',
 'sany',
 'sanz',
 'sao',
 'saon',
 'sap',
 'sapn',
 'saq',
 'saqn',
 'sar',
 'sarn',
 'sas',
 'sasn',
 'sat',
 'satn',
 'sau',
 'saun',
 'sav',
 'savn',
 'saw',
 'sawn',
 'sax',
 'saxn',
 'say',
 'sayn',
 'saz',
 'sazn',
 'sba

In [14]:
len(final)

182

In [19]:
len(edits_one('san'))

182

In [21]:
set(edits_two('san'))

{'sagnw',
 'seanj',
 'sapyn',
 'sakn',
 'msag',
 'gany',
 'seasn',
 'sayo',
 'aao',
 'nanj',
 'sfjn',
 'jsian',
 'xslan',
 'smns',
 'bsanr',
 'csjan',
 'szak',
 'uian',
 'shahn',
 'stfan',
 'sasi',
 'hant',
 'sjagn',
 'sdnz',
 'sein',
 'wsaun',
 'csayn',
 'xesan',
 'dsax',
 'bxsan',
 'sadnv',
 'ysae',
 'jsam',
 'sthan',
 'xin',
 'wsna',
 'sranb',
 'xxan',
 'xean',
 'ar',
 'ssnl',
 'sqlan',
 'csean',
 'hsman',
 'suhan',
 'siu',
 'dsanu',
 'ssgan',
 'ssbn',
 'skain',
 'sanoh',
 'za',
 'tqsan',
 'wbn',
 'saqwn',
 'szon',
 'rsjn',
 'samsn',
 'sxanq',
 'sabnn',
 'fanb',
 'sanvz',
 'qpn',
 'stxn',
 'zawn',
 'syno',
 'sznz',
 'smxn',
 'sahne',
 'vand',
 'psar',
 'fanf',
 'ysaan',
 'esaln',
 'nanb',
 'asarn',
 'szpan',
 'sjfan',
 'qaln',
 'av',
 'sansz',
 'salun',
 'usant',
 'jan',
 'satnj',
 'sujan',
 'nsfn',
 'sudan',
 'saent',
 'rsgan',
 'rszn',
 'zsant',
 'sffn',
 'dsaxn',
 'seapn',
 'gsan',
 'sasu',
 'wasan',
 'sajnd',
 'lsajn',
 'svj',
 'csae',
 'swabn',
 'jmsan',
 'sanwb',
 'saonw',
 'o

In [23]:
len(set(edits_two('san')))

14352

In [48]:
possible_corrections('sherlo')

{'hello',
 'hero',
 'sero',
 'shelf',
 'shell',
 'shells',
 'sherlock',
 'sherry',
 'sterno',
 'thermo'}

In [46]:
spell_check('sherlo')

'Did you mean sherlock?'

In [52]:
possible_corrections('emfasize')

{'emphasize'}

In [2]:
from spell_corrector import rectify
correct = rectify("snam")
print(correct)

snap


In [12]:
import re
# Sample passage used to try out regex-based token counting.
ww = 'The Nobel Prize is a set of five annual international awards bestowed in several categories by Swedish and Norwegian institutions in recognition of academic, cultural or scientific advances. The Nobel family, known for their innovations in the oil industry in Azerbaijan, was the leading representative of foreign capital in Baku in the 19th century. Alfred Nobel’s personal fortune was funded by the personal fortune, the Nobel Prize. The Board of the Nobel Foundation decided that after this addition, no new prizes would be granted'
ww = ww.lower()
# Count the word tokens. The pattern is a raw string because '\w' in a plain
# string literal is an invalid escape sequence that Python warns about.
len(re.findall(r'\w+', ww))

85

In [16]:
re.findall('nobel prize',ww)

['nobel prize', 'nobel prize']