### Import Libraries

In [1]:
import re
import string
from collections import Counter
import numpy as np

In [3]:
def read_corpus(filename):
  """Read a text file and return its words as one flat, lowercased list.

  Args:
    filename: Path of the corpus file to read.

  Returns:
    A list with every maximal run of word characters (letters, digits,
    underscore) found in the file, lowercased, in order of appearance.
  """
  tokens = []
  with open(filename, "r") as corpus:
    for raw_line in corpus:
      # Lowercase before matching so "The" and "the" become the same token.
      tokens.extend(re.findall(r'\w+', raw_line.lower()))
  return tokens

In [4]:
# Tokenise the corpus once; the vocabulary and counts below are built from `words`.
words = read_corpus("./big.txt")
print(f"There are {len(words)} total words in the corpus")

There are 1115585 total words in the corpus


In [5]:
vocabs = set(words)  # set() keeps a single copy of each distinct word
print(f"There are {len(vocabs)} unique words in the vocabulary")

There are 32198 unique words in the vocabulary


In [8]:
# Check the word counts
word_counts = Counter(words)  # collections.Counter maps each word to its number of occurrences
print(word_counts["love"])
print(word_counts["happy"])

484
218


## Calculate word probability
$$P(w) = \frac{C(w)}{T}$$

$P(w)$ = probability of the word $w$

$C(w)$ = number of times $w$ occurs in the corpus

$T$ = total number of words in the corpus

In [11]:
# Relative frequency of every word: its count divided by the corpus size.
total_word_count = float(sum(word_counts.values()))
word_probas = {
  word: count / total_word_count
  for word, count in word_counts.items()
}

In [12]:
# Probability of "love": its count divided by the total number of words.
word_probas["love"]

0.000433853090530977

### First, split the word into every (left, right) pair

In [13]:
def split(word):
  """Return every way of cutting ``word`` into a (left, right) pair.

  The cut position runs from 0 to ``len(word)`` inclusive, so the result has
  ``len(word) + 1`` pairs and includes the pairs with an empty side.
  """
  pairs = []
  for cut in range(len(word) + 1):
    pairs.append((word[:cut], word[cut:]))
  return pairs

In [14]:
# Demo: a word of length n yields n + 1 (left, right) pairs.
split("trash")

[('', 'trash'),
 ('t', 'rash'),
 ('tr', 'ash'),
 ('tra', 'sh'),
 ('tras', 'h'),
 ('trash', '')]

### Delete one letter

In [15]:
def delete(word):
  """Return every string obtained by removing exactly one character of ``word``.

  Results are ordered by the position of the removed character.
  """
  shortened = []
  for head, tail in split(word):
    if not tail:
      # Nothing to the right of the cut, so there is no character to drop.
      continue
    shortened.append(head + tail[1:])
  return shortened

# Demo: one result per removed character.
delete("trash")

['rash', 'tash', 'trsh', 'trah', 'tras']

### Swap adjacent letters

In [16]:
def swap(word):
  """Return every string obtained by transposing two adjacent characters.

  Results are ordered by the position of the transposed pair.
  """
  transposed = []
  for head, tail in split(word):
    if len(tail) < 2:
      # Fewer than two characters remain, so there is no pair to transpose.
      continue
    first, second, rest = tail[0], tail[1], tail[2:]
    transposed.append(head + second + first + rest)
  return transposed

# Demo: one result per adjacent pair of characters.
swap("trash")

['rtash', 'tarsh', 'trsah', 'trahs']

### Replace the letters

In [18]:
# Lowercase ASCII letters: the alphabet used by replace() and insert() below.
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [23]:
def replace(word):
  """Return every string obtained by substituting one character of ``word``.

  Each position is replaced by each lowercase ASCII letter in turn, so the
  original word itself appears once per position (same-letter substitution).
  """
  alphabet = string.ascii_lowercase
  variants = []
  for head, tail in split(word):
    if tail:
      # tail[0] is the character being replaced; keep everything after it.
      variants.extend(head + letter + tail[1:] for letter in alphabet)
  return variants

# Demo: 26 substitutions for each character position.
print(replace("trash"))

['arash', 'brash', 'crash', 'drash', 'erash', 'frash', 'grash', 'hrash', 'irash', 'jrash', 'krash', 'lrash', 'mrash', 'nrash', 'orash', 'prash', 'qrash', 'rrash', 'srash', 'trash', 'urash', 'vrash', 'wrash', 'xrash', 'yrash', 'zrash', 'taash', 'tbash', 'tcash', 'tdash', 'teash', 'tfash', 'tgash', 'thash', 'tiash', 'tjash', 'tkash', 'tlash', 'tmash', 'tnash', 'toash', 'tpash', 'tqash', 'trash', 'tsash', 'ttash', 'tuash', 'tvash', 'twash', 'txash', 'tyash', 'tzash', 'trash', 'trbsh', 'trcsh', 'trdsh', 'tresh', 'trfsh', 'trgsh', 'trhsh', 'trish', 'trjsh', 'trksh', 'trlsh', 'trmsh', 'trnsh', 'trosh', 'trpsh', 'trqsh', 'trrsh', 'trssh', 'trtsh', 'trush', 'trvsh', 'trwsh', 'trxsh', 'trysh', 'trzsh', 'traah', 'trabh', 'trach', 'tradh', 'traeh', 'trafh', 'tragh', 'trahh', 'traih', 'trajh', 'trakh', 'tralh', 'tramh', 'tranh', 'traoh', 'traph', 'traqh', 'trarh', 'trash', 'trath', 'trauh', 'travh', 'trawh', 'traxh', 'trayh', 'trazh', 'trasa', 'trasb', 'trasc', 'trasd', 'trase', 'trasf', 'trasg', 

### Insert letters into the word

In [25]:
def insert(word):
  """Return every string obtained by inserting one lowercase ASCII letter.

  A letter is inserted at every cut position, including before the first and
  after the last character, giving ``26 * (len(word) + 1)`` results.
  """
  alphabet = string.ascii_lowercase
  grown = []
  for head, tail in split(word):
    for letter in alphabet:
      grown.append(head + letter + tail)
  return grown

In [27]:
# Demo: 26 insertions at each of the len(word) + 1 cut positions.
print(insert("trash"))

['atrash', 'btrash', 'ctrash', 'dtrash', 'etrash', 'ftrash', 'gtrash', 'htrash', 'itrash', 'jtrash', 'ktrash', 'ltrash', 'mtrash', 'ntrash', 'otrash', 'ptrash', 'qtrash', 'rtrash', 'strash', 'ttrash', 'utrash', 'vtrash', 'wtrash', 'xtrash', 'ytrash', 'ztrash', 'tarash', 'tbrash', 'tcrash', 'tdrash', 'terash', 'tfrash', 'tgrash', 'thrash', 'tirash', 'tjrash', 'tkrash', 'tlrash', 'tmrash', 'tnrash', 'torash', 'tprash', 'tqrash', 'trrash', 'tsrash', 'ttrash', 'turash', 'tvrash', 'twrash', 'txrash', 'tyrash', 'tzrash', 'traash', 'trbash', 'trcash', 'trdash', 'treash', 'trfash', 'trgash', 'trhash', 'triash', 'trjash', 'trkash', 'trlash', 'trmash', 'trnash', 'troash', 'trpash', 'trqash', 'trrash', 'trsash', 'trtash', 'truash', 'trvash', 'trwash', 'trxash', 'tryash', 'trzash', 'traash', 'trabsh', 'tracsh', 'tradsh', 'traesh', 'trafsh', 'tragsh', 'trahsh', 'traish', 'trajsh', 'traksh', 'tralsh', 'tramsh', 'transh', 'traosh', 'trapsh', 'traqsh', 'trarsh', 'trassh', 'tratsh', 'traush', 'travsh',

### Edit 1

In [28]:
def edit1(word):
  """Return the set of strings produced by one delete, swap, replace or insert.

  Duplicates produced by different operations collapse because the result is
  a set.
  """
  candidates = set()
  for operation in (delete, swap, replace, insert):
    candidates.update(operation(word))
  return candidates

# Demo: all distinct strings one edit operation away from "trash".
print(edit1("trash"))

{'toash', 'trasr', 'trasu', 'btrash', 'tprash', 'trasth', 'traswh', 'tramh', 'trssh', 'traxh', 'trcash', 'zrash', 'tjash', 'otrash', 'trashl', 'traesh', 'trzsh', 'trasjh', 'trashb', 'trhash', 'ntrash', 'tlrash', 'nrash', 'trosh', 'traseh', 'trbash', 'trashm', 'trwsh', 'tralh', 'rtash', 'ttrash', 'trasuh', 'trdash', 'trlash', 'trvsh', 'trqsh', 'trayh', 'mrash', 'atrash', 'mtrash', 'trgash', 'trasrh', 'ctrash', 'trfash', 'rrash', 'tiash', 'trlsh', 'trbsh', 'trajsh', 'trgsh', 'traah', 'trsh', 'transh', 'tresh', 'tras', 'thrash', 'tqrash', 'tkash', 'etrash', 'rash', 'trsash', 'trxash', 'traqsh', 'tvash', 'trasn', 'torash', 'krash', 'trashx', 'trashc', 'trysh', 'trksh', 'ttash', 'wtrash', 'trashq', 'tcrash', 'vtrash', 'tramsh', 'trauh', 'tdrash', 'jtrash', 'trasho', 'traeh', 'trmsh', 'truash', 'itrash', 'trasl', 'traash', 'trahs', 'tranh', 'tarash', 'traush', 'trasc', 'tjrash', 'tmash', 'traish', 'strash', 'trashd', 'tzash', 'tbash', 'traso', 'tkrash', 'trasm', 'trask', 'tzrash', 'tragsh', 

### Edit 2

In [29]:
def edit2(word):
  """Return the set of strings produced by applying edit1 twice in a row."""
  twice_edited = set()
  for once_edited in edit1(word):
    # Expand every one-edit candidate by one further edit.
    twice_edited.update(edit1(once_edited))
  return twice_edited

# Demo: strings two edit operations away from "trash".
# NOTE(review): this prints the entire set, which is very large; printing
# len(edit2("trash")) or a small sample would keep the notebook readable.
print(edit2("trash"))

{'tnvrash', 'tlasih', 'tnrase', 'tgramh', 'tycsh', 'trxsb', 'trdashk', 'arashh', 'tzasn', 'trashjk', 'tpavsh', 'atlash', 'brasxh', 'ddrash', 'trasbd', 'qtrfash', 'trasehi', 'trashly', 'tdarh', 'trwaeh', 'triazh', 'trfqash', 'trxsxh', 'sresh', 'drasho', 'trasykh', 'rtrtsh', 'tzashl', 'ztragsh', 'trashwr', 'tarasd', 'traszvh', 'trasqqh', 'trwagh', 'rtasho', 'toratsh', 'truuash', 'rgrash', 'mtragh', 'dtrnsh', 'ttadsh', 'trsskh', 'ntraso', 'orasvh', 'grrsh', 'tyrawsh', 'ptrasih', 'tkavh', 'ptransh', 'trvaush', 'urashm', 'ttrashg', 'trsnash', 'trqashd', 'teask', 'ltrase', 'tmrafsh', 'trhasoh', 'trajhi', 'ktqash', 'traphb', 'traimsh', 'gtzrash', 'jtcrash', 'traqm', 'trakbh', 'trabshl', 'tcaseh', 'tasrash', 'triashx', 'tromsh', 'srasvh', 'bthrash', 'tsraqsh', 'traeshh', 'trzah', 'trxtsh', 'trbasm', 'trasvn', 'tjaish', 'tyrah', 'typash', 'truazsh', 'ltramh', 'txafh', 'traol', 'trpsr', 'trkasm', 'trmwh', 'mktrash', 'xtrashb', 'tvashy', 'tzrasbh', 'trxasm', 'trashci', 'lvash', 'trtnh', 'vash', '

### Spelling correction

In [30]:
def correct_spelling(word, vocabulary, word_probabilities):
  """Suggest in-vocabulary corrections for a possibly misspelt word.

  Known words one edit away are preferred; words two edits away are only
  considered when no one-edit candidate is in the vocabulary.

  Args:
    word: The word to check.
    vocabulary: Container of known words (must support ``in``).
    word_probabilities: Mapping from each known word to its probability.

  Returns:
    ``None`` (after printing a message) when ``word`` is already in
    ``vocabulary``. Otherwise a list of ``(candidate, probability)`` tuples,
    which is empty when no known word lies within two edits.
  """
  if word in vocabulary:
    print(f"{word} is already correctly spelt")
    return

  # Filter against the vocabulary *before* falling back. edit1() is never
  # empty (insert() always yields strings), so `edit1(word) or edit2(word)`
  # would never reach the two-edit candidates.
  best_guesses = [w for w in edit1(word) if w in vocabulary]
  if not best_guesses:
    best_guesses = [w for w in edit2(word) if w in vocabulary]
  return [(w, word_probabilities[w]) for w in best_guesses]

In [31]:
# Demo on a misspelt word; the result is a list of (candidate, probability) pairs.
word = "famile"
corrections = correct_spelling(word, vocabs, word_probas)
corrections

[('famine', 2.6891720487457255e-06), ('family', 0.0001882420434122008)]

In [32]:
# Pick the candidate with the highest probability. The guard skips the case
# where correct_spelling returned None (word already known) or an empty list.
if corrections:
  print(corrections)
  # Second tuple element is the candidate's probability.
  probs = np.array([c[1] for c in corrections])
  # Index of the most probable candidate (first one on ties).
  best_ix = np.argmax(probs)
  correct = corrections[best_ix][0]
  print(f"{correct} is suggested for {word}")

[('famine', 2.6891720487457255e-06), ('family', 0.0001882420434122008)]
family is suggested for famile


# Putting it all together: a `SpellChecker` class in one cell

In [34]:
class SpellChecker(object):
  """Edit-distance spelling corrector backed by word frequencies from a corpus.

  Attributes (all set in ``__init__``):
    vocabs: set of the distinct lowercased words found in the corpus.
    word_counts: ``Counter`` mapping each word to its number of occurrences.
    word_probas: dict mapping each word to ``count / total number of words``.
  """

  def __init__(self, corpus_file_path):
    """Build the vocabulary and word probabilities from a text file.

    Args:
      corpus_file_path: Path of the corpus text file.
    """
    self.word_counts = Counter()
    with open(corpus_file_path, "r") as file:
      # Stream line by line so neither the lines nor a full word list are
      # held in memory; only the counts are needed.
      for line in file:
        self.word_counts.update(re.findall(r'\w+', line.lower()))

    self.vocabs = set(self.word_counts)
    total_words = float(sum(self.word_counts.values()))
    self.word_probas = {word: count / total_words for word, count in self.word_counts.items()}

  def _level_one_edits(self, word):
    """Return the set of strings reachable from ``word`` by a single edit.

    An edit is one deletion, one adjacent transposition, one substitution by
    a lowercase ASCII letter, or one insertion of a lowercase ASCII letter.
    """
    letters = string.ascii_lowercase
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l, r in splits if r]
    swaps = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r) > 1]
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    inserts = [l + c + r for l, r in splits for c in letters]

    return set(deletes + swaps + replaces + inserts)

  def _level_two_edits(self, word):
    """Return the set of strings reachable from ``word`` by two single edits."""
    return set(e2 for e1 in self._level_one_edits(word) for e2 in self._level_one_edits(e1))

  def _known(self, candidates):
    """Return the subset of ``candidates`` that is in the vocabulary."""
    return {w for w in candidates if w in self.vocabs}

  def check(self, word):
    """Return known candidate corrections for ``word``, most probable first.

    Known words one edit away are preferred; words two edits away are used
    only when no one-edit candidate is in the vocabulary.

    Args:
      word: The word to look up.

    Returns:
      A list of ``(candidate, probability)`` tuples sorted by descending
      probability, ties broken alphabetically. Empty when no known word lies
      within two edits.
    """
    # Filter by vocabulary before falling back: the raw one-edit set is never
    # empty, so an unfiltered `or` would never reach the two-edit candidates.
    valid_candidates = self._known(self._level_one_edits(word))
    if not valid_candidates:
      valid_candidates = self._known(self._level_two_edits(word))
    # The (-probability, word) key makes the order deterministic on ties
    # instead of depending on set iteration order.
    return sorted(
      ((c, self.word_probas[c]) for c in valid_candidates),
      key=lambda tup: (-tup[1], tup[0]),
    )

In [35]:
# Reads the corpus and builds the vocabulary and probabilities once.
checker = SpellChecker("./big.txt")

In [36]:
# Demo: candidates come back sorted by descending probability.
checker.check("sentense")

[('sentence', 2.3306157755796287e-05)]