<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/14_word_autocorrect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Word Autocorrect

**Reference**:

https://www.section.io/engineering-education/building-autocorrect-feature-using-nlp-with-python/

https://thecleverprogrammer.com/2020/10/04/autocorrect-with-python/

https://www.geeksforgeeks.org/spelling-checker-in-python/

##Setup

In [None]:
%%shell

pip install pattern
pip install pyspellchecker
pip install autocorrect
pip install textblob
pip install textdistance

In [None]:
!wget https://github.com/dentex22/Autocorrect_System/raw/main/sample.txt

In [3]:
import re  # regular expression
from collections import Counter
import numpy as np
import pandas as pd

##Data Preprocessing

In [5]:
"""
Here, we do the followings things:

1. Reads in a corpus 
2. Changes everything to lowercase
3. Returns a list of words
"""
words = []

with open("sample.txt", "r", encoding="utf8") as f:
  word = f.read()
  word = word.lower()
  words = re.findall("\w+", word)

# vocabulary
vocab = set(words)
print(f"The first 10 words in our dictionary are: \n{words[0:10]}")
print(f"The dictionary has {len(vocab)} words.")

The first 10 words in our dictionary are: 
['a', 'ability', 'able', 'about', 'above', 'accept', 'according', 'account', 'across', 'act']
The dictionary has 1001 words.


##Utility functions

In [7]:
# Let's find the frequency of the words
def get_word_count(words):
  word_count_dict = {}
  for word in words:
    if word in word_count_dict:
      word_count_dict[word] += 1
    else:
      word_count_dict[word] = 1
  return word_count_dict 

word_count_dict = get_word_count(words)
print(f"There are {len(word_count_dict)} key values pairs.")

There are 1001 key values pairs.


In [8]:
# Let's calculate the probability that any word will appear if randomly selected from the dictionary
def get_probs(word_count_dict):
  probs = {}
  total_word = sum(word_count_dict.values())
  for key in word_count_dict.keys():
    probs[key] = word_count_dict[key] / total_word
  return probs

In [11]:
"""
Now we implement 4 edit word functions

1. delete_letter:removes a letter from a given word
2. SwitchLetter:swap two adjacent letters
3. replace_letter: changes one letter to another
4. insert_letter: adds additional characters
"""

def delete_letter(word):
  delete_list = []
  split_list = []
  for i in range(len(word)):
    split_list.append((word[0:i], word[i:]))
  for a, b in split_list:
    delete_list.append(a + b[1:])
  return delete_list

In [12]:
print(delete_letter(word="cans"))

['ans', 'cns', 'cas', 'can']


In [13]:
delete_letter(word="trash")

['rash', 'tash', 'trsh', 'trah', 'tras']

In [15]:
def switch_letter(word):
  split_letter = []
  switch_letter = []
  for i in range(len(word)):
    split_letter.append((word[0:i], word[i:]))
  switch_letter = [a + b[1] + b[0] + b[2:] for a, b in split_letter if len(b) >= 2]
  return switch_letter

In [16]:
print(switch_letter("trash"))

['rtash', 'tarsh', 'trsah', 'trahs']


In [17]:
def replace_letter(word):
  split_letter = []
  replace_list = []
  for i in range(len(word)):
    split_letter.append((word[0:i], word[i:]))
  alphabets = "abcdefghijklmnopqrstuvwxyz"
  replace_list = [a + letter + (b[1:] if len(b) > 1 else "") for a, b in split_letter if b for letter in alphabets]
  return replace_list

In [18]:
print(replace_letter("trash"))

['arash', 'brash', 'crash', 'drash', 'erash', 'frash', 'grash', 'hrash', 'irash', 'jrash', 'krash', 'lrash', 'mrash', 'nrash', 'orash', 'prash', 'qrash', 'rrash', 'srash', 'trash', 'urash', 'vrash', 'wrash', 'xrash', 'yrash', 'zrash', 'taash', 'tbash', 'tcash', 'tdash', 'teash', 'tfash', 'tgash', 'thash', 'tiash', 'tjash', 'tkash', 'tlash', 'tmash', 'tnash', 'toash', 'tpash', 'tqash', 'trash', 'tsash', 'ttash', 'tuash', 'tvash', 'twash', 'txash', 'tyash', 'tzash', 'trash', 'trbsh', 'trcsh', 'trdsh', 'tresh', 'trfsh', 'trgsh', 'trhsh', 'trish', 'trjsh', 'trksh', 'trlsh', 'trmsh', 'trnsh', 'trosh', 'trpsh', 'trqsh', 'trrsh', 'trssh', 'trtsh', 'trush', 'trvsh', 'trwsh', 'trxsh', 'trysh', 'trzsh', 'traah', 'trabh', 'trach', 'tradh', 'traeh', 'trafh', 'tragh', 'trahh', 'traih', 'trajh', 'trakh', 'tralh', 'tramh', 'tranh', 'traoh', 'traph', 'traqh', 'trarh', 'trash', 'trath', 'trauh', 'travh', 'trawh', 'traxh', 'trayh', 'trazh', 'trasa', 'trasb', 'trasc', 'trasd', 'trase', 'trasf', 'trasg', 

In [19]:
def insert_letter(word):
  split_letter = []
  insert_list = []
  for i in range(len(word) + 1):
    split_letter.append((word[0:i], word[i:]))
  alphabets = "abcdefghijklmnopqrstuvwxyz"
  insert_list = [a + letter + b for a, b in split_letter if b for letter in alphabets]
  return insert_list

In [20]:
print(insert_letter("trash"))

['atrash', 'btrash', 'ctrash', 'dtrash', 'etrash', 'ftrash', 'gtrash', 'htrash', 'itrash', 'jtrash', 'ktrash', 'ltrash', 'mtrash', 'ntrash', 'otrash', 'ptrash', 'qtrash', 'rtrash', 'strash', 'ttrash', 'utrash', 'vtrash', 'wtrash', 'xtrash', 'ytrash', 'ztrash', 'tarash', 'tbrash', 'tcrash', 'tdrash', 'terash', 'tfrash', 'tgrash', 'thrash', 'tirash', 'tjrash', 'tkrash', 'tlrash', 'tmrash', 'tnrash', 'torash', 'tprash', 'tqrash', 'trrash', 'tsrash', 'ttrash', 'turash', 'tvrash', 'twrash', 'txrash', 'tyrash', 'tzrash', 'traash', 'trbash', 'trcash', 'trdash', 'treash', 'trfash', 'trgash', 'trhash', 'triash', 'trjash', 'trkash', 'trlash', 'trmash', 'trnash', 'troash', 'trpash', 'trqash', 'trrash', 'trsash', 'trtash', 'truash', 'trvash', 'trwash', 'trxash', 'tryash', 'trzash', 'traash', 'trabsh', 'tracsh', 'tradsh', 'traesh', 'trafsh', 'tragsh', 'trahsh', 'traish', 'trajsh', 'traksh', 'tralsh', 'tramsh', 'transh', 'traosh', 'trapsh', 'traqsh', 'trarsh', 'trassh', 'tratsh', 'traush', 'travsh',

We then combine these edit functions to allow the autocorrect features, like delete, replace, insert, and swap the letters.

In [21]:
def edit_one_letter(word, allow_switches=True):
  edit_set = set()
  edit_set.update(delete_letter(word))
  if allow_switches:
    edit_set.update(switch_letter(word))
  edit_set.update(replace_letter(word))
  edit_set.update(insert_letter(word))
  return edit_set

def edit_two_letters(word, allow_switches=True):
  edit_set2 = set()
  edit_one = edit_one_letter(word, allow_switches=allow_switches)
  for w in edit_one:
    if w:
      edit_two = edit_one_letter(w, allow_switches=allow_switches)
      edit_set2.update(edit_two)
  return edit_set2

##Autocorrect word

In [22]:
def get_corrections(word, probs, vocabs, n=2):
  suggested_word = []
  best_suggestion = []

  suggested_word = list(
    (word in vocabs and word) 
      or edit_one_letter(word).intersection(vocabs)
      or edit_two_letters(word).intersection(vocabs)
  )

  best_suggestion = [[s, probs[s]] for s in list(reversed(suggested_word))]
  return best_suggestion

In [24]:
probs = get_probs(word_count_dict)
tmp_corrections = get_corrections("daed", probs, vocab, 2)
for i, word_prob in enumerate(tmp_corrections):
  print(f"word {i}: {word_prob[0]}, probability {word_prob[1] * 100:.6f}")

word 0: dead, probability 0.099900


In [30]:
tmp_corrections = get_corrections("correct", probs, vocab, 2)
for i, word_prob in enumerate(tmp_corrections):
  print(f"word {i}: {word_prob[0]}, probability {word_prob[1] * 100 :.6f}")

word 0: current, probability 0.099900
