### Spell Correction

This code is taken from Peter Norvig's essay "How to Write a Spelling Corrector": https://norvig.com/spell-correct.html

We are going to look at this code to see how we can write our own simple spelling correction code.

In [0]:
# Mount Google Drive inside the Colab runtime so files under /content/drive
# (such as the corpus read later) are accessible. Running this cell prompts
# for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### Reading the Words

In [0]:
import re
from collections import Counter

def words(text):
    "Lower-case `text` and return, in order, every run of word characters in it."
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Folder on the mounted Google Drive that holds the workshop corpus.
data_path = "/content/drive/My Drive/Datahack NLP Workshop/"

# WORDS maps each token of big.txt to the number of times it occurs.
# The `with` block closes the file handle as soon as the text has been read,
# instead of leaving it open for the lifetime of the kernel.
# (Stemming / lemmatisation could also be applied to the tokens before counting.)
with open(data_path + 'big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

In [0]:
# Vocabulary size: the number of distinct tokens counted in WORDS.
len(WORDS)

32198

In [0]:
# Total number of tokens in the corpus (the sum of all per-token counts).
sum(WORDS.values())

1115585

In [0]:
# The ten most frequent tokens with their counts, highest count first.
WORDS.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

### Probability of a given word

In [0]:
def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word`: its count in WORDS divided by `N`.

    `N` defaults to the total of all counts in WORDS. That default is
    evaluated once, when this `def` statement runs, so re-run this cell
    after rebuilding WORDS. A word that is not a key of WORDS has count 0.
    """
    occurrences = WORDS[word]
    return occurrences / N

# Relative frequency of the most common token in the corpus.
P('the')

0.07154004401278254

In [0]:
# Relative frequency of another frequent token.
P('he')


0.011116140858831914

In [0]:
# A token that never occurs in the corpus has count 0, so its probability is 0.
P('dhs')

0.0

In [0]:
# Lookups are case-sensitive: words() lower-cases the corpus before counting,
# so a capitalised token is not a key of WORDS and its probability is 0.
P('The')

0.0

### Get the words that are 1 or 2 edits away

In [0]:
def known(words):
    "Return, as a set, the entries of `words` that are keys of WORDS."
    found = set()
    for candidate in words:
        if candidate in WORDS:
            found.add(candidate)
    return found

def edits1(word, letters='abcdefghijklmnopqrstuvwxyz'):
    """All strings that are one edit away from `word`.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement of one character, or an insertion of one
    character.

    word:    the string to perturb.
    letters: the alphabet used for replacements and insertions. It defaults
             to the lower-case ASCII letters; pass a different string to
             cover digits or accented characters.

    Returns a set, so duplicate results are collapsed. Replacing a character
    with itself reproduces `word`, so a non-empty `word` is itself a member
    of the result whenever its characters occur in `letters`.
    """
    # Every way of cutting `word` into a (head, tail) pair, including the
    # cuts with an empty head or an empty tail.
    splits     = [(word[:i], word[i:])                   for i in range(len(word) + 1)]
    deletes    = [head + tail[1:]                        for head, tail in splits if tail]
    transposes = [head + tail[1] + tail[0] + tail[2:]    for head, tail in splits if len(tail) > 1]
    replaces   = [head + c + tail[1:]                    for head, tail in splits if tail for c in letters]
    inserts    = [head + c + tail                        for head, tail in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    """Lazily yield every string reachable from `word` by two successive edits.

    The result is a generator and may produce the same string more than once.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

In [0]:
# A word that is a key of WORDS comes back in a one-element set.
known(["content"])

{'content'}

In [0]:
# A word that is not a key of WORDS is filtered out, leaving an empty set.
known(["notseen"])

set()

In [0]:
### Edits1 ###
# edits1 can return a big set. For a word of length n there are n deletions,
# n-1 transpositions, 26n replacements ("alterations"), and 26(n+1) insertions,
# for a total of 54n+25 generated strings (of which a few are typically duplicates).
# For n = 7 that is 403 strings before the set removes the duplicates.
word = "speling"
len(edits1(word))

390

In [0]:
# Keep only the one-edit variants that are keys of WORDS.
known(edits1(word))

{'spelling'}

In [0]:
# Edits 2
# "somthng" is more than one edit from any dictionary word, so filtering
# its one-edit variants with known() leaves nothing.
word = "somthng"
known(edits1(word))

set()

In [0]:
# Widen the search to two edits and keep the variants that are keys of WORDS.
known(edits2(word))

{'something', 'soothing', 'sorting'}

### Candidate generation & Spelling correction

In [0]:
def candidates(word):
    """Possible spelling corrections for `word`, nearest group first.

    Returns the first non-empty group among: the word itself if it is known,
    the known words one edit away, the known words two edits away. If all
    three are empty, returns the one-element list `[word]`.
    """
    exact = known([word])
    if exact:
        return exact
    one_away = known(edits1(word))
    if one_away:
        return one_away
    two_away = known(edits2(word))
    if two_away:
        return two_away
    return [word]

def correction(word):
    "Return the candidate for `word` that has the highest probability under P."
    options = candidates(word)
    best = max(options, key=P)
    return best

In [0]:
# The input is not a known word, so the most probable known word found
# within the edit search is returned.
correction("somthng")

'something'

In [0]:
# A word that is already a key of WORDS is its own only candidate, so it is returned as-is.
correction("spelling")

'spelling'

Further reading: Peter Norvig's notebook "How to Do Things with Words": https://nbviewer.jupyter.org/url/norvig.com/ipython/How%20to%20Do%20Things%20with%20Words.ipynb

### Spell correction using a library (pyspellchecker)

In [0]:
!pip install pyspellchecker

Collecting pyspellchecker
[?25l  Downloading https://files.pythonhosted.org/packages/3b/62/e9da86d71e3ccc500b979f0afb88c1f3ae151766004a0de92775b686a311/pyspellchecker-0.5.2-py2.py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 2.7MB/s 
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.5.2


In [0]:
from spellchecker import SpellChecker

# Module-level checker used by correct_spellings; constructed with no
# arguments, i.e. with the library's default settings.
spell = SpellChecker()
def correct_spellings(text):
    """Return `text` with every misspelled token replaced by a suggestion.

    text: a string; it is split on whitespace, so punctuation stays attached
          to its token.

    Returns the tokens re-joined with single spaces. Tokens the checker
    knows, and tokens it cannot improve, are kept unchanged.
    """
    tokens = text.split()  # split once and reuse for both passes
    misspelled_words = spell.unknown(tokens)
    corrected_text = []
    for word in tokens:
        # NOTE(review): pyspellchecker is case-insensitive by default and
        # reports unknown words in lower case, so also test the lower-cased
        # token; otherwise capitalised misspellings are skipped. Confirm
        # against the installed version.
        if word in misspelled_words or word.lower() in misspelled_words:
            # Some pyspellchecker releases return None when they have no
            # suggestion; fall back to the original token so join() is safe.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
        
# Both tokens are misspelled; each is replaced by the checker's suggestion.
text = "speling correctin"
correct_spellings(text)

'spelling correction'