In [1]:
import pandas as pd
import numpy as np
import textdistance
import re
from collections import Counter

# File Opening And Cleaning (change formate to utf-8)

In [2]:


word = []

with open('autocorrect book.txt', 'r', encoding='utf-8') as f:
    data = f.read()
    data = data.lower()
    words = re.findall('\w+', data)
    words += words

print(words[0:10])


['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale']


# make vocabulary

In [3]:

len(words)

445326

In [4]:
V = set(words)
V

{'slantings',
 'elevated',
 'loveliest',
 'mayest',
 'stuck',
 'hugely',
 'voracity',
 '890',
 'age',
 'distinctive',
 'pincers',
 'inequality',
 'believe',
 'cruising',
 'capting',
 'western',
 'celebrated',
 'comforting',
 'minute',
 'features',
 'sat',
 'loweringly',
 'obeyest',
 'slack',
 'rascals',
 'cliffs',
 'customary',
 'throbbing',
 'obscurely',
 'revealed',
 'tudors',
 'dividends',
 'proof',
 'spoken',
 'calm',
 'browed',
 'belt',
 'guise',
 'ingeniously',
 'susan',
 'sixteen',
 '_december_',
 'marlinspikes',
 'necessity',
 'flooded',
 'ordinary',
 'clutch',
 'fa',
 '40',
 'fiends',
 'exhausting',
 'settees',
 'slanderous',
 'sterned',
 'sanctum',
 'delicious',
 'accepted',
 '1776',
 'written',
 'shuddered',
 'tool',
 'helping',
 'visual',
 'brimming',
 'sights',
 'shark',
 'damped',
 'inventor',
 'canvas',
 'ebbs',
 'hang',
 'outburst',
 'memoirs',
 'copenhagen',
 'practically',
 'strays',
 'satisfy',
 'finical',
 'subjected',
 'junks',
 'apeak',
 'materials',
 'dismissal',

In [5]:
len(V)

17647

# build the frequency of those words

In [6]:
words_freq_dict = {}
words_freq_dict = Counter(words)

In [7]:
words_freq_dict.most_common()[0:10]

[('the', 29406),
 ('of', 13484),
 ('and', 13034),
 ('a', 9598),
 ('to', 9414),
 ('in', 8476),
 ('that', 6162),
 ('it', 5068),
 ('his', 5060),
 ('i', 4240)]

# Relative Frequency of words
Now we want to get the probability of occurrence of each word, this equals the relative frequencies of the words:

The formula used to calculate the probability of a word in the provided code is:

Probability(word) = Frequency(word) / Total count of all words

In [8]:
Total = sum(words_freq_dict.values())
probs = {}

for k in words_freq_dict.keys():
    probs[k] = words_freq_dict[k] / Total

In [9]:
probs

{'the': 0.06603252448767868,
 'project': 0.0004086893646452262,
 'gutenberg': 0.0004221626404027611,
 'ebook': 4.4910919191783095e-05,
 'of': 0.030278941719100165,
 'moby': 0.0004041982727260479,
 'dick': 0.0004041982727260479,
 'or': 0.003579400259585113,
 'whale': 0.005524043060589321,
 'by': 0.005488114325235894,
 'herman': 1.796436767671324e-05,
 'melville': 1.796436767671324e-05,
 'this': 0.006462681271697588,
 'is': 0.007863901950481221,
 'for': 0.007383355115129142,
 'use': 0.0002200635040397372,
 'anyone': 2.694655151506986e-05,
 'anywhere': 7.185747070685296e-05,
 'at': 0.005995607712103043,
 'no': 0.002667708599991916,
 'cost': 1.796436767671324e-05,
 'and': 0.029268446037285047,
 'with': 0.00794474160502643,
 'almost': 0.000884745108078127,
 'restrictions': 8.98218383835662e-06,
 'whatsoever': 3.143764343424817e-05,
 'you': 0.004302466058572821,
 'may': 0.001145228439390469,
 'copy': 8.533074646438789e-05,
 'it': 0.011380426923197837,
 'give': 0.0004041982727260479,
 'away':

# Finding Similar Words
Now we will sort similar words according to the Jaccard distance by calculating the 2 grams Q of the words. Next, we will return the 5 most similar words ordered by similarity and probability:


The Jaccard distance measures the dissimilarity between two sets by comparing their intersection and union

In [10]:
def autocorrect(word):
    word = word.lower()
    if word in V:
        return('Your word seems to be correct', word)
    else:
        similarities = [1-(textdistance.Jaccard(qval=2).distance(v,word))  for v in words_freq_dict.keys()]
        df = pd.DataFrame.from_dict(probs, orient='index').reset_index()
        df = df.rename(columns={'index':'Word',0:'Prob'})
        df['Similarity'] = similarities
        output = df.sort_values(['Similarity','Prob'],ascending=False).head(3)
        return(output)
autocorrect("helo")

Unnamed: 0,Word,Prob,Similarity
2969,help,0.000184,0.5
3031,held,0.000166,0.5
4473,helm,0.000157,0.5
