In [13]:
import bs4
import requests
import json
import modules.Levenstein.levenstein as levenstein
from random import shuffle
from thefuzz import process, fuzz
from modules.CountMinSketch.count_min_sketch import CountMinSketch
from modules.BloomFilter.bloom_filter2 import BloomFilter2
from modules.Dataset.dataset import Dataset

In [6]:
ds = Dataset(links_base_url='https://en.wikipedia.org/wiki/', 
             links_uri='Wikipedia:Good_articles/By_length',
             links_column='Article')

words_list = ds.build_words_list(num_of_articles = 12)

In [7]:
len(words_list)

238280

In [8]:
NUM_COUNTERS = 200000
cms = CountMinSketch(NUM_COUNTERS)

for word in words_list:
    cms.increment(word)

In [9]:
n = len(words_list) #no of items to add 
p = 0.01 #false positive probability 
bloomf = BloomFilter2(n,p)

for word in words_list:
    bloomf.add(word)

In [18]:
#some_word_with_error = "appel"
some_sentance_with_error = "I wanna appel pleases or a bannana with a paer and a lemone, or a wartermelon if possibe. I hope i don't have an errof"
new_sentance = []
for word in some_sentance_with_error.split():
    if not word in bloomf:
        success_percentage = max((len(word) - 1) / len(word) * 100, 75)
        print(f"Scanning {word}, success_percentage is {success_percentage}")
        word_results = [x for x in process.extract(word, words_list, scorer=fuzz.ratio)]# if x[1] >= success_percentage]
        if (len(word_results) > 0 and word_results[0][1] < 100):
            list_fixes = [x[0] for x in word_results]
            list_fixes = list(set(list_fixes))
            for fix in list_fixes:
                print(f"Frequency of {fix} => {cms.approximateCount(fix)} and match rate is {word_results[list_fixes.index(fix)][1]}")
            #new_sentance.append(word_results[0][0] if word_results[0][1] >= success_percentage else word)
        else:
            continue
            #new_sentance.append(word)
    else:
        print(f"Word {word} is correct. Skipping")

#print(' '.join(new_sentance))

Word I is correct. Skipping
Scanning wanna, success_percentage is 80.0
Scanning appel, success_percentage is 80.0
Frequency of Appeal => 3 and match rate is 91
Frequency of appeal => 7 and match rate is 91
Scanning pleases, success_percentage is 85.71428571428571
Frequency of Pleasure => 1 and match rate is 92
Frequency of lease => 1 and match rate is 92
Frequency of Please => 2 and match rate is 86
Frequency of pleased => 5 and match rate is 83
Frequency of please => 2 and match rate is 80
Word or is correct. Skipping
Word a is correct. Skipping
Scanning bannana, success_percentage is 85.71428571428571
Frequency of Bannon => 1 and match rate is 77
Frequency of Anna => 9 and match rate is 73
Word with is correct. Skipping
Word a is correct. Skipping
Scanning paer, success_percentage is 75.0
Frequency of paper. => 1 and match rate is 89
Frequency of Paper. => 1 and match rate is 89
Frequency of paper => 5 and match rate is 89
Frequency of Paper => 1 and match rate is 89
Frequency of pap

In [19]:
#some_word_with_error = "appel"
some_sentance_with_error = "I wanna appel pleases or a bannana with a paer and a lemone, or a wartermelon if possibe. I hope i don't have an errof"
new_sentance = []
for word in some_sentance_with_error.split():
    if not word in bloomf:
        target_distance = 1
        print(f"Scanning {word}, target_distance is {target_distance}")
        word_results = [x for x in words_list if levenstein.levenshtein_two_matrix_rows(x, word) == 1]
        for word_fix in list(set(word_results)):
            print(f"Potential fix of {word} is {word_fix}. Frequency is {cms.approximateCount(word_fix)}")
            #new_sentance.append(word_results[0][0] if word_results[0][1] >= success_percentage else word)
        else:
            continue
            #new_sentance.append(word)
    else:
        print(f"Word {word} is correct. Skipping")

#print(' '.join(new_sentance))

Word I is correct. Skipping
Scanning wanna, target_distance is 1
Potential fix of wanna is wanga. Frequency is 3
Potential fix of wanna is Hanna. Frequency is 3
Potential fix of wanna is Wanna. Frequency is 2
Scanning appel, target_distance is 1
Potential fix of appel is appeal. Frequency is 7
Scanning pleases, target_distance is 1
Potential fix of pleases is pleased. Frequency is 5
Potential fix of pleases is please. Frequency is 2
Word or is correct. Skipping
Word a is correct. Skipping
Scanning bannana, target_distance is 1
Word with is correct. Skipping
Word a is correct. Skipping
Scanning paer, target_distance is 1
Potential fix of paer is pair. Frequency is 10
Potential fix of paer is Baer. Frequency is 1
Potential fix of paer is par. Frequency is 1
Potential fix of paer is paper. Frequency is 5
Potential fix of paer is per. Frequency is 24
Word and is correct. Skipping
Word a is correct. Skipping
Scanning lemone,, target_distance is 1
Word or is correct. Skipping
Word a is corre