In [38]:
import bs4
import requests
import json
import time
from random import shuffle
from thefuzz import process, fuzz
from modules.CountMinSketch.count_min_sketch2 import CountMinSketch2
from modules.BloomFilter.bloom_filter2 import BloomFilter2
from modules.Dataset.dataset import Dataset

In [34]:
ds = Dataset(links_base_url='https://en.wikipedia.org/wiki/', 
             links_uri='Wikipedia:Good_articles/By_length',
             links_column='Article')

words_list = ds.build_words_list(num_of_articles = 20)

In [35]:
len(words_list)

387084

In [36]:
NUM_COUNTERS = 300000
NUM_HASH_FUNCTIONS = 100
cms = CountMinSketch2(buckets=NUM_COUNTERS, tables=NUM_HASH_FUNCTIONS)

for word in words_list:
    cms.increment(word)

In [37]:
n = len(words_list) #no of items to add 
p = 0.01 #false positive probability 
bloomf = BloomFilter2(n,p)

for word in words_list:
    bloomf.add(word)

In [43]:
start = time.time()

some_sentance_with_error = "I wanna appel pleases or a bannana with a paer and a lemone, or a wartermelon if possibe. I hope i don't have an errof"
new_sentance = []
for word in some_sentance_with_error.split():
    if not word in bloomf:
        success_percentage = max((len(word) - 1) / len(word) * 100, 75)
        print(f"Scanning {word}, success_percentage is {success_percentage}")
        print("==========================")
        word_results = [x for x in process.extract(word, words_list, scorer=fuzz.ratio)]# if x[1] >= success_percentage]
        if (len(word_results) > 0 and word_results[0][1] < 100):
            list_fixes = [x[0] for x in word_results]
            list_fixes = list(set(list_fixes))
            for fix in list_fixes:
                print(f"Potential fix of {word} is {fix}")
                print(f"Frequency is {cms.count(fix)}")
                print(f"Match rate is {word_results[list_fixes.index(fix)][1]}")
                print("==========================")
            #new_sentance.append(word_results[0][0] if word_results[0][1] >= success_percentage else word)
        else:
            continue
            #new_sentance.append(word)
    else:
        print(f"Word {word} is correct. Skipping")
        print("==========================")

end = time.time()
print(f"Total runtime of {(end - start)} seconds")
#print(' '.join(new_sentance))

Word I is correct. Skipping
Scanning wanna, success_percentage is 80.0
Potential fix of wanna is Anna;
Frequency is 2.0
Match rate is 89
Potential fix of wanna is Anna
Frequency is 8.0
Match rate is 89
Scanning appel, success_percentage is 80.0
Scanning pleases, success_percentage is 85.71428571428571
Potential fix of pleases is "Please
Frequency is 3.0
Match rate is 92
Potential fix of pleases is Please,
Frequency is 1.0
Match rate is 92
Potential fix of pleases is Please
Frequency is 4.0
Match rate is 92
Word or is correct. Skipping
Word a is correct. Skipping
Scanning bannana, success_percentage is 85.71428571428571
Potential fix of bannana is Bafana
Frequency is 1.0
Match rate is 83
Potential fix of bannana is annona
Frequency is 2.0
Match rate is 83
Potential fix of bannana is Annan
Frequency is 2.0
Match rate is 77
Word with is correct. Skipping
Word a is correct. Skipping
Scanning paer, success_percentage is 75.0
Potential fix of paer is Paper.
Frequency is 1.0
Match rate is 89
