In [5]:
import bs4
import requests
import json
import os
from random import shuffle
from DeezyMatch import train as dm_train, plot_log, finetune as dm_finetune, inference as dm_inference, combine_vecs, candidate_ranker, candidate_ranker_init
from modules.CountMinSketch.count_min_sketch import CountMinSketch
from modules.BloomFilter.bloom_filter2 import BloomFilter2
from modules.Dataset.dataset import Dataset

In [2]:
ds = Dataset(links_base_url='https://en.wikipedia.org/wiki/', 
             links_uri='Wikipedia:Good_articles/By_length',
             links_column='Article')

words_list = ds.build_words_list(num_of_articles = 20)

In [3]:
len(words_list)

387264

In [6]:
PATH = './deezymatch_generator/deezymatch_train/candidates'
if not os.path.exists(PATH):
    os.makedirs(PATH)

with open(f"{PATH}/dataset_candidates.txt", 'w') as f:
    f.write('\n'.join(words_list))

In [29]:
NUM_COUNTERS = 300000
cms = CountMinSketch(NUM_COUNTERS)

for word in words_list:
    cms.increment(word)

In [30]:
n = len(words_list) #no of items to add 
p = 0.01 #false positive probability 
bloomf = BloomFilter2(n,p)

for word in words_list:
    bloomf.add(word)

In [31]:
#some_word_with_error = "appel"
some_sentance_with_error = "I wanna appel pleases or a bannana with a paer and a lemone, or a wartermelon if possibe. I hope i don't have an errof"
new_sentance = []
for word in some_sentance_with_error.split():
    if not word in bloomf:
        success_percentage = max((len(word) - 1) / len(word) * 100, 75)
        print(f"Scanning {word}, success_percentage is {success_percentage}")
        word_results = [x for x in process.extract(word, words_list, scorer=fuzz.ratio)]# if x[1] >= success_percentage]
        if (len(word_results) > 0 and word_results[0][1] < 100):
            list_fixes = [x[0] for x in word_results]
            list_fixes = list(set(list_fixes))
            for fix in list_fixes:
                print(f"Potential fix of {word} is {fix}. Frequency is {cms.approximateCount(fix)} and match rate is {word_results[list_fixes.index(fix)][1]}")
            #new_sentance.append(word_results[0][0] if word_results[0][1] >= success_percentage else word)
        else:
            continue
            #new_sentance.append(word)
    else:
        print(f"Word {word} is correct. Skipping")

#print(' '.join(new_sentance))

Word I is correct. Skipping
Scanning wanna, success_percentage is 80.0
Scanning appel, success_percentage is 80.0
Scanning pleases, success_percentage is 85.71428571428571
Potential fix of pleases is Please. Frequency is 8 and match rate is 92
Word or is correct. Skipping
Word a is correct. Skipping
Scanning bannana, success_percentage is 85.71428571428571
Potential fix of bannana is Banana. Frequency is 1 and match rate is 92
Potential fix of bannana is Bannon. Frequency is 4 and match rate is 83
Potential fix of bannana is Hannan. Frequency is 5 and match rate is 77
Potential fix of bannana is Annan. Frequency is 1 and match rate is 77
Potential fix of bannana is Hannan,. Frequency is 4 and match rate is 77
Word with is correct. Skipping
Word a is correct. Skipping
Scanning paer, success_percentage is 75.0
Potential fix of paer is paper. Frequency is 14 and match rate is 89
Potential fix of paer is Pater. Frequency is 3 and match rate is 89
Potential fix of paer is Paper. Frequency i

In [32]:
#some_word_with_error = "appel"
some_sentance_with_error = "I wanna appel pleases or a bannana with a paer and a lemone, or a wartermelon if possibe. I hope i don't have an errof"
new_sentance = []
for word in some_sentance_with_error.split():
    if not word in bloomf:
        target_distance = 1
        print(f"Scanning {word}, target_distance is {target_distance}")
        word_results = [x for x in words_list if levenstein.levenshtein_two_matrix_rows(x, word) == 1]
        for word_fix in list(set(word_results)):
            print(f"Potential fix of {word} is {word_fix}. Frequency is {cms.approximateCount(word_fix)}")
            #new_sentance.append(word_results[0][0] if word_results[0][1] >= success_percentage else word)
        else:
            continue
            #new_sentance.append(word)
    else:
        print(f"Word {word} is correct. Skipping")

#print(' '.join(new_sentance))

Word I is correct. Skipping
Scanning wanna, target_distance is 1
Potential fix of wanna is Wanna. Frequency is 1
Potential fix of wanna is Hanna. Frequency is 1
Scanning appel, target_distance is 1
Potential fix of appel is Oppel. Frequency is 3
Potential fix of appel is Appel. Frequency is 2
Potential fix of appel is appeal. Frequency is 3
Scanning pleases, target_distance is 1
Potential fix of pleases is pleased. Frequency is 2
Potential fix of pleases is please. Frequency is 1
Word or is correct. Skipping
Word a is correct. Skipping
Scanning bannana, target_distance is 1
Word with is correct. Skipping
Word a is correct. Skipping
Scanning paer, target_distance is 1
Potential fix of paer is pair. Frequency is 11
Potential fix of paer is par. Frequency is 4
Potential fix of paer is paper. Frequency is 14
Potential fix of paer is per. Frequency is 89
Potential fix of paer is Caer. Frequency is 1
Potential fix of paer is pier. Frequency is 1
Word and is correct. Skipping
Word a is correc