In [54]:
import bs4
import requests
import json
import time
import os
import pandas as pd
import numpy as np
from random import shuffle
from thefuzz import process, fuzz
from modules.CountMinSketch.count_min_sketch2 import CountMinSketch2
from modules.BloomFilter.bloom_filter2 import BloomFilter2
from modules.Dataset.dataset import Dataset

In [45]:
def format_word(word):
  if len(word) > 1:
    last_char = word[-1]
    while not last_char.isalpha():
      word = word[:-1]
      last_char = word[-1]

    first_char = word[0]
    while not first_char.isalpha():
      word = word[1:]
      first_char = word[0]

    word = word.lower()

  return word

In [46]:
words_set = []
for i in range(10):
  candidates_file = f"./dataset/dataset_candidates_{i}.txt"
  with open(candidates_file, 'r') as f:
    words_list = f.read().split('\n')
    words_set.append({"words_list": words_list})

In [60]:
NUM_COUNTERS = 300000
NUM_HASH_FUNCTIONS = 100

for index, words_list in enumerate(words_set):
    print(f"Init CMS - {index + 1} of {len(words_set)}")
    cms = CountMinSketch2(buckets=NUM_COUNTERS, tables=NUM_HASH_FUNCTIONS)
    for word in words_list["words_list"]:
        cms.increment(word)
    words_set[index]['words_cms'] = cms
    del cms

Init CMS - 1 of 10
Init CMS - 2 of 10
Init CMS - 3 of 10
Init CMS - 4 of 10
Init CMS - 5 of 10
Init CMS - 6 of 10
Init CMS - 7 of 10
Init CMS - 8 of 10
Init CMS - 9 of 10
Init CMS - 10 of 10


In [65]:
p = 0.01 #false positive probability 

for index, words_list in enumerate(words_set):
    print(f"Init BLOOM - {index + 1} of {len(words_set)}")
    n = len(words_list["words_list"]) #no of items to add
    bloomf = BloomFilter2(n,p)
    for word in words_list["words_list"]:
        bloomf.add(word)
    words_set[index]['words_bloomf'] = bloomf
    del bloomf

Init BLOOM - 1 of 10
Init BLOOM - 2 of 10
Init BLOOM - 3 of 10
Init BLOOM - 4 of 10
Init BLOOM - 5 of 10
Init BLOOM - 6 of 10
Init BLOOM - 7 of 10
Init BLOOM - 8 of 10
Init BLOOM - 9 of 10
Init BLOOM - 10 of 10


In [70]:
#some_word_with_error = "appel"
some_sentance_with_error = "I wanna appel pleases or a bannana with a paer and a lemone, or a wartermelon if possibe. I hope i don't have an errof"
correct_sentence = "I wanna apple please or a banana with a pear and a lemon, or a watermelon if possible. I hope i don't have an error"
diff_words_to_correct = list(set([format_word(x) for x in correct_sentence.split()]) - set([format_word(x) for x in some_sentance_with_error.split()]))
new_sentance = []
words_set_out = []

for index, words_list in enumerate(words_set):
    time_start_naive = time.time()
    words_set_out.append({})
    words_set_out[index]["matched_words"] = []
    words_set_out[index]["matched_scores"] = []
    words_set_out[index]["words_frequency"] = []
    start = time.time()
    for word in some_sentance_with_error.split():
        word = format_word(word)
        if not word in words_list["words_bloomf"]:
            success_percentage = max((len(word) - 1) / len(word) * 100, 75)
            print(f"Scanning {word}, success_percentage is {success_percentage}")
            print("==========================")
            word_results = [(format_word(x[0]), x[1]) for x in process.extract(word, words_list["words_list"], scorer=fuzz.ratio)]# if x[1] >= success_percentage]
            word_results = list(set(word_results))
            if (len(word_results) > 0 and word_results[0][1] < 100):
                list_fixes = list([x[0] for x in word_results])
                list_scores = list([x[1] for x in word_results])
                list_freq = list([words_list["words_cms"].count(x[0]) for x in word_results])
                for idx, fix in enumerate(list_fixes):
                    print(f"Potential fix of {word} is {fix}")
                    print(f"Frequency is {list_freq[idx]}")
                    print(f"Match rate is {list_scores[idx]}")
                    print("==========================")
                words_set_out[index]["matched_words"].append(set(list_fixes) if len(list_fixes) > 1 else list_fixes[0])
                words_set_out[index]["matched_scores"].append(set(list_scores) if len(list_scores) > 1 else list_scores[0])
                words_set_out[index]["words_frequency"].append(set(list_freq) if len(list_freq) > 1 else list_freq[0])
            else:
                words_set_out[index]["matched_words"].append(None)
                words_set_out[index]["matched_scores"].append(None)
                words_set_out[index]["words_frequency"].append(None)
                continue
        else:
            print(f"Word {word} is correct. Skipping")
            words_set_out[index]["matched_words"].append(None)
            words_set_out[index]["matched_scores"].append(None)
            words_set_out[index]["words_frequency"].append(None)
    
    end = time.time()
    print(f"Total runtime of {(end - start)} seconds")
    words_set_out[index]["time_sec"] = (end - start)

Word I is correct. Skipping
Word wanna is correct. Skipping
Scanning appel, success_percentage is 80.0
Potential fix of appel is appeal
Frequency is 9.0
Match rate is 91
Scanning pleases, success_percentage is 85.71428571428571
Potential fix of pleases is please
Frequency is 1.0
Match rate is 92
Word or is correct. Skipping
Word a is correct. Skipping
Scanning bannana, success_percentage is 85.71428571428571
Potential fix of bannana is bangunan
Frequency is 0.0
Match rate is 80
Potential fix of bannana is bananas
Frequency is 1.0
Match rate is 86
Potential fix of bannana is bandanna
Frequency is 2.0
Match rate is 80
Potential fix of bannana is lannan
Frequency is 0.0
Match rate is 77
Word with is correct. Skipping
Word a is correct. Skipping
Scanning paer, success_percentage is 75.0
Potential fix of paer is paper
Frequency is 15.0
Match rate is 89
Word and is correct. Skipping
Word a is correct. Skipping
Scanning lemone, success_percentage is 83.33333333333334
Potential fix of lemone i

In [71]:
OUT_DIR = './output/2-count-min-sketch+bloom_freq'
if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

In [72]:
for index, words_list in enumerate(words_set_out):
    df_comparisons = pd.DataFrame(data=words_list, index=[format_word(x) for x in correct_sentence.split()])
    df_comparisons = df_comparisons.groupby(df_comparisons.index).first()
    df_comparisons.to_csv(f'{OUT_DIR}/dataset_{index}.csv')

In [73]:
df_comparisons

Unnamed: 0,matched_words,matched_scores,words_frequency,time_sec
I,,,,0.38325
a,,,,0.38325
an,,,,0.38325
and,,,,0.38325
apple,appeal,91,13.0,0.38325
banana,"{gannan, nana, qiannan, anna, bandanna}","{80, 73, 77, 71}","{0.0, 1.0}",0.38325
don't,,,,0.38325
error,"{error, errors}","{80, 73}","{2.0, 4.0}",0.38325
have,,,,0.38325
hope,,,,0.38325
