In [78]:
import numpy as np
import pandas as pd
import os
import json
import difflib
import requests
import bs4
from thefuzz import process, fuzz
from random import shuffle 
from modules.BloomFilter.bloom_filter import BloomFilter

In [125]:
with open('wordsapi_sample.json', 'r') as f:
    words_file = json.loads(f.read())
words_list = list(words_file.keys())

In [126]:
WEBSITE_SCRAPES = [
    "https://en.wikipedia.org/wiki/Elvis_Presley",
    "https://en.wikipedia.org/wiki/History_of_Poland_(1945%E2%80%931989)",
    "https://en.wikipedia.org/wiki/Manhattan_Project",
    "https://en.wikipedia.org/wiki/Military_history_of_Puerto_Rico",
    "https://en.wikipedia.org/wiki/Pope_Pius_XII"
]

for WEBSITE in WEBSITE_SCRAPES:
    response = requests.get(WEBSITE,headers={'User-Agent': 'Mozilla/5.0'})
    soup = bs4.BeautifulSoup(response.text,'html.parser')
    
    articale_list = soup.body.get_text(' ', strip=True).split()
    words_list.extend(articale_list)

In [128]:
shuffle(words_list)
len(words_list)

173584

In [129]:
n = len(words_list) #no of items to add 
p = 0.01 #false positive probability 
bloomf = BloomFilter(n,p)

In [130]:
for word in words_list:
    bloomf.add(word)

In [109]:
some_words = "Hello my name is Stav Cohen and I'm 28 from Herzliya, and I don't know why there is no any words in this list. Maybe the word apple exists ? I found out that fruits exist like banana, pear, mango, watermelon"
some_words = some_words.replace(',', '').replace('.', '').replace('?', '').replace('!', '').lower().split()
print(some_words)

['hello', 'my', 'name', 'is', 'stav', 'cohen', 'and', "i'm", '28', 'from', 'herzliya', 'and', 'i', "don't", 'know', 'why', 'there', 'is', 'no', 'any', 'words', 'in', 'this', 'list', 'maybe', 'the', 'word', 'apple', 'exists', 'i', 'found', 'out', 'that', 'fruits', 'exist', 'like', 'banana', 'pear', 'mango', 'watermelon']


In [110]:
for word in some_words:
    if bloomf.check(word):
        print(f"{word} exists in the words list")

my exists in the words list
name exists in the words list
is exists in the words list
and exists in the words list
28 exists in the words list
from exists in the words list
and exists in the words list
i exists in the words list
don't exists in the words list
know exists in the words list
why exists in the words list
there exists in the words list
is exists in the words list
no exists in the words list
any exists in the words list
words exists in the words list
in exists in the words list
this exists in the words list
list exists in the words list
the exists in the words list
word exists in the words list
apple exists in the words list
i exists in the words list
found exists in the words list
out exists in the words list
that exists in the words list
like exists in the words list


In [139]:
#some_word_with_error = "appel"
some_sentance_with_error = "I wanna apple pleases or a bannana with a paer and a lemone, or a wartermelon if possibe"
new_sentance = []
for word in some_sentance_with_error.split():
    success_percentage = max((len(word) - 1) / len(word) * 100, 75)
    print(f"Scanning {word}, success_percentage is {success_percentage}")
    word_results = [x for x in process.extract(word, words_list, scorer=fuzz.ratio) if x[1] >= success_percentage]
    if (len(word_results) > 0 and word_results[0][1] < 100):
        print(word_results)
        new_sentance.append(word_results[0][0] if word_results[0][1] >= success_percentage else word)
    else:
        new_sentance.append(word)

print(' '.join(new_sentance))

Scanning I, success_percentage is 75
Scanning wanna, success_percentage is 80.0
Scanning apple, success_percentage is 80.0
Scanning pleases, success_percentage is 85.71428571428571
[('Please', 92), ('Please', 92), ('please', 92), ('please', 92), ('Please', 92)]
Scanning or, success_percentage is 75
Scanning a, success_percentage is 75
Scanning bannana, success_percentage is 85.71428571428571
[('Banana', 92), ('banana,', 92), ('bandana', 86)]
Scanning with, success_percentage is 75.0
Scanning a, success_percentage is 75
Scanning paer, success_percentage is 75.0
[('Paper', 89), ('paper', 89), ('Paper', 89), ('per', 86), ('per', 86)]
Scanning and, success_percentage is 75
Scanning a, success_percentage is 75
Scanning lemone,, success_percentage is 85.71428571428571
[('Leone', 91)]
Scanning or, success_percentage is 75
Scanning a, success_percentage is 75
Scanning wartermelon, success_percentage is 90.9090909090909
Scanning if, success_percentage is 75
Scanning possibe, success_percentage 

In [113]:
bloomf.check("appel")

False