In [None]:
from TRIE_implementation import *
# Explicit stdlib imports follow the star import so they cannot be shadowed by it.
import os
import re
import string

# Corpus reading

In [None]:
import requests
def download_save(url, name, timeout=30):
    """Download ``url`` and write the response body to the file ``name``.

    Args:
        url: Address of the resource to fetch.
        name: Path of the local file to create or overwrite (binary mode).
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never saved as if it were the dataset.
        requests.RequestException: On connection problems or timeouts.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of writing the error body to disk.
    r.raise_for_status()
    with open(name, 'wb') as f:
        f.write(r.content)
    # Announce success only once the file has actually been written.
    print('\n%%%%%% Dataset downloaded %%%%%')


In [None]:
# Local file name of the word list. It is a relative path, so it is resolved
# against the current working directory both here and when it is opened later.
name = 'corpus.txt'
# Download only when the file is not already present. The relative path is
# checked directly: the previous check glued the working directory and the
# name together with a hard-coded backslash, which only forms a valid path on
# Windows (and is an invalid escape sequence inside the string literal).
if not os.path.exists(name):
    download_save('https://www.ime.usp.br/~pf/dicios/br-utf8.txt', name)

In [None]:
def read_corpus(filename):
  """Read a UTF-8 text file and return its words as a flat list.

  Every line is lowercased and split into maximal runs of word characters
  (``\\w+``); the runs are returned in file order, duplicates included.
  """
  tokens = []
  with open(filename, "r", encoding="utf8") as corpus:
    # Iterating the file object yields the same lines readlines() would.
    for raw_line in corpus:
      tokens.extend(re.findall(r'\w+', raw_line.lower()))
  return tokens

In [None]:
# Tokenise the corpus file into a flat list of lowercase words (duplicates kept).
words = read_corpus(name)

## Initialize the TRIE with the corpus

In [None]:
# Build the trie: the root node is created with the label '*', then every
# distinct corpus word is inserted once (set() drops repeated words).
# NOTE(review): TrieNode and add are presumably provided by TRIE_implementation — confirm.
root = TrieNode('*')
for word in set(words):
    add(root,word)

## Example: suggesting words from the TRIE for a given prefix

In [None]:

# Look up the prefix 'abs'. find_prefix returns a 3-tuple; its second item is
# used below as the trie node from which suggestions are collected.
# NOTE(review): this assumes the prefix exists in the corpus; how
# suggestions_trie behaves if `radical` is None is not visible here — confirm.
exist, radical, n = find_prefix(root, 'abs')
# The empty list is presumably an accumulator for the results — confirm against TRIE_implementation.
a = suggestions_trie(radical, 'abs', [])
# Message is in Portuguese: "Numero de sugestoes" = "number of suggestions".
print(f'Numero de sugestoes: {len(a)}; \n {a}')

# Types of misspelling

In [None]:
def split(word):
  """Return every (prefix, suffix) cut of ``word``.

  The cuts run from ``('', word)`` to ``(word, '')``, so a word of length n
  yields n + 1 pairs.
  """
  cuts = []
  for position in range(len(word) + 1):
    cuts.append((word[:position], word[position:]))
  return cuts

In [None]:
def delete(word):
  """Return the strings obtained by removing one character from ``word``.

  Results are ordered by the position of the removed character, left to right.
  """
  shortened = []
  for head, tail in split(word):
    if not tail:
      # Nothing left to remove at the final cut.
      continue
    shortened.append(head + tail[1:])
  return shortened

In [None]:
def swap(word):
  """Return the strings obtained by transposing two adjacent characters of ``word``.

  Results are ordered by the position of the transposed pair, left to right.
  """
  transposed = []
  for head, tail in split(word):
    if len(tail) < 2:
      # A transposition needs two characters after the cut.
      continue
    transposed.append(head + tail[1] + tail[0] + tail[2:])
  return transposed

In [None]:
def replace(word, letters=string.ascii_lowercase + 'áàâãçéêíóôõú'):
  """Return the strings obtained by substituting one character of ``word``.

  Args:
    word: The word to edit.
    letters: Alphabet of replacement characters. The default is a-z plus the
      accented lowercase letters of Portuguese: the corpus is a Portuguese
      word list, and with a-z alone a fix such as c -> ç could never be
      produced.

  Returns:
    A list ordered by position, then by ``letters``; it includes ``word``
    itself whenever a character is replaced by the same character.
  """
  return [l + c + r[1:] for l, r in split(word) if r for c in letters]

In [None]:
def insert(word, letters=string.ascii_lowercase + 'áàâãçéêíóôõú'):
  """Return the strings obtained by inserting one character into ``word``.

  Args:
    word: The word to edit.
    letters: Alphabet of insertable characters. The default is a-z plus the
      accented lowercase letters of Portuguese: the corpus is a Portuguese
      word list, and with a-z alone a dropped accented letter could never be
      restored.

  Returns:
    A list ordered by insertion position (including before the first and
    after the last character), then by ``letters``.
  """
  return [l + c + r for l, r in split(word) for c in letters]

In [None]:
def mod1(word):
  """Return the set of strings exactly one edit operation away from ``word``.

  The edit operations are deletion, adjacent transposition, substitution and
  insertion of a single character.
  """
  one_edit = set()
  for edit in (delete, swap, replace, insert):
    one_edit.update(edit(word))
  return one_edit

In [None]:
def mod2(word):
  """Return the set of strings reachable by applying ``mod1`` twice to ``word``."""
  two_edits = set()
  for first_edit in mod1(word):
    two_edits.update(mod1(first_edit))
  return two_edits

In [None]:
def correct_spelling(root, word):
  """Suggest dictionary words close to ``word``.

  Candidates are limited to trie words that start with the same first letter
  as ``word``. Words one edit away are preferred; words two edits away are
  tried only when no one-edit candidate is in the dictionary.

  Args:
    root: Root node of the trie holding the dictionary.
    word: The (lowercase) word to check.

  Returns:
    ``word`` itself when it is empty, is already a complete dictionary word,
    or has no candidate correction; otherwise a sorted list of candidate
    corrections.
  """
  # An empty token has no first letter to anchor the trie lookup below.
  if not word:
    return word

  exist_in_trie, radical, _ = find_prefix(root, word)
  if exist_in_trie and radical.word_finished:
    print(f"{word} is already correctly spelt")
    return word

  # Collect every dictionary word sharing the first letter of the input.
  word_rad = word[0]
  _, radical, _ = find_prefix(root, word_rad)
  if radical is None:
    # No dictionary word starts with this letter: nothing to suggest, so hand
    # the word back like the other "no correction" path does.
    return word

  # A set gives constant-time membership tests.
  # NOTE(review): assumes suggestions_trie returns an iterable of strings — confirm.
  trie_suggestions = set(suggestions_trie(radical, word_rad, []))

  # mod1() is never empty, so the dictionary filter must be applied before
  # falling back to mod2(); sorting makes the output order reproducible.
  best_guesses = sorted(mod1(word) & trie_suggestions)
  if not best_guesses:
    best_guesses = sorted(mod2(word) & trie_suggestions)
  return best_guesses or word

# REAL TEST

In [None]:
# Sample Portuguese sentences containing misspelled words (e.g. "feramentas",
# "capases", "quantdade"), used as input for spell_check.
sentences = [" As feramentas tradicionais de negócios não são capases de processar uma enorme quantdade de dados não estruturados. Dessa forma, a ciência de dados oferece soluções mais avançadas para analisar grandes volumes de informações provenientes de diferentes tipos de fontes, como registros financeiros, arquivos multimídia, formulários de marketing, sensores, instrumentos e arquivos de texto.",
            'Este e umm tsto qe demonstra cmo o algoritimo funciona', 
            'O Pojeto Acadêmico da UFABC pocura levar em conta as mudancas no canpo da ciênsia, propondo uma matris interdicipinar']

In [None]:
def spell_check(sentence):
    """Print ``sentence`` in lowercase with unknown words replaced by suggestions.

    The sentence is split on single non-word characters, which are echoed
    unchanged. Each word that is a complete entry of the global trie ``root``
    is echoed as is; any other word is replaced by the result of
    ``correct_spelling`` (a list of candidates, or the word itself when there
    are none). A blank line is printed after the sentence.

    Args:
        sentence: The text to check.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    for word in re.split(r'(\W)', sentence.lower()):
        # Empty strings (produced between adjacent separators) and every
        # separator (space, punctuation, newline) are printed untouched
        # rather than being sent to the corrector.
        if not word or re.fullmatch(r'\W', word):
            print(word, end='')
            continue
        found, node, _ = find_prefix(root, word)
        # A prefix match alone is not enough: the node must end a word.
        if found and node.word_finished:
            print(word, end='')
        else:
            print(correct_spelling(root, word), end='')
    print('\n')

In [None]:
# Run the spell checker on each sample sentence; spell_check prints its output.
for sentence in sentences:
    spell_check(sentence)