# Libs

In [1]:
import requests
import re
import pandas as pd
import time
from bs4 import BeautifulSoup
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
from spacy.language import Language

# Functions

In [2]:
def scrape_dict(word, timeout=10):
    """Scrape the definitions of ``word`` from the Priberam website.

    Args:
        word: Word to look up. Every "." is replaced by "-" before the
            word is appended to the lookup URL.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        A list with the text of every ``<span class="def">`` found inside
        the first element whose id is ``resultados``. The list is empty
        when the page contains no such element.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails
            or times out.
    """
    word = word.replace(".", "-")
    link = 'https://www.priberam.pt/dlpo/' + word
    response = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all(id='resultados')
    if not results:
        # No results container on the page: report "no definitions"
        # instead of failing with IndexError on results[0].
        return []
    definitions = results[0].find_all('span', class_='def')
    return [definition.get_text() for definition in definitions]

In [3]:
nlp = Language(Vocab())  # Spacy variables
tokenizer = Tokenizer(nlp.vocab)

# One word, optionally joined to a second word by a single hyphen or dot.
# Written as a raw string: "\." inside a plain literal is an invalid escape
# sequence. Compiled once so the loop below does not rebuild/look it up twice
# per token.
# NOTE(review): the range À-ÿ (U+00C0..U+00FF) also contains "×" and "÷".
_WORD_PATTERN = re.compile(r'([A-Za-zÀ-ÿ]+(-|\.)[A-Za-zÀ-ÿ]+|[A-Za-zÀ-ÿ]+)')


def tokenization_process(string):
    """Tokenize one string and keep only the word part of each token.

    The string is split with the module-level spaCy ``tokenizer``. For each
    token the first match of the word pattern is kept (lower-cased), which
    drops surrounding punctuation and digits; tokens without any letters
    are discarded.

    Args:
        string: Text to tokenize.

    Returns:
        A list of lower-case words, in the order they appear in ``string``.
    """
    output = []
    for token in tokenizer(string):
        match = _WORD_PATTERN.search(str(token))
        if match:
            output.append(match.group(0).lower())
    return output

In [4]:
def tokenization(deflist):
    """Turn scraped definitions into a sorted list of unique tokens.

    Each string in ``deflist`` is passed through ``tokenization_process``;
    the resulting tokens are merged, de-duplicated and sorted in ascending
    order, ready to be added to the "Dictionary" DataFrame.
    """
    unique_tokens = set()
    for definition in deflist:
        unique_tokens.update(tokenization_process(definition))
    return sorted(unique_tokens)

In [5]:
def create_Dictionary(initial_word):
    """Build the initial dictionary DataFrame.

    The frame holds a single row: ``initial_word`` in lower case under
    ``Words``, flagged ``True`` under ``Searched``.
    """
    first_row = {'Words': [initial_word.lower()], 'Searched': [True]}
    return pd.DataFrame(data=first_row)

In [6]:
def append_Dictionary(tokenized_list, Dictionary):
    """Return a new dictionary with the unseen words of a list added.

    Args:
        tokenized_list: Iterable of candidate words.
        Dictionary: DataFrame with the columns ``Words`` and ``Searched``.

    Returns:
        A new DataFrame containing the rows of ``Dictionary`` followed by
        one row per word that was not yet present, each flagged
        ``Searched == False``. The index is renumbered from 0 and
        ``Dictionary`` itself is not modified. A word that occurs several
        times in ``tokenized_list`` is added only once.
    """
    # A set gives O(1) membership tests and also filters repeated input words.
    known_words = set(Dictionary['Words'])
    new_words = []
    for word in tokenized_list:
        if word not in known_words:
            known_words.add(word)
            new_words.append(word)
    if not new_words:
        # Nothing to add: skip concatenating an empty frame, but still
        # hand back a fresh, renumbered frame like the non-empty path does.
        return Dictionary.reset_index(drop=True)
    new_rows = pd.DataFrame(
        {'Words': new_words, 'Searched': [False] * len(new_words)}
    )
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([Dictionary, new_rows], ignore_index=True)

In [7]:
def next_search(Dictionary):
    """Pick the first word of the Dictionary that has not been searched yet.

    Returns a ``(word, index)`` tuple: the value of ``Words`` and the index
    label of the first row whose ``Searched`` flag equals 0.
    """
    pending = Dictionary[Dictionary['Searched'] == 0]
    first_pending = pending.iloc[0]
    return first_pending.Words, first_pending.name

In [8]:
def validate_search(Dictionary, index):
    """Mark the word stored at ``index`` as searched.

    The ``Searched`` cell of that row is set to ``True`` in place on
    ``Dictionary``; nothing is returned.
    """
    searched_cell = (index, 'Searched')
    Dictionary.at[searched_cell] = True

In [9]:
def unsearched_words(Dictionary):
    """Count the words of the Dictionary that still have to be searched.

    These are the rows whose ``Searched`` flag equals 0.
    """
    pending = Dictionary[Dictionary['Searched'] == 0]
    column_counts = pending.count()
    return column_counts.Searched

In [10]:
def searched_words(Dictionary):
    """Count the words of the Dictionary that were already searched.

    These are the rows whose ``Searched`` flag equals 1.
    """
    finished = Dictionary[Dictionary['Searched'] == 1]
    column_counts = finished.count()
    return column_counts.Searched

# Script

##### Starting Kick

In [11]:
# Seed the crawl: create the Dictionary from one starting word, scrape that
# word's page and queue every word found in its definitions as unsearched.
read_websites = 0 # number of pages scraped so far
starting_word ='adicionado'
Dictionary = create_Dictionary(starting_word)
raw_text = scrape_dict(starting_word)
read_websites += 1
tokenized_text = tokenization(raw_text)
Dictionary = append_Dictionary(tokenized_text, Dictionary)

##### Loop ("perfect" version)

In [None]:
# Crawl until every word in the Dictionary has been looked up.
while unsearched_words(Dictionary) != 0:
    # Monotonic clock: interval measurements must not be affected by
    # system clock adjustments.
    actual_time = time.monotonic()
    searched_word, sw_index = next_search(Dictionary)
    raw_text = scrape_dict(searched_word)
    tokenized_text = tokenization(raw_text)
    Dictionary = append_Dictionary(tokenized_text, Dictionary)
    validate_search(Dictionary, sw_index)
    read_websites += 1
    # Throttle to at most one page per second. The elapsed time is read
    # once so the value that is checked is the value that is slept on;
    # reading the clock twice could pass a negative number to time.sleep().
    elapsed = time.monotonic() - actual_time
    if elapsed < 1:
        time.sleep(1 - elapsed)
    if read_websites % 50 == 0:
        print('Read 50 more pages')

Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more pages
Read 50 more

##### Loop ("satisfied" version)

In [None]:
# Crawl until `threshold` pages have been read in total, or until no
# unsearched word is left (next_search would fail on an exhausted Dictionary).
threshold = 100
while read_websites < threshold and unsearched_words(Dictionary) != 0:
    # Monotonic clock: interval measurements must not be affected by
    # system clock adjustments.
    actual_time = time.monotonic()
    searched_word, sw_index = next_search(Dictionary)
    raw_text = scrape_dict(searched_word)
    tokenized_text = tokenization(raw_text)
    Dictionary = append_Dictionary(tokenized_text, Dictionary)
    validate_search(Dictionary, sw_index)
    read_websites += 1
    # Throttle to at most one page per second. The elapsed time is read
    # once so the value that is checked is the value that is slept on;
    # reading the clock twice could pass a negative number to time.sleep().
    elapsed = time.monotonic() - actual_time
    if elapsed < 1:
        time.sleep(1 - elapsed)
    # Total duration of this iteration, including the throttling sleep.
    print(time.monotonic() - actual_time)

In [40]:
# Sorted copy of the Dictionary, ordered by the 'Words' column.
Ordered_Dictionary =Dictionary.sort_values('Words')

In [52]:
# Export only the 'Words' column (no index, no 'Searched' flag) to a CSV file.
Ordered_Dictionary.to_csv('Dicionario.csv',columns= ["Words"], index = False)

In [37]:
# Fraction of the Dictionary's words that have not been searched yet.
unsearched_words(Dictionary)/(unsearched_words(Dictionary)+searched_words(Dictionary))

0.0