In [None]:
from bs4 import BeautifulSoup as bs4
import nltk
import numpy as np
from nltk.corpus import stopwords
import pandas as pd
from tqdm.auto import tqdm
import spacy
import json
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#import unicodedata
#def remove_accented_chars(text):
#    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

In [None]:
# Read the whole HTML corpus document into memory as one string.
with open("./corpus/e990519_mod.htm", mode="r", encoding="utf-8") as corpus_file:
    original_data = corpus_file.read()

In [None]:
# Parse the raw HTML with the "html.parser" backend.
# Note: `bs4` is this notebook's import alias for the BeautifulSoup class.
html_soup = bs4(original_data, features="html.parser")

In [None]:
# Strip the markup, keeping only the document text, and lowercase it.
html_only_text = html_soup.get_text().lower()

In [None]:
# Split the lowercased text into tokens with NLTK's WordPunctTokenizer.
tokenized_text = nltk.WordPunctTokenizer().tokenize(html_only_text)

In [None]:
# Number of tokens in the document, before any filtering.
len(tokenized_text)

In [None]:
# Initial vocabulary: the distinct tokens, in sorted order.
vocab = sorted(set(tokenized_text))

In [None]:
# Report the vocabulary size before non-alphabetic tokens are removed.
print("Vocab before Alphabetical filter: {}".format(len(vocab)))

In [None]:
# Keep only purely alphabetic entries (drops punctuation and numbers),
# lowercasing each survivor.
vocab = [
    token.lower()
    for token in vocab
    if token.isalpha()
]

In [None]:
# Report the vocabulary size once non-alphabetic tokens are gone.
print("Vocab after Alphabetical filter: {}".format(len(vocab)))

In [None]:
# Spanish stop-word list from the NLTK stopwords corpus.
stword = stopwords.words("spanish")

In [None]:
# Drop every vocabulary entry that is a Spanish stop word.
vocab = [
    word
    for word in vocab
    if word not in stword
]

In [None]:
# Report the vocabulary size after stop-word removal.
print("Vocab without stopwords: {}".format(len(vocab)))

In [None]:
# Preview the first 100 vocabulary entries.
vocab[:100]


In [None]:
## Lemmatization

In [None]:
# Load the spaCy pipeline "es_core_news_lg"; the model package must already be installed.
sp_nlp = spacy.load("es_core_news_lg")

In [None]:
# Lemmatize every vocabulary entry in isolation and gather the distinct lemmas.
# Only the first token of each spaCy doc is used, and only the first
# space-separated piece of its lemma is kept.
lemmatized_vocab = {
    sp_nlp(entry)[0].lemma_.split(" ")[0]
    for entry in vocab
}
    

In [None]:
# Number of distinct lemmas obtained from the vocabulary.
len(lemmatized_vocab)

In [None]:
# The vocabulary is now the sorted list of distinct lemmas.
vocab = sorted(lemmatized_vocab)

In [None]:
# Report the vocabulary size after lemmatization (duplicates already collapsed by the set).
print("Vocab size after Lemmatization [no duplicates]: {}".format(len(vocab)))

In [None]:
# Second stop-word pass: lemmatization can map a word onto a stop word,
# so filter the lemmatized vocabulary again.
vocab = [
    lemma
    for lemma in vocab
    if lemma not in stword
]

In [None]:
# Report the final vocabulary size.
print("Vocab size final: {}".format(len(vocab)))

In [None]:
# Preview the first 100 entries of the final vocabulary.
vocab[:100]

In [None]:
# Persist the final vocabulary, one entry per line.
with open("vocab.txt", "w", encoding="utf8") as vocab_file:
    for entry in vocab:
        vocab_file.write("{}\n".format(entry))

In [None]:
# Preview only the first 100 tokens; displaying the whole list would flood
# the notebook output with every token of the document.
tokenized_text[:100]

In [None]:
# Clean the running text in a single pass: keep alphabetic tokens only,
# discard Spanish stop words, and store the lowercase form.
clean_text = [
    token.lower()
    for token in tokenized_text
    if token.isalpha() and token.lower() not in stword
]

In [None]:
# Lemmatize the cleaned text token by token, preserving order.
# Each token is lemmatized in isolation, so its lemma depends only on the
# token string; cache it per distinct token instead of re-running the spaCy
# pipeline for every occurrence.
lemma_cache = {}
lemmatized_text = []
for item in clean_text:
    if item not in lemma_cache:
        sp_doc = sp_nlp(item)
        # Keep only the first token's lemma, and only its first
        # space-separated piece.
        lemma_cache[item] = sp_doc[0].lemma_.split(" ")[0]
    lemmatized_text.append(lemma_cache[item])

In [None]:
# Remove lemmas that turned out to be Spanish stop words.
lemmatized_text = [
    lemma
    for lemma in lemmatized_text
    if lemma not in stword
]

In [None]:
# Build the vector-space model: for every vocabulary word, count how often
# each other vocabulary word appears inside its context windows.
# A context window is the word plus up to `window_size` tokens on each side.
# Result: vsm_data[word] -> list of len(vocab) integer counts, aligned with `vocab`.
vsm_data = {}
# dtype=str keeps the array a string array even when the text is empty.
temp_words_array = np.array(lemmatized_text, dtype=str)
window_size = 4

# Word -> column lookup built once; replaces a linear vocab.index() scan per
# co-occurrence. setdefault keeps the first position, like list.index().
vocab_index = {}
for position, vocab_word in enumerate(vocab):
    vocab_index.setdefault(vocab_word, position)

for word in tqdm(vocab):
    indices = np.where(temp_words_array == word)[0]
    word_vocab_coincidences = [0] * len(vocab)
    # A set is used on purpose: textually identical windows are counted once.
    word_contexts = set()
    for idx in indices:
        # Clamp the left edge at 0. A negative start would be interpreted as
        # "from the end of the array" and silently yield an empty or wrong
        # window for occurrences within the first `window_size` tokens.
        window_start = max(0, idx - window_size)
        context = " ".join(temp_words_array[window_start:idx + window_size + 1])
        word_contexts.add(context)
    for cntx in word_contexts:
        for w in cntx.split():
            if w == word:
                continue
            vocab_idx = vocab_index.get(w)
            if vocab_idx is None:
                # Token outside the vocabulary: it has no column to count in.
                continue
            word_vocab_coincidences[vocab_idx] += 1

    vsm_data[word] = word_vocab_coincidences

In [None]:
# Number of words that received a co-occurrence vector.
len(vsm_data)

In [None]:
# Serialize the word -> co-occurrence-vector mapping as JSON.
with open("vocab_embeddings.json", "w", encoding="utf8") as embeddings_file:
    json.dump(vsm_data, embeddings_file)

In [None]:
# Number of entries in vsm_data (same check as before the JSON export).
len(vsm_data)

In [None]:
# Pairwise cosine similarity between the co-occurrence vectors of the first
# `total_items_to_export` vocabulary words.
# Result: `matrix` is a square list of lists of floats rounded to 5 decimals,
# with matrix[i][j] the similarity between vocab[i] and vocab[j].

# Never export more words than the vocabulary actually holds.
total_items_to_export = min(1000, len(vocab))
export_words = vocab[:total_items_to_export]

# Stack the vectors once as a float array instead of converting Python lists
# to arrays on every pair.
vectors = np.zeros((total_items_to_export, len(vocab)), dtype=float)
for row, export_word in enumerate(export_words):
    vectors[row] = vsm_data[export_word]

norms = np.linalg.norm(vectors, axis=1)
dot_products = np.dot(vectors, vectors.T)
norm_products = np.outer(norms, norms)

# A word with an all-zero vector has norm 0. Define its similarity as 0.0
# instead of dividing by zero and exporting NaN.
similarities = np.divide(
    dot_products,
    norm_products,
    out=np.zeros_like(dot_products),
    where=norm_products != 0,
)
matrix = np.round(similarities, 5).tolist()
        

In [None]:
# Export the similarity matrix as a CSV table: the header row lists the words
# (first cell empty), and each following row is a word plus its similarities.
# Clamp the export size to what is really available, so a vocabulary smaller
# than total_items_to_export neither raises IndexError nor produces rows
# wider than the header.
csv_item_count = min(total_items_to_export, len(vocab), len(matrix))
csv_words = vocab[:csv_item_count]
with open("simil_table.csv", "w", encoding="utf8") as f:
    f.write(",".join([""] + csv_words))
    f.write("\n")
    for row_word, row_values in zip(csv_words, matrix):
        f.write(",".join([row_word] + [str(x) for x in row_values[:csv_item_count]]))
        f.write("\n")
    
    