In [1]:
# Add the parent directory to the import path before importing mesinesp2
# (presumably that is where the local mesinesp2 package lives -- verify).
import sys
sys.path.append("..")
import mesinesp2.tokenizer

In [2]:
# Read the DeCS descriptions without tokenizing names or definitions.
descriptions = mesinesp2.tokenizer.get_descriptions(
    "../data/raw/DeCS2020.obo",
    tokenize_definition=False,
    tokenize_name=False,
)
# Keep only the entries whose key starts with "D". Each document is the entry
# name, followed by its definition (separated by one space) when one exists.
decs = {
    code: (entry["name"] + " " + entry["def"]) if "def" in entry else entry["name"]
    for code, entry in descriptions.items()
    if code.startswith("D")
}

In [None]:
# Run `%pip install fasttext` first if fasttext is not installed.

In [4]:
import nltk
# Fetch the NLTK stop-word corpus; when it is already present NLTK only
# reports that the package is up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Matias\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
import numpy as np
import pandas as pd
import re
import gensim # module for computing word embeddings
import numpy as np # linear algebra module (NOTE(review): duplicates the numpy import above)
import sklearn.feature_extraction # package to perform tf-idf vectorization
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from gensim.models.keyedvectors import KeyedVectors
import fasttext.util
import time
# Spanish stop words from NLTK; get_idf() drops these from the tokenized documents.
stop_words = stopwords.words('spanish')

ModuleNotFoundError: No module named 'fasttext'

In [None]:
def get_idf(decs):
    """Tokenize every document in `decs` and fit a tf-idf model on the result.

    Parameters
    ----------
    decs : dict
        Maps each code to its document text (a string).

    Returns
    -------
    idf : dict
        Maps every term in the fitted tf-idf vocabulary to its idf weight.
    tokenized_decs : dict
        Maps each code to its list of tokens, with the entries of the
        module-level `stop_words` removed.
    """
    # A set gives constant-time membership checks; `stop_words` itself is a list.
    stop_set = set(stop_words)

    tokenized_decs = {}
    for code, document in decs.items():
        tokens = mesinesp2.tokenizer.tokenizer(
            document, split_sentences=False, is_df=False, normalize=True
        )
        tokenized_decs[code] = [word for word in tokens if word not in stop_set]

    # The vectorizer expects one string per document, so re-join the tokens.
    sentences = [' '.join(tokens) for tokens in tokenized_decs.values()]

    tfidfvectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
    tfidfvectorizer.fit(sentences)

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name on older releases.
    if hasattr(tfidfvectorizer, "get_feature_names_out"):
        terms = tfidfvectorizer.get_feature_names_out()
    else:
        terms = tfidfvectorizer.get_feature_names()

    idf = {term: weight for term, weight in zip(terms, tfidfvectorizer.idf_)}
    return idf, tokenized_decs

In [None]:
def to_vector(text, model, idf, dim=300):
    """Return the idf-weighted, L2-normalized embedding of a sentence.

    Parameters
    ----------
    text : str
        Sentence to embed; it is split on whitespace into words.
    model : mapping
        Word-embedding model supporting `word in model` and `model[word]`,
        where `model[word]` is a vector of length `dim`.
    idf : mapping
        Maps words to their idf weight.
    dim : int, optional
        Dimensionality of the embeddings (default 300, the previously
        hard-coded value).

    Returns
    -------
    numpy.ndarray
        Sum of `model[word] * idf[word]` over the words present in both
        `model` and `idf`, divided by its Euclidean norm. If no word
        contributes (norm is 0) the zero vector is returned unchanged.
    """
    vec = np.zeros(dim)
    for word in text.split():
        # `and` short-circuits, so the embedding model is only queried for
        # words that also have an idf weight.
        if word in idf and word in model:
            vec += model[word] * idf[word]
    norm = np.linalg.norm(vec)
    # Guard against division by zero when nothing contributed.
    return vec / norm if norm > 0 else vec

In [None]:
def get_pretrained_embeddings(sentences, model, idf, tokenized_decs):
    """Embed every sentence and key the result by the codes of `tokenized_decs`.

    Parameters
    ----------
    sentences : sequence of str
        One sentence per code, in the same order as the keys of
        `tokenized_decs`.
    model : mapping
        Word-embedding model passed through to `to_vector`.
    idf : mapping
        Word -> idf weight, passed through to `to_vector`.
    tokenized_decs : dict
        Only its keys (and their order) are used.

    Returns
    -------
    dict
        Maps the i-th key of `tokenized_decs` to the i-th sentence vector,
        converted to a plain list so it can be serialized to JSON.

    Raises
    ------
    IndexError
        If `sentences` has fewer items than `tokenized_decs` has keys.
    """
    vectorized_sent = [to_vector(text, model, idf) for text in sentences]
    # Enumerate the keys once; rebuilding list(tokenized_decs.keys()) for every
    # element made the original comprehension quadratic in the number of codes.
    return {
        code: vectorized_sent[i].tolist()
        for i, code in enumerate(tokenized_decs)
    }

In [None]:
def get_contextualized_embeddings(decs, embeddings):
    """Embed every document in `decs` with a document-level embedding model.

    Parameters
    ----------
    decs : dict
        Maps each code to its document text.
    embeddings : object
        Embedding model exposing `embed(sentence)`, which stores the result
        on the sentence's `embedding` attribute (presumably a flair document
        embedding -- verify against the caller).

    Returns
    -------
    dict
        Maps each code to the document embedding as a numpy array.
    """
    contextualized_embeddings = {}
    for cnt, (code, document) in enumerate(decs.items(), start=1):
        # Report progress every 100 documents.
        if cnt % 100 == 0:
            print(f'{cnt} codes transformed..')
        sentence = Sentence(document)
        embeddings.embed(sentence)
        # .cpu() is required before .numpy() when the tensor lives on a GPU;
        # it is a no-op for tensors that are already on the CPU.
        contextualized_embeddings[code] = sentence.embedding.detach().cpu().numpy()
    return contextualized_embeddings

Pre-trained embeddings

In [None]:
idf, tokenized_decs = get_idf(decs)
# One space-joined string per code, in the same order as tokenized_decs.
# get_pretrained_embeddings() pairs sentences with codes by position.
# (`sentences` was previously undefined in this cell.)
sentences = [' '.join(tokens) for tokens in tokenized_decs.values()]

# NOTE(review): machine-specific absolute paths -- point these at the local
# copies of the embedding files before running.
SBW_VECTORS_PATH = r'C:\Users\carol\Desktop\Practica 2\SBW-vectors-300-min5.txt'
MIX_MODEL_PATH = r'C:\Users\carol\Desktop\Practica 2\mix_fasttext.bin'

# SBW
start = time.time()
sbw = KeyedVectors.load_word2vec_format(SBW_VECTORS_PATH, limit = 100000)
sbw_embeddings = get_pretrained_embeddings(sentences, sbw, idf, tokenized_decs)
with open('decs_sbw.json', 'w') as fp:
    json.dump(sbw_embeddings, fp)
print(f'{time.time()-start} seconds to get sbw embeddings.')


# Mix
start = time.time()
mix = fasttext.load_model(MIX_MODEL_PATH)
# Use the mix model here; the original passed `sbw` again by mistake.
mix_embeddings = get_pretrained_embeddings(sentences, mix, idf, tokenized_decs)
# Write to a separate file so the SBW results above are not overwritten.
with open('decs_mix.json', 'w') as fp:
    json.dump(mix_embeddings, fp)
print(f'{time.time()-start} seconds to get mix embeddings.')

Contextualized embeddings

In [None]:
# Run `%pip install flair` first if flair is not installed.

In [None]:
from flair.embeddings import DocumentPoolEmbeddings, StackedEmbeddings, FlairEmbeddings, TransformerDocumentEmbeddings
from flair.embeddings import SentenceTransformerDocumentEmbeddings
from flair.data import Sentence

In [None]:
# BERT embeddings
start = time.time()
bert = TransformerDocumentEmbeddings("dccuchile/bert-base-spanish-wwm-uncased") # Reference: https://github.com/UKPLab/sentence-transformers
# The helper defined in this notebook is get_contextualized_embeddings();
# `get_embeddings` does not exist.
bert_embeddings = get_contextualized_embeddings(decs, bert) # 768 dimensions
with open('bert.json', 'w') as fp:
    # numpy arrays are not JSON serializable, so dump plain lists instead.
    json.dump({code: np.asarray(vec).tolist() for code, vec in bert_embeddings.items()}, fp)
print(f'{time.time()-start} seconds to get bert embeddings.')

# Flair embeddings
start = time.time()
stacked_embeddings = StackedEmbeddings(embeddings = [FlairEmbeddings('es-forward'), FlairEmbeddings('es-backward')])
flair = DocumentPoolEmbeddings([stacked_embeddings])
flair_embeddings = get_contextualized_embeddings(decs, flair) # 4096 dimensions
with open('flair.json', 'w') as fp:
    # Same conversion as above: arrays -> lists before serializing.
    json.dump({code: np.asarray(vec).tolist() for code, vec in flair_embeddings.items()}, fp)
print(f'{time.time()-start} seconds to get flair embeddings.')