# Embeddings

In this notebook, two different embeddings, FastText and Word2Vec, are trained on domain-specific corpora. Furthermore, a few sanity checks are performed on the objects involved.

In [None]:
import pickle
import gensim
from gensim.models import FastText, Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

## Loading a pretrained model

In [None]:
# Load Facebook's pretrained FastText vectors (cc.de.300) with gensim.
# Source: https://fasttext.cc/docs/en/crawl-vectors.html
#
# Either the .vec or the .bin file can be used. Alternative for the .bin file:
# model = FastText.load_fasttext_format('../WordEmbeddings/FastText/cc.de.300.bin')

pretrained_vec_path = '../WordEmbeddings/FastText/cc.de.300.vec'
model = KeyedVectors.load_word2vec_format(pretrained_vec_path)

In [None]:
# Sanity checks on the pretrained embeddings.
#
# `model` is a KeyedVectors instance when it was loaded with
# KeyedVectors.load_word2vec_format, but a full FastText model (vectors under
# `.wv`) when it was loaded from the .bin file. Resolve the vector store once
# so the checks below work with either loader and do not rely on the
# deprecated `KeyedVectors.wv` alias (removed in gensim 4.0).
pretrained_kv = model if isinstance(model, KeyedVectors) else model.wv

# Shape of the embedding matrix; expected to be (vocabulary size, dimension).
print(pretrained_kv.vectors.shape)

print(pretrained_kv.similarity('Lehrer', 'Lehrerin'))
# Output = 0.683924396754

# Analogy query. Alternative example:
# print(pretrained_kv.most_similar(positive=['frau', 'könig'], negative=['mann']))
print(pretrained_kv.most_similar(positive=["Auto", "Fahrer"], negative=["Fahrrad"]))

## Training Word2Vec and FastText on domain-specific data corpora

In [None]:
# Load the preprocessed corpora from disk and merge them into one training set.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.

def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


CRQ_data = _read_pickle("../Data/CRQ_preprocessed.pkl")
CRQ_test_data = _read_pickle("../Data/CRQ_test_preprocessed.pkl")
CRQ_train_data = _read_pickle("../Data/CRQ_train_preprocessed.pkl")
patents_data = _read_pickle("../Data/Patents_preprocessed.pkl")
books_data = _read_pickle("../Data/Books_preprocessed.pkl")

# Concatenate all corpora, in the same order they were loaded.
training_data = CRQ_data + CRQ_test_data + CRQ_train_data + patents_data + books_data

In [None]:
# Sanity checks: print the number of entries in each corpus, followed by the
# size of the merged training set (last line of output).
for corpus in (
    CRQ_data,
    CRQ_test_data,
    CRQ_train_data,
    patents_data,
    books_data,
    training_data,
):
    print(len(corpus))

In [None]:
# Train a FastText model on the merged local corpus.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 names it
# `vector_size` -- confirm against the installed gensim version.
model_ft = FastText(
    sentences=training_data,
    size=300,
    window=5,
    min_count=5,
    workers=4,
)

In [None]:
# Persist the trained FastText model to disk.
ft_model_path = "../WordEmbeddings/ft_ds.model"
model_ft.save(ft_model_path)

In [None]:
# Sanity checks on the FastText model: vocabulary size and the similarity
# score of two terms.
ft_vectors = model_ft.wv
print(len(ft_vectors.vocab))
similarity = ft_vectors.similarity('motor', 'starter')
print(similarity)

In [None]:
# Train a Word2Vec model on the merged local corpus, using the same
# hyperparameters as the FastText model.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 names it
# `vector_size` -- confirm against the installed gensim version.
model_w2v = Word2Vec(
    sentences=training_data,
    size=300,
    window=5,
    min_count=5,
    workers=4,
)

In [None]:
# Persist the trained Word2Vec model to disk.
w2v_model_path = "../WordEmbeddings/w2v_ds.model"
model_w2v.save(w2v_model_path)

In [None]:
# Sanity checks on the Word2Vec model: vocabulary size and the similarity
# score of two terms.
w2v_vectors = model_w2v.wv
print(len(w2v_vectors.vocab))
similarity = w2v_vectors.similarity('motor', 'starter')
print(similarity)