# Embeddings

In this notebook I train different embeddings (FastText and Word2Vec) on domain-specific corpora. The corpora are preprocessed in the notebook "Data Collection".
In addition, I run a few sanity checks on the objects involved.

In [1]:
import pickle
import gensim
from gensim.models import FastText, Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

In [2]:
# Pretrained German FastText vectors published by Facebook.
# Source: https://fasttext.cc/docs/en/crawl-vectors.html
#
# The text-format ``.vec`` file is read as plain word2vec-format vectors.
# Alternative loader for the binary ``.bin`` file, kept for reference:
# model = FastText.load_fasttext_format('../WordEmbeddings/FastText/cc.de.300.bin')
model = KeyedVectors.load_word2vec_format(
    '../WordEmbeddings/FastText/cc.de.300.vec',
    binary=False,  # default value, spelled out because the file is text-format
)

In [None]:
# Sanity checks on the pretrained vectors.

# Nearest neighbours of a single query word.
# Recorded output = [('headteacher', 0.8075869083404541), ('schoolteacher', 0.7955552339553833), ('teachers', 0.733420729637146), ('teaches', 0.6839243173599243), ('meacher', 0.6825737357139587), ('teach', 0.6285147070884705), ('taught', 0.6244685649871826), ('teaching', 0.6199781894683838), ('schoolmaster', 0.6037642955780029), ('lessons', 0.5812176465988159)]
# NOTE(review): the recorded neighbours are English although the loaded file is
# cc.de.300 -- confirm which model produced this output.
teacher_neighbours = model.most_similar('teacher')
print(teacher_neighbours)

# Similarity score between 'Lehrer' and 'Lehrerin'.
# Recorded output = 0.683924396754
lehrer_similarity = model.similarity('Lehrer', 'Lehrerin')
print(lehrer_similarity)

# Analogy-style query: "Auto" + "Fahrer" - "Fahrrad".
# Alternative query kept for reference:
# print(model.most_similar(positive=['frau', 'könig'], negative=['mann']))
analogy_neighbours = model.most_similar(
    positive=["Auto", "Fahrer"],
    negative=["Fahrrad"],
)
print(analogy_neighbours)

In [None]:
# Shape of the embedding matrix as (number of words, vector dimension).
# `model` is a KeyedVectors instance (loaded via load_word2vec_format), which
# exposes `vectors` directly; the `.wv` indirection belongs to full models such
# as Word2Vec/FastText and is not needed here.
model.vectors.shape

# Training on domain data

In [48]:
# Import and merge the preprocessed corpora.

def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code while
    deserialising, so only use this on files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


CRQ_data = load_pickle("../Data/CRQ_preprocessed.pkl")
CRQ_test_data = load_pickle("../Data/CRQ_test_preprocessed.pkl")
CRQ_train_data = load_pickle("../Data/CRQ_train_preprocessed.pkl")
patents_data = load_pickle("../Data/Patents_preprocessed.pkl")
books_data = load_pickle("../Data/Books_preprocessed.pkl")

# Concatenate every corpus into one training set for the embedding models.
# NOTE(review): CRQ_data is merged together with CRQ_test_data and
# CRQ_train_data -- confirm the three files do not overlap, otherwise the
# shared documents are counted more than once during training.
training_data = CRQ_data + CRQ_test_data + CRQ_train_data + patents_data + books_data

In [49]:
# Report the number of entries in each corpus, followed by the merged total.
for corpus in (
    CRQ_data,
    CRQ_test_data,
    CRQ_train_data,
    patents_data,
    books_data,
    training_data,
):
    print(len(corpus))

17552
966
8663
560
60
27801


In [50]:
# Spot check: show one entry of the books corpus.
sample_book = books_data[13]
print(sample_book)



In [27]:
# Fit a FastText model on the merged local corpus.
model_ft = FastText(
    sentences=training_data,
    size=300,      # embedding dimension
    window=5,      # context window size
    min_count=5,   # ignore words with fewer occurrences
    workers=4,     # training threads
)

In [29]:
# Persist the trained FastText model to disk.
ft_model_path = "../WordEmbeddings/ft_ds.model"
model_ft.save(ft_model_path)

In [32]:
# Sanity check of the FastText model: vocabulary size and one word-pair similarity.
ft_vectors = model_ft.wv
print(len(ft_vectors.vocab))

similarity = ft_vectors.similarity('motor', 'starter')
print(similarity)

91716
0.29978982


In [33]:
# Fit a Word2Vec model on the merged local corpus.
model_w2v = Word2Vec(
    sentences=training_data,
    size=300,      # embedding dimension
    window=5,      # context window size
    min_count=5,   # ignore words with fewer occurrences
    workers=4,     # training threads
)

In [35]:
# Persist the trained Word2Vec model to disk.
w2v_model_path = "../WordEmbeddings/w2v_ds.model"
model_w2v.save(w2v_model_path)

In [38]:
# Sanity check of the Word2Vec model: vocabulary size and one word-pair similarity.
w2v_vectors = model_w2v.wv
print(len(w2v_vectors.vocab))

similarity = w2v_vectors.similarity('motor', 'starter')
print(similarity)

91716
0.62028044
