In [None]:
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the pickled corpus from disk and report how many sentences it contains.
# NOTE(review): pickle.load can execute arbitrary code — only open files from a trusted source.
# Assumes the pickle holds a list of tokenised sentences (each a list of words) — TODO confirm.
with open('wiki_articles_preprocessed.pkl', mode='rb') as corpus_file:
    sentences = pickle.load(corpus_file)
print(f'nr of sentences: {len(sentences)}')

In [None]:
# Count how often each word occurs across all sentences of the corpus.
# Counter is a dict subclass, so it can be used wherever a plain {word: count}
# dict is expected (.get(), .items(), len(), ...).
word_frequency = Counter(word for sent in sentences for word in sent)
# Vocabulary size = number of distinct words seen.
nr_unique_words = len(word_frequency)

In [None]:
# Rank the (word, count) pairs from most to least frequent. The sort is stable,
# so words with equal counts keep the iteration order of word_frequency.
frequency_sorted = sorted(word_frequency.items(), key=lambda pair: -pair[1])

In [None]:
# Bar chart of the counts of the most frequent words, most frequent first.
frequencies_only = [count for _, count in frequency_sorted]
n_words_to_plot = 200
# Slice before plotting so the x positions always match the number of bars;
# a hard-coded range(n_words_to_plot) fails with a shape mismatch when the
# vocabulary holds fewer than n_words_to_plot distinct words.
top_frequencies = frequencies_only[:n_words_to_plot]
fig, ax = plt.subplots(figsize=(15, 8))
ax.bar(range(len(top_frequencies)), top_frequencies)
# Label the figure so it can be understood without reading the code.
ax.set(
    title=f'Frequencies of the {len(top_frequencies)} most common words',
    xlabel='word rank (0 = most frequent)',
    ylabel='number of occurrences',
);

In [None]:
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): `path` is not passed to save() below — the model is written under
# the literal name "word2vec.model", i.e. relative to the current working
# directory. Confirm which location is intended.
path = get_tmpfile("word2vec.model")

# Train word embeddings on the tokenised corpus.
# NOTE(review): `size` and `iter` are the gensim 3.x parameter names; gensim 4
# renamed them to `vector_size` and `epochs`.
model = Word2Vec(
    sentences=sentences,
    size=200,     # dimensionality of each word vector
    window=5,     # maximum distance between target and context word
    min_count=1,  # keep every word, even those that occur only once
    workers=4,    # number of training threads
    iter=10,      # number of passes over the corpus
)
model.save("word2vec.model")

In [None]:
from scipy import spatial
# Cosine distance (1 - cosine similarity) between the embeddings of two words;
# smaller values mean the vectors point in more similar directions.
vec1, vec2 = model.wv['dütsch'], model.wv['französisch']
spatial.distance.cosine(vec1, vec2)

In [None]:
# Build lookup tables between words and integer positions, plus a list of
# embedding vectors in which position i holds the vector of idx2word[i].
vocabulary = list(model.wv.vocab)
word2idx = {word: idx for idx, word in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))
word_vectors = [model.wv[word] for word in vocabulary]

In [None]:
def get_most_similar_words(word: str, n: int = 5) -> list:
    """Return the `n` vocabulary words whose embeddings are closest to `word`.

    Closeness is the cosine similarity between the embedding of `word` and
    every vector in the module-level `word_vectors` list.

    Args:
        word: Query word; it is looked up in `model.wv`, so it must be part of
            the trained vocabulary.
        n: Maximum number of neighbours to return. Values <= 0 yield an empty
            list; values larger than the vocabulary return every word.

    Returns:
        A list of `(word, similarity)` tuples ordered from most to least
        similar. The query word is itself part of `word_vectors`, so it is
        included in the ranking (normally first, with similarity ~1.0).

    Raises:
        Whatever `model.wv[word]` raises for a word outside the vocabulary.
    """
    # Look the word up first so an unknown word fails regardless of `n`.
    query_vector = model.wv[word]
    if n <= 0:
        # A slice of [-0:] would select the *entire* vocabulary, and a negative
        # n would return an arbitrary tail of the ranking instead of limiting
        # the result, so handle both explicitly.
        return []
    scores = cosine_similarity([query_vector], word_vectors)[0]
    # argsort is ascending: keep the last n positions and flip to best-first.
    top_indices = scores.argsort()[-n:][::-1]
    return [(idx2word[idx], scores[idx]) for idx in top_indices]

In [None]:
# Show the ten nearest neighbours of an example word.
get_most_similar_words(word='acht', n=10)

In [None]:
# good examples of nouns: stadt, frankriich
# good examples of verbs: TODO
# bad examples: TODO