### Word2Vec Minimal PoC

This notebook shows that we can train an embedding for an artificial word (a renamed copy of an existing word) that ends up very similar to the embedding of the original word.

In [1]:
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
import numpy as np

# Train baseline word embeddings on the "text8" corpus (no hyperparameters overridden)
dataset = api.load("text8")
model = Word2Vec(sentences=dataset)

In [10]:
# Existing vocabulary word that the artificial copy will be derived from
word_of_interest = "vector"

# Baseline: the ten nearest neighbours of word_of_interest before any update
model.wv.most_similar(word_of_interest, topn=10)

[('topological', 0.8520455956459045),
 ('scalar', 0.8470284938812256),
 ('matrix', 0.8333741426467896),
 ('inverse', 0.8174808025360107),
 ('orthogonal', 0.8171056509017944),
 ('euclidean', 0.8150019645690918),
 ('linear', 0.8076074123382568),
 ('topology', 0.8004781007766724),
 ('associative', 0.8003930449485779),
 ('banach', 0.7997741103172302)]

In [11]:
# Copy the corpus, swapping every occurrence of word_of_interest for an
# artificial token (the same word with a trailing underscore)
update = [
    [word_of_interest + "_" if token == word_of_interest else token for token in sentence]
    for sentence in dataset
]

In [12]:
# Grow the existing vocabulary with the words that first appear in the modified corpus
model.build_vocab(corpus_iterable=update, update=True)

In [13]:
# Optional: lock existing words against change.
# One lock factor per vector: 0 freezes a vector during training, 1 lets it train
# normally, so only the artificial word is left trainable.
v = np.zeros(len(model.wv.vectors), np.float32)
# Look the new word up by key instead of assuming it is the last row; a missing key
# raises KeyError rather than silently unlocking an unrelated word.
v[model.wv.key_to_index[word_of_interest + "_"]] = 1
model.wv.vectors_lockf = v

In [14]:
# Run further training passes over the modified corpus, reusing the corpus size
# and epoch count already stored on the model
# Notice: No warning about alpha given
model.train(corpus_iterable=update, total_examples=model.corpus_count, epochs=model.epochs)

(62530208, 85026035)

In [18]:
# Demonstrate that the artificial word is now the nearest neighbour of the original
# (most_similar reports cosine similarity, so a higher score means closer)
model.wv.most_similar(word_of_interest, topn=10)

[('vector_', 0.9323139786720276),
 ('topological', 0.8520455956459045),
 ('scalar', 0.8470284938812256),
 ('matrix', 0.8333741426467896),
 ('inverse', 0.8174808025360107),
 ('orthogonal', 0.8171056509017944),
 ('euclidean', 0.8150019645690918),
 ('linear', 0.8076074123382568),
 ('topology', 0.8004781007766724),
 ('associative', 0.8003930449485779)]