### Word2Vec Minimal PoC

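This notebook shows that we can train an embedding for an artificial word (the original word with `_` appended, substituted for every occurrence of the original in the corpus) and that the resulting vector ends up close to the original word's embedding.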

In [36]:
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
import numpy as np

# Fetch the "text8" corpus via the gensim downloader and fit the baseline
# Word2Vec model on it (library-default hyperparameters)
dataset = api.load("text8")
model = Word2Vec(sentences=dataset)

In [37]:
# The word whose embedding we will try to reproduce with an artificial twin
word_of_interest = "space"

# Baseline: the ten nearest neighbours of word_of_interest before any update
model.wv.most_similar(word_of_interest, topn=10)

[('spacecraft', 0.6211189031600952),
 ('shuttle', 0.6114946603775024),
 ('spaces', 0.6003172993659973),
 ('probe', 0.5929622650146484),
 ('orbit', 0.5790212750434875),
 ('probes', 0.5662890076637268),
 ('manned', 0.565723180770874),
 ('satellites', 0.5644378066062927),
 ('planet', 0.5641493201255798),
 ('plane', 0.5581268072128296)]

In [38]:
# Rebuild the corpus with every occurrence of word_of_interest swapped for an
# artificial entity (the same word with a trailing underscore)
update = [
    [word_of_interest + "_" if token == word_of_interest else token for token in part]
    for part in dataset
]

In [39]:
# Grow the existing vocabulary with whatever new words the modified corpus
# introduces (update=True extends the vocab instead of rebuilding it)
model.build_vocab(
    update,
    update=True,
)

In [40]:
# Optional: lock existing words against change.
# The lock array has one entry per vocabulary row: 0 freezes that row during
# training, 1 leaves it trainable. Only the artificial word is left unlocked.
# Its row is looked up by key rather than assumed to be the last one, so the
# cell stays correct when re-run or when the vocab update appended nothing;
# a KeyError here means the artificial word never made it into the vocabulary.
new_word_index = model.wv.key_to_index[word_of_interest + "_"]
v = np.zeros(len(model.wv.vectors), np.float32)
v[new_word_index] = 1
model.wv.vectors_lockf = v

In [41]:
# Resume training on the modified corpus, reusing the model's own
# corpus_count and epochs settings.
# Note: gensim emits no warning about alpha for this call.
model.train(
    update,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(62528163, 85026035)

In [42]:
# After the update: list the ten nearest neighbours again to check whether the
# artificial word sits very close to the original (cosine similarity) - or does it?
model.wv.most_similar(word_of_interest, topn=10)

[('space_', 0.9595503211021423),
 ('spacecraft', 0.6211189031600952),
 ('shuttle', 0.6114946603775024),
 ('spaces', 0.6003172993659973),
 ('probe', 0.5929622650146484),
 ('orbit', 0.5790212750434875),
 ('probes', 0.5662890076637268),
 ('manned', 0.565723180770874),
 ('satellites', 0.5644378066062927),
 ('planet', 0.5641493201255798)]

In [26]:
import pickle

# Snapshot the model ONCE, before any trial runs. Every trial below restores a
# fresh copy from this snapshot, so trials are independent of one another.
# (Copying `model` inside the loop would copy the model already modified by the
# previous trial, silently accumulating vocabulary and training across words.)
# NOTE: the snapshot is whatever `model` holds when this cell is executed; run
# it right after the baseline training cell for a clean starting point.
base_model_bytes = pickle.dumps(model)

results = []

for word_of_interest in ["vector", "dinosaur", "king", "computer", "oxygen", "time", "caesar", "merkel", "washington", "space"]:
    print(word_of_interest)
    artificial_word = word_of_interest + "_"

    # Start this trial from the pristine snapshot
    model = pickle.loads(base_model_bytes)

    # Create an updated set of words with all instances of word_of_interest replaced by an artificial entity
    update = [
        [artificial_word if w == word_of_interest else w for w in part]
        for part in dataset
    ]

    # Extend vocab with the new word
    model.build_vocab(update, update=True)

    # Optional: lock existing words against change.
    # Only the artificial word's row is left trainable; it is located by key
    # instead of assuming it is the last row, which would unlock the wrong
    # vector whenever the vocab update did not append exactly this word.
    v = np.zeros(len(model.wv.vectors), np.float32)
    v[model.wv.key_to_index[artificial_word]] = 1
    model.wv.vectors_lockf = v

    # Continue training with the update
    # Notice: No warning about alpha given
    model.train(update, total_examples=model.corpus_count, epochs=model.epochs)

    # Record the single nearest neighbour of the original word and its cosine
    # similarity. This is usually the artificial word, but that is not
    # guaranteed: another word may rank higher.
    results.append(model.wv.most_similar(word_of_interest)[0])

vector
dinosaur
king
computer
oxygen
time
caesar
merkel
washington
space


In [27]:
# One (nearest neighbour, cosine similarity) pair per word tested in the loop
results

[('vector_', 0.9348177909851074),
 ('dinosaur_', 0.8981873393058777),
 ('king_', 0.980374276638031),
 ('computer_', 0.9713629484176636),
 ('oxygen_', 0.8975173234939575),
 ('time_', 0.9628214836120605),
 ('caesar_', 0.8691752552986145),
 ('angela', 0.8305437564849854),
 ('washington_', 0.9386913776397705),
 ('space_', 0.9476979970932007)]