<h1 style="text-align: center;">Word Embedding - word2vec</h1>

> In this notebook, we illustrate some basic ideas of word embeddings via the word2vec model.
> **If you hit an error after installing gensim, a temporary workaround that avoids downgrading is to replace the `triu` import inside `gensim/matutils.py` (in your virtual environment) with `from numpy import triu`.**

In [None]:
! pip install gensim

In [None]:
! pip install scikit-learn

In [None]:
import os
import random
import tempfile

import numpy as np
import matplotlib.pyplot as plt

import gensim
from gensim import utils
from gensim.test.utils import datapath
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction

In [None]:
# Assume there's one document per line, tokens separated by whitespace
corpus_path = datapath('lee_background.cor')

# Preview the first 100 characters of the leading documents. The `with` block
# guarantees the file handle is closed even though we break out early.
with open(corpus_path) as corpus_file:
    for ind, line in enumerate(corpus_file):
        print(ind, line[0:100])
        if ind > 10:
            break

### Build corpus

In [None]:
class MyCorpus:
    """Restartable iterable over the documents of 'lee_background.cor'.

    Every call to ``__iter__`` re-opens the corpus file and starts a fresh
    pass, so the same instance can be iterated more than once.

    Yields:
        The result of ``utils.simple_preprocess(line)`` for each line of the
        file (one document per line, tokens separated by whitespace).
    """
    def __iter__(self):
        corpus_path = datapath('lee_background.cor')
        # The context manager closes the file when the generator is exhausted
        # or discarded, instead of leaking the handle on every pass.
        with open(corpus_path) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)

### Training word2vec model

In [None]:
# Corpus object that streams one tokenised document at a time.
sentences = MyCorpus()

# All training hyperparameters in one place so they are easy to scan and tune.
# NOTE(review): per the gensim docs, sg=1 selects skip-gram, and a fixed seed
# is only fully reproducible with a single worker thread - confirm whether
# workers=3 is intended here.
word2vec_params = dict(
    vector_size=100,
    alpha=0.025,
    window=5,
    min_count=5,
    sample=0.001,
    seed=1,
    workers=3,
    min_alpha=0.0001,
    sg=1,
    negative=5,
    ns_exponent=0.75,
    epochs=5,
    sorted_vocab=1,
)
model = gensim.models.Word2Vec(sentences=sentences, **word2vec_params)

In [None]:
# Look up and display the learned vector for a single word.
vec_king = model.wv['king']
print(vec_king)

# Show the first ten vocabulary entries together with the vocabulary size.
vocab_words = model.wv.index_to_key
for index, word in enumerate(vocab_words[:10]):
    print(f"word #{index}/{len(vocab_words)} is {word}")

### Storing and loading models

In [None]:
# Reserve a uniquely named file; delete=False keeps it on disk after the
# handle is closed so that later cells can still read from the path.
tmp = tempfile.NamedTemporaryFile(prefix='gensim-model-', delete=False)
with tmp:
    temporary_filepath = tmp.name
    # Persist the trained model at that path. The file can then be copied to
    # other machines, shared with others, etc.
    model.save(temporary_filepath)
    # Loading a saved model is the mirror operation:
    new_model = gensim.models.Word2Vec.load(temporary_filepath)

In [None]:
# Reload the model from the temporary file written by the save step.
model = gensim.models.Word2Vec.load(temporary_filepath)
more_sentences = [
    ['Advanced', 'users', 'can', 'load', 'a', 'model',
     'and', 'continue', 'training', 'it', 'with', 'more', 'sentences'],
]
# update=True asks gensim to extend the existing vocabulary with the new
# sentences rather than build a new one, then training continues from the
# loaded weights.
model.build_vocab(more_sentences, update=True)
model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)

# Clean up the temporary file now that the model is back in memory.
# NOTE: because the file is removed here, re-running this cell on its own
# fails at the load above; re-run the save step first.
os.remove(temporary_filepath)

### Visualising Word Embeddings using t-SNE

In [None]:
def reduce_dimensions(model):
    """Project a model's word vectors down to two dimensions with t-SNE.

    Args:
        model: object exposing ``wv.vectors`` (the embedding matrix) and
            ``wv.index_to_key`` (the vocabulary words).

    Returns:
        A tuple ``(x_vals, y_vals, labels)``: two lists holding the first and
        second t-SNE coordinate of each row of ``model.wv.vectors``, and a
        numpy array built from ``model.wv.index_to_key``.
    """
    target_dims = 2  # dimensionality of the projection

    # Pull the embedding matrix and the vocabulary out as numpy arrays.
    word_vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # A fixed random_state is passed so the projection is seeded explicitly.
    projector = TSNE(n_components=target_dims, random_state=0)
    embedded = projector.fit_transform(word_vectors)

    # Split the projected matrix into one list per axis.
    x_vals = list(embedded[:, 0])
    y_vals = list(embedded[:, 1])
    return x_vals, y_vals, labels

# Compute the 2-D coordinates and matching word labels for the current model.
x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_matplotlib(x_vals, y_vals, labels, num_labels=25):
    """Scatter-plot the embedded points and annotate a random subset.

    Args:
        x_vals: x coordinate of every point.
        y_vals: y coordinate of every point.
        labels: text label of every point, indexed like the coordinates.
        num_labels: maximum number of points to annotate (default 25). If
            fewer points exist, all of them are annotated.
    """
    # Re-seed the global RNG so the same points are labelled on every run.
    random.seed(0)
    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # Label a randomly subsampled set of data points. The sample size is
    # clamped to the population because random.sample raises ValueError when
    # asked for more items than are available.
    indices = list(range(len(labels)))
    sample_size = max(0, min(num_labels, len(indices)))
    selected_indices = random.sample(indices, sample_size)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))
        
# Draw the scatter plot of the reduced word vectors with a few labelled words.
plot_with_matplotlib(x_vals, y_vals, labels)