Exploring GloVe word embeddings (an alternative to word2vec), adapted from PK Mital's pycadl: https://github.com/pkmital/pycadl/blob/master/cadl/glove.py and from Thrones2Vec by Yuriy Guts

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from cadl import utils
import zipfile
from scipy.spatial import distance, distance_matrix
from sklearn.decomposition import PCA
import sklearn.manifold
import os


In [None]:
# Download the GloVe archive; whatever utils.download returns is handed
# straight to zipfile.ZipFile below.
file = utils.download('http://nlp.stanford.edu/data/wordvecs/glove.6B.zip')

# Collect the words and their vectors.  Every line of the text file holds a
# word followed by the whitespace-separated components of its vector.
words = []
vectors = []
# The context managers close both the archive and the member stream as soon
# as parsing is finished instead of leaving the handles open.
with zipfile.ZipFile(file) as zf, zf.open("glove.6B.300d.txt") as fp:
    for line in fp:
        tokens = line.strip().split()
        if not tokens:
            # A blank line has no word to index; skip it rather than crash.
            continue
        words.append(tokens[0].decode())
        vectors.append([np.double(value) for value in tokens[1:]])

# Store as a lookup table: row i of wordvecs is the vector of words[i],
# and word2id maps a word back to that row index.
wordvecs = np.asarray(vectors, dtype=np.double)
word2id = {word: i for i, word in enumerate(words)}


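On the next iteration we can use our own trained model instead (try this later).
TODO: word2vec-style text files begin with a header line holding the vocabulary size and the vector dimension. Remove or skip that line, otherwise the rows have unequal lengths and building the array fails with a sequence error.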

In [None]:
# Collect the words and their vectors from our own trained model.  Every data
# line holds a word followed by the components of its vector.
words = []
vectors = []
# The context managers close both the archive and the member stream as soon
# as parsing is finished instead of leaving the handles open.
with zipfile.ZipFile('trained/thrones2vec.zip') as zf, \
        zf.open("thrones2vec.txt") as fp:
    for line_no, line in enumerate(fp):
        tokens = line.strip().split()
        if not tokens:
            # A blank line has no word to index; skip it rather than crash.
            continue
        if line_no == 0 and len(tokens) == 2:
            # word2vec-style text exports start with "<vocab_size> <dim>".
            # Keeping that line would add a row with a single component and
            # make np.asarray fail on the ragged list.  (A genuine
            # 1-dimensional embedding would look the same, but is not a
            # realistic case here.)
            continue
        words.append(tokens[0].decode())
        vectors.append([np.double(value) for value in tokens[1:]])

# Store as a lookup table: row i of wordvecs is the vector of words[i],
# and word2id maps a word back to that row index.
wordvecs = np.asarray(vectors, dtype=np.double)
word2id = {word: i for i, word in enumerate(words)}

In [None]:
# Vocabulary size: the number of words collected from the embedding file
len(words)

In [None]:
# Pick a query word, then show its row index in the lookup table followed by
# the embedding vector stored in that row (one value per output line group).
word = 'zoo'
print(word2id[word], wordvecs[word2id[word]], sep='\n')

In [None]:
# Cosine distance from the query word to every vector in the vocabulary
target_vec = wordvecs[word2id[word]]
dists = [distance.cosine(target_vec, vec_i) for vec_i in wordvecs]

k = 20

# Vocabulary indices ordered from nearest to farthest; keep the first k
idxs = np.argsort(dists)
nearest = idxs[:k]

# Print top nearest words together with their distance
for idx_i in nearest:
    print(words[idx_i], dists[idx_i])

# Bar chart of the distances of the top nearest words
labels = [words[idx_i] for idx_i in nearest]
tick_positions = range(len(labels))
plt.figure()
plt.bar(range(k), [dists[idx_i] for idx_i in nearest])
ax = plt.gca()
ax.set_xticks(tick_positions)
ax.set_xticklabels(labels, rotation='vertical')
plt.xlabel('label')
plt.ylabel('distances')

# Pairwise distance matrix between the top nearest words, shown as a heatmap
# with the words as tick labels on both axes
vecs = [wordvecs[idx_i] for idx_i in nearest]
dm = distance_matrix(vecs, vecs)
plt.figure()
plt.imshow(dm)
ax = plt.gca()
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels, rotation='vertical')
ax.set_yticklabels(labels)
plt.colorbar()
plt.show()

In [None]:
# Project the rows of the distance matrix onto their first two principal
# components.  Each column is first divided by its mean.
normalized_dm = dm / np.mean(dm, axis=0, keepdims=True)
res = PCA(2).fit_transform(normalized_dm)
pc1, pc2 = res[:, 0], res[:, 1]

# Scatter the projected points and write each word next to its point
plt.figure()
plt.scatter(pc1, pc2)
for i, text in enumerate(labels):
    plt.text(pc1[i], pc2[i], text)
plt.show()

In [None]:
# After the PCA cell above: one row per plotted word, two columns (PCA(2))
res.shape

Now let's use a pandas DataFrame to plot many more words and show the relationships between their embeddings

In [None]:
# Create distance matrix to plot many word embeddings - will TAKE A BIT
# (the matrix has many_vec x many_vec entries)

many_vec = 10000


def pca_many():
    """Project the first `many_vec` word vectors to 2-D with PCA.

    The vectors are taken in vocabulary order, so row ``i`` of the result
    belongs to ``words[i]``.  Code that looks coordinates up through
    ``word2id`` relies on exactly that ordering; selecting rows through the
    distance-sorted ``idxs`` of an earlier query would pair every word with
    the coordinates of a different word.

    Returns
    -------
    numpy.ndarray
        Array with one row per word and two columns (the first two
        principal components of the column-normalised distance matrix).
    """
    vecs = wordvecs[:many_vec]
    dm = distance_matrix(vecs, vecs)
    # Divide every column by its mean before the projection
    return PCA(2).fit_transform(dm / np.mean(dm, axis=0, keepdims=True))


res = pca_many()

In [None]:
# Build a table with one row per word: its label and its 2-D coordinates.
# NOTE(review): the lookup res[word2id[label]] assumes that row i of `res`
# is the projection of the word whose id is i - confirm that `res` was
# computed in vocabulary order.
word_arr = np.asarray(words[:many_vec])

points = pd.DataFrame(
    [(label, *res[word2id[label]][:2]) for label in word_arr],
    columns=["label", "x", "y"],
)

In [None]:
# Inspect rows 200 through 214 of the points table
points[200:215]

In [None]:
%pylab inline

In [None]:
# Scatter plot of the x/y coordinates of every row in `points`
points.plot.scatter("x", "y", s=10, figsize=(20, 12))

Zoom in on different regions and visualize word relationships

In [None]:
def plot_region(x_bounds, y_bounds):
    """Scatter-plot and label the words that fall inside a rectangle.

    Reads the module-level ``points`` table (columns ``label``, ``x``, ``y``).

    Parameters
    ----------
    x_bounds : sequence
        ``x_bounds[0]`` and ``x_bounds[1]`` are the inclusive lower and upper
        limits on ``x``.
    y_bounds : sequence
        ``y_bounds[0]`` and ``y_bounds[1]`` are the inclusive lower and upper
        limits on ``y``.
    """
    x_lo, x_hi = x_bounds[0], x_bounds[1]
    y_lo, y_hi = y_bounds[0], y_bounds[1]

    # Boolean masks for each axis; a row is kept only when both hold
    inside_x = (x_lo <= points.x) & (points.x <= x_hi)
    inside_y = (y_lo <= points.y) & (points.y <= y_hi)
    region = points[inside_x & inside_y]

    ax = region.plot.scatter("x", "y", s=35, figsize=(10, 8))
    # Nudge each label slightly up and to the right of its marker
    for x, y, label in zip(region.x, region.y, region.label):
        ax.text(x + 0.005, y + 0.005, label, fontsize=11)

In [None]:
# Try some other regions - what can we infer?
# This call labels every point with 1 <= x <= 2 and -2 <= y <= -1.5.
plot_region(x_bounds=(1., 2), y_bounds=(-2, -1.5))

However, our model contains some 400,000 words, and analyzing them all together would take a long time. Let's continue by analyzing one word at a time, together with only its k nearest neighbors

In [None]:
def plot_nearest_words(word, k=20):
    """Plot the `k` words closest to `word` in the embedding space.

    Draws a single figure with two panels: on the left a heatmap of the
    pairwise distance matrix between the `k` nearest words, on the right
    those words placed at the first two principal components of the
    column-normalised distance matrix.

    Reads the module-level ``wordvecs``, ``word2id`` and ``words``.

    Parameters
    ----------
    word : str
        Query word; must be a key of ``word2id``.
    k : int, optional
        How many nearest words (by cosine distance) to show.

    Raises
    ------
    KeyError
        If `word` is not a key of ``word2id``.
    """
    # Cosine distance from the query to every vector, in one vectorised call
    # rather than a Python-level loop over the whole vocabulary
    target_vec = wordvecs[word2id[word]]
    dists = distance.cdist(target_vec[np.newaxis, :], wordvecs, 'cosine')[0]

    nearest = np.argsort(dists)[:k]
    labels = [words[idx_i] for idx_i in nearest]
    vecs = wordvecs[nearest]
    dm = distance_matrix(vecs, vecs)

    # Only one figure is created; no stray extra heatmap before the subplots
    _fig, axs = plt.subplots(1, 2, figsize=(10, 4))

    # Left panel: heatmap of the distance matrix, words as tick labels
    ticks = range(len(labels))
    axs[0].imshow(dm)
    axs[0].set_xticks(ticks)
    axs[0].set_yticks(ticks)
    axs[0].set_xticklabels(labels, rotation='vertical')
    axs[0].set_yticklabels(labels)

    # Scale every column by its mean (a normalisation, not a mean-centring)
    dm = dm / np.mean(dm, axis=0, keepdims=True)

    # Right panel: data points in reduced dimensionality using principal
    # components of the distance matrix
    res = PCA(2).fit_transform(dm)
    pc1, pc2 = res[:, 0], res[:, 1]
    axs[1].scatter(pc1, pc2)
    for x, y, label in zip(pc1, pc2, labels):
        axs[1].text(x, y, label)
    plt.show()



In [None]:
# Nearest neighbours of the token '2000' (default k=20)
plot_nearest_words('2000')

In [None]:
# Nearest neighbours of the word 'age' (default k=20)
plot_nearest_words('age')

In [None]:
# Let's create a function which will return us the nearest words rather than
# plot them:
def get_nearest_words(target_vec, k=20):
    """Return the `k` vocabulary words closest to an arbitrary vector.

    Closeness is measured with the cosine distance against every row of the
    module-level ``wordvecs``; the matching entries of ``words`` are returned.

    Parameters
    ----------
    target_vec : array_like
        One-dimensional query vector with as many components as the rows of
        ``wordvecs``.
    k : int, optional
        Maximum number of results.

    Returns
    -------
    list of tuple
        ``(word, distance)`` pairs ordered from nearest to farthest.

    Raises
    ------
    ValueError
        If the number of components of `target_vec` does not match the
        embedding dimension.
    """
    # cdist wants a 2-D "batch" of queries; a single row is enough here.
    query = np.asarray(target_vec, dtype=np.double).reshape(1, -1)
    # One vectorised call instead of a Python loop over the whole vocabulary
    dists = distance.cdist(query, wordvecs, 'cosine')[0]
    # Get top nearest words
    return [(words[idx_i], dists[idx_i]) for idx_i in np.argsort(dists)[:k]]

# And a convenience function for returning a vector
def get_vector(word):
    """Look up the embedding vector stored for `word`.

    Parameters
    ----------
    word : str
        Word to look up; must be a key of the module-level ``word2id``.

    Returns
    -------
    numpy.ndarray
        The row of ``wordvecs`` at the index ``word2id`` gives for `word`.

    Raises
    ------
    KeyError
        If `word` is not a key of ``word2id``.
    """
    row = word2id[word]
    return wordvecs[row]

In [None]:
# Word embeddings can sometimes show relationships with vector arithmetic.
# Here the query vector is 'disease' - 'death' + 'hospital'; the call returns
# the 20 (default k) nearest words to it as (word, cosine distance) pairs.
get_nearest_words(get_vector('disease') - get_vector('death') + get_vector('hospital'))