In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Gensim word vector visualisation of various word vectors

In [2]:
import numpy as np
#import matplotlib

# Get the interactive tools for matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")

from sklearn.decomposition import PCA  

import gensim.downloader as api
from gensim.models import KeyedVectors

For looking at word vectors, I'll use Gensim. I also use it in hw1 for word vectors. Gensim isn't really a deep learning package. It's a package for word and text similarity modelling, which started with (LDA-style) topic models and grew into SVD and neural word representations. But it's efficient and scalable, and quite widely used.

I'll use GloVe word vectors. Gensim provides a library of several sets of word vectors that you can easily load. You can find out more about GloVe on the GloVe page. 

In [3]:
model = api.load("glove-wiki-gigaword-300")
type(model) 



gensim.models.keyedvectors.Word2VecKeyedVectors

In [5]:
model["bread"]

array([ 0.42445  ,  0.44214  ,  0.37833  , -0.069403 , -0.073338 ,
       -0.081135 , -0.05778  ,  0.16375  , -0.099045 , -0.81689  ,
       -0.43912  , -0.49712  , -0.50495  ,  0.7461   ,  0.3235   ,
       -0.69495  , -0.1999   ,  0.085718 ,  0.10033  ,  0.059362 ,
       -0.043522 ,  0.050225 , -0.22728  , -0.049659 ,  0.52579  ,
       -0.13038  ,  0.11082  , -0.0057026, -0.40167  , -0.3036   ,
       -0.35504  ,  0.56465  , -0.42691  ,  0.095305 , -0.89807  ,
        0.17838  , -0.029647 , -0.12109  , -0.13524  ,  0.40596  ,
        0.15597  , -0.84324  ,  0.51727  , -0.16569  , -0.11147  ,
        0.077322 ,  0.26823  ,  0.057215 , -0.14571  ,  0.23168  ,
        0.91794  ,  0.1811   ,  0.44626  ,  0.84291  , -0.31894  ,
       -0.2061   , -0.30366  ,  0.004545 ,  0.030833 ,  0.17034  ,
        0.31111  ,  0.12178  ,  0.10801  ,  0.079893 , -0.6787   ,
        0.084787 , -0.54224  ,  0.15838  , -0.31336  ,  0.12879  ,
        0.61677  , -0.053287 , -0.38343  , -0.23622  ,  0.0044

In [6]:
model["croissant"]

array([ 6.2413e-01,  2.3175e-02,  8.9691e-02,  1.0652e-01, -2.0159e-01,
        1.2857e-01,  3.6291e-01, -6.9018e-01, -3.3965e-01,  2.0832e-01,
       -1.5173e-01, -1.0514e-01, -4.7687e-01, -1.1231e-01, -4.7100e-01,
       -9.2807e-02,  1.8103e-01, -2.4176e-01, -2.0960e-01,  3.6195e-01,
        6.7968e-02,  4.4313e-01, -3.1275e-01, -7.4449e-03,  3.1996e-04,
       -2.1016e-02,  3.4211e-01,  2.7163e-03,  7.2581e-01, -9.5318e-02,
        1.8115e-01, -4.6783e-01,  5.2100e-03,  4.6143e-02,  4.4550e-01,
        2.5552e-01, -3.4298e-01,  1.1777e-01, -4.3287e-01,  4.5380e-02,
       -2.4952e-03,  5.1222e-01,  4.3946e-01, -1.3227e-01,  2.5923e-01,
        2.8955e-02,  1.1018e+00, -5.4847e-02,  1.3028e-01, -3.6312e-02,
        3.1070e-01,  6.0720e-02,  7.5523e-01,  1.0790e-01,  3.8481e-01,
        4.3716e-02,  6.0888e-02,  4.0668e-01, -1.8425e-01,  1.2845e-02,
       -2.1517e-01, -2.0177e-01, -1.0162e+00, -1.5201e-01, -1.0461e-02,
        2.2242e-01, -8.6899e-02, -2.4085e-01, -3.1132e-01,  1.71

In [16]:
model.most_similar("croissant")

[('croissants', 0.5564220547676086),
 ('brioche', 0.4930887818336487),
 ('muffins', 0.4551553726196289),
 ('focaccia', 0.44012367725372314),
 ('gruyere', 0.4369955360889435),
 ('bagel', 0.43625181913375854),
 ('beignets', 0.4310298562049866),
 ('hazelnut', 0.4305444657802582),
 ('scones', 0.4241448640823364),
 ('muffin', 0.41624417901039124)]

In [8]:
model["usa"]

array([ 0.30712  , -0.2076   ,  0.3119   ,  0.32139  , -0.22534  ,
        0.31222  , -0.62658  ,  0.65077  ,  0.14656  , -0.051405 ,
        0.36335  , -0.032669 ,  0.27812  ,  0.11234  ,  0.39403  ,
        0.34251  , -0.21661  ,  0.34092  , -0.20981  , -0.21636  ,
        0.31925  ,  0.20072  ,  0.05664  , -0.012065 ,  0.18049  ,
       -0.0046816, -0.19974  ,  0.68226  , -0.032802 ,  0.10777  ,
       -0.17278  , -0.21042  ,  0.12185  ,  0.6397   , -1.081    ,
        0.045184 , -0.089771 , -0.20178  , -0.51699  , -0.12299  ,
       -0.33547  , -0.50289  ,  0.028705 ,  0.52416  , -0.37586  ,
       -0.18716  ,  0.13887  , -0.36952  ,  0.68     ,  0.040084 ,
       -0.15813  ,  0.51359  ,  0.2888   ,  0.062583 ,  0.047545 ,
        0.1303   , -0.12693  ,  0.58991  ,  0.67815  , -0.3722   ,
        0.19495  ,  0.70353  , -0.23534  , -0.11962  ,  0.11586  ,
       -0.14723  , -0.042322 ,  0.13139  , -0.30844  , -0.34615  ,
        0.1996   , -0.17157  , -0.29216  ,  0.14701  , -0.7453

In [9]:
model["banana"]

array([ 4.2141e-01,  2.0467e-02,  1.2666e-01,  3.9762e-01, -1.1016e-01,
       -3.5956e-02, -4.7214e-01, -1.3916e-01,  5.6812e-01, -3.4969e-01,
       -9.3232e-02, -1.7035e-01, -3.8677e-01, -1.6811e-01, -1.0157e-01,
       -2.6612e-01,  4.8094e-02, -4.6771e-01, -6.0725e-01,  4.0952e-01,
        3.1771e-01,  5.0098e-01,  6.6368e-01, -1.1827e-01, -7.4267e-01,
       -1.0472e-01, -6.4353e-01, -4.4023e-01, -3.9101e-01,  3.5694e-01,
       -9.3489e-01,  4.8317e-01,  1.5223e-01,  7.9339e-02, -2.5111e-01,
        3.9968e-01, -1.7982e-01, -2.8874e-01, -1.0891e-01,  3.8821e-01,
       -2.3147e-01, -5.0337e-01, -2.5231e-01, -2.2184e-02, -2.7874e-01,
       -2.4193e-01,  5.7466e-02, -5.3955e-01, -3.4875e-02, -4.0482e-01,
       -3.8067e-02, -4.2337e-01,  4.2861e-01,  3.5166e-01, -1.8165e-01,
       -3.1131e-01, -5.3276e-01, -5.0954e-02,  6.6779e-01, -4.0077e-01,
        2.1403e-01, -2.9861e-01, -3.6637e-01,  2.8489e-01, -3.7663e-01,
        5.9604e-02, -3.1795e-01,  2.5463e-01, -2.2185e-01,  2.30

In [10]:
model.most_similar("usa")

[('u.s.a.', 0.5414650440216064),
 ('united', 0.49127647280693054),
 ('states', 0.4846649467945099),
 ('america', 0.4511302709579468),
 ('inc', 0.41656187176704407),
 ('inc.', 0.40956565737724304),
 ('2010', 0.4045969247817993),
 ('x2', 0.39059048891067505),
 ('espn', 0.38786134123802185),
 ('canada', 0.3860737085342407)]

In [11]:
model.most_similar("apple")

[('iphone', 0.5987042784690857),
 ('macintosh', 0.5836330652236938),
 ('ipod', 0.5761124491691589),
 ('microsoft', 0.5663832426071167),
 ('ipad', 0.5628098249435425),
 ('intel', 0.5457563400268555),
 ('ibm', 0.5286195278167725),
 ('google', 0.5282472372055054),
 ('imac', 0.5072519779205322),
 ('software', 0.4962984323501587)]

In [12]:
model.most_similar(negative="banana")

[('keyrates', 0.6847262382507324),
 ('rw97', 0.6595869064331055),
 ('+9.00', 0.6340475678443909),
 ('ryryryryryry', 0.6322759985923767),
 ('zety', 0.5784541368484497),
 ('.0342', 0.5776804089546204),
 ('k586-1', 0.5598777532577515),
 ('cw96', 0.5540916323661804),
 ('mongkolporn', 0.5488854050636292),
 ('purva.patel@chron.com', 0.5483731627464294)]

In [14]:
result = model.most_similar(positive=["woman", "king"], negative="man")
print("{}: {:.4f}".format(*result[0]))

niece: 0.3541


In [18]:
# x1: x2 :: y1 :: returned
def analogy(x1, x2, y1):
    result = model.most_similar(positive=[y1, x2], negative=[x1])
    return result[0][0]

In [19]:
analogy("man", "king", "woman")

'queen'

In [20]:
analogy("king", "man", "queen")

'woman'

In [27]:
analogy("pencil", "sketching", "camera")

'cameras'

In [28]:
analogy("tall", "tallest", "short")

'longest'