## GloVe Embeddings in PyTorch
#### Generating Analogies

In [1]:
import torch
import torchtext
import torchtext.vocab

In [2]:
# this GloVe model ('6B') has been trained on a corpus of 6 billion tokens
# every word is represented by a 100-dimensional vector and the vocabulary contains 400K words
glove = torchtext.vocab.GloVe(name='6B', dim=100)
# shape is (vocabulary size, embedding dimension)
glove.vectors.shape

torch.Size([400000, 100])

In [3]:
# every word in the vocabulary is associated with a unique numeric index
# itos maps index -> word, so slicing it lists the words stored at the first 20 indices
print(glove.itos[:20])

['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s", 'for', '-', 'that', 'on', 'is', 'was', 'said', 'with', 'he', 'as']


In [4]:
# finding the integer index of a word via stoi (word -> index, the inverse of itos)
# all the words in glove are in lowercase
print(glove.stoi['the'], glove.stoi['for'], glove.stoi['london'])

0 10 516


In [5]:
# returns the 100-dim vector for a given word
def get_vector(word):
    """Look up the GloVe embedding stored for ``word``.

    Fails with an AssertionError naming the word when it is not in the
    GloVe vocabulary.
    """
    vocab_index = glove.stoi

    # guard first, so an unknown word gets a readable message
    assert word in vocab_index, f'*{word}* is not present in the vocab'

    # the embedding matrix is addressed by row number, not by the string itself
    row = vocab_index[word]
    return glove.vectors[row]

In [6]:
# embedding vector stored for 'london'
get_vector('london')

tensor([ 6.0553e-01, -5.0886e-02, -1.5461e-01, -1.2327e-01,  6.6270e-01,
        -2.8506e-01, -6.8844e-01,  4.9135e-01, -6.8924e-01,  3.8926e-01,
         1.4359e-01, -4.8802e-01,  1.5746e-01,  8.3178e-01, -2.7923e-01,
         9.4755e-03, -1.1207e-01, -5.2099e-01, -3.7159e-01, -3.7951e-01,
         5.0083e-01, -3.4160e-01,  4.8098e-01, -1.1453e+00,  4.5958e-01,
        -6.5640e-01,  4.3018e-01, -4.2527e-01,  2.3089e-01,  7.8911e-01,
        -7.5434e-01,  1.0830e-01, -1.8071e-01, -5.5543e-04, -4.1071e-01,
         8.6157e-01,  5.3711e-02,  2.4208e-01, -2.6254e-01, -3.0915e-01,
        -2.9787e-01, -5.0758e-01, -2.9940e-01, -3.0442e-01,  7.3099e-01,
         1.4165e-01,  1.0339e-01, -2.9659e-01,  9.9400e-01, -4.1594e-01,
         3.8918e-01,  9.3532e-02,  1.0815e+00,  7.1774e-01, -1.1604e+00,
        -3.0277e+00, -9.2490e-01, -8.8455e-02,  6.1408e-01, -2.5770e-01,
        -2.6942e-01,  4.4647e-01, -8.3637e-01,  7.2481e-02,  3.0968e-02,
        -2.5574e-01, -2.4832e-01,  4.5399e-01,  6.7

In [7]:
# 'new delhi' is not present in the vocab, so get_vector raises an AssertionError
get_vector('new delhi')

AssertionError: *new delhi* is not present in the vocab

In [8]:
# to find similar words, first vectorize the input word and then scan through the vocab for the nearest word vectors
# the function below returns the n closest words (6 by default) to an input word vector

def closest_words(vector, n = 6):
    """Return the ``n`` vocabulary words nearest to ``vector``.

    Distances are Euclidean (L2). They are computed against every row of
    ``glove.vectors`` in a single broadcasted operation instead of one
    ``torch.dist`` call per vocabulary word, which avoids a Python-level
    loop over the whole vocabulary.

    Args:
        vector: embedding to compare against the vocabulary; it must
            broadcast against ``glove.vectors``.
        n: number of neighbours to return (default 6). It is applied as a
            plain ``[:n]`` slice on the ranked vocabulary.

    Returns:
        A list of ``(word, distance)`` tuples in ascending order of
        distance, where ``distance`` is a 0-dim tensor.
    """
    # row-wise Euclidean distance between the query and every embedding
    differences = vector - glove.vectors
    distances = differences.pow(2).sum(dim=-1).sqrt()

    # rank on plain Python floats: sorted() is stable, so words at exactly
    # the same distance keep their vocabulary order
    as_floats = distances.tolist()
    ranked = sorted(range(len(as_floats)), key=as_floats.__getitem__)[:n]

    return [(glove.itos[i], distances[i]) for i in ranked]

In [9]:
# closest words to 'london' (the query word itself comes first, at distance 0)
closest_words(get_vector('london'))

[('london', tensor(0.)),
 ('sydney', tensor(4.2347)),
 ('paris', tensor(4.6192)),
 ('melbourne', tensor(4.6299)),
 ('dublin', tensor(4.6677)),
 ('edinburgh', tensor(4.8436))]

In [10]:
# closest words to 'king'
closest_words(get_vector('king'))

[('king', tensor(0.)),
 ('prince', tensor(4.0922)),
 ('queen', tensor(4.2813)),
 ('monarch', tensor(4.4742)),
 ('brother', tensor(4.5367)),
 ('uncle', tensor(4.6690))]

In [11]:
# generating analogies using glove - king : queen :: man : ?
# via the equation -> queen - king + man
def analogy(w1, w2, w3, n=6):
    """Complete the analogy ``w1 : w2 :: w3 : ?`` with vector arithmetic.

    The query vector is ``w2 - w1 + w3``. Its nearest neighbours are
    fetched with ``closest_words`` and returned with the three input
    words filtered out, at most ``n`` of them.
    """
    print('\n[%s : %s :: %s : ?]' %(w1, w2, w3))

    query_words = (w1, w2, w3)
    target = get_vector(w2) - get_vector(w1) + get_vector(w3)

    # ask for three extra neighbours, since up to three of them may be the input words themselves
    candidates = closest_words(target, n+3)

    answers = []
    for candidate in candidates:
        if candidate[0] in query_words:
            continue
        answers.append(candidate)

    return answers[:n]

In [12]:
# king is to queen as man is to ?
analogy('king','queen','man')


[king : queen :: man : ?]


[('woman', tensor(4.0811)),
 ('girl', tensor(4.6916)),
 ('she', tensor(5.2703)),
 ('teenager', tensor(5.2788)),
 ('boy', tensor(5.3084)),
 ('mother', tensor(5.3352))]

In [13]:
# sun is to hot as cloud is to ?
analogy('sun', 'hot', 'cloud')


[sun : hot :: cloud : ?]


[('sticky', tensor(6.6092)),
 ('chill', tensor(6.6885)),
 ('filling', tensor(6.7411)),
 ('roiling', tensor(6.8482)),
 ('ash', tensor(6.8523)),
 ('bubbling', tensor(6.8533))]

In [14]:
# london is to england as paris is to ?
analogy('london', 'england', 'paris')


[london : england :: paris : ?]


[('france', tensor(4.1426)),
 ('lyon', tensor(5.0363)),
 ('italy', tensor(5.0874)),
 ('holland', tensor(5.0943)),
 ('spain', tensor(5.1148)),
 ('belgium', tensor(5.1447))]