<a href="https://colab.research.google.com/github/rajlm10/D2L-Torch/blob/main/D2L_Word_Vectors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install d2l -q

[K     |████████████████████████████████| 82 kB 554 kB/s 
[K     |████████████████████████████████| 11.2 MB 9.6 MB/s 
[K     |████████████████████████████████| 15.7 MB 27.7 MB/s 
[K     |████████████████████████████████| 61 kB 6.6 MB/s 
[K     |████████████████████████████████| 9.9 MB 22.4 MB/s 
[K     |████████████████████████████████| 930 kB 58.7 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests~=2.23.0, but you have requests 2.25.1 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
[?25h

In [1]:
import os
import torch
from torch import nn
from d2l import torch as d2l

In [2]:
# Register the pretrained embedding archives with d2l's data hub. Each entry
# maps an embedding name to (archive URL, 40-character hex digest); the digest
# is presumably the SHA-1 that d2l checks after download -- confirm against d2l.
d2l.DATA_HUB.update({
    'glove.6b.50d': (d2l.DATA_URL + 'glove.6B.50d.zip',
                     '0b8703943ccdb6eb788e6f091b8946e82231bc4d'),
    'glove.6b.100d': (d2l.DATA_URL + 'glove.6B.100d.zip',
                      'cd43bfb07e44e6f27cbcc7bc9ae3d80284fdaf5a'),
    'glove.42b.300d': (d2l.DATA_URL + 'glove.42B.300d.zip',
                       'b5116e234e9eb9076672cfeabf5469f3eec904fa'),
    'wiki.en': (d2l.DATA_URL + 'wiki.en.zip',
                'c1816da3821ae9f43899be655002f6c723e91b88'),
})

In [3]:
class TokenEmbedding:
  """Pretrained token embedding read from a `vec.txt` file.

  Index 0 is reserved for the unknown token '<unk>', whose vector is all
  zeros; every out-of-vocabulary lookup resolves to that row.

  Attributes:
    idx_to_token: list of tokens; a token's position is its index.
    idx_to_vec: tensor with one embedding row per entry of `idx_to_token`.
    unknown_idx: index used for tokens missing from the vocabulary (0).
    token_to_idx: dict mapping each token to its index.
  """
  def __init__(self, embedding_name):
    """Load the embedding that `d2l.download_extract(embedding_name)` provides."""
    self.idx_to_token, self.idx_to_vec = self._load_embedding(embedding_name)
    self.unknown_idx = 0
    self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

  def _load_embedding(self, embedding_name):
    """Parse `vec.txt` from the extracted archive.

    Each line is expected to be a token followed by its space-separated
    vector components.

    Returns:
      A pair `(idx_to_token, idx_to_vec)`: the token list (with '<unk>' at
      index 0) and a tensor holding one vector per token.
    """
    idx_to_token, idx_to_vec = ['<unk>'], []
    data_dir = d2l.download_extract(embedding_name)
    # GloVe website: https://nlp.stanford.edu/projects/glove/
    # fastText website: https://fasttext.cc/
    # Decode explicitly as UTF-8: the vocabulary may contain non-ASCII tokens,
    # and the platform's default encoding is not guaranteed to be UTF-8.
    with open(os.path.join(data_dir, 'vec.txt'), 'r', encoding='utf-8') as f:
      for line in f:
        elems = line.rstrip().split(' ')
        token, elems = elems[0], [float(elem) for elem in elems[1:]]
        # Skip header information, such as the top row in fastText
        # (a line with at most one number after the first field).
        if len(elems) > 1:
          idx_to_token.append(token)
          idx_to_vec.append(elems)
    # Prepend the all-zero vector that belongs to the '<unk>' token.
    idx_to_vec = [[0.0] * len(idx_to_vec[0])] + idx_to_vec
    return idx_to_token, torch.tensor(idx_to_vec)

  def __getitem__(self, tokens):
    """Return the stacked vectors for an iterable of tokens.

    Args:
      tokens: iterable of token strings (pass a list even for one token;
        a bare string would be iterated character by character).

    Returns:
      Tensor with one row per requested token; unknown tokens get the
      '<unk>' row. An empty iterable yields a tensor with zero rows.
    """
    indices = [self.token_to_idx.get(token, self.unknown_idx) for token in tokens]
    # Force an integer dtype: with no tokens, torch.tensor([]) would default
    # to float, which cannot be used as an index tensor.
    vecs = self.idx_to_vec[torch.tensor(indices, dtype=torch.long)]
    return vecs

  def __len__(self):
    """Number of entries in the vocabulary, including '<unk>'."""
    return len(self.idx_to_token)

In [4]:
# Load the GloVe embedding registered under 'glove.6b.50d'. The reported
# length includes the extra '<unk>' entry that TokenEmbedding puts at index 0.
glove_6b50d = TokenEmbedding('glove.6b.50d')
len(glove_6b50d)

400001

In [5]:
# Round-trip check: token -> index and index -> token should agree.
glove_6b50d.token_to_idx['beautiful'], glove_6b50d.idx_to_token[3367]

(3367, 'beautiful')

In [8]:
# Look up one token's vector; indexing takes a list of tokens, hence [[...]].
glove_6b50d[['pretty']]

tensor([[-0.2492, -0.3984, -0.4585, -0.3485,  0.7494, -0.3157, -0.4866,  0.0888,
         -0.5066,  0.4843, -0.6611,  0.0834, -0.6331,  0.3835,  0.5399,  0.1442,
          0.5899,  0.2353, -0.0325, -0.9441, -0.9784,  0.7925,  0.3346,  0.0793,
          1.0367, -1.1998, -1.1811,  1.3858,  1.4019, -0.5025,  2.9963, -0.0218,
          0.7850,  0.0100,  0.1198, -0.0169,  0.0850,  0.7879, -0.1398, -1.1586,
         -0.4945, -0.0492, -0.0585,  0.4244,  0.2616, -0.0854,  0.1407, -0.1651,
          0.4529,  1.3669]])

In [11]:
# One row per vocabulary entry (including '<unk>'), one column per dimension.
glove_6b50d.idx_to_vec.shape

torch.Size([400001, 50])

# Word Similarity

In [9]:
def knn(W, x, k):
  """Find the `k` rows of `W` most similar to `x` by cosine similarity.

  Args:
    W: 2-D tensor with one candidate vector per row.
    x: query vector; it is flattened, so shapes (d,) and (1, d) both work.
    k: number of neighbours to return.

  Returns:
    A pair `(topk, vals)`: the indices of the `k` best rows and their
    cosine similarities, in the order produced by `torch.topk`.
  """
  eps = 1e-9  # keeps both norms strictly positive
  query = x.reshape(-1)
  row_norms = torch.sqrt(torch.sum(W * W, dim=1) + eps)
  # The query norm needs the same guard as the rows: an all-zero query
  # (e.g. the '<unk>' vector) would otherwise give 0/0 = NaN everywhere.
  query_norm = torch.sqrt(torch.sum(query * query) + eps)
  cos = torch.mv(W, query) / (row_norms * query_norm)
  vals, topk = torch.topk(cos, k=k)
  return topk, vals

In [12]:
def get_similar_tokens(query_token, k, embed):
  """Print the `k` tokens closest to `query_token` by cosine similarity.

  Args:
    query_token: token whose neighbours are wanted.
    k: number of neighbours to print.
    embed: embedding object exposing `idx_to_vec`, `idx_to_token`,
      `token_to_idx`, `unknown_idx` and list-based indexing.

  Returns:
    None; one line per neighbour is printed.
  """
  query_idx = embed.token_to_idx.get(query_token, embed.unknown_idx)
  # Ask for one extra hit because the query itself is normally among them.
  topk, cos = knn(embed.idx_to_vec, embed[[query_token]], k + 1)
  # Exclude the input word by identity rather than by position: with tied
  # similarities the query is not guaranteed to be the first hit.
  neighbours = [(int(i), float(c)) for i, c in zip(topk, cos) if int(i) != query_idx]
  for idx, sim in neighbours[:k]:
    print(f'cosine sim={sim:.3f}: {embed.idx_to_token[idx]}')

In [13]:
# Print the 3 nearest neighbours of 'messi' by cosine similarity.
get_similar_tokens('messi', 3, glove_6b50d)

cosine sim=0.937: ronaldinho
cosine sim=0.902: rivaldo
cosine sim=0.899: ronaldo


# Word Analogy

Besides finding similar words, we can also apply word vectors to word analogy tasks. For example, “man” : “woman” :: “son” : “daughter” is a word analogy: “man” is to “woman” as “son” is to “daughter”. Specifically, the word analogy completion task can be defined as: for a word analogy a : b :: c : d, given the first three words a, b and c, find d. Denote the vector of word w by vec(w). To complete the analogy, we find the word whose vector is most similar (by cosine similarity) to vec(c) + vec(b) − vec(a).

In [22]:
def get_analogy(token_a, token_b, token_c, embed):
  """Complete the analogy `token_a : token_b :: token_c : ?`.

  The answer is the vocabulary token whose vector has the highest cosine
  similarity to vec(c) + (vec(b) - vec(a)). The input tokens themselves are
  not excluded from the candidates.

  Returns:
    The best-matching token as a string.
  """
  vec_a, vec_b, vec_c = embed[[token_a, token_b, token_c]]
  target = vec_c + (vec_b - vec_a)
  best_idx, _ = knn(embed.idx_to_vec, target, 1)
  return embed.idx_to_token[int(best_idx[0])]

In [30]:
# Complete the analogy china : beijing :: india : ?
get_analogy('china', 'beijing', 'india', glove_6b50d)

'delhi'