In [1]:
# Code Reference : https://classic.d2l.ai/chapter_natural-language-processing/similarity-analogy.html

In [2]:
# Added this part to install the necessary libraries.
# Both packages are pinned to exact versions; -U lets pip replace any
# mxnet-cu101 already present in the environment with the pinned 1.7.0.
!pip install d2l==0.15.1
!pip install -U mxnet-cu101==1.7.0

Collecting d2l==0.15.1
[?25l  Downloading https://files.pythonhosted.org/packages/28/fd/89b6b8fd34b4e2e54fadf5de6e8f63fd96e0c14d2b6c81ba40e9edcd964a/d2l-0.15.1-py3-none-any.whl (61kB)
[K     |█████▍                          | 10kB 22.7MB/s eta 0:00:01[K     |██████████▊                     | 20kB 15.9MB/s eta 0:00:01[K     |████████████████                | 30kB 14.5MB/s eta 0:00:01[K     |█████████████████████▍          | 40kB 14.1MB/s eta 0:00:01[K     |██████████████████████████▊     | 51kB 11.7MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 5.8MB/s 
Installing collected packages: d2l
Successfully installed d2l-0.15.1
Collecting mxnet-cu101==1.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/40/26/9655677b901537f367c3c473376e4106abc72e01a8fc25b1cb6ed9c37e8c/mxnet_cu101-1.7.0-py2.py3-none-manylinux2014_x86_64.whl (846.0MB)
[K     |███████████████████████████████▌| 834.1MB 1.2MB/s eta 0:00:11tcmalloc: large alloc 1147494400 bytes == 0x65ac40

In [3]:
# same as given in ref
from mxnet import nd
from mxnet.contrib import text

# Show which embedding families have pretrained files available
# (the recorded cell output lists 'glove' and 'fasttext').
text.embedding.get_pretrained_file_names().keys()

dict_keys(['glove', 'fasttext'])

In [5]:
# Load two pretrained GloVe 6B embeddings (the 50d and 300d files) so the
# same analogy queries can be run against both.
glove_6b50d, glove_6b300d = [
    text.embedding.create('glove', pretrained_file_name=file_name)
    for file_name in ('glove.6B.50d.txt', 'glove.6B.300d.txt')
]

Downloading /root/.mxnet/embeddings/glove/glove.6B.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/embeddings/glove/glove.6B.zip...


In [8]:
# same as given in ref
# function for finding the K-nearest neighbours
def knn(W, x, k):
    """Find the k rows of W with the highest cosine similarity to x.

    Args:
        W: NDArray whose rows are the candidate vectors (norms are taken
            along axis 1).
        x: query NDArray; it is flattened to 1-D before the dot product.
        k: number of neighbours to return.

    Returns:
        A tuple (indices, similarities): `indices` is an int32 NumPy array
        with the selected row indices, `similarities` is a list with the
        cosine similarity (as a Python scalar) of each selected row.
    """
    flat_query = x.reshape((-1,))
    # The 1e-9 term keeps an all-zero row of W from yielding a zero norm.
    row_norms = (nd.sum(W * W, axis=1) + 1e-9).sqrt()
    query_norm = nd.sum(x * x).sqrt()
    similarity = nd.dot(W, flat_query) / (row_norms * query_norm)

    nearest = nd.topk(similarity, k=k, ret_typ='indices')
    nearest = nearest.asnumpy().astype('int32')
    scores = [similarity[idx].asscalar() for idx in nearest]
    return nearest, scores

In [9]:
# Our own helper for the analogy task: the L2 norm of (b - a + c) - d, i.e. how far the analogy vector lands from the expected answer d
def L2_dis(token_a, token_b, token_c, token_d, embed):
    """Measure how far the analogy vector b - a + c is from the vector of d.

    Args:
        token_a, token_b, token_c: tokens forming the analogy a : b :: c : ?.
        token_d: the expected answer token.
        embed: embedding object providing `get_vecs_by_tokens`.

    Returns:
        NDArray holding the L2 norm of (vec_b - vec_a + vec_c) - vec_d.
    """
    embeddings = embed.get_vecs_by_tokens(
        [token_a, token_b, token_c, token_d])
    vec_a, vec_b, vec_c, vec_d = (embeddings[i] for i in range(4))
    predicted = vec_b - vec_a + vec_c
    return nd.norm(predicted - vec_d)

In [10]:
# modified the function as given in ref to print the 5 nearest neighbours
def get_analogy(token_a, token_b, token_c, embed, k=5):
    """Print the k vocabulary tokens closest to the analogy vector b - a + c.

    Closeness is the cosine similarity computed by `knn` over all rows of
    `embed.idx_to_vec`. The query tokens themselves are not filtered out,
    so they may appear among the printed neighbours.

    Args:
        token_a, token_b, token_c: tokens forming the analogy a : b :: c : ?.
        embed: embedding object providing `get_vecs_by_tokens`,
            `idx_to_vec` and `idx_to_token`.
        k: how many neighbours to print (defaults to 5, the value that was
            previously hard-coded).

    Returns:
        None. The neighbours are printed one per line, best match first.
    """
    vecs = embed.get_vecs_by_tokens([token_a, token_b, token_c])
    analogy_vec = vecs[1] - vecs[0] + vecs[2]
    # Only the indices are needed here; the similarity scores are discarded.
    topk, _ = knn(embed.idx_to_vec, analogy_vec, k)
    # Iterate over whatever knn returned instead of a second hard-coded count.
    for idx in topk:
        print(embed.idx_to_token[idx])
    return

In [18]:
# For each analogy a : b :: c : d, print the 5 nearest neighbours of
# b - a + c and then the L2 norm between that vector and d, using the
# 50d GloVe embedding.
analogies_50d = [
    ('man', 'woman', 'son', 'daughter'),
    ('beijing', 'china', 'tokyo', 'japan'),
    ('bad', 'worst', 'big', 'biggest'),
    ('do', 'did', 'go', 'went'),
    ('prince', 'boy', 'girl', 'princess'),
    ('king', 'male', 'female', 'queen'),
]

for tok_a, tok_b, tok_c, tok_d in analogies_50d:
    # get_analogy prints its neighbours and returns None, so the outer
    # print adds a "None" line after each neighbour list.
    print(get_analogy(tok_a, tok_b, tok_c, glove_6b50d))
    print(L2_dis(tok_a, tok_b, tok_c, tok_d, glove_6b50d))

daughter
mother
wife
son
niece
None

[1.5812494]
<NDArray 1 @cpu(0)>
japan
tokyo
japanese
singapore
shanghai
None

[2.7868757]
<NDArray 1 @cpu(0)>
biggest
worst
big
sweep
nation
None

[3.4864936]
<NDArray 1 @cpu(0)>
went
before
came
when
took
None

[1.7921385]
<NDArray 1 @cpu(0)>
girl
boy
kid
toddler
teen
None

[10.508645]
<NDArray 1 @cpu(0)>
female
male
bisexual
adults
adult
None

[11.540567]
<NDArray 1 @cpu(0)>


In [20]:
# Same analogy queries as for the 50d model, now against the 300d GloVe
# embedding: nearest neighbours of b - a + c, then the L2 norm to d.
analogies_300d = [
    ('man', 'woman', 'son', 'daughter'),
    ('beijing', 'china', 'tokyo', 'japan'),
    ('bad', 'worst', 'big', 'biggest'),
    ('do', 'did', 'go', 'went'),
    ('prince', 'boy', 'girl', 'princess'),
    ('king', 'male', 'female', 'queen'),
]

for tok_a, tok_b, tok_c, tok_d in analogies_300d:
    # get_analogy prints its neighbours and returns None, so the outer
    # print adds a "None" line after each neighbour list.
    print(get_analogy(tok_a, tok_b, tok_c, glove_6b300d))
    print(L2_dis(tok_a, tok_b, tok_c, tok_d, glove_6b300d))

daughter
son
mother
wife
eldest
None

[3.9047852]
<NDArray 1 @cpu(0)>
tokyo
japan
japanese
yen
asia
None

[5.353972]
<NDArray 1 @cpu(0)>
worst
biggest
big
major
ever
None

[5.833743]
<NDArray 1 @cpu(0)>
went
go
did
came
gone
None

[3.7308247]
<NDArray 1 @cpu(0)>
girl
boy
girls
teenager
boys
None

[14.357573]
<NDArray 1 @cpu(0)>
female
male
males
females
women
None

[16.038239]
<NDArray 1 @cpu(0)>
