In [1]:
import torch
import numpy as np
import torch.functional as F
import torch.nn.functional as F
import torch.nn as nn

In [2]:
# Toy training corpus: short, lowercase, whitespace-separated sentences
# without punctuation, so a plain str.split() is enough to tokenize them.
corpus = [
    'he is a king',
    'she is a queen',
    'king is a man',
    'queen is a woman',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
    'paris is a nice city to visit',
    'london is england capital',
    'brasilia is brasil capital',
    'buenos aires is argentina capital'
]

In [3]:
corpus

['he is a king',
 'she is a queen',
 'king is a man',
 'queen is a woman',
 'he is a man',
 'she is a woman',
 'warsaw is poland capital',
 'berlin is germany capital',
 'paris is france capital',
 'paris is a nice city to visit',
 'london is england capital',
 'brasilia is brasil capital',
 'buenos aires is argentina capital']

In [4]:
def tokenize_corpus(corpus):
    """Split every sentence of ``corpus`` on whitespace.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A list holding, for each sentence in order, the list of its tokens.
    """
    tokenized = []
    for sentence in corpus:
        tokenized.append(sentence.split())
    return tokenized

tokenized_corpus = tokenize_corpus(corpus)

In [5]:
tokenized_corpus

[['he', 'is', 'a', 'king'],
 ['she', 'is', 'a', 'queen'],
 ['king', 'is', 'a', 'man'],
 ['queen', 'is', 'a', 'woman'],
 ['he', 'is', 'a', 'man'],
 ['she', 'is', 'a', 'woman'],
 ['warsaw', 'is', 'poland', 'capital'],
 ['berlin', 'is', 'germany', 'capital'],
 ['paris', 'is', 'france', 'capital'],
 ['paris', 'is', 'a', 'nice', 'city', 'to', 'visit'],
 ['london', 'is', 'england', 'capital'],
 ['brasilia', 'is', 'brasil', 'capital'],
 ['buenos', 'aires', 'is', 'argentina', 'capital']]

In [6]:
tokenized_corpus[0]

['he', 'is', 'a', 'king']

In [7]:
# Vocabulary = every distinct token, in order of first appearance.
# dict.fromkeys de-duplicates with hashed lookups while keeping insertion
# order, replacing the quadratic `token not in vocabulary` list scan and
# avoiding stray loop variables in the notebook namespace.
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))

# Lookup tables in both directions between a word and its integer id.
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

In [8]:
word2idx

{'he': 0,
 'is': 1,
 'a': 2,
 'king': 3,
 'she': 4,
 'queen': 5,
 'man': 6,
 'woman': 7,
 'warsaw': 8,
 'poland': 9,
 'capital': 10,
 'berlin': 11,
 'germany': 12,
 'paris': 13,
 'france': 14,
 'nice': 15,
 'city': 16,
 'to': 17,
 'visit': 18,
 'london': 19,
 'england': 20,
 'brasilia': 21,
 'brasil': 22,
 'buenos': 23,
 'aires': 24,
 'argentina': 25}

In [9]:
idx2word

{0: 'he',
 1: 'is',
 2: 'a',
 3: 'king',
 4: 'she',
 5: 'queen',
 6: 'man',
 7: 'woman',
 8: 'warsaw',
 9: 'poland',
 10: 'capital',
 11: 'berlin',
 12: 'germany',
 13: 'paris',
 14: 'france',
 15: 'nice',
 16: 'city',
 17: 'to',
 18: 'visit',
 19: 'london',
 20: 'england',
 21: 'brasilia',
 22: 'brasil',
 23: 'buenos',
 24: 'aires',
 25: 'argentina'}

In [10]:
vocabulary_size = len(vocabulary)
vocabulary_size

26

In [17]:
# Context window radius: a center word is paired with words at most this many
# positions to its left or right within the same sentence.
window_size = 3
# Accumulates (center word index, context word index) tuples.
idx_pairs = []

In [18]:
# Build the (center word index, context word index) training pairs.
# The list is (re)created here so that re-running this cell neither duplicates
# pairs nor fails after `idx_pairs` has been converted to a NumPy array.
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, treated as the center word
    for center_word_pos, center_word_idx in enumerate(indices):
        # clamp the window to the sentence so it never runs past either end
        first_pos = max(0, center_word_pos - window_size)
        last_pos = min(len(indices), center_word_pos + window_size + 1)
        # for each window position, left to right
        for context_word_pos in range(first_pos, last_pos):
            # a word is not its own context
            if context_word_pos == center_word_pos:
                continue
            idx_pairs.append((center_word_idx, indices[context_word_pos]))

In [19]:
# Turn the list of (center, context) tuples into a two-column array;
# each row is one training pair.
idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array
# Show only the first 15 pairs rather than the whole array.
idx_pairs[0:15]

array([[0, 1],
       [0, 2],
       [0, 3],
       [1, 0],
       [1, 2],
       [1, 3],
       [2, 0],
       [2, 1],
       [2, 3],
       [3, 0],
       [3, 1],
       [3, 2],
       [4, 1],
       [4, 2],
       [4, 5]])

<img src="w2v_1.png" width="150"/>

In [20]:
# Sanity check: print pair 0 as indices, then decode it back into words.
ix = 0
print(f"Pair - {ix} - {idx_pairs[ix]}")
[idx2word[x] for x in idx_pairs[ix]]

Pair - 0 - [0 1]


['he', 'is']

In [21]:
# Sanity check: print pair 1 as indices, then decode it back into words.
ix = 1
print(f"Pair - {ix} - {idx_pairs[ix]}")
[idx2word[x] for x in idx_pairs[ix]]

Pair - 1 - [0 2]


['he', 'a']

In [22]:
# Sanity check: print pair 2 as indices, then decode it back into words.
ix = 2
print(f"Pair - {ix} - {idx_pairs[ix]}")
[idx2word[x] for x in idx_pairs[ix]]

Pair - 2 - [0 3]


['he', 'king']

In [23]:
def get_input_one_hot(word_idx, size=None):
    """Return a 1-D float32 one-hot vector with 1.0 at position ``word_idx``.

    Args:
        word_idx: Position to set. Negative values count from the end, as in
            ordinary tensor indexing (``-1`` marks the last entry).
        size: Length of the vector. Defaults to the notebook-level
            ``vocabulary_size``, so existing one-argument calls behave as
            before.

    Returns:
        A ``torch.float32`` tensor of shape ``(size,)`` that is zero everywhere
        except at ``word_idx``.
    """
    if size is None:
        # Looked up at call time so the function follows the current vocabulary.
        size = vocabulary_size
    # Explicit dtype instead of building the tensor and then casting it.
    x = torch.zeros(size, dtype=torch.float32)
    x[word_idx] = 1.0
    return x

In [24]:
get_input_one_hot(0)

tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])

In [25]:
get_input_one_hot(-1)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1.])

In [26]:
get_input_one_hot(-2)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0.])

In [28]:
embedding_dims = 10  # length of each learned word vector

# Draw the initial weights from a dedicated, seeded generator so that a
# Restart & Run All starts from the same weights every time, without altering
# the global torch RNG state.
init_seed = 0
init_generator = torch.Generator().manual_seed(init_seed)

# The dtype is requested at creation time: casting afterwards could return a
# non-leaf copy whose .grad is never populated.
# W1 (embedding_dims x vocabulary_size): multiplied by a one-hot word vector,
# it selects that word's column, i.e. its embedding.
W1 = torch.randn(embedding_dims, vocabulary_size, generator=init_generator,
                 dtype=torch.float32, requires_grad=True)
# W2 (vocabulary_size x embedding_dims): turns an embedding into one score per
# vocabulary word.
W2 = torch.randn(vocabulary_size, embedding_dims, generator=init_generator,
                 dtype=torch.float32, requires_grad=True)

num_epochs = 500       # full passes over idx_pairs
learning_rate = 0.001  # SGD step size
debug = False          # set True to print per-pair predictions while training

In [29]:
# Train with plain stochastic gradient descent: every (center, context) pair
# is one forward/backward pass followed by one weight update.
for epo in range(num_epochs):
    loss_val = 0  # sum of the per-pair losses seen in this epoch
    for data, target in idx_pairs:
        x = get_input_one_hot(data)
        # nll_loss expects a batch of class indices, hence the 1-element tensor
        y_true = torch.tensor([int(target)], dtype=torch.long)
        z1 = torch.matmul(W1, x)   # embedding of the center word
        z2 = torch.matmul(W2, z1)  # one score per vocabulary word

        log_softmax = F.log_softmax(z2, dim=0)

        # view(1, -1) adds the batch dimension that nll_loss requires
        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.item()
        loss.backward()

        # Update the weights outside of autograd. torch.no_grad() is the
        # supported way to modify leaf tensors in place; going through .data
        # bypasses autograd's correctness checks.
        with torch.no_grad():
            W1.sub_(learning_rate * W1.grad)
            W2.sub_(learning_rate * W2.grad)
            # Gradients accumulate across backward() calls, so reset them.
            W1.grad.zero_()
            W2.grad.zero_()

        if debug:
            print(f"Data:   {data}")
            print(f"Target: {target}")
            #print(f"Log SoftMax: {log_softmax}")
            print(f"Arg Max Log SoftMax: {torch.argmax(log_softmax)}")
            print(f"True Label: {y_true}")
            print("-"*30)
    # Report the mean loss per pair every 50 epochs
    if epo % 50 == 0:
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 6.915566307968564
Loss at epo 50: 3.9562672475973764
Loss at epo 100: 3.308071145415306
Loss at epo 150: 2.9743654479583106
Loss at epo 200: 2.760923661126031
Loss at epo 250: 2.6125941935512755
Loss at epo 300: 2.5028090198834736
Loss at epo 350: 2.4165219058593115
Loss at epo 400: 2.3464569449424744
Loss at epo 450: 2.2886743346850076


In [30]:
W1.shape

torch.Size([10, 26])

In [31]:
W2.shape

torch.Size([26, 10])

In [32]:
def get_embed(word):
    """Return the embedding of ``word`` as W1 times the word's one-hot vector.

    ``word`` must be a key of ``word2idx``; an unknown word raises ``KeyError``.
    """
    one_hot = get_input_one_hot(word2idx[word])
    return torch.matmul(W1, one_hot)

In [34]:
embed_king = get_embed("king")
embed_king

tensor([-1.3280,  2.2264, -0.4459, -0.8265,  0.5568, -0.9928, -1.3922, -0.5202,
         0.1527, -0.6687], grad_fn=<MvBackward>)

In [35]:
embed_queen = get_embed("queen")
embed_queen

tensor([-0.4386,  0.1551,  1.2959, -0.9430, -0.5632, -1.1942, -0.8030, -0.4712,
        -1.5649, -2.0273], grad_fn=<MvBackward>)

## Compare Similarity

In [36]:
# Cosine similarity computed along dim 0, i.e. between two 1-D embedding
# vectors; eps guards against division by zero for near-zero vectors.
cos = nn.CosineSimilarity(dim=0, eps=1e-6)

In [37]:
cos(embed_king, embed_king)

tensor(1., grad_fn=<DivBackward0>)

In [38]:
cos(get_embed("king"), get_embed("england"))

tensor(0.0209, grad_fn=<DivBackward0>)

In [39]:
cos(get_embed("king"), get_embed("argentina"))

tensor(0.0668, grad_fn=<DivBackward0>)

In [40]:
cos(get_embed("king"), get_embed("man"))

tensor(0.9168, grad_fn=<DivBackward0>)

In [41]:
cos(get_embed("king"), get_embed("woman"))

tensor(0.7321, grad_fn=<DivBackward0>)