In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F

In [2]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


In [3]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim_left, embedding_dim_right, context_size):
        super(CBOW, self).__init__()
        self.embeddings_left = nn.Embedding(vocab_size, embedding_dim_left)
        self.embeddings_right = nn.Embedding(vocab_size, embedding_dim_right)
        self.linear1 = nn.Linear(context_size * embedding_dim_left, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        left_embeds = self.embeddings_left(inputs[:2]).view((1,-1))
        right_embeds = self.embeddings_left(inputs[2:]).view((1,-1))
        left_out = F.relu(self.linear1(left_embeds))
        right_out = F.relu(self.linear1(right_embeds))
        
        out = self.linear2(left_out + right_out)
        
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [4]:
model = CBOW(vocab_size, 10, 10, 2)

In [5]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [6]:
from tqdm import tqdm
losses = []

for epoch in tqdm(range(10)):
    total_loss = 0
    for context, target in data:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()
        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)
        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        
        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

100%|██████████| 10/10 [00:00<00:00, 26.30it/s]

[230.35563230514526, 227.19607210159302, 224.15406250953674, 221.21957325935364, 218.3815402984619, 215.6315939426422, 212.96200942993164, 210.36465167999268, 207.83235359191895, 205.3590123653412]



