In [1]:
import torch
from torch import nn, optim
import numpy as np

# Toy corpus: short lowercase sentences, one per entry, tokenised later by
# whitespace. Order matters because word ids are assigned by first appearance.
sentences = [
    "human feeds dog",
    "human walks dog",
    "dog follows human",
    "dog guards house",
    "human lives in house",
    "dog lives in house",
    "human plays with dog",
    "dog fetches ball",
    "human throws ball",
    "dog sleeps on floor",
    "human sleeps on bed",
    "dog waits near door",
    "human opens door",
    "dog runs outside",
    "dog returns to human",
    "human pats dog",
    "dog wags tail",
    "human and dog are friends",
    "human and dog are pals",
    "human trains dog",
    "dog learns commands",
    "dog eats food",
    "human buys food",
    "dog stays with human",
]

In [2]:
def get_vocab(data):
  """Build word <-> id lookup tables from an iterable of sentences.

  Ids are assigned in order of first appearance, starting at 1; id 0 is never
  assigned to a word.

  Args:
    data: iterable of sentence strings; each is tokenised with ``str.split()``.

  Returns:
    A tuple ``(size, word2idx, idx2word)``. ``size`` is the number of distinct
    words plus one (the first id that was not handed out), ``word2idx`` maps
    word -> id and ``idx2word`` maps id -> word.
  """
  word2idx = dict()
  idx2word = dict()

  # Walk the argument rather than the notebook-level `sentences` list, so the
  # function honours whatever corpus the caller passes in.
  for sentence in data:
    for word in sentence.split():
      if word not in word2idx:
        next_id = len(word2idx) + 1
        word2idx[word] = next_id # mapping word to id
        idx2word[next_id] = word # mapping id to word
  return len(word2idx) + 1, word2idx, idx2word

# get_vocab hands out ids starting at 1, so vocab_size is the number of
# distinct words plus one and id 0 is left unassigned.
vocab_size, word2idx, idx2word = get_vocab(sentences)

In [3]:
def prev_words(i, doc, window_size):
  """Return the ids of the `window_size` tokens immediately before position `i`.

  Positions that fall before the start of `doc` are reported as 0. Ids are
  looked up in the module-level `word2idx` mapping.

  Args:
    i: position of the center token in `doc`.
    doc: sequence of tokens.
    window_size: how many preceding positions to return.

  Returns:
    A list of `window_size` ids, ordered left to right.
  """
  return [
      word2idx[doc[pos]] if pos >= 0 else 0
      for pos in range(i - window_size, i)
  ]

def next_words(i, doc, window_size):
  """Return the ids of the `window_size` tokens immediately after position `i`.

  Positions that fall past the end of `doc` are reported as 0. Ids are looked
  up in the module-level `word2idx` mapping.

  Args:
    i: position of the center token in `doc`.
    doc: sequence of tokens.
    window_size: how many following positions to return.

  Returns:
    A list of `window_size` ids, ordered left to right.
  """
  return [
      word2idx[doc[pos]] if pos < len(doc) else 0
      for pos in range(i + 1, i + window_size + 1)
  ]

def get_training(sentences, window_size):
  """Build skip-gram (center id, context id) training pairs.

  For every token, the `window_size // 2` tokens on each side are collected
  with `prev_words` / `next_words`; positions outside the sentence come back
  as 0 and are dropped. Ids are looked up in the module-level `word2idx`.

  Args:
    sentences: iterable of sentence strings, tokenised with ``str.split()``.
    window_size: nominal window width; half of it (integer division) is used
      on each side of the center token.

  Returns:
    Two equal-length lists ``(centers, contexts)``. Both are empty when no
    pair is produced (no sentences, only one-word sentences, or
    ``window_size < 2``) instead of failing while unpacking an empty zip.
  """
  half_window = window_size // 2
  centers = []
  contexts = []
  for sentence in sentences:
    tokens = sentence.split()
    for index, word in enumerate(tokens):
      before = prev_words(index, tokens, half_window)
      after = next_words(index, tokens, half_window)
      center_idx = word2idx[word]
      for ctx in before + after:
        # 0 marks a position outside the sentence, not a real word.
        if ctx != 0:
          centers.append(center_idx)
          contexts.append(ctx)
  return centers, contexts

# get_training uses window_size // 2 context positions on each side of the
# center word, so 5 means two words before and two words after.
window_size = 5
x, y = get_training(sentences, window_size)


In [4]:
# Center ids (inputs) and context ids (targets) as 1-D int64 tensors.
x_tensor = torch.tensor(x, dtype=torch.long)
y_tensor = torch.tensor(y, dtype=torch.long)
x_tensor.shape, y_tensor.shape

(torch.Size([192]), torch.Size([192]))

In [5]:
class SkipGram(nn.Module):
  """Embedding lookup followed by a linear layer that scores every vocabulary id."""

  def __init__(self, vocab_size, embedding_dim):
    """Create the embedding table and the output projection.

    Args:
      vocab_size: number of rows in the embedding table and number of output scores.
      embedding_dim: width of each embedding vector.
    """
    super().__init__()
    # The embedding is created before the linear layer; keep this order so
    # parameter registration and random initialisation stay the same.
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
    self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

  def forward(self, input):
    """Map a tensor of ids to scores with a trailing dimension of size vocab_size."""
    return self.linear(self.embedding(input))

In [6]:
# Training configuration.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lr = 0.01
epochs = 10000
SEED = 0

# Seed before the model is built so the randomly initialised embedding and
# linear weights repeat from one Restart & Run All to the next.
# NOTE(review): seeding alone may not give bit-identical results on GPU.
torch.manual_seed(SEED)

model = SkipGram(vocab_size, embedding_dim=2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Full-batch training: every (center, context) pair is used in each step.
x_tensor = x_tensor.to(device)
y_tensor = y_tensor.to(device)
for epoch in range(epochs):
  pred = model(x_tensor)
  loss = criterion(pred, y_tensor)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  # `epoch` is 0-based, so the line logged after the 1000th update reads
  # "epoch 999"; the format is kept as-is to match earlier logs.
  if (epoch+1) % 1000 == 0:
    print(f'epoch {epoch}, loss: {loss.item(): .4f}')

epoch 999, loss:  2.4214
epoch 1999, loss:  2.3681
epoch 2999, loss:  2.3422
epoch 3999, loss:  2.3255
epoch 4999, loss:  2.3133
epoch 5999, loss:  2.3038
epoch 6999, loss:  2.2961
epoch 7999, loss:  2.2898
epoch 8999, loss:  2.2846
epoch 9999, loss:  2.2794
