In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random

In [2]:
device = 'cuda' if torch.cuda.is_available else 'cpu'

In [3]:
raw_text = "We are about to study the idea of a computational process. Computational processes are abstract beings that inhabit computers. As they evolve, processes manipulate other abstract things called data. The evolution of a process is directed by a pattern of rules called a program. People create programs to direct processes. In effect, we conjure the spirits of the computer with our spells."

In [4]:
def clean_text(text):
    text = text.replace(',', '').replace('.','')
    return text

In [5]:
text = list(set(clean_text(raw_text).split()))

In [6]:
print(text)

['manipulate', 'The', 'effect', 'spells', 'Computational', 'computers', 'our', 'As', 'to', 'that', 'a', 'evolve', 'idea', 'with', 'We', 'rules', 'the', 'spirits', 'are', 'about', 'direct', 'program', 'programs', 'data', 'by', 'computational', 'inhabit', 'pattern', 'processes', 'In', 'abstract', 'called', 'conjure', 'create', 'computer', 'study', 'of', 'other', 'beings', 'evolution', 'is', 'directed', 'we', 'process', 'they', 'things', 'People']


In [7]:
id2word = dict()
word2id = dict()
for i, word in enumerate(text):
    id2word[i] = word
    word2id[word] = i

In [8]:
def skipgram(sentence, window_size=2, neg_samples=5, raw_sentence=False):
    pairs = []
    if raw_sentence:
        sentence = sentence.lower().split()
    
    for i, word in enumerate(sentence):
        cnt = 0
        for j in range(-window_size, window_size+1):
            if j != 0 and (i+j) >= 0 and (i+j) < len(sentence):
                pairs.append((sentence[i], sentence[i+j], 1))
                cnt += 1
                
        #NAIVE negative sampling
        for _ in range(neg_samples):
            ran_num = random.randint(0, len(word2id)-1)
            while ran_num == word2id[word]:
                ran_num = random.randint(0, len(word2id)-1)
            neg_sample = id2word[ran_num]
            pairs.append((sentence[i], neg_sample, 0))

    return pairs

In [9]:
def pair_to_input(pairs, id2word, word2id):
    center_ix = []
    context_ix = []
    targets = []
    for pair in pairs:
        center_ix.append(word2id[pair[0]])
        context_ix.append(word2id[pair[1]])
        targets.append(pair[2])
    
    return center_ix, context_ix, targets

In [10]:
pairs = skipgram(text)

In [11]:
center_ix, context_ix, targets = pair_to_input(pairs, id2word, word2id)

In [12]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, embed_dim):
        super(Skipgram, self).__init__()
        self.center_emb = nn.Embedding(vocab_size, embed_dim)
        self.context_emb = nn.Embedding(vocab_size, embed_dim)
        
    def forward(self, u_ix, v_ix):
        
        u = self.context_emb(u_ix).view(1,-1)
        v = self.center_emb(v_ix).view(1,-1)
        score = torch.mm(u, v.transpose(1,0))
        
        return torch.sigmoid(score)

In [13]:
num_epoch = 30

In [14]:
model = Skipgram(len(word2id), 100)
model.to(device)

Skipgram(
  (center_emb): Embedding(47, 100)
  (context_emb): Embedding(47, 100)
)

In [15]:
def train():
    
    optimizer = optim.SGD(lr=1e-3, params=model.parameters())
    
    for epoch in range(num_epoch):
        for i in range(len(pairs)):
            center = torch.tensor(center_ix[i]).to(device).long()
            context = torch.tensor(context_ix[i]).to(device).long()
            target = torch.tensor(targets[i]).view(1,-1).to(device).float()
            
            out = model.forward(context, center)
            loss = F.binary_cross_entropy(out, target)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (i+1) % 300 == 0:
                print('Epoch %d  %d steps, loss : %0.4f' %(epoch+1, i+1, loss))

In [20]:
train()

Epoch 1  300 steps, loss : 0.0984
Epoch 2  300 steps, loss : 0.0968
Epoch 3  300 steps, loss : 0.0952
Epoch 4  300 steps, loss : 0.0937
Epoch 5  300 steps, loss : 0.0923
Epoch 6  300 steps, loss : 0.0908
Epoch 7  300 steps, loss : 0.0895
Epoch 8  300 steps, loss : 0.0881
Epoch 9  300 steps, loss : 0.0868
Epoch 10  300 steps, loss : 0.0856
Epoch 11  300 steps, loss : 0.0843
Epoch 12  300 steps, loss : 0.0832
Epoch 13  300 steps, loss : 0.0820
Epoch 14  300 steps, loss : 0.0809
Epoch 15  300 steps, loss : 0.0798
Epoch 16  300 steps, loss : 0.0787
Epoch 17  300 steps, loss : 0.0777
Epoch 18  300 steps, loss : 0.0767
Epoch 19  300 steps, loss : 0.0757
Epoch 20  300 steps, loss : 0.0747
Epoch 21  300 steps, loss : 0.0738
Epoch 22  300 steps, loss : 0.0729
Epoch 23  300 steps, loss : 0.0720
Epoch 24  300 steps, loss : 0.0711
Epoch 25  300 steps, loss : 0.0703
Epoch 26  300 steps, loss : 0.0695
Epoch 27  300 steps, loss : 0.0686
Epoch 28  300 steps, loss : 0.0679
Epoch 29  300 steps, loss : 0

In [21]:
def n_neighbors(word, model, k=3):
    
    embedding = model.center_emb.weight #(vocab_size x dim)
    word_emb = embedding[word2id[word]] #(1 x dim)
    
    distances = []
    for word in embedding:
        distances.append(torch.pow(torch.pow(word - word_emb, 2).sum(), 0.5))
    similarities = torch.mm(embedding, word_emb.view(1,-1).transpose(1,0)).cpu().detach().numpy()
    
    _, closest = torch.topk(torch.tensor(distances), k+1, largest=False)
    closest = closest.cpu().detach().numpy().tolist()
    
    words = []
    for i, close in enumerate(closest):
        if i != 0:
            words.append(id2word[close])
    print('%d closest words :' %k, words)
    
    #return words

In [22]:
n_neighbors('inhabit', model, 5)

5 closest words : ['they', 'that', 'evolve', 'rules', 'to']
