In [38]:
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch import optim

In [1]:
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]

# Create a vocabulary of unique words from this

# In real implementation we would have to perform case normalization, removing some punctuation etc,
#but for simplicity let’s use this nice and clean data

In [2]:
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

In [4]:
tokenized_corpus = tokenize_corpus(corpus)
print tokenized_corpus

[['he', 'is', 'a', 'king'], ['she', 'is', 'a', 'queen'], ['he', 'is', 'a', 'man'], ['she', 'is', 'a', 'woman'], ['warsaw', 'is', 'poland', 'capital'], ['berlin', 'is', 'germany', 'capital'], ['paris', 'is', 'france', 'capital']]


In [8]:
vocabulary = []
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)
            
# ask bob ?????????
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)
print len(vocabulary)
print vocabulary

15
['he', 'is', 'a', 'king', 'she', 'queen', 'man', 'woman', 'warsaw', 'poland', 'capital', 'berlin', 'germany', 'paris', 'france']


In [16]:
window_size = 2
idx_pairs = []

for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence] # take out word, change this word to index
    
    # for each word treated as center word
    for center_word_pos in range(len(indices)):
        
        # for each window position
        for w in range(-window_size, window_size+1):
            context_word_pos = center_word_pos + w
            
            if context_word_pos<0 or context_word_pos>=len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))
            
idx_pairs = np.array(idx_pairs)

In [17]:
# skip gram
# For single pair, Prob(context/center)  e.g. P(king/is)

# Not a good approach
# max { multiplication(center) * multiplicatin(context) P(context/center) } 

# Negative log likelihood
# min { -log(multiplication(center) * multiplicatin(context) P(context/center)) }

# Loss = -1/T summation(center) * summation(context) log( P(context/center) )

# p(context/center) = exp(u.context * v.Center) / summation(exp(u.i * v.Center)) (i -> vocab)

In [23]:
# Input Layer
def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x

In [44]:
# TRAINING

# Hidden Layer
embedding_dims = 5
w1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
w2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 100
learning_rate = 0.001

for epoch in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())
        
        # Output Layer
        z1 = torch.matmul(w1,x)
        z2 = torch.matmul(w2,z1)
        
        log_softmax = F.log_softmax(z2, dim=0)
        
        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.data[0]
        
        loss.backward()
        
        optimizer = optim.Adam([w1,w2], lr=learning_rate)
        optimizer.step()
        optimizer.zero_grad()
        
#         w1.data -= learning_rate * w1.grad.data
#         w2.data -= learning_rate * w2.grad.data
#         w1.grad.data.zero_()
#         w2.grad.data.zero_()
        
    if epoch % 10 == 0:
        print 'loss at ' + str(epoch) + ': ' + str(loss/len(idx_pairs))



loss at 0: tensor(0.1014)
loss at 10: tensor(1.00000e-02 *
       9.5724)
loss at 20: tensor(1.00000e-02 *
       9.0294)
loss at 30: tensor(1.00000e-02 *
       8.5159)
loss at 40: tensor(1.00000e-02 *
       8.0377)
loss at 50: tensor(1.00000e-02 *
       7.6342)
loss at 60: tensor(1.00000e-02 *
       7.3529)
loss at 70: tensor(1.00000e-02 *
       7.0874)
loss at 80: tensor(1.00000e-02 *
       6.8457)
loss at 90: tensor(1.00000e-02 *
       6.6479)


In [None]:
# Now How to Extract Vectors from this????????????