In [1]:
# code by Tae Hwan Jung @graykode
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

dtype = torch.FloatTensor

In [2]:
sentences = [ "i like dog", "i love coffee", "i hate milk"]

In [3]:
word_list = " ".join(sentences).split()
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}
number_dict = {i: w for i, w in enumerate(word_list)}
n_class = len(word_dict) # number of Vocabulary

In [4]:
word_dict

{'love': 0, 'dog': 1, 'hate': 2, 'i': 3, 'like': 4, 'milk': 5, 'coffee': 6}

In [6]:
number_dict

{0: 'love', 1: 'dog', 2: 'hate', 3: 'i', 4: 'like', 5: 'milk', 6: 'coffee'}

In [5]:
n_class

7

In [7]:
def make_batch(sentences):
    input_batch = []
    target_batch = []

    for sen in sentences:
        word = sen.split()
        input = [word_dict[n] for n in word[:-1]]
        target = word_dict[word[-1]]

        input_batch.append(input)
        target_batch.append(target)

    return input_batch, target_batch

In [8]:
input_batch, target_batch = make_batch(sentences)
input_batch = Variable(torch.LongTensor(input_batch))
target_batch = Variable(torch.LongTensor(target_batch))

In [9]:
# {'love': 0, 'dog': 1, 'hate': 2, 'i': 3, 'like': 4, 'milk': 5, 'coffee': 6}
input_batch

tensor([[3, 4],
        [3, 0],
        [3, 2]])

In [10]:
target_batch

tensor([1, 6, 5])

In [11]:
class NNLM(nn.Module):
    def __init__(self):
        super(NNLM, self).__init__()
        self.C = nn.Embedding(n_class, m)
        self.H = nn.Parameter(torch.randn(n_step * m, n_hidden).type(dtype))
        self.W = nn.Parameter(torch.randn(n_step * m, n_class).type(dtype))
        self.d = nn.Parameter(torch.randn(n_hidden).type(dtype))
        self.U = nn.Parameter(torch.randn(n_hidden, n_class).type(dtype))
        self.b = nn.Parameter(torch.randn(n_class).type(dtype))

    def forward(self, X):
        X = self.C(X) 
        X = X.view(-1, n_step * m) # [batch_size, n_step * n_class]
        tanh = torch.tanh(self.d + torch.mm(X, self.H)) # [batch_size, n_hidden]
        output = self.b + torch.mm(X, self.W) + torch.mm(tanh, self.U) # [batch_size, n_class]
        return output

In [12]:
# NNLM Parameter
n_step = 2 # n-1 in paper
n_hidden = 2 # h in paper
m = 2 # m in paper

### step1

In [13]:
# self.C = nn.Embedding(n_class, m)
layer1 = nn.Embedding(7, 2)
input_batch = layer1(input_batch)

In [16]:
print(input_batch)
print(input_batch.size())

tensor([[[-0.4834, -0.4924],
         [ 0.9009,  0.3777]],

        [[-0.4834, -0.4924],
         [-0.3069, -0.9502]],

        [[-0.4834, -0.4924],
         [ 0.8882,  0.1755]]], grad_fn=<EmbeddingBackward>)
torch.Size([3, 2, 2])


* i like -> [3,4] -> 
[-0.4834, -0.4924],
[ 0.9009,  0.3777]

### step2

In [17]:
# X = X.view(-1, n_step * m)

input_batch = input_batch.view(-1, 2 * 2)

In [18]:
print(input_batch)
print(input_batch.size())

tensor([[-0.4834, -0.4924,  0.9009,  0.3777],
        [-0.4834, -0.4924, -0.3069, -0.9502],
        [-0.4834, -0.4924,  0.8882,  0.1755]], grad_fn=<ViewBackward>)
torch.Size([3, 4])


### step3

In [19]:
# tanh = torch.tanh(self.d + torch.mm(X, self.H)) # [batch_size, n_hidden]


d = nn.Parameter(torch.randn(2).type(dtype))
H = nn.Parameter(torch.randn(2 * 2, 2).type(dtype))


tanh = torch.tanh(d + torch.mm(input_batch, H))

[3 * 4]와 [4 * 2]를 내적하면서 -> [3 * 2]

In [20]:
print(d.shape)
print(H.shape)

torch.Size([2])
torch.Size([4, 2])


In [21]:
print(tanh.shape)

torch.Size([3, 2])


### step4

In [23]:
# output = self.b + torch.mm(X, self.W) + torch.mm(tanh, self.U) # [batch_size, n_class]


b = nn.Parameter(torch.randn(7).type(dtype))
W = nn.Parameter(torch.randn(4, 7).type(dtype))
U = nn.Parameter(torch.randn(2, 7).type(dtype))

output = b + torch.mm(input_batch, W) + torch.mm(tanh, U)

In [24]:
print(b.shape)
print(W.shape)
print(U.shape)

torch.Size([7])
torch.Size([4, 7])
torch.Size([2, 7])


In [25]:
print(output.shape)

torch.Size([3, 7])


In [26]:
output

tensor([[-3.6326,  1.7987,  1.4586, -2.5563, -0.0386,  1.7940, -0.2317],
        [ 0.0491,  1.8512,  1.1408, -1.2138, -1.8759,  1.2862, -3.9210],
        [-3.3913,  1.8834,  1.4202, -2.6676, -0.3190,  1.6556, -0.4457]],
       grad_fn=<AddBackward0>)

### Training

In [27]:
model = NNLM()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [29]:
input_batch, target_batch = make_batch(sentences)
input_batch = Variable(torch.LongTensor(input_batch))
target_batch = Variable(torch.LongTensor(target_batch))

for epoch in range(5000):

    optimizer.zero_grad()
    output = model(input_batch)

    # output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)
    loss = criterion(output, target_batch)
    if (epoch + 1)%1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    loss.backward()
    optimizer.step()

# Predict
predict = model(input_batch).data.max(1, keepdim=True)[1]

# Test
print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()])

Epoch: 1000 cost = 0.056290
Epoch: 2000 cost = 0.009048
Epoch: 3000 cost = 0.003249
Epoch: 4000 cost = 0.001505
Epoch: 5000 cost = 0.000785
[['i', 'like'], ['i', 'love'], ['i', 'hate']] -> ['dog', 'coffee', 'milk']
