# LSTM

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x1fa4d69eb70>

In [3]:
input_dim = 3
output_dim = 3
lstm = nn.LSTM(input_dim, output_dim)
print(lstm)

LSTM(3, 3)


In [4]:
# sequence 생성
inputs = [torch.randn(1, 3) for _ in range(5)]
print(inputs)

[tensor([[-0.5525,  0.6355, -0.3968]]), tensor([[-0.6571, -1.6428,  0.9803]]), tensor([[-0.0421, -0.8206,  0.3133]]), tensor([[-1.1352,  0.3773, -0.2824]]), tensor([[-2.5667, -1.4303,  0.5009]])]


In [5]:
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))

# 한번에 하나의 단어만 lstm에 투입
for i in inputs:
    print(i.shape)
    out, hidden = lstm(i.view(1, 1, -1), hidden)

torch.Size([1, 3])
torch.Size([1, 3])
torch.Size([1, 3])
torch.Size([1, 3])
torch.Size([1, 3])


In [6]:
# 한꺼번에 sequence 전체를 lstm에 투입
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
print(inputs.shape)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))

torch.Size([5, 1, 3])


In [7]:
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

tensor([[[-0.0187,  0.1713, -0.2944]],

        [[-0.3521,  0.1026, -0.2971]],

        [[-0.3191,  0.0781, -0.1957]],

        [[-0.1634,  0.0941, -0.1637]],

        [[-0.3368,  0.0959, -0.0538]]])
(tensor([[[-0.3368,  0.0959, -0.0538]]]), tensor([[[-0.9825,  0.4715, -0.0633]]]))


# An LSTM for Part-of-Speech Tagging

In [8]:
# prepard data
# sequence vector to indexs vector
def prepare_sequence(seq, word2idx):
    idxs = [word2idx[w] for w in seq]
    return torch.LongTensor(idxs)

In [9]:
# input: 문장의 token
# target: token들의 품사
train_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

word2idx = {}
for sentence, tag in train_data:
    for word in sentence:
        if word not in word2idx:
            word2idx[word] = len(word2idx)
print(word2idx)

tag2idx = {"DET": 0, "NN": 1, "V": 2}

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [10]:
embedding_dim = 6
hidden_dim = 6
vocab_size = len(word2idx)
tagset_size = len(tag2idx)

class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger,self).__init__()
        
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        
        self.hidden = self.init_hidden()
    
    def init_hidden(self):
        # num_layers, minibatch_size, hidden_dim
        return (torch.zeros(1, 1, self.hidden_dim), torch.zeros(1, 1, self.hidden_dim))
    
    def forward(self, x):
#         print("Original input size:", x.shape)
        emb = self.embedding(x)
#         print("Embedding output size:", emb.shape)
        emb = emb.view(len(x), 1, -1) 
#         print("Resized embedding output size:", emb.shape)
        
        lstm_out, self.hidden = self.lstm(emb.view(len(x), 1, -1), self.hidden)
        tag_out = self.hidden2tag(lstm_out.view(len(x), -1))
        out = F.log_softmax(tag_out, dim=1)
        return out

In [11]:
model = LSTMTagger(embedding_dim, hidden_dim, vocab_size, tagset_size)
print(model)

LSTMTagger(
  (embedding): Embedding(9, 6)
  (lstm): LSTM(6, 6)
  (hidden2tag): Linear(in_features=6, out_features=3, bias=True)
)


In [12]:
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [13]:
# 학습 전의 output 확인
with torch.no_grad():
    inputs = prepare_sequence(train_data[0][0], word2idx)
    outputs = model(inputs)
    print(outputs)

tensor([[-1.1389, -1.2024, -0.9693],
        [-1.1065, -1.2200, -0.9834],
        [-1.1286, -1.2093, -0.9726],
        [-1.1190, -1.1960, -0.9916],
        [-1.0137, -1.2642, -1.0366]])


In [15]:
# 학습 과정
import numpy as np
for epoch in range(300):
    losses = []
    for sentence, tags in train_data:
        model.zero_grad()
        model.hidden = model.init_hidden()
        
        sentence = prepare_sequence(sentence, word2idx)
        tags = prepare_sequence(tags, tag2idx)
        
#         print(sentence.shape)
#         print(tags.shape)
        
        outputs = model(sentence)
        loss = criterion(outputs, tags)
        print(loss)
        
        
        loss.backward()
        optimizer.step()
        
        losses.append(loss.item())
    if (epoch+1) % 30 == 0:
        print("[%d/%d] loss:%.3f" % (epoch+1, 300, np.mean(losses)))

tensor(1.00000e-02 *
       3.0276)
tensor(1.00000e-02 *
       3.0777)
tensor(1.00000e-02 *
       3.0097)
tensor(1.00000e-02 *
       3.0601)
tensor(1.00000e-02 *
       2.9919)
tensor(1.00000e-02 *
       3.0428)
tensor(1.00000e-02 *
       2.9744)
tensor(1.00000e-02 *
       3.0256)
tensor(1.00000e-02 *
       2.9570)
tensor(1.00000e-02 *
       3.0086)
tensor(1.00000e-02 *
       2.9398)
tensor(1.00000e-02 *
       2.9918)
tensor(1.00000e-02 *
       2.9228)
tensor(1.00000e-02 *
       2.9751)
tensor(1.00000e-02 *
       2.9060)
tensor(1.00000e-02 *
       2.9587)
tensor(1.00000e-02 *
       2.8893)
tensor(1.00000e-02 *
       2.9423)
tensor(1.00000e-02 *
       2.8728)
tensor(1.00000e-02 *
       2.9262)
tensor(1.00000e-02 *
       2.8565)
tensor(1.00000e-02 *
       2.9102)
tensor(1.00000e-02 *
       2.8403)
tensor(1.00000e-02 *
       2.8944)
tensor(1.00000e-02 *
       2.8243)
tensor(1.00000e-02 *
       2.8787)
tensor(1.00000e-02 *
       2.8085)
tensor(1.00000e-02 *
       

tensor(1.00000e-02 *
       1.7360)
tensor(1.00000e-02 *
       1.8013)
tensor(1.00000e-02 *
       1.7294)
tensor(1.00000e-02 *
       1.7947)
[120/300] loss:0.018
tensor(1.00000e-02 *
       1.7229)
tensor(1.00000e-02 *
       1.7882)
tensor(1.00000e-02 *
       1.7165)
tensor(1.00000e-02 *
       1.7818)
tensor(1.00000e-02 *
       1.7101)
tensor(1.00000e-02 *
       1.7753)
tensor(1.00000e-02 *
       1.7037)
tensor(1.00000e-02 *
       1.7690)
tensor(1.00000e-02 *
       1.6974)
tensor(1.00000e-02 *
       1.7626)
tensor(1.00000e-02 *
       1.6911)
tensor(1.00000e-02 *
       1.7563)
tensor(1.00000e-02 *
       1.6849)
tensor(1.00000e-02 *
       1.7501)
tensor(1.00000e-02 *
       1.6787)
tensor(1.00000e-02 *
       1.7439)
tensor(1.00000e-02 *
       1.6725)
tensor(1.00000e-02 *
       1.7377)
tensor(1.00000e-02 *
       1.6664)
tensor(1.00000e-02 *
       1.7315)
tensor(1.00000e-02 *
       1.6604)
tensor(1.00000e-02 *
       1.7255)
tensor(1.00000e-02 *
       1.6543)
tensor(

tensor(1.00000e-02 *
       1.2253)
tensor(1.00000e-02 *
       1.1631)
tensor(1.00000e-02 *
       1.2221)
tensor(1.00000e-02 *
       1.1600)
tensor(1.00000e-02 *
       1.2189)
tensor(1.00000e-02 *
       1.1569)
tensor(1.00000e-02 *
       1.2158)
tensor(1.00000e-02 *
       1.1538)
tensor(1.00000e-02 *
       1.2126)
tensor(1.00000e-02 *
       1.1507)
tensor(1.00000e-02 *
       1.2094)
tensor(1.00000e-02 *
       1.1477)
tensor(1.00000e-02 *
       1.2063)
tensor(1.00000e-02 *
       1.1446)
tensor(1.00000e-02 *
       1.2032)
tensor(1.00000e-02 *
       1.1416)
tensor(1.00000e-02 *
       1.2001)
tensor(1.00000e-02 *
       1.1386)
tensor(1.00000e-02 *
       1.1970)
tensor(1.00000e-02 *
       1.1356)
tensor(1.00000e-02 *
       1.1939)
tensor(1.00000e-02 *
       1.1326)
tensor(1.00000e-02 *
       1.1909)
tensor(1.00000e-02 *
       1.1296)
tensor(1.00000e-02 *
       1.1878)
tensor(1.00000e-02 *
       1.1266)
tensor(1.00000e-02 *
       1.1848)
tensor(1.00000e-02 *
       

In [14]:
# 학습 후의 output 확인
with torch.no_grad():
    inputs = prepare_sequence(train_data[0][0], word2idx)
    outputs = model(inputs)
    print(outputs)

tensor([[-0.0858, -2.9355, -3.5374],
        [-5.2313, -0.0234, -4.0314],
        [-3.9098, -4.1279, -0.0368],
        [-0.0187, -4.7809, -4.5960],
        [-5.8170, -0.0183, -4.1879]])


In [15]:
# test
test_sentence = "The boy goes to high school"
test_sentence = test_sentence.split()

print(test_sentence)

word2idx = {}
for word in test_sentence:
    if word not in word2idx:
        word2idx[word] = len(word2idx)
    
tag2idx = {"DET": 0, "NN": 1, "V": 2}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

test_sentence = prepare_sequence(test_sentence, word2idx)
outputs = model(test_sentence)
scores, idxs = torch.max(outputs, dim=1)
result_tag = [idx2tag[idx.item()] for idx in idxs]
print(result_tag)

['The', 'boy', 'goes', 'to', 'high', 'school']
['DET', 'NN', 'V', 'DET', 'NN', 'NN']
