Sequence Models and Long-Short Term Memory Networks

# Import modules

In [9]:
from pprint import pprint
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x115853110>

# LSTM's in Pytorch

In [11]:
lstm = nn.LSTM(3, 3)  # Input dim: 3, Output dim: 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # 特徴3つ持った length 5 のシーケンス
print("--- Inputs ---")
pprint(inputs)
print()

# Initialize the hidden state
hidden = (torch.randn(1, 1, 3),
                  torch.randn(1, 1, 3))
print("--- hidden ---")
pprint(hidden)
print()

for i in inputs:
    out, hidden = lstm(i.view(1, 1, -1), hidden)

print("--- out ---")
pprint(out)
print()
print("--- hidden ---")
pprint(hidden)

--- Inputs ---
[tensor([[-0.1473,  0.3482,  1.1371]]),
 tensor([[-0.3339, -1.4724,  0.7296]]),
 tensor([[-0.1312, -0.6368,  1.0429]]),
 tensor([[ 0.4903,  1.0318, -0.5989]]),
 tensor([[ 1.6015, -1.0735, -1.2173]])]

--- hidden ---
(tensor([[[ 0.6472, -0.0412, -0.1775]]]),
 tensor([[[-0.5000,  0.8673, -0.2732]]]))

--- out ---
tensor([[[-0.1077,  0.0289, -0.0487]]], grad_fn=<CatBackward>)

--- hidden ---
(tensor([[[-0.1077,  0.0289, -0.0487]]], grad_fn=<ViewBackward>),
 tensor([[[-0.1439,  0.1426, -0.2563]]], grad_fn=<ViewBackward>))


In [21]:
pprint(inputs[0])
print(inputs[0].shape)
pprint(inputs[0].view(1, 1, -1))
print(inputs[0].view(1, 1, -1).shape)

tensor([[-0.1473,  0.3482,  1.1371]])
torch.Size([1, 3])
tensor([[[-0.1473,  0.3482,  1.1371]]])
torch.Size([1, 1, 3])


In [35]:
inputs = torch.cat(tuple(inputs)).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))
out, hidden = lstm(inputs, hidden)
pprint(out)
pprint(hidden)

tensor([[[-0.1180,  0.0690, -0.4030]],

        [[-0.2783,  0.0456, -0.2278]],

        [[-0.3332,  0.0701, -0.2882]],

        [[-0.2436,  0.0887, -0.1496]],

        [[-0.0298,  0.0178, -0.0704]]], grad_fn=<CatBackward>)
(tensor([[[-0.0298,  0.0178, -0.0704]]], grad_fn=<ViewBackward>),
 tensor([[[-0.0399,  0.0882, -0.3806]]], grad_fn=<ViewBackward>))


# Example: An LSTM for Part-of-Speech Tagging

センテンスに含まれる単語に品詞タグを付ける

```
w: 単語 ∈ V (Vocab)
Input sentence: w1, w2, ..., wM
T: tag set
yi: wi のtag
yi^: wi のtag予測値
```

## Prepare data

In [37]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}

# 学習データから1センテンスずつ読込
for sent, tags in training_data:
    
    # センテンスから1単語ずつ読込
    for word in sent:
        
        # word_to_ix に単語が含まれていなかったら新しいidxを追加
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
            
pprint(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'Everybody': 5,
 'The': 0,
 'apple': 4,
 'ate': 2,
 'book': 8,
 'dog': 1,
 'read': 6,
 'that': 7,
 'the': 3}


## Create the model

In [51]:
class LSTMTagger(nn.Module):
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM は単語埋め込みを入力としてとり、隠れ状態とその次元を出力する
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        
        # Linear layer では隠れ状態スペースからタグスペースへの写像を行う
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        # 初期状態では隠れ状態を保持していない。
        # (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                     torch.zeros(1, 1, self.hidden_dim))
    
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)  # 埋め込みベクトル?

        '''
        print("--- embeds")
        pprint(embeds)
        print("--- embeds.view()")
        pprint(embeds.view(len(sentence), 1, -1))
        '''

        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

## Train the model

In [52]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [53]:
# 出力の要素 i, j  は word i に対する tag j のスコア
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print("=== タグスコア")
    print(tag_scores)

=== タグスコア
tensor([[-0.8692, -1.2669, -1.2073],
        [-0.9927, -1.2815, -1.0447],
        [-0.9718, -1.1951, -1.1428],
        [-0.9181, -1.2389, -1.1679],
        [-0.9034, -1.2687, -1.1596]])


In [54]:
for epoch in range(300):
    for sentence, tags in training_data:
        # Step 1. 勾配初期化
        model.zero_grad()

        # LSTM の隠れ状態初期化
        model.hidden = model.init_hidden()

        # Step 2. Input の準備。単語インデックスからなる torch.tensor 型。
        # 予め、単語をインデックスにマップするものを作成しておき、
        # さらにそれを使って、センテンスをテンソルに変換する。
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Optim
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

タグスコア見る

In [99]:
ix_to_tag = {v: k for k, v, in tag_to_ix.items()}

with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    
    pred_tags = torch.max(tag_scores, dim=1)[1].tolist()
    
    # The sentence is "the dog ate the apple".
    print("--- tag_scores")
    pprint(tag_scores)
    print("--- max tag_scores")
    print(torch.max(tag_scores, dim=1))
    print()
    print("True: ", training_data[0][0])
    print("Preds: ", [ix_to_tag[pred] for pred in pred_tags])

--- tag_scores
tensor([[-0.1917, -1.8349, -4.2134],
        [-3.8420, -0.0579, -3.3582],
        [-2.9986, -3.9851, -0.0709],
        [-0.0594, -3.2447, -3.9816],
        [-2.4667, -0.0944, -5.2584]])
--- max tag_scores
(tensor([-0.1917, -0.0579, -0.0709, -0.0594, -0.0944]), tensor([0, 1, 2, 0, 1]))

True:  ['The', 'dog', 'ate', 'the', 'apple']
Preds:  ['DET', 'NN', 'V', 'DET', 'NN']
