# POS tagger
NLTKのデータセットを使ってLSTMでPOS taggerを試してみる

In [59]:
import nltk
from nltk.corpus import treebank as treebank
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from tqdm import tqdm

## corpusの読み込み、学習・テストデータを作成

In [32]:
# Seed every RNG this notebook relies on. torch is seeded for anything drawn
# from torch's RNG; NumPy drives the corpus shuffle below. Without the NumPy
# seed the train/test split differs on every kernel restart.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# The corpus is the NLTK treebank sample (about 100k words).
corpus = list(treebank.tagged_sents())
np.random.shuffle(corpus)  # in-place shuffle of the sentence list

# 80% of the sentences for training, the remainder for testing.
split_idx = int(len(corpus) * 0.8)
train_data = corpus[:split_idx]
test_data = corpus[split_idx:]

In [33]:
# Distinct POS tags that occur anywhere in the corpus.
tag = {pos for _, pos in treebank.tagged_words()}
print('文章の数: ', len(corpus))
print('tagの種類: ', len(tag))

文章の数:  3914
tagの種類:  46


## word, tagのインデックスを作る

In [39]:
# Give every distinct word and every distinct tag an integer id, numbered
# in order of first appearance in the corpus.
word_to_ix = {}
tag_to_ix = {}
for word, tag in treebank.tagged_words():
    # setdefault only inserts when the key is new, so the id equals the
    # number of entries seen before it.
    word_to_ix.setdefault(word, len(word_to_ix))
    tag_to_ix.setdefault(tag, len(tag_to_ix))

print('単語の種類: ', len(word_to_ix))
print('tagの種類: ', len(tag_to_ix))

単語の種類:  12408
tagの種類:  46


In [54]:
def prepare_sequence(seq, to_ix):
    """Translate a sequence of items into a tensor of their integer ids.

    Args:
        seq: iterable of keys that are all present in ``to_ix``.
        to_ix: mapping from item to integer id.

    Returns:
        A 1-D ``torch.long`` tensor holding one id per element of ``seq``,
        in the same order.

    Raises:
        KeyError: if an element of ``seq`` is not a key of ``to_ix``.
    """
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return torch.tensor(indices, dtype=torch.long)

## LSTMネットワークの構築

In [42]:
# Width of the LSTM hidden state passed to LSTMTagger.
HIDDEN_DIM = 10
# Width of each word-embedding vector passed to LSTMTagger.
EMBEDDING_DIM = 10

In [41]:
class LSTMTagger(nn.Module):
    """LSTM tagger: embedding -> single-layer LSTM -> linear -> log-softmax.

    One sentence is processed per call (batch size 1). The recurrent state is
    stored on ``self.hidden`` and is overwritten by every forward pass;
    callers reset it with ``model.hidden = model.init_hidden()`` before
    feeding a new sentence.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Build the layers and the initial recurrent state.

        Args:
            embedding_dim: size of each word-embedding vector.
            hidden_dim: size of the LSTM hidden state.
            vocab_size: number of rows in the embedding table.
            tagset_size: number of output classes.
        """
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        # Must come after the layers exist: init_hidden reads a parameter.
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh zero ``(h_0, c_0)`` pair, each of shape (1, 1, hidden_dim).

        The tensors take the dtype and device of the model's parameters, so
        the state is usable after the model has been moved with ``.to(device)``.
        """
        ref = self.hidden2tag.weight
        return (ref.new_zeros(1, 1, self.hidden_dim),
                ref.new_zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        """Score every tag for every position of one sentence.

        Args:
            sentence: 1-D tensor of word ids.

        Returns:
            Tensor of shape (len(sentence), tagset_size) holding
            log-probabilities over tags for each position.
        """
        seq_len = len(sentence)
        embeds = self.word_embeddings(sentence)
        # self.hidden is a plain attribute, not a parameter or buffer, so
        # Module.to() never moves it. Align it with the input's device here;
        # this is a no-op when the devices already match.
        hidden = tuple(h.to(embeds.device) for h in self.hidden)
        # nn.LSTM expects (seq_len, batch, features); batch is fixed to 1.
        lstm_out, self.hidden = self.lstm(embeds.view(seq_len, 1, -1), hidden)
        tag_space = self.hidden2tag(lstm_out.view(seq_len, -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

## ネットワークのインスタンス作成、CPU/GPU切り替え

In [45]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Deliberately NOT wrapped in nn.DataParallel. The wrapper does not expose
# the tagger's init_hidden()/hidden (they live on `.module`), and it scatters
# the input along dim 0, which for this model is the sequence axis of a
# single sentence rather than a batch axis. The model is small, so one
# device is enough.
model = model.to(device)

In [None]:
## 損失関数、最適化アルゴリズム設定

In [None]:
# Negative log-likelihood loss; the tagger's forward pass returns
# log-probabilities (log_softmax), which is the input NLLLoss expects.
loss_function = nn.NLLLoss()
# Plain stochastic gradient descent over all model parameters.
optimizer = optim.SGD(params=model.parameters(), lr=0.1)

## 学習

In [None]:
NUM_EPOCHS = 300

# init_hidden()/hidden live on the tagger itself. If `model` was wrapped in
# nn.DataParallel the wrapper does not forward those attributes, so work on
# the underlying module.
tagger = model.module if isinstance(model, nn.DataParallel) else model

# Converting words/tags to index tensors and moving them to the device does
# not depend on the epoch, so do it once instead of once per sentence per epoch.
train_tensors = []
for data in train_data:
    sentence = [d[0] for d in data]
    tags = [d[1] for d in data]
    train_tensors.append((
        prepare_sequence(sentence, word_to_ix).to(device),
        prepare_sequence(tags, tag_to_ix).to(device),
    ))

for epoch in tqdm(range(NUM_EPOCHS)):
    for sentence_in, targets in train_tensors:
        # Reset gradients and the recurrent state for the new sentence.
        # The state is moved to `device` explicitly so it matches the
        # model's parameters when training on a GPU.
        tagger.zero_grad()
        tagger.hidden = tuple(h.to(device) for h in tagger.init_hidden())

        # Tag scores for every word of the sentence.
        tag_scores = tagger(sentence_in)

        # Compute the loss, backpropagate and take one optimizer step.
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

 14%|█▍        | 43/300 [10:09<1:00:41, 14.17s/it]

# NER
NLTKのデータセットを使ってLSTMでNERを試してみる