## pytorchの演算の扱い

In [50]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x1002bcd50>

In [51]:
#行を連結
# By default, it concatenates along the first axis (concatenates rows)
x_1 = torch.randn(2, 5)
y_1 = torch.randn(3, 5)
z_1 = torch.cat([x_1, y_1])
print(z_1)

#列を連結
# Concatenate columns:
x_2 = torch.randn(2, 3)
y_2 = torch.randn(2, 5)
# second arg specifies which axis to concat along
z_2 = torch.cat([x_2, y_2], 1)
print(z_2)


-2.9718  1.7070 -0.4305 -2.2820  0.5237
 0.0004 -1.2039  3.5283  0.4434  0.5848
 0.8407  0.5510  0.3863  0.9124 -0.8410
 1.2282 -1.8661  1.4146 -1.8781 -0.4674
-0.7576  0.4215 -0.4827 -1.1198  0.3056
[torch.FloatTensor of size 5x5]


 1.0386  0.5206 -0.5006 -1.9441 -0.9596  0.5489 -0.9901 -0.3826
 1.2182  0.2117 -1.0613  1.5037  1.8267  0.5561  1.6445  0.4973
[torch.FloatTensor of size 2x8]



In [52]:
#.view：reshape関数
x = torch.randn(2, 3, 4)
print(x)

#2*12にreshape
print(x.view(2, 12))  # Reshape to 2 rows, 12 columns

#上記と同じ。最終層を一つ減らす。サイズは推論される
# Same as above.  If one of the dimensions is -1, its size can be inferred
print(x.view(2, -1))


(0 ,.,.) = 
 -1.5067  1.7661 -0.3569 -0.1713
  0.4068 -0.4284 -1.1299  1.4274
 -1.4027  1.4825 -1.1559  1.6190

(1 ,.,.) = 
  0.9581  0.7747  0.1940  0.1687
  0.3061  1.0743 -1.0327  1.0930
  0.7769 -1.3128  0.7099  0.9944
[torch.FloatTensor of size 2x3x4]



Columns 0 to 9 
-1.5067  1.7661 -0.3569 -0.1713  0.4068 -0.4284 -1.1299  1.4274 -1.4027  1.4825
 0.9581  0.7747  0.1940  0.1687  0.3061  1.0743 -1.0327  1.0930  0.7769 -1.3128

Columns 10 to 11 
-1.1559  1.6190
 0.7099  0.9944
[torch.FloatTensor of size 2x12]



Columns 0 to 9 
-1.5067  1.7661 -0.3569 -0.1713  0.4068 -0.4284 -1.1299  1.4274 -1.4027  1.4825
 0.9581  0.7747  0.1940  0.1687  0.3061  1.0743 -1.0327  1.0930  0.7769 -1.3128

Columns 10 to 11 
-1.1559  1.6190
 0.7099  0.9944
[torch.FloatTensor of size 2x12]



## NLP導入

In [53]:
#作成したデータ。文章を単語ごとに分けて格納する
data = [("me gusta comer en la cafeteria".split(), "SPANISH"),
        ("Give it to me".split(), "ENGLISH"),
        ("No creo que sea una buena idea".split(), "SPANISH"),
        ("No it is not a good idea to get lost at sea".split(), "ENGLISH")]

test_data = [("Yo creo que si".split(), "SPANISH"),
             ("it is lost on me".split(), "ENGLISH")]

In [54]:
#出てくる単語にidを割り振る
# word_to_ix maps each word in the vocab to a unique integer, which will be its
# index into the Bag of words vector
word_to_ix = {}
for sent, _ in data + test_data:
    for word in sent:
        if word not in word_to_ix:
            #長さは1ずつ伸びるのでこれをidとして利用
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)

VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 2

print
print (('Num of vocab size: ') + str(VOCAB_SIZE))

{'en': 3, 'No': 9, 'buena': 14, 'it': 7, 'at': 22, 'sea': 12, 'cafeteria': 5, 'Yo': 23, 'la': 4, 'to': 8, 'creo': 10, 'is': 16, 'a': 18, 'good': 19, 'get': 20, 'idea': 15, 'que': 11, 'not': 17, 'me': 0, 'on': 25, 'gusta': 1, 'lost': 21, 'Give': 6, 'una': 13, 'si': 24, 'comer': 2}

Num of vocab size: 26


### Bag of Wordsでのロジスティック回帰をpytorchで実装する

$$ output = Log\_Softmax(Logistic(input)) $$

$$ Logistic(x) = \frac{1}{1+e^{-(ax+b)}} $$ <br>
$$ Log\_Softmax(x_i) = log(\frac{e^x_i}{\sum_j(e^{x_j})}) $$ 

In [61]:
#BoW識別器の実装
class BoWClassifier(nn.Module):  # inheriting from nn.Module!

    def __init__(self, num_labels, vocab_size):
        #必ず継承の必要がある
        super(BoWClassifier, self).__init__()
        
        #input: vocabraryの種類の数
        #output: labelの数
        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        return F.log_softmax(self.linear(bow_vec))

#文章をBoW表現のvectorに変換
def make_bow_vector(sentence, word_to_ix):
    #vocabraryの種類の長さのvector
    vec = torch.zeros(len(word_to_ix))
    
    #文章中の単語をそれぞれのvectorに+1ずつ行う
    for word in sentence:
        vec[word_to_ix[word]] += 1
    return vec.view(1, -1)

#labelをTensor型に変換
def make_target(label, label_to_ix):
    return torch.LongTensor([label_to_ix[label]])

#モデル定義
model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

In [62]:
#モデルのパラメータ数の確認
#単語の種類が26, English, Spanishでラベルが2種類
for param in model.parameters():
    print(param)

Parameter containing:

Columns 0 to 9 
-0.1560  0.0131 -0.0337  0.1765  0.0763 -0.0027 -0.0337  0.0159 -0.1765  0.1041
 0.1206 -0.0480 -0.0401  0.0151 -0.1313  0.0597  0.1677 -0.0544 -0.0597  0.0279

Columns 10 to 19 
 0.0141 -0.1783  0.0642 -0.1412  0.0058  0.1147  0.1744 -0.1844  0.0339  0.1503
 0.0984  0.0541  0.0886 -0.1466  0.1503  0.0746  0.0485  0.0580  0.0984 -0.0573

Columns 20 to 25 
 0.1582  0.0160 -0.1422 -0.0204 -0.1415  0.1538
-0.0593  0.1032 -0.0902 -0.0563  0.1553  0.0992
[torch.FloatTensor of size 2x26]

Parameter containing:
-0.0282
 0.1496
[torch.FloatTensor of size 2]



In [63]:
#データ・入力ベクトル・対数確率の確認
sample = data[0]
print ("データ")
print (sample)

bow_vector = make_bow_vector(sample[0], word_to_ix)
print ("入力用ベクトル")
print (bow_vector)

log_probs = model(Variable(bow_vector))
print ("スペイン語, 英語の対数確率")
print(log_probs)

データ
(['me', 'gusta', 'comer', 'en', 'la', 'cafeteria'], 'SPANISH')
入力用ベクトル


Columns 0 to 12 
    1     1     1     1     1     1     0     0     0     0     0     0     0

Columns 13 to 25 
    0     0     0     0     0     0     0     0     0     0     0     0     0
[torch.FloatTensor of size 1x26]

スペイン語, 英語の対数確率
Variable containing:
-0.7341 -0.6538
[torch.FloatTensor of size 1x2]



In [64]:
label_to_ix = {"SPANISH": 0, "ENGLISH": 1}

# forwardのみ
for instance, label in test_data:
    bow_vec = Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    print(log_probs)


# Print the matrix column corresponding to "creo"
print("What is the parameter of the layer of spanish word 'creo'")
print(next(model.parameters())[:, word_to_ix["creo"]])

#Loss関数の計算
loss_function = nn.NLLLoss()

#optimizerの設定
optimizer = optim.SGD(model.parameters(), lr=0.1)

#普通は5~30エポック程度でOK
#今度はback propagation有り
for epoch in range(100):
    for instance, label in data:
        
        # 前の勾配情報をリセットする
        model.zero_grad()

        #Pytorchのモデルに入れられるように変換
        bow_vec = Variable(make_bow_vector(instance, word_to_ix))
        target = Variable(make_target(label, label_to_ix))

        # forward step
        log_probs = model(bow_vec)

        #Lossの計算とbackpropagation
        #最適化
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

for instance, label in test_data:
    bow_vec = Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    print (label)
    print(("probability: ") + str(log_probs))

# Index corresponding to Spanish goes up, English goes down!
print("Check the spanish word 'creo' goes up")
print(next(model.parameters())[:, word_to_ix["creo"]])

Variable containing:
-1.1405 -0.3852
[torch.FloatTensor of size 1x2]

Variable containing:
-0.8490 -0.5583
[torch.FloatTensor of size 1x2]

What is the parameter of the layer of spanish word 'creo'
Variable containing:
1.00000e-02 *
  1.4080
  9.8377
[torch.FloatTensor of size 2]

SPANISH
probability: Variable containing:
-0.2095 -1.6658
[torch.FloatTensor of size 1x2]

ENGLISH
probability: Variable containing:
-2.6878 -0.0705
[torch.FloatTensor of size 1x2]

Check the spanish word 'creo' goes up
Variable containing:
 0.4846
-0.3721
[torch.FloatTensor of size 2]



たしかに文章1はスペイン語の確率大・文章2は英語の確率大になるよう学習されている<br>
"creo"というワードもスペイン語の可能性が高いようにレイヤー学習が進んでいる

## Word Embeddings

Bag of Words形式は単語にそれぞれidを振るが、
<ul>
    <li>数多くなったら計算大変
    <li>単語の関係性が全くわからない(意味の近さ・遠さなど)
</ul>
といった問題点がある

### Word Embeddings in Pytorch

In [65]:
#各単語をインデックス化
word_to_ix = {"hello": 0, "world": 1}

#合計二つの単語を5次元ベクトルに変換
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings

#Tensorに型変換
lookup_tensor = torch.LongTensor([word_to_ix["hello"]])
print(lookup_tensor)

#"hello"をベクトル化(ここでは最適化されていない)
hello_embed = embeds(Variable(lookup_tensor))
print(hello_embed)


 0
[torch.LongTensor of size 1]

Variable containing:
-0.2694  0.1495 -0.0336 -0.6076 -1.0048
[torch.FloatTensor of size 1x5]



###  N-Gram Language Modeling

N-Gramを実装する。N-gramは単語$wi$前のN単語から次の単語を推測する<br>
$P(w_i|w_{i−1},w_{i−2},…,w_{i−n+1})$

In [66]:
#前の前後関係：前2単語のみ見る
CONTEXT_SIZE = 2

#埋め込んだ次元
EMBEDDING_DIM = 10

# 文章サンプル
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

In [99]:
# tri-gramの例
# we should tokenize the input, but we will ignore that for now
# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]

#中身を確認
#when fortyときたら次はwintersを予測する
# print the first 3, just so you can see what they look like
print(trigrams[:3])

#登場する単語にインデックスを割り振る
vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]


In [100]:
context, target = trigrams[0]
context_idxs = [word_to_ix[w] for w in context]

# 変換元の単語
print context

# 単語のid
print context_idxs

# Tensor変換
context_var = Variable(torch.LongTensor(context_idxs))

# 単語id -> vector変換
# 2単語 × vector変換 ： (embedding層　× 2) のサイズ
embeddings = nn.Embedding(len(vocab), EMBEDDING_DIM)

print "vocabrary size: " + str(len(vocab))
print "vocabrary size: " + str(EMBEDDING_DIM)
print embeddings(context_var)

['When', 'forty']
[41, 25]
vocabrary size: 97
vocabrary size: 10
Variable containing:
-0.8429 -0.2405 -0.5897  1.4151  0.3399  0.0112  1.1586 -0.2975 -1.7608  1.7539
-0.1815 -0.6644  0.4104  0.8742 -0.1276  0.6828 -1.9091  1.4970 -1.2883 -0.0708
[torch.FloatTensor of size 2x10]



In [101]:
#N-Gramモデルの生成
class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        
        # 合計の単語の種類を潜在次元に落とし込む
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out)
        return log_probs

In [102]:
#ロスの確認
losses = []

#ロス関数の定義
#The negative log likelihood loss
loss_function = nn.NLLLoss()

#モデル定義
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

#optimizerの定義
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [103]:
print model.parameters

<bound method NGramLanguageModeler.parameters of NGramLanguageModeler (
  (embeddings): Embedding(97, 10)
  (linear1): Linear (20 -> 128)
  (linear2): Linear (128 -> 97)
)>


In [104]:
#トレーニング
for epoch in range(10):
    total_loss = torch.Tensor([0])
    
    # 各単語の予測
    # 前のN単語を見て次の単語を予測する
    for context, target in trigrams:
        
        # 文章を読み込める形式に変える
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in variables)
        context_idxs = [word_to_ix[w] for w in context]
        context_var = Variable(torch.LongTensor(context_idxs))

        # 勾配初期化
        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # モデル
        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_var)

        #ロス関数の計算
        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a variable)
        loss = loss_function(log_probs, Variable(
            torch.LongTensor([word_to_ix[target]])))

        #誤差逆伝播法
        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        total_loss += loss.data
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

[
 519.7821
[torch.FloatTensor of size 1]
, 
 517.4017
[torch.FloatTensor of size 1]
, 
 515.0360
[torch.FloatTensor of size 1]
, 
 512.6841
[torch.FloatTensor of size 1]
, 
 510.3459
[torch.FloatTensor of size 1]
, 
 508.0204
[torch.FloatTensor of size 1]
, 
 505.7052
[torch.FloatTensor of size 1]
, 
 503.4016
[torch.FloatTensor of size 1]
, 
 501.1090
[torch.FloatTensor of size 1]
, 
 498.8246
[torch.FloatTensor of size 1]
]


## Computing Word Embeddings: Continuous Bag-of-Words

CBoW： 前後の数ワードから中の単語を予測する<br>
embeddingsの初期化に用いられる

単語$w_i$に対して、前後N個のキーワード$w_i−1,…,w_{i−N}$、$w_{i+1},…,w_{i+N}$が与えられた時に、<br>
$q_w$をembeddingしたwordとすれば、以下の確率を最小化するものを考える

$−log⁡p(w_i|C)=−log⁡Softmax(A(\sum_{w∈C}q_w)+b)$

Implement this model in Pytorch by filling in the class below. Some tips:

Think about which parameters you need to define.<br>
Make sure you know what shape each operation expects. Use .view() if you need to reshape.

In [114]:
# 前後2単語ずつを推測に用いる
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right

#埋め込んだ次元
EMBEDDING_DIM = 10

# 原文
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# インデックス化
word_to_ix = {word: i for i, word in enumerate(raw_text)}

# 前後2単語ずつと予測単語の組み合わせを作成する
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


In [141]:
#CBOWのクラス定義
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(2 * context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out)
        return log_probs

# create your model and train.  here are some functions to help you make
# the data ready for use by your module


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return Variable(tensor)


make_context_vector(data[0][0], word_to_ix)  # example

Variable containing:
  0
 13
 47
  4
[torch.LongTensor of size 4]

In [148]:
#ロスの確認
losses = []

#ロス関数の定義
#The negative log likelihood loss
loss_function = nn.NLLLoss()

#モデル定義
model = CBOW(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

#optimizerの定義
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [149]:
model.parameters

<bound method CBOW.parameters of CBOW (
  (embeddings): Embedding(97, 10)
  (linear1): Linear (40 -> 128)
  (linear2): Linear (128 -> 97)
)>

In [150]:
#トレーニング
for epoch in range(10):
    total_loss = torch.Tensor([0])
    
    # 各単語の予測
    # 前のN単語を見て次の単語を予測する
    for context, target in data:
        
        # 文章を読み込める形式に変える
        context_idxs = [word_to_ix[w] for w in context]
        context_var = Variable(torch.LongTensor(context_idxs))

        # 勾配初期化
        model.zero_grad()

        # モデル
        log_probs = model(context_var)

        #ロス関数の計算
        loss = loss_function(log_probs, Variable(
            torch.LongTensor([word_to_ix[target]])))

        #誤差逆伝播法
        loss.backward()
        optimizer.step()

        total_loss += loss.data
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

[
 266.7071
[torch.FloatTensor of size 1]
, 
 264.9733
[torch.FloatTensor of size 1]
, 
 263.2492
[torch.FloatTensor of size 1]
, 
 261.5334
[torch.FloatTensor of size 1]
, 
 259.8259
[torch.FloatTensor of size 1]
, 
 258.1265
[torch.FloatTensor of size 1]
, 
 256.4332
[torch.FloatTensor of size 1]
, 
 254.7435
[torch.FloatTensor of size 1]
, 
 253.0574
[torch.FloatTensor of size 1]
, 
 251.3734
[torch.FloatTensor of size 1]
]


## Sequence Models and Long-Short Term Memory Networks

### LSTMの挙動をチェック

In [152]:
#LSTM層の定義
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [Variable(torch.randn((1, 3)))
    for _ in range(5)]  # make a sequence of length 5
inputs

[Variable containing:
 -1.5859 -1.4814  0.4191
 [torch.FloatTensor of size 1x3], Variable containing:
 -0.9734  0.4680  1.6193
 [torch.FloatTensor of size 1x3], Variable containing:
 -0.8317  1.1417  0.2224
 [torch.FloatTensor of size 1x3], Variable containing:
  1.3163  1.7850  1.3064
 [torch.FloatTensor of size 1x3], Variable containing:
  0.3383 -0.6922  0.9433
 [torch.FloatTensor of size 1x3]]

In [154]:
#隠れ層の初期化
# initialize the hidden state.
hidden = (Variable(torch.randn(1, 1, 3)),
          Variable(torch.randn((1, 1, 3))))
hidden

(Variable containing:
 (0 ,.,.) = 
   0.7190  0.9216 -1.0710
 [torch.FloatTensor of size 1x1x3], Variable containing:
 (0 ,.,.) = 
  -0.2065  1.0174 -0.3371
 [torch.FloatTensor of size 1x1x3])

In [160]:
#1x3を3次元にするために1x1x3(バッチサイズ1に等しい)に変換
print(inputs[0])
print(inputs[0].view(1, 1, -1))
print 

Variable containing:
-1.5859 -1.4814  0.4191
[torch.FloatTensor of size 1x3]

Variable containing:
(0 ,.,.) = 
 -1.5859 -1.4814  0.4191
[torch.FloatTensor of size 1x1x3]




In [159]:
#inputs内の各値に対してLSTMを通す
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)
print(out)
print(hidden)

Variable containing:
-1.5859 -1.4814  0.4191
[torch.FloatTensor of size 1x3]

Variable containing:
(0 ,.,.) = 
 -1.5859 -1.4814  0.4191
[torch.FloatTensor of size 1x1x3]


Variable containing:
(0 ,.,.) = 
  0.1221  0.2631  0.1254
[torch.FloatTensor of size 1x1x3]

(Variable containing:
(0 ,.,.) = 
  0.1221  0.2631  0.1254
[torch.FloatTensor of size 1x1x3]
, Variable containing:
(0 ,.,.) = 
  0.1675  0.5136  0.3686
[torch.FloatTensor of size 1x1x3]
)


In [163]:
#for文を使わないための変換
#元のinputs
print(inputs)

#1x3が5個あるやつをくっつける
print(torch.cat(inputs))

#5x1x3に変換
print(torch.cat(inputs).view(len(inputs), 1, -1))

[Variable containing:
-1.5859 -1.4814  0.4191
[torch.FloatTensor of size 1x3]
, Variable containing:
-0.9734  0.4680  1.6193
[torch.FloatTensor of size 1x3]
, Variable containing:
-0.8317  1.1417  0.2224
[torch.FloatTensor of size 1x3]
, Variable containing:
 1.3163  1.7850  1.3064
[torch.FloatTensor of size 1x3]
, Variable containing:
 0.3383 -0.6922  0.9433
[torch.FloatTensor of size 1x3]
]
Variable containing:
-1.5859 -1.4814  0.4191
-0.9734  0.4680  1.6193
-0.8317  1.1417  0.2224
 1.3163  1.7850  1.3064
 0.3383 -0.6922  0.9433
[torch.FloatTensor of size 5x3]

Variable containing:
(0 ,.,.) = 
 -1.5859 -1.4814  0.4191

(1 ,.,.) = 
 -0.9734  0.4680  1.6193

(2 ,.,.) = 
 -0.8317  1.1417  0.2224

(3 ,.,.) = 
  1.3163  1.7850  1.3064

(4 ,.,.) = 
  0.3383 -0.6922  0.9433
[torch.FloatTensor of size 5x1x3]



In [41]:
#for文なしでまとめて実行
# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below, they are the same)
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropogate,
# by passing it as an argument  to the lstm at a later time
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (Variable(torch.randn(1, 1, 3)), Variable(
    torch.randn((1, 1, 3))))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

Variable containing:
(0 ,.,.) = 
 -0.5690  0.2984 -0.2307

(1 ,.,.) = 
 -0.4488  0.2394  0.0703

(2 ,.,.) = 
 -0.3698  0.0887  0.3511

(3 ,.,.) = 
 -0.4498  0.3913  0.2789

(4 ,.,.) = 
 -0.4714  0.1113  0.4174
[torch.FloatTensor of size 5x1x3]

(Variable containing:
(0 ,.,.) = 
 -0.4714  0.1113  0.4174
[torch.FloatTensor of size 1x1x3]
, Variable containing:
(0 ,.,.) = 
 -0.6453  0.1624  0.5910
[torch.FloatTensor of size 1x1x3]
)


### Example: An LSTM for Part-of-Speech Tagging

In [166]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return Variable(tensor)

#トレーニングデータ
#単語と品詞の対応
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

#単語のインデックス
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)

#品詞のインデックス
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

#通常は32か64次元
# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'Everybody': 5, 'ate': 2, 'apple': 4, 'that': 7, 'read': 6, 'dog': 1, 'book': 8, 'the': 3, 'The': 0}


In [167]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        
        # 隠れ層の次元
        self.hidden_dim = hidden_dim
        
        # embedding層
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # LSTMの定義
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # 潜在空間
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        
        #LSTMの隠れ層の定義
        self.hidden = self.init_hidden()
    
    #隠れ層の定義
    def init_hidden(self):
        # 隠れ層は(num_layers, minibatch_size, hidden_dim)のように定義する必要がある
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (Variable(torch.zeros(1, 1, self.hidden_dim)),
                Variable(torch.zeros(1, 1, self.hidden_dim)))
    
    #順伝播
    def forward(self, sentence):
        
        #embedding
        embeds = self.word_embeddings(sentence)
        
        #LSTM
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        
        #Linear
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        
        #Log_Softmax
        tag_scores = F.log_softmax(tag_space)
        return tag_scores

In [168]:
#モデル定義諸々
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [169]:
# LSTMに通してみる
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print(tag_scores)

Variable containing:
-0.9500 -1.1105 -1.2593
-0.9220 -1.1113 -1.2978
-1.0782 -1.0665 -1.1534
-0.9738 -1.0909 -1.2502
-1.0779 -1.0665 -1.1537
[torch.FloatTensor of size 5x3]



In [171]:
#トレーニング
for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    
    #文章ごとの処理
    for sentence, tags in training_data:
        
        #勾配初期化
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        #隠れ層(LSTMの記憶部分)の状態を初期化
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        #文章変換
        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

In [203]:
# See what the scores are after training
inputs = prepare_sequence(training_data[0][0], word_to_ix)
#スコアチェック
tag_scores = model(inputs)
# The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
#  for word i. The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is index of the maximum value of row 1,
# 1 is the index of maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!
print(tag_scores)

Variable containing:
-0.1564 -1.9627 -5.4380
-4.4316 -0.0133 -6.6546
-5.4240 -4.4229 -0.0165
-0.0224 -4.0915 -5.2090
-5.2533 -0.0105 -5.2639
[torch.FloatTensor of size 5x3]



In [204]:
#予測値についての確認
print "raw: "+ str(training_data[0][0])
print "target: "+ str(training_data[0][1])
print "predict: " + str([tag_to_ix.keys()[i] for i in torch.max(tag_scores,1)[1].data.numpy().T[0]])

raw: ['The', 'dog', 'ate', 'the', 'apple']
target: ['DET', 'NN', 'V', 'DET', 'NN']
predict: ['DET', 'NN', 'V', 'DET', 'NN']


## Exercise: Augmenting the LSTM part-of-speech tagger with character-level features

LSTMの品詞予測に文字レベルの特徴量を追加する

In the example above, each word had an embedding, which served as the inputs to our sequence model. Let’s augment the word embeddings with a representation derived from the characters of the word. We expect that this should help significantly, since character-level information like affixes have a large bearing on part-of-speech. For example, words with the affix -ly are almost always tagged as adverbs in English.

Do do this, let cwcw be the character-level representation of word ww. Let xwxw be the word embedding as before. Then the input to our sequence model is the concatenation of xwxw and cwcw. So if xwxw has dimension 5, and cwcw dimension 3, then our LSTM should accept an input of dimension 8.

To get the character level representation, do an LSTM over the characters of a word, and let cwcw be the final hidden state of this LSTM. Hints:

There are going to be two LSTM’s in your new model. The original one that outputs POS tag scores, and the new one that outputs a character-level representation of each word.
To do a sequence model over characters, you will have to embed characters. The character embeddings will be the input to the character LSTM.

In the example above, each word had an embedding, which served as the inputs to our sequence model. Let’s augment the word embeddings with a representation derived from the characters of the word. We expect that this should help significantly, since character-level information like affixes have a large bearing on part-of-speech. For example, words with the affix -ly are almost always tagged as adverbs in English.

## Advanced: Making Dynamic Decisions and the Bi-LSTM CRF

### Dynamic versus Static Deep Learning Toolkits

KerasやTheano：Static<br>
Pytorch：Dynamic<br>
なので、毎ループの度にグラフが計算される

この違いを考えるために、木構造のグラフを考える

<ul>
<li>ボトムアップの木を作成する
<li>根に単語や文のノードをつける
<li>NNやEmbeddingを用いて、この木構造を定める
</ul>

このような場合、KerasのようなStaticなネットワークは難しい<br>
またTensorFlowなどと比較するとより通常のpythonに近い(class定義でモデルを指定するなど)

### Bi-LSTM Conditional Random Field Discussion

固有名詞の抽出

xを入力の単語のシーケンス、yを単語についての品詞のシーケンスだとすると、これらの事象の確率は以下のように表されるm

\begin{align}P(y|x) = \frac{\exp{(\text{Score}(x, y)})}{\sum_{y'} \exp{(\text{Score}(x, y')})}\end{align}

$\log \psi_i(x,y)$をLogポテンシャルとすると、上記のスコアは以下のように表される

\begin{align}\text{Score}(x,y) = \sum_i \log \psi_i(x,y)\end{align}

Bi-LSTM CRFにおいて、二つのポテンシャル(emissionとtransition)を定義する

index iの単語のBiLSTMのTimestep i における隠れ状態とする


In the Bi-LSTM CRF, we define two kinds of potentials: emission and
transition. The emission potential for the word at index $i$ comes
from the hidden state of the Bi-LSTM at timestep $i$. The
transition scores are stored in a $|T|x|T|$ matrix
$\textbf{P}$, where $T$ is the tag set. In my
implementation, $\textbf{P}_{j,k}$ is the score of transitioning
to tag $j$ from tag $k$. So:

\begin{align}\text{Score}(x,y) = \sum_i \log \psi_\text{EMIT}(y_i \rightarrow x_i) + \log \psi_\text{TRANS}(y_{i-1} \rightarrow y_i)\end{align}

\begin{align}= \sum_i h_i[y_i] + \textbf{P}_{y_i, y_{i-1}}\end{align}


### Implementation Notes

In [208]:
#ヘルパー関数
def to_scalar(var):
    # returns a python float
    return var.view(-1).data.tolist()[0]


def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return to_scalar(idx)


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return Variable(tensor)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [209]:
#モデル定義
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        
        # LSTMをtagに変換するレイヤー
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        
        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        self.hidden = self.init_hidden()
    
    #隠れ層の初期化
    def init_hidden(self):
        return (Variable(torch.randn(2, 1, self.hidden_dim)),
                Variable(torch.randn(2, 1, self.hidden_dim)))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = Variable(init_alphas)

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward variables at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = Variable(torch.Tensor([0]))
        tags = torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = Variable(init_vvars)
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id])
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        self.hidden = self.init_hidden()
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        self.hidden = self.init_hidden()
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [210]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

# Make up some training data
training_data = [(
    "the wall street journal reported today that apple corporation made money".split(),
    "B I I I O O O B I O O".split()
), (
    "georgia tech is a university in georgia".split(),
    "B I O O O O B".split()
)]

word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# Check predictions before training
precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
precheck_tags = torch.LongTensor([tag_to_ix[t] for t in training_data[0][1]])
print(model(precheck_sent))

# Make sure prepare_sequence from earlier in the LSTM section is loaded
for epoch in range(
        300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Variables of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.LongTensor([tag_to_ix[t] for t in tags])

        # Step 3. Run our forward pass.
        neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        neg_log_likelihood.backward()
        optimizer.step()

# Check predictions after training
precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
print(model(precheck_sent))
# We got it!

(Variable containing:
 15.9463
[torch.FloatTensor of size 1]
, [2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2])
(Variable containing:
 39.0223
[torch.FloatTensor of size 1]
, [0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2])
