# 7章 RNNによる文章生成

## 7.1.2 文章生成の実装

In [2]:
import sys
sys.path.append('..')
import numpy as np
from common.functions import softmax
from ch06.rnnlm import Rnnlm
from ch06.better_rnnlm import BetterRnnlm

In [3]:
class RnnlmGen(Rnnlm):
    def generate(sefl, start_id, skip_ids=None, sample_size=100):
        """
        start_id: 最初に与える単語ID
        sample_size: サンプリングする単語の数
        skip_ids: 単語IDのリストをのうち、そこで指定された単語IDがサンプリングされないようにする
        """
        word_ids = [start_id]
        
        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            # predictによって、各単語のスコアを出力する(正規化される前の値)
            score = self.predict(x)
            # スコアをSoftmax関数て正規化
            p = softmax(score.flatten())
            
            sampled = np.random.choice(len(p), size=1, p=p)
            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(int(x))
        return word_ids

### Rnnlmクラスを用いた、文章生成

In [6]:
import sys
sys.path.append('..')
from rnnlm_gen import RnnlmGen
from dataset import ptb

corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
corpus_size = len(corpus)

model = RnnlmGen()
# model.load_params('../ch06/Rnnlm.pkl')

# start文字とskip文字の設定
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[w] for w in skip_words]

# 文章生成
word_ids = model.generate(start_id, skip_ids)
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')
print(txt)

you denied toshiba video suitor skiers seize webster rules prosecution evaluating farm heights acquired leaves allegations signal osha throughout heroes larger beyond torn wider torrijos abundant suites blocking deb allianz gained massage antitrust save directed declined cholesterol pieces simmons transit bronfman kageyama affiliate refined constraints intelligence dropped substantial contributed noxell investigating navy fights elite ports development hearts payments accounts fortunes hydro-quebec cholesterol nuclear averages baltimore both extending restoration cultural margins hailed assure integration murphy carpets budgets casting play retailer defined prosecutor businessland silicon scotland heller buses harm elsewhere gas matching venerable paintings folk violate along naturally cnbc presumed we subordinated


In [8]:
import sys
sys.path.append('..')
from rnnlm_gen import RnnlmGen
from dataset import ptb

corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
corpus_size = len(corpus)

model = RnnlmGen()
model.load_params('../ch06/Rnnlm.pkl')

# start文字とskip文字の設定
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[w] for w in skip_words]

# 文章生成
word_ids = model.generate(start_id, skip_ids)
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')
print(txt)

you oppose any money and more image are using mr. gates.
 it merely to make apartheid if it was at home price research founded security colleagues that both the wholesale decline in a year.
 a spokesman for the bargaining prominent cell sentenced softer exports between the first professional term shop is seeking finding broader news problems she is n't clear for the candy role of them or on subordinated construction spending for two-thirds and that the group of walter oil this information may become awarded for.
 gregory turner producer said that the stock market is n't clear


In [13]:
# !wget "https://www.oreilly.co.jp/pub/9784873118369/BetterRnnlm.pkl"

--2021-09-13 18:33:26--  https://www.oreilly.co.jp/pub/9784873118369/BetterRnnlm.pkl
Resolving www.oreilly.co.jp (www.oreilly.co.jp)... 18.182.240.173, 35.74.96.30
Connecting to www.oreilly.co.jp (www.oreilly.co.jp)|18.182.240.173|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39550902 (38M)
Saving to: ‘BetterRnnlm.pkl’


2021-09-13 18:33:49 (1.72 MB/s) - ‘BetterRnnlm.pkl’ saved [39550902/39550902]



In [14]:
# !mv BetterRnnlm.pkl "../ch06/"

## 7.2.4 足し算データセット 

In [15]:
import sys
sys.path.append('..')
from dataset import sequence

(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt', seed=1984)
char_to_id, id_to_char = sequence.get_vocab()

print(x_train.shape, t_train.shape)
print(x_test.shape, t_test.shape)

print(x_train[0])
print(t_train[0])

print(''.join([id_to_char[c] for c in x_train[0]]))
print(''.join([id_to_char[c] for c in t_train[0]]))

(45000, 7) (45000, 5)
(5000, 7) (5000, 5)
[ 3  0  2  0  0 11  5]
[ 6  0 11  7  5]
71+118 
_189 


## 7.3 seq2seqの実装

### 7.3.1 Encoderクラス

In [16]:
from common.time_layers import *

In [18]:
class Encoder(object):
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn
        
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(D, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        
        self.embed = TimeEmbedding(embed_w)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)
        
        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads
        self.hs = None
        
    def forward(self, xs):
        xs = self.embed.forward(xs)
        hs = self.lstm.forward(xs)
        self.hs = hs
        return hs[:, -1, :]
    
    def backward(self, dh):
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh
        
        dout = self.lstm.backward(dhs)
        dout = self.embed.backward(dout)
        return dout

### 7.3.2 Decoderクラス

In [19]:
class Decoder(object):
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn
        
        embed_W = (rn(V, D) / 100).atype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')
        
        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)
        
        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params = layer.params
            self.grads += layer.grads
            
    def forward(self, xs, h):
        self.lstm.set_state(h)
        
        out = self.embed.forward(xs)
        out = self.lstm.forward(out)
        score = self.affine.foward(out)
        return score
    
    def backward(self, dscore):
        dout = self.affine.backward(dscore)
        dout = self.lstm.backward(dout)
        dtou = self.embed.backward(dout)
        dh = self.lstm.dh
        return dh
    
    def generate(self, h, start_id, sample_size):
        sampled = []
        sample_id = start_id
        self.lstm.set_state(h)
        
        for _ in range(sample_size):
            x = np.array(sample_id).reshape((1, 1))
            out = self.embed.forward(x)
            out = self.lstm.forward(out)
            score = self.affine.forward(out)
            
            sample_id = np.argmax(score.flatten())
            sampled.append(int(sample_id))
            
        return sampled

## 7.3.3 seq2seqクラス 
- EncoderクラスとDecoderクラスを繋ぎ合わせ、Time Softmax with Lossレイヤを使って損失を計算する。

In [24]:
import sys
sys.path.append('..')
from common.time_layers import *
from common.base_model import BaseModel

class Seq2seq(BaseModel):
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        self.encoder = Encoder(V, D, H)
        self.decoder = Decoder(V, D, H)
        self.softmax = TimeSoftmaxWithLoss()
        
        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads
        
    def forward(self, xs, ts):
        decoder_xs, decoder_ts = ts[:, :-1], ts[:, 1:]
        
        h = self.encoder_forward(xs)
        score = self.decoder.forward(decoder_xs, h)
        loss = self.softmax.forward(score, decoder_ts)
        return loss
    
    def backward(sefl, dout=1):
        dout = self.softmax.backward(dout)
        dh = self.decoder.backward(dout)
        dout = slef.encoder.backward(dh)
        return dout
    
    def generate(self, xs, start_id, sample_size):
        h = self.encoder.forward(xs)
        sampled = self.encoder.generate(h, start_id, sample_size)
        return sampled

### 7.3.4 seq2seqの評価

In [27]:
import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
from dataset import sequence
from common.optimizer import Adam
from common.trainer import Trainer
from common.util import eval_seq2seq
from seq2seq import Seq2seq
from peeky_seq2seq import PeekySeq2seq

# データセットの読み込み
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
char_to_id, id_to_char = sequence.get_vocab()

# ハイパーパラメータの設定
vacab_size = len(char_to_id)
wordvec_size = 16
hidden_size = 128
batch_size = 128
max_epoch = 25
max_grad = 5.0

# モデル/オプティマイザ/トレーナーの生成
model = Seq2seq(vocab_size, wordvec_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

acc_list = []
for epoch in range(max_epoch):
    trainer.fit(x_train, t_train, max_epoch=1, batch_size=batch_size, max_grad=max_grad)
    
    correct_num = 0
    for i in range(len(x_test)):
        question, correct = x_test[[i]], t_test[[i]]
        verbose = i < 10
        correct_num += eval_seq2seq(model, question, correct, id_to_char, verbose)
    
    acc = float(correct_num) / len(x_test)
    acc_list.append(acc)
    print('val acc %.3f%%' % (acc * 100))

| epoch 1 |  iter 1 / 351 | time 1[s] | loss 9.21
| epoch 1 |  iter 21 / 351 | time 3[s] | loss 9.11
| epoch 1 |  iter 41 / 351 | time 5[s] | loss 5.67
| epoch 1 |  iter 61 / 351 | time 7[s] | loss 2.31
| epoch 1 |  iter 81 / 351 | time 9[s] | loss 2.14
| epoch 1 |  iter 101 / 351 | time 11[s] | loss 2.06
| epoch 1 |  iter 121 / 351 | time 14[s] | loss 2.02
| epoch 1 |  iter 141 / 351 | time 16[s] | loss 1.98
| epoch 1 |  iter 161 / 351 | time 18[s] | loss 1.95
| epoch 1 |  iter 181 / 351 | time 20[s] | loss 1.93
| epoch 1 |  iter 201 / 351 | time 22[s] | loss 1.91
| epoch 1 |  iter 221 / 351 | time 25[s] | loss 1.90
| epoch 1 |  iter 241 / 351 | time 28[s] | loss 1.89
| epoch 1 |  iter 261 / 351 | time 30[s] | loss 1.88
| epoch 1 |  iter 281 / 351 | time 33[s] | loss 1.88
| epoch 1 |  iter 301 / 351 | time 35[s] | loss 1.87
| epoch 1 |  iter 321 / 351 | time 38[s] | loss 1.86
| epoch 1 |  iter 341 / 351 | time 41[s] | loss 1.86
Q 77+85  
T 162 
[91m☒[0m 104 
---
Q 975+164
T 1139
[9

KeyboardInterrupt: 

## 7.4 seq2seqの改良

### 7.4.1 入力データの反転
`x_train, t_test = x_train[:, ::-1], t_test[:, ::-1]`
- 理論的なことはわかっていないが、勾配の伝播がスムーズになるため、学習の進みが改善すると考えられている。

### 7.4.2 覗き見(Peeky)

In [28]:
class PeekyDecoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(H + D, 4 * H) / np.sqrt(H + D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H + H, V) / np.sqrt(H + H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads
        self.cache = None

    def forward(self, xs, h):
        N, T = xs.shape
        N, H = h.shape

        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        hs = np.repeat(h, T, axis=0).reshape(N, T, H)
        out = np.concatenate((hs, out), axis=2)

        out = self.lstm.forward(out)
        out = np.concatenate((hs, out), axis=2)

        score = self.affine.forward(out)
        self.cache = H
        return score

    def backward(self, dscore):
        H = self.cache

        dout = self.affine.backward(dscore)
        dout, dhs0 = dout[:, :, H:], dout[:, :, :H]
        dout = self.lstm.backward(dout)
        dembed, dhs1 = dout[:, :, H:], dout[:, :, :H]
        self.embed.backward(dembed)

        dhs = dhs0 + dhs1
        dh = self.lstm.dh + np.sum(dhs, axis=1)
        return dh

    def generate(self, h, start_id, sample_size):
        sampled = []
        char_id = start_id
        self.lstm.set_state(h)

        H = h.shape[1]
        peeky_h = h.reshape(1, 1, H)
        for _ in range(sample_size):
            x = np.array([char_id]).reshape((1, 1))
            out = self.embed.forward(x)

            out = np.concatenate((peeky_h, out), axis=2)
            out = self.lstm.forward(out)
            out = np.concatenate((peeky_h, out), axis=2)
            score = self.affine.forward(out)

            char_id = np.argmax(score.flatten())
            sampled.append(char_id)

        return sampled


In [30]:
from seq2seq import Seq2seq, Encoder

class PeekySeq2seq(Seq2seq):
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        self.encoder = Encoder(V, D, H)
        self.decoder = PeekyDecoder(V, D, H)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads