In [1]:
#google driveとの連携
from google.colab import drive
drive.mount('/content/drive')

#カレントディレクトリの設定
dir_path = '/content/drive/My Drive/Colab Notebooks/repo9'
import os
os.chdir(dir_path)

#モジュール追加用pathの設定
import sys
sys.path.append('..')

#モジュールのインポート
import numpy as np

print(os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks/repo9


**RNNによる文章生成**\
確率分布から次の単語を確率的に選択する。

**文章生成の実装**

In [2]:
from common.functions import softmax
from ch06.rnnlm import Rnnlm
from ch06.better_rnnlm import BetterRnnlm

In [3]:
class RnnlmGen(Rnnlm):
  def generate(self, start_id, skip_ids=None, sample_size=100): # start_id;最初に与える単語ID, skip_ids;スキップする単語IDを指定。この単語はサンプリングしない
    word_ids = [start_id] #word_idsの初期化
    x = start_id # xの初期化
    while len(word_ids) < sample_size:
      x = np.array(x).reshape(1, 1) # numpy(1, 1)に形状変化
      score = self.predict(x) # スコアの計算
      p = softmax(score.flatten()) # スコアから確率分布の取得

      sampled = np.random.choice(len(p), size=1, p=p)
      if(skip_ids is None) or (sampled not in skip_ids):
        x = sampled
        word_ids.append(int(x))
    
    return word_ids

**文章生成実行**

In [4]:
from dataset import ptb

In [5]:
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
corpus_size = len(corpus)
model = RnnlmGen()

# start文字とskip文字の設定
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[i] for i in skip_words]

# 文章生成
word_ids = model.generate(start_id, skip_ids)
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', ' .\n')
print(txt)

you statistics encountered restaurant conceptual ruble spreading base exact trails antitrust clothes conservatives instantly municipal disagreed callers damaged verge tag outfit oversees method root merge profits uncommon vienna lipton fled phase hispanic anyone fresh pennies glasnost fournier treated advertisements investigate ogilvy introduced ian lacks kia resign tomorrow adjusters resume mergers wine spokesman rebuffed quebec solved amortization slid famous consistent flight cd rejection wheels winner fool specialists gradually tied topiary cigarette complicated obvious interesting tide outright steppenwolf gaubert projection comply violent contractor deng alexander gained scandals mips defeat pack rivals every healthy restrain mark subscription sold membership surveys lighter retrieve aer


確かに文章は生成できたが、重みの学習を行っていないので意味が通らない文章になっている。

学習済みの重みを用いた場合以下のようになる。

In [6]:
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
corpus_size = len(corpus)
model = RnnlmGen()
model.load_params('../ch06/Rnnlm.pkl')

# start文字とskip文字の設定
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[i] for i in skip_words]

# 文章生成
word_ids = model.generate(start_id, skip_ids)
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', ' .\n')
print(txt)

you promised door and the pool of gen-probe 's economic proceedings throughout maryland lift its image .
 but yes that could attract scenes of competition in apart for carbon concerns .
 foreigners is knocked out looking functions to the minds of the glare of people who want to show that opponent .
 rep. thomas frank leader could put his argument .
 mr. roman become president of the panel before the los angeles begins .
 the stuff might be to try to drain to netherlands representatives that federal express tried to use full wage effective and no win penalties and performing


先ほどより結びつきのよい個所が多い。例えば、"he think that", "it's a diificult"などの構文のような表現や、"the effect", "these europian"などの形容詞、冠詞などうまく機能している。 

**seq2seq**\
Encoder-Decoderモデルとも呼ばれる。時系列データを別の時系列データに変換する手法の一つである。\
**Encoder**\
RNNを用いて、時系列データを隠れ状態ベクトルhに変換する。任意の長さの時系列データを固定長のhに変換することになる。このhをDecoderに渡す。\
**Decoder**\
上の文章作成モデルと同じ。hを貰っている点が異なる。

**時系列データ変換用のトイ・プロブレム**\
"57+5"のような文字列を"62"という文字列に変換させることを考える。つまり足し算を行うプログラムを考える。\
今回は文字ごとに区切る。すなわちサンプルごとに時系列方向の大きさが異なる。これはパディングで解決。パディングは無効な文字で埋めることでデータの長さをそろえる手法である。

In [7]:
from dataset import sequence

In [8]:
(x_train, t_train), (x_test, t_test) = sequence.load_data('../dataset/addition.txt', seed=1984)
char_to_id, id_to_char = sequence.get_vocab()
print(x_train.shape, t_train.shape)
print(x_test.shape, t_test.shape)
print(x_train[0]) # 文字ID
print(t_train[0]) #文字ID

print(''.join([id_to_char[c] for c in x_train[0]]), ''.join([id_to_char[c] for c in t_train[0]]))

(45000, 7) (45000, 5)
(5000, 7) (5000, 5)
[ 3  0  2  0  0 11  5]
[ 6  0 11  7  5]
71+118  _189 


**seq2seqの実装**\
**Encoderクラスの実装**\
入力 -> Time Embeddingレイヤ -> TimeLSTMレイヤ -> h\
で考える。hは時系列方向に流し、レイヤ方向には出力しない。

In [9]:
from common.time_layers import TimeEmbedding
from common.time_layers import TimeLSTM
from common.time_layers import TimeAffine
from common.time_layers import TimeSoftmaxWithLoss

In [10]:
class Encoder:
  def __init__(self, vocab_size, wordvec_size, hidden_size):
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b = np.zeros(4 * H).astype('f')

    self.embed = TimeEmbedding(embed_W)
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

    self.params = self.embed.params + self.lstm.params
    self.grads = self.embed.grads + self.lstm.grads
    self.hs = None

  def forward(self, xs):
    xs = self.embed.forward(xs)
    hs = self.lstm.forward(xs)
    self.hs = hs
    return hs[:, -1, :]

  def backward(self, dh):
    dhs = np.zeros_like(self.hs)
    dh[:, -1, :] = dh

    dout = self.lstm.backward(dhs)
    dout = self.embed.backward(dout)
    return dout

**Decoderクラスの実装**\
学習時\
入力T個 -> Embedding -> LSTM -> Affine -> SoftmaxWithLoss -> T個の損失 ->和\
文章生成は決定的に行う。そのため文章生成時はSotmaxレイヤを省略できる。したがってSoftmaxレイヤを除いたAffineまでをDecoderクラスとする。

In [11]:
class Decoder:
  def __init__(self, vocab_size, wordvec_size, hidden_size):
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b = np.zeros(4 * H).astype('f')
    affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
    affine_b = np.zeros(V).astype('f')

    self.embed = TimeEmbedding(embed_W)
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)
    self.affine = TimeAffine(affine_W, affine_b)

    self.params, self.grads = [], []

    for layer in (self.embed, self.lstm, self.affine):
      self.params += layer.params
      self.grads += layer.grads

  def forward(self, xs, h): #学習
    self.lstm.set_state(h)
    out = self.embed.forward(xs)
    out = self.lstm.forward(out)
    score = self.affine.forward(out)
    return score

  def backward(self, dscore):
    dout = self.affine.backward(dout)
    dout = self.lstm.backward(dout)
    dout = self.embed.backward(dout)
    dh = self.lstm.dh
    return dh

  def generate(self, h, start_id, sample_size):
    sampled = []
    sample_id = start_id
    self.lstm.set_state(h)

    for _ in range(sample_size):
      x = np.array(sample_id).reshape((1, 1))
      out = self.embed.forward(x)
      out = self.lstm.forward(out)
      score = self.affine.forward(out)

      sample_id = np.argmax(score.flatten())
      sampled.append(int(sample_id))

    return sampled

**seq2seqクラスの実装**

In [12]:
class Seq2seq:
  def __init__(self, vocab_size, wordvec_size, hidden_size):
    V, D, H = vocab_size, wordvec_size, hidden_size
    self.encoder = Encoder(V, D, H)
    self.decoder = Decoder(V, D, H)
    self.softmax = TimeSoftmaxWithLoss()

    self.params = self.encoder.params + self.decoder.params
    self.grads = self.encoder.grads + self.decoder.grads
  
  def forward(self, xs, ts):
    decoder_xs, decoder_ts = ts[:, :-1], ts[:, 1:]

    h = self.encoder.forward(xs)
    score = self.decoder.forward(decoder_xs, h)
    loss = self.softmax.forward(score, decoder_ts)
    return loss

  def backward(self, dout=1):
    dout = self.softmax.backward(dout)
    dh = self.decoder.backward(dout)
    dout= self.encoder.backward(dh)
    return dout

  def genarate(self, xs, start_id, sample_size):
    h = self.encoder.forward(xs)
    sampled = self.decoder.genarate(h, start_id, sample_size)
    return sampled

**seq2seqの評価**

In [7]:
import matplotlib.pyplot as plt
from dataset import  sequence
from common.optimizer import Adam
from common.trainer import Trainer
from common.util import eval_seq2seq
from ch07.seq2seq import Seq2seq

In [8]:
# データセット読み込み
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
char_to_id, id_to_char = sequence.get_vocab()

# ハイパーパラメータの設定
vocab_size = len(char_to_id)
wordvec_size = 16
hidden_size = 128
batch_size = 128
max_epoch = 25
max_grad = 5.0

In [9]:
model = Seq2seq(vocab_size, wordvec_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

acc_list = []
for epoch in range(max_epoch):
  trainer.fit(x_train, t_train, max_epoch=1, batch_size=batch_size, max_grad=max_grad)

  correct_num = 0
  for i in range(len(x_test)):
    question, correct = x_test[[i]], t_test[[i]]
    verbose = i < 10
    correct_num += eval_seq2seq(model, question, correct, id_to_char, char_to_id)
    acc = float(correct_num)/ len(x_test)
    acc_list.append(acc)
    print('val acc %.3f%%' % ( acc * 100))

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
T 1507
[91m☒[0m 1504
---
val acc 8.780%
Q 34+364 
T 398 
[91m☒[0m 402 
---
val acc 8.780%
Q 635+653
T 1288
[91m☒[0m 1299
---
val acc 8.780%
Q 339+17 
T 356 
[91m☒[0m 353 
---
val acc 8.780%
Q 175+48 
T 223 
[91m☒[0m 222 
---
val acc 8.780%
Q 1+293  
T 294 
[91m☒[0m 291 
---
val acc 8.780%
Q 875+1  
T 876 
[91m☒[0m 877 
---
val acc 8.780%
Q 64+37  
T 101 
[91m☒[0m 102 
---
val acc 8.780%
Q 3+95   
T 98  
[92m☑[0m 98  
---
val acc 8.800%
Q 801+708
T 1509
[91m☒[0m 1504
---
val acc 8.800%
Q 782+19 
T 801 
[91m☒[0m 799 
---
val acc 8.800%
Q 102+36 
T 138 
[91m☒[0m 141 
---
val acc 8.800%
Q 6+63   
T 69  
[91m☒[0m 72  
---
val acc 8.800%
Q 1+850  
T 851 
[91m☒[0m 849 
---
val acc 8.800%
Q 92+939 
T 1031
[91m☒[0m 1034
---
val acc 8.800%
Q 218+34 
T 252 
[91m☒[0m 251 
---
val acc 8.800%
Q 324+877
T 1201
[91m☒[0m 1199
---
val acc 8.800%
Q 69+639 
T 708 
[91m☒[0m 712 
---
val acc 8.800%
Q 5+841  
T 846 
[91m☒[0m 

In [11]:
a = acc_list

たしかに学習が進んでいる。\
最終的には10%程度の精度になっている。max_epochを増やせばさらに精度が上がると考えられる。

**seq2seqの改良**
1. 入力データの反転
2. hをDecoderの複数のレイヤ(LSTM, Affine)に渡す\

実装は上のSeq2seqを少し変更するだけであるので省略する。

In [3]:
import matplotlib.pyplot as plt
from dataset import  sequence
from common.optimizer import Adam
from common.trainer import Trainer
from common.util import eval_seq2seq
from ch07.seq2seq import Seq2seq
from peeky_seq2seq import PeekySeq2seq

In [4]:
# データセット読み込み
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
char_to_id, id_to_char = sequence.get_vocab()

# reverse
x_train, x_test = x_train[:, ::-1], x_test[:, ::-1]

# ハイパーパラメータの設定
vocab_size = len(char_to_id)
wordvec_size = 16
hidden_size = 128
batch_size = 128
max_epoch = 25
max_grad = 5.0

In [5]:
model = PeekySeq2seq(vocab_size, wordvec_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

acc_list = []
for epoch in range(max_epoch):
  trainer.fit(x_train, t_train, max_epoch=1, batch_size=batch_size, max_grad=max_grad)

  correct_num = 0
  for i in range(len(x_test)):
    question, correct = x_test[[i]], t_test[[i]]
    verbose = i < 10
    correct_num += eval_seq2seq(model, question, correct, id_to_char, char_to_id)
    acc = float(correct_num)/ len(x_test)
    acc_list.append(acc)
    print('val acc %.3f%%' % ( acc * 100))

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
T 1507
[92m☑[0m 1507
---
val acc 79.180%
Q  463+43
T 398 
[92m☑[0m 398 
---
val acc 79.200%
Q 356+536
T 1288
[92m☑[0m 1288
---
val acc 79.220%
Q  71+933
T 356 
[92m☑[0m 356 
---
val acc 79.240%
Q  84+571
T 223 
[92m☑[0m 223 
---
val acc 79.260%
Q   392+1
T 294 
[92m☑[0m 294 
---
val acc 79.280%
Q   1+578
T 876 
[92m☑[0m 876 
---
val acc 79.300%
Q   73+46
T 101 
[92m☑[0m 101 
---
val acc 79.320%
Q    59+3
T 98  
[92m☑[0m 98  
---
val acc 79.340%
Q 807+108
T 1509
[91m☒[0m 1519
---
val acc 79.340%
Q  91+287
T 801 
[92m☑[0m 801 
---
val acc 79.360%
Q  63+201
T 138 
[92m☑[0m 138 
---
val acc 79.380%
Q    36+6
T 69  
[92m☑[0m 69  
---
val acc 79.400%
Q   058+1
T 851 
[92m☑[0m 851 
---
val acc 79.420%
Q  939+29
T 1031
[92m☑[0m 1031
---
val acc 79.440%
Q  43+812
T 252 
[92m☑[0m 252 
---
val acc 79.460%
Q 778+423
T 1201
[92m☑[0m 1201
---
val acc 79.480%
Q  936+96
T 708 
[92m☑[0m 708 
---
val acc 79.500%
Q   148+5


改良前に比べて、精度が大きく上昇している。およそ99パーセントの精度であった。