In [1]:
# Mount Google Drive into the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Make the notebook's folder on Drive the current working directory
dir_path = '/content/drive/My Drive/Colab Notebooks/repo10/'
import os
os.chdir(dir_path)

# Add the parent directory ('..') to the module search path
# (presumably where the common/, ch07/, ch08/ and dataset/ packages imported later live -- TODO confirm)
import sys
sys.path.append('..')

# Import modules
import numpy as np

# Show the working directory to confirm the chdir above took effect
print(os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks/repo10


**Attention**\
seq2seqの問題点は、入力文の長さにかかわらず固定長のベクトルに変換してしまうこと。そこで、今まではEncoderの最後の隠れ状態だけを利用していたが、各時刻の隠れ状態ベクトルをすべて利用する。

In [6]:
class WeightSum:
  """Weighted sum of a sequence of vectors.

  Given ``hs`` of shape (N, T, H) and weights ``a`` reshapeable to (N, T, 1),
  ``forward`` returns the (N, H) sum over the T axis of ``hs`` scaled by ``a``.
  The layer has no trainable parameters.
  """

  def __init__(self):
    self.params = []
    self.grads = []
    self.cache = None  # (hs, expanded weights) saved by forward for backward

  def forward(self, hs, a):
    """Return the weighted sum of ``hs`` along axis 1, using weights ``a``."""
    batch, steps, hidden = hs.shape

    # Expand the weights to the full (N, T, H) shape so they scale every feature.
    weights = np.repeat(a.reshape(batch, steps, 1), hidden, axis=2)
    context = (hs * weights).sum(axis=1)

    self.cache = (hs, weights)
    return context

  def backward(self, dc):
    """Propagate ``dc`` (N, H) back to ``hs`` and ``a``; returns ``(dhs, da)``."""
    hs, weights = self.cache
    batch, steps, hidden = hs.shape

    # Backward of the sum over axis 1: copy the gradient to every time step.
    dt = np.repeat(dc.reshape(batch, 1, hidden), steps, axis=1)
    dhs = dt * weights
    # Backward of the repeat over axis 2: sum the gradient over the feature axis.
    da = (dt * hs).sum(axis=2)

    return dhs, da


In [7]:
from common.np import *
from common.layers import Softmax

In [8]:
class AttentionWeight:
  """Attention weights from a sequence ``hs`` and a single query vector ``h``.

  For ``hs`` of shape (N, T, H) and ``h`` reshapeable to (N, 1, H), the score
  of each time step is the dot product of ``h`` with that step's vector; the
  (N, T) scores are then passed through the ``Softmax`` layer.
  The layer has no trainable parameters.
  """

  def __init__(self):
    self.params, self.grads = [], []
    self.softmax = Softmax()
    self.cache = None  # (hs, expanded h) saved by forward for backward

  def forward(self, hs, h):
    """Return the softmax-normalised dot-product scores, shape (N, T)."""
    N, T, H = hs.shape

    # Expand h to (N, T, H) so it can be multiplied against every time step.
    hr = h.reshape(N, 1, H).repeat(T, axis=1)
    t = hs * hr
    s = np.sum(t, axis=2)  # dot product over the feature axis
    a = self.softmax.forward(s)

    self.cache = (hs, hr)
    return a

  # This method was previously nested inside forward() after its return
  # statement, so it was never defined on the class.
  def backward(self, da):
    """Propagate ``da`` (N, T) back to ``hs`` and ``h``; returns ``(dhs, dh)``."""
    hs, hr = self.cache
    N, T, H = hs.shape

    ds = self.softmax.backward(da)
    # Backward of the sum over axis 2: copy the gradient to every feature.
    dt = ds.reshape(N, T, 1).repeat(H, axis=2)
    dhs = dt * hr
    dhr = dt * hs
    # Backward of the repeat over axis 1: sum the gradient over time steps.
    dh = np.sum(dhr, axis=1)

    return dhs, dh

In [9]:
class Attention:
  """Single-step attention: an ``AttentionWeight`` layer followed by ``WeightSum``.

  ``forward`` first turns ``(hs, h)`` into weights, then uses those weights to
  combine ``hs``. The most recent weights are kept in ``attention_weight``.
  The layer has no trainable parameters of its own.
  """

  def __init__(self):
    self.params = []
    self.grads = []
    self.attention_weight_layer = AttentionWeight()
    self.weight_sum_layer = WeightSum()
    self.attention_weight = None  # weights produced by the latest forward call

  def forward(self, hs, h):
    """Return the weight-sum layer's output for ``hs`` attended by ``h``."""
    weights = self.attention_weight_layer.forward(hs, h)
    context = self.weight_sum_layer.forward(hs, weights)
    self.attention_weight = weights
    return context

  def backward(self, dout):
    """Run both sub-layers backward in reverse order; returns ``(dhs, dh)``."""
    dhs_from_sum, dweights = self.weight_sum_layer.backward(dout)
    dhs_from_weight, dh = self.attention_weight_layer.backward(dweights)
    # hs feeds both sub-layers in forward, so its two gradients are added.
    return dhs_from_sum + dhs_from_weight, dh

In [10]:
class TumeAttention:
  """Applies a fresh ``Attention`` layer to every time step of ``hs_dec``.

  NOTE(review): the class name looks like a typo of ``TimeAttention``; it is
  kept as-is so existing references keep working -- confirm before renaming.
  """

  def __init__(self):
    self.params = []
    self.grads = []
    self.layers = None             # Attention layers created by forward
    self.attention_weights = None  # per-step weights collected by forward

  def forward(self, hs_enc, hs_dec):
    """Attend to ``hs_enc`` once per step of ``hs_dec`` (N, T, H).

    Returns an array shaped like ``hs_dec`` holding each step's output.
    """
    _, steps, _ = hs_dec.shape
    out = np.empty_like(hs_dec)
    self.layers, self.attention_weights = [], []

    for step in range(steps):  # one Attention layer per time step
      attn = Attention()
      out[:, step, :] = attn.forward(hs_enc, hs_dec[:, step, :])
      self.layers.append(attn)
      self.attention_weights.append(attn.attention_weight)

    return out

  def backward(self, dout):
    """Run each step's layer backward; returns ``(dhs_enc, dhs_dec)``.

    ``dhs_enc`` accumulates the first gradient from every step, while
    ``dhs_dec`` stores each step's second gradient in its own time slot.
    """
    _, steps, _ = dout.shape
    dhs_dec = np.empty_like(dout)
    dhs_enc = 0

    for step in range(steps):
      attn = self.layers[step]
      dhs, dh = attn.backward(dout[:, step, :])
      dhs_dec[:, step, :] = dh
      dhs_enc += dhs

    return dhs_enc, dhs_dec

In [12]:
from common.time_layers import *
from ch07.seq2seq import  Encoder, Seq2seq
from ch08.attention_layer import TimeAttention

In [13]:
class AttentionEncoder(Encoder):
  """Encoder variant whose ``forward`` returns the LSTM output unchanged.

  ``self.embed`` and ``self.lstm`` are expected to be set up by the parent
  ``Encoder`` constructor (defined in ch07.seq2seq -- not visible here).
  """

  def forward(self, xs):
    """Embed ``xs``, run the LSTM, and return its full output ``hs``.

    Presumably ``hs`` holds the hidden state of every time step, which is
    what the attention layers consume -- TODO confirm against ``self.lstm``.
    """
    # Call the embedding layer; assigning the bound method itself (without
    # calling it) would hand a function object to the LSTM.
    xs = self.embed.forward(xs)
    hs = self.lstm.forward(xs)
    return hs

  def backward(self, dhs):
    """Propagate ``dhs`` back through the LSTM and then the embedding layer."""
    dout = self.lstm.backward(dhs)
    dout = self.embed.backward(dout)
    return dout

In [15]:
class AttentionDecoder:
  """Decoder built from embedding, LSTM, attention and affine time layers.

  Only the constructor is shown in this notebook; the remaining methods are
  omitted by the author.
  """

  def __init__(self, vocab_size, wordvec_size, hidden_size):
    """Create the four layers and gather their params/grads into flat lists.

    The affine weight has ``2 * hidden_size`` input rows, twice the LSTM's
    hidden size.
    """
    def scaled_randn(rows, cols, divisor):
      # Standard-normal matrix divided by `divisor`, stored as float32.
      return (np.random.randn(rows, cols) / divisor).astype('f')

    # Random draws happen in this order: embed_W, lstm_Wx, lstm_Wh, affine_W.
    embed_W = scaled_randn(vocab_size, wordvec_size, 100)
    lstm_Wx = scaled_randn(wordvec_size, 4 * hidden_size, np.sqrt(wordvec_size))
    lstm_Wh = scaled_randn(hidden_size, 4 * hidden_size, np.sqrt(hidden_size))
    lstm_b = np.zeros(4 * hidden_size, dtype='f')
    affine_W = scaled_randn(2 * hidden_size, vocab_size, np.sqrt(2 * hidden_size))
    affine_b = np.zeros(vocab_size, dtype='f')

    self.embed = TimeEmbedding(embed_W)
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
    self.attention = TimeAttention()
    self.affine = TimeAffine(affine_W, affine_b)

    # Flatten every sub-layer's parameters and gradients, in layer order.
    self.params = []
    self.grads = []
    for sublayer in (self.embed, self.lstm, self.attention, self.affine):
      self.params.extend(sublayer.params)
      self.grads.extend(sublayer.grads)

  # (remaining methods omitted)



**Attentionつきseq2seqの学習**

In [4]:
import numpy as np
from dataset import sequence
from common.optimizer import  Adam
from common.trainer import Trainer
from common.util import eval_seq2seq
from ch08.attention_seq2seq import AttentionSeq2seq
from ch07.seq2seq import Seq2seq
from peeky_seq2seq import PeekySeq2seq

In [11]:
# Load the train/test split from 'date.txt' and the char <-> id vocab mappings
(x_train, t_train), (x_test, t_test) = sequence.load_data('date.txt')
char_to_id, id_to_char = sequence.get_vocab()

# Reverse every input row along axis 1 (targets are left untouched)
x_train, x_test = x_train[:, ::-1], x_test[:, ::-1]

# Hyperparameters
vocab_size = len(char_to_id)
wordvec_size = 16
hidden_size = 256
batch_size = 128
max_epoch = 10
max_grad = 5.0

model = AttentionSeq2seq(vocab_size=vocab_size, wordvec_size=wordvec_size, hidden_size=hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# Accuracy on the test set after each epoch
acc_list = []
for epoch in range(max_epoch):
  # Train one epoch at a time so the model can be evaluated between epochs
  trainer.fit(x_train, t_train, max_epoch=1, batch_size=batch_size, max_grad=max_grad)

  correct_num = 0
  for i in range(len(x_test)):
    # Index with [[i]] to keep a leading batch axis of length 1
    question, correct = x_test[[i]], t_test[[i]]
    # verbose is set for the first 10 samples only
    # (presumably makes eval_seq2seq print them -- TODO confirm)
    verbose = i < 10
    correct_num += eval_seq2seq(model, question, correct, id_to_char, verbose, is_reverse=True)
  
  acc = float(correct_num) / len(x_test)
  acc_list.append(acc)
  print('val acc %.3f%%' % (acc * 100))

# Save the model parameters after the last epoch
model.save_params()

| epoch 1 |  iter 1 / 351 | time 0[s] | loss 4.08
| epoch 1 |  iter 21 / 351 | time 16[s] | loss 3.09
| epoch 1 |  iter 41 / 351 | time 32[s] | loss 1.90
| epoch 1 |  iter 61 / 351 | time 47[s] | loss 1.72
| epoch 1 |  iter 81 / 351 | time 62[s] | loss 1.46
| epoch 1 |  iter 101 / 351 | time 79[s] | loss 1.19
| epoch 1 |  iter 121 / 351 | time 95[s] | loss 1.14
| epoch 1 |  iter 141 / 351 | time 110[s] | loss 1.09
| epoch 1 |  iter 161 / 351 | time 126[s] | loss 1.06
| epoch 1 |  iter 181 / 351 | time 142[s] | loss 1.04
| epoch 1 |  iter 201 / 351 | time 158[s] | loss 1.03
| epoch 1 |  iter 221 / 351 | time 174[s] | loss 1.02
| epoch 1 |  iter 241 / 351 | time 189[s] | loss 1.02
| epoch 1 |  iter 261 / 351 | time 204[s] | loss 1.01
| epoch 1 |  iter 281 / 351 | time 219[s] | loss 1.00
| epoch 1 |  iter 301 / 351 | time 234[s] | loss 1.00
| epoch 1 |  iter 321 / 351 | time 249[s] | loss 1.00
| epoch 1 |  iter 341 / 351 | time 263[s] | loss 1.00
Q 10/15/94                     
T 1994-10-

実行できた。結果を見ると、epoch1からepoch2にかけて大きく精度が上がっていて、epoch3では99%近くまでになっている。また、epoch6で精度が落ちてしまっている。