# GRUレイヤの実装

In [1]:
import numpy as np
from common.functions import sigmoid

## [演習]
* 以下のGRUレイヤのクラスを完成させましょう

In [2]:
class GRU:
    """Gated recurrent unit (GRU) layer for a single time step.

    The three gate blocks are packed side by side along the last axis of
    every parameter, in the order: update gate (z), reset gate (r),
    candidate hidden state (h_hat).
    """

    def __init__(self, Wx, Wh, b):
        """Keep the parameters and allocate matching zero-filled gradients.

        Wx: weights applied to the input x (three gate blocks joined)
        Wh: weights applied to the hidden state h (three gate blocks joined)
        b: biases (three gate blocks joined)
        """
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(param) for param in (Wx, Wh, b)]
        # Intermediate values saved by forward() for use in backward().
        self.cache = None

    def forward(self, x, h_prev):
        """Run the forward pass and return the next hidden state.

        x: input for this step
        h_prev: previous hidden state, 2-D with H columns
        """
        Wx, Wh, b = self.params
        _, H = h_prev.shape

        # Column ranges of the three gate blocks inside the packed parameters.
        z_cols = slice(None, H)
        r_cols = slice(H, 2 * H)
        h_cols = slice(2 * H, None)

        z = sigmoid(np.dot(x, Wx[:, z_cols]) + np.dot(h_prev, Wh[:, z_cols]) + b[z_cols])
        r = sigmoid(np.dot(x, Wx[:, r_cols]) + np.dot(h_prev, Wh[:, r_cols]) + b[r_cols])
        # The reset gate scales h_prev before it enters the candidate state.
        h_hat = np.tanh(
            np.dot(x, Wx[:, h_cols]) + np.dot(r * h_prev, Wh[:, h_cols]) + b[h_cols]
        )
        # z weights the old state, (1 - z) weights the candidate.
        h_next = z * h_prev + (1 - z) * h_hat

        self.cache = (x, h_prev, z, r, h_hat)

        return h_next

    def backward(self, dh_next):
        """Run the backward pass using the values cached by forward().

        dh_next: gradient with respect to the hidden state returned by forward()

        Writes the parameter gradients into the existing self.grads arrays
        and returns (dx, dh_prev).
        """
        Wx, Wh, _ = self.params
        x, h_prev, z, r, h_hat = self.cache

        H = Wh.shape[0]
        z_cols = slice(None, H)
        r_cols = slice(H, 2 * H)
        h_cols = slice(2 * H, None)

        # Output blend: h_next = z * h_prev + (1 - z) * h_hat
        dh_hat = dh_next * (1 - z)
        dh_prev = dh_next * z

        # Candidate state: back through tanh, then through its two matmuls.
        da_h = dh_hat * (1 - h_hat ** 2)
        d_rh = np.dot(da_h, Wh[:, h_cols].T)  # gradient w.r.t. (r * h_prev)
        dx = np.dot(da_h, Wx[:, h_cols].T)
        dh_prev += r * d_rh

        # Update gate (z): back through the sigmoid.
        dz = dh_next * h_prev - dh_next * h_hat
        da_z = dz * z * (1 - z)
        dh_prev += np.dot(da_z, Wh[:, z_cols].T)
        dx += np.dot(da_z, Wx[:, z_cols].T)

        # Reset gate (r): back through the sigmoid.
        dr = d_rh * h_prev
        da_r = dr * r * (1 - r)
        dh_prev += np.dot(da_r, Wh[:, r_cols].T)
        dx += np.dot(da_r, Wx[:, r_cols].T)

        # Re-pack the per-gate gradients in the same z, r, h_hat column order.
        gate_grads = (da_z, da_r, da_h)
        dWx = np.hstack([np.dot(x.T, g) for g in gate_grads])
        dWh = np.hstack([
            np.dot(h_prev.T, da_z),
            np.dot(h_prev.T, da_r),
            np.dot((r * h_prev).T, da_h),
        ])
        db = np.hstack(gate_grads).sum(axis=0)  # sum over the batch axis

        # Assign in place so the gradient arrays keep their memory location.
        for buffer, grad in zip(self.grads, (dWx, dWh, db)):
            buffer[:] = grad

        return dx, dh_prev

In [3]:
# Layer sizes: D is the input dimensionality, H the number of hidden units.
D, H = 10, 5

# Random weights scaled by the square root of their fan-in; biases start at zero.
# Each parameter packs the three gate blocks, hence the 3 * H columns.
Wx = np.random.randn(D, 3 * H) / np.sqrt(D)
Wh = np.random.randn(H, 3 * H) / np.sqrt(H)
b = np.zeros(3 * H)

# Build the layer.
gru = GRU(Wx, Wh, b)

# Forward pass on a random batch of N samples.
N = 4
x = np.random.randn(N, D)
h_prev = np.random.randn(N, H)
h_next = gru.forward(x, h_prev)
print("h_next=", h_next, end="\n\n")

# Backward pass with a random upstream gradient.
dh_next = np.random.randn(N, H)
dx, dh_prev = gru.backward(dh_next)
print("dx=", dx, end="\n\n")
print("dh_prev=", dh_prev, end="\n\n")


h_next= [[ 1.37174917  1.35953644 -0.0719135  -0.79057516 -0.03648648]
 [ 0.63100663 -0.6182674   0.42369713 -0.00713864 -0.29954729]
 [ 0.47953534  0.79990723  0.33964142 -0.04317748 -0.49139853]
 [ 0.48499539 -0.54363183 -2.06522002  0.07705205  0.53260063]]

dx= [[-0.1190209   0.42159885 -0.33385821 -0.22949841 -0.098765    0.52653064
  -0.32496882 -0.12149423  0.17733786  0.14108265]
 [-0.27880517 -0.47281174  0.13254005  1.03252321 -0.29335144  0.0836461
   0.13374576 -0.40778533 -0.46568994 -0.0423481 ]
 [-0.4443641   0.15290937 -0.14044379  0.47186149  0.44190563 -0.31598489
   0.33219282 -0.32836017  0.08441649 -0.03965066]
 [ 0.15716995 -0.1560326  -0.02928383 -0.13411953 -0.15916505  0.06550601
  -0.0266207  -0.01608586 -0.0609929   0.00836882]]

dh_prev= [[-0.06312103 -0.05760479 -0.92460568  0.92484726 -0.22880171]
 [-0.15366518  0.5443629   0.62947564 -0.68542953  0.0274309 ]
 [-0.27930113  0.89770641  0.9093154   0.43719066  0.51267648]
 [-0.19253278 -0.25784882 -0.080897