In [1]:
import numpy as np

In [2]:
# Toy sizes: T rows (time steps), each holding an H-element hidden state.
T = 5
H = 4
hs = np.random.standard_normal((T, H))  # random (T, H) hidden states
a = np.asarray([0.8, 0.1, 0.03, 0.05, 0.02])  # one weight per row of hs

In [3]:
# Turn the 5 weights into a (5, 1) column, then copy that column 4 times along
# axis 1 so the result has the same (5, 4) shape as hs.
ar = a.reshape(5, 1).repeat(4, axis=1)
print(ar.shape)

(5, 4)


In [4]:
# Element-wise product: every row of hs is scaled by its repeated weight.
t = np.multiply(hs, ar)

In [5]:
# Display the shape of the weighted states as the cell output.
np.shape(t)

(5, 4)

In [6]:
# Collapse axis 0 (the rows) to obtain the weighted sum of the hidden states.
c = t.sum(axis=0)
print(c.shape)

(4,)


In [None]:
"""
设时序数据的长度 T=5，隐藏状态向量的元素个数 H=4，这里给出了加权和的计算过程。我们先关注代码 ar = a.reshape(5, 1).repeat(4, axis=1)。
"""

In [7]:
# Batched version: hs is (N, T, H) and there is one weight per (n, t) pair.
N = 10
T = 5
H = 4
hs = np.random.standard_normal((N, T, H))
a = np.random.standard_normal((N, T))
# Add a trailing axis, then copy each weight H times along it -> (N, T, H).
ar = np.repeat(a.reshape(N, T, 1), H, axis=2)
# ar = a.reshape(N, T, 1)  # alternative: rely on broadcasting

In [8]:
# Scale every hidden state by its weight (element-wise product).
t = np.multiply(hs, ar)
print(t.shape)
# (10, 5, 4)

(10, 5, 4)


In [9]:
# Sum over axis 1 (time): one H-element vector is left per sample.
c = t.sum(axis=1)
print(c.shape)
# (10, 4)

(10, 4)


In [10]:
class WeightSum:
    """Weighted sum of hidden states along the time axis.

    ``forward`` computes ``c[n, h] = sum_t a[n, t] * hs[n, t, h]`` and
    ``backward`` returns the gradients with respect to ``hs`` and ``a``.
    The layer has no trainable parameters.
    """

    def __init__(self):
        # Empty lists: there is nothing to learn in this layer.
        self.params, self.grads = [], []
        # Filled by forward() with (hs, a reshaped to (N, T, 1)).
        self.cache = None

    def forward(self, hs, a):
        """Return the weighted sum of ``hs`` over axis 1.

        Args:
            hs: array of shape (N, T, H).
            a: weights holding N*T elements; reshaped to (N, T, 1) here.

        Returns:
            Array of shape (N, H).
        """
        N, T, H = hs.shape

        # Broadcasting stretches the trailing length-1 axis across H, so the
        # weights never have to be copied H times with repeat(). The values
        # are identical to the explicit-repeat formulation.
        # Note: reshape may return a view, so the cache can share memory with
        # the caller's ``a`` (``hs`` is cached by reference as well).
        ar = a.reshape(N, T, 1)
        c = np.sum(hs * ar, axis=1)

        self.cache = (hs, ar)
        return c

    def backward(self, dc):
        """Back-propagate through the weighted sum.

        Args:
            dc: upstream gradient holding N*H elements (shape (N, H)).

        Returns:
            Tuple ``(dhs, da)`` with shapes (N, T, H) and (N, T).
        """
        hs, ar = self.cache
        N, T, H = hs.shape
        # Backward of sum over T: the same gradient reaches every time step,
        # which broadcasting over the length-1 time axis expresses directly.
        dt = dc.reshape(N, 1, H)
        dhs = dt * ar                 # (N, 1, H) * (N, T, 1) -> (N, T, H)
        # Backward of spreading each weight over H: sum over the H axis.
        da = np.sum(dt * hs, axis=2)  # (N, T)
        return dhs, da

## 8.1.4 解码器的改进②

In [11]:
import sys

sys.path.append('..')
from common.layers import Softmax
import numpy as np

In [12]:
# hs: (N, T, H) hidden states; h: one H-element vector per sample.
N = 10
T = 5
H = 4
hs = np.random.standard_normal((N, T, H))
h = np.random.standard_normal((N, H))
# Insert a time axis and copy h across all T steps so it lines up with hs.
hr = np.repeat(h.reshape(N, 1, H), T, axis=1)

In [13]:
# Element-wise product of every hidden state with the repeated vector h.
t = np.multiply(hs, hr)
print(t.shape)

(10, 5, 4)


In [14]:
# Summing over the hidden axis turns the products into one score per (n, t).
s = t.sum(axis=2)
print(s.shape)

(10, 5)


In [15]:
# Pass the scores through the project's Softmax layer (common.layers).
# NOTE(review): presumably normalises each row so a sample's T weights sum
# to 1 — confirm against common.layers.Softmax.
softmax = Softmax()
a = softmax.forward(s)
print(a.shape)

(10, 5)


In [16]:
from common.np import *  # import numpy as np
from common.layers import Softmax

In [17]:
class AttentionWeight:
    """Attention weights from dot-product scores.

    ``forward`` scores every hidden state ``hs[n, t]`` against ``h[n]`` with a
    dot product over the hidden axis and passes the (N, T) scores through the
    ``Softmax`` layer; ``backward`` returns the gradients with respect to
    ``hs`` and ``h``. The layer has no trainable parameters of its own.
    """

    def __init__(self):
        self.params, self.grads = [], []
        self.softmax = Softmax()
        # Filled by forward() with (hs, h reshaped to (N, 1, H)).
        self.cache = None

    def forward(self, hs, h):
        """Return the attention weights for ``h`` over ``hs``.

        Args:
            hs: array of shape (N, T, H).
            h: array holding N*H elements; reshaped to (N, 1, H) here.

        Returns:
            Whatever ``self.softmax.forward`` returns for the (N, T) scores.
        """
        N, T, H = hs.shape
        # Broadcasting stretches the length-1 time axis across T, so h is not
        # copied T times with repeat(); the resulting values are identical.
        hr = h.reshape(N, 1, H)
        s = np.sum(hs * hr, axis=2)  # dot product over the hidden axis
        a = self.softmax.forward(s)
        self.cache = (hs, hr)
        return a

    def backward(self, da):
        """Back-propagate through softmax and the dot-product scores.

        Args:
            da: upstream gradient with respect to the attention weights.

        Returns:
            Tuple ``(dhs, dh)`` with shapes (N, T, H) and (N, H).
        """
        hs, hr = self.cache
        N, T, H = hs.shape
        ds = self.softmax.backward(da)
        # Backward of the sum over H: each score gradient reaches all H terms.
        dt = ds.reshape(N, T, 1)
        dhs = dt * hr                 # (N, T, 1) * (N, 1, H) -> (N, T, H)
        # Backward of spreading h over T: sum over the time axis.
        dh = np.sum(dt * hs, axis=1)  # (N, H)
        return dhs, dh

In [None]:
"""
Attention Weight 层关注编码器输出的各个单词向量 hs，并计算各个单词的权重 a；然后，Weight Sum 层计算 a 和 hs 的加权和，并输出上下文向量 c。我们将进行这一系列计算的层称为 Attention 层。
"""

In [18]:
class Attention:
    """Attention layer: an AttentionWeight layer followed by a WeightSum layer.

    ``forward`` first obtains the weights for ``hs`` given ``h`` and then
    feeds ``hs`` and those weights to the weighted-sum layer. The weights of
    the latest forward pass are kept in ``attention_weight``.
    """

    def __init__(self):
        self.params = []
        self.grads = []
        self.attention_weight_layer = AttentionWeight()
        self.weight_sum_layer = WeightSum()
        self.attention_weight = None  # set by forward()

    def forward(self, hs, h):
        """Return the weight-sum layer's output for ``hs`` and ``h``."""
        weights = self.attention_weight_layer.forward(hs, h)
        context = self.weight_sum_layer.forward(hs, weights)
        self.attention_weight = weights
        return context

    def backward(self, dout):
        """Return ``(dhs, dh)``; runs the two sub-layers in reverse order."""
        grad_hs_sum, grad_weights = self.weight_sum_layer.backward(dout)
        grad_hs_weight, dh = self.attention_weight_layer.backward(grad_weights)
        # hs feeds both sub-layers, so its two gradient contributions add up.
        return grad_hs_sum + grad_hs_weight, dh

In [19]:
# A TimeAttention layer is just a stack of Attention layers, one per time step.
class TimeAttention:
    """Run a separate Attention layer for every time step of ``hs_dec``.

    ``forward`` creates the per-step layers and stores them, together with
    each layer's attention weights, for use by ``backward`` and by callers.
    """

    def __init__(self):
        self.params = []
        self.grads = []
        self.layers = None             # Attention layers created by forward()
        self.attention_weights = None  # per-step weights saved by forward()

    def forward(self, hs_enc, hs_dec):
        """Return an array shaped like ``hs_dec`` with one output per step."""
        _, steps, _ = hs_dec.shape
        out = np.empty_like(hs_dec)
        self.layers = []
        self.attention_weights = []
        for step in range(steps):
            attention = Attention()
            out[:, step, :] = attention.forward(hs_enc, hs_dec[:, step, :])
            self.layers.append(attention)
            self.attention_weights.append(attention.attention_weight)
        return out

    def backward(self, dout):
        """Return ``(dhs_enc, dhs_dec)`` for the upstream gradient ``dout``."""
        _, steps, _ = dout.shape
        dhs_dec = np.empty_like(dout)
        # hs_enc is shared by every step, so its gradients are accumulated.
        dhs_enc = 0
        for step in range(steps):
            dhs, dh = self.layers[step].backward(dout[:, step, :])
            dhs_enc += dhs
            dhs_dec[:, step, :] = dh
        return dhs_enc, dhs_dec

In [None]:
"""
这里仅创建必要数量的 Attention 层（代码中为 T 个），各自进行正向传播和反向传播。另外，attention_weights 列表中保存了各个 Attention 层对各个单词的权重。
"""