# 实现示意图

![](images/640.webp)

# 导入库和工具函数

In [27]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


# Silence all Python warnings so library noise does not clutter the output.
warnings.filterwarnings("ignore")
# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True

In [28]:
# Some convenience helper functions used throughout the notebook
alt.renderers.enable("mimetype")  # works around Altair rendering problems


def is_interactive_notebook():
    """Return True when this module is executing as ``__main__``.

    That is the case for a notebook kernel or a directly-run script; when the
    module is imported, ``__name__`` is the module name and the result is False.
    """
    running_as_main = __name__ == "__main__"
    return running_as_main


def show_example(fn, args=()):
    """Run an example function only when examples are enabled.

    Args:
        fn: Callable to invoke.
        args: Positional arguments unpacked into ``fn``. Defaults to an empty
            tuple; an immutable default avoids the shared-mutable-default
            pitfall of ``args=[]``. Any iterable (list, tuple, ...) works.

    Returns:
        ``fn(*args)`` when the module runs as ``__main__`` and ``RUN_EXAMPLES``
        is truthy; otherwise ``None`` (the function is not called).
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)
    return None


class DummyOptimizer(torch.optim.Optimizer):
    """Optimizer stand-in whose ``step`` and ``zero_grad`` do nothing.

    It exposes a single placeholder parameter group with a learning rate of 0
    so that code reading ``param_groups[0]["lr"]`` keeps working.
    """

    def __init__(self):
        # The base-class constructor is intentionally not invoked: there are no
        # real parameters to register, only the placeholder group below.
        self.param_groups = [{"lr": 0}]

    def step(self):
        """Perform no update and return None."""
        return None

    def zero_grad(self, set_to_none=False):
        """Ignore the request (there are no gradients) and return None."""
        return None


class DummyScheduler:
    """Learning-rate scheduler stand-in whose ``step`` does nothing."""

    def step(self):
        """Perform no schedule update and return None."""
        return None

# 代码结构

这里面，通常二维张量的形状可能是 (seq_len, d_model)，三维张量的形状可能是 (batch_size, seq_len, d_model)

## Embedding

论文说，In our model, we share the same weight matrix between the two embedding layers and
the pre-softmax linear transformation, similar to [(cite)](https://arxiv.org/abs/1608.05859). 然而，输入层和输出层的 vocab_size 很可能不同，所以这种情况下就不太可能有相同的权值矩阵。

说到权值矩阵，这玩意显然可以学习。

论文说，In the embedding layers, we multiply those weights by $\sqrt{d_{\text{model}}}$. 代码里也确实是这么写的。

In [29]:
class Embeddings(nn.Module):
    """Token embedding lookup whose output is scaled by sqrt(d_model).

    ``d_model`` is the model (embedding) dimension -- 512 in the paper -- and
    ``lut`` stands for "lookup table"; it is a plain ``nn.Embedding``.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.lut = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model

    def forward(self, x: torch.Tensor):
        """Embed token ids and scale the vectors.

        The result has the shape of ``x`` with one trailing embedding axis
        appended, e.g. (batch_size, sequence_length) ->
        (batch_size, sequence_length, d_model).
        """
        scale = math.sqrt(self.d_model)
        embedded = self.lut(x)
        return embedded * scale

## 位置编码

In this work, we use sine and cosine functions of different frequencies:

$$PE_{(pos,2i)} = \sin(pos / 10000^{2i/d_{\text{model}}})$$

$$PE_{(pos,2i+1)} = \cos(pos / 10000^{2i/d_{\text{model}}})$$

$pos / 10000^{2i/d_{\text{model}}} = pos \times 10000^{-2i/d_{\text{model}}}$。我们可以通过 $\exp(\log(10000^{-2i/d_{\text{model}}})) = \exp(-2i\cdot\log(10000)/d_{\text{model}})$ 来计算 $10000^{-2i/d_{\text{model}}}$。其中 $2i$ 的取值就是 $0,2,\ldots,d_{\text{model}}-2$（$d_{\text{model}}$ 为偶数时）。

where $pos$ is the position and $i$ is the dimension.  That is, each
dimension of the positional encoding corresponds to a sinusoid.  The
wavelengths form a geometric progression from $2\pi$ to $10000 \cdot
2\pi$.  We chose this function because we hypothesized it would
allow the model to easily learn to attend by relative positions,
since for any fixed offset $k$, $PE_{pos+k}$ can be represented as a
linear function of $PE_{pos}$. 尽管训练集的序列长度有限，我们也容许长一点的序列，但还是不能太长，否则位置编码就无法预先生成，只能现算了。

In addition, we apply dropout to the sums of the embeddings and the
positional encodings in both the encoder and decoder stacks.  For
the base model, we use a rate of $P_{drop}=0.1$. 所以，最后要加一层 dropout。

In [30]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to the input, then apply dropout.

    The table is precomputed once for ``max_len`` positions:
    even feature columns hold ``sin(pos * 10000^(-2i/d_model))`` and odd
    columns hold ``cos(pos * 10000^(-2i/d_model))``.

    Args:
        d_model: Size of the feature (last) axis. Odd values are supported;
            the last sine column then simply has no cosine partner.
        dropout_rate: Dropout probability applied to the sum.
        max_len: Number of positions to precompute. ``forward`` expects inputs
            whose second axis is no longer than this.

    ``forward`` expects ``x`` laid out as (batch_size, seq_len, d_model).
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout_rate)
        # Axis 0 indexes the position, axis 1 the feature.
        position = torch.arange(0, max_len).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000) / d_model))
        # Broadcasted element-wise product: (max_len, ceil(d_model / 2)).
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # There are only d_model // 2 odd columns; for odd d_model that is one
        # fewer than the even columns, so trim the angle table to fit.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        # A buffer, not a parameter: saved/moved with the module but not trained.
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor):
        """Return ``dropout(x + pe[:, :seq_len])``.

        The table is sliced to the input's sequence length and broadcast over
        the batch axis; it is explicitly excluded from gradient tracking.
        """
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

测试位置编码

In [31]:
def example_positional():
    """Build an Altair line chart of positional-encoding values.

    Encodes an all-zero input of 100 positions with d_model=20 and no dropout,
    then plots feature dimensions 4-7 against position.
    """
    encoder = PositionalEncoding(20, 0)
    encoded = encoder.forward(torch.zeros(1, 100, 20))

    # One long-format frame per plotted feature dimension.
    frames = []
    for dim in [4, 5, 6, 7]:
        frame = pd.DataFrame(
            {
                "embedding": encoded[0, :, dim],
                "dimension": dim,
                "position": list(range(100)),
            }
        )
        frames.append(frame)
    data = pd.concat(frames)

    chart = alt.Chart(data).mark_line().properties(width=800)
    chart = chart.encode(x="position", y="embedding", color="dimension:N")
    return chart.interactive()


# Render the positional-encoding chart; show_example only calls the function
# when running as __main__ with RUN_EXAMPLES enabled.
show_example(example_positional)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


## 注意力机制

便捷地克隆模块：

In [32]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding N independent deep copies of ``module``.

    Each copy has its own parameters, initialised to the same values as the
    prototype; the prototype itself is not included in the list.
    """
    replicas = []
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return nn.ModuleList(replicas)

单头注意力（缩放点积注意力，也就是对 V 加权求和，权重是缩放点积经过 softmax 的结果）

$$ \mathrm{Attention}(Q, K, V) = \mathrm{softmax}(\frac{QK^T}{\sqrt{d_k}})V $$

![](images/ModalNet-19.png)

In [33]:
def attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask=None,
    dropout=None,
):
    """Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Args:
        query, key, value: Tensors whose last two axes are (length, feature).
            Any leading axes are treated as batch axes by ``torch.matmul``.
        mask: Optional tensor broadcastable to the score shape; entries equal
            to 0 are masked out, non-zero entries are kept.
        dropout: Optional callable (e.g. an ``nn.Dropout``) applied to the
            attention probabilities.

    Returns:
        A tuple ``(output, p_attn)``: the weighted sum of ``value`` and the
        attention probabilities (after dropout, if given).
    """
    # Size of the last axis of the query. Inside multi-head attention this is
    # the per-head width, not the full model width.
    d_k = query.size(-1)
    # Swap the last two axes of key so the product is a batched Q @ K^T.
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Masked positions get a very negative score before the softmax so
        # their probability becomes ~0 (this is how the decoder is prevented
        # from looking at later positions). -1e9 is not representable in
        # float16, so clamp it to the dtype's finite range; for float32,
        # float64 and bfloat16 the fill value is still exactly -1e9.
        fill_value = max(-1e9, torch.finfo(scores.dtype).min)
        scores = scores.masked_fill(mask == 0, fill_value)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

多头注意力

$$
\mathrm{MultiHead}(Q, K, V) =
    \mathrm{Concat}(\mathrm{head_1}, ..., \mathrm{head_h})W^O \\
    \text{where}~\mathrm{head_i} = \mathrm{Attention}(QW^Q_i, KW^K_i, VW^V_i)
$$

Where the projections are parameter matrices $W^Q_i \in
\mathbb{R}^{d_{\text{model}} \times d_k}$, $W^K_i \in
\mathbb{R}^{d_{\text{model}} \times d_k}$, $W^V_i \in
\mathbb{R}^{d_{\text{model}} \times d_v}$ and $W^O \in
\mathbb{R}^{hd_v \times d_{\text{model}}}$.

In [34]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention with ``h`` heads over a ``d_model``-wide input.

    ``h`` must divide ``d_model``; each head works on ``d_k = d_model // h``
    features, used for both keys and values.
    """

    def __init__(self, h: int, d_model: int, dropout_rate=0.1):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        # Four d_model -> d_model projections: the first three project Q, K
        # and V (all heads at once, as h blocks of d_k columns), the fourth is
        # the output projection W^O applied after the heads are concatenated.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        # Most recent attention probabilities, kept for later visualisation.
        self.attn = None
        # Dropout has no parameters, so one instance is shared.
        self.dropout = nn.Dropout(dropout_rate)

    def forward(
        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask=None
    ):
        """Project the inputs, attend per head, merge the heads and project again.

        Inputs are expected as (batch_size, seq_len, d_model); ``mask``, when
        given, is expected as (batch_size, seq_len, seq_len) or broadcastable
        to it.
        """
        batch = query.size(0)
        proj_q, proj_k, proj_v, proj_out = self.linears

        def split_heads(t):
            # (batch, seq_len, d_model) -> (batch, seq_len, h, d_k)
            # -> (batch, h, seq_len, d_k)
            return t.view(batch, -1, self.h, self.d_k).transpose(1, 2)

        # 1. Linear projections, then split into heads.
        q_heads = split_heads(proj_q(query))
        k_heads = split_heads(proj_k(key))
        v_heads = split_heads(proj_v(value))

        # 2. Insert a head axis so the same mask applies to all h heads.
        head_mask = None
        if mask is not None:
            head_mask = mask.unsqueeze(1)

        # 3. Attention over every head in one batched call.
        context, self.attn = attention(
            q_heads, k_heads, v_heads, head_mask, self.dropout
        )

        # 4. Concatenate the heads; the tensor must be made contiguous before
        #    it can be viewed back as (batch, seq_len, h * d_k).
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch, -1, self.h * self.d_k)

        # 5. Output projection.
        return proj_out(merged)

### 生成 mask

In [35]:
def subsequent_mask(size: int):
    """Build a boolean mask that hides subsequent (future) positions.

    Returns a (1, size, size) bool tensor that is True on and below the main
    diagonal and False above it, so position i may attend to positions <= i.
    """
    lower_triangle = torch.tril(torch.ones((1, size, size)))
    return lower_triangle.bool()

In [36]:
def example_mask():
    """Build an Altair heat map of the 20x20 subsequent-position mask.

    Each cell (Masking=x, Window=y) shows the value of
    ``subsequent_mask(20)[0][x, y]``.
    """
    # The mask does not depend on the loop variables, so build it once rather
    # than once per cell (400 times).
    mask = subsequent_mask(20)[0]

    LS_data = pd.concat(
        [
            pd.DataFrame(
                {
                    "Subsequent Mask": mask[x, y].flatten(),
                    "Window": y,
                    "Masking": x,
                }
            )
            for y in range(20)
            for x in range(20)
        ]
    )

    return (
        alt.Chart(LS_data)
        .mark_rect()
        .properties(height=250, width=250)
        .encode(
            alt.X("Window:O"),
            alt.Y("Masking:O"),
            alt.Color("Subsequent Mask:Q", scale=alt.Scale(scheme="viridis")),
        )
        .interactive()
    )


# Render the mask heat map; show_example only calls the function when running
# as __main__ with RUN_EXAMPLES enabled.
show_example(example_mask)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


## 前馈网络

就是俩线性层

In [37]:
class PositionwiseFeedForward(nn.Module):
    """Feed-forward sub-layer: Linear -> ReLU -> Dropout -> Linear.

    ``w_1`` expands the last axis from ``d_model`` to ``d_ff`` and ``w_2``
    projects it back to ``d_model``.
    """

    def __init__(self, d_model: int, d_ff: int, dropout_rate=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x: torch.Tensor):
        """Apply the two linear maps with ReLU and dropout in between."""
        hidden = self.w_1(x)
        activated = hidden.relu()
        return self.w_2(self.dropout(activated))

## 层归一化、残差连接，及子层连接

层归一化里面也有可学习参数

和论文里不太一样，这里是先层归一化，再子层，再 dropout，再残差

In [38]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last (feature) axis.

    The input is centred by its mean and divided by ``std + eps`` along the
    last axis, then scaled by the learnable gain ``a_2`` and shifted by the
    learnable bias ``b_2``.
    """

    def __init__(self, features_size: int, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features_size))  # learnable gain
        self.b_2 = nn.Parameter(torch.zeros(features_size))  # learnable bias
        self.eps = eps

    def forward(self, x: torch.Tensor):
        """Normalise ``x`` along its last axis; the shape is unchanged."""
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        centered = x - mu
        # eps is added to the standard deviation to avoid division by zero.
        return self.a_2 * centered / (sigma + self.eps) + self.b_2

In [39]:
class SublayerConnection(nn.Module):
    """Residual wrapper around a sub-layer: ``x + dropout(sublayer(norm(x)))``.

    For code simplicity the layer norm is applied first (before the sub-layer)
    rather than last. ``size`` is the feature width (d_model). The wrapped
    sub-layer is the multi-head attention, masked multi-head attention or
    feed-forward network.
    """

    def __init__(self, size: int, dropout_rate: float):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x: torch.Tensor, sublayer):
        """Apply ``sublayer`` with pre-normalisation, dropout and a residual add.

        ``sublayer`` may be an ``nn.Module`` or a plain function/lambda; it
        only needs to be callable with a single tensor.
        """
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

## 一层 encoder



In [40]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward network.

    The attention module and the feed-forward module are passed in; each is
    wrapped in its own ``SublayerConnection``.
    """

    def __init__(self, size: int, self_attn, feed_forward, dropout_rate: float):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        # Two wrapped sub-layers: [0] multi-head attention, [1] feed-forward.
        self.sublayer_connections = clones(SublayerConnection(size, dropout_rate), 2)
        self.size = size

    def forward(self, x: torch.Tensor, mask):
        """Run self-attention (query = key = value) and then the feed-forward net."""
        attn_wrapper, ff_wrapper = self.sublayer_connections

        def self_attention(normed):
            # The wrapper hands over the normalised input; it serves as query,
            # key and value alike.
            return self.self_attn(normed, normed, normed, mask)

        attended = attn_wrapper(x, self_attention)
        return ff_wrapper(attended, self.feed_forward)

## 一层 decoder

简直跟 encoder 一模一样！就是多了一层

In [41]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, source attention, then feed-forward.

    The two attention modules and the feed-forward module are passed in; each
    is wrapped in its own ``SublayerConnection``.
    """

    def __init__(
        self, size: int, self_attn, src_attn, feed_forward, dropout_rate: float
    ):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout_rate), 3)

    def forward(self, x: torch.Tensor, memory: torch.Tensor, src_mask, tgt_mask):
        """Follow Figure 1 (right) of the paper for the connections.

        ``tgt_mask`` is used by the decoder self-attention; ``src_mask`` is
        used by the attention over ``memory`` (the encoder-to-decoder step).
        """
        self_wrapper, src_wrapper, ff_wrapper = self.sublayer

        def masked_self_attention(normed):
            return self.self_attn(normed, normed, normed, tgt_mask)

        def source_attention(normed):
            # Queries come from the decoder; keys and values come from memory.
            return self.src_attn(normed, memory, memory, src_mask)

        x = self_wrapper(x, masked_self_attention)
        x = src_wrapper(x, source_attention)
        return ff_wrapper(x, self.feed_forward)

## 6 层 encoder 堆叠成一个 encoder

In [42]:
class Encoder(nn.Module):
    """Stack of N identical encoder layers followed by a final layer norm.

    ``layer`` is the prototype that gets cloned N times (an ``EncoderLayer``
    is expected); it must expose a ``size`` attribute.
    """

    def __init__(self, layer, N: int):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x: torch.Tensor, mask):
        """Feed ``x`` (with ``mask``) through every layer in order, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

## 6 层 decoder 堆叠成一个 decoder

In [43]:
class Decoder(nn.Module):
    """Stack of N identical decoder layers (with masking) and a final layer norm.

    ``layer`` is the prototype that gets cloned N times; it must expose a
    ``size`` attribute.
    """

    def __init__(self, layer, N: int):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x: torch.Tensor, memory: torch.Tensor, src_mask, tgt_mask):
        """Feed ``x`` through every layer with ``memory`` and both masks, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

## 最后的 generator：生成词的（对数）概率

In [44]:
class Generator(nn.Module):
    """Standard generation step: linear projection followed by log-softmax.

    The projection maps the last axis from ``d_model`` to ``vocab_size`` and
    the log-softmax is taken over that last axis.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x: torch.Tensor):
        """Return log-probabilities over the vocabulary along the last axis."""
        logits = self.proj(x)
        return log_softmax(logits, dim=-1)

## 终极无敌最终模型

In [45]:
class EncoderDecoder(nn.Module):
    """A standard encoder-decoder architecture; base for this and many other models.

    Attributes:
        encoder: The encoder stack (an ``Encoder``).
        decoder: The decoder stack (a ``Decoder``).
        src_embed: Module applying embedding and positional encoding to the source.
        tgt_embed: Module applying embedding and positional encoding to the target.
        generator: Output module; it is stored here but not called by
            ``forward``, so callers apply it to the decoder output themselves.
    """

    def __init__(
        self,
        encoder: nn.Module,
        decoder: nn.Module,
        src_embed: nn.Module,
        tgt_embed: nn.Module,
        generator: nn.Module,
    ):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def encode(self, src: torch.Tensor, src_mask):
        """Embed the source and run it through the encoder."""
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory: torch.Tensor, src_mask, tgt: torch.Tensor, tgt_mask):
        """Embed the target and run the decoder over it and the encoder memory."""
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Take in and process masked source and target sequences."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)