In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch import Tensor
from torch.nn.init import constant_, xavier_uniform_
torch.manual_seed(4)

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding that is added to embedded tokens.

    The encoding is stored in a non-trainable buffer named ``pos_table`` of
    shape ``(n_position, d_model)``; because it is a buffer, the optimizer
    never updates it.

    Args:
        d_model: size of each embedding vector.
        n_position: maximum sequence length the table covers.
        device: optional device on which the buffer is created.
        dtype: optional dtype of the buffer (defaults to torch's default dtype).
    """

    def __init__(self, d_model, n_position=200, device=None, dtype=None):
        super().__init__()
        self.d_model = d_model
        self.n_position = n_position  # effectively the maximum sequence length
        # Honour the factory arguments; they were previously accepted but ignored.
        factory_kwargs = {'dtype': dtype if dtype is not None else torch.get_default_dtype()}
        if device is not None:
            factory_kwargs['device'] = device
        table = self._get_sinusoid_encoding_table(n_position, d_model)
        self.register_buffer('pos_table', table.to(**factory_kwargs))

    def _get_sinusoid_encoding_table(self, n_position, d_model):
        """Return the ``(n_position, d_model)`` table of sin/cos encodings.

        Column ``i`` uses the angle ``pos / 10000 ** (2 * (i // 2) / d_model)``;
        even columns hold its sine and odd columns its cosine.
        """
        # Computed in float64 and cast at the end to limit rounding error.
        position = torch.arange(n_position, dtype=torch.float64).unsqueeze(1)  # (n_position, 1)
        # 2 * (i // 2) runs 0, 0, 2, 2, 4, 4, ... so each sin/cos pair shares a frequency.
        pair_index = torch.div(torch.arange(d_model), 2, rounding_mode='floor')
        exponent = (2 * pair_index).to(torch.float64) / d_model
        angles = position / torch.pow(10000.0, exponent)  # (n_position, d_model)

        table = torch.empty_like(angles)
        table[:, 0::2] = torch.sin(angles[:, 0::2])  # even embedding dimensions
        table[:, 1::2] = torch.cos(angles[:, 1::2])  # odd embedding dimensions
        return table.to(torch.get_default_dtype())

    def forward(self, x: Tensor) -> Tensor:
        """Add the positional encoding to ``x``.

        Args:
            x: embedded tokens whose last two axes are ``(seq_len, d_model)``;
                any leading axes (e.g. batch) are broadcast over.

        Returns:
            A tensor with the same shape as ``x``.

        Raises:
            ValueError: if ``seq_len`` exceeds the number of encoded positions.
        """
        seq_len = x.size(-2)
        max_len = self.pos_table.size(0)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds n_position={max_len}")
        # Only the first seq_len rows are needed; the buffer carries no gradient.
        return x + self.pos_table[:seq_len]
    
def create_mask(src_seq: Tensor, tgt_seq: Tensor, PAD_IDX):
    """Build the attention masks for a source/target pair of token-id tensors.

    Args:
        src_seq: source token ids, ``(batch, src_len)`` or unbatched ``(src_len,)``.
        tgt_seq: target token ids, laid out like ``src_seq``.
        PAD_IDX: token id that marks padding.

    Returns:
        A tuple ``(src_mask, tgt_mask, src_key_padding_mask, tgt_key_padding_mask)``:

        * ``src_mask``: all-``False`` bool tensor ``(src_len, src_len)`` (no masking).
        * ``tgt_mask``: float32 causal mask ``(tgt_len, tgt_len)`` with ``0.0`` on and
          below the diagonal and ``-inf`` above it.
        * ``src_key_padding_mask`` / ``tgt_key_padding_mask``: bool tensors shaped
          like the inputs, ``True`` where the token equals ``PAD_IDX``.

        Every mask lives on the device of the sequence it was derived from.
    """
    # Batchedness is decided from src_seq only, as before.
    seq_axis = 1 if src_seq.dim() == 2 else 0
    src_seq_len = src_seq.shape[seq_axis]
    tgt_seq_len = tgt_seq.shape[seq_axis]

    # Causal mask: triu(diagonal=1) keeps -inf strictly above the diagonal and
    # zeroes everything else, so position i can only see positions <= i.
    tgt_mask = torch.triu(
        torch.full((tgt_seq_len, tgt_seq_len), float('-inf'),
                   dtype=torch.float32, device=tgt_seq.device),
        diagonal=1)
    # The source side needs no attention mask, so every entry is False.
    src_mask = torch.zeros((src_seq_len, src_seq_len), dtype=torch.bool, device=src_seq.device)

    src_key_padding_mask = (src_seq == PAD_IDX)
    tgt_key_padding_mask = (tgt_seq == PAD_IDX)
    return src_mask, tgt_mask, src_key_padding_mask, tgt_key_padding_mask


# Toy batch: two sequences of three token ids each; id 0 is used as padding.
PAD_IDX = 0
src_seq = torch.LongTensor([[1,2,0],[1,3,0]])
tgt_seq = torch.LongTensor([[1,3,0],[2,1,0]])
num_heads = 4
embed_dim = 4

src_mask, tgt_mask, src_key_padding_mask, tgt_key_padding_mask = create_mask(src_seq, tgt_seq, PAD_IDX)

# NOTE: the vocabulary size deliberately reuses embed_dim here, so only token
# ids 0..embed_dim-1 are valid in this demo.
# The modules are created in this exact order so that the seeded random
# initialisation (torch.manual_seed above) stays reproducible.
src_embedding = nn.Embedding(num_embeddings=embed_dim, embedding_dim=embed_dim, padding_idx=PAD_IDX)
tgt_embedding = nn.Embedding(num_embeddings=embed_dim, embedding_dim=embed_dim, padding_idx=PAD_IDX)
# The table length is tied to the actual sequence length instead of a literal 3.
pose = PositionalEncoding(d_model=embed_dim, n_position=src_seq.shape[1])
dropout = nn.Dropout(0.1)
layer_norm = nn.LayerNorm(embed_dim, eps=0.00001)
Model_nnTransformer = nn.Transformer(d_model=embed_dim, nhead=num_heads, batch_first=True)
tgt_out = nn.Linear(embed_dim, embed_dim, bias=False)

# Embedding -> positional encoding -> dropout -> LayerNorm, for both sides.
src_input = layer_norm(dropout(pose(src_embedding(src_seq))))
tgt_input = layer_norm(dropout(pose(tgt_embedding(tgt_seq))))
output = Model_nnTransformer(src_input, tgt_input, src_mask=src_mask, tgt_mask=tgt_mask,
                memory_mask=None, src_key_padding_mask=src_key_padding_mask,
                tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=None)

# Project to the (toy) vocabulary and normalise over the last axis.
output = F.softmax(tgt_out(output), -1)
print(output)

# x1 = torch.tensor([
#     [ [[1,2,1],[1,2,1],[1,2,1]], [[1,2,1],[1,2,1],[1,2,1]], [[1,2,1],[1,2,1],[1,2,1]], [[1,2,1],[1,2,1],[1,2,1]] ],
#     [ [[1,2,1],[1,2,1],[1,2,1]], [[1,2,1],[1,2,1],[1,2,1]], [[1,2,1],[1,2,1],[1,2,1]], [[1,2,1],[1,2,1],[1,2,1]] ] ])
# y1 = torch.tensor([[[[0,0,-99999]]],[[[0,-99999,-99999]]]])
# print(x1.shape)
# print(y1.shape)
# print(x1)
# print(y1)
# print(x1+y1)
# tgt_mask = (torch.triu(torch.ones(6, 6)) == 1).transpose(0, 1)
# print(tgt_mask)

# Index grids for 10 positions x 8 frequency pairs:
# pos[i, j] == i and two_i[i, j] == 2 * j.
i_seq = torch.linspace(0, 10 - 1, 10)
j_seq = torch.linspace(0, 16 - 2, 16 // 2)
# indexing="ij" is the layout this call already produced; passing it explicitly
# avoids the warning torch.meshgrid emits when the argument is omitted.
pos, two_i = torch.meshgrid(i_seq, j_seq, indexing="ij")

# Unpacking a 2-D tensor with * yields its rows as separate tensors.
x = torch.tensor([[1,2,3],[4,5,7]])
print(*x)

tensor([[[0.4114, 0.1927, 0.1980, 0.1979],
         [0.3803, 0.3329, 0.1926, 0.0943],
         [0.3818, 0.3147, 0.1800, 0.1235]],

        [[0.4161, 0.1665, 0.2039, 0.2135],
         [0.2720, 0.2323, 0.1895, 0.3062],
         [0.3177, 0.2436, 0.1880, 0.2507]]], grad_fn=<SoftmaxBackward0>)
tensor([1, 2, 3]) tensor([4, 5, 7])


In [7]:
# Self-attention over src_seq using the functional multi-head attention API.
torch.manual_seed(4)
batch_size, seq_len = src_seq.shape
embedding = nn.Embedding(num_embeddings=embed_dim, embedding_dim=embed_dim, padding_idx=0)
query = embedding(src_seq)  # (N, L, E): batch, sequence length, embedding dim
query = key = value = query.transpose(1, 0)  # q/k/v are passed as (L, N, E)

# Packed input projection: rows [0:E] are W_q, [E:2E] are W_k and [2E:3E] are W_v,
# so the first dimension is 3 * embed_dim. (It was previously sized with seq_len,
# which only worked because seq_len happens to be 3 in this demo.)
in_proj_weight = nn.Parameter(torch.empty((3 * embed_dim, embed_dim)))
xavier_uniform_(in_proj_weight)
in_proj_bias = nn.Parameter(torch.empty(3 * embed_dim))
out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
constant_(in_proj_bias, 0.0)  # fill both biases with zeros
constant_(out_proj.bias, 0.0)

# Convert the padding mask to the canonical format and make sure its type is
# reconciled with the attention mask. NOTE(review): _canonical_mask and
# _none_or_dtype are private torch helpers and may change between releases.
src_key_padding_mask = F._canonical_mask(  # (N, S): batch, source sequence length
    mask=src_key_padding_mask,
    mask_name="key_padding_mask",
    other_type=F._none_or_dtype(src_mask),  # dtype of src_mask, or None if it is absent
    other_name="attn_mask",
    target_type=query.dtype  # dtype the mask is converted to
)

attn_output, attn_output_weights = F.multi_head_attention_forward(
            query, key, value, embed_dim_to_check = embed_dim, num_heads = num_heads,
            in_proj_weight = in_proj_weight, in_proj_bias = in_proj_bias,
            bias_k = None, bias_v= None, add_zero_attn = False,
            dropout_p = 0.1, out_proj_weight = out_proj.weight, out_proj_bias = out_proj.bias,
            training=False,
            key_padding_mask=src_key_padding_mask, need_weights=True,
            attn_mask=src_mask,
            average_attn_weights=False,
            is_causal=False)
attn_output = attn_output.transpose(1, 0)  # back to (N, L, E)
print(attn_output.shape)
print(attn_output)
print(attn_output_weights)

torch.Size([2, 3, 4])
tensor([[[ 0.0476,  0.3066, -0.8684,  0.5803],
         [ 0.1157,  0.4071, -0.5418,  0.4576],
         [ 0.0593,  0.3850, -0.6665,  0.4705]],

        [[-0.3918,  0.8157, -0.9510,  0.1099],
         [-0.2867,  0.9686, -0.8271,  0.2097],
         [-0.3277,  0.6723, -1.0601,  0.2519]]], grad_fn=<TransposeBackward0>)
tensor([[[[0.7595, 0.2405, 0.0000],
          [0.3648, 0.6352, 0.0000],
          [0.5000, 0.5000, 0.0000]],

         [[0.6013, 0.3987, 0.0000],
          [0.3939, 0.6061, 0.0000],
          [0.5000, 0.5000, 0.0000]],

         [[0.4774, 0.5226, 0.0000],
          [0.5598, 0.4402, 0.0000],
          [0.5000, 0.5000, 0.0000]],

         [[0.5029, 0.4971, 0.0000],
          [0.5396, 0.4604, 0.0000],
          [0.5000, 0.5000, 0.0000]]],


        [[[0.2195, 0.7805, 0.0000],
          [0.1252, 0.8748, 0.0000],
          [0.5000, 0.5000, 0.0000]],

         [[0.4872, 0.5128, 0.0000],
          [0.4765, 0.5235, 0.0000],
          [0.5000, 0.5000, 0.0000]],



In [8]:
# Hand-written multi-head attention that reproduces the functional call above
# from the same query/key/value and projection parameters.
key_padding_mask = src_key_padding_mask
attn_mask = src_mask
training = False
dropout = nn.Dropout(0.1)
head_dim = embed_dim // num_heads

# Slice the packed projection into its Q, K and V parts; inputs are (L, N, E)
# and are moved to (N, L, E) after projecting.
Q = (torch.matmul(query, in_proj_weight[:embed_dim, :].transpose(0, 1)) + in_proj_bias[:embed_dim]).transpose(0, 1)
K = (torch.matmul(key, in_proj_weight[embed_dim:2*embed_dim, :].transpose(0, 1)) + in_proj_bias[embed_dim:2*embed_dim]).transpose(0, 1)
V = (torch.matmul(value, in_proj_weight[2*embed_dim:3*embed_dim, :].transpose(0, 1)) + in_proj_bias[2*embed_dim:3*embed_dim]).transpose(0, 1)

# Split the embedding axis into heads: (N, L, E) -> (N, L, H, D) -> (N, H, L, D).
# The head axis is num_heads; it used to be written as embed_dim, which only
# worked because both are 4 in this demo.
Q = Q.reshape(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
K = K.reshape(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)
V = V.reshape(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)

# Scaled dot-product scores: (N, H, L, L).
attn_output_weights = torch.matmul(Q / (head_dim ** 0.5), K.transpose(-2, -1))
if key_padding_mask is not None:
    # Use a local name so re-running the cell does not keep reshaping the global mask.
    padding_mask_4d = key_padding_mask.unsqueeze(1).unsqueeze(2)  # (N, 1, 1, L)
    # Any non-zero entry (True, or -inf after canonicalisation) marks a padded key.
    attn_output_weights = attn_output_weights.masked_fill(padding_mask_4d != 0, float('-inf'))
if attn_mask is not None:
    attn_mask_4d = attn_mask.unsqueeze(0).unsqueeze(1)  # (1, 1, L, L)
    if attn_mask_4d.dtype == torch.bool:
        # Boolean masks mark blocked positions; adding them would only add 0/1.
        attn_output_weights = attn_output_weights.masked_fill(attn_mask_4d, float('-inf'))
    else:
        # Float masks are additive (0 keeps a score, -inf removes it).
        attn_output_weights = attn_output_weights + attn_mask_4d

attn_output_weights = torch.softmax(attn_output_weights, dim=-1)
if training:
    attn_output_weights = dropout(attn_output_weights)
# Weighted sum of the values: (N, H, L, D).
attn_output = torch.matmul(attn_output_weights, V)
# Merge the heads back: (N, H, L, D) -> (N, L, H, D) -> (N, L, E).
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.contiguous().view(batch_size, seq_len, embed_dim)
# Output projection, then back to the (L, N, E) layout.
attn_output = out_proj(attn_output).transpose(0, 1)
print(attn_output.shape)
print(attn_output)
print(attn_output_weights)


torch.Size([3, 2, 4])
tensor([[[ 0.0476,  0.3066, -0.8684,  0.5803],
         [-0.3918,  0.8157, -0.9510,  0.1099]],

        [[ 0.1157,  0.4071, -0.5418,  0.4576],
         [-0.2867,  0.9686, -0.8271,  0.2097]],

        [[ 0.0593,  0.3850, -0.6665,  0.4705],
         [-0.3277,  0.6723, -1.0601,  0.2519]]], grad_fn=<TransposeBackward0>)
tensor([[[[0.7595, 0.2405, 0.0000],
          [0.3648, 0.6352, 0.0000],
          [0.5000, 0.5000, 0.0000]],

         [[0.6013, 0.3987, 0.0000],
          [0.3939, 0.6061, 0.0000],
          [0.5000, 0.5000, 0.0000]],

         [[0.4774, 0.5226, 0.0000],
          [0.5598, 0.4402, 0.0000],
          [0.5000, 0.5000, 0.0000]],

         [[0.5029, 0.4971, 0.0000],
          [0.5396, 0.4604, 0.0000],
          [0.5000, 0.5000, 0.0000]]],


        [[[0.2195, 0.7805, 0.0000],
          [0.1252, 0.8748, 0.0000],
          [0.5000, 0.5000, 0.0000]],

         [[0.4872, 0.5128, 0.0000],
          [0.4765, 0.5235, 0.0000],
          [0.5000, 0.5000, 0.0000]],

In [17]:
# Embedding lookup demo: a table of 10 rows with 8 columns each. Looking up
# every index 0..9 in order returns the rows of the weight matrix itself.
a = nn.Embedding(10, 8)
print(a.weight)
all_indices = torch.arange(10)
print(a(all_indices))

Parameter containing:
tensor([[ 0.4300, -0.7901, -1.8083, -1.1338,  1.2966, -0.1208, -0.5116, -1.5176],
        [ 0.4556, -1.6697,  0.0218, -0.8667,  1.3035,  0.6310, -0.9778,  0.3571],
        [-0.8921,  1.6783,  1.1638,  0.5075, -1.1120,  1.3915, -0.5504,  0.1138],
        [-0.8925, -2.2975, -0.7189,  0.6959, -0.3114,  0.1659,  1.3819, -1.3395],
        [-0.6346, -0.6993, -1.3065,  0.5795,  0.4003, -0.3327, -0.6629,  0.1936],
        [-0.6293, -0.7184,  1.1213, -0.1945, -2.0126, -0.2996, -1.1608,  0.0406],
        [ 1.1254,  3.1009, -0.5068, -1.3209,  0.2486, -1.5305,  1.2565, -0.6270],
        [ 0.5675, -0.2091, -1.6583, -0.8675,  0.7634, -0.6300, -0.7686, -0.3769],
        [ 0.8233, -2.0816, -0.5910, -0.2266,  1.2819, -0.2478,  0.7555,  1.1254],
        [-0.7683, -0.8072, -0.4347, -0.8952,  0.3564, -0.3828, -1.7014,  1.0922]],
       requires_grad=True)
tensor([[ 0.4300, -0.7901, -1.8083, -1.1338,  1.2966, -0.1208, -0.5116, -1.5176],
        [ 0.4556, -1.6697,  0.0218, -0.8667,  1.

In [9]:

# x = torch.tensor([[1,3],[3,3],[0,2]])

# print(torch.unsqueeze(x, -3))  # tensor([[1., 2., 3., 4.]])
# print(torch.unsqueeze(x, -3).size())  # torch.Size([1, 4])
# print(torch.unsqueeze(x, -3).dim())
# # 0 -> 1 3 2
# # 1 -> 3 1 2
# # 2 -> 3 2 1
# # -1 -> 3 2 1
# # -2 -> 3 1 2
# # -3 -> 1 3 2
# print(torch.tensor([[1,2,3,4,5]]).size())
# x1 = torch.tensor([1,3,4,6,7,98.5,4,23],requires_grad=True)
# x2 = torch.tensor([98.5,1,3,4,6,7,4,23],requires_grad=True)
# y = max((x1**2) * x2)
# print(y.grad_fn)

In [None]:
# batch_size = query.shape[1] # [seq_len, batchsize, d_model]
#         seq_len_q = query.shape[0] #50
#         seq_len_k = key.shape[0]
#         seq_len_v = value.shape[0]
#         Q = (torch.matmul(query,self.in_QKV_weight[:self.embed_dim,:].transpose(0,1)) + self.in_QKV_bias[:self.embed_dim]).transpose(0,1)
#         K = (torch.matmul(key,self.in_QKV_weight[self.embed_dim:2*self.embed_dim,:].transpose(0,1)) + self.in_QKV_bias[self.embed_dim:2*self.embed_dim]).transpose(0,1)
#         V = (torch.matmul(value,self.in_QKV_weight[2*self.embed_dim:3*self.embed_dim,:].transpose(0,1)) + self.in_QKV_bias[2*self.embed_dim:3*self.embed_dim]).transpose(0,1)
#         Q = Q.view(batch_size, seq_len_q, self.num_heads, (self.embed_dim//self.num_heads)).transpose(1,2) # batch_size seq_len num_heads head_dim
#         K = K.view(batch_size, seq_len_k, self.num_heads, (self.embed_dim//self.num_heads)).transpose(1,2) # batch_size num_heads seq_len head_dim
#         V = V.view(batch_size, seq_len_v, self.num_heads, (self.embed_dim//self.num_heads)).transpose(1,2)
#         attn_output_weights = torch.matmul(Q / ((self.embed_dim) ** 0.5 ), K.transpose(-2, -1)).to(self.device)  # [batch_size, num_heads, seq_len, seq_len]
#         if key_padding_mask is not None:
#             src_key_padding_mask = key_padding_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]
#             # attn_output_weights = attn_output_weights.masked_fill(src_key_padding_mask !=0 , float('-inf'))  # Apply mask by setting scores to -inf
#             attn_output_weights = attn_output_weights + src_key_padding_mask # Apply mask by setting scores to -inf
#         else:
#             pass
#         if attn_mask is not None:
#             attn_output_weights = attn_output_weights + attn_mask.unsqueeze(0).unsqueeze(1)

#         attn_output_weights = F.softmax(attn_output_weights, dim=-1)
#         if self.training:
#             attn_output_weights = self.dropout(attn_output_weights)
#         # Apply attention weights to value
#         attn_output = torch.matmul(attn_output_weights, V)  # [batch_size, num_heads, seq_len, embed_dim // num_heads]
#         # # Transpose and reshape to restore original shape
#         attn_output = attn_output.transpose(1, 2)  # [batch_size, seq_len, num_heads, embed_dim // num_heads]
#         attn_output = attn_output.contiguous().view(batch_size, seq_len_q, self.embed_dim)  # [batch_size, seq_len, embed_dim] ##########################
#         # # Apply output projection
#         # attn_output = self.out_proj(attn_output).transpose(0,1)

In [10]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention sub-layer with dropout, residual connection and LayerNorm.

    Args:
        nhead: number of attention heads.
        d_model: size of the input and output feature axis.
        d_k: per-head size of the query/key projections.
        d_v: per-head size of the value projection.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, nhead, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.nhead = nhead
        self.d_k = d_k
        self.d_v = d_v
        # Bias-free projections from d_model to all heads at once, and back.
        self.w_qs = nn.Linear(d_model, nhead * d_k, bias=False)
        self.w_ks = nn.Linear(d_model, nhead * d_k, bias=False)
        self.w_vs = nn.Linear(d_model, nhead * d_v, bias=False)
        self.fc = nn.Linear(nhead * d_v, d_model, bias=False)
        # Scores are divided by sqrt(d_k) inside the attention module.
        self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)

        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` to ``k``/``v`` and return ``(output, attention)``.

        Args:
            q, k, v: batch-first inputs ``(batch, length, d_model)``; the batch
                size is taken from ``q``.
            mask: optional mask; a head axis is inserted at position 1 before
                it is handed to the attention module, which suppresses the
                entries where the mask equals 0.

        Returns:
            The normalised output ``(batch, len_q, d_model)`` and the attention
            weights returned by the attention module.
        """
        batch = q.size(0)
        len_q = q.size(1)
        skip = q  # kept for the residual connection

        def split_heads(projection, tensor, head_dim):
            # (batch, length, nhead * head_dim) -> (batch, nhead, length, head_dim)
            projected = projection(tensor).view(batch, tensor.size(1), self.nhead, head_dim)
            return projected.transpose(1, 2)

        q_heads = split_heads(self.w_qs, q, self.d_k)
        k_heads = split_heads(self.w_ks, k, self.d_k)
        v_heads = split_heads(self.w_vs, v, self.d_v)

        # Insert a singleton head axis so the mask broadcasts over all heads.
        head_mask = None if mask is None else mask.unsqueeze(1)

        context, attn = self.attention(q_heads, k_heads, v_heads, mask=head_mask)

        # Move the head axis back and concatenate the heads:
        # (batch, nhead, len_q, d_v) -> (batch, len_q, nhead * d_v).
        # view() needs contiguous memory, hence the contiguous() call.
        merged = context.transpose(1, 2).contiguous().view(batch, len_q, -1)

        # Project back to d_model, apply dropout, then add & norm.
        projected = self.dropout(self.fc(merged))
        return self.layer_norm(projected + skip), attn


class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(q @ k^T / temperature) @ v``.

    Args:
        temperature: divisor applied to the queries before the dot product.
        attn_dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        """Compute attention for 4-D inputs whose last two axes are (length, features).

        Args:
            q, k, v: tensors whose axes 2 and 3 are the length and feature axes.
            mask: optional tensor broadcastable to the score tensor; entries
                equal to 0 are excluded from attention.

        Returns:
            ``(output, attn)``: the weighted values and the attention weights
            (after dropout).
        """
        # q @ k^T over the last two axes, scaled by the temperature.
        scores = torch.matmul(q / self.temperature, k.transpose(2, 3))

        if mask is not None:
            # Keep the original -1e9 wherever it is representable, but clamp it
            # to the dtype's finite minimum so that float16 scores do not
            # overflow inside masked_fill.
            fill_value = max(-1e9, torch.finfo(scores.dtype).min)
            scores = scores.masked_fill(mask == 0, fill_value)

        # Normalise over the key axis, then drop out attention weights.
        attn = self.dropout(F.softmax(scores, dim=-1))
        output = torch.matmul(attn, v)

        return output, attn


class PositionwiseFeedForward(nn.Module):
    """Feed-forward sub-layer: Linear -> ReLU -> Linear, dropout, residual add, LayerNorm.

    Args:
        d_in: size of the input and output feature axis.
        d_hid: size of the hidden layer between the two linear maps.
        dropout: dropout probability applied before the residual add.
    """

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_in, d_hid)  # expands d_in -> d_hid
        self.w_2 = nn.Linear(d_hid, d_in)  # contracts d_hid -> d_in
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``LayerNorm(dropout(w_2(relu(w_1(x)))) + x)``."""
        hidden = F.relu(self.w_1(x))
        projected = self.dropout(self.w_2(hidden))
        # Add & norm: the untouched input is the residual branch.
        return self.layer_norm(projected + x)


class EncoderLayer(nn.Module):
    """One encoder block: multi-head self-attention followed by a feed-forward sub-layer."""

    def __init__(self, d_model, dim_feedforward, nhead, d_k, d_v, dropout=0.1):
        super().__init__()
        # Self-attention sub-layer.
        self.slf_attn = MultiHeadAttention(nhead, d_model, d_k, d_v, dropout=dropout)
        # Position-wise feed-forward sub-layer.
        self.pos_ffn = PositionwiseFeedForward(d_model, dim_feedforward, dropout=dropout)

    def forward(self, enc_input, slf_attn_mask=None):
        """Run the block and return ``(enc_output, enc_slf_attn)``.

        ``enc_input`` serves as query, key and value of the self-attention;
        ``slf_attn_mask`` is forwarded to it unchanged.
        """
        attended, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        return self.pos_ffn(attended), enc_slf_attn


class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward."""

    def __init__(self, d_model, dim_feedforward, nhead, d_k, d_v, dropout=0.1):
        super().__init__()
        # Self-attention over the decoder input (masked via slf_attn_mask).
        self.slf_attn = MultiHeadAttention(nhead, d_model, d_k, d_v, dropout=dropout)
        # Attention from the decoder state to the encoder output.
        self.enc_attn = MultiHeadAttention(nhead, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, dim_feedforward, dropout=dropout)

    def forward(self, dec_input, enc_output, slf_attn_mask=None, dec_enc_attn_mask=None):
        """Run the block and return ``(dec_output, dec_slf_attn, dec_enc_attn)``."""
        # Stage 1: the decoder input attends to itself.
        self_attended, dec_slf_attn = self.slf_attn(
            dec_input, dec_input, dec_input, mask=slf_attn_mask)
        # Stage 2: queries come from the decoder, keys and values from the encoder.
        cross_attended, dec_enc_attn = self.enc_attn(
            self_attended, enc_output, enc_output, mask=dec_enc_attn_mask)
        # Stage 3: position-wise feed-forward.
        dec_output = self.pos_ffn(cross_attended)
        return dec_output, dec_slf_attn, dec_enc_attn


class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding stored as a ``(1, n_position, d_hid)`` buffer.

    NOTE(review): this notebook defines another class with the same name in an
    earlier cell; whichever definition is executed last is the one in effect.

    Args:
        d_hid: size of each embedding vector.
        n_position: number of positions the table covers.
    """

    def __init__(self, d_hid, n_position=200):
        super().__init__()
        # Registered as a buffer, so it is part of the module state but is
        # not a parameter and therefore not touched by optimizer steps.
        table = self._get_sinusoid_encoding_table(n_position, d_hid)
        self.register_buffer('pos_table', table)

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        """Build the table as a float tensor of shape ``(1, n_position, d_hid)``."""
        # TODO: make it with torch instead of numpy
        # One divisor per embedding dimension. 2 * (dim // 2) runs
        # 0, 0, 2, 2, 4, 4, ... so each sin/cos pair shares a frequency.
        divisors = [np.power(10000, 2 * (dim // 2) / d_hid) for dim in range(d_hid)]

        # Row p holds the angles of position p for every embedding dimension.
        angles = np.array([[pos / divisor for divisor in divisors]
                           for pos in range(n_position)])

        angles[:, 0::2] = np.sin(angles[:, 0::2])  # even dimensions
        angles[:, 1::2] = np.cos(angles[:, 1::2])  # odd dimensions

        # Convert to a float tensor and add a leading axis of size 1.
        return torch.FloatTensor(angles).unsqueeze(0)

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to ``x``."""
        # Slice to the sequence length of x; clone().detach() yields a copy
        # that shares neither memory nor gradient history with the buffer.
        window = self.pos_table[:, :x.size(1)]
        return x + window.clone().detach()


class Encoder(nn.Module):
    """Encoder stack: embedding + positional encoding, then ``n_layers`` encoder layers.

    Args:
        n_src_vocab: size of the source vocabulary (rows of the embedding table).
        d_word_vec: size of each word embedding.
        n_layers: number of stacked ``EncoderLayer`` modules.
        nhead, d_k, d_v, d_model, dim_feedforward: forwarded to every layer.
        pad_idx: vocabulary index used as ``padding_idx`` of the embedding.
        dropout: dropout probability used on the embeddings and in every layer.
        n_position: number of positions covered by the positional encoding.
    """

    def __init__(
            self, n_src_vocab, d_word_vec, n_layers, nhead, d_k, d_v,
            d_model, dim_feedforward, pad_idx, dropout=0.1, n_position=200):
        super().__init__()
        # Embedding table of shape (n_src_vocab, d_word_vec).
        self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, padding_idx=pad_idx)
        self.position_enc = PositionalEncoding(d_word_vec, n_position=n_position)
        self.dropout = nn.Dropout(p=dropout)
        # Stack of identical encoder layers.
        layers = [
            EncoderLayer(d_model, dim_feedforward, nhead, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)
        ]
        self.layer_stack = nn.ModuleList(layers)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(self, src_seq, src_mask, return_attns=False):
        """Encode ``src_seq``.

        Returns:
            ``(enc_output, attention_list)`` when ``return_attns`` is true,
            otherwise the one-element tuple ``(enc_output,)``.
        """
        attn_per_layer = []

        # Word embedding + positional encoding, then dropout.
        embedded = self.src_word_emb(src_seq)
        enc_output = self.dropout(self.position_enc(embedded))

        # Debug output of the raw word embeddings (kept as part of the behaviour).
        print('encoder embedding')
        print(embedded)

        enc_output = self.layer_norm(enc_output)

        # Pass through every encoder layer in order.
        for layer in self.layer_stack:
            enc_output, layer_attn = layer(enc_output, slf_attn_mask=src_mask)
            if return_attns:
                attn_per_layer.append(layer_attn)

        if return_attns:
            return enc_output, attn_per_layer
        return (enc_output,)


class Decoder(nn.Module):
    """Decoder stack: embedding + positional encoding, then ``n_layers`` decoder layers.

    Args:
        n_trg_vocab: size of the target vocabulary (rows of the embedding table).
        d_word_vec: size of each word embedding.
        n_layers: number of stacked ``DecoderLayer`` modules.
        nhead, d_k, d_v, d_model, dim_feedforward: forwarded to every layer.
        pad_idx: vocabulary index used as ``padding_idx`` of the embedding.
        n_position: number of positions covered by the positional encoding.
        dropout: dropout probability used on the embeddings and in every layer.
    """

    def __init__(
            self, n_trg_vocab, d_word_vec, n_layers, nhead, d_k, d_v,
            d_model, dim_feedforward, pad_idx, n_position=200, dropout=0.1):
        super().__init__()
        # Embedding table over the target vocabulary.
        self.trg_word_emb = nn.Embedding(n_trg_vocab, d_word_vec, padding_idx=pad_idx)
        self.position_enc = PositionalEncoding(d_word_vec, n_position=n_position)
        self.dropout = nn.Dropout(p=dropout)
        # Stack of identical decoder layers.
        layers = [
            DecoderLayer(d_model, dim_feedforward, nhead, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)
        ]
        self.layer_stack = nn.ModuleList(layers)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(self, trg_seq, trg_mask, enc_output, src_mask, return_attns=False):
        """Decode ``trg_seq`` against ``enc_output``.

        Returns:
            ``(dec_output, self_attention_list, enc_dec_attention_list)`` when
            ``return_attns`` is true, otherwise the one-element tuple
            ``(dec_output,)``.
        """
        self_attn_per_layer = []
        cross_attn_per_layer = []

        # Word embedding + positional encoding, then dropout.
        embedded = self.trg_word_emb(trg_seq)
        dec_output = self.dropout(self.position_enc(embedded))

        # Debug output of the raw word embeddings (kept as part of the behaviour).
        print('decoder embedding:')
        print(embedded)

        dec_output = self.layer_norm(dec_output)

        # Every layer receives the encoder output alongside the running decoder state.
        for layer in self.layer_stack:
            dec_output, layer_self_attn, layer_cross_attn = layer(
                dec_output, enc_output, slf_attn_mask=trg_mask, dec_enc_attn_mask=src_mask)
            if return_attns:
                self_attn_per_layer.append(layer_self_attn)
                cross_attn_per_layer.append(layer_cross_attn)

        if return_attns:
            return dec_output, self_attn_per_layer, cross_attn_per_layer
        return (dec_output,)


# def get_pad_mask(seq, pad_idx):
#     # (batch, seqlen) -> (batch, 1, seqlen)
#     # pad_idx是填充的索引值
#     # 输出是(batch, 1, seqlen)，填充位置被标记为false
#     return (torch.tensor(seq != pad_idx)).unsqueeze(-2)  # (batch, 1, seqlen)


# def get_subsequent_mask(seq):
#     ''' For masking out the subsequent info. '''
#     sz_b, len_s = seq.size()
#     # torch.triu(diagonal=1)保留矩阵上三角部分，其余部分(包括对角线)定义为0。
#     subsequent_mask = (1 - torch.triu(
#         torch.ones((1, len_s, len_s), device=seq.device), diagonal=1)).bool()
#     return subsequent_mask





        # self.src_pad_idx, self.trg_pad_idx = src_pad_idx, trg_pad_idx
        # # mask
        # src_mask = get_pad_mask(src_seq, self.src_pad_idx)
        # trg_mask = get_pad_mask(
        #     trg_seq, self.trg_pad_idx) & get_subsequent_mask(trg_seq)
# # Encoder 创建类的实例
#         self.encoder = Encoder(
#             n_src_vocab=n_src_vocab, n_position=n_position,
#             d_word_vec=d_word_vec, d_model=d_model, dim_feedforward=dim_feedforward,
#             n_layers=num_encoder_layers, nhead=nhead, d_k=d_k, d_v=d_v,
#             pad_idx=src_pad_idx, dropout=dropout)

#         # Decoder 创建类的实例
#         self.decoder = Decoder(
#             n_trg_vocab=n_trg_vocab, n_position=n_position,
#             d_word_vec=d_word_vec, d_model=d_model, dim_feedforward=dim_feedforward,
#             n_layers=num_decoder_layers, nhead=nhead, d_k=d_k, d_v=d_v,
#             pad_idx=trg_pad_idx, dropout=dropout)

#         # 最后的linear输出层
#         self.trg_word_prj = nn.Linear(d_model, n_trg_vocab, bias=False)


                #  n_src_vocab, n_trg_vocab, src_pad_idx, trg_pad_idx,
                #  d_word_vec=8,
                #  d_k=64, d_v=64, n_position=200,
                #  trg_emb_prj_weight_sharing=False, emb_src_trg_weight_sharing=False):

                
        # n_src_vocab是源词汇索引表的大小
        # n_trg_vocab是目的词汇索引表的大小
        # src_pad_idx是源词汇索引表中需要补充padding对应的索引的值
        # trg_pad_idx是目的词汇索引表中需要补充padding对应的索引的值

        # d_word_vec是单词向量的维度
        # n_layers是有几个编码层，有几个解码层

        # d_k nhead*d_k是多头自注意力里面矩阵乘法输出的维度
        # d_v nhead*d_v是多头自注意力里面矩阵乘法输出的维度

        # n_position是最大的单词位置

        # trg_emb_prj_weight_sharing是否共享权重
        # emb_src_trg_weight_sharing是否共享权重

        
        # self.x_logit_scale = 1.
        # if trg_emb_prj_weight_sharing:
        #     # Share the weight between target word embedding & last dense layer
        #     self.trg_word_prj.weight = self.decoder.trg_word_emb.weight
        #     self.x_logit_scale = (d_model ** -0.5)

        # if emb_src_trg_weight_sharing:
        #     self.encoder.src_word_emb.weight = self.decoder.trg_word_emb.weight

                # enc_output, *_ = self.encoder(src_seq, src_mask)
        # dec_output, *_ = self.decoder(trg_seq, trg_mask, enc_output, src_mask)

        # # ==========================================================
        # print('dec_output:')
        # print(dec_output.shape)
        # print(dec_output)
        # # ==========================================================

        # # final linear layer得到logit vector最后的一个线性层，输出通道为输出词汇表大小
        # seq_logit = self.trg_word_prj(
        #     dec_output) * self.x_logit_scale  # x_logit_scale是缩放
        # # print(seq_logit.shape)
        # seq_logit = F.softmax(seq_logit, dim=2)  # 最后一个softmax
        # # 应该最后再加一个softmax
        # return seq_logit.view(-1, seq_logit.size(2))

                # src_seq输入序列张量,(batchsize,seqlen)

In [None]:
        # attn_output, attn_output_weights = F.multi_head_attention_forward(
        #     query, key, value, self.embed_dim, self.num_heads,
        #     self.in_proj_weight, self.in_proj_bias,
        #     self.bias_k, self.bias_v, self.add_zero_attn,
        #     self.dropout, self.out_proj.weight, self.out_proj.bias,
        #     training=False,
        #     key_padding_mask=key_padding_mask, need_weights=need_weights,
        #     attn_mask=attn_mask,
        #     average_attn_weights=average_attn_weights,
        #     is_causal=is_causal)
        # # print('attn_output:')
        # # print(attn_output)

In [2]:
import torch
import torch.nn as nn
# W_q = nn.Linear(512, 2048, bias=True)
# x = torch.rand((10,512))
# print(W_q.weight.shape)
# print(x.shape)
# print(W_q(x).shape)
# K = torch.tensor([[1,2,3,0,0],[1,5,4,6,0]])
# key_masks = torch.sign(torch.abs(K))
# print(key_masks)
# key_masks = torch.unsqueeze(key_masks, 1)
# print(key_masks)
# key_masks = key_masks.repeat(1, 5, 1)
# print(key_masks)
# diag_vals = torch.ones((3,3))

# tril = torch.tril(diag_vals, diagonal=0)
# print(tril)

# One-hot encode a (2, 4) batch of class labels into an (8, 10) float matrix.
# one_hot derives the row count from the labels themselves, replacing the
# hard-coded 2 * 4 buffer and the legacy `.data` access.
label = torch.tensor([[1,2,5,8],[1,5,3,4]])
y_onehot = nn.functional.one_hot(label.reshape(-1), num_classes=10).float()
print(y_onehot)

tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])
