In [1]:
import torch
import torch.nn as nn

# 定义输入维度和输出维度
input_dim = 4  # 输入特征数量
dim_k = 3      # 输出特征数量

# 创建线性层
linear_layer = nn.Linear(4, 3)

# 创建一个输入张量（假设有两个样本）
input_tensor = torch.tensor([[1.0, 2.0, 3.0, 4.0], 
                              [5.0, 6.0, 7.0, 8.0]])

# 前向传播
output_tensor = linear_layer(input_tensor)

print("Output:\n", output_tensor)

Output:
 tensor([[-0.9007,  1.2370,  0.9771],
        [-0.1645,  3.6467,  0.6665]], grad_fn=<AddmmBackward0>)


In [9]:
from math import sqrt
import torch
from torch import nn
import numpy as np
import math


class Self_Attention(nn.Module):
    # input : batch_size * seq_len * input_dim
    # q : batch_size * input_dim * dim_k
    # k : batch_size * input_dim * dim_k
    # v : batch_size * input_dim * dim_v
    def __init__(self,input_dim,dim_k,dim_v):
        super(Self_Attention,self).__init__()
        self.q = nn.Linear(input_dim,dim_k)
        self.k = nn.Linear(input_dim,dim_k)
        self.v = nn.Linear(input_dim,dim_v)
        self._norm_fact = 1 / sqrt(dim_k)
        
    
    def forward(self,x):
        Q = self.q(x) # Q: batch_size * seq_len * dim_k
        K = self.k(x) # K: batch_size * seq_len * dim_k
        V = self.v(x) # V: batch_size * seq_len * dim_v
         
        atten = nn.Softmax(dim=-1)(torch.bmm(Q,K.permute(0,2,1))) * self._norm_fact # Q * K.T() # batch_size * seq_len * seq_len
        
        output = torch.bmm(atten,V) # Q * K.T() * V # batch_size * seq_len * dim_v
        
        return output

class Mutihead_Attention(nn.Module):
    def __init__(self,d_model,dim_k,dim_v,n_heads):
        super(Mutihead_Attention, self).__init__()
        self.dim_v = dim_v
        self.dim_k = dim_k
        self.n_heads = n_heads

        self.q = nn.Linear(d_model,dim_k)
        self.k = nn.Linear(d_model,dim_k)
        self.v = nn.Linear(d_model,dim_v)

        self.o = nn.Linear(dim_v,d_model)
        self.norm_fact = 1 / math.sqrt(d_model)

    def generate_mask(self,dim):
        # 此处是 sequence mask ，防止 decoder窥视后面时间步的信息。
        # padding mask 在数据输入模型之前完成。
        matirx = np.ones((dim,dim))
        mask = torch.Tensor(np.tril(matirx))

        return mask==1

    def forward(self,x,y,requires_mask=False):
        assert self.dim_k % self.n_heads == 0 and self.dim_v % self.n_heads == 0
        # size of x : [batch_size * seq_len * batch_size]
        # 对 x 进行自注意力
        Q = self.q(x).reshape(-1,x.shape[0],x.shape[1],self.dim_k // self.n_heads) # n_heads * batch_size * seq_len * dim_k
        K = self.k(x).reshape(-1,x.shape[0],x.shape[1],self.dim_k // self.n_heads) # n_heads * batch_size * seq_len * dim_k
        V = self.v(y).reshape(-1,y.shape[0],y.shape[1],self.dim_v // self.n_heads) # n_heads * batch_size * seq_len * dim_v
        
        # print("Attention V shape : {}".format(V.shape))
        attention_score = torch.matmul(Q,K.permute(0,1,3,2)) * self.norm_fact
        
        if requires_mask:
            mask = self.generate_mask(x.shape[1])
            # masked_fill 函数中，对Mask位置为True的部分进行Mask
            attention_score.masked_fill(mask,value=float("-inf")) # 注意这里的小Trick，不需要将Q,K,V 分别MASK,只MASKSoftmax之前的结果就好了
        print("V = {}".format(V.shape))
        print("attention_score = {}".format(attention_score.shape))
        output1 = torch.matmul(attention_score,V)
        print("output1 = {}".format(output1.shape))
        output = output1.reshape(y.shape[0],y.shape[1],-1)
        print("output = {}".format(output.shape))
        output = self.o(output)
        print("output = {}".format(output.shape))
        return output

class Self_Attention_Muti_Head(nn.Module):
    # input : batch_size * seq_len * input_dim
    # q : batch_size * input_dim * dim_k
    # k : batch_size * input_dim * dim_k
    # v : batch_size * input_dim * dim_v
    def __init__(self,input_dim,dim_k,dim_v,nums_head):
        super(Self_Attention_Muti_Head,self).__init__()
        assert dim_k % nums_head == 0
        assert dim_v % nums_head == 0
        self.q = nn.Linear(input_dim,dim_k)
        self.k = nn.Linear(input_dim,dim_k)
        self.v = nn.Linear(input_dim,dim_v)
        
        self.nums_head = nums_head
        self.dim_k = dim_k
        self.dim_v = dim_v
        self._norm_fact = 1 / sqrt(dim_k)
        
    
    def forward(self,x):
        Q = self.q(x).reshape(-1,x.shape[0],x.shape[1],self.dim_k // self.nums_head) 
        K = self.k(x).reshape(-1,x.shape[0],x.shape[1],self.dim_k // self.nums_head) 
        V = self.v(x).reshape(-1,x.shape[0],x.shape[1],self.dim_v // self.nums_head)
        print("input shape = {}".format(x.shape))
        print("Q shape = {}".format(Q.size()))
        print("K shape = {}".format(K.size()))

        atten = nn.Softmax(dim=-1)(torch.matmul(Q,K.permute(0,1,3,2))) # Q * K.T() # batch_size * seq_len * seq_len
        print("atten shape = {}".format(atten.shape))
        output1 = torch.matmul(atten,V)
        print("matmul shape = {}".format(output1.shape))

        output = output1.reshape(x.shape[0],x.shape[1],-1) # Q * K.T() * V # batch_size * seq_len * dim_v
        
        return output
    
x = torch.randn(4,3,10)
multi_attention = Self_Attention_Muti_Head(10,4,4,2)
res=multi_attention(x)
print(res.shape)

input shape = torch.Size([4, 3, 10])
Q shape = torch.Size([2, 4, 3, 2])
K shape = torch.Size([2, 4, 3, 2])
atten shape = torch.Size([2, 4, 3, 3])
matmul shape = torch.Size([2, 4, 3, 2])
torch.Size([4, 3, 4])


In [15]:

# @Author:Yifx
# @Contact: Xxuyifan1999@163.com
# @Time:2021/9/16 20:02
# @Software: PyCharm

"""
文件说明：
"""

import torch
import torch.nn as nn
import numpy as np
import math


class Config(object):
    def __init__(self):
        self.vocab_size = 6

        self.d_model = 20
        self.n_heads = 2

        assert self.d_model % self.n_heads == 0
        dim_k  = self.d_model // self.n_heads
        dim_v = self.d_model // self.n_heads



        self.padding_size = 30
        self.UNK = 5
        self.PAD = 4

        self.N = 6
        self.p = 0.1

config = Config()


class Embedding(nn.Module):
    def __init__(self,vocab_size):
        super(Embedding, self).__init__()
        # 一个普通的 embedding层，我们可以通过设置padding_idx=config.PAD 来实现论文中的 padding_mask
        self.embedding = nn.Embedding(vocab_size,config.d_model,padding_idx=config.PAD)


    def forward(self,x):
        # 根据每个句子的长度，进行padding，短补长截
        for i in range(len(x)):
            if len(x[i]) < config.padding_size:
                x[i].extend([config.UNK] * (config.padding_size - len(x[i]))) # 注意 UNK是你词表中用来表示oov的token索引，这里进行了简化，直接假设为6
            else:
                x[i] = x[i][:config.padding_size]
        x = self.embedding(torch.tensor(x)) # batch_size * seq_len * d_model
        return x



class Positional_Encoding(nn.Module):

    def __init__(self,d_model):
        super(Positional_Encoding,self).__init__()
        self.d_model = d_model


    def forward(self,seq_len,embedding_dim):
        positional_encoding = np.zeros((seq_len,embedding_dim))
        for pos in range(positional_encoding.shape[0]):
            for i in range(positional_encoding.shape[1]):
                positional_encoding[pos][i] = math.sin(pos/(10000**(2*i/self.d_model))) if i % 2 == 0 else math.cos(pos/(10000**(2*i/self.d_model)))
        return torch.from_numpy(positional_encoding)


class Mutihead_Attention(nn.Module):
    def __init__(self,d_model,dim_k,dim_v,n_heads):
        super(Mutihead_Attention, self).__init__()
        self.dim_v = dim_v
        self.dim_k = dim_k
        self.n_heads = n_heads

        self.q = nn.Linear(d_model,dim_k)
        self.k = nn.Linear(d_model,dim_k)
        self.v = nn.Linear(d_model,dim_v)

        self.o = nn.Linear(dim_v,d_model)
        self.norm_fact = 1 / math.sqrt(d_model)

    def generate_mask(self,dim):
        # 此处是 sequence mask ，防止 decoder窥视后面时间步的信息。
        # padding mask 在数据输入模型之前完成。
        matirx = np.ones((dim,dim))
        mask = torch.Tensor(np.tril(matirx))

        return mask==1

    def forward(self,x,y,requires_mask=False):
        assert self.dim_k % self.n_heads == 0 and self.dim_v % self.n_heads == 0
        # size of x : [batch_size * seq_len * batch_size]
        # 对 x 进行自注意力
        Q = self.q(x).reshape(-1,x.shape[0],x.shape[1],self.dim_k // self.n_heads) # n_heads * batch_size * seq_len * dim_k
        K = self.k(x).reshape(-1,x.shape[0],x.shape[1],self.dim_k // self.n_heads) # n_heads * batch_size * seq_len * dim_k
        V = self.v(y).reshape(-1,y.shape[0],y.shape[1],self.dim_v // self.n_heads) # n_heads * batch_size * seq_len * dim_v
        # print("Attention V shape : {}".format(V.shape))
        attention_score = torch.matmul(Q,K.permute(0,1,3,2)) * self.norm_fact
        if requires_mask:
            mask = self.generate_mask(x.shape[1])
            # masked_fill 函数中，对Mask位置为True的部分进行Mask
            attention_score.masked_fill(mask,value=float("-inf")) # 注意这里的小Trick，不需要将Q,K,V 分别MASK,只MASKSoftmax之前的结果就好了
        output = torch.matmul(attention_score,V).reshape(y.shape[0],y.shape[1],-1)
        # print("Attention output shape : {}".format(output.shape))

        output = self.o(output)
        return output


class Feed_Forward(nn.Module):
    """
        先转成 低维度，通过 relu，再转成高维度。

        最后输入输出的维度相同
    """
    def __init__(self,input_dim,hidden_dim=2048):
        super(Feed_Forward, self).__init__()
        self.L1 = nn.Linear(input_dim,hidden_dim)
        self.L2 = nn.Linear(hidden_dim,input_dim)  # 将最后一维的 hidden_dim 转成 input_dim，即修改最后一维的维度。

    def forward(self,x):
        output = nn.ReLU()(self.L1(x))
        output = self.L2(output)
        return output

class Add_Norm(nn.Module):
    """
    做 batch 维度的归一化
    """
    def __init__(self):
        self.dropout = nn.Dropout(config.p)
        super(Add_Norm, self).__init__()

    def forward(self,x,sub_layer,**kwargs):
        sub_output = sub_layer(x,**kwargs)
        # print("{} output : {}".format(sub_layer,sub_output.size()))
        x = self.dropout(x + sub_output)

        layer_norm = nn.LayerNorm(x.size()[1:])
        out = layer_norm(x)
        return out


class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.positional_encoding = Positional_Encoding(config.d_model)
        self.muti_atten = Mutihead_Attention(config.d_model,config.dim_k,config.dim_v,config.n_heads)
        self.feed_forward = Feed_Forward(config.d_model)

        self.add_norm = Add_Norm()


    def forward(self,x): # batch_size * seq_len 并且 x 的类型不是tensor，是普通list

        x += self.positional_encoding(x.shape[1],config.d_model)
        # print("After positional_encoding: {}".format(x.size()))
        output = self.add_norm(x,self.muti_atten,y=x)
        output = self.add_norm(output,self.feed_forward)

        return output

# 在 Decoder 中，Encoder的输出作为Query和KEy输出的那个东西。即 Decoder的Input作为V。此时是可行的
# 因为在输入过程中，我们有一个padding操作，将Inputs和Outputs的seq_len这个维度都拉成一样的了
# 我们知道，QK那个过程得到的结果是 batch_size * seq_len * seq_len .既然 seq_len 一样，那么我们可以这样操作
# 这样操作的意义是，Outputs 中的 token 分别对于 Inputs 中的每个token作注意力

class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.positional_encoding = Positional_Encoding(config.d_model)
        self.muti_atten = Mutihead_Attention(config.d_model,config.dim_k,config.dim_v,config.n_heads)
        self.feed_forward = Feed_Forward(config.d_model)
        self.add_norm = Add_Norm()

    def forward(self,x,encoder_output): # batch_size * seq_len 并且 x 的类型不是tensor，是普通list
        # print(x.size())
        x += self.positional_encoding(x.shape[1],config.d_model)
        # print(x.size())
        # 第一个 sub_layer
        output = self.add_norm(x,self.muti_atten,y=x,requires_mask=True)
        # 第二个 sub_layer
        output = self.add_norm(x,self.muti_atten,y=encoder_output,requires_mask=True)
        # 第三个 sub_layer
        output = self.add_norm(output,self.feed_forward)
        return output

class Transformer_layer(nn.Module):
    def __init__(self):
        super(Transformer_layer, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def forward(self,x):
        x_input,x_output = x
        encoder_output = self.encoder(x_input)
        decoder_output = self.decoder(x_output,encoder_output)
        return (encoder_output,decoder_output)

class Transformer(nn.Module):
    def __init__(self,N,vocab_size,output_dim):
        super(Transformer, self).__init__()
        self.embedding_input = Embedding(vocab_size=vocab_size)
        self.embedding_output = Embedding(vocab_size=vocab_size)

        self.output_dim = output_dim
        self.linear = nn.Linear(config.d_model,output_dim)
        self.softmax = nn.Softmax(dim=-1)
        self.model = nn.Sequential(*[Transformer_layer() for _ in range(N)])


    def forward(self,x):
        x_input , x_output = x
        x_input = self.embedding_input(x_input)
        x_output = self.embedding_output(x_output)

        _ , output = self.model((x_input,x_output))

        output = self.linear(output)
        output = self.softmax(output)

        return output

t=Transformer(1, 10000, 10000)
x = torch.randn(4,100,1)
res=t(x)

AttributeError: 'Config' object has no attribute 'dim_k'

In [10]:
import torch
from torch import nn

# 精度损失那么一点，没啥事
x = 2 * torch.rand([2,3,4], dtype=torch.bfloat16) - 1
y = x.to(torch.float16)

print(x)
print(y)
print(x.dtype)
relu = nn.ReLU()

# relu 是对每个元素进行分析的
z=relu(y)

print(z)
print(z.shape)
print(z)

tensor([[[-0.7578,  0.2109, -0.7031, -0.1797],
         [-0.3438, -0.7969,  0.9453,  0.3984],
         [-0.0312,  0.5625,  0.2344,  0.8750]],

        [[-0.8281,  0.4922,  0.5469, -0.7031],
         [-0.9453,  0.7656, -0.0312,  0.9531],
         [-0.6484,  0.5078,  0.9766,  0.0781]]], dtype=torch.bfloat16)
tensor([[[-0.7578,  0.2109, -0.7031, -0.1797],
         [-0.3438, -0.7969,  0.9453,  0.3984],
         [-0.0312,  0.5625,  0.2344,  0.8750]],

        [[-0.8281,  0.4922,  0.5469, -0.7031],
         [-0.9453,  0.7656, -0.0312,  0.9531],
         [-0.6484,  0.5078,  0.9766,  0.0781]]], dtype=torch.float16)
torch.bfloat16
tensor([[[0.0000, 0.2109, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.9453, 0.3984],
         [0.0000, 0.5625, 0.2344, 0.8750]],

        [[0.0000, 0.4922, 0.5469, 0.0000],
         [0.0000, 0.7656, 0.0000, 0.9531],
         [0.0000, 0.5078, 0.9766, 0.0781]]], dtype=torch.float16)
torch.Size([2, 3, 4])
tensor([[[0.0000, 0.2109, 0.0000, 0.0000],
         [0.0000, 0.00

In [14]:
import torch
from torch import nn

# 精度损失那么一点，没啥事
x = 2 * torch.rand([1,2,3], dtype=torch.float16) - 1
print(x)
dropout =  nn.Dropout(0.4)
# dropout 对每个元素进行，会按照 0.4 的比例丢弃元素，剩余元素乘以 1/(1-0.4) 
y=dropout(x)
print(y)
print(y.shape)

tensor([[[0.1162, 0.6670, 0.1475],
         [0.9736, 0.3252, 0.1533]]], dtype=torch.float16)
tensor([[[0.1937, 0.0000, 0.0000],
         [1.6230, 0.5420, 0.0000]]], dtype=torch.float16)
torch.Size([1, 2, 3])


In [27]:
import torch
from torch import nn

# 精度损失那么一点，没啥事
#x = 2 * torch.rand([1,2,3], dtype=torch.float32) - 1
x=torch.ones([2,2,3], dtype=torch.float32)
print(x)
print(x.size()[1:])
layer_norm = nn.LayerNorm([2,3])
out=layer_norm(x)
print(out)
print(out.shape)



input_tensor = torch.tensor([[1.0, 2.0, 3.0, 4.0],
                              [5.0, 6.0, 7.0, 8.0]])
layer_norm = nn.LayerNorm(4)
output_tensor = layer_norm(input_tensor)

print(output_tensor)

"""
nn.LayerNorm 看懂了，就是为了做归一化的，就是所选的元素集合的均值为 0，方差为 1
如果 nn.LayerNorm([3,4]) 则代表，输入的是 [batch_size,3,4]，即一个 batch 中的数据做归一化
如果 nn.LayerNorm([4]) 则代表，输入的是 [batch_size,3,4]，即最后一维做数据的归一化
"""


tensor([[[1., 1., 1.],
         [1., 1., 1.]],

        [[1., 1., 1.],
         [1., 1., 1.]]])
torch.Size([2, 3])
tensor([[[0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.]]], grad_fn=<NativeLayerNormBackward0>)
torch.Size([2, 2, 3])
tensor([[-1.3416, -0.4472,  0.4472,  1.3416],
        [-1.3416, -0.4472,  0.4472,  1.3416]],
       grad_fn=<NativeLayerNormBackward0>)


In [32]:
# position 网络
import torch
import numpy as np
import math
class Positional_Encoding(nn.Module):
    def __init__(self,d_model):
        super(Positional_Encoding,self).__init__()
        self.d_model = d_model


    def forward(self,seq_len,embedding_dim):
        positional_encoding = np.zeros((seq_len,embedding_dim))
        for pos in range(positional_encoding.shape[0]):
            for i in range(positional_encoding.shape[1]):
                positional_encoding[pos][i] = math.sin(pos/(10000**(2*i/self.d_model))) if i % 2 == 0 else math.cos(pos/(10000**(2*i/self.d_model)))
        return torch.from_numpy(positional_encoding)

pe = Positional_Encoding(1024)
out=pe(10, 1000)
print(out.shape)
print(out)

torch.Size([10, 1000])
tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.5522e-01,  8.2186e-01,  ...,  1.0000e+00,
          1.5963e-08,  1.0000e+00],
        [ 9.0930e-01, -3.8347e-01,  9.3641e-01,  ...,  1.0000e+00,
          3.1927e-08,  1.0000e+00],
        ...,
        [ 6.5699e-01,  8.2982e-01,  4.5239e-01,  ...,  1.0000e+00,
          1.1174e-07,  1.0000e+00],
        [ 9.8936e-01, -3.3935e-03,  9.9067e-01,  ...,  1.0000e+00,
          1.2771e-07,  1.0000e+00],
        [ 4.1212e-01, -8.3358e-01,  6.7637e-01,  ...,  1.0000e+00,
          1.4367e-07,  1.0000e+00]], dtype=torch.float64)
