In [2]:
import torch 

In [4]:
from model import PositionalEncoding

# Positional-encoding demo: push a random batch through PositionalEncoding
# and compare the tensor shape before and after.
# Hyper-parameters (num_heads is not used in this cell).
d_model, max_len, num_heads = 512, 100, 8

# Positional-encoding module; argument order here is (max_len, d_model).
pos_encoder = PositionalEncoding(max_len, d_model)

# Random stand-in input with shape (5, max_len, d_model).
input_sequence = torch.randn(5, max_len, d_model)
# NOTE(review): the label text reads "positional encoding of the input
# sequence", but the value printed underneath is the tensor's shape.
print("输入序列的位置编码:", input_sequence.shape, sep="\n")

# Apply the positional encoding and report the resulting shape.
output_sequence = pos_encoder(input_sequence)
print("输出序列的位置编码:", output_sequence.shape, sep="\n")

输入序列的位置编码:
torch.Size([5, 100, 512])
输出序列的位置编码:
torch.Size([5, 100, 512])


In [5]:
# Multi-head attention demo: run the attention module on a random batch,
# feeding the same tensor into all three inputs.
from model import MutiHeadAttention

# Hyper-parameters (d_ff is not used in this cell).
d_model, max_len, num_heads, d_ff = 512, 100, 8, 2048

# Attention module; argument order here is (num_heads, d_model).
multihead_attn = MutiHeadAttention(num_heads, d_model)

# Random stand-in input with shape (5, max_len, d_model).
input_sequence = torch.randn(5, max_len, d_model)

# The same tensor is passed three times (presumably as query, key and
# value -- confirm the parameter order against model.py).
attention_output = multihead_attn(
    input_sequence,
    input_sequence,
    input_sequence,
)
print("attention_output shape:", attention_output.shape)

attention_output shape: torch.Size([5, 100, 512])


In [6]:
# Feed-forward demo: chain multi-head attention into the feed-forward block
# and compare the input shape with the final output shape.
from model import FeedForward

# Hyper-parameters.
d_model, max_len, num_heads, d_ff = 512, 100, 8, 2048

# Attention module. MutiHeadAttention is not imported in this cell; it must
# already be in the namespace from an earlier cell.
multihead_attn = MutiHeadAttention(num_heads, d_model)

# Feed-forward network; argument order here is (d_model, d_ff).
ff_network = FeedForward(d_model, d_ff)

# Random stand-in input with shape (5, max_len, d_model).
input_sequence = torch.randn(5, max_len, d_model)

# Step 1: attention with the same tensor for all three inputs.
attention_output = multihead_attn(
    input_sequence,
    input_sequence,
    input_sequence,
)

# Step 2: feed-forward network applied to the attention output.
output_ff = ff_network(attention_output)

print('input_sequence', input_sequence.shape)
print("output_ff", output_ff.shape)


input_sequence torch.Size([5, 100, 512])
output_ff torch.Size([5, 100, 512])


In [7]:
# Encoder-layer demo: build a single encoder layer and run one forward pass.
from model import encodelayer

# Hyper-parameters.
d_model, max_len, num_heads, d_ff = 512, 100, 8, 2048

# Encoder layer; argument order here is (num_heads, d_model, d_ff, 0.1).
# The trailing 0.1 is presumably the dropout probability -- confirm in model.py.
encoder_layer = encodelayer(num_heads, d_model, d_ff, 0.1)

# Random stand-in input with shape (1, max_len, d_model).
# input_sequence and encoder_output are read again by the decoder-layer cell.
input_sequence = torch.randn(1, max_len, d_model)

# Forward pass; the second argument is None (presumably "no mask" -- confirm).
encoder_output = encoder_layer(input_sequence, None)
print("encoder output shape:", encoder_output.shape)

encoder output shape: torch.Size([1, 100, 512])


In [8]:
# Decoder-layer demo: build a single decoder layer and run one forward pass.
from model import decodelayer

# Hyper-parameters, in order: model width, number of attention heads,
# feed-forward width, dropout probability, batch size, maximum sequence length.
d_model, num_heads, d_ff = 512, 8, 2048
dropout = 0.1
batch_size, max_len = 1, 100

# Decoder layer; argument order here is (num_heads, d_model, d_ff, dropout).
decoder_layer = decodelayer(num_heads, d_model, d_ff, dropout)

# Source mask: random boolean tensor of shape (batch_size, max_len, max_len),
# True wherever the uniform sample exceeds 0.5.
src_mask = torch.rand(batch_size, max_len, max_len).gt(0.5)

# Target mask: boolean tensor of shape (1, max_len, max_len) that is True
# strictly above the main diagonal (i.e. at future positions).
# NOTE(review): whether True means "blocked" or "visible" depends on how
# decodelayer applies the mask -- confirm against model.py.
tgt_mask = torch.ones(max_len, max_len).tril().eq(0).unsqueeze(0)

# Forward pass. input_sequence and encoder_output are not defined in this
# cell; they are taken from the encoder-layer cell executed before it.
output = decoder_layer(input_sequence, encoder_output, src_mask, tgt_mask)

# Report the output shape.
print("Output shape:", output.shape)

Output shape: torch.Size([1, 100, 512])


In [12]:
# Full-model demo: build the Transformer with the tokenizer's vocabulary size
# and run one forward pass on random token ids.
from transformers import AutoTokenizer
from model import Transformer

# Tokenizer loaded from a local directory. Both vocabulary sizes are taken
# from len(tokenizer); the earlier placeholder value of 500 was overwritten
# before any use and has been dropped.
tokenizer = AutoTokenizer.from_pretrained('./model_/gpt2_chinese')
src_vocab_size = len(tokenizer)
tgt_vocab_size = len(tokenizer)

# Model hyper-parameters.
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_len = 100
dropout = 0.1

# Positional argument order used here:
# (src_vocab_size, tgt_vocab_size, num_heads, d_model, d_ff, num_layers, max_len, dropout).
model = Transformer(src_vocab_size, tgt_vocab_size, num_heads, d_model, d_ff, num_layers, max_len, dropout)

# Random token ids with shape (batch_size=2, seq_length=max_len). Sampling
# starts at 1, so id 0 is never drawn (presumably because 0 is the padding
# id -- confirm against the tokenizer).
src_data = torch.randint(1, src_vocab_size, (2, max_len))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (2, max_len))  # (batch_size, seq_length)

# Forward pass; print the raw output tensor and then its shape.
output = model(src_data, tgt_data)
print(output)
print(output.shape)

tensor([[[ 0.0834, -0.4275, -0.5930,  ..., -0.3214, -0.0957, -0.3598],
         [-0.6972, -0.2875,  0.4555,  ...,  0.0044, -1.4032, -0.2224],
         [ 0.1854, -0.3504,  0.1302,  ...,  0.7794, -0.0074,  0.7875],
         ...,
         [ 0.3470, -1.0003,  0.0825,  ...,  0.7346,  0.1772, -0.2238],
         [-0.1555, -0.3822, -0.8029,  ...,  0.1747,  1.0066, -0.0496],
         [-0.6953,  1.3396, -0.1791,  ...,  0.0960,  0.6512,  0.0535]],

        [[-0.0422, -0.6675, -0.7893,  ..., -0.2648,  0.3255, -0.4737],
         [-0.3078, -0.2277,  0.1688,  ...,  0.9857, -0.4644, -0.6475],
         [-0.4555, -0.7379,  0.0341,  ..., -1.2845, -0.3576, -0.7086],
         ...,
         [ 0.3605, -0.5538, -0.1183,  ...,  0.7157,  0.5060, -0.1151],
         [-0.7584,  0.6960,  0.1792,  ...,  0.0642,  1.2483, -0.0780],
         [ 1.4671, -0.2962,  0.4607,  ...,  0.1683,  0.0286, -0.3764]]],
       grad_fn=<ViewBackward0>)
torch.Size([2, 100, 21129])


In [13]:

# Real source sentences and the corresponding target sentences.
src_texts = ["hello world", "how are you"]
tgt_texts = ["你好世界", "你好吗"]

# Tokenize both sides with identical settings: pad or truncate every sequence
# to max_len and return PyTorch tensors.
tokenize_kwargs = dict(padding="max_length", truncation=True, max_length=max_len, return_tensors="pt")
src_encoded = tokenizer(src_texts, **tokenize_kwargs)
tgt_encoded = tokenizer(tgt_texts, **tokenize_kwargs)

# Token-id matrices, each (batch_size, seq_len).
src_data = src_encoded['input_ids']
tgt_data = tgt_encoded['input_ids']

# Shifted decoder input: one CLS id per row, followed by the target ids with
# their last column dropped (the column count stays equal to tgt_data's).
num_sequences = src_data.shape[0]
cls_tokens = torch.full((num_sequences, 1), tokenizer.cls_token_id, dtype=torch.long)
tgt_input_data = torch.cat((cls_tokens, tgt_data[:, :-1]), dim=1)

# Labels: the target ids with their first column dropped (one column fewer
# than tgt_data).
tgt_labels = tgt_data[:, 1:]

# NOTE(review): tgt_input_data and tgt_labels are built here but neither is
# passed to the model below, which receives the unshifted tgt_data. They also
# differ in length (tgt_input_data keeps tgt_data's width, tgt_labels is one
# column shorter). The tokenizer may already prepend a special token, in which
# case the manual CLS column would duplicate it -- confirm the intended
# input/label alignment before using these tensors in a loss.

print(src_vocab_size, tgt_vocab_size)

# Show the token ids, run the model on them, and print the raw output.
print(src_data, tgt_data)
output = model(src_data, tgt_data)
print(output)


21129 21129
tensor([[ 101, 8701, 8572,  102,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [ 101, 9510, 8995, 8357,  102,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0, 