In [116]:
from io import open
import random

# 深度学习库pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
import math
import torch.nn.functional as F

# 用于绘制损失函数下降曲线
from matplotlib import pyplot as plt
%pdb on

Automatic pdb calling has been turned ON


# 模型构建

## 数据集选取

In [117]:
# 单行诗最大长度
MAX_LEN = 64
MIN_LEN = 5
# 禁用的字符，拥有以下符号的诗将被忽略
DISALLOWED_WORDS = ['（', '）', '(', ')', '__', '《', '》', '【', '】', '[', ']', '？', '；']

# 一首诗（一行）对应一个列表的元素
poetry = []

# 按行读取数据 poetry.txt
with open('./poetry.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
# 遍历处理每一条数据    
for line in lines:
    # 利用正则表达式拆分 标题 和 内容
    fields = line.split(":")
    # 跳过异常数据
    if len(fields) != 2:
        continue
    # 得到诗词内容（后面不需要标题）
    content = fields[1]
    # 过滤数据：跳过内容过长、过短、存在禁用符的诗词
    if len(content) > MAX_LEN - 2 or len(content) < MIN_LEN:
        continue
    if any(word in content for word in DISALLOWED_WORDS):
        continue
        
    poetry.append(content.replace('\n', '')) # 最后要记得删除换行符


In [118]:
for i in range(0, 10):
    print(poetry[i])
    
print(f"current_line_count = {len(poetry)}")

寒随穷律变，春逐鸟声开。初风飘带柳，晚雪间花梅。碧林青旧竹，绿沼翠新苔。芝田初雁去，绮树巧莺来。
晚霞聊自怡，初晴弥可喜。日晃百花色，风动千林翠。池鱼跃不同，园鸟声还异。寄言博通者，知予物外志。
夏律昨留灰，秋箭今移晷。峨嵋岫初出，洞庭波渐起。桂白发幽岩，菊黄开灞涘。运流方可叹，含毫属微理。
寒惊蓟门叶，秋发小山枝。松阴背日转，竹影避风移。提壶菊花岸，高兴芙蓉池。欲知凉气早，巢空燕不窥。
山亭秋色满，岩牖凉风度。疏兰尚染烟，残菊犹承露。古石衣新苔，新巢封古树。历览情无极，咫尺轮光暮。
慨然抚长剑，济世岂邀名。星旗纷电举，日羽肃天行。遍野屯万骑，临原驻五营。登山麾武节，背水纵神兵。在昔戎戈动，今来宇宙平。
翠野驻戎轩，卢龙转征旆。遥山丽如绮，长流萦似带。海气百重楼，岩松千丈盖。兹焉可游赏，何必襄城外。
玄兔月初明，澄辉照辽碣。映云光暂隐，隔树花如缀。魄满桂枝圆，轮亏镜彩缺。临城却影散，带晕重围结。驻跸俯九都，停观妖氛灭。
碧原开雾隰，绮岭峻霞城。烟峰高下翠，日浪浅深明。斑红妆蕊树，圆青压溜荆。迹岩劳傅想，窥野访莘情。巨川何以济，舟楫伫时英。
春蒐驰骏骨，总辔俯长河。霞处流萦锦，风前漾卷罗。水花翻照树，堤兰倒插波。岂必汾阴曲，秋云发棹歌。
current_line_count = 24375


In [119]:
from collections import Counter
# 最小词频
MIN_WORD_FREQUENCY = 8

# 统计词频，利用Counter可以直接按单个字符进行统计词频
counter = Counter()
for line in poetry:
    counter.update(line)
# 过滤掉低词频的词
tokens = [token for token, count in counter.items() if count >= MIN_WORD_FREQUENCY]
# 打印一下出现次数前5的字
for i, (token, count) in enumerate(counter.items()):
    print(token, "->",count)
    if i >= 4:
        break

寒 -> 2612
随 -> 1036
穷 -> 482
律 -> 118
变 -> 286


### 编码器

In [120]:
class Tokenizer:
    """
    词典编码器
    """
    UNKNOWN = "<unknown>"
    PAD = "<pad>"
    BOS = "<bos>" 
    EOS = "<eos>" 

    def __init__(self, tokens):
        # 补上特殊词标记：未知词标记、填充字符标记、开始标记、结束标记
        tokens = [Tokenizer.UNKNOWN, Tokenizer.PAD, Tokenizer.BOS, Tokenizer.EOS] + tokens
        # 词汇表大小
        self.dict_size = len(tokens)
        # 生成映射关系
        self.token_id = {} # 映射: 词 -> 编号
        self.id_token = {} # 映射: 编号 -> 词
        for idx, word in enumerate(tokens):
            self.token_id[word] = idx
            self.id_token[idx] = word
        
        # 各个特殊标记的编号id，方便其他地方使用
        self.unknown_id = self.token_id[Tokenizer.UNKNOWN]
        self.pad_id = self.token_id[Tokenizer.PAD]
        self.bos_id = self.token_id[Tokenizer.BOS]
        self.eos_id = self.token_id[Tokenizer.EOS]
    
    def id_to_token(self, token_id):
        """
        编号 -> 词
        """
        return self.id_token.get(token_id)

    def token_to_id(self, token):
        """
        词 -> 编号，取不到时给 UNKNOWN
        """
        return self.token_id.get(token, self.unknown_id)

    def encode(self, tokens):
        """
        词列表 -> <bos>编号 + 编号列表 + <eos>编号
        """
        token_ids = [self.bos_id, ] # 起始标记
        # 遍历，词转编号
        for token in tokens:
            token_ids.append(self.token_to_id(token))
        token_ids.append(self.eos_id) # 结束标记
        return token_ids

    def decode(self, token_ids):
        """
        编号列表 -> 词列表(去掉起始、结束标记)
        """
        tokens = []
        for idx in token_ids:
            # 跳过起始、结束标记
            if idx != self.bos_id and idx != self.eos_id:
                tokens.append(self.id_to_token(idx))
        return tokens
    
    def __len__(self):
        return self.dict_size

In [121]:
def index2onehot(word_ids, vocab_size):
    if word_ids.dim() == 1:
        # 一维情况：(seq_len,)
        onehot_tensor = torch.zeros(len(word_ids), vocab_size)
        for i, s in enumerate(word_ids): 
            onehot_tensor[i, s] = 1
    elif word_ids.dim() == 2:
        # 二维情况：(batch_size, seq_len)
        batch_size, seq_len = word_ids.size()
        onehot_tensor = torch.zeros(batch_size, seq_len, vocab_size, dtype=torch.float32)
        onehot_tensor.scatter_(2, word_ids.unsqueeze(2), 1)
    else:
        raise ValueError("word_ids must be a 1D or 2D tensor")
    return onehot_tensor

def onehot2index(word_ids):
    return torch.argmax(word_ids, dim=-1)

In [122]:
from torch.utils.data import TensorDataset
class MyDataset(TensorDataset):
    
    def __init__(self, data, tokenizer, v, max_len=64):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len  # 每条数据的最大长度
        self.v = v
        
    def __getitem__(self, index):
        line = self.data[index]
        word_ids = self.encode_pad_line(line)
        return torch.tensor(word_ids)
    
    def __len__(self):
        return len(self.data)
    
    def encode_pad_line(self, line):
        # 编码
        word_ids = self.tokenizer.encode(line)
        # 如果句子长度不足max_length，填充PAD
        if len(word_ids) <= self.max_len:
            word_ids = word_ids + [self.tokenizer.pad_id] * (self.max_len - len(word_ids))
        else:
            word_ids = word_ids[:self.max_len]
        return word_ids

## 模型类定义

### 嵌入层

In [123]:
class Embedding(nn.Module):
    def __init__(self, v, h, max_len, padidx = 0):
        # 调用父类的初始化方法，所有子类均需要该操作
        super().__init__()
        # 在子类初始化声明中需要定义其包含哪些基本层
        self.embedding = nn.Linear(v, h)
        self.h = h
        self.v = v
        self.max_len = max_len
        self.padidx = padidx

    def forward(self, src):
        # print(src.size())
        src = self.key_padding(src, self.max_len)
        onehot_tensor = index2onehot(src, self.v)
        # print(onehot_tensor.size())
        # 在forward方法中，我们声明输入张量如何经过这些基本层得到输出张量。
        return self.embedding(onehot_tensor)

    def key_padding(self, tokens, max_len = 64):
        # 如果句子长度不足max_length，填充PAD        
        tokens = F.pad(tokens, (self.padidx, (max_len - tokens.size()[1] - 1)))
        return tokens


In [124]:
class PositionalEncoding(nn.Module):
    def __init__(self, h, dropout=0.1, max_len=200):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, h)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, h, 2).float() * (-math.log(10000.0) / h))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # print(x.size(), self.pe.size())
        x = x + Variable(self.pe[:, :x.size(1)], requires_grad=False) 
        return self.dropout(x)

### 注意力模块

In [125]:
class Attention(nn.Module):
    def __init__(self, h, a, dropout=0.1, type = 'self'):
        '''
        h: 嵌入层维度
        a: 注意力头数
        d_k: 每个注意力头的第二个维度

        X: (s,h) ---Wq,Wk,Wv: (h, h//a) ---> Q,K,V: (s, h//a) 
            ---> softmax(Q*K.t / sqrt(d_k)) * V: (s, h//a)
            ---> output: (s, h)
        '''
        super().__init__()  # 注意这里的修正，使用super()而不是super.__init__()
        self.h = h
        self.a = a
        self.d_k = h // a
        self.types = type
        self.dropout = nn.Dropout(p=dropout)
        
        # 初始化Q, K, V的权重矩阵
        # 每个权重矩阵的维数是(s, h//a) 这里是(h, h)，是将每个头的相应矩阵拼接到一起了
        self.Wq = nn.Linear(h, h)
        self.Wk = nn.Linear(h, h)
        self.Wv = nn.Linear(h, h)
        
        # 缩放因子，用于缩放点积结果
        self.scale = 1 / math.sqrt(self.d_k)

    def forward(self, x, y = None, padding_mask=None, tgt_sequence_mask = None):
        """
        x: (batch_size, s, h)
        """
        batch_size = x.size(0)
        """
        Step #1 通过线性变换得到Q, K, V
        q,k,v: (batch_size, s, h) ---> (batch_size, s, a, d_k) ---> (batch_size, a, s, d_k)
        """
        if self.types == 'self':            # 自注意力机制，均来自输入x            
            assert y is None, ("Self Attention but different input for Q K V")
            q = k = v = x
        elif self.types == 'cross':         # 交叉注意力机制，q来自x，k v来自y
            assert y is not None, ("Cross Attention but the same input for Q K V")
            q = x
            k = v = y
        else: raise ValueError("Undefined Attention Type")

        q = self.Wq(q).view(batch_size, -1, self.a, self.d_k).transpose(1, 2)
        k = self.Wk(k).view(batch_size, -1, self.a, self.d_k).transpose(1, 2)
        v = self.Wv(v).view(batch_size, -1, self.a, self.d_k).transpose(1, 2)

        """
        Step#2 计算注意力分数
        x: (batch_size, s, h)
        k: (batch_size, a, s, d_k) ---> (batch_size, a, d_k, s)
        tgt_sequence_mask: (s, s) ---> (batch_size, a, s, s)
        padding_mask : (batch_size, s) ---> (batch_size, a, s, s)
        """
        scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        if padding_mask is not None:
            mask = padding_mask.view(batch_size, 1, 1, x.size(1)).expand(batch_size, self.a, x.size(1), x.size(1))
            if tgt_sequence_mask is not None: 
                s_mask = tgt_sequence_mask.view(1, 1, x.size(1), x.size(1)).   \
                expand(batch_size, self.a, -1, -1)
                mask = s_mask.logical_or(mask)
            # assert self.types == 'self'
            # print(mask.size(), scores.size())
            scores = scores.masked_fill(mask, float("-inf"))
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        output = torch.matmul(attention_weights, v)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.h)
        
        return output

### FeedForward

In [126]:
class FeedForward(nn.Module):
    def __init__(self, h, hiddenDim = None, outDim = None, dropout = 0.1, type = 'relu'):
        """
        x: (h, h) ---> x * W_1: (h, hiddenDim) ---> relu/gelu: (h, hiddenDim) ---> A' * W2: (h, outDim)
        W1: (h, hiddenDim)
        W2: (hiddenDim, outDim)
        默认hiddenDim = 4 * h, outDim = h
        """
        super().__init__()
        self.h = h
        if hiddenDim is None: hiddenDim = 4 * h
        if outDim is None: outDim = h
        self.W1 = nn.Linear(h, hiddenDim)
        self.dropout = nn.Dropout(dropout)
        self.W2 = nn.Linear(hiddenDim, outDim)
        self.types = type
    
    def forward(self, x):
        x = self.W1(x)
        if self.types == 'relu': x = F.relu(x)
        elif self.types == 'gelu': x = F.gelu(x)
        else: raise ValueError("Unsupported activation type")
        x = self.dropout(x)
        x = self.W2(x)
        return x

### LayerNorm

In [127]:
class LayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if self.elementwise_affine:
            self.weight = nn.Parameter(torch.ones(normalized_shape))
            self.bias = nn.Parameter(torch.zeros(normalized_shape))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
    def forward(self, input):
        return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps)

### Transformer Encoder和Decoder

包含从嵌入层进入attention block之后的所有流程：encoder decoder feedforward add&norm
对于encoder来讲，self-attention ---> add&norm ---> feedforward ---> add&norm
对于decoder来讲, self-attention ---> add&norm ---> cross-attention ---> add&norm ---> feedforward ---> add&norm


In [128]:
class TransformerEncoderDecoder(nn.Module):
    def __init__(self, h, a, num_encoder_layers, num_decoder_layers, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.encoders = nn.ModuleList([
            nn.ModuleList([
                Attention(h, a, dropout),
                LayerNorm((h,)),
                FeedForward(h, dropout = dropout),
                LayerNorm((h,))
            ]) for _ in range(num_encoder_layers)
        ])
        
        self.decoders = nn.ModuleList([
            nn.ModuleList([
                Attention(h, a, dropout),
                LayerNorm((h,)),
                Attention(h, a, dropout, type='cross'),
                LayerNorm((h,)),
                FeedForward(h, dropout = dropout),
                LayerNorm((h,))
            ]) for _ in range(num_decoder_layers)
        ])

    def forward(self, encoder_input, decoder_input, src_padding_mask=None, tgt_padding_mask = None, tgt_sequence_mask=None):
        global DEVICE        
        # print(src_padding_mask, tgt_sequence_mask)
        for enc in self.encoders:
            attention, norm1, ff, norm2 = enc
            encoder_input = norm1(attention(encoder_input, padding_mask=src_padding_mask) + encoder_input)
            encoder_input = norm2(ff(encoder_input) + encoder_input)

        for dec in self.decoders:
            self_attention, norm1, cross_attention, norm2, ff, norm3 = dec
            decoder_input = norm1(self_attention(decoder_input, padding_mask=tgt_padding_mask, \
                                                 tgt_sequence_mask = tgt_sequence_mask) + decoder_input)
            decoder_input = norm2(cross_attention(decoder_input, encoder_input, \
                                                  padding_mask=tgt_padding_mask, tgt_sequence_mask = tgt_sequence_mask) + decoder_input)
            decoder_input = norm3(ff(decoder_input) + decoder_input)
        return decoder_input        

### 预测层

In [129]:
class Prediction(nn.Module):
    def __init__(self, h, v):
        super().__init__()
        self.w = nn.Linear(h, v)

    def forward(self, x):
        return F.softmax(self.w(x), dim = -1)

### Transformer模型

In [130]:
# 定义Transformer模型
class Transformer(nn.Module):
    def __init__(self, v, h, a, num_encoder_layers, num_decoder_layers, dimFF, dropout, max_len, padidx):
        super().__init__()
        self.embedding = Embedding(v, h, max_len, padidx)
        self.pos_encoder = PositionalEncoding(h, dropout, max_len)
        self.pos_decoder = PositionalEncoding(h, dropout, max_len)
        self.transformer = TransformerEncoderDecoder(h, a, num_encoder_layers, num_decoder_layers, dimFF, dropout)
        self.predict = Prediction(h, v)
        self.max_len = max_len

    def forward(self, src, tgt, src_padding_mask = None, tgt_padding_mask = None, tgt_sequence_mask = None):
        # print(src.size())
        if src_padding_mask is None: 
            src_padding_mask = self.get_key_padding_mask(src).to(DEVICE)
        if tgt_padding_mask is None: 
            tgt_padding_mask = self.get_key_padding_mask(tgt).to(DEVICE)
        if tgt_sequence_mask is None: 
            tgt_sequence_mask = self.get_sequence_mask().to(DEVICE)
        # print(src_padding_mask.size(), tgt.size())
        src = self.embedding(src) * math.sqrt(self.embedding.h)
        tgt = self.embedding(tgt) * math.sqrt(self.embedding.h)
        src = self.pos_encoder(src)
        tgt = self.pos_decoder(tgt)
        output = self.transformer(src, tgt, src_padding_mask, tgt_padding_mask, tgt_sequence_mask)
        output = self.predict(output)
        return output
    
    # @staticmethod
    def get_sequence_mask(self):
        size = self.max_len
        return torch.triu(torch.full((size, size), True, device=DEVICE), diagonal=1)
    
    # @staticmethod
    def get_key_padding_mask(self, tokens):
        key_padding_mask = torch.zeros(tokens.size())
        key_padding_mask[tokens == Tokenizer.PAD] = True
        key_padding_mask = F.pad(key_padding_mask, (0, (self.max_len - tokens.size()[1])), "constant", True)
        # print(f"keypaddingmasksize{key_padding_mask}")
        return key_padding_mask

## 模型训练和预测

### 模型参数

In [131]:
tokenizer = Tokenizer(tokens)
v = len(tokenizer)
batch_size = 64
max_len = 64

# 创建数据集和数据加载器
dataset = MyDataset(poetry, tokenizer, v, max_len)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# 初始化模型、优化器和损失函数
h = 128
a = 4
num_encoder_layers = 3
num_decoder_layers = 3
dim_feedforward = 4 * h
dropout = 0.1

model = Transformer(v, h, a, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout, max_len, tokenizer.pad_id)
optimizer = optim.Adam(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
print(model)

Transformer(
  (embedding): Embedding(
    (embedding): Linear(in_features=3428, out_features=128, bias=True)
  )
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (pos_decoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): TransformerEncoderDecoder(
    (encoders): ModuleList(
      (0): ModuleList(
        (0): Attention(
          (dropout): Dropout(p=0.1, inplace=False)
          (Wq): Linear(in_features=128, out_features=128, bias=True)
          (Wk): Linear(in_features=128, out_features=128, bias=True)
          (Wv): Linear(in_features=128, out_features=128, bias=True)
        )
        (1): LayerNorm()
        (2): FeedForward(
          (W1): Linear(in_features=128, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (W2): Linear(in_features=512, out_features=128, bias=True)
        )
        (3): LayerNorm()
      )
      (1): ModuleList(
        (0): Attentio

### 模型训练

In [132]:
num_epochs = 5
import tqdm
for epoch in range(1, num_epochs + 1):
    model.train()
    total_loss = 0
    data_progress = tqdm.tqdm(dataloader, desc="Train...")
    for step, data in enumerate(data_progress, 1):
        data = data.to(DEVICE)
        # data: (batch_size, seq_len)
        # 随机选一个位置，拆分src和tgt
        # print(data.size())
        e = random.randint(1, 20)
        src = data[:, :e]
        # print(src.size())
        # tgt不要最后一个token，tgt_y不要第一个的token
        tgt, tgt_y = data[:, e:-1], data[:, e + 1:]
        # 进行Transformer的计算和预测 out:(batch_size, max_len, v)
        out = model(src, tgt)
        # print(out.size(), tgt_y.size()) 
        # 将tgt_y 转化为与out形状相同的变量
        tgt_y = F.pad(tgt_y, (0, (max_len - tgt_y.size()[1])), "constant", tokenizer.pad_id)
        tgt_y = index2onehot(tgt_y, v)
        # print(tgt_y.size())
        # 在forward方法中，我们声明输入张量如何经过这些基本层得到输出张量。
        loss = criterion(out.view(-1, out.size(-1)), tgt_y.contiguous().view(-1, tgt_y.size(-1)))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

        # 更新训练进度
        data_progress.set_description(f"Train... [epoch {epoch}/{num_epochs}, loss {(total_loss / step):.5f}]")
        break
    break
# print(src_padding_mask.size(), tgt.size())

Train... [epoch 1/5, loss 8.13977]:   0%|          | 0/381 [00:01<?, ?it/s]


### 模型表现

In [133]:
model.eval()
with torch.no_grad():
    word_ids = torch.tensor(tokenizer.encode("清明时节"))
    # word_ids = index2onehot(word_ids, v)
    # word_ids: (batch_size, max_len)
    src = word_ids[:-2].view(1, -1)
    tgt = word_ids[-2:-1].view(1, -1)
    # 一个一个词预测，直到预测为<eos>，或者达到句子最大长度
    for i in range(64):
        out = model(src, tgt)
        # 预测结果，只需最后一个词
        predict = out[:,-1:,:]
        # 找出最大值的index并转换为onehot编码，和之前的结果拼到一起
        y = torch.argmax(predict, dim=2)
        next = torch.zeros(1,1,v)
        next[0,0,y] = 1
        next = onehot2index(next)
        # 和之前的预测结果拼接到一起
        tgt = torch.cat([tgt, next], dim=1)

        # 如果为<eos>
        if y == tokenizer.eos_id:
            break
        if y in [tokenizer.pad_id, tokenizer.unknown_id]:
            continue
    
    # src = onehot2index(src)
    # tgt = onehot2index(tgt)
    src_decode = "".join([w for w in tokenizer.decode(src[0].tolist()) if w not in [Tokenizer.PAD, Tokenizer.UNKNOWN]])
    print(f"src = {src}, src_decode = {src_decode}")
    tgt_decode = "".join([w for w in tokenizer.decode(tgt[0].tolist()) if w not in [Tokenizer.PAD, Tokenizer.UNKNOWN]])
    print(f"tgt = {tgt}, tgt_decode = {tgt_decode}")

src = tensor([[  2, 403, 235, 293]]), src_decode = 清明时
tgt = tensor([[197,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1]]), tgt_decode = 节
