### 1. 读取原始数据

In [1]:
from tqdm import tqdm
import pandas as pd
import os, joblib, jieba, opencc

ImportError: cannot import name 'opencc_clib' from 'opencc.clib' (D:\software\Anaconda\envs\py39_normal\lib\site-packages\opencc\clib\__init__.py)

### 2. 构建分词器

In [None]:
class Tokenizer(object):
    """
        自定义一个分词器，实现基本功能：
            - 1. 根据输入的语料，构建字典
            - 2. 输入src的句子，输出对应的id
            - 3. 输入tgt的句子，输出对应的id
            - 4. 输入tgt的id，输出tgt的句子
    """
    def __init__(self, data_file):
        """
            分词器初始化
                - 默认：根据输入的语料，构建字典
        """
        self.data_file = data_file
        self.data = None
        self.src_token2idx = None
        self.src_idx2token = None
        self.tgt_token2idx = None
        self.tgt_idx2token = None
        self._build_dict()
    
    def _build_dict(self):
        """
            构建字典
        """
        
        if self.src_token2idx:
            print("字典已经构建过了")
            return
        elif os.path.exists(os.path.join(".cache", "dicts.lxh")):
            print("从缓存中读取字典")
            self.src_token2idx, self.src_idx2token, self.tgt_token2idx, self.tgt_idx2token = joblib.load(filename=os.path.join(".cache", "dicts.lxh"))
            return
        
        # 从零构建字典
        self.data = pd.read_csv(filepath_or_buffer=self.data_file, sep="\t", names=["src", "tgt"])
        # self.data.columns = ["src", "tgt"]
        rows, cols  = self.data.shape
        # 构建词典
        src_tokens = {"<UNK>", "<PAD>", "<SOS>", "EOS"}
        tgt_tokens = {"<UNK>", "<PAD>", "<SOS>", "EOS"}
        for row_idx in tqdm(range(rows)):
            src, tgt = self.data.loc[row_idx, :]
            src_tokens.update(set(self.split_english_sentence(src)))
            tgt_tokens.update(set(self.split_chinese_sentence(tgt)))
        # 构建 src 的 字典
        self.src_token2idx = {token: idx for idx, token in enumerate(src_tokens)}
        self.src_idx2token = {idx: token for token, idx in self.src_token2idx.items()}

        # 构建 tgt 的 字典
        self.tgt_token2idx = {token: idx for idx, token in enumerate(tgt_tokens)}
        self.tgt_idx2token = {idx: token for token, idx in self.tgt_token2idx.items()}

        # 保存
        dicts = [self.src_token2idx, self.src_idx2token, self.tgt_token2idx, self.tgt_idx2token]
        joblib.dump(value=dicts, filename=os.path.join(".cache", "dicts.lxh"))
        
    def split_english_sentence(self, sentecne):
        """
            英文句子切分
        """
        sentecne = sentecne.strip()
        tokens = [word for word in jieba.lcut(sentecne.lower()) if word not in ("", " ", "'")]
        return tokens
    
    def split_chinese_sentence(self, sentence):
        """
            中文句子切分
        """
        # 实例化一个繁体转简体的工具
        converter = opencc.OpenCC(config="t2s")
        sentence = converter.convert(text=sentence)
        # 分词
        tokens = jieba.lcut(sentence)
        return tokens
        
    def __str__(self):
        """
            返回必要的打印信息
        """
        if self.src_token2idx:
            out = f"Tokenizer: [src: {len(self.src_token2idx)}, tgt: {len(self.tgt_token2idx)}]"
        else:
            out = f"尚无字典信息"
        return out
    
    def __repr__(self):
        """
            返回必要的打印信息
        """
        return self.__str__()

    def encode_src(self, src_sentence, src_max_len):
        """
            把分词后的句子，变成 id
        """
        src_idx = [self.src_token2idx.get(token, self.src_token2idx.get("<UNK>")) for token in src_sentence]
        src_idx = (src_idx + [self.src_token2idx.get("<PAD>")] * src_max_len)[:src_max_len]
        return src_idx

    def encode_tgt(self, tgt_sentence, tgt_max_len):
        """
            把分词后的tgt句子变成 id
                - <SOS>, 我, 爱, 北京, 天安门, ！, <EOS>, <PAD>, <PAD>
        """
        tgt_sentence = ["SOS"] + tgt_sentence + ["EOS"]
        tgt_max_len += 2
        tgt_idx = [self.tgt_token2idx.get(token, self.tgt_token2idx.get("<UNK>")) for token in tgt_sentence]
        tgt_idx = (tgt_idx + [self.tgt_token2idx.get("<PAD>")] * tgt_max_len)[:tgt_max_len]
        return tgt_idx       

### 3. 数据打包
- 既要又要
    - 既要批量化训练
    - 又要消除填充PAD的噪声污染
- collate_fn
    - 手动排序！！！

In [2]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import os
import pandas
import joblib

OSError: [WinError 126] 找不到指定的模块。 Error loading "D:\software\Anaconda\envs\py39_normal\lib\site-packages\torch\lib\torch_python.dll" or one of its dependencies.

In [4]:
tokenizer = Tokenizer(data_file="data.txt")

从缓存中读取字典


In [5]:
tokenizer

Tokenizer: [src: 7106, tgt: 12547]

In [6]:
class Seq2SeqDataset(Dataset):
    """
        自定义数据集
    """
    def __init__(self, data_file, part="train", tokenizer=tokenizer):
        """
            初始化
        """
        self.data_file = data_file
        self.tokenier = tokenizer
        self.part = part
        self.data = None
        self._load_data()

    def _load_data(self):
        """
            加载数据
        """
        if self.data:
            print("数据集已经构建过了")
            return
        elif os.path.exists(os.path.join(".cache", "data.lxh")):
            print("从缓存中读取数据")
            # 原始数据
            data = joblib.load(filename=os.path.join(".cache", "data.lxh"))
            # 80% 训练集
            # 20% 测试集
            nums = int(len(data) * 0.80)
            self.data = data[:nums] if self.part == "train" else data[nums:]
            return
        # 从零读取
        data = pd.read_csv(filepath_or_buffer=self.data_file, sep="\t", header=None)
        data = data.sample(frac=1).to_numpy()
        # 保存数据
        joblib.dump(value=data, filename=os.path.join(".cache", "data.lxh"))
        # 数据截取
        nums = int(len(data) * 0.80)
        self.data = data[:nums] if self.part == "train" else data[nums:]
        
    def __getitem__(self, idx):
        """
            通过索引来访问样本
        """
        src, tgt = self.data[idx]
        src = tokenizer.split_english_sentence(src)
        tgt = tokenizer.split_chinese_sentence(tgt)
        return src, len(src), tgt, len(tgt)
        

    def __len__(self):
        """
            返回该数据集的样本个数
        """
        return len(self.data)

In [7]:
def collate_fn(batch, tokenizer=tokenizer):
    """
        回调函数
    """
    # 按 src_len 逆序
    batch = sorted(batch, key=lambda ele: ele[1], reverse=True)
    # 分拆成4个集合
    src_sentences, src_lens, tgt_sentences, tgt_lens = zip(*batch)
    # 1. src 转 id
    src_max_len = src_lens[0]
    src_idxes = []
    for src_sentence in src_sentences:
        src_idxes.append(tokenizer.encode_src(src_sentence, src_max_len))

    # 2. tgt 转 id
    tgt_max_len = max(tgt_lens)
    tgt_idxes = []
    for tgt_sentence in tgt_sentences:
        tgt_idxes.append(tokenizer.encode_tgt(tgt_sentence, tgt_max_len))

    # 所有数据转张量 torch.long
    # [src_max_len, batch_size]
    src_idxes = torch.tensor(data=src_idxes, dtype=torch.long).t()
    # (batch_size, )
    src_lens = torch.tensor(data=src_lens, dtype=torch.long)
    # [tgt_max_len + 2, batch_size]
    tgt_idxes = torch.tensor(data=tgt_idxes, dtype=torch.long).t()
    # (batch_size, )
    tgt_lens = torch.tensor(data=tgt_lens, dtype=torch.long)

    return src_idxes, src_lens, tgt_idxes, tgt_lens
    

In [8]:
# 训练集
train_dataset = Seq2SeqDataset(data_file="data.txt", part="train")
train_dataloader = DataLoader(dataset=train_dataset, 
                              shuffle=True, 
                              batch_size=32,
                              collate_fn=collate_fn)
# 测试集
test_dataset = Seq2SeqDataset(data_file="data.txt", part="test")
test_dataloader = DataLoader(dataset=test_dataset, 
                              shuffle=False, 
                              batch_size=32,
                              collate_fn=collate_fn)

从缓存中读取数据
从缓存中读取数据


In [9]:
for src, src_lens, tgt, tgt_lens in test_dataloader:
    print(src.shape, src_lens, tgt.shape, tgt_lens)
    break

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\63447\AppData\Local\Temp\jieba.cache
Loading model cost 0.355 seconds.
Prefix dict has been built successfully.


torch.Size([14, 32]) tensor([14, 13, 13, 10,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,
         6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  4,  3]) torch.Size([15, 32]) tensor([ 8,  8, 10,  6,  8,  8,  9,  6,  6,  5,  8,  8, 13,  8,  7, 11,  7,  6,
         5,  6,  7,  7,  3,  6,  6,  8,  7,  6,  6,  6,  5,  2])


### 4. 编码器设计

In [10]:
import torch
from torch import nn

In [11]:
class Encoder(nn.Module):
    """
        自定义一个编码器，处理 src
            - `Seq` 2 Seq
            - 只是 一个很单纯 的 RNN
            - 没有任何的差别
    """
    def __init__(self, num_embeddings=len(tokenizer.src_token2idx), embedding_dim=256):
        # 仅用于上坟，没有任何其他作用！
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=num_embeddings,
                                 embedding_dim=embedding_dim, 
                                 padding_idx=tokenizer.src_token2idx.get("<PAD>"))
        self.gru = nn.GRU(input_size=embedding_dim, 
                          hidden_size=embedding_dim)

    def forward(self, src, src_lens):
        """
            前向传播
                - 消除 PAD 影响
        """
        # [src_max_len, batch_size] --> [src_max_len, batch_size, embed_dim]
        src = self.embed(src)
        # 压紧被填充的序列
        src = nn.utils.rnn.pack_padded_sequence(input=src, lengths=src_lens, batch_first=False)
        out, hn = self.gru(src)
        return hn[0, :, :]

In [12]:
encoder = Encoder()

In [13]:
for src, src_lens, tgt, tgt_lens in train_dataloader:
    memory = encoder(src, src_lens)
    print(memory.shape)
    break

torch.Size([32, 256])


### 5. 解码器设计

In [14]:
import random

In [15]:
class Decoder(nn.Module):
    """
        实现解码器：
            - 训练时：
                - 考虑 teacher forcing
            - 推理时：
                - 考虑 自回归
    """
    def __init__(self, num_embeddings=len(tokenizer.tgt_token2idx), embedding_dim=256):
        super().__init__()
        # 向量化的过程
        self.embed = nn.Embedding(num_embeddings=num_embeddings, 
                                  embedding_dim=embedding_dim, 
                                  padding_idx=tokenizer.tgt_token2idx.get("<PAD>"))
        
        # 手动挡，分步特征抽取，实现自回归逻辑！！！
        self.gru_cell = nn.GRUCell(input_size=embedding_dim,
                                  hidden_size=embedding_dim)
        
        # 输出 embed_dim --> dict_len
        self.out = nn.Linear(in_features=embedding_dim, out_features=len(tokenizer.tgt_token2idx))
    
    def forward(self, context, tgt, tgt_lens):
        """
            训练时的正向推理：

                context: 上下文向量，中间表达
                tgt：标签
                tgt_lens：生成的句子的有效长度（不包含 <SOS>和<EOS>）     
        """
        # 生成侧的输入
        tgt_input = tgt[:-1, :]
        # 生成侧的输出
        tgt_output = tgt[1:, :]
        # 输入序列长度和批量大小
        SEQ_LEN, BATCH_SIZE = tgt_input.shape
        # 准备初始状态
        hn = context
        # 有多少步，就循环多少次
        outs = []
        step_input = self.embed(tgt_input[0, :].view(1, -1))[0, :, :]
        
        for step in range(SEQ_LEN):
            # 正向传播
            hn = self.gru_cell(step_input, hn)
            # 生成结果
            y_pred = self.out(hn)
            # 保留所有生成的结果（做交叉熵损失用）
            outs.append(y_pred)
            
            # 训练时采用 50% 的概率去使用 teacher forcing 优化策略
            teacher_forcing = random.random() > 0.5
            if teacher_forcing:
                step_input = self.embed(tgt_input[step + 1, :].view(1, -1))[0, :, :]
            else:
                y_pred = y_pred.argmax(dim=-1, keepdim=True).view(1, -1)
                step_input = self.embed(y_pred)[0, :, :]
        
        return outs, tgt_lens

In [16]:
encoder = Encoder()
decoder = Decoder()

In [17]:
for src, src_lens, tgt, tgt_lens in train_dataloader:
    # 1. 实现编码过程
    context = encoder(src, src_lens)
    # 2. 实现解码过程
    outs, tgt_lens = decoder(context, tgt, tgt_lens)
    break

IndexError: index 15 is out of bounds for dimension 0 with size 15

In [None]:
len(outs)

In [None]:
tgt_lens