### 导包

In [5]:
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, get_scheduler
import numpy as np
from torch.utils.data import Dataset, DataLoader
import json
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

### 路径及参数

In [6]:
train_path = "./DuReaderQG/train.json"
test_path = "./DuReaderQG/dev.json"
max_target_len = 32
max_source_len = 256
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 16
lr = 5e-5
save_path = "./checkpoints/"
num_train_epochs = 20

### 加载model和tokenizer

In [7]:
pertrained_path = './uer/t5-base-chinese-cluecorpussmall'
tokenizer = AutoTokenizer.from_pretrained(pertrained_path)
model = T5ForConditionalGeneration.from_pretrained(pertrained_path)
tokenizer, model

(BertTokenizerFast(name_or_path='./uer/t5-base-chinese-cluecorpussmall', vocab_size=21229, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
 	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 },
 T5ForConditionalGeneration(
   (shared): Embedding(21228, 768)
   (enc

### 自定义dataset

In [None]:
class QADataset(Dataset):
    def __init__(self, data_path):
        self.data = []
        with open(data_path, "r") as f:
            for line in f:
                self.data.append(json.loads(line))


    def __len__(self):
        return len(self.data)


    def __getitem__(self, idx):
        item = self.data[idx]
        input_seq = (f"问题：{item['question']}{tokenizer.sep_token}原文：{item['context']}")
        output_seq = f"答案：{item['answer']}{tokenizer.eos_token}"
        return input_seq, output_seq

In [None]:
train_dataset = QADataset(train_path)
train_dataset[0]

Using eos_token, but it is not set yet.


('问题：仙剑奇侠传3第几集上天界[SEP]原文：第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。',
 '答案：第35集None')

In [None]:
def collate_fn(batch):
    batched_data = {
        "input_ids": [],
        "attention_mask": [],
        "decoder_input_ids": [],
        "labels": [],
    }
    for _, (input_seq, output_seq) in enumerate(batch):
        # tokenize输入
        inputs = tokenizer(text=input_seq, truncation=True, max_length=max_source_len, padding=True)
       
        # tokenize输出,并将输出的ids作为inputs的label
        output_ids = tokenizer.encode(text=output_seq, truncation=True, max_length=max_target_len)
        decoder_input_ids = output_ids[:-2] # 去掉eos和[cls]
        decoder_input_ids =  decoder_input_ids + [tokenizer.pad_token_id] * (max_target_len - len(decoder_input_ids)) # padding
        
        labels = output_ids[1: -1] # 去掉起始token和[cls]
        labels = labels + [-100] * (max_target_len - len(labels))
        
        batched_data["input_ids"].append(inputs["input_ids"])
        batched_data["attention_mask"].append(inputs["attention_mask"])
        batched_data["decoder_input_ids"].append(decoder_input_ids)
        batched_data["labels"].append(labels)
        
    for k, v in batched_data.items():
        batched_data[k] = torch.tensor(np.array(v))
    return batched_data

test_train = []
for i in range(5):
    test_train.append(train_dataset[i]) 

train_test_dataloader = DataLoader(test_train, batch_size=2, shuffle=True, collate_fn=collate_fn)
train_test_dataloader            

Using eos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using eos_token, but it is not set yet.


TypeError: 'DataLoader' object is not subscriptable

In [22]:
def plot_metrics(value, name):
    plt.figure()
    plt.plot(value)
    plt.xlabel("Batch")
    plt.ylabel(f"{name}")
    plt.title(f"{name}")
    plt.savefig(f"log/{name}.png")

In [25]:
def evaluate_model(data_loader):
    model.eval()
    bleu1 = []
    bleu2 = []
    bleu3 = []
    bleu4 = []
    smoothie = SmoothingFunction().method4
    print("Evaluation")
    with torch.no_grad():
        for _, batch in enumerate(data_loader):
            outputs = model.generate(
                input_ids=batch["input_ids"].to(device)
            )
            label_tokens = batch["labels"].cpu().numpy()
            decode_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            label_tokens = np.where(
                batch["labels"] != -100, label_tokens, tokenizer.pad_token_id
            )
            decode_labels = tokenizer.batch_decode(
                label_tokens, skip_special_tokens=True
            )
            for pred, ref in zip(decode_preds, decode_labels):
                bleu1.append(
                    sentence_bleu(
                        [ref.split()],
                        pred.split(),
                        smoothing_function=smoothie,
                        weights=(1, 0, 0, 0),
                    )
                )
                bleu2.append(
                    sentence_bleu(
                        [ref.split()],
                        pred.split(),
                        smoothing_function=smoothie,
                        weights=(0.5, 0.5, 0, 0),
                    )
                )
                bleu3.append(
                    sentence_bleu(
                        [ref.split()],
                        pred.split(),
                        smoothing_function=smoothie,
                        weights=(0.33, 0.33, 0.33, 0),
                    )
                )
                bleu4.append(
                    sentence_bleu(
                        [ref.split()],
                        pred.split(),
                        smoothing_function=smoothie,
                        weights=(0.25, 0.25, 0.25, 0.25),
                    )
                )
    model.train()  
    return [
        sum(bleu1) / len(bleu1),
        sum(bleu2) / len(bleu2),
        sum(bleu3) / len(bleu3),  
        sum(bleu4) / len(bleu4),
    ]   

In [None]:
def train(num_train_epochs, train_loader, model, optimizer, lr_scheduler, device, logging_steps, loss_list, global_step, tic_train):
    for epoch in range(num_train_epochs):
        for batch in train_loader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                decoder_input_ids=batch["decoder_input_ids"].to(device),
                labels=batch["labels"].to(device),
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            loss_list.append(float(loss.cpu().detach()))

            global_step += 1
            if global_step % logging_steps == 0:
                time_diff = time.time() - tic_train
                loss_avg = sum(loss_list) / len(loss_list)
                print(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, loss_avg, logging_steps / time_diff)
                )
                tic_train = time.time()

In [None]:
import time

def train_loop():
    tokenizer.eos_token = tokenizer.sep_token
    tokenizer.bos_token = tokenizer.cls_token

    train_dataset = QADataset(data_path=train_path)
    test_dataset = QADataset(data_path=test_path)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_epochs * len(train_loader),
    )
    model.to(device)

    loss_list = []
    bleu1 = []
    bleu2 = []
    bleu3 = []
    bleu4 = []
    tic_train = time.time()
    global_step, best_bleu4 = 0, 0

    for epoch in range(num_train_epochs):
        train(num_train_epochs, train_loader, model, optimizer, lr_scheduler, device, logging_steps, loss_list, global_step, tic_train)
        # 在每个epoch结束后评估模型
        bleu_scores = evaluate_model(test_loader)
        print(f"Epoch {epoch}, BLEU-1: {bleu_scores[0]}, BLEU-2: {bleu_scores[1]}, BLEU-3: {bleu_scores[2]}, BLEU-4: {bleu_scores[3]}")
        if bleu_scores[3] > best_bleu4:
            best_bleu4 = bleu_scores[3]
            torch.save(model.state_dict(), f"{save_path}/best_model.pth")