In [1]:
import os
import paddle
import paddlenlp

  _nlv = LooseVersion(_np_version)
  _np_version_under1p16 = _nlv < LooseVersion("1.16")
  _np_version_under1p17 = _nlv < LooseVersion("1.17")
  _np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  if LooseVersion(_np_version) >= LooseVersion("1.17.0"):
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from paddlenlp.datasets import load_dataset

train_ds, dev_ds, test_ds = load_dataset("chnsenticorp", splits=["train", "dev", "test"])


In [3]:
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "ernie-3.0-base-zh"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_classes=len(train_ds.label_list))
tokenizer = AutoTokenizer.from_pretrained(model_name)

[32m[2022-05-30 01:50:40,564] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.modeling.ErnieForSequenceClassification'> to load 'ernie-3.0-base-zh'.[0m
[32m[2022-05-30 01:50:40,568] [    INFO][0m - Already cached /home/qiuzihan/.paddlenlp/models/ernie-3.0-base-zh/ernie_3.0_base_zh.pdparams[0m
W0530 01:50:40.572010 1318026 gpu_context.cc:278] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2
W0530 01:50:40.581347 1318026 gpu_context.cc:306] device: 0, cuDNN Version: 7.6.
[32m[2022-05-30 01:50:47,544] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load 'ernie-3.0-base-zh'.[0m
[32m[2022-05-30 01:50:47,547] [    INFO][0m - Already cached /home/qiuzihan/.paddlenlp/models/ernie-3.0-base-zh/ernie_3.0_base_zh_vocab.txt[0m


In [4]:
import functools
import numpy as np

from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding

# 数据预处理函数，利用分词器将文本转化为整数序列
def preprocess_function(examples, tokenizer, max_seq_length, is_test=False):

    result = tokenizer(text=examples["text"], max_seq_len=max_seq_length)
    if not is_test:
        result["labels"] = examples["label"]
    return result

trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_seq_length=128)
train_ds = train_ds.map(trans_func)
dev_ds = dev_ds.map(trans_func)

# collate_fn函数构造，将不同长度序列充到批中数据的最大长度，再将数据堆叠
collate_fn = DataCollatorWithPadding(tokenizer)

# 定义BatchSampler，选择批大小和是否随机乱序，进行DataLoader
train_batch_sampler = BatchSampler(train_ds, batch_size=32, shuffle=True)
dev_batch_sampler = BatchSampler(dev_ds, batch_size=32, shuffle=False)
train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=collate_fn)
dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=collate_fn)

In [5]:
optimizer = paddle.optimizer.AdamW(learning_rate=2e-5, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

In [6]:
def evaluate(model, criterion, metric, dataloader):
    label_map = {0: '负面', 1: '正面'}
    results = []
    model.eval()
    eval_metric = paddle.metric.Accuracy()
    for batch in dataloader:
        logits = model(batch['input_ids'], batch['token_type_ids'])
        probs = F.softmax(logits, axis=-1)
        correct = eval_metric.compute(probs, batch['labels'])
        metric.update(correct)
        acc = metric.accumulate()
        return acc

In [7]:
import time
import paddle.nn.functional as F


epochs = 5 # 训练轮次
ckpt_dir = "ernie_ckpt" #训练过程中保存模型参数的文件夹
best_acc = 0
best_step = 0
global_step = 0 #迭代次数
tic_train = time.time()
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        # 计算模型输出、损失函数值、分类概率值、准确率
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()

        # 每迭代20次，打印损失函数值、准确率、计算速度
        global_step += 1
        if global_step % 20 == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, acc,
                    10 / (time.time() - tic_train)))
            tic_train = time.time()
        
        # 反向梯度回传，更新参数
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # 每迭代200次，评估当前训练的模型、保存当前模型参数和分词器的词表等
        if global_step % 200 == 0:
            save_dir = ckpt_dir
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            print(global_step, end=' ')
            acc_eval = evaluate(model, criterion, metric, dev_data_loader)
            print("eval acc:", acc_eval)
            if acc_eval > best_acc:
                best_acc = acc_eval
                best_step = global_step

                model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)

global step 20, epoch: 1, batch: 20, loss: 0.62837, accu: 0.63594, speed: 1.97 step/s
global step 40, epoch: 1, batch: 40, loss: 0.34484, accu: 0.74766, speed: 2.14 step/s
global step 60, epoch: 1, batch: 60, loss: 0.35660, accu: 0.78594, speed: 2.15 step/s
global step 80, epoch: 1, batch: 80, loss: 0.30064, accu: 0.81328, speed: 2.15 step/s
global step 100, epoch: 1, batch: 100, loss: 0.17698, accu: 0.83313, speed: 2.15 step/s
global step 120, epoch: 1, batch: 120, loss: 0.42386, accu: 0.84401, speed: 2.15 step/s
global step 140, epoch: 1, batch: 140, loss: 0.28324, accu: 0.85446, speed: 2.15 step/s
global step 160, epoch: 1, batch: 160, loss: 0.41796, accu: 0.86172, speed: 2.13 step/s
global step 180, epoch: 1, batch: 180, loss: 0.34469, accu: 0.86528, speed: 2.06 step/s
global step 200, epoch: 1, batch: 200, loss: 0.38285, accu: 0.86766, speed: 2.00 step/s
200 eval acc: 0.8681592039800995


[32m[2022-05-30 01:52:04,807] [    INFO][0m - tokenizer config file saved in ernie_ckpt/tokenizer_config.json[0m
[32m[2022-05-30 01:52:04,809] [    INFO][0m - Special tokens file saved in ernie_ckpt/special_tokens_map.json[0m


global step 220, epoch: 1, batch: 220, loss: 0.07424, accu: 0.87359, speed: 1.20 step/s
global step 240, epoch: 1, batch: 240, loss: 0.14120, accu: 0.87707, speed: 1.99 step/s
global step 260, epoch: 1, batch: 260, loss: 0.16949, accu: 0.88087, speed: 1.99 step/s
global step 280, epoch: 1, batch: 280, loss: 0.17400, accu: 0.88301, speed: 1.98 step/s
global step 300, epoch: 1, batch: 300, loss: 0.25334, accu: 0.88590, speed: 1.99 step/s
global step 320, epoch: 2, batch: 20, loss: 0.08075, accu: 0.88931, speed: 1.95 step/s
global step 340, epoch: 2, batch: 40, loss: 0.02499, accu: 0.89223, speed: 1.99 step/s
global step 360, epoch: 2, batch: 60, loss: 0.08604, accu: 0.89552, speed: 1.98 step/s
global step 380, epoch: 2, batch: 80, loss: 0.05529, accu: 0.89870, speed: 1.98 step/s
global step 400, epoch: 2, batch: 100, loss: 0.20544, accu: 0.90181, speed: 1.98 step/s
400 eval acc: 0.9018967661691543


[32m[2022-05-30 01:52:58,534] [    INFO][0m - tokenizer config file saved in ernie_ckpt/tokenizer_config.json[0m
[32m[2022-05-30 01:52:58,536] [    INFO][0m - Special tokens file saved in ernie_ckpt/special_tokens_map.json[0m


global step 420, epoch: 2, batch: 120, loss: 0.08471, accu: 0.90418, speed: 1.19 step/s
global step 440, epoch: 2, batch: 140, loss: 0.17566, accu: 0.90632, speed: 1.98 step/s
global step 460, epoch: 2, batch: 160, loss: 0.06226, accu: 0.90923, speed: 1.98 step/s
global step 480, epoch: 2, batch: 180, loss: 0.10699, accu: 0.91124, speed: 1.98 step/s
global step 500, epoch: 2, batch: 200, loss: 0.04766, accu: 0.91272, speed: 1.99 step/s
global step 520, epoch: 2, batch: 220, loss: 0.47114, accu: 0.91445, speed: 1.99 step/s
global step 540, epoch: 2, batch: 240, loss: 0.13604, accu: 0.91599, speed: 1.99 step/s
global step 560, epoch: 2, batch: 260, loss: 0.05447, accu: 0.91765, speed: 1.98 step/s
global step 580, epoch: 2, batch: 280, loss: 0.17768, accu: 0.91849, speed: 1.99 step/s
global step 600, epoch: 2, batch: 300, loss: 0.08136, accu: 0.92001, speed: 1.99 step/s
600 eval acc: 0.9201388888888888


[32m[2022-05-30 01:53:52,272] [    INFO][0m - tokenizer config file saved in ernie_ckpt/tokenizer_config.json[0m
[32m[2022-05-30 01:53:52,274] [    INFO][0m - Special tokens file saved in ernie_ckpt/special_tokens_map.json[0m


global step 620, epoch: 3, batch: 20, loss: 0.01826, accu: 0.92205, speed: 1.17 step/s
global step 640, epoch: 3, batch: 40, loss: 0.16683, accu: 0.92413, speed: 1.99 step/s
global step 660, epoch: 3, batch: 60, loss: 0.00969, accu: 0.92609, speed: 1.99 step/s
global step 680, epoch: 3, batch: 80, loss: 0.01197, accu: 0.92785, speed: 1.99 step/s
global step 700, epoch: 3, batch: 100, loss: 0.01086, accu: 0.92941, speed: 1.99 step/s
global step 720, epoch: 3, batch: 120, loss: 0.03129, accu: 0.93076, speed: 1.99 step/s
global step 740, epoch: 3, batch: 140, loss: 0.01876, accu: 0.93220, speed: 1.99 step/s
global step 760, epoch: 3, batch: 160, loss: 0.00664, accu: 0.93357, speed: 1.99 step/s
global step 780, epoch: 3, batch: 180, loss: 0.01458, accu: 0.93499, speed: 1.99 step/s
global step 800, epoch: 3, batch: 200, loss: 0.00516, accu: 0.93622, speed: 1.99 step/s
800 eval acc: 0.9362950870646766


[32m[2022-05-30 01:54:46,131] [    INFO][0m - tokenizer config file saved in ernie_ckpt/tokenizer_config.json[0m
[32m[2022-05-30 01:54:46,136] [    INFO][0m - Special tokens file saved in ernie_ckpt/special_tokens_map.json[0m


global step 820, epoch: 3, batch: 220, loss: 0.13002, accu: 0.93735, speed: 1.19 step/s
global step 840, epoch: 3, batch: 240, loss: 0.14899, accu: 0.93809, speed: 1.99 step/s
global step 860, epoch: 3, batch: 260, loss: 0.01138, accu: 0.93887, speed: 1.98 step/s
global step 880, epoch: 3, batch: 280, loss: 0.03181, accu: 0.93966, speed: 1.99 step/s
global step 900, epoch: 3, batch: 300, loss: 0.01262, accu: 0.94071, speed: 1.99 step/s
global step 920, epoch: 4, batch: 20, loss: 0.19415, accu: 0.94169, speed: 1.95 step/s
global step 940, epoch: 4, batch: 40, loss: 0.01577, accu: 0.94280, speed: 1.99 step/s
global step 960, epoch: 4, batch: 60, loss: 0.07460, accu: 0.94395, speed: 1.99 step/s
global step 980, epoch: 4, batch: 80, loss: 0.00344, accu: 0.94496, speed: 1.99 step/s
global step 1000, epoch: 4, batch: 100, loss: 0.05695, accu: 0.94590, speed: 1.99 step/s
1000 eval acc: 0.9459577114427861


[32m[2022-05-30 01:55:40,041] [    INFO][0m - tokenizer config file saved in ernie_ckpt/tokenizer_config.json[0m
[32m[2022-05-30 01:55:40,046] [    INFO][0m - Special tokens file saved in ernie_ckpt/special_tokens_map.json[0m


global step 1020, epoch: 4, batch: 120, loss: 0.00382, accu: 0.94689, speed: 1.18 step/s
global step 1040, epoch: 4, batch: 140, loss: 0.00406, accu: 0.94773, speed: 1.98 step/s
global step 1060, epoch: 4, batch: 160, loss: 0.00341, accu: 0.94868, speed: 1.98 step/s
global step 1080, epoch: 4, batch: 180, loss: 0.01068, accu: 0.94940, speed: 1.99 step/s
global step 1100, epoch: 4, batch: 200, loss: 0.04355, accu: 0.95011, speed: 1.99 step/s
global step 1120, epoch: 4, batch: 220, loss: 0.00341, accu: 0.95083, speed: 1.99 step/s
global step 1140, epoch: 4, batch: 240, loss: 0.03903, accu: 0.95134, speed: 1.99 step/s
global step 1160, epoch: 4, batch: 260, loss: 0.00416, accu: 0.95198, speed: 1.98 step/s
global step 1180, epoch: 4, batch: 280, loss: 0.02726, accu: 0.95258, speed: 1.98 step/s
global step 1200, epoch: 4, batch: 300, loss: 0.00762, accu: 0.95319, speed: 1.98 step/s
1200 eval acc: 0.9532027363184079


[32m[2022-05-30 01:56:33,982] [    INFO][0m - tokenizer config file saved in ernie_ckpt/tokenizer_config.json[0m
[32m[2022-05-30 01:56:33,984] [    INFO][0m - Special tokens file saved in ernie_ckpt/special_tokens_map.json[0m


global step 1220, epoch: 5, batch: 20, loss: 0.00448, accu: 0.95389, speed: 1.16 step/s
global step 1240, epoch: 5, batch: 40, loss: 0.00217, accu: 0.95460, speed: 1.98 step/s
global step 1260, epoch: 5, batch: 60, loss: 0.00207, accu: 0.95525, speed: 1.98 step/s
global step 1280, epoch: 5, batch: 80, loss: 0.00651, accu: 0.95590, speed: 1.98 step/s
global step 1300, epoch: 5, batch: 100, loss: 0.00189, accu: 0.95650, speed: 1.97 step/s
global step 1320, epoch: 5, batch: 120, loss: 0.08823, accu: 0.95708, speed: 1.97 step/s
global step 1340, epoch: 5, batch: 140, loss: 0.00297, accu: 0.95761, speed: 1.98 step/s
global step 1360, epoch: 5, batch: 160, loss: 0.07320, accu: 0.95818, speed: 1.97 step/s
global step 1380, epoch: 5, batch: 180, loss: 0.00122, accu: 0.95872, speed: 1.97 step/s
global step 1400, epoch: 5, batch: 200, loss: 0.00103, accu: 0.95924, speed: 1.98 step/s
1400 eval acc: 0.9592217484008528


[32m[2022-05-30 01:57:28,256] [    INFO][0m - tokenizer config file saved in ernie_ckpt/tokenizer_config.json[0m
[32m[2022-05-30 01:57:28,259] [    INFO][0m - Special tokens file saved in ernie_ckpt/special_tokens_map.json[0m


global step 1420, epoch: 5, batch: 220, loss: 0.01125, accu: 0.95973, speed: 1.17 step/s
global step 1440, epoch: 5, batch: 240, loss: 0.00887, accu: 0.96018, speed: 1.98 step/s
global step 1460, epoch: 5, batch: 260, loss: 0.00206, accu: 0.96061, speed: 1.99 step/s
global step 1480, epoch: 5, batch: 280, loss: 0.00070, accu: 0.96106, speed: 1.99 step/s
global step 1500, epoch: 5, batch: 300, loss: 0.00053, accu: 0.96151, speed: 1.99 step/s


In [9]:
params_path = 'ernie_ckpt/model_state.pdparams'
state_dict = paddle.load(params_path)
model.set_dict(state_dict)

# 也可以选择加载预先训练好的模型参数结果查看模型训练结果
# model.set_dict(paddle.load('ernie_ckpt_trained/model_state.pdparams'))

print('ERNIE 3.0 在ChnSentiCorp的dev集表现', end=' ')
eval_acc = evaluate(model, criterion, metric, dev_data_loader)
print(eval_acc)

ERNIE 3.0 在ChnSentiCorp的dev集表现 0.9614811133200796
