[Open In Colab](https://colab.research.google.com/github/shibing624/textgen/blob/main/example/T5/T5_Finetune_Chinese_Poem.ipynb)


# T5 写诗
- 设计：Pretrained T5 + “写诗 prompt” fine-tuning
  - 对比我的 [transformer training from scratch](https://github.com/hululuzhu/chinese-ai-writing-share/blob/main/%E4%B8%AD%E6%96%87%E5%86%99%E8%AF%97Transformer_Source_Code_Share_V1.ipynb)
  - 想要加入作者作为可选输入
    - 每个文章分两次输入，一次作者名字，一次“None”名字（通用）
- 数据：[诗歌github](https://github.com/chinese-poetry/chinese-poetry)
- 相关内容
  - [Huggingface](https://huggingface.co/)
  - Langboat Chinese [Mengzi T5 pretrained model](https://huggingface.co/Langboat/mengzi-t5-base) and [paper](https://arxiv.org/pdf/2110.06696.pdf)
  - [textgen](https://github.com/shibing624/textgen) 


## Prepare Data

In [None]:
#!nvidia-smi

Mon Feb  7 22:10:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
# Quick-run switch: passed to get_poems(is_test=...), which then downloads only 3 shards per dynasty.
IS_TEST_FLOW = False  #@param {type: "boolean"}

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
import json
import urllib.request
import pandas as pd
!pip install -q "tqdm>=4.36.1" > /tmp/na
from tqdm.notebook import tqdm
!pip install -q chinese-converter > /tmp/na
import chinese_converter  # 繁体到简体需要
import pickle
import os
import pandas as pd
import numpy as np

In [4]:
# https://github.com/chinese-poetry/chinese-poetry
# Per-dynasty download spec consumed by get_poems():
#   'total'   - number of JSON shards fetched for the dynasty in a full run
#   'pattern' - URL template; {0} is filled with (shard index * 1000)
POEM_CONTENT = {
    'tang': {
        'total': 58,
        'pattern': "https://raw.githubusercontent.com/chinese-poetry/chinese-poetry/master/json/poet.tang.{0}.json"
    },
    'song': {
        'total': 255,
        'pattern': "https://raw.githubusercontent.com/chinese-poetry/chinese-poetry/master/json/poet.song.{0}.json"
    }
}


def get_poems(is_test=True, verbose=True):
    """Download the poem shards listed in POEM_CONTENT and return them as one DataFrame.

    Args:
        is_test: If True, fetch only 3 shards per dynasty; otherwise fetch
            the number given by each dynasty's 'total' entry.
        verbose: If True, print every URL right before it is downloaded.

    Returns:
        pandas.DataFrame: all downloaded shards concatenated in download
        order. The index is not reset, so index labels repeat across shards.
    """
    df_list = []
    for dynasty, spec in POEM_CONTENT.items():
        size = 3 if is_test else spec['total']
        # The context manager closes the progress bar even when a download
        # fails or the cell is interrupted, so no dangling bar is left behind.
        with tqdm(total=size, desc="Dynasty " + dynasty) as pbar:
            for i in range(size):
                url = spec['pattern'].format(i * 1000)
                if verbose:
                    print(f"download {url} now")
                df_list.append(pd.read_json(url))
                pbar.update(1)
    return pd.concat(df_list)

In [5]:
df = get_poems(is_test=IS_TEST_FLOW, verbose=False)
# Flatten every 'paragraphs' entry (an iterable of pieces) into a single string.
df['concat_paragraphs'] = [''.join(map(str, pieces)) for pieces in df['paragraphs']]
# .copy() turns the column subset into an independent frame, so the column
# assignments that follow do not raise pandas' SettingWithCopyWarning.
df = df[['author', 'title', 'concat_paragraphs']].copy()

def convert_schinese(tchinese):
    """Return `tchinese` converted to Simplified Chinese by chinese_converter."""
    simplified = chinese_converter.to_simplified(tchinese)
    return simplified

# Derive the Simplified-Chinese columns one source column at a time; each value
# is joined and converted exactly as before, just without row-wise apply.
for src_col, dst_col in (
    ('concat_paragraphs', 's_content'),
    ('title', 's_title'),
    ('author', 's_author'),
):
    df[dst_col] = df[src_col].map(lambda text: convert_schinese(''.join(text)))

my_df = df
print("my_df size", len(my_df))

Dynasty tang:   0%|          | 0/58 [00:00<?, ?it/s]

Dynasty song:   0%|          | 0/255 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [6]:
# Character-count limits used by the trim_*_fn helpers and the row filter.
# Author names are cut to this many characters.
MAX_AUTHOR_CHAR = 4
# Titles are cut to this many characters (before spaces/parentheses are removed).
MAX_TITLE_CHAR = 12
# Rows whose trimmed content has this many characters or fewer are dropped.
MIN_CONTENT_CHAR = 10
# Poem bodies are cut to this many characters.
MAX_CONTENT_CHAR = 64


def trim_author_fn(row):
    """Return the row's simplified author name cut to MAX_AUTHOR_CHAR characters."""
    author = row.s_author
    return author[:MAX_AUTHOR_CHAR]


def trim_title_fn(row):
    """Return the row's simplified title, cut to MAX_TITLE_CHAR characters and
    then stripped of ASCII spaces and parentheses."""
    clipped = row.s_title[:MAX_TITLE_CHAR]
    # One translate pass deletes " ", "(" and ")" from the clipped title.
    return clipped.translate(str.maketrans("", "", " ()"))


def trim_content_fn(row):
    """Return the row's simplified poem text cut to MAX_CONTENT_CHAR characters.

    The cut is purely positional, so the result may stop mid-sentence.
    """
    # Disabled alternative: cut back to the last "。" so the text ends on a
    # complete sentence and a partial ending does not confuse the model.
    content = row.s_content
    return content[:MAX_CONTENT_CHAR]


# Add the trimmed columns with a single assign(). assign() returns a new frame,
# so there is no view/copy warning on assignment and no need to duplicate the
# whole frame once per column. my_df is rebound to that new frame.
my_df = my_df.assign(
    s_author_trim=my_df.apply(trim_author_fn, axis=1),
    s_title_trim=my_df.apply(trim_title_fn, axis=1),
    s_content_trim=my_df.apply(trim_content_fn, axis=1),
)

NameError: name 'my_df' is not defined

In [None]:
# Build the rejection masks: empty title, content that is too short, or the
# '无正文' placeholder appearing as content or author.
empty_title_mask = my_df['s_title_trim'].str.len().eq(0)
too_short_cotent_mask = my_df['s_content_trim'].str.len().le(MIN_CONTENT_CHAR)
invalid_mask = my_df['s_content_trim'].eq('无正文') | my_df['s_author_trim'].eq('无正文')
too_short_mask = empty_title_mask | too_short_cotent_mask | invalid_mask

# Keep only the rows that pass, restricted to the three trimmed columns.
qualitied_df = my_df.loc[~too_short_mask, ['s_author_trim', 's_title_trim', 's_content_trim']]

In [None]:
# Spot-check three random rows of the filtered data.
qualitied_df.sample(3)

In [None]:
# Prompt markers used when building model inputs.
# Task prompt ("write a poem:"); stored in the 'prefix' column by build_dataset_df.
TITLE_PROMPT = "作诗："
# Marker ("author:") placed in front of the author name.
AUTHOR_PROMPT = "作者："
# Separator inserted between the title and the author segment.
EOS_TOKEN = '</s>'


def build_dataset_df(df, include_author=True):
    """Build the prefix / input_text / target_text frame used for fine-tuning.

    Args:
        df: DataFrame with 's_title_trim', 's_author_trim' and
            's_content_trim' columns.
        include_author: If True, input_text is
            title + EOS_TOKEN + AUTHOR_PROMPT + author; otherwise it is the
            title alone.

    Returns:
        A new DataFrame with exactly the columns 'prefix', 'input_text' and
        'target_text'. The input frame is not modified.
    """
    dfc = df.copy()
    # The task prompt is carried by 'prefix' only.
    # NOTE(review): presumably the trainer joins 'prefix' and 'input_text'
    # the same way predict_now does (prefix + ": " + text) - confirm.
    dfc['prefix'] = TITLE_PROMPT
    if include_author:
        dfc['input_text'] = dfc['s_title_trim'] + EOS_TOKEN + AUTHOR_PROMPT + dfc['s_author_trim']
    else:
        # TITLE_PROMPT used to be prepended here as well, duplicating the
        # prompt already stored in 'prefix' and making this branch
        # inconsistent with the author branch above.
        dfc['input_text'] = dfc['s_title_trim']
    dfc['target_text'] = dfc['s_content_trim']
    return dfc[['prefix', 'input_text', 'target_text']]

In [None]:
# Variant 1: inputs that carry both the title and the author. Preview the first rows.
df_author_title_content = build_dataset_df(qualitied_df, True)
df_author_title_content[:3]

In [None]:
# Variant 2: inputs built without the author. Preview the first rows.
df_title_content = build_dataset_df(qualitied_df, False)
df_title_content[:3]

In [None]:
# Stack both variants, so every poem appears once with and once without its author.
merged_df = pd.concat([df_author_title_content, df_title_content])

In [None]:
# Display the combined training frame.
merged_df

## Modeling

In [None]:
# Quiet install textgen package
!pip install -q textgen

In [None]:
import torch
import sys
sys.path.append('../..')
from textgen.t5 import T5Model

In [None]:
# Model / training configuration consumed by the model_args cell below.
# model_type and model_name are passed positionally to T5Model.
model_type = 't5'
model_name = "Langboat/mengzi-t5-base"
# Used as the "output_dir" model argument.
output_dir = 'outputs/mengzi_t5_poem/'
# Feeds both the "max_seq_length" and "max_length" model arguments.
max_seq_length = 50
num_epochs = 10
batch_size = 32

In [None]:
# Arguments handed to T5Model; the numeric values and output_dir come from the
# configuration cell above.
# NOTE(review): the exact meaning of each flag is defined by textgen, not by
# this notebook - confirm against the textgen documentation before changing.
model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": max_seq_length,
    "max_length": max_seq_length,
    "train_batch_size": batch_size,
    "num_train_epochs": num_epochs,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "evaluate_generated_text": True,
    "evaluate_during_training": True,
    "evaluate_during_training_verbose": True,
    "use_multiprocessing": False,
    "save_best_model": True,
    "output_dir": output_dir,
    "use_early_stopping": True,
}
# model_type: t5  model_name: Langboat/mengzi-t5-base
model = T5Model(model_type, model_name, args=model_args)


Downloading:   0%|          | 0.00/725k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/659 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [None]:
# Sanity-check the tokenizer on a sample that contains an explicit "</s>" in the middle.
model.tokenizer("桥形通汉上，峰势接云危。</s>烟霞交隐映，花鸟自参差。")

{'input_ids': [1012, 955, 406, 921, 23, 3, 1440, 2180, 799, 355, 4008, 4, 1, 1448, 4152, 690, 3934, 4990, 3, 17544, 178, 2572, 769, 4, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Decode the ids from the previous cell back to text to inspect the round trip.
model.tokenizer.decode([1012, 955, 406, 921, 23, 3, 1440, 2180, 799, 355, 4008, 4, 1, 1448, 4152, 690, 3934, 4990, 3, 17544, 178, 2572, 769, 4, 1])

'桥形通汉上,峰势接云危。</s> 烟霞交隐映,花鸟自参差。</s>'

In [None]:
def predict_now(sentences, model=model, prefix=TITLE_PROMPT):
    """Run the model on one or more titles and print the inputs and outputs.

    Args:
        sentences: A single title string, or an iterable of title strings.
        model: Object exposing predict(list_of_str). The default is bound to
            the notebook-level `model` at definition time.
        prefix: Prompt placed in front of every title, joined as
            prefix + ": " + title.
    """
    if isinstance(sentences, str):
        # A bare string would be iterated character by character, producing
        # one prompt per character instead of one prompt for the whole title.
        sentences = [sentences]
    else:
        sentences = list(sentences)
    sentences_add_prefix = [prefix + ": " + title for title in sentences]
    print("inputs:", sentences)
    print("outputs:", model.predict(sentences_add_prefix))

predict_now("过温汤", model=model)

## Training

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffle and the train/eval split are reproducible across runs.
SPLIT_SEED = 42
merged_df = merged_df.sample(frac=1, random_state=SPLIT_SEED)  # Shuffle
# Hold out 1% of the rows for evaluation.
train_df, eval_df = train_test_split(merged_df, test_size=0.01, random_state=SPLIT_SEED)

In [None]:
# Report the number of rows in each split.
print("train", len(train_df), "eval", len(eval_df))

train 607776 eval 12404


In [None]:
def sim_text_chars(text1, text2):
    """Character-overlap similarity between two texts, in the range [0, 1].

    The score is the number of distinct characters the two texts share,
    divided by the larger of their distinct-character counts. Character order
    and repetition are ignored.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        float: 0.0 when either text is empty or None, 1.0 when both texts use
        exactly the same set of characters, otherwise a value in between.
    """
    if not text1 or not text2:
        return 0.0
    chars1, chars2 = set(text1), set(text2)
    # Intersection, not union: the union counts characters that appear in
    # only one text and could push the score above 1.
    shared = chars1 & chars2
    # Distinct-character counts keep the ratio bounded by 1 and make identical
    # texts score exactly 1.0 even when they repeat characters.
    return len(shared) / max(len(chars1), len(chars2))

def count_matches(labels, preds):
    """Return the mean sim_text_chars score over aligned (label, pred) pairs.

    Args:
        labels: Sequence of reference texts.
        preds: Sequence of generated texts, aligned with `labels`.

    Returns:
        float: Sum of the per-pair scores divided by len(labels), or 0.0 when
        `labels` is empty.
    """
    # No `logger` is defined in the visible notebook cells, so create one here
    # with the standard library instead of relying on an outside name.
    import logging

    logger = logging.getLogger(__name__)
    logger.debug(f"labels: {labels[:10]}")
    logger.debug(f"preds: {preds[:10]}")
    if len(labels) == 0:
        # Avoid dividing by zero when there is nothing to score.
        return 0.0
    match = sum(sim_text_chars(label, pred) for label, pred in zip(labels, preds)) / len(labels)
    logger.debug(f"match: {match}")
    return match

# Fine-tune on train_df with eval_df as evaluation data, passing count_matches
# as the `matches` metric, then print the result of a final eval_model call.
model.train_model(train_df, eval_data=eval_df, matches=count_matches)
print(model.eval_model(eval_df, matches=count_matches))

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 247 M 
-----------------------------------------------------
247 M     Trainable params
0         Non-trainable params
247 M     Total params
990.311   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 42
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Training: -1it [00:00, ?it/s]

In [None]:
# Predict

In [None]:
# Re-run the same prompt used before training, to compare the outputs.
predict_now("过温汤", model=model)