In [1]:
import torch
from transformers import (
    AutoTokenizer,
    GPT2LMHeadModel,
    TextGenerationPipeline,
    Trainer,
    TrainingArguments,
)


# Project directory that holds the data files and the model checkpoints.
root_path = "/home/qizhen/projects/train_shici/"
# Checkpoint directory the tokenizer and model are loaded from.
model_path = f"{root_path}gw_model_8/"


In [2]:
# Load the tokenizer and the GPT-2 language-model head from the same
# checkpoint directory (model_path).
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

In [3]:
def read_data_set(filename):
    """Load a UTF-8 text file as a list of whitespace-stripped lines.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: One entry per line, in file order, with leading and
        trailing whitespace (including the newline) removed. Blank lines
        are kept as empty strings.
    """
    samples = []
    with open(filename, "r", encoding="utf8") as handle:
        for raw_line in handle:
            samples.append(raw_line.strip())
    return samples


# One stripped line of text per example, read from the project directory.
train_set = read_data_set(f"{root_path}train.txt")
test_set = read_data_set(f"{root_path}test.txt")


In [4]:
# Token length every example is truncated or padded to when building the datasets.
max_length = 128

class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset of fixed-length tokenized texts.

    Every text is tokenized once, up front, and the resulting tensors are
    kept in memory in ``input_ids`` and ``attn_masks`` (parallel lists).
    """

    def __init__(self, txt_list, tokenizer, max_length):
        """Tokenize every text in ``txt_list``.

        Args:
            txt_list: Iterable of strings, one per example.
            tokenizer: Callable accepting ``(text, truncation=, max_length=,
                padding=)`` and returning a mapping with ``'input_ids'`` and
                ``'attention_mask'`` entries.
            max_length: Length passed to the tokenizer for truncation and
                ``"max_length"`` padding.
        """
        self.input_ids = []
        self.attn_masks = []
        for text in txt_list:
            encoded = tokenizer(
                text,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            ids_tensor = torch.tensor(encoded['input_ids'])
            mask_tensor = torch.tensor(encoded['attention_mask'])
            self.input_ids.append(ids_tensor)
            self.attn_masks.append(mask_tensor)

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        ids_tensor = self.input_ids[idx]
        mask_tensor = self.attn_masks[idx]
        return ids_tensor, mask_tensor

# Tokenize both splits with the same tokenizer and fixed length.
train_data = TextDataset(txt_list=train_set, tokenizer=tokenizer, max_length=max_length)
test_data = TextDataset(txt_list=test_set, tokenizer=tokenizer, max_length=max_length)

In [5]:
# Per-device batch size, used for both the training and evaluation settings.
batch_size = 8

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="/tmp",
    logging_dir="/tmp",
    # Optimisation schedule: a single epoch with one warmup step.
    num_train_epochs=1,
    warmup_steps=1,
    weight_decay=0.01,
    # Batch sizes.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Reporting and checkpoint intervals, in steps.
    logging_steps=200,
    save_steps=50000,
)


In [6]:
def collate_batch(features):
    """Stack a list of ``TextDataset`` items into one training batch.

    Args:
        features: List of ``(input_ids, attention_mask)`` tensor pairs, as
            returned by ``TextDataset.__getitem__``.

    Returns:
        dict: ``input_ids``, ``attention_mask`` and ``labels`` tensors, each
        stacked along a new leading batch dimension.
    """
    input_ids = torch.stack([item[0] for item in features])
    attention_mask = torch.stack([item[1] for item in features])
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        # Language-model training: the labels are the inputs themselves.
        # A clone keeps the two tensors from aliasing each other.
        # NOTE(review): padded positions stay in the labels, so they count
        # towards the loss. Masking them with -100 is the usual practice, but
        # it would change what the model emits after the end of a text --
        # confirm the effect on generation before changing this.
        "labels": input_ids.clone(),
    }


# Directory the fine-tuned model is written to.
output_model_path = root_path + "gw_model_9/"

trainner = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=collate_batch,
)

trainner.train()
trainner.save_model(output_model_path)
# The Trainer was not given the tokenizer, so save_model() writes only the
# config and the weights. Save the tokenizer next to them so the output
# directory can be loaded with AutoTokenizer.from_pretrained, the same way
# model_path is loaded.
tokenizer.save_pretrained(output_model_path)

***** Running training *****
  Num examples = 718105
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 89764


Step,Training Loss
200,0.6659
400,0.6464
600,0.6683
800,0.6801
1000,0.6504
1200,0.6481
1400,0.6578
1600,0.6522
1800,0.6671
2000,0.6726


Saving model checkpoint to /tmp/checkpoint-50000
Configuration saved in /tmp/checkpoint-50000/config.json
Model weights saved in /tmp/checkpoint-50000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /home/qizhen/projects/train_shici/gw_model_9/
Configuration saved in /home/qizhen/projects/train_shici/gw_model_9/config.json
Model weights saved in /home/qizhen/projects/train_shici/gw_model_9/pytorch_model.bin


In [8]:
# Trainer.train() leaves the model in training mode; switch to evaluation
# mode so dropout is disabled while sampling.
model.eval()

# Encode an empty prompt and place it on the model's own device instead of
# assuming CUDA, so the cell also runs when the model lives on the CPU.
sample_test = tokenizer("", return_tensors="pt").input_ids.to(model.device)
# Drop the last token of the encoded prompt.
# NOTE(review): presumably a trailing separator/end special token added by
# the tokenizer, leaving only the start token as the prompt -- confirm.
sample_test = sample_test[:, :-1]
sample_outputs = model.generate(
    sample_test,
    do_sample=True,
    max_length=128,
    temperature=0.5,
    num_return_sequences=10,
    repetition_penalty=1.2,
)

# Print each sampled sequence with its index, special tokens removed.
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: 一 抹 夕 阳 斜 度 | 几 声 蝉 唱 轻 扬
1: 山 泉 流 翠 色 | 海 浪 涌 金 波
2: 三 月 二 十 六 日 夜 大 雨 不 止 > 雨 声 连 日 暗 ， 人 意 到 床 寒 。 无 奈 春 风 急 ， 愁 来 天 亦 悭 。
3: 一 杯 酒 ， 两 口 羊 ， 三 更 月 色 | 半 盏 茶 ， 五 声 梵 ， 六 代 禅 宗
4: 春 归 福 满 | 喜 临 门 红
5: 一 夜 春 风 花 烂 漫 | 几 番 美 景 柳 轻 柔
6: 雨 打 芭 蕉 惊 睡 梦 | 风 吹 杨 柳 惹 痴 情
7: 人 间 万 事 皆 为 幻 | 天 上 一 轮 满 是 云
8: 题 壁 > 万 事 皆 由 命 ， 一 心 不 为 名 。 但 余 山 水 句 ， 犹 有 故 人 情 。
9: 江 山 多 秀 色 | 花 柳 尽 春 光
