In [1]:
!git clone https://github.com/NiuTrans/Classical-Modern.git

Cloning into 'Classical-Modern'...
remote: Enumerating objects: 135, done.[K
remote: Counting objects: 100% (135/135), done.[K
remote: Compressing objects: 100% (123/123), done.[K
remote: Total 135 (delta 54), reused 50 (delta 11), pack-reused 0[K
Receiving objects: 100% (135/135), 114.05 MiB | 11.96 MiB/s, done.
Resolving deltas: 100% (54/54), done.
Updating files: 100% (83/83), done.


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
     |████████████████████████████████| 325 kB 557 kB/s            
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
     |████████████████████████████████| 212 kB 8.5 MB/s            
Installing collected packages: xxhash, responses, datasets
Successfully installed datasets-2.0.0 responses-0.18.0 xxhash-3.0.0


In [3]:
import numpy as np
import os
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.model_selection import train_test_split
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_metric
import datasets

In [4]:
class GuwenDataLoader:
    def __init__(self, data_dir):
        dataset = []
        self.data_dir = data_dir
        self.tokenizer = T5Tokenizer.from_pretrained("Langboat/mengzi-t5-base")
        self.to_classical_prefix = self.tokenizer("转古文：")
        self.to_modern_prefix = self.tokenizer("转现代文：")
        for (dirpath, dirnames, filenames) in os.walk(self.data_dir):
            for filename in filenames:
#                 if filename.endswith("史"):
                data = self.load_corpus(os.path.join(self.data_dir, filename))
                print(f"loaded {filename} with {len(data)} pairs")
                dataset.extend(data)
        self.train, self.dev = train_test_split(dataset, test_size=0.025)
        print(f"Train: {len(self.train)} pairs, Dev: {len(self.dev)} pairs")
        

    def load_corpus(self, filename):
        parallel_corpus = []
        with open(filename, "r") as file:
            classic_to_modern = {}
            modern_to_classic = {}
            lines = file.readlines()
            for line in lines:
                if line.startswith("古文"):
                    tokenized = self.tokenizer(line[3:].strip(), truncation=True, max_length=100)
#                     classic_to_modern['input_ids'] = self.to_modern_prefix['input_ids'] + tokenized['input_ids']
#                     classic_to_modern['attention_mask'] = self.to_modern_prefix['attention_mask'] + tokenized['attention_mask']
                    modern_to_classic['labels'] = tokenized['input_ids']
                elif line.startswith("现代文"):
                    tokenized = self.tokenizer(line[4:].strip(), truncation=True, max_length=100)
                    modern_to_classic['input_ids'] = self.to_classical_prefix['input_ids'] + tokenized['input_ids']
                    modern_to_classic['attention_mask'] = self.to_classical_prefix['attention_mask'] + tokenized['attention_mask']
#                     classic_to_modern['labels'] = tokenized['input_ids']
                elif line == "\n":
#                     for item in [modern_to_classic, classic_to_modern]:
                    for item in [modern_to_classic]:
                        if 'input_ids' in item and 'labels' in item:
                            parallel_corpus.append(item)
                        else:
                            raise Exception("Unexpected data corrupton")
                    classic_to_modern = {}
                    modern_to_classic = {}
                else:
                    raise Exception("unexpected!")
        return parallel_corpus

In [5]:
model = T5ForConditionalGeneration.from_pretrained("Langboat/mengzi-t5-base")

Downloading:   0%|          | 0.00/659 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/945M [00:00<?, ?B/s]

In [6]:
data_loader = GuwenDataLoader("/kaggle/working/Classical-Modern/bitext")

Downloading:   0%|          | 0.00/708k [00:00<?, ?B/s]

loaded 宋史 with 77853 pairs
loaded 北齐书 with 10947 pairs
loaded 新唐书 with 12359 pairs
loaded 徐霞客 with 22750 pairs
loaded 晋书 with 21133 pairs
loaded 南史 with 13838 pairs
loaded 金史 with 13758 pairs
loaded 短篇章和资治通鉴 with 348726 pairs
loaded 陈书 with 7096 pairs
loaded 新五代史 with 10147 pairs
loaded 汉书 with 37622 pairs
loaded 梁书 with 14318 pairs
loaded 史记 with 17701 pairs
loaded 隋书 with 8204 pairs
loaded 宋书 with 23794 pairs
loaded 北史 with 25823 pairs
loaded 辽史 with 9278 pairs
loaded 后汉书 with 17753 pairs
loaded 魏书 with 28178 pairs
loaded 太平广记 with 59357 pairs
loaded 明史 with 85179 pairs
loaded 周书 with 14930 pairs
loaded 水经注全 with 11630 pairs
loaded 旧五代史 with 11377 pairs
loaded 旧唐书 with 29185 pairs
loaded 南齐书 with 13137 pairs
loaded 元史 with 21182 pairs
Train: 943073 pairs, Dev: 24182 pairs


In [7]:
data_collator = DataCollatorForSeq2Seq(tokenizer=data_loader.tokenizer, model=model, padding='longest')
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    evaluation_strategy="steps",
    eval_steps=1500,
    save_steps=1500,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=12,
    save_total_limit=3,
    warmup_steps=1000,
    num_train_epochs=1,
    fp16=True,
    report_to="none",
    dataloader_num_workers=2,
    group_by_length=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=data_loader.train,
    eval_dataset=data_loader.dev,
    data_collator=data_collator
)

Using amp half precision backend


In [8]:
trainer.train()

***** Running training *****
  Num examples = 943073
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 14735
  args.max_grad_norm,


Step,Training Loss,Validation Loss
1500,2.2079,2.047852
3000,1.9573,1.861795
4500,1.8422,1.778362
6000,1.798,1.719341
7500,1.7594,1.684877
9000,1.7128,1.655098
10500,1.7026,1.634269
12000,1.6841,1.620749
13500,1.6684,1.611374


***** Running Evaluation *****
  Num examples = 24182
  Batch size = 12
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 24182
  Batch size = 12
Saving model checkpoint to results/checkpoint-3000
Configuration saved in results/checkpoint-3000/config.json
Model weights saved in results/checkpoint-3000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 24182
  Batch size = 12
Saving model checkpoint to results/checkpoint-4500
Configuration saved in results/checkpoint-4500/config.json
Model weights saved in results/checkpoint-4500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 24182
  Batch size = 12
Saving model checkpoint to results/checkpoint-6000
Configuration saved in results/checkpoint-6000/config.json
Model weights saved in results/checkpoint-6000/pytorch_model.bin
Deleting old

TrainOutput(global_step=14735, training_loss=1.8888721746956652, metrics={'train_runtime': 16646.1411, 'train_samples_per_second': 56.654, 'train_steps_per_second': 0.885, 'total_flos': 3.978427378522522e+16, 'train_loss': 1.8888721746956652, 'epoch': 1.0})

In [9]:
def generate(text):
    input_ids = data_loader.tokenizer("转古文：" + text, return_tensors="pt").input_ids.to('cuda')
    outputs = model.generate(input_ids, max_length=100)
    return data_loader.tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [10]:
generate("先帝开创的大业未完成一半却中途去世了。")

['先帝创业未成,中道卒。']

In [11]:
generate("在天下分为三国, 益州地区民力匮乏,这确实是国家危急存亡的时期啊。")

['天下为三国,益州民乏,此诚国家危亡之期也。']

In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session