# Train Vec4Gloss - defgen

In [1]:
## reference: https://huggingface.co/course/chapter7/4

In [2]:
%env WANDB_PROJECT=vec4gloss

env: WANDB_PROJECT=vec4gloss


In [3]:
%load_ext autoreload
%autoreload 2
import sys
# Make the project sources under ../src importable. The membership check
# keeps re-runs of this cell from appending the same entry again.
if "../src" not in sys.path:
    sys.path.append("../src")

In [4]:
from vec4gloss import check_hashes
from vec4gloss import Vec4GlossModel
from pathlib import Path
from datasets import Dataset
from datetime import datetime
import numpy as np
from tqdm.auto import tqdm

## Data dependencies

```
(data) -> ../data/defgen_dataset_cwn/train/dataset.arrow 65a56d
20.10 -> ../data/models/vec4gloss-denoise-220628-1510/pytorch_model.bin 39138d 
```

In [5]:
# Directory of the denoising-stage model that this notebook continues from.
denoised_model_dir = "../data/models/vec4gloss-denoise-220628-1510"
# Check the input files listed under "Data dependencies"; the return value
# of check_hashes is not used afterwards, hence the throwaway name.
_ = check_hashes([
    "../data/defgen_dataset_cwn/train/dataset.arrow",
    denoised_model_dir + "/pytorch_model.bin"
])

../data/defgen_dataset_cwn/train/dataset.arrow 65a56d
../data/models/vec4gloss-denoise-220628-1510/pytorch_model.bin 39138d


## Prepare dataset

In [6]:
import numpy as np
from transformers import MT5ForConditionalGeneration, MT5TokenizerFast
from transformers import DataCollatorForSeq2Seq
import datasets
from datasets import load_metric

In [7]:
# Load the definition-generation dataset (all splits) previously saved to disk.
ds_defgen = datasets.load_from_disk("../data/defgen_dataset_cwn")

In [8]:
# Number of examples in each split of the loaded dataset.
print({split_name: len(split_data) for split_name, split_data in ds_defgen.items()})

{'train': 76969, 'test': 8553}


In [9]:
# Reuse the tokenizer files stored in the denoising-stage model directory.
tokenizer = MT5TokenizerFast.from_pretrained(denoised_model_dir)

### Eye-balling

In [10]:
# Peek at two raw training examples (columns: cwnid, src, tgt).
ds_defgen["train"][10:12]

{'cwnid': ['07060501', '09309106'],
 'src': ['縣委書記也拿不出辦法來，只好建議各社將曬乾的紅薯藤子磨成<粉>，煮成糊糊，藉以維持生命。',
  '行萬里路也不是為了收集更多的繪畫素材，而是為了<開闊>胸襟。'],
 'tgt': ['Na。極細的小顆粒。', 'VHC。使心胸寬大。']}

## Preprocess

In [11]:
def get_marked_pos(text):
    """Locate the single ``<...>``-marked span inside ``text``.

    Parameters
    ----------
    text : str
        A string containing exactly one ``<`` and exactly one ``>``, with
        ``<`` appearing first and at least one character between them.

    Returns
    -------
    tuple of (int, int)
        ``(start, end)`` character offsets such that ``text[start:end]`` is
        the marked content, excluding the angle brackets themselves.

    Raises
    ------
    AssertionError
        If a marker is missing or duplicated, if ``>`` precedes ``<``, or if
        nothing lies between the two markers.
    """
    assert text.count("<") == text.count(">") == 1, \
        "expected exactly one '<' and one '>' marker"
    s, e = text.index("<") + 1, text.index(">")
    # `s < e` (instead of `s != e`) also rejects a '>' that comes before '<',
    # which would otherwise produce a reversed span and an empty slice.
    assert s < e, "marked span is empty or '>' precedes '<'"
    return s, e

In [12]:
## eye-balling
print(ds_defgen["train"][10]["src"])
ds_defgen["train"][10]["src"][slice(*get_marked_pos(ds_defgen["train"][10]["src"]))]

縣委書記也拿不出辦法來，只好建議各社將曬乾的紅薯藤子磨成<粉>，煮成糊糊，藉以維持生命。


'粉'

In [13]:
# Truncation limit handed to the tokenizer for both source and target text.
max_length = 256
def add_marked_pos(ex):
    """Return the character offsets of the marked span in ``ex["src"]``.

    The offsets come from ``get_marked_pos`` and are returned as two new
    columns, ``decoder_start_markers`` and ``decoder_end_markers``.
    """
    start_char, end_char = get_marked_pos(ex["src"])
    return dict(decoder_start_markers=start_char, decoder_end_markers=end_char)

def preprocess_fn(batch):
    """Tokenize a batch and re-express the marker offsets as token indices.

    Reads ``batch["src"]``, ``batch["tgt"]``, ``batch["decoder_start_markers"]``
    and ``batch["decoder_end_markers"]`` (character offsets into each ``src``
    string), and relies on the module-level ``tokenizer`` and ``max_length``.

    Returns a dict holding every field of the source encoding, the two marker
    columns converted with ``char_to_token``, and ``labels`` set to the
    ``input_ids`` of the tokenized targets.
    """
    encoded_src = tokenizer(batch["src"], truncation=True, max_length=max_length)

    # Map each character offset to a token index within its own batch row.
    # NOTE(review): char_to_token presumably yields None when the character
    # was cut off by truncation — confirm that downstream code tolerates it.
    token_starts = []
    for row, char_pos in enumerate(batch["decoder_start_markers"]):
        token_starts.append(encoded_src.char_to_token(row, char_pos))
    token_ends = []
    for row, char_pos in enumerate(batch["decoder_end_markers"]):
        token_ends.append(encoded_src.char_to_token(row, char_pos))

    # Targets are tokenized inside the tokenizer's target-mode context.
    with tokenizer.as_target_tokenizer():
        encoded_tgt = tokenizer(batch["tgt"], truncation=True, max_length=max_length)

    features = {**encoded_src}
    features["decoder_start_markers"] = token_starts
    features["decoder_end_markers"] = token_ends
    features["labels"] = encoded_tgt["input_ids"]
    return features

In [14]:
# Raw columns that are no longer needed once the examples are tokenized.
drop_columns = ["cwnid", "src", "tgt"]
# First attach the character-level marker offsets per example, then tokenize
# in batches and drop the raw columns.
# NOTE(review): this rebinds `ds_defgen`, so re-running the cell on the
# already-processed dataset will fail because "src" has been removed.
ds_defgen = (ds_defgen.map(add_marked_pos)
             .map(preprocess_fn, batched=True, remove_columns=drop_columns))

Loading cached processed dataset at ../data/defgen_dataset_cwn/train/cache-edc9a2f8200950ba.arrow
Loading cached processed dataset at ../data/defgen_dataset_cwn/test/cache-cdf34f4111063fd7.arrow
Loading cached processed dataset at ../data/defgen_dataset_cwn/train/cache-d6bd5dcda216cbbb.arrow
Loading cached processed dataset at ../data/defgen_dataset_cwn/test/cache-4a58e3f489841b2f.arrow


### Eye-balling

In [15]:
# Show the tokenized source (input_ids) and target (labels) of one training example.
print(" ".join(tokenizer.convert_ids_to_tokens(ds_defgen["train"][14]["input_ids"])))
print(" ".join(tokenizer.convert_ids_to_tokens(ds_defgen["train"][14]["labels"])))

▁ 募 款 委員 曾 率 工作 同 仁 , 分 赴 國 內 、 外 各地 校 友 會 舉 辦 < 勸 募 > 說 明 會 , 帶 動 募 款 的 風 氣 。 </s>
▁nom , VD 。 以 勸 說 的方式 希望 能 廣 泛 收集 到 他人 財 物 。 </s>


In [16]:
# Check the token-level markers: slicing input_ids with them should give back
# exactly the tokens that sit between '<' and '>' in the source.
ent = ds_defgen["train"][14]
xids = ent["input_ids"]
tokenizer.convert_ids_to_tokens(xids[ent["decoder_start_markers"]:ent["decoder_end_markers"]])

['勸', '募']

## Define BLEU metrics

In [17]:
# sacreBLEU metric object; used by compute_metrics below.
metric = load_metric("sacrebleu")

In [18]:
# Sanity check with character-level tokenization: the two strings differ only
# in their last character.
metric.compute(predictions=["中文數字"], references=[["中文數學"]], tokenize="char")

{'score': 59.460355750136046,
 'counts': [3, 2, 1, 0],
 'totals': [4, 3, 2, 1],
 'precisions': [75.0, 66.66666666666667, 50.0, 50.0],
 'bp': 1.0,
 'sys_len': 4,
 'ref_len': 4}

In [19]:
def compute_metrics(eval_preds):
    """Compute character-tokenized BLEU for one evaluation pass.

    Parameters
    ----------
    eval_preds : tuple
        ``(preds, labels)``. ``preds`` holds predicted token ids, or a tuple
        whose first element does. ``labels`` holds the reference token ids,
        with ``-100`` marking positions to ignore.

    Returns
    -------
    dict
        ``{"bleu": score}``, where ``score`` is the ``"score"`` field returned
        by the module-level sacreBLEU ``metric``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a vocabulary id and cannot be decoded. Predictions may carry
    # it as padding too (the upstream transformers example scripts guard
    # against this), so give them the same replacement as the labels. Inputs
    # that are not arrays are passed through untouched.
    if isinstance(preds, np.ndarray):
        preds = np.where(preds != -100, preds, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: sacreBLEU expects a list of references
    # per prediction, hence the single-element lists.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            tokenize="char")
    return {"bleu": result["score"]}

## Trainer

In [20]:
import wandb
# wandb.login()
# Timestamp (yymmdd-HHMM) that makes the run name unique.
timestamp = datetime.now().strftime("%y%m%d-%H%M")
# Start the Weights & Biases run for this training session.
wandb.init(project="vec4gloss", 
           name=f"vec4gloss-{timestamp}",
           notes="vec4gloss, based on denoising, ten epochs, continued from 20220629-0713"
       )

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mseantyh[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [21]:
# model = Vec4GlossModel.from_pretrained("google/mt5-base").to("cuda")
# Initialise from the denoising-stage model directory and move the model to the GPU.
model = Vec4GlossModel.from_pretrained(denoised_model_dir).to("cuda")
# Collator that pads every batch to the longest sequence it contains.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="longest")

In [22]:
# Preferred location for training checkpoints; fall back to a directory
# relative to the notebook when that location does not exist.
out_dir = Path("/mnt/md0/seantyh/vec4gloss")
if not out_dir.exists():
    # Keep the fallback a Path as well: the training cell builds the
    # checkpoint location with `out_dir / "checkpoint-..."`, which raises
    # TypeError when out_dir is a plain str.
    out_dir = Path("vec4gloss")
print(out_dir)

/mnt/md0/seantyh/vec4gloss


In [23]:
from transformers import Seq2SeqTrainingArguments
# NOTE(review): `timestamp` is re-assigned here (it is first set in the
# wandb.init cell), so `run_name` below and the saved-model directory name
# can differ from the name given to wandb.init.
timestamp = datetime.now().strftime("%y%m%d-%H%M")

# Training configuration: evaluate every `eval_steps` optimizer steps, write a
# checkpoint once per epoch, and log every `logging_steps` steps.
args = Seq2SeqTrainingArguments(
    out_dir,
    evaluation_strategy="steps",    
    save_strategy="epoch",    
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_steps=100,  # NOTE(review): an earlier note here read "10 for debug, else 500"; the value in use is 100
    eval_steps=3000,
    num_train_epochs=10,
    # report_to="wandb",
    run_name=f"vec4gloss-{timestamp}",
    predict_with_generate=False,    # can't do it here because of two additional parameters
)

In [24]:
# Splits handed to the trainer. The commented-out lines below restrict both
# splits to a small subset for a quick debugging run.
train_ds = ds_defgen["train"]
test_ds = ds_defgen["test"]
# train_ds = train_ds.select(range(100))
# test_ds = test_ds.select(range(200))

In [25]:
from transformers import Seq2SeqTrainer

# NOTE(review): `compute_metrics` (the BLEU function defined earlier) is not
# passed here, so evaluation reports the validation loss only. This matches
# `predict_with_generate=False` in the training arguments.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,       
)

## Ignore the tokenizers fork/parallelism message that follows.
## The tokenizer is only needed here to check the tokenization, so its parallelism doesn't matter.

In [27]:
# trainer.train()
# Resume from the checkpoint saved at global step 57732 instead of starting
# from scratch (use the commented-out call above for a fresh run).
# NOTE(review): the checkpoint name is hard-coded, and `out_dir` must be a
# pathlib.Path for the `/` join to work.
trainer.train(out_dir / "checkpoint-57732")

Loading model from /mnt/md0/seantyh/vec4gloss/checkpoint-57732.
***** Running training *****
  Num examples = 76969
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 96220
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 6
  Continuing training from global step 57732
  Will skip the first 6 epochs then the first 0 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


0it [00:00, ?it/s]

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
60000,0.7387,0.717863
63000,0.78,0.69467
66000,0.7857,0.673919
69000,0.6759,0.661424
72000,0.7346,0.646968
75000,0.74,0.632612
78000,0.6406,0.625972
81000,0.6646,0.614901
84000,0.6357,0.606401
87000,0.6206,0.6009


***** Running Evaluation *****
  Num examples = 8553
  Batch size = 16
***** Running Evaluation *****
  Num examples = 8553
  Batch size = 16
***** Running Evaluation *****
  Num examples = 8553
  Batch size = 16
Saving model checkpoint to /mnt/md0/seantyh/vec4gloss/checkpoint-67354
Configuration saved in /mnt/md0/seantyh/vec4gloss/checkpoint-67354/config.json
Model weights saved in /mnt/md0/seantyh/vec4gloss/checkpoint-67354/pytorch_model.bin
tokenizer config file saved in /mnt/md0/seantyh/vec4gloss/checkpoint-67354/tokenizer_config.json
Special tokens file saved in /mnt/md0/seantyh/vec4gloss/checkpoint-67354/special_tokens_map.json
Copy vocab file to /mnt/md0/seantyh/vec4gloss/checkpoint-67354/spiece.model
***** Running Evaluation *****
  Num examples = 8553
  Batch size = 16
***** Running Evaluation *****
  Num examples = 8553
  Batch size = 16
***** Running Evaluation *****
  Num examples = 8553
  Batch size = 16
Saving model checkpoint to /mnt/md0/seantyh/vec4gloss/checkpoint-7697

TrainOutput(global_step=96220, training_loss=0.2798700779942806, metrics={'train_runtime': 6236.7769, 'train_samples_per_second': 123.412, 'train_steps_per_second': 15.428, 'total_flos': 9.33999560372736e+16, 'train_loss': 0.2798700779942806, 'epoch': 10.0})

## Save model

In [28]:
# Show the timestamp that names the saved model directory in the next cell.
timestamp

'220629-1250'

In [29]:
# Export the trained model under ../data/models, tagged with the current `timestamp`.
trainer.save_model(f"../data/models/vec4gloss-defgen-{timestamp}")

Saving model checkpoint to ../data/models/vec4gloss-defgen-220629-1250
Configuration saved in ../data/models/vec4gloss-defgen-220629-1250/config.json
Model weights saved in ../data/models/vec4gloss-defgen-220629-1250/pytorch_model.bin
tokenizer config file saved in ../data/models/vec4gloss-defgen-220629-1250/tokenizer_config.json
Special tokens file saved in ../data/models/vec4gloss-defgen-220629-1250/special_tokens_map.json
Copy vocab file to ../data/models/vec4gloss-defgen-220629-1250/spiece.model
