# Train Vec4Gloss - defgen

In [1]:
## reference: https://huggingface.co/course/chapter7/4

In [2]:
%env WANDB_PROJECT=vec4gloss

env: WANDB_PROJECT=vec4gloss


In [3]:
# Enable IPython's autoreload extension in mode 2, which re-imports all modules
# before each cell runs, so edits to local source files take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2
import sys
# Add the sibling source directory to the import path (presumably where the
# `vec4gloss` package lives). The membership test keeps re-runs of this cell
# from appending duplicate entries.
if "../src" not in sys.path:
    sys.path.append("../src")

In [4]:
from vec4gloss import check_hashes
from vec4gloss import Vec4GlossModel
from pathlib import Path
from datasets import Dataset
from datetime import datetime
import numpy as np
from tqdm.auto import tqdm

## Data dependencies

```
(data) -> ../data/defgen_dataset_cwn/train/dataset.arrow 65a56d
20.10 -> ../data/models/vec4gloss-denoise-220628-1510/pytorch_model.bin 39138d 
```

In [5]:
# Directory of the denoising checkpoint that this notebook fine-tunes from.
denoised_model_dir = "../data/models/vec4gloss-denoise-220628-1510"
# Check the input files listed under "Data dependencies". Judging by the
# recorded output, check_hashes prints each path with a short hash prefix.
_ = check_hashes(
    [
        "../data/defgen_dataset_cwn/train/dataset.arrow",
        f"{denoised_model_dir}/pytorch_model.bin",
    ]
)

../data/defgen_dataset_cwn/train/dataset.arrow 65a56d
../data/models/vec4gloss-denoise-220628-1510/pytorch_model.bin 39138d


## Prepare dataset

In [6]:
import numpy as np
from transformers import MT5ForConditionalGeneration, MT5TokenizerFast
from transformers import DataCollatorForSeq2Seq
import datasets
from datasets import load_metric

In [7]:
# Load the definition-generation dataset saved on disk; the next cell shows it
# holds a 'train' and a 'test' split.
ds_defgen = datasets.load_from_disk("../data/defgen_dataset_cwn")

In [8]:
# Report the number of examples in each split.
print({k: len(v) for k, v in ds_defgen.items()})

{'train': 76969, 'test': 8553}


In [9]:
# Load the fast MT5 tokenizer from the denoising checkpoint directory, the same
# directory the model is loaded from further down.
tokenizer = MT5TokenizerFast.from_pretrained(denoised_model_dir)

### Eye-balling

In [10]:
# Peek at two raw training examples: the target word is wrapped in <...> inside
# `src`, and `tgt` holds the text to be generated.
ds_defgen["train"][10:12]

{'cwnid': ['07060501', '09309106'],
 'src': ['縣委書記也拿不出辦法來，只好建議各社將曬乾的紅薯藤子磨成<粉>，煮成糊糊，藉以維持生命。',
  '行萬里路也不是為了收集更多的繪畫素材，而是為了<開闊>胸襟。'],
 'tgt': ['Na。極細的小顆粒。', 'VHC。使心胸寬大。']}

## Preprocess

In [11]:
def get_marked_pos(text):
    """Locate the single ``<...>``-marked target word inside ``text``.

    Args:
        text: Sentence containing exactly one ``<`` and exactly one ``>``,
            with ``<`` before ``>`` and at least one character in between.

    Returns:
        Tuple ``(start, end)`` of character offsets such that
        ``text[start:end]`` is the marked word. ``end`` is the index of the
        closing ``>``.

    Raises:
        AssertionError: If a marker is missing or duplicated, if the markers
            are reversed, or if they enclose nothing.
    """
    assert text.count("<") == text.count(">") == 1
    start, end = text.index("<") + 1, text.index(">")
    # `start < end` rather than `start != end`: this also rejects reversed
    # markers such as "a>b<c", which previously slipped through and produced
    # an empty slice downstream.
    assert start < end
    return start, end

In [12]:
## eye-balling
print(ds_defgen["train"][10]["src"])
ds_defgen["train"][10]["src"][slice(*get_marked_pos(ds_defgen["train"][10]["src"]))]

縣委書記也拿不出辦法來，只好建議各社將曬乾的紅薯藤子磨成<粉>，煮成糊糊，藉以維持生命。


'粉'

In [13]:
# Token-length cap applied (with truncation) to both source and target text in
# preprocess_fn.
max_length = 256


def add_marked_pos(ex):
    """Attach the character span of the marked word to a single example.

    Args:
        ex: One dataset row; only its ``"src"`` string is read.

    Returns:
        dict with ``"decoder_start_markers"`` and ``"decoder_end_markers"``
        set to the character offsets returned by ``get_marked_pos``.
    """
    start_char, end_char = get_marked_pos(ex["src"])
    return {
        "decoder_start_markers": start_char,
        "decoder_end_markers": end_char,
    }

def preprocess_fn(batch):
    """Tokenize a batch and convert marker character offsets to token indices.

    Args:
        batch: Column-oriented batch providing ``"src"``, ``"tgt"``,
            ``"decoder_start_markers"`` and ``"decoder_end_markers"``; the two
            marker columns hold character offsets into the matching ``src``.

    Returns:
        dict containing every field produced by tokenizing ``src``, the two
        marker columns re-expressed as token indices, and ``"labels"`` holding
        the token ids of ``tgt``.
    """
    encoded_src = tokenizer(batch["src"], max_length=max_length, truncation=True)

    # NOTE(review): char_to_token presumably returns None when an offset has no
    # corresponding kept token (e.g. it was truncated away) -- confirm whether
    # such rows can occur with max_length=256.
    token_starts = []
    for row, char_idx in enumerate(batch["decoder_start_markers"]):
        token_starts.append(encoded_src.char_to_token(row, char_idx))

    token_ends = []
    for row, char_idx in enumerate(batch["decoder_end_markers"]):
        token_ends.append(encoded_src.char_to_token(row, char_idx))

    # Targets are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        encoded_tgt = tokenizer(batch["tgt"], max_length=max_length, truncation=True)

    features = {**encoded_src}
    features["decoder_start_markers"] = token_starts
    features["decoder_end_markers"] = token_ends
    features["labels"] = encoded_tgt["input_ids"]
    return features

In [14]:
drop_columns = ["cwnid", "src", "tgt"]
# Pass 1 (per example): record the character offsets of the marked word.
# Pass 2 (batched): tokenize, convert offsets to token indices, and drop the
# raw text columns.
# NOTE: this rebinds `ds_defgen`. Re-running the cell afterwards is expected to
# fail because add_marked_pos reads the "src" column that pass 2 removed.
ds_defgen = (ds_defgen.map(add_marked_pos)
             .map(preprocess_fn, batched=True, remove_columns=drop_columns))

Loading cached processed dataset at ../data/defgen_dataset_cwn/train/cache-edc9a2f8200950ba.arrow
Loading cached processed dataset at ../data/defgen_dataset_cwn/test/cache-cdf34f4111063fd7.arrow
Loading cached processed dataset at ../data/defgen_dataset_cwn/train/cache-d6bd5dcda216cbbb.arrow
Loading cached processed dataset at ../data/defgen_dataset_cwn/test/cache-4a58e3f489841b2f.arrow


### Eye-balling

In [15]:
# Show the tokenized source and the tokenized target of one training example
# as space-separated tokens.
print(" ".join(tokenizer.convert_ids_to_tokens(ds_defgen["train"][14]["input_ids"])))
print(" ".join(tokenizer.convert_ids_to_tokens(ds_defgen["train"][14]["labels"])))

▁ 募 款 委員 曾 率 工作 同 仁 , 分 赴 國 內 、 外 各地 校 友 會 舉 辦 < 勸 募 > 說 明 會 , 帶 動 募 款 的 風 氣 。 </s>
▁nom , VD 。 以 勸 說 的方式 希望 能 廣 泛 收集 到 他人 財 物 。 </s>


In [16]:
# Check that the token-level markers slice out exactly the tokens of the marked
# word for the same example.
ent = ds_defgen["train"][14]
xids = ent["input_ids"]
tokenizer.convert_ids_to_tokens(xids[ent["decoder_start_markers"]:ent["decoder_end_markers"]])

['勸', '募']

## Define BLEU metrics

In [17]:
# sacreBLEU metric object; compute_metrics below calls it with tokenize="char".
metric = load_metric("sacrebleu")

In [18]:
# Sanity check with character-level tokenization: the prediction and the
# reference are four characters long and differ only in the last character.
metric.compute(predictions=["中文數字"], references=[["中文數學"]], tokenize="char")

{'score': 59.460355750136046,
 'counts': [3, 2, 1, 0],
 'totals': [4, 3, 2, 1],
 'precisions': [75.0, 66.66666666666667, 50.0, 50.0],
 'bp': 1.0,
 'sys_len': 4,
 'ref_len': 4}

In [19]:
def compute_metrics(eval_preds):
    """Compute corpus-level character BLEU between predictions and labels.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            be a tuple, in which case only its first element is used.

    Returns:
        dict with a single ``"bleu"`` entry holding the sacreBLEU score.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is an ignore/padding marker, not a token id, so it cannot be
    # decoded. Swap it for the pad token in the predictions as well as in the
    # labels; previously only the labels were sanitised.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: trim whitespace and wrap each reference in
    # a list, since the metric takes a list of references per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            tokenize="char")
    return {"bleu": result["score"]}

## Trainer

In [20]:
import wandb
# wandb.login()
# Minute-resolution timestamp that makes the W&B run name unique.
# NOTE: `timestamp` is assigned again in the training-arguments cell, so the
# W&B run name and the Trainer run_name can differ if the minute rolls over.
timestamp = datetime.now().strftime("%y%m%d-%H%M")
wandb.init(project="vec4gloss", 
           name=f"vec4gloss-{timestamp}",
           notes="vec4gloss, based on denoising, five epochs"
       )

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mseantyh[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [21]:
# Alternative starting point kept for reference: the stock mT5 base checkpoint.
# model = Vec4GlossModel.from_pretrained("google/mt5-base").to("cuda")
# Start from the denoising checkpoint and move the model to the GPU.
model = Vec4GlossModel.from_pretrained(denoised_model_dir).to("cuda")
# Pad every batch to its longest sequence. NOTE(review): passing `model`
# presumably lets the collator derive decoder_input_ids from labels -- confirm.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="longest")

In [22]:
# Prefer the dedicated storage mount for checkpoints when it is present;
# otherwise fall back to a directory relative to the notebook.
out_dir = Path("/mnt/md0/seantyh/vec4gloss")
# Normalise to `str` in both branches. Previously this was a Path when the
# mount existed and a str otherwise, so the type handed to the training
# arguments depended on the machine.
out_dir = str(out_dir) if out_dir.exists() else "vec4gloss"
print(out_dir)

/mnt/md0/seantyh/vec4gloss


In [23]:
from transformers import Seq2SeqTrainingArguments
# Fresh timestamp for this training run. It replaces the value assigned in the
# wandb.init cell and is later embedded in the saved model's directory name.
timestamp = datetime.now().strftime("%y%m%d-%H%M")

# Evaluation runs on a step schedule while checkpoints are written per epoch.
args = Seq2SeqTrainingArguments(
    out_dir,
    evaluation_strategy="steps",    
    save_strategy="epoch",    
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_steps=100,  # currently 100 (earlier note suggested 10 for debug, else 500)
    eval_steps=3000,
    num_train_epochs=5,
    # report_to="wandb",
    run_name=f"vec4gloss-{timestamp}",
    # NOTE(review): the "two additional parameters" are presumably the
    # decoder_start_markers / decoder_end_markers columns -- confirm.
    predict_with_generate=False,    # can't do it here because of two additional parameters
)

In [24]:
# The full preprocessed splits are used for the real run.
train_ds = ds_defgen["train"]
test_ds = ds_defgen["test"]
# Uncomment the two lines below to smoke-test the pipeline on a small subset.
# train_ds = train_ds.select(range(100))
# test_ds = test_ds.select(range(200))

In [25]:
from transformers import Seq2SeqTrainer

# No `compute_metrics` callback is passed here, so the BLEU function defined
# earlier is not used during training; the recorded progress table lists only
# training and validation loss.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,       
)

## The tokenizers fork/parallelism message printed by the next cell can be ignored:
## the tokenizer is passed only to check tokenization, so parallelism does not matter here.

In [None]:
# Run fine-tuning with the arguments configured above (5 epochs over the
# training split).
trainer.train()

***** Running training *****
  Num examples = 76969
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 48110
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss


## Save model

In [28]:
# Display the timestamp that the next cell embeds in the saved model's path.
timestamp

'220628-1546'

In [None]:
# Save the fine-tuned model under a timestamped directory next to the other
# model checkpoints.
trainer.save_model(f"../data/models/vec4gloss-defgen-{timestamp}")

## Sandboxing

In [28]:
import torch
torch.manual_seed(12345)
# NOTE: this rebinds `model` to a freshly loaded "google/mt5-small" checkpoint,
# replacing the model trained earlier in this kernel.
# NOTE(review): the recorded outputs in the sandbox cells below look like a
# fine-tuned model, so they were presumably produced with a different `model`
# binding (out-of-order execution) -- confirm before relying on them.
model = Vec4GlossModel.from_pretrained("google/mt5-small").to("cuda")
# Rebuild the collator so that it references the new `model`.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="longest")

loading configuration file https://huggingface.co/google/mt5-small/resolve/main/config.json from cache at /home/seantyh/.cache/huggingface/transformers/97693496c1a0cae463bd18428187f9e9924d2dfbadaa46e4d468634a0fc95a41.dadce13f8f85f4825168354a04675d4b177749f8f11b167e87676777695d4fe4
Model config MT5Config {
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.20.1",
  "use_cache": true,
  "vocab_size"

In [29]:
from torch.utils.data import DataLoader
import torch.optim as optim
# NOTE(review): `optimizer` is not used by any cell visible in this notebook.
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
train_ds = ds_defgen["train"]
test_ds = ds_defgen["test"]
# Dedicated, explicitly seeded generator so the shuffled batch order is
# repeatable. Despite the `g_cuda` name, no device is passed to
# torch.Generator() here.
g_cuda = torch.Generator()
g_cuda.manual_seed(211321)
loader = DataLoader(train_ds, batch_size=2, collate_fn=data_collator, shuffle=True, generator=g_cuda)
from itertools import islice
# Materialise only the first 10 batches for inspection.
batches = list(islice(loader, 10))

In [30]:
# Decode the labels of the first batch. Negative label ids (presumably the
# collator's ignore padding) are replaced by 0 first; they show up as <pad> in
# the recorded output.
tokenizer.batch_decode(torch.where(batches[0]["labels"] < 0, 0, batches[0]["labels"]))

['VC。模仿或照原樣重製他人的創意當作自己的。</s>',
 'Nc。美術館的建築物及建築物所在的位置。</s><pad><pad><pad><pad><pad>']

In [31]:
# Inspect the source token ids of the first example in the first batch; the
# trailing zeros in the recorded output are padding up to the batch's longest
# sequence.
batches[0]["input_ids"][0]

tensor([   259,  13524,   3139,  79761,   1083, 170591, 217109, 159356,  16160,
          4779,    261,   5991,  29693,  38401,  47694,  16160,   4779,  30407,
         24134,   2709, 167138,    669, 233119,  51418,  17481,  10559,   3355,
         62746,    306,      1,      0,      0,      0,      0,      0])

In [35]:
import torch
from itertools import islice
from torch.utils.data import DataLoader
# Rebuild the seeded loader exactly as in the earlier sandbox cell, so the same
# first batches should be drawn.
g_cuda = torch.Generator()
g_cuda.manual_seed(211321)
loader = DataLoader(train_ds, batch_size=2, collate_fn=data_collator, shuffle=True, generator=g_cuda)
batches = list(islice(loader, 10))
# Forward pass without gradient tracking. The batch still carries its labels,
# so the per-position argmax below is presumably a teacher-forced prediction
# rather than free generation -- confirm against Vec4GlossModel.
with torch.no_grad():    
    batch = batches[0].to("cuda")
    out = model(**batch)
tokenizer.batch_decode(out.logits.argmax(2))

['VC。比仿特定辨備來的的複的的文件意。作。。</s>', 'Nc。美術館的建築物及建築物所在的位置。</s>會 N N N N']

In [43]:
import torch
# Unshuffled loader over the test split; only its first batch is used.
test_loader = DataLoader(test_ds, batch_size=8, collate_fn=data_collator)
# `batch` is deliberately left bound at notebook scope: the generation and
# label-decoding cells below reuse it.
with torch.no_grad():
    batch = next(iter(test_loader)).to("cuda")
    out = model(**batch)
# Per-position argmax over the vocabulary axis of the logits, decoded to text.
tokenizer.batch_decode(out.logits.argmax(2))

['VE。回自己的的要求。</s>。 VE VE VE VE VE VE VE VE VE VE VE VE VE',
 'D。表強調平常的程度。</s></s> D D D D D D D D D D D',
 'VE,nom。依據特定標準確定別並做出後</s>德 VE VE VE VE VE VE',
 'Nb。治理伊的政府。</s></s> N N N N N N N N N N N',
 'VH。形容不意見或不同的不同的話語。。</s>。  VB VA  VA',
 'A。VH。表超過述對象的圍大,數量多。</s></s>  ',
 'VK。nom。受外界刺激而而緒反應。</s>引 VK VK VK VK VK',
 'VA。特定述物體因礙其他述物象移動使其無法自由移動。</s>']

In [50]:
# Drop the target-side tensors so that generate() receives only the remaining
# inputs of the test batch, then decode whatever the model generates.
gen_batch = {
    key: tensor
    for key, tensor in batch.items()
    if key != "labels" and key != "decoder_input_ids"
}
tokenizer.batch_decode(model.generate(**gen_batch))

['<pad> VE。同意他人的要求。</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<pad> VH。形容程度高,帶有明確意圖。</s><pad><pad><pad><pad>',
 '<pad> VE,nom。做出後述訊息。</s><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<pad> Nb。治理伊拉克的政府。</s><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<pad> D。表不同的地方互相說表聲。</s><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<pad> VH,D。形容在短時間內會造成多種改變的。</s>',
 '<pad> VK。因感知而內心感觸而產生特定意念。</s><pad>',
 '<pad> VC。以手或手持物碰到特定對象。</s><pad><pad><pad><pad>']

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [48]:
# Decode the reference labels of the same test batch for side-by-side
# comparison with the outputs above; negative label ids are mapped to 0, which
# shows up as <pad> in the recorded output.
tokenizer.batch_decode(torch.where(batch["labels"]>=0, batch["labels"], 0))

['VE。同意他人的要求。</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 'D。表超過平常的程度。</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 'VE,nom。依照特定標準鑑別並確定。</s><pad><pad><pad><pad><pad><pad><pad>',
 'Nb。治理伊朗的政府。</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 'VH。形容不同的人說出來的話內容一致。</s><pad><pad><pad><pad><pad><pad>',
 'A,D。表後述對象範圍大或數量多。</s><pad><pad><pad>',
 'VK,nom。受外界刺激引起情緒反應。</s><pad><pad><pad><pad><pad><pad>',
 'VA。前述物體阻礙後述對象,使其無法自由行動。</s>']