# Train Vec4Gloss

In [None]:
## reference: https://huggingface.co/course/chapter7/4

In [2]:
%load_ext autoreload
%autoreload 2
import sys
if "../src" not in sys.path:
    sys.path.append("../src")

In [3]:
from vec4gloss import check_hashes
from pathlib import Path
import numpy as np
from tqdm.auto import tqdm

## Data dependencies

```
../data/denoising_dataset_cwn/train/dataset.arrow a7ef3e
../data/defgen_dataset_cwn/train/dataset.arrow ea55fd
```

In [4]:
_ = check_hashes([
    "../data/denoising_dataset_cwn/train/dataset.arrow",
    "../data/defgen_dataset_cwn/train/dataset.arrow",
])

../data/denoising_dataset_cwn/train/dataset.arrow a7ef3e
../data/defgen_dataset_cwn/train/dataset.arrow ea55fd


## Prepare dataset

In [5]:
import numpy as np
from transformers import MT5ForConditionalGeneration, MT5TokenizerFast
from transformers import DataCollatorForSeq2Seq
import datasets
from datasets import load_metric

In [9]:
ds_denoise = datasets.load_from_disk("../data/denoising_dataset_cwn/")
ds_defgen = datasets.load_from_disk("../data/defgen_dataset_cwn")

FileNotFoundError: [Errno 2] No such file or directory: '../data/defgen_dataset_cwn/state.json'

In [None]:
tokenizer = MT5TokenizerFast.from_pretrained("google/mt5-base")
ds = datasets.load_from_disk("cwn_seq2seq_charlie_ds")

Downloading:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/702 [00:00<?, ?B/s]

In [None]:
max_length = 256
def preprocess_fn(batch):    
    src_batch = tokenizer(batch["src"], 
                          max_length=max_length, truncation=True)
    with tokenizer.as_target_tokenizer():
      tgt_batch = tokenizer(batch["tgt"],
                            max_length=max_length, truncation=True)    
    return {
        **src_batch, "labels": tgt_batch["input_ids"]
    }

In [None]:
tokenized_ds = ds.map(preprocess_fn, batched=True, remove_columns=ds["train"].column_names)

  0%|          | 0/95 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

In [None]:
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base").to("cuda")
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="longest")

Downloading:   0%|          | 0.00/2.17G [00:00<?, ?B/s]

## Define BLEU metrics

In [None]:
metric = load_metric("sacrebleu")

Downloading builder script:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

In [None]:
metric.compute(predictions=["中文數字"], references=[["中文數學"]], tokenize="char")

{'bp': 1.0,
 'counts': [3, 2, 1, 0],
 'precisions': [75.0, 66.66666666666667, 50.0, 50.0],
 'ref_len': 4,
 'score': 59.460355750136046,
 'sys_len': 4,
 'totals': [4, 3, 2, 1]}

In [None]:
def compute_metrics(eval_preds):
  preds, labels = eval_preds
  if isinstance(preds, tuple):
    preds = preds[0]
  
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  # Replace -100s in the labels as we can't decode them
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Some simple post-processing
  decoded_preds = [pred.strip() for pred in decoded_preds]
  decoded_labels = [[label.strip()] for label in decoded_labels]

  result = metric.compute(predictions=decoded_preds, references=decoded_labels, 
                          tokenize="char")
  return {"bleu": result["score"]}

## Trainer

In [None]:
import wandb
wandb.login()

In [None]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"cwn_seq2seq_cherry",
    evaluation_strategy="steps",
    save_strategy="no",    
    learning_rate=1e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    logging_steps=500,
    save_total_limit=3,
    num_train_epochs=3,
    report_to="wandb",
    run_name="cwn-seq2seq-cherry",
    predict_with_generate=True,    
)

In [None]:
train_ds = tokenized_ds["train"]
# test_ds = tokenized_ds["test"]
# train_ds = tokenized_ds["train"].select(range(100))
tokenized_ds["test"].shuffle(seed=6532)
test_ds = tokenized_ds["test"].select(range(200))

In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 94727
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 142092
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mseantyh[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss,Validation Loss,Bleu
500,5.0087,3.651752,2.780459
1000,3.0202,3.388342,4.237818
1500,2.6929,3.200469,4.463003
2000,2.6395,3.120623,5.144895
2500,2.5499,3.0514,8.777265
3000,2.4941,2.923128,8.31088
3500,2.3227,2.885219,9.023786
4000,2.4713,2.839002,9.92761
4500,2.355,2.782748,9.815332
5000,2.2109,2.79622,10.963444


***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  N

Step,Training Loss,Validation Loss,Bleu
500,5.0087,3.651752,2.780459
1000,3.0202,3.388342,4.237818
1500,2.6929,3.200469,4.463003
2000,2.6395,3.120623,5.144895
2500,2.5499,3.0514,8.777265
3000,2.4941,2.923128,8.31088
3500,2.3227,2.885219,9.023786
4000,2.4713,2.839002,9.92761
4500,2.355,2.782748,9.815332
5000,2.2109,2.79622,10.963444


***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  Num examples = 200
  Batch size = 2
***** Running Evaluation *****
  N

TrainOutput(global_step=142092, training_loss=1.2878654471282147, metrics={'train_runtime': 35330.408, 'train_samples_per_second': 8.044, 'train_steps_per_second': 4.022, 'total_flos': 2.558515034110464e+16, 'train_loss': 1.2878654471282147, 'epoch': 3.0})

In [None]:
!rm -rf cwn-seq2seq-cherry

In [None]:
trainer.save_model("cwn-seq2seq-cherry")

Saving model checkpoint to cwn-seq2seq-cherry
Configuration saved in cwn-seq2seq-cherry/config.json
Model weights saved in cwn-seq2seq-cherry/pytorch_model.bin
tokenizer config file saved in cwn-seq2seq-cherry/tokenizer_config.json
Special tokens file saved in cwn-seq2seq-cherry/special_tokens_map.json
Copy vocab file to cwn-seq2seq-cherry/spiece.model


In [None]:
!gsutil cp -r ./cwn-seq2seq-cherry gs://langon-us

## Text generation

In [None]:
wandb.finish()

In [None]:
from transformers import pipeline
text2text_gen = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0)
text2text_gen("唉唷。<唉唷>，好啦！你不要老是學那個電視上啦！")

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fdb3eca8810>> (for pre_run_cell):


Exception: ignored

[{'generated_text': 'I。表不耐煩的語氣。'}]

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fdb3eca8810>> (for post_run_cell):


Exception: ignored

In [None]:
text2text_gen("輪。雙方展開第4<輪>談判。")

In [None]:
text2text_gen("氣候暖化。會議中對<氣候暖化>議題交換意見。")

In [None]:
text2text_gen("終於。他們<終於>玩起來了。")

[{'generated_text': 'D。表從時間參考點起,直到最後的時間。'}]

In [None]:
text2text_gen("氣候暖化。會議中對<氣候暖化>議題交換意見。")