### Environment Settting

In [1]:
import sys
!{sys.executable} -m pip install transformers datasets evaluate sacrebleu accelerate jiwer rouge_score sentencepiece protobuf

[0m

### Login to HuggingFace

In [2]:
import getpass
from huggingface_hub import login

IS_LOGGED_IN = False

try:
    print("--- Login into HuggingFace ---")
    print("Note: If no Token is entered, the system will run in Offline mode (no Push model).")

    token = getpass.getpass("Enter HuggingFace Token (Write): ")

    if token.strip() != "":
        login(token=token)
        IS_LOGGED_IN = True
        print("✅ Login successful!")
    else:
        print("⚠️ No token. Will NOT push model.")

except Exception as e:
    print(f"⚠️ Login bypassed. Error: {e}")
    IS_LOGGED_IN = False

--- Login into HuggingFace ---
Note: If no Token is entered, the system will run in Offline mode (no Push model).


Enter HuggingFace Token (Write):  ········


✅ Login successful!


### Load and process data

In [3]:
from datasets import load_dataset

DATASET_ID = "pqthinh232/vietnamese-correction-60k"

dataset = load_dataset(DATASET_ID)

print("Data structure:")
print(dataset)

print("\nSample train:", dataset['train'][0])

Data structure:
DatasetDict({
    train: Dataset({
        features: ['input', 'target'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['input', 'target'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['input', 'target'],
        num_rows: 5000
    })
})

Sample train: {'input': 'Cùng vói Le Peuple, tháng 7/1938, Dang Còng san lai cho lap', 'target': 'Cùng với Le Peuple, tháng 7/1938, Đảng Cộng sản lại cho lập'}


### Tokenizer

In [4]:
from transformers import AutoTokenizer

MODEL_CHECKPOINT = "vinai/bartpho-syllable"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

MAX_LENGTH = 256

def preprocess_function(examples):
    # Tokenize input (câu sai)
    model_inputs = tokenizer(
        examples["input"],
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        examples["target"],
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

### Load Model & Metrics

In [5]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import evaluate
import numpy as np
import re

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

metric_bleu = evaluate.load("sacrebleu")
metric_cer = evaluate.load("cer")
metric_wer = evaluate.load("wer")
metric_rouge = evaluate.load("rouge")

def postprocess_text(text):
    text = text.replace("_", " ")
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [postprocess_text(pred) for pred in decoded_preds]
    decoded_labels = [postprocess_text(label) for label in decoded_labels]

    decoded_labels_bleu = [[label] for label in decoded_labels]

    result_bleu = metric_bleu.compute(predictions=decoded_preds, references=decoded_labels_bleu)
    result_cer = metric_cer.compute(predictions=decoded_preds, references=decoded_labels)
    result_wer = metric_wer.compute(predictions=decoded_preds, references=decoded_labels)
    result_rouge = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "bleu": result_bleu["score"],
        "cer": result_cer,
        "wer": result_wer,
        "f1": result_rouge["rouge1"]
    }

### Training Arguments

In [6]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="./HCMUS_correction_60k_result",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=5,
    predict_with_generate=True,
    bf16=True,
    fp16=False,
    generation_max_length=256,
    push_to_hub=IS_LOGGED_IN,
    hub_model_id="HCMUS-vietnamese-correction-project" if IS_LOGGED_IN else None,
    report_to="none",
    overwrite_output_dir=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

### Training

In [7]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Start training...")
trainer.train()

  trainer = Seq2SeqTrainer(


Start training...


Epoch,Training Loss,Validation Loss,Bleu,Cer,Wer,F1
1,0.3907,0.033774,83.83871,0.03432,0.089208,0.962582
2,0.0355,0.023987,87.656546,0.024437,0.066872,0.976275
3,0.0278,0.022513,88.631138,0.021608,0.060511,0.980365
4,0.021,0.022032,88.889776,0.020036,0.058032,0.982052
5,0.0187,0.021304,89.357417,0.019577,0.056313,0.982604


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3910, training_loss=0.07430378604118172, metrics={'train_runtime': 2715.0133, 'train_samples_per_second': 92.081, 'train_steps_per_second': 1.44, 'total_flos': 1.35445610496e+17, 'train_loss': 0.07430378604118172, 'epoch': 5.0})

### Push fine-tuned model to HuggingFace

In [8]:
trainer.push_to_hub()

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/pqthinh232/HCMUS-vietnamese-correction-project/commit/7d25057872ca4cb4fbfc1e4cd6f96930acc43279', commit_message='End of training', commit_description='', oid='7d25057872ca4cb4fbfc1e4cd6f96930acc43279', pr_url=None, repo_url=RepoUrl('https://huggingface.co/pqthinh232/HCMUS-vietnamese-correction-project', endpoint='https://huggingface.co', repo_type='model', repo_id='pqthinh232/HCMUS-vietnamese-correction-project'), pr_revision=None, pr_num=None)

### Inference

In [10]:
from transformers import pipeline
MODEL_ID = "pqthinh232/HCMUS-vietnamese-correction-project"

corrector = pipeline("text2text-generation", model=MODEL_ID, device=0)

sentences = [
    "toi dang hoc chuyen nganh tri tue nhan tao tai truong dai hoc khoa hoc tu nhien",
    "nen kinh te viet nam dang dung truoc nhieu thach thuc trong thoi ky hoi nhap",
    "chinh phu luon co gang cai thien chat luong nen giao duc quoc gia",
    "nhieu sinh vien nam cuoi van chua xac dinh duoc huong di sau khi ra truong",
    "su phat trien nhanh chong cua cong nghe dang lam thay doi doi song xa hoi"
]

for sentence in sentences:
    pred = corrector(sentence, max_length=128)
    print(f"Input:  {sentence}")
    print(f"Output: {pred[0]['generated_text']}")
    print("-" * 30)

config.json:   0%|          | 0.00/910 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

dict.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Input:  toi dang hoc chuyen nganh tri tue nhan tao tai truong dai hoc khoa hoc tu nhien
Output: tôi đang học chuyên ngành tri nhân tạo tại trường đại học khoa học tư nhiên
------------------------------
Input:  nen kinh te viet nam dang dung truoc nhieu thach thuc trong thoi ky hoi nhap
Output: nen kinh tế Việt Nam đang dung trước nhiều thảm thực trong thời kỳ hội nhập
------------------------------


Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Input:  chinh phu luon co gang cai thien chat luong nen giao duc quoc gia
Output: chính phủ luôn cơ gắng cai thiện chặt lương nên giao dục quốc gia
------------------------------


Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Input:  nhieu sinh vien nam cuoi van chua xac dinh duoc huong di sau khi ra truong
Output: nhiều sinh viên nam cuộc vẫn chưa xác định được hướng đi sau khi ra trường.
------------------------------
Input:  su phat trien nhanh chong cua cong nghe dang lam thay doi doi song xa hoi
Output: sự phát triển nhanh chóng của công nghệ đang làm thay đổi đôi song xã hội
------------------------------
