In [2]:
import pandas as pd 
import numpy as np
import torch

data = pd.read_csv("/kaggle/input/luat123/error_dataset.csv")
data.head()

Unnamed: 0,ground_truth,corrupted,mask
0,Đại diện CLB Hà Nội cho biết thủ môn Bùi Tiến ...,Đại diện CLB Hà Nội cho BIẾT thủ môn Bùi Tiến ...,100000000000100000
1,Đầu mùa giải vừa qua Bùi Tiến Dũng chính thức ...,Đầu mùa giải vừa qua Bùi Tiến Dũng cníhh thức ...,10000000000000100
2,Thế nhưng tính đến nay Bùi Tiến Dũng vẫn chưa ...,Thế nhưng tính đến nay Bùi Tiến Dũng vẫn chưa ...,0
3,Dù trước đó anh được HLV Park Hang Seo sử dụng...,Dù trước đó anh được HLV Park Hang So sử dụng ...,1000000000000000000
4,Bùi Tiến Dũng gặp chấn thương ở cổ tay nhưng c...,Bùi Tiến Dũng gặp chấn thương ở cổ tay nhưng c...,10000


In [3]:
# Dataset trả về input_ids (normal thì ids vẫn như thế, error thì ids =1), 
#mask, label (normal thì là -100, error thì là ids của ground truth)

In [4]:
def preprocess_data(row, tokenizer, max_len=64):
    corrupted_words = row["corrupted"].split()
    ground_truth_words = row["ground_truth"].split()
    mask_flags = list(map(int, row["mask"].strip()))

    assert len(corrupted_words) == len(ground_truth_words) == len(mask_flags)

    # Tokenize corrupted sentence
    mask_token = tokenizer.mask_token
    for i in range(len(mask_flags)):
        if mask_flags[i] == 1:
            corrupted_words[i] = mask_token
    remove_index = []
    for i in range(1, len(corrupted_words)):
        if corrupted_words[i] == mask_token and corrupted_words[i-1] == mask_token:
            remove_index.append(i)
    for i, idx in enumerate(remove_index):
        corrupted_words.pop(idx-i)

    encoding = tokenizer(
        corrupted_words,
        is_split_into_words=True,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_len
    )
    ground_truth = tokenizer(
        ground_truth_words,
        is_split_into_words=True,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_len
    )["input_ids"].squeeze(0) # squeeze batch dimension

    input_ids = encoding["input_ids"].squeeze(0).clone()
    attention_mask = encoding["attention_mask"].squeeze(0)

    return {
        "input_ids": input_ids,
        "input": " ".join(corrupted_words),
        "attention_mask": attention_mask,
        "ground_truth_ids": ground_truth
    }


In [5]:
from torch.utils.data import Dataset

class GrammarCorrectionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=64):
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        processed = preprocess_data(
            row,
            tokenizer=self.tokenizer,
            max_len=self.max_len,
        )
        return {
            "input_ids": processed["input_ids"],
            "attention_mask": processed["attention_mask"],
            "ground_truth_ids": processed["ground_truth_ids"],
            "corrupted": row["corrupted"],
            "input": processed["input"],
            "ground_truth": row["ground_truth"],
            "mask": row["mask"]
        }


In [6]:
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [8]:
tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-syllable")
model = AutoModelForSeq2SeqLM.from_pretrained("vinai/bartpho-syllable")
# special_token_ids = [
#     tokenizer.mask_token_id,
#     tokenizer.sep_token_id,
#     tokenizer.pad_token_id,
#     tokenizer.bos_token_id,
#     tokenizer.eos_token_id
# ]

data_sub = shuffle(data, random_state=28)
train_df, val_df = train_test_split(data_sub, test_size=0.1, random_state=28)
train_dataset = GrammarCorrectionDataset(train_df, tokenizer)
val_dataset = GrammarCorrectionDataset(val_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=50, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=50, shuffle=False)

config.json:   0%|          | 0.00/897 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

dict.txt:   0%|          | 0.00/360k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

2025-05-19 03:11:32.876380: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747624293.091924      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747624293.151384      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

In [9]:
# Lấy mẫu đầu tiên
sample = val_dataset[100]

# In tensor thô
print("input_ids:\n", sample["input_ids"])
print("\nattention_mask:\n", sample["attention_mask"])
print("\nlabels:\n", sample["ground_truth_ids"])

print("\nDecoded input_ids:\n", tokenizer.decode(sample["input_ids"], skip_special_tokens=False))
print("Decoded output_ids:\n", tokenizer.decode(sample["ground_truth_ids"], skip_special_tokens=False))



input_ids:
 tensor([    0,   184,   156,   173,    12, 40029,  1258,  1452,  1079,   434,
         1216,    43,  1052,   123,    45,    77,    37,   916,    70,     6,
            8,   572, 40029,   371,   202,   593,   612,   141,    30,  1724,
            2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1])

attention_mask:
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

labels:
 tensor([   0,  184,  156,  173,   12,  616, 1258, 1452, 1079,  434, 1216,   43,
        1052,  123,   45,   77,   37,  916,   70,    6,    8,  572,  172,  371,
         202,  593,  612,  141,   30, 1724,    2,    1,    1,    1,    1,    1

In [10]:
import os
from torch.amp import autocast, GradScaler
from tqdm import tqdm

device_str = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_str)
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.001)
scaler = GradScaler(device_str)

save_path = "bartpho_syllable_best_model"
os.makedirs(save_path, exist_ok=True)
best_val_loss = float("inf")

epochs = 5

In [11]:
# evaluate with pretrained model
model.eval()
val_loss = 0.0
with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["ground_truth_ids"].to(device)

        with autocast(device_str):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

avg_val_loss = val_loss / len(val_loader)
print(f"Baseline Val Loss: {avg_val_loss:.4f}")


  0%|          | 0/200 [00:00<?, ?it/s][A
  0%|          | 1/200 [00:00<02:57,  1.12it/s][A
  1%|          | 2/200 [00:01<01:39,  1.99it/s][A
  2%|▏         | 3/200 [00:01<01:14,  2.66it/s][A
  2%|▏         | 4/200 [00:01<01:03,  3.11it/s][A
  2%|▎         | 5/200 [00:01<00:57,  3.41it/s][A
  3%|▎         | 6/200 [00:02<00:53,  3.63it/s][A
  4%|▎         | 7/200 [00:02<00:51,  3.77it/s][A
  4%|▍         | 8/200 [00:02<00:49,  3.88it/s][A
  4%|▍         | 9/200 [00:02<00:48,  3.91it/s][A
  5%|▌         | 10/200 [00:03<00:47,  3.96it/s][A
  6%|▌         | 11/200 [00:03<00:47,  4.01it/s][A
  6%|▌         | 12/200 [00:03<00:46,  4.02it/s][A
  6%|▋         | 13/200 [00:03<00:47,  3.97it/s][A
  7%|▋         | 14/200 [00:04<00:46,  4.02it/s][A
  8%|▊         | 15/200 [00:04<00:45,  4.04it/s][A
  8%|▊         | 16/200 [00:04<00:46,  3.99it/s][A
  8%|▊         | 17/200 [00:04<00:45,  4.06it/s][A
  9%|▉         | 18/200 [00:05<00:45,  4.02it/s][A
 10%|▉         | 19/200 [00:0

Baseline Val Loss: 8.8519





In [12]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]")

    for batch in progress:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["ground_truth_ids"].to(device)

        optimizer.zero_grad()
        with autocast(device_str):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        progress.set_postfix(loss=loss.item())

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Train Loss: {avg_train_loss:.4f}")

    # Evaluation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["ground_truth_ids"].to(device)

            with autocast(device_str):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1} Val Loss: {avg_val_loss:.4f}")

    # Save best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        print(f"Model improved and saved to {save_path}")


Epoch 1 [Train]: 100%|██████████| 1800/1800 [23:41<00:00,  1.27it/s, loss=0.103]


Epoch 1 Train Loss: 0.4160


Epoch 1 [Val]: 100%|██████████| 200/200 [00:52<00:00,  3.78it/s]


Epoch 1 Val Loss: 0.0637
Model improved and saved to bartpho_syllable_best_model


Epoch 2 [Train]: 100%|██████████| 1800/1800 [23:39<00:00,  1.27it/s, loss=0.0688]


Epoch 2 Train Loss: 0.0946


Epoch 2 [Val]: 100%|██████████| 200/200 [00:53<00:00,  3.77it/s]


Epoch 2 Val Loss: 0.0495
Model improved and saved to bartpho_syllable_best_model


Epoch 3 [Train]: 100%|██████████| 1800/1800 [23:34<00:00,  1.27it/s, loss=0.0903]


Epoch 3 Train Loss: 0.0794


Epoch 3 [Val]: 100%|██████████| 200/200 [00:53<00:00,  3.77it/s]


Epoch 3 Val Loss: 0.0486
Model improved and saved to bartpho_syllable_best_model


Epoch 4 [Train]: 100%|██████████| 1800/1800 [23:35<00:00,  1.27it/s, loss=0.0975]


Epoch 4 Train Loss: 0.0750


Epoch 4 [Val]: 100%|██████████| 200/200 [00:53<00:00,  3.77it/s]


Epoch 4 Val Loss: 0.0470
Model improved and saved to bartpho_syllable_best_model


Epoch 5 [Train]: 100%|██████████| 1800/1800 [23:35<00:00,  1.27it/s, loss=0.0455]


Epoch 5 Train Loss: 0.0609


Epoch 5 [Val]: 100%|██████████| 200/200 [00:53<00:00,  3.76it/s]

Epoch 5 Val Loss: 0.0482



