In [None]:
!pip install python-Levenshtein evaluate

In [2]:
import pandas as pd 
import numpy as np
import torch

# The "mask" column holds one 0/1 flag per word. Left to type inference it is
# parsed as a number, which strips leading zeros (e.g. "00010000" -> 10000) and
# breaks the one-flag-per-word alignment, so force it to be read as text.
data = pd.read_csv(
    "/kaggle/input/luat123/error_dataset.csv",
    dtype={"mask": str},
)
data.head()

Unnamed: 0,ground_truth,corrupted,mask
0,Đại diện CLB Hà Nội cho biết thủ môn Bùi Tiến ...,Đại diện CLB Hà Nội cho BIẾT thủ môn Bùi Tiến ...,100000000000100000
1,Đầu mùa giải vừa qua Bùi Tiến Dũng chính thức ...,Đầu mùa giải vừa qua Bùi Tiến Dũng cníhh thức ...,10000000000000100
2,Thế nhưng tính đến nay Bùi Tiến Dũng vẫn chưa ...,Thế nhưng tính đến nay Bùi Tiến Dũng vẫn chưa ...,0
3,Dù trước đó anh được HLV Park Hang Seo sử dụng...,Dù trước đó anh được HLV Park Hang So sử dụng ...,1000000000000000000
4,Bùi Tiến Dũng gặp chấn thương ở cổ tay nhưng c...,Bùi Tiến Dũng gặp chấn thương ở cổ tay nhưng c...,10000


In [3]:
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [4]:
def preprocess_data(row, tokenizer, max_len=64):
    """Build model inputs for one dataset row by masking the flagged words.

    Every word of ``row["corrupted"]`` whose flag in ``row["mask"]`` is 1 is
    replaced by ``tokenizer.mask_token``; a run of adjacent mask tokens is
    collapsed into a single one. The masked sentence and the ground truth are
    then tokenized (padded/truncated to ``max_len``).

    Args:
        row: Mapping (e.g. a DataFrame row) with whitespace-separated
            ``"corrupted"`` and ``"ground_truth"`` sentences and a ``"mask"``
            holding one 0/1 digit per word. The mask may be a string, or a
            number whose leading zeros were lost when the CSV was parsed; in
            the latter case it is left-padded with zeros to the word count.
        tokenizer: Callable tokenizer exposing ``mask_token`` and returning
            ``"input_ids"`` / ``"attention_mask"`` tensors with a leading
            batch dimension.
        max_len: Length passed to the tokenizer as ``max_length``.

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"`` for the masked
        sentence, ``"input"`` (the masked sentence as text) and
        ``"ground_truth_ids"`` for the reference sentence.

    Raises:
        AssertionError: if the sentences and the mask do not have the same
            number of words/flags.
    """
    corrupted_words = row["corrupted"].split()
    ground_truth_words = row["ground_truth"].split()

    raw_mask = row["mask"]
    if isinstance(raw_mask, str):
        mask_text = raw_mask.strip()
    else:
        # A numeric mask has lost its leading zeros (00100 -> 100); restore
        # them so that flag i lines up with word i again.
        mask_text = str(raw_mask).strip().zfill(len(corrupted_words))
    mask_flags = [int(flag) for flag in mask_text]

    assert len(corrupted_words) == len(ground_truth_words) == len(mask_flags), (
        "corrupted sentence, ground truth and mask must have the same length: "
        f"{len(corrupted_words)} / {len(ground_truth_words)} / {len(mask_flags)}"
    )

    # Replace flagged words by the mask token, keeping only the first token of
    # each run of consecutive mask tokens.
    mask_token = tokenizer.mask_token
    masked_words = []
    for word, flag in zip(corrupted_words, mask_flags):
        token = mask_token if flag == 1 else word
        if token == mask_token and masked_words and masked_words[-1] == mask_token:
            continue
        masked_words.append(token)

    encoding = tokenizer(
        masked_words,
        is_split_into_words=True,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_len
    )
    ground_truth = tokenizer(
        ground_truth_words,
        is_split_into_words=True,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_len
    )["input_ids"].squeeze(0)  # drop the batch dimension

    input_ids = encoding["input_ids"].squeeze(0).clone()
    attention_mask = encoding["attention_mask"].squeeze(0)

    return {
        "input_ids": input_ids,
        "input": " ".join(masked_words),
        "attention_mask": attention_mask,
        "ground_truth_ids": ground_truth
    }


In [5]:
class GrammarCorrectionDataset(Dataset):
    """Map-style dataset that preprocesses one DataFrame row per item.

    Each item combines the tensors produced by ``preprocess_data`` with the
    raw text columns of the row, so that evaluation code can access both.
    """

    def __init__(self, dataframe, tokenizer, max_len=64):
        """Store the tokenizer settings and a copy of ``dataframe`` re-indexed from 0."""
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.dataframe = dataframe.reset_index(drop=True)

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the preprocessed tensors and raw text fields for row ``idx``."""
        record = self.dataframe.iloc[idx]
        features = preprocess_data(
            record, tokenizer=self.tokenizer, max_len=self.max_len
        )

        # Tensors first, then the text fields (same key order as before).
        sample = {
            key: features[key]
            for key in ("input_ids", "attention_mask", "ground_truth_ids")
        }
        sample["corrupted"] = record["corrupted"]
        sample["input"] = features["input"]
        sample["ground_truth"] = record["ground_truth"]
        sample["mask"] = record["mask"]
        return sample


In [6]:
from Levenshtein import distance as levenshtein_distance
import evaluate

# Corrected compute_topk_accuracy function
def compute_topk_accuracy(model, tokenizer, val_loader, top_k=5, 
                          max_len=128, device='cuda', threshold=0.1):
    """Evaluate beam-search outputs with edit-distance accuracy and BLEU.

    For every batch, ``top_k`` sequences per input are generated with beam
    search and decoded. A prediction counts as correct when its Levenshtein
    distance to the reference, divided by the length of the longer of the two
    strings, is at most ``threshold``.

    Args:
        model: Seq2seq model exposing ``generate``; moved to ``device`` and
            put in eval mode.
        tokenizer: Tokenizer used to decode the generated ids.
        val_loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"ground_truth"`` (reference strings).
        top_k: Number of beams and of returned sequences per input.
        max_len: ``max_length`` passed to ``generate``.
        device: Device the model and inputs are moved to.
        threshold: Maximum normalized edit distance for a hit.

    Returns:
        Tuple ``(acc1, acc5, bleu1, bleu5)`` averaged over all sequences:
        accuracy of the top-ranked beam, accuracy of any returned beam, BLEU
        of the top-ranked beam, and the mean BLEU over the non-empty returned
        beams (a mean, not a best-of-k score). All zeros if ``val_loader``
        yields no sequences.
    """
    model.to(device)
    model.eval()
    bleu = evaluate.load("bleu")

    acc1_total = acc5_total = n_seq = 0
    bleu1_total = bleu5_total = 0.0

    def _is_hit(reference, candidate):
        # Only called with non-empty candidates, so the divisor is never 0.
        scale = max(len(reference), len(candidate))
        return levenshtein_distance(reference, candidate) / scale <= threshold

    for batch in tqdm(val_loader, desc="Evaluating"):
        ground_truths = batch["ground_truth"]
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        n_seq += len(ground_truths)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_len,
                num_beams=top_k,
                num_return_sequences=top_k,
                early_stopping=True
            )

        for idx, reference in enumerate(ground_truths):
            # The slicing relies on generate returning the top_k sequences of
            # each input contiguously, in input order.
            beams = outputs[top_k * idx:top_k * (idx + 1)]
            decoded = [
                tokenizer.decode(beam, skip_special_tokens=True).strip()
                for beam in beams
            ]
            # Empty decodings are excluded from BLEU and from the hit test.
            candidates = [text for text in decoded if text]
            if not candidates:
                # Nothing usable was generated: count as a miss with BLEU 0
                # instead of crashing on preds[0] / len(preds).
                continue

            hits = [_is_hit(reference, text) for text in candidates]
            scores = [
                bleu.compute(predictions=[text], references=[reference])['bleu']
                for text in candidates
            ]

            # Top-1 metrics must use the top-ranked beam itself. If that beam
            # decoded to an empty string it is a miss; a lower-ranked beam
            # must not silently take its place.
            if decoded[0]:
                acc1_total += int(hits[0])
                bleu1_total += scores[0]
            acc5_total += int(any(hits))
            bleu5_total += sum(scores) / len(scores)

    if n_seq == 0:
        # Empty loader: avoid dividing by zero.
        return 0.0, 0.0, 0.0, 0.0

    acc1 = acc1_total / n_seq
    acc5 = acc5_total / n_seq
    bleu1 = bleu1_total / n_seq
    bleu5 = bleu5_total / n_seq

    return acc1, acc5, bleu1, bleu5


2025-05-21 10:31:41.288313: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747823501.505308      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747823501.569409      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device_str = "cuda"
else:
    device_str = "cpu"
device = torch.device(device_str)

In [8]:
# Baseline: the pre-trained checkpoint, loaded as-is.
tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-syllable")
model = AutoModelForSeq2SeqLM.from_pretrained("vinai/bartpho-syllable")
model = model.to(device)

config.json:   0%|          | 0.00/897 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

dict.txt:   0%|          | 0.00/360k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

In [9]:
# Shuffle with a fixed seed, then hold out 10% of the rows for validation.
data_sub = shuffle(data, random_state=28)
train_df, val_df = train_test_split(
    data_sub,
    test_size=0.1,
    random_state=28,
)

# The validation items are tokenized with the tokenizer that is bound here.
val_dataset = GrammarCorrectionDataset(val_df, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=50,
    shuffle=False,
)

In [10]:
# Release cached GPU memory before the (long-running) baseline evaluation.
torch.cuda.empty_cache()
acc1, acc5, bleu1, bleu5 = compute_topk_accuracy(
    model=model,
    tokenizer=tokenizer,
    val_loader=val_loader,
    device=device,
)
print("Metrics for baseline pre-trained model")
print(f"Acc@1: {acc1:.4f} | Acc@5: {acc5:.4f} | BLEU@1: {bleu1:.4f} | BLEU@5: {bleu5:.4f}")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s][A
Evaluating:   0%|          | 1/200 [00:09<30:43,  9.26s/it][A
Evaluating:   1%|          | 2/200 [00:19<32:08,  9.74s/it][A
Evaluating:   2%|▏         | 3/200 [00:31<35:00, 10.66s/it][A
Evaluating:   2%|▏         | 4/200 [00:40<32:38,  9.99s/it][A
Evaluating:   2%|▎         | 5/200 [00:57<41:37, 12.81s/it][A
Evaluating:   3%|▎         | 6/200 [01:14<45:13, 13.98s/it][A
Evaluating:   4%|▎         | 7/200 [01:29<46:10, 14.36s/it][A
Evaluating:   4%|▍         | 8/200 [01:42<44:44, 13.98s/it][A
Evaluating:   4%|▍         | 9/200 [01:52<40:42, 12.79s/it][A
Evaluating:   5%|▌         | 10/200 [02:02<37:21, 11.80s/it][A
Evaluating:   6%|▌         | 11/200 [02:11<34:49, 11.05s/it][A
Evaluating:   6%|▌         | 12/200 [02:23<35:47, 11.42s/it][A
Evaluating:   6%|▋         | 13/200 [02:38<38:46, 12.44s/it][A
Evaluating:   7%|▋         | 14/200 [02:52<39:58, 12.90s/it][A
Evaluating:   8%|▊         | 15/200 [03:05<39:43, 12.88s/

Metrics for baseline pre-trained model
Acc@1: 0.7021 | Acc@5: 0.8257 | BLEU@1: 0.8144 | BLEU@5: 0.7778





In [None]:
# Fine-tuned checkpoint. The commented-out path is an alternative under /kaggle/input.
# pretrained_path = "/kaggle/input/bartpho_best_1605/pytorch/default/3/bartpho_syllable_best_model"
pretrained_path = "bachthetrollface/bartpho-for-spelling-correction"

# The model is not moved to `device` here; compute_topk_accuracy does that itself.
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_path)
# NOTE(review): val_dataset keeps the tokenizer it was constructed with, so rebinding
# `tokenizer` only affects decoding - confirm both checkpoints share a vocabulary.
tokenizer = AutoTokenizer.from_pretrained(pretrained_path)

In [12]:
# Release cached GPU memory, then score the fine-tuned model on the same loader.
torch.cuda.empty_cache()
acc1, acc5, bleu1, bleu5 = compute_topk_accuracy(
    model=model,
    tokenizer=tokenizer,
    val_loader=val_loader,
    device=device,
)
print("Metrics for fine-tuned model:")
print(f"Acc@1: {acc1:.4f} | Acc@5: {acc5:.4f} | BLEU@1: {bleu1:.4f} | BLEU@5: {bleu5:.4f}")

Evaluating: 100%|██████████| 200/200 [24:51<00:00,  7.46s/it]

Metrics for fine-tuned model:
Acc@1: 0.9591 | Acc@5: 0.9845 | BLEU@1: 0.9330 | BLEU@5: 0.8902



