In [1]:
import pandas as pd 
import numpy as np
import torch

# Load the CSV whose "corrupted", "ground_truth" and "mask" columns are read by the later cells.
data = pd.read_csv("/kaggle/input/luat123/error_dataset.csv")
# Bare expression so the notebook renders the DataFrame as this cell's output.
data

In [2]:
# The dataset returns input_ids (unchanged for normal words; replaced by the mask token id for erroneous words), attention_mask, and labels (-100 for normal words; the ground-truth token id for erroneous words).



In [3]:
import torch
import random

def preprocess_for_mlm(row, tokenizer, max_len=64, random_mask_ratio=0.05):
    """Build one masked-LM example from a dataset row.

    Words flagged as errors in ``row["mask"]`` (flag ``1``), plus a few randomly
    chosen correct words, have their first sub-token replaced by the tokenizer's
    mask token. The label at that position is the id of the first sub-token of
    the corresponding ground-truth word; every other position is labelled -100.

    Args:
        row: Mapping with string fields ``"corrupted"`` and ``"ground_truth"``
            (whitespace-separated words) and ``"mask"`` (one digit per word).
        tokenizer: Tokenizer that accepts pre-split words and whose encoding
            exposes ``word_ids``; it must also provide ``mask_token_id``,
            ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: Length the encoded sequence is padded / truncated to.
        random_mask_ratio: Fraction of words to mask at random in addition to
            the flagged errors. When positive, at least one word is masked if
            any unflagged word exists. A value ``<= 0`` disables random masking.

    Returns:
        dict with 1-D tensors ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.

    Raises:
        AssertionError: if the three row fields do not describe the same number
            of words.
    """
    corrupted_words = row["corrupted"].split()
    ground_truth_words = row["ground_truth"].split()
    mask_flags = list(map(int, row["mask"].strip()))

    assert len(corrupted_words) == len(ground_truth_words) == len(mask_flags), (
        f"word/flag count mismatch: corrupted={len(corrupted_words)}, "
        f"ground_truth={len(ground_truth_words)}, mask={len(mask_flags)}"
    )

    # Add random masks, only at positions that are not already flagged as errors.
    # A non-positive ratio now really means "no random masking"; previously
    # max(1, ...) forced one random mask even when the caller passed 0.
    if random_mask_ratio > 0:
        total_words = len(mask_flags)
        available_positions = [i for i in range(total_words) if mask_flags[i] == 0]
        num_random_mask = max(1, int(random_mask_ratio * total_words))
        random_positions = random.sample(available_positions, min(num_random_mask, len(available_positions)))

        for pos in random_positions:
            mask_flags[pos] = 2  # Separate marker so random masks can be told apart from real errors

    # Tokenize the corrupted sentence word by word so sub-tokens map back to words.
    encoding = tokenizer(corrupted_words,
                         is_split_into_words=True,
                         return_tensors="pt",
                         padding="max_length",
                         truncation=True,
                         max_length=max_len)

    word_ids = encoding.word_ids(batch_index=0)
    input_ids = encoding["input_ids"].squeeze(0).clone()
    attention_mask = encoding["attention_mask"].squeeze(0)
    labels = torch.full_like(input_ids, -100)
    mask_token_id = tokenizer.mask_token_id

    for idx, word_idx in enumerate(word_ids):
        # Positions with no source word (word id None) are never masked.
        if word_idx is None:
            continue
        if mask_flags[word_idx] not in (1, 2):
            continue

        # Only the first sub-token of a selected word is masked and supervised.
        is_first = (idx == 0 or word_ids[idx - 1] != word_idx)
        if not is_first:
            continue

        input_ids[idx] = mask_token_id
        tokenized = tokenizer.tokenize(ground_truth_words[word_idx])
        if tokenized:
            labels[idx] = tokenizer.convert_tokens_to_ids(tokenized[0])

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }


In [4]:
from torch.utils.data import Dataset

class MLMDataset(Dataset):
    """Map-style dataset that turns DataFrame rows into masked-LM examples.

    Each item is produced on access by ``preprocess_for_mlm`` and contains the
    ``input_ids``, ``attention_mask`` and ``labels`` entries it returns.
    """

    # Keys copied from the preprocessing result into each returned item.
    _FIELDS = ("input_ids", "attention_mask", "labels")

    def __init__(self, dataframe, tokenizer, max_len=64, random_mask_ratio=0.05):
        """Store the preprocessing settings and a copy of the frame re-indexed from 0."""
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.random_mask_ratio = random_mask_ratio
        self.dataframe = dataframe.reset_index(drop=True)

    def __len__(self):
        """Number of rows in the underlying frame."""
        return self.dataframe.shape[0]

    def __getitem__(self, idx):
        """Preprocess the row at position ``idx`` and return its tensors."""
        example = preprocess_for_mlm(
            self.dataframe.iloc[idx],
            tokenizer=self.tokenizer,
            max_len=self.max_len,
            random_mask_ratio=self.random_mask_ratio,
        )
        return {field: example[field] for field in self._FIELDS}


In [5]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
from torch.utils.data import DataLoader
import torch
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

In [6]:
from sklearn.model_selection import train_test_split



In [7]:
# Load the pretrained XLM-RoBERTa tokenizer and masked-LM model.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")
# Hold out 10,000 rows for validation; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(data, test_size=10000, random_state=42)
train_dataset = MLMDataset(train_df, tokenizer)
val_dataset = MLMDataset(val_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Validation batches are only averaged, so shuffling adds nothing; keep a fixed order.
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [8]:
# Take the first validation example.
sample = val_dataset[0]

# Print the raw tensors, one field after another.
for header, field in (
    ("🔢 input_ids:\n", "input_ids"),
    ("\n🟢 attention_mask:\n", "attention_mask"),
    ("\n✅ labels:\n", "labels"),
):
    print(header, sample[field])

# Print a decoded, human-readable view of the (masked) input.
decoded_input = tokenizer.decode(sample["input_ids"], skip_special_tokens=False)
print("\n📝 Decoded input_ids:\n", decoded_input)
print("🎯 Decoded labels (hiển thị token được học):")

# Only positions whose label is not the ignore value (-100) are shown.
for position, label_id in enumerate(sample["labels"].tolist()):
    if label_id == -100:
        continue
    print(f"Pos {position}: Token = {tokenizer.decode([label_id])}")


In [9]:
# Show the raw columns of the first validation row, one per line.
print(
    *(val_df.iloc[0][column] for column in ("corrupted", "ground_truth", "mask")),
    sep="\n",
)


In [10]:
import os
import torch
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

# Train on GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.001)
# Gradient scaler used together with autocast() for mixed-precision training.
scaler = GradScaler()

# Directory that receives the checkpoint with the lowest validation loss so far.
save_path = "mlm_best_model"
os.makedirs(save_path, exist_ok=True)
best_val_loss = float("inf")

epochs = 10
for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]")

    for batch in progress:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        # Forward pass under autocast; the model computes the loss itself from `labels`.
        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Order matters: backward on the scaled loss, optimizer step through the
        # scaler, then update the scale factor for the next iteration.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        progress.set_postfix(loss=loss.item())

    # Mean of the per-batch losses over the epoch.
    avg_train_loss = total_loss / len(train_loader)
    print(f"✅ Epoch {epoch+1} Train Loss: {avg_train_loss:.4f}")

    # ---- Validation pass (no gradients) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

    # Mean of the per-batch validation losses.
    avg_val_loss = val_loss / len(val_loader)
    print(f"🧪 Epoch {epoch+1} Val Loss: {avg_val_loss:.4f}")

    # Checkpoint model and tokenizer whenever the validation loss improves.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        print(f"✅ Model improved and saved to {save_path}")


In [11]:
import torch
from difflib import SequenceMatcher

# Exact Levenshtein distance (two-row dynamic programming).
def levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-element insertions, deletions
    and substitutions needed to turn one sequence into the other. The previous
    SequenceMatcher-ratio proxy under-reported it (e.g. "ab" vs "ba" gave 1 and
    "abc" vs "abcde" gave 1), which let non-matching tokens pass ``<= 1`` checks.

    Args:
        s1: First string (any indexable sequence of comparable items works).
        s2: Second string.

    Returns:
        Non-negative int; 0 when the inputs are equal.
    """
    if s1 == s2:
        return 0
    # Iterate over the longer input in the outer loop so the row buffers are
    # sized by the shorter one.
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    previous_row = list(range(len(s2) + 1))
    for i, item1 in enumerate(s1, start=1):
        current_row = [i]
        for j, item2 in enumerate(s2, start=1):
            current_row.append(min(
                previous_row[j] + 1,                      # deletion
                current_row[j - 1] + 1,                   # insertion
                previous_row[j - 1] + (item1 != item2),   # substitution / match
            ))
        previous_row = current_row
    return previous_row[-1]
def split_label(mask_string):
    """Convert a digit string such as "0101" into a list of integer flags.

    Surrounding whitespace is stripped first; each remaining character is
    parsed with ``int``.
    """
    return list(map(int, mask_string.strip()))
# Corrected compute_topk_accuracy function
def compute_topk_accuracy(model, tokenizer, val_df, top_k=10, max_len=128, device='cuda'):
    """Score a masked-LM on the flagged error positions of ``val_df``.

    Every word whose flag in ``row["mask"]`` is non-zero is replaced by the
    tokenizer's mask token; the model's candidates at the first sub-token of each
    word flagged ``1`` are then compared with the ground-truth word.

    Args:
        model: Masked-LM whose forward output exposes ``.logits``.
        tokenizer: Tokenizer whose encoding supports ``word_ids`` and that
            provides ``mask_token`` and ``convert_ids_to_tokens``.
        val_df: DataFrame with string columns ``"corrupted"``, ``"ground_truth"``
            and ``"mask"``.
        top_k: Number of candidates to retrieve per position. At least 20 are
            always retrieved because acc@20 is part of the result.
        max_len: Length the encoded sentence is padded / truncated to.
        device: Device the model and inputs are moved to.

    Returns:
        Tuple ``(acc1, acc5, acc10, acc20)``. acc@1 is an exact match with the top
        candidate; the others count a hit when any of the first 5/10/20 candidates
        is within ``levenshtein_distance <= 1`` of the ground truth. All are 0 when
        no flagged position was scored.
    """
    model.to(device)
    model.eval()

    # Honour top_k, but never retrieve fewer than the 20 candidates acc@20 needs.
    num_candidates = max(top_k, 20)

    acc1_total = acc5_total = acc10_total = acc20_total = total_tokens = 0

    for _, row in val_df.iterrows():
        corrupted_tokens = row["corrupted"].strip().split()
        error_flags = list(map(int, row["mask"].strip()))
        ground_truth_tokens = row["ground_truth"].strip().split()

        # Same consistency check as compute_topk_accuracy_encoder_decoder: rows whose
        # fields disagree on the word count cannot be aligned, so skip them.
        if len(corrupted_tokens) != len(error_flags) or len(ground_truth_tokens) != len(error_flags):
            print("⚠️ Length mismatch at row, skipping...")
            continue

        # Prepare masked input
        masked_tokens = [tokenizer.mask_token if flag else tok for tok, flag in zip(corrupted_tokens, error_flags)]
        masked_text = " ".join(masked_tokens)

        # Tokenize once; the same encoding provides both the tensors and the
        # token -> word mapping.
        inputs = tokenizer(masked_text, return_tensors="pt", max_length=max_len,
                           padding="max_length", truncation=True)
        # NOTE(review): assumes the tokenizer's word indices line up with the
        # whitespace-split words above (including mask tokens) — TODO confirm.
        word_ids = inputs.word_ids(0)
        inputs = inputs.to(device)

        with torch.no_grad():
            logits = model(**inputs).logits

        scored_words = set()

        for token_idx, word_idx in enumerate(word_ids):
            if word_idx is None or word_idx in scored_words or error_flags[word_idx] != 1:
                continue

            # Score only the first sub-token of each flagged word.
            scored_words.add(word_idx)
            token_logits = logits[0, token_idx]
            k = min(num_candidates, token_logits.size(-1))
            topk_ids = torch.topk(token_logits, k=k).indices.tolist()
            topk_tokens = tokenizer.convert_ids_to_tokens(topk_ids)
            # Drop the leading "▁" marker and lowercase before comparing.
            topk_tokens = [t.lstrip("▁").lower() for t in topk_tokens]
            gt_token = ground_truth_tokens[word_idx].lower()

            acc1_total += int(gt_token == topk_tokens[0])
            acc5_total += int(any(levenshtein_distance(gt_token, tok) <= 1 for tok in topk_tokens[:5]))
            acc10_total += int(any(levenshtein_distance(gt_token, tok) <= 1 for tok in topk_tokens[:10]))
            acc20_total += int(any(levenshtein_distance(gt_token, tok) <= 1 for tok in topk_tokens[:20]))
            total_tokens += 1

    acc1 = acc1_total / total_tokens if total_tokens > 0 else 0
    acc5 = acc5_total / total_tokens if total_tokens > 0 else 0
    acc10 = acc10_total / total_tokens if total_tokens > 0 else 0
    acc20 = acc20_total / total_tokens if total_tokens > 0 else 0

    return acc1, acc5, acc10, acc20


In [12]:
# Evaluate the model currently held in `model`. Pass the runtime `device` explicitly:
# the function's default is the literal 'cuda', which fails on a CPU-only runtime.
acc1, acc5, acc10, acc20 = compute_topk_accuracy(model, tokenizer, val_df, top_k=20, device=device)
print(f"Acc@1: {acc1:.4f} | Acc@5: {acc5:.4f} | Acc@10: {acc10:.4f} | Acc@20: {acc20:.4f}")

In [13]:
# Reload the original pretrained "xlm-roberta-base" checkpoint and score it with the
# same metric; the returned accuracy tuple is shown as the cell output.
# NOTE(review): this rebinds `tokenizer` and `model`, so any cell run after this one
# sees the pretrained weights rather than the model trained above.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base").to(device)
compute_topk_accuracy(model, tokenizer,val_df, device=device)

In [14]:
def compute_topk_accuracy_encoder_decoder(model, tokenizer, val_df, top_k=20, max_len=128, device='cuda'):
    """Score a generative (encoder-decoder) model on the flagged error positions.

    Every word with a non-zero flag in ``row["mask"]`` is replaced by the
    tokenizer's mask token, ``top_k`` beams are generated, and each beam is split on
    whitespace. For every word flagged ``1`` the word at the same position in each
    beam is compared with the ground-truth word.

    Beams keep their rank: a beam that is too short to have a word at the scored
    position counts as a miss for that rank instead of being dropped, so acc@1 is
    always judged on the top beam and acc@5/10/20 on the first 5/10/20 beams.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer providing ``mask_token`` and ``decode``.
        val_df: DataFrame with string columns ``"corrupted"``, ``"ground_truth"``
            and ``"mask"``.
        top_k: Number of beams and of returned sequences.
        max_len: Maximum input and generated length.
        device: Device the model and inputs are moved to.

    Returns:
        Tuple ``(acc1, acc5, acc10, acc20)``. acc@1 is an exact match; the others
        count a hit when any considered beam is within
        ``levenshtein_distance <= 1``. All are 0 when nothing was scored.
    """
    model.to(device)
    model.eval()

    acc1_total = acc5_total = acc10_total = acc20_total = total_tokens = 0

    for _, row in tqdm(val_df.iterrows(), total=len(val_df), desc="Evaluating"):
        corrupted_tokens = row["corrupted"].strip().split()
        error_flags = list(map(int, row["mask"].strip()))
        ground_truth_tokens = row["ground_truth"].strip().split()

        # Rows whose fields disagree on the word count cannot be aligned.
        if len(corrupted_tokens) != len(error_flags) or len(ground_truth_tokens) != len(error_flags):
            print("⚠️ Length mismatch at row, skipping...")
            continue

        # str.split() only yields non-empty strings, so no None handling is needed.
        masked_tokens = [tokenizer.mask_token if flag else tok for tok, flag in zip(corrupted_tokens, error_flags)]
        masked_text = " ".join(masked_tokens)

        inputs = tokenizer(masked_text, return_tensors="pt", max_length=max_len,
                           padding="max_length", truncation=True).to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_len,
                num_beams=top_k,
                num_return_sequences=top_k,
                early_stopping=True
            )

        # One whitespace-split word list per returned beam, in the order returned.
        predictions = [
            tokenizer.decode(output, skip_special_tokens=True).strip().split()
            for output in outputs
        ]

        for idx, flag in enumerate(error_flags):
            if flag != 1:
                continue

            # Every flagged word counts towards the denominator, hit or miss.
            total_tokens += 1
            gt_token = ground_truth_tokens[idx].lower()

            # Keep one slot per beam (None when the beam has no word at idx) so the
            # slices below really are "the first k beams".
            ranked_preds = [pred[idx].lower() if len(pred) > idx else None for pred in predictions]
            close_enough = [
                tok is not None and levenshtein_distance(gt_token, tok) <= 1
                for tok in ranked_preds[:20]
            ]

            acc1_total += int(bool(ranked_preds) and ranked_preds[0] == gt_token)
            acc5_total += int(any(close_enough[:5]))
            acc10_total += int(any(close_enough[:10]))
            acc20_total += int(any(close_enough[:20]))

    acc1 = acc1_total / total_tokens if total_tokens else 0
    acc5 = acc5_total / total_tokens if total_tokens else 0
    acc10 = acc10_total / total_tokens if total_tokens else 0
    acc20 = acc20_total / total_tokens if total_tokens else 0

    return acc1, acc5, acc10, acc20


In [15]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# model_name = "vinai/bartpho-syllable"
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
# compute_topk_accuracy_encoder_decoder(model, tokenizer,val_df, device=device)