In [1]:
import json
from datasets import Dataset
import pandas as pd
import numpy as np

In [2]:
train_path="/kaggle/input/crawl-data/train_crosslingual.jsonl"
test_path="/kaggle/input/crawl-data/test_crosslingual.jsonl"
eval_path="/kaggle/input/crawl-data/eval_crosslingual.jsonl"


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import random

class CrossLingualDataset(Dataset):
    def __init__(self, data_path, tokenizer, context_len, query_len):
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = [json.loads(line) for line in f]
        self.tokenizer = tokenizer
        self.context_len = context_len
        self.query_len = query_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        vi_query = row['vi_query']
        en_query = row['en_query']
        vi_pos = random.choice(row['vi_pos'])
        en_pos = random.choice(row['en_pos'])
        vi_neg = random.sample(row['vi_neg'], k=min(len(row['vi_neg']), 4))
        en_neg = random.sample(row['en_neg'], k=min(len(row['en_neg']), 4))
        encoded_vi_query = self.tokenizer(vi_query, padding='max_length', truncation=True,
                                       max_length=self.query_len, return_tensors="pt")
        encoded_vi_pos = self.tokenizer(vi_pos, padding='max_length', truncation=True,
                                          max_length=self.context_len, return_tensors="pt")
        encoded_vi_neg = self.tokenizer(vi_neg, padding='max_length', truncation=True,
                                           max_length=self.context_len, return_tensors="pt")
        
        encoded_en_query = self.tokenizer(en_query, padding='max_length', truncation=True,
                                       max_length=self.query_len, return_tensors="pt")
        encoded_en_pos = self.tokenizer(en_pos, padding='max_length', truncation=True,
                                          max_length=self.context_len, return_tensors="pt")
        encoded_en_neg = self.tokenizer(en_neg, padding='max_length', truncation=True,
                                           max_length=self.context_len, return_tensors="pt")

        return {
            "vi_query_input_ids": encoded_vi_query["input_ids"].squeeze(0),
            "vi_query_attention_mask": encoded_vi_query["attention_mask"].squeeze(0),
            "vi_positive_input_ids": encoded_vi_pos["input_ids"].squeeze(0),
            "vi_positive_attention_mask": encoded_vi_pos["attention_mask"].squeeze(0),
            "vi_negative_input_ids": encoded_vi_neg["input_ids"], # [num_neg, len]
            "vi_negative_attention_mask": encoded_vi_neg["attention_mask"],
            
            "en_query_input_ids": encoded_en_query["input_ids"].squeeze(0),
            "en_query_attention_mask": encoded_en_query["attention_mask"].squeeze(0),
            "en_positive_input_ids": encoded_en_pos["input_ids"].squeeze(0),
            "en_positive_attention_mask": encoded_en_pos["attention_mask"].squeeze(0),
            "en_negative_input_ids": encoded_en_neg["input_ids"], # [num_neg, len]
            "en_negative_attention_mask": encoded_en_neg["attention_mask"]
        }

In [4]:
from transformers import AutoModel, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType

class BiEncoder(nn.Module):
    def __init__(self, model_name, target_modules):
        super().__init__()
        self.model_name = model_name
        base_model = AutoModel.from_pretrained(model_name)

        peft_config = LoraConfig(
            task_type=TaskType.FEATURE_EXTRACTION,
            r=16,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=target_modules,
            bias='none'
        )
        self.encoder = get_peft_model(base_model, peft_config)

    def forward(self, input_ids, attention_mask):
        output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        if 'bge' in self.model_name:
            cls_output = output.pooler_output
        else:
            cls_output = output.last_hidden_state[:,0]
        return cls_output

2025-05-07 02:49:15.925211: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746586156.100823      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746586156.155076      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from tqdm import tqdm
import os
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F


def train_biencoder(
    model_class,
    dataset_class,
    model_name,
    train_file,
    eval_file=None,  # ✅ Thêm eval_file
    target_modules=None,
    loss_fn=None,
    scheduler=None,
    query_len=128,
    context_len=256,
    batch_size=8,
    epochs=10,
    lr=2e-5,
    device=None,
    save_dir=None,
    use_self_distill=False
):
    device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    best_loss = float("inf")

    tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3", use_fast=False)

    train_dataset = dataset_class(train_file, tokenizer, context_len, query_len)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    if eval_file:
        eval_dataset = dataset_class(eval_file, tokenizer, context_len, query_len)
        eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
    else:
        eval_loader = None

    model = model_class(model_name, target_modules=target_modules).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scaler = GradScaler()

    use_scheduler = scheduler is not None
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            with autocast():
                # Random chọn 1 trong 4 mode
                mode = random.choice([ 'vi_en', 'en_vi'])
                #mode = random.choice(['vi_en','vi_vi','en_en','en_vi'])

                if mode == 'vi_vi':
                    query_embed = model(
                        batch['vi_query_input_ids'].to(device),
                        batch['vi_query_attention_mask'].to(device)
                    )
                    positive_embed = model(
                        batch['vi_positive_input_ids'].to(device),
                        batch['vi_positive_attention_mask'].to(device)
                    )
                    neg_input_ids = batch['vi_negative_input_ids'].to(device)
                    neg_attention_mask = batch['vi_negative_attention_mask'].to(device)

                elif mode == 'vi_en':
                    query_embed = model(
                        batch['vi_query_input_ids'].to(device),
                        batch['vi_query_attention_mask'].to(device)
                    )
                    positive_embed = model(
                        batch['en_positive_input_ids'].to(device),
                        batch['en_positive_attention_mask'].to(device)
                    )
                    neg_input_ids = batch['en_negative_input_ids'].to(device)
                    neg_attention_mask = batch['en_negative_attention_mask'].to(device)

                elif mode == 'en_vi':
                    query_embed = model(
                        batch['en_query_input_ids'].to(device),
                        batch['en_query_attention_mask'].to(device)
                    )
                    positive_embed = model(
                        batch['vi_positive_input_ids'].to(device),
                        batch['vi_positive_attention_mask'].to(device)
                    )
                    neg_input_ids = batch['vi_negative_input_ids'].to(device)
                    neg_attention_mask = batch['vi_negative_attention_mask'].to(device)

                else:  # 'en_en'
                    query_embed = model(
                        batch['en_query_input_ids'].to(device),
                        batch['en_query_attention_mask'].to(device)
                    )
                    positive_embed = model(
                        batch['en_positive_input_ids'].to(device),
                        batch['en_positive_attention_mask'].to(device)
                    )
                    neg_input_ids = batch['en_negative_input_ids'].to(device)
                    neg_attention_mask = batch['en_negative_attention_mask'].to(device)

                # Common negative handling
                B, N, L = neg_input_ids.shape
                neg_input_ids = neg_input_ids.view(B * N, L)
                neg_attention_mask = neg_attention_mask.view(B * N, L)
                negative_embed = model(neg_input_ids, neg_attention_mask)
                #negative_embed = negative_embed.view(B, N, -1)

                teacher_query_embed = query_embed.detach() if use_self_distill else None
                teacher_positive_embed = positive_embed.detach() if use_self_distill else None
                teacher_negative_embed = negative_embed.detach() if use_self_distill else None
                
                loss = loss_fn(query_embed, positive_embed, negative_embed)
                
                # # Self-Distillation Loss
                # if use_self_distill:
                #     distill_loss = (
                #         F.mse_loss(query_embed, teacher_query_embed) +
                #         F.mse_loss(positive_embed, teacher_positive_embed) +
                #         F.mse_loss(negative_embed, teacher_negative_embed)
                #     ) / 3
                #     loss += 0.1 * distill_loss 
                if use_self_distill and epoch >= self_distill_start_epoch:
                    
                    q = F.normalize(query_embed, dim=-1)
                    p = F.normalize(positive_embed, dim=-1)
                    n = F.normalize(negative_embed, dim=-1)
                
                    B = q.size(0)
                    N = n.size(0) // B  # số negative mỗi query
                
                    # positive scores: (B, 1)
                    pos_scores = torch.sum(q * p, dim=-1, keepdim=True)
                
                    # negative scores: (B, B*N)
                    neg_scores = torch.matmul(q, n.T)
                
                    # (B, 1 + B*N)
                    ensemble_scores = torch.cat([pos_scores, neg_scores], dim=-1)
                
                    # teacher target từ chính mô hình hiện tại (detach để không ngược gradient)
                    teacher_targets = F.softmax(ensemble_scores.detach(), dim=-1)
                
                    # student logits
                    student_log_probs = F.log_softmax(ensemble_scores, dim=-1)
                
                    # KL-divergence dạng -∑T·logS
                    distill_loss = -torch.sum(teacher_targets * student_log_probs, dim=-1).mean()
                
                    # Add vào loss chính
                    loss += distill_loss / 3  # giống BGE-M3
                    loss = loss / 2           # làm dịu loss




            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"🟢 Epoch {epoch + 1}: Train Loss = {total_loss:.4f}, Avg = {avg_loss:.6f}")

        # ✅ Evaluate nếu có eval_loader
        if eval_loader:
            model.eval()
            vi2en_loss = 0
            en2vi_loss = 0
            
            with torch.no_grad():
                for batch in tqdm(eval_loader, desc="🔍 Evaluating"):
                    with autocast():
                        # VI → EN
                        vi_query_embed = model(
                            batch['vi_query_input_ids'].to(device),
                            batch['vi_query_attention_mask'].to(device)
                        )
                        en_pos_embed = model(
                            batch['en_positive_input_ids'].to(device),
                            batch['en_positive_attention_mask'].to(device)
                        )
                        en_neg_ids = batch['en_negative_input_ids'].to(device)
                        en_neg_mask = batch['en_negative_attention_mask'].to(device)
                        B, N, L = en_neg_ids.shape
                        en_neg_ids = en_neg_ids.view(B * N, L)
                        en_neg_mask = en_neg_mask.view(B * N, L)
                        en_neg_embed = model(en_neg_ids, en_neg_mask)
                        #en_neg_embed = en_neg_embed.view(B, N, -1)
            
                        loss_vi2en = loss_fn(vi_query_embed, en_pos_embed, en_neg_embed)
            
                        # EN → VI
                        en_query_embed = model(
                            batch['en_query_input_ids'].to(device),
                            batch['en_query_attention_mask'].to(device)
                        )
                        vi_pos_embed = model(
                            batch['vi_positive_input_ids'].to(device),
                            batch['vi_positive_attention_mask'].to(device)
                        )
                        vi_neg_ids = batch['vi_negative_input_ids'].to(device)
                        vi_neg_mask = batch['vi_negative_attention_mask'].to(device)
                        B, N, L = vi_neg_ids.shape
                        vi_neg_ids = vi_neg_ids.view(B * N, L)
                        vi_neg_mask = vi_neg_mask.view(B * N, L)
                        vi_neg_embed = model(vi_neg_ids, vi_neg_mask)
                        #vi_neg_embed = vi_neg_embed.view(B, N, -1)
            
                        loss_en2vi = loss_fn(en_query_embed, vi_pos_embed, vi_neg_embed)
            
                    vi2en_loss += loss_vi2en.item()
                    en2vi_loss += loss_en2vi.item()
            
            # Trung bình
            avg_vi2en_loss = vi2en_loss / len(eval_loader)
            avg_en2vi_loss = en2vi_loss / len(eval_loader)
            avg_eval_loss = (avg_vi2en_loss + avg_en2vi_loss) / 2
            
            print(f"🔵 Eval VI→EN Loss = {avg_vi2en_loss:.4f}, EN→VI Loss = {avg_en2vi_loss:.4f}, Avg = {avg_eval_loss:.6f}")

            model.train()  # quay lại chế độ training

            # ✅ Lưu nếu là best loss
            if save_dir and avg_eval_loss < best_loss:
                best_loss = avg_eval_loss
                model.encoder.save_pretrained(save_dir)
                print(f"✅ Saved best checkpoint to {save_dir} (eval_loss = {best_loss:.4f})")

        if use_scheduler:
            scheduler.step(avg_loss)

    return model

In [6]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class InfoNCE2(nn.Module):
#     def __init__(self, temperature=0.1, reduction='mean'):
#         super().__init__()
#         self.temperature = temperature
#         self.reduction = reduction

#     def forward(self, query, key):
#         """
#         query: (B, D)  - embedding của câu hỏi
#         key:   (B, D)  - embedding của positive documents
#         sử dụng các key khác trong batch làm negative
#         """
#         # Normalize
#         query = F.normalize(query, dim=-1)
#         key = F.normalize(key, dim=-1)

#         # Similarity: (B, B)
#         sim_matrix = torch.matmul(query, key.T) / self.temperature

#         # Ground-truth: mỗi dòng i -> đúng là key[i]
#         labels = torch.arange(query.size(0), device=query.device)

#         loss = F.cross_entropy(sim_matrix, labels, reduction=self.reduction)
#         return loss
import torch
import torch.nn.functional as F
from torch import nn

__all__ = ['InfoNCE', 'info_nce']


class InfoNCE(nn.Module):
    def __init__(self, temperature=0.1, reduction='mean', negative_mode='unpaired'):
        super().__init__()
        self.temperature = temperature
        self.reduction = reduction
        self.negative_mode = negative_mode

    def forward(self, query, positive_key, negative_keys=None):
        return info_nce(query, positive_key, negative_keys,
                        temperature=self.temperature,
                        reduction=self.reduction,
                        negative_mode=self.negative_mode)


def info_nce(query, positive_key, negative_keys=None, temperature=0.1, reduction='mean', negative_mode='unpaired'):
    # Check input dimensionality.
    if query.dim() != 2:
        raise ValueError('<query> must have 2 dimensions.')
    if positive_key.dim() != 2:
        raise ValueError('<positive_key> must have 2 dimensions.')
    if negative_keys is not None:
        if negative_mode == 'unpaired' and negative_keys.dim() != 2:
            raise ValueError("<negative_keys> must have 2 dimensions if <negative_mode> == 'unpaired'.")
        if negative_mode == 'paired' and negative_keys.dim() != 3:
            raise ValueError("<negative_keys> must have 3 dimensions if <negative_mode> == 'paired'.")

    # Check matching number of samples.
    if len(query) != len(positive_key):
        raise ValueError('<query> and <positive_key> must must have the same number of samples.')
    if negative_keys is not None:
        if negative_mode == 'paired' and len(query) != len(negative_keys):
            raise ValueError("If negative_mode == 'paired', then <negative_keys> must have the same number of samples as <query>.")

    # Embedding vectors should have same number of components.
    if query.shape[-1] != positive_key.shape[-1]:
        raise ValueError('Vectors of <query> and <positive_key> should have the same number of components.')
    if negative_keys is not None:
        if query.shape[-1] != negative_keys.shape[-1]:
            raise ValueError('Vectors of <query> and <negative_keys> should have the same number of components.')

    # Normalize to unit vectors
    query, positive_key, negative_keys = normalize(query, positive_key, negative_keys)
    if negative_keys is not None:
        # Explicit negative keys

        # Cosine between positive pairs
        positive_logit = torch.sum(query * positive_key, dim=1, keepdim=True)

        if negative_mode == 'unpaired':
            # Cosine between all query-negative combinations
            negative_logits = query @ transpose(negative_keys)

        elif negative_mode == 'paired':
            query = query.unsqueeze(1)
            negative_logits = query @ transpose(negative_keys)
            negative_logits = negative_logits.squeeze(1)

        # First index in last dimension are the positive samples
        logits = torch.cat([positive_logit, negative_logits], dim=1)
        labels = torch.zeros(len(logits), dtype=torch.long, device=query.device)
    else:
        # Negative keys are implicitly off-diagonal positive keys.

        # Cosine between all combinations
        logits = query @ transpose(positive_key)

        # Positive keys are the entries on the diagonal
        labels = torch.arange(len(query), device=query.device)

    return F.cross_entropy(logits / temperature, labels, reduction=reduction)


def transpose(x):
    return x.transpose(-2, -1)


def normalize(*xs):
    return [None if x is None else F.normalize(x, dim=-1) for x in xs]

In [7]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AutoModel, AutoTokenizer

model_name = "BAAI/bge-m3"

target = set()
target_layer_index = [19,20,21,22,23]
for layer_index in target_layer_index:
    base_model = AutoModel.from_pretrained(model_name)
    layer = base_model.encoder.layer[layer_index]
    for name, param in layer.named_parameters():
        if 'LayerNorm' not in name and name:
            module_name = f"{target_layer_index}." + '.'.join(name.split('.')[:-1])
            target.add(module_name)
if 'bge' in model_name:
    if hasattr(base_model, "pooler"):
        for name, param in base_model.pooler.named_parameters():
            if 'LayerNorm' not in name and name:
                module_name = "pooler." + '.'.join(name.split('.')[:-1])
                target.add(module_name)

# ✅ Khởi tạo tokenizer + model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BiEncoder(model_name=model_name, target_modules=list(target))
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# ✅ Khởi tạo InfoNCE loss (in-batch)
loss_fn = InfoNCE(temperature=0.02, reduction='mean')

# ✅ Optimizer + Scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=3,
    verbose=True,
    min_lr=1e-6
)

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

In [8]:
train_biencoder(
    model_class=BiEncoder,
    dataset_class=CrossLingualDataset,        
    model_name=model_name,
    train_file=train_path,
    eval_file=eval_path,
    target_modules=list(target),
    loss_fn=loss_fn,
    scheduler=scheduler,
    query_len=128,
    context_len=256,
    batch_size=4,
    epochs=7,
    lr=1e-5,                              
    save_dir="checkpoint/loss1"
)

  scaler = GradScaler()
  with autocast():
Epoch 1/7: 100%|██████████| 1000/1000 [10:12<00:00,  1.63it/s]


🟢 Epoch 1: Train Loss = 2430.1479, Avg = 2.430148


  with autocast():
🔍 Evaluating: 100%|██████████| 125/125 [02:25<00:00,  1.16s/it]


🔵 Eval VI→EN Loss = 2.3448, EN→VI Loss = 2.2482, Avg = 2.296497
✅ Saved best checkpoint to checkpoint/loss1 (eval_loss = 2.2965)


Epoch 2/7: 100%|██████████| 1000/1000 [10:11<00:00,  1.63it/s]


🟢 Epoch 2: Train Loss = 2068.6875, Avg = 2.068687


🔍 Evaluating: 100%|██████████| 125/125 [02:25<00:00,  1.16s/it]


🔵 Eval VI→EN Loss = 1.9090, EN→VI Loss = 1.8589, Avg = 1.883936
✅ Saved best checkpoint to checkpoint/loss1 (eval_loss = 1.8839)


Epoch 3/7: 100%|██████████| 1000/1000 [10:12<00:00,  1.63it/s]


🟢 Epoch 3: Train Loss = 1848.1098, Avg = 1.848110


🔍 Evaluating: 100%|██████████| 125/125 [02:25<00:00,  1.16s/it]


🔵 Eval VI→EN Loss = 1.7798, EN→VI Loss = 1.7435, Avg = 1.761635
✅ Saved best checkpoint to checkpoint/loss1 (eval_loss = 1.7616)


Epoch 4/7: 100%|██████████| 1000/1000 [10:11<00:00,  1.64it/s]


🟢 Epoch 4: Train Loss = 1788.7580, Avg = 1.788758


🔍 Evaluating: 100%|██████████| 125/125 [02:25<00:00,  1.16s/it]


🔵 Eval VI→EN Loss = 1.7244, EN→VI Loss = 1.6916, Avg = 1.707980
✅ Saved best checkpoint to checkpoint/loss1 (eval_loss = 1.7080)


Epoch 5/7: 100%|██████████| 1000/1000 [10:11<00:00,  1.64it/s]


🟢 Epoch 5: Train Loss = 1763.0008, Avg = 1.763001


🔍 Evaluating: 100%|██████████| 125/125 [02:25<00:00,  1.16s/it]


🔵 Eval VI→EN Loss = 1.6834, EN→VI Loss = 1.6508, Avg = 1.667097
✅ Saved best checkpoint to checkpoint/loss1 (eval_loss = 1.6671)


Epoch 6/7: 100%|██████████| 1000/1000 [10:11<00:00,  1.63it/s]


🟢 Epoch 6: Train Loss = 1717.5398, Avg = 1.717540


🔍 Evaluating: 100%|██████████| 125/125 [02:25<00:00,  1.16s/it]


🔵 Eval VI→EN Loss = 1.6508, EN→VI Loss = 1.6179, Avg = 1.634352
✅ Saved best checkpoint to checkpoint/loss1 (eval_loss = 1.6344)


Epoch 7/7: 100%|██████████| 1000/1000 [10:12<00:00,  1.63it/s]


🟢 Epoch 7: Train Loss = 1702.6262, Avg = 1.702626


🔍 Evaluating: 100%|██████████| 125/125 [02:25<00:00,  1.16s/it]


🔵 Eval VI→EN Loss = 1.6228, EN→VI Loss = 1.5892, Avg = 1.605993
✅ Saved best checkpoint to checkpoint/loss1 (eval_loss = 1.6060)


BiEncoder(
  (encoder): PeftModelForFeatureExtraction(
    (base_model): LoraModel(
      (model): XLMRobertaModel(
        (embeddings): XLMRobertaEmbeddings(
          (word_embeddings): Embedding(250002, 1024, padding_idx=1)
          (position_embeddings): Embedding(8194, 1024, padding_idx=1)
          (token_type_embeddings): Embedding(1, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): XLMRobertaEncoder(
          (layer): ModuleList(
            (0-23): 24 x XLMRobertaLayer(
              (attention): XLMRobertaAttention(
                (self): XLMRobertaSdpaSelfAttention(
                  (query): Linear(in_features=1024, out_features=1024, bias=True)
                  (key): Linear(in_features=1024, out_features=1024, bias=True)
                  (value): Linear(in_features=1024, out_features=1024, bias=True)
                  (dropout): Dropout(p=0.1, inplace=F

In [9]:
import json
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from torch.nn.functional import cosine_similarity

def l2_normalize(x):
    return x / x.norm(p=2, dim=-1, keepdim=True)

def get_embedding(texts, model, tokenizer, device, max_length=256):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
            embeddings = outputs.pooler_output
        else:
            embeddings = outputs.last_hidden_state[:, 0]
    return l2_normalize(embeddings)

def embedding_en_corpus(train_df, model, tokenizer, device, batch_size=64, max_length=256):
    all_contexts = set()
    for _, row in train_df.iterrows():
        pos_items = eval(row['en_pos']) if isinstance(row['en_pos'], str) else row['en_pos']
        neg_items = eval(row['en_neg']) if isinstance(row['en_neg'], str) else row['en_neg']
        all_contexts.update(pos_items)
        all_contexts.update(neg_items)
    all_contexts = list(all_contexts)

    print("🔄 Encoding EN corpus...")
    corpus_embeddings = []
    for i in tqdm(range(0, len(all_contexts), batch_size)):
        batch = all_contexts[i:i+batch_size]
        emb = get_embedding(batch, model, tokenizer, device, max_length)
        corpus_embeddings.append(emb.cpu())  # keep CPU for later cosine similarity
    corpus_embeddings = torch.cat(corpus_embeddings, dim=0)  # shape: (N, D)
    return all_contexts, corpus_embeddings

def embedding_vi_corpus(train_df, model, tokenizer, device, batch_size=64, max_length=256):
    all_contexts = set()
    for _, row in train_df.iterrows():
        pos_items = eval(row['vi_pos']) if isinstance(row['vi_pos'], str) else row['vi_pos']
        neg_items = eval(row['vi_neg']) if isinstance(row['vi_neg'], str) else row['vi_neg']
        all_contexts.update(pos_items)
        all_contexts.update(neg_items)
    all_contexts = list(all_contexts)

    print("🔄 Encoding VI corpus...")
    corpus_embeddings = []
    for i in tqdm(range(0, len(all_contexts), batch_size)):
        batch = all_contexts[i:i+batch_size]
        emb = get_embedding(batch, model, tokenizer, device, max_length)
        corpus_embeddings.append(emb.cpu())  # keep CPU for cosine sim
    corpus_embeddings = torch.cat(corpus_embeddings, dim=0)
    return all_contexts, corpus_embeddings

def vi_query_and_en_context(
    train_df,
    all_contexts,
    corpus_embeddings,
    model,
    tokenizer,
    device,
    top_k=10,
    max_length=256
):
    results = []
    corpus_embeddings = corpus_embeddings.to(device)

    for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
        query = row['vi_query']
        pos_items = eval(row['en_pos']) if isinstance(row['en_pos'], str) else row['en_pos']
        pos_items = set(str(p).strip() for p in pos_items)

        q_emb = get_embedding([query], model, tokenizer, device, max_length)  # shape: (1, D)
        sims = cosine_similarity(q_emb, corpus_embeddings, dim=1)  # shape: (N,)
        top_indices = sims.topk(top_k).indices.cpu().tolist()
        top_contexts = [str(all_contexts[i]).strip() for i in top_indices]

        results.append({
            "query": query,
            "pos": list(pos_items),
            "top_k_pred": top_contexts
        })

    return pd.DataFrame(results)
def en_query_and_vi_context(
    train_df,
    all_contexts,
    corpus_embeddings,
    model,
    tokenizer,
    device,
    top_k=10,
    max_length=256
):
    results = []
    corpus_embeddings = corpus_embeddings.to(device)

    for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
        query = row['en_query']
        pos_items = eval(row['vi_pos']) if isinstance(row['vi_pos'], str) else row['vi_pos']
        pos_items = set(str(p).strip() for p in pos_items)

        q_emb = get_embedding([query], model, tokenizer, device, max_length)  # shape: (1, D)
        sims = cosine_similarity(q_emb, corpus_embeddings, dim=1)  # shape: (N,)
        top_indices = sims.topk(top_k).indices.cpu().tolist()
        top_contexts = [str(all_contexts[i]).strip() for i in top_indices]

        results.append({
            "query": query,
            "pos": list(pos_items),
            "top_k_pred": top_contexts
        })

    return pd.DataFrame(results)
def vi_query_and_vi_context(
    train_df,
    all_contexts,
    corpus_embeddings,
    model,
    tokenizer,
    device,
    top_k=10,
    max_length=256
):
    results = []
    corpus_embeddings = corpus_embeddings.to(device)

    for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
        query = row['vi_query']
        pos_items = eval(row['vi_pos']) if isinstance(row['vi_pos'], str) else row['vi_pos']
        pos_items = set(str(p).strip() for p in pos_items)

        q_emb = get_embedding([query], model, tokenizer, device, max_length)  # shape: (1, D)
        sims = cosine_similarity(q_emb, corpus_embeddings, dim=1)  # shape: (N,)
        top_indices = sims.topk(top_k).indices.cpu().tolist()
        top_contexts = [str(all_contexts[i]).strip() for i in top_indices]

        results.append({
            "query": query,
            "pos": list(pos_items),
            "top_k_pred": top_contexts
        })

    return pd.DataFrame(results)
def en_query_and_en_context(
    train_df,
    all_contexts,
    corpus_embeddings,
    model,
    tokenizer,
    device,
    top_k=10,
    max_length=256
):
    results = []
    corpus_embeddings = corpus_embeddings.to(device)

    for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
        query = row['en_query']
        pos_items = eval(row['en_pos']) if isinstance(row['en_pos'], str) else row['en_pos']
        pos_items = set(str(p).strip() for p in pos_items)

        q_emb = get_embedding([query], model, tokenizer, device, max_length)  # shape: (1, D)
        sims = cosine_similarity(q_emb, corpus_embeddings, dim=1)  # shape: (N,)
        top_indices = sims.topk(top_k).indices.cpu().tolist()
        top_contexts = [str(all_contexts[i]).strip() for i in top_indices]

        results.append({
            "query": query,
            "pos": list(pos_items),
            "top_k_pred": top_contexts
        })

    return pd.DataFrame(results)

In [10]:
# def compute_recall_mrr_multi_gt(df, k=10):
#     acc1, acc5, acc10 = [], [], []
#     recall10, mrr10 = [], []

#     for _, row in df.iterrows():
#         true_texts = set(str(x).strip() for x in row['pos'])
#         pred_texts = [str(x).strip() for x in row['top_k_pred'][:k]]

#         # ACC@k: ít nhất 1 đúng trong top-k
#         acc1.append(1 if any(pred in true_texts for pred in pred_texts[:1]) else 0)
#         acc5.append(1 if any(pred in true_texts for pred in pred_texts[:5]) else 0)
#         acc10.append(1 if any(pred in true_texts for pred in pred_texts[:10]) else 0)

#         # Recall@10: tỷ lệ ground-truth nằm trong top-10
#         recall = len(set(pred_texts) & true_texts) / len(true_texts)
#         recall10.append(recall)

#         # MRR@10: reciprocal rank của đúng đầu tiên trong top-10
#         mrr = 0
#         for rank, pred in enumerate(pred_texts, start=1):
#             if pred in true_texts:
#                 mrr = 1 / rank
#                 break
#         mrr10.append(mrr)

#     # Kết quả trung bình toàn bộ truy vấn
#     print(f"Accuracy@1:  {np.mean(acc1):.4f}")
#     print(f"Accuracy@5:  {np.mean(acc5):.4f}")
#     print(f"Accuracy@10: {np.mean(acc10):.4f}")
#     print(f"Recall@10:   {np.mean(recall10):.4f}")
#     print(f"MRR@10:      {np.mean(mrr10):.4f}")

#     return {
#         "acc@1": np.mean(acc1),
#         "acc@5": np.mean(acc5),
#         "acc@10": np.mean(acc10),
#         "recall@10": np.mean(recall10),
#         "mrr@10": np.mean(mrr10),
#     }
def compute_recall_mrr_multi_gt(df, k=10):
    acc1, acc5, acc_k = [], [], []
    recall_k, mrr_k = [], []

    for _, row in df.iterrows():
        true_texts = set(str(x).strip() for x in row['pos'])
        pred_texts = [str(x).strip() for x in row['top_k_pred'][:k]]

        # ACC@1
        acc1.append(1 if any(pred in true_texts for pred in pred_texts[:1]) else 0)

        # ACC@5
        acc5.append(1 if any(pred in true_texts for pred in pred_texts[:5]) else 0)

        # ACC@k
        acc_k.append(1 if any(pred in true_texts for pred in pred_texts) else 0)

        # Recall@k
        recall = len(set(pred_texts) & true_texts) / len(true_texts)
        recall_k.append(recall)

        # MRR@k
        mrr = 0
        for rank, pred in enumerate(pred_texts, start=1):
            if pred in true_texts:
                mrr = 1 / rank
                break
        mrr_k.append(mrr)

    # In kết quả
    print(f"Accuracy@1:  {np.mean(acc1):.4f}")
    print(f"Accuracy@5:  {np.mean(acc5):.4f}")
    print(f"Accuracy@{k}: {np.mean(acc_k):.4f}")
    print(f"Recall@{k}:   {np.mean(recall_k):.4f}")
    print(f"MRR@{k}:      {np.mean(mrr_k):.4f}")

    return {
        "acc@1": np.mean(acc1),
        "acc@5": np.mean(acc5),
        f"acc@{k}": np.mean(acc_k),
        f"recall@{k}": np.mean(recall_k),
        f"mrr@{k}": np.mean(mrr_k),
    }


In [11]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "BAAI/bge-m3"
checkpoint_path = "checkpoint/loss1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(checkpoint_path).to(device)
model.eval()


XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 1024, padding_idx=1)
    (position_embeddings): Embedding(8194, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elem

In [12]:
train_df = pd.read_json(test_path, lines=True)
vi_contexts, vi_corpus_embeddings = embedding_vi_corpus(train_df, model, tokenizer, device)
en_contexts, en_corpus_embeddings = embedding_en_corpus(train_df, model, tokenizer, device)

# Predict top-10
vi_query_and_en_context_top_40 = vi_query_and_en_context(
    train_df=train_df,
    all_contexts=en_contexts,
    corpus_embeddings=en_corpus_embeddings,
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=40
)
vi_query_and_vi_context_top_40 = vi_query_and_vi_context(
    train_df=train_df,
    all_contexts=vi_contexts,
    corpus_embeddings=vi_corpus_embeddings,
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=40
)
en_query_and_vi_context_top_40 = en_query_and_vi_context(
    train_df=train_df,
    all_contexts=vi_contexts,
    corpus_embeddings=vi_corpus_embeddings,
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=40
)
en_query_and_en_context_top_40 = en_query_and_en_context(
    train_df=train_df,
    all_contexts=en_contexts,
    corpus_embeddings=en_corpus_embeddings,
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=40
)

🔄 Encoding VI corpus...


100%|██████████| 51/51 [01:17<00:00,  1.52s/it]


🔄 Encoding EN corpus...


100%|██████████| 52/52 [01:18<00:00,  1.51s/it]
100%|██████████| 1000/1000 [00:20<00:00, 47.91it/s]
100%|██████████| 1000/1000 [00:20<00:00, 48.24it/s]
100%|██████████| 1000/1000 [00:19<00:00, 50.23it/s]
100%|██████████| 1000/1000 [00:19<00:00, 50.45it/s]


In [13]:
compute_recall_mrr_multi_gt(vi_query_and_en_context_top_40, k=10)

Accuracy@1:  0.3050
Accuracy@5:  0.5340
Accuracy@10: 0.6250
Recall@10:   0.6250
MRR@10:      0.4002


{'acc@1': 0.305,
 'acc@5': 0.534,
 'acc@10': 0.625,
 'recall@10': 0.625,
 'mrr@10': 0.4001765873015873}

In [14]:
compute_recall_mrr_multi_gt(vi_query_and_vi_context_top_40, k=10)

Accuracy@1:  0.4760
Accuracy@5:  0.6670
Accuracy@10: 0.7520
Recall@10:   0.7520
MRR@10:      0.5582


{'acc@1': 0.476,
 'acc@5': 0.667,
 'acc@10': 0.752,
 'recall@10': 0.752,
 'mrr@10': 0.5582178571428572}

In [15]:
compute_recall_mrr_multi_gt(en_query_and_vi_context_top_40, k=10)

Accuracy@1:  0.2830
Accuracy@5:  0.5190
Accuracy@10: 0.6030
Recall@10:   0.6030
MRR@10:      0.3851


{'acc@1': 0.283,
 'acc@5': 0.519,
 'acc@10': 0.603,
 'recall@10': 0.603,
 'mrr@10': 0.3851003968253968}

In [16]:
compute_recall_mrr_multi_gt(en_query_and_en_context_top_40, k=10)

Accuracy@1:  0.3650
Accuracy@5:  0.5700
Accuracy@10: 0.6510
Recall@10:   0.6510
MRR@10:      0.4540


{'acc@1': 0.365,
 'acc@5': 0.57,
 'acc@10': 0.651,
 'recall@10': 0.651,
 'mrr@10': 0.4539845238095238}

In [17]:
train_df = pd.read_json(eval_path, lines=True)
vi_contexts, vi_corpus_embeddings = embedding_vi_corpus(train_df, model, tokenizer, device)
en_contexts, en_corpus_embeddings = embedding_en_corpus(train_df, model, tokenizer, device)

# Predict top-10
vi_query_and_en_context_top_40 = vi_query_and_en_context(
    train_df=train_df,
    all_contexts=en_contexts,
    corpus_embeddings=en_corpus_embeddings,
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=40
)
vi_query_and_vi_context_top_40 = vi_query_and_vi_context(
    train_df=train_df,
    all_contexts=vi_contexts,
    corpus_embeddings=vi_corpus_embeddings,
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=40
)
en_query_and_vi_context_top_40 = en_query_and_vi_context(
    train_df=train_df,
    all_contexts=vi_contexts,
    corpus_embeddings=vi_corpus_embeddings,
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=40
)
en_query_and_en_context_top_40 = en_query_and_en_context(
    train_df=train_df,
    all_contexts=en_contexts,
    corpus_embeddings=en_corpus_embeddings,
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=40
)

🔄 Encoding VI corpus...


100%|██████████| 31/31 [00:46<00:00,  1.49s/it]


🔄 Encoding EN corpus...


100%|██████████| 31/31 [00:46<00:00,  1.49s/it]
100%|██████████| 500/500 [00:10<00:00, 47.10it/s]
100%|██████████| 500/500 [00:10<00:00, 48.34it/s]
100%|██████████| 500/500 [00:09<00:00, 51.18it/s]
100%|██████████| 500/500 [00:09<00:00, 51.02it/s]


In [18]:
compute_recall_mrr_multi_gt(vi_query_and_en_context_top_40, k=10)

Accuracy@1:  0.3280
Accuracy@5:  0.5720
Accuracy@10: 0.6960
Recall@10:   0.6960
MRR@10:      0.4413


{'acc@1': 0.328,
 'acc@5': 0.572,
 'acc@10': 0.696,
 'recall@10': 0.696,
 'mrr@10': 0.44127619047619043}

In [19]:
compute_recall_mrr_multi_gt(vi_query_and_vi_context_top_40, k=10)

Accuracy@1:  0.4840
Accuracy@5:  0.7180
Accuracy@10: 0.8100
Recall@10:   0.8100
MRR@10:      0.5783


{'acc@1': 0.484,
 'acc@5': 0.718,
 'acc@10': 0.81,
 'recall@10': 0.81,
 'mrr@10': 0.5783428571428572}

In [20]:
compute_recall_mrr_multi_gt(en_query_and_vi_context_top_40, k=10)

Accuracy@1:  0.3140
Accuracy@5:  0.5820
Accuracy@10: 0.6960
Recall@10:   0.6960
MRR@10:      0.4285


{'acc@1': 0.314,
 'acc@5': 0.582,
 'acc@10': 0.696,
 'recall@10': 0.696,
 'mrr@10': 0.42853730158730163}

In [21]:
compute_recall_mrr_multi_gt(en_query_and_en_context_top_40, k=10)

Accuracy@1:  0.4180
Accuracy@5:  0.6300
Accuracy@10: 0.7000
Recall@10:   0.7000
MRR@10:      0.5050


{'acc@1': 0.418,
 'acc@5': 0.63,
 'acc@10': 0.7,
 'recall@10': 0.7,
 'mrr@10': 0.5049547619047619}