In [1]:
import json
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np
import random
train_path="/kaggle/input/crawl-data/train_crosslingual.jsonl"
test_path="/kaggle/input/crawl-data/test_crosslingual.jsonl"
eval_path="/kaggle/input/crawl-data/eval_crosslingual.jsonl"


In [2]:
class Vi_Query_En_Context_Dataset(Dataset):
    def __init__(self, data_path, tokenizer, max_len=256):
        import json
        with open(data_path, 'r', encoding='utf-8') as f:
            raw_data = [json.loads(line) for line in f]
        
        self.tokenizer = tokenizer
        self.samples = []

        for row in raw_data:
            vi_query = row['vi_query']
            en_pos = row['en_pos']
            en_neg = row['en_neg']

            # Positive samples (label = 1)
            for pos in en_pos:
                self.samples.append({
                    "query": vi_query,
                    "context": pos,
                    "label": 1
                })

            # Negative samples (label = 0), chọn ngẫu nhiên vài cái
            for neg in random.sample(en_neg, k=min(4, len(en_neg))):
                self.samples.append({
                    "query": vi_query,
                    "context": neg,
                    "label": 0
                })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]
        encoded = self.tokenizer(
            item['query'],
            item['context'],
            padding='max_length',
            truncation=True,
            max_length=256,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(item['label'], dtype=torch.long)
        }


In [3]:
import json
import random
import torch
from torch.utils.data import Dataset

class En_Query_Vi_Context_Dataset(Dataset):
    def __init__(self, data_path, tokenizer, max_len=256):
        with open(data_path, 'r', encoding='utf-8') as f:
            raw_data = [json.loads(line) for line in f]

        self.tokenizer = tokenizer
        self.samples = []

        for row in raw_data:
            # đây là cho en-vi
            en_query = row['en_query']
            vi_pos = row['vi_pos']
            vi_neg = row['vi_neg']

            # Positive samples (label = 1)
            for pos in vi_pos:
                self.samples.append({
                    "query": en_query,
                    "context": pos,
                    "label": 1
                })

            # Negative samples (label = 0)
            for neg in random.sample(vi_neg, k=min(4, len(vi_neg))):
                self.samples.append({
                    "query": en_query,
                    "context": neg,
                    "label": 0
                })

            # tạo thêm cho vi-en, vi-vi, en-en

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]
        encoded = self.tokenizer(
            item['query'],
            item['context'],
            padding='max_length',
            truncation=True,
            max_length=256,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(item['label'], dtype=torch.long)
        }


In [4]:
import json
import random
import torch
from torch.utils.data import Dataset

class CrossLingualAllDirectionDataset(Dataset):
    def __init__(self, data_path, tokenizer, max_len=256):
        with open(data_path, 'r', encoding='utf-8') as f:
            raw_data = [json.loads(line) for line in f]

        self.tokenizer = tokenizer
        self.samples = []

        for row in raw_data:
            en_query = row['en_query']
            vi_query = row['vi_query']
            en_pos = row['en_pos']
            vi_pos = row['vi_pos']
            en_neg = row['en_neg']
            vi_neg = row['vi_neg']

            directions = [
                ('en', 'vi'),
                ('vi', 'en'),
                ('vi', 'vi'),
                ('en', 'en')
            ]

            for dir_q, dir_c in directions:
                # Lấy query theo chiều
                query = vi_query if dir_q == 'vi' else en_query
                pos_list = vi_pos if dir_c == 'vi' else en_pos
                neg_list = vi_neg if dir_c == 'vi' else en_neg

                if not pos_list or not neg_list:
                    continue

                # Chọn 1 positive và tối đa 4 negative
                pos = random.choice(pos_list)
                negs = random.sample(neg_list, k=min(4, len(neg_list)))

                self.samples.append({
                    "query": query,
                    "context": pos,
                    "label": 1
                })

                for neg in negs:
                    self.samples.append({
                        "query": query,
                        "context": neg,
                        "label": 0
                    })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]
        encoded = self.tokenizer(
            item['query'],
            item['context'],
            padding='max_length',
            truncation=True,
            max_length=256,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(float(item['label']))  # float nếu dùng BCEWithLogitsLoss
        }


In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.optim import AdamW
from tqdm import tqdm

model_name = "BAAI/bge-reranker-v2-m3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# ✅ Dataset đã có rồi
dataset = CrossLingualAllDirectionDataset(eval_path, tokenizer, max_len=256)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# ✅ Setup training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.BCEWithLogitsLoss()

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/795 [00:00<?, ?B/s]

2025-05-05 03:40:59.238451: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746416459.407636      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746416459.457463      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

In [6]:
import os
from torch.cuda.amp import autocast, GradScaler

num_epochs = 2
scaler = GradScaler()  

model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    correct = 0
    total = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].float().to(device)  # ✅ label dạng float

        optimizer.zero_grad()

        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.view(-1)
            loss = loss_fn(logits, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

        # ✅ Tính accuracy tạm bằng ngưỡng 0
        preds = (logits > 0).long()
        correct += (preds == labels.long()).sum().item()
        total += labels.size(0)

    acc = correct / total
    print(f"Epoch {epoch+1}: loss = {total_loss:.4f}, accuracy = {acc:.4f}")

    # ✅ Save model
    save_path = f"cross_encoder_epoch{epoch+1}"
    os.makedirs(save_path, exist_ok=True)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)


  scaler = GradScaler()
  with autocast():
Epoch 1: 100%|██████████| 625/625 [13:37<00:00,  1.31s/it]


Epoch 1: loss = 226.9257, accuracy = 0.8506


Epoch 2: 100%|██████████| 625/625 [13:38<00:00,  1.31s/it]


Epoch 2: loss = 96.2603, accuracy = 0.9433
