In [2]:
import pandas as pd
from tqdm import tqdm
import os
import json

In [1]:
!pip install -q sentence-transformers
!pip install -q pyvi

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from sentence_transformers import SentenceTransformer
from pyvi.ViTokenizer import tokenize

model = SentenceTransformer('dangvantuan/vietnamese-embedding')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.63k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/753 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/965 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [4]:
# Đường dẫn tới thư mục chứa các file JSON
qa_folder_path = '/kaggle/input/qa-dataset/all_dataset_qa_retrieve'

# Danh sách để chứa dữ liệu
qa_data = []

# Lặp qua tất cả các file trong thư mục và sử dụng tqdm để theo dõi tiến trình
for file_name in tqdm(os.listdir(qa_folder_path), desc="Reading files"):
    if file_name.endswith('.json'):  # Kiểm tra nếu file là file JSON
        file_path = os.path.join(qa_folder_path, file_name)
        
        # Mở và đọc dữ liệu từ file JSON
        with open(file_path, 'r', encoding='utf-8') as f:
            qa_file_data = json.load(f)
            
            # Lặp qua các câu hỏi và câu trả lời trong dữ liệu
            for qa in qa_file_data['qa']:
                qa_data.append({
                    'question': qa['question'],
                    'answer': qa['answer'],
                    'id': qa['id']
                })

Reading files: 100%|██████████| 499/499 [00:01<00:00, 271.20it/s]


In [6]:
from transformers import AutoTokenizer
# Khởi tạo tokenizer từ mô hình SentenceTransformer
tokenizer = AutoTokenizer.from_pretrained('dangvantuan/vietnamese-embedding')
max_length = 512
# Giới hạn token cho từng câu
def truncate_text(text, max_length=512):
    # Tokenize câu và giới hạn độ dài
    text  = tokenize(text).lower()
    tokens = tokenizer.tokenize(text)
    if len(tokens) > max_length:
        tokens = tokens[:max_length]
    return tokenizer.convert_tokens_to_string(tokens)

In [11]:
# Tạo DataFrame từ dữ liệu đã thu thập
qa_df = pd.DataFrame(qa_data)

In [18]:
questions = qa_df['question'].apply(lambda x: truncate_text(x, max_length))

In [15]:
question_embeddings = model.encode(questions, batch_size=32, convert_to_tensor=False)  

Batches:   0%|          | 0/129 [00:00<?, ?it/s]

In [20]:
qa_df['question_embedding'] = list(question_embeddings)

In [7]:
chunked_df = pd.read_json('/kaggle/input/df-full-content-and-content/df/chunked_df.json')

In [22]:
chunked_df['tokenized_content'] = chunked_df['content'].apply(lambda x: truncate_text(x, max_length))

In [23]:
tokenized_content = chunked_df['tokenized_content'].tolist() 

In [26]:
content_embeddings = model.encode(tokenized_content, batch_size=8, convert_to_tensor=False)  # batch_size tùy chỉnh

Batches:   0%|          | 0/39628 [00:00<?, ?it/s]

In [28]:
import numpy as np

In [29]:
chunked_df['embedding'] = list(content_embeddings)
embedding_chunked_df = chunked_df[['id','id_baiviet','embedding']]
# Chuyển cột embedding trong embedding_full_df thành ma trận numpy
embeddings_matrix = np.vstack(chunked_df['embedding'].values)
id_baiviet_list = chunked_df['id'].values

In [30]:
chunked_df[['id','id_baiviet','embedding']].to_csv('chunked_embedding.csv')

In [31]:
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Hàm tối ưu tính Top-k accuracy với tqdm
def compute_top_k_accuracies(qa_df, embeddings_matrix, id_baiviet_list, max_k=100):
    correct_counts = np.zeros(max_k)  # Mảng lưu số câu hỏi đúng cho từng k
    
    # Sử dụng tqdm để theo dõi tiến trình
    for _, row in tqdm(qa_df.iterrows(), total=len(qa_df), desc="Processing questions"):
        question_embedding = row['question_embedding'].reshape(1, -1)
        true_id = row['id']
        
        # Tính độ tương đồng cosine
        similarities = cosine_similarity(question_embedding, embeddings_matrix)[0]
        
        # Lấy các chỉ số sắp xếp theo độ tương đồng giảm dần
        sorted_indices = similarities.argsort()[::-1]
        sorted_top_ids = id_baiviet_list[sorted_indices]
        
        # Kiểm tra xem true_id có nằm trong Top-k cho từng k
        for k in range(1, max_k + 1):
            if true_id in sorted_top_ids[:k]:
                correct_counts[k - 1] += 1  # Kết quả đúng cho Top-k
    
    # Tính độ chính xác cho từng k
    accuracies = correct_counts / len(qa_df)
    return accuracies

# Tính Top-k accuracy
max_k = 100
top_k_accuracies = compute_top_k_accuracies(qa_df, embeddings_matrix, id_baiviet_list, max_k)

# In kết quả
for k in range(1, max_k + 1):
    print(f"Top-{k} Accuracy: {top_k_accuracies[k - 1]:.2%}")

Processing questions: 100%|██████████| 4121/4121 [59:25<00:00,  1.16it/s]

Top-1 Accuracy: 14.95%
Top-2 Accuracy: 22.74%
Top-3 Accuracy: 27.47%
Top-4 Accuracy: 30.79%
Top-5 Accuracy: 33.46%
Top-6 Accuracy: 35.67%
Top-7 Accuracy: 37.44%
Top-8 Accuracy: 39.07%
Top-9 Accuracy: 40.67%
Top-10 Accuracy: 41.64%
Top-11 Accuracy: 42.66%
Top-12 Accuracy: 43.85%
Top-13 Accuracy: 44.79%
Top-14 Accuracy: 45.64%
Top-15 Accuracy: 46.49%
Top-16 Accuracy: 47.12%
Top-17 Accuracy: 48.19%
Top-18 Accuracy: 49.11%
Top-19 Accuracy: 49.72%
Top-20 Accuracy: 50.47%
Top-21 Accuracy: 51.13%
Top-22 Accuracy: 51.64%
Top-23 Accuracy: 52.22%
Top-24 Accuracy: 52.78%
Top-25 Accuracy: 53.34%
Top-26 Accuracy: 54.04%
Top-27 Accuracy: 54.57%
Top-28 Accuracy: 54.96%
Top-29 Accuracy: 55.40%
Top-30 Accuracy: 55.86%
Top-31 Accuracy: 56.25%
Top-32 Accuracy: 56.49%
Top-33 Accuracy: 57.05%
Top-34 Accuracy: 57.58%
Top-35 Accuracy: 58.02%
Top-36 Accuracy: 58.19%
Top-37 Accuracy: 58.48%
Top-38 Accuracy: 58.89%
Top-39 Accuracy: 59.33%
Top-40 Accuracy: 59.65%
Top-41 Accuracy: 60.03%
Top-42 Accuracy: 60.30%
T


