In [1]:
!pip -q install rank_bm25
!pip -q install pyvi

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [25]:
import os
import json
import pandas as pd
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk
from pyvi.ViTokenizer import tokenize
import numpy as np

In [3]:
# Đọc file .txt và lưu vào set
file_path = '/kaggle/input/vietnamese-stopwords-dash/vietnamese-stopwords-dash.txt'  # Thay bằng đường dẫn file .txt của bạn

# Tạo một set để lưu các phần tử từ file
stop_words = set()

# Đọc nội dung file và thêm từng dòng vào set
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        stop_words.add(line.strip())  # strip() để loại bỏ ký tự xuống dòng và khoảng trắng thừa

In [4]:
df = pd.read_json('/kaggle/input/book-data/all_books_data.json')
len(df)

663

In [5]:
# Hàm tạo meta_data
def generate_meta_data(row):
    title = row['title']
    author = row['author']
    description = row['description']
    longest_review = max(row['reviews'], key=lambda r: len(r.split()), default="")  # Tìm review dài nhất

    # Nối các trường thành đoạn văn
    return f"{title} {author} {description} {longest_review}."

# Tạo cột meta_data
df['meta_data'] = df.apply(generate_meta_data, axis=1)


In [6]:
df['tokenized_metadata'] = df['meta_data'].apply(lambda x: tokenize(x.lower()).split())

In [7]:
def filter_tokens(token_list, stop_words):
    return [token for token in token_list if token not in stop_words]

In [8]:
import json
import pandas as pd
import os
import re

def create_dataframe_from_json_files(folder_path):
    """
    Đọc các file JSON trong thư mục và tạo DataFrame.

    Args:
        folder_path: Đường dẫn đến thư mục chứa các file JSON.

    Returns:
        pandas.DataFrame: DataFrame chứa dữ liệu từ các file JSON, hoặc None nếu có lỗi.
    """

    data = []
    for filename in os.listdir(folder_path):
        if filename.startswith("index_") and filename.endswith(".json"):
            try:
                # Trích xuất index từ tên file
                match = re.search(r"index_(\d+)\.json", filename)
                if match:
                    index = int(match.group(1))
                else:
                    print(f"Không thể trích xuất index từ tên file: {filename}")
                    continue

                filepath = os.path.join(folder_path, filename)
                with open(filepath, 'r', encoding='utf-8') as f: #Thêm encoding='utf-8' để xử lý tiếng Việt
                    try:
                        json_data = json.load(f)
                        for item in json_data: #Xử lý trường hợp JSON là một list các object
                            data.append({'index': index, 'question': item.get('question'), 'answer': item.get('answer')})
                    except json.JSONDecodeError:
                        print(f"Lỗi giải mã JSON trong file: {filename}")
            except FileNotFoundError:
                print(f"File không tồn tại: {filename}")
            except Exception as e:
                print(f"Lỗi không xác định khi xử lý file {filename}: {e}")

    if data:
        df = pd.DataFrame(data)
        return df
    else:
        return None

In [9]:
# Ví dụ sử dụng:
folder_path = "/kaggle/input/query-dataset/json_files" # Thay thế bằng đường dẫn thực tế đến thư mục của bạn
df_qa = create_dataframe_from_json_files(folder_path)

In [10]:
# Hàm tạo meta_data
def generate_qa(row):
    qs = row['question']
    ans = row['answer']

    # Nối các trường thành đoạn văn
    return f"{qs} {ans}"

In [11]:
df_qa['qs_ans'] = df_qa.apply(generate_qa, axis=1)

In [12]:
df_qa['tokenized_qs'] = df_qa['question'].apply(lambda x: tokenize(x.lower()).split())

In [13]:
df_qa['tokenized_qa'] = df_qa['qs_ans'].apply(lambda x: tokenize(x.lower()).split())

In [14]:
bm25 = BM25Okapi(df['tokenized_metadata'])

In [15]:
df_qa.head(1)

Unnamed: 0,index,question,answer,qs_ans,tokenized_qs,tokenized_qa
0,343,Tôi muốn đọc một cuốn sách trinh thám xoay qua...,Bạn nên tìm một cuốn sách trinh thám tập trung...,Tôi muốn đọc một cuốn sách trinh thám xoay qua...,"[tôi, muốn, đọc, một, cuốn, sách, trinh_thám, ...","[tôi, muốn, đọc, một, cuốn, sách, trinh_thám, ..."


# ko bỏ stop word

## Chỉ câu truy vấn

In [16]:
from tqdm import tqdm

# Hàm đánh giá BM25
def evaluate_bm25_on_questions(df, qa_df, bm25,col ,top_k_values=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]):
    results = {f"Top-{k}": 0 for k in top_k_values}  # Khởi tạo dictionary với các mức đánh giá Top-k
    total_queries = len(qa_df)

    for _, row in tqdm(qa_df.iterrows(), total=total_queries):
        query = row[col]
        true_id = row['index']

        # Truy vấn BM25 trên `content`
        scores = bm25.get_scores(query)
        top_indices = scores.argsort()[::-1][:100]  # Lấy chỉ số của Top-20 kết quả

        # Lấy danh sách các id trả về
        top_ids = df.iloc[top_indices]['index'].tolist()

        # Đánh giá cho từng Top-k (Top-1, Top-3, Top-5, ...)
        for k in top_k_values:
            if true_id in top_ids[:k]:
                results[f"Top-{k}"] += 1

    # Tính độ chính xác cho mỗi Top-k
    for k in top_k_values:
        results[f"Top-{k}"] /= total_queries

    return results

In [17]:
top_k_values = range(0, 200)
evaluation_results = evaluate_bm25_on_questions(df, df_qa, bm25,'tokenized_qs',top_k_values)
print("BM25 Evaluation Results:", evaluation_results)

100%|██████████| 371/371 [00:01<00:00, 259.44it/s]

BM25 Evaluation Results: {'Top-0': 0.0, 'Top-1': 0.22641509433962265, 'Top-2': 0.30997304582210244, 'Top-3': 0.3584905660377358, 'Top-4': 0.3881401617250674, 'Top-5': 0.40970350404312667, 'Top-6': 0.42587601078167114, 'Top-7': 0.4366576819407008, 'Top-8': 0.46630727762803237, 'Top-9': 0.48247978436657685, 'Top-10': 0.49865229110512127, 'Top-11': 0.5040431266846361, 'Top-12': 0.5094339622641509, 'Top-13': 0.5202156334231806, 'Top-14': 0.522911051212938, 'Top-15': 0.522911051212938, 'Top-16': 0.5283018867924528, 'Top-17': 0.5336927223719676, 'Top-18': 0.5336927223719676, 'Top-19': 0.5390835579514824, 'Top-20': 0.5444743935309974, 'Top-21': 0.5498652291105122, 'Top-22': 0.5525606469002695, 'Top-23': 0.5579514824797843, 'Top-24': 0.5633423180592992, 'Top-25': 0.5660377358490566, 'Top-26': 0.5714285714285714, 'Top-27': 0.5714285714285714, 'Top-28': 0.5795148247978437, 'Top-29': 0.5849056603773585, 'Top-30': 0.5876010781671159, 'Top-31': 0.5902964959568733, 'Top-32': 0.5956873315363881, 'Top




## Cả câu truy vấn và câu trả lời

In [18]:
evaluation_results = evaluate_bm25_on_questions(df, df_qa, bm25,'tokenized_qa',top_k_values)
print("BM25 Evaluation Results:", evaluation_results)

100%|██████████| 371/371 [00:04<00:00, 83.27it/s]

BM25 Evaluation Results: {'Top-0': 0.0, 'Top-1': 0.20754716981132076, 'Top-2': 0.2776280323450135, 'Top-3': 0.30997304582210244, 'Top-4': 0.3423180592991914, 'Top-5': 0.37735849056603776, 'Top-6': 0.3935309973045822, 'Top-7': 0.40431266846361186, 'Top-8': 0.42318059299191374, 'Top-9': 0.4339622641509434, 'Top-10': 0.4501347708894879, 'Top-11': 0.4555256064690027, 'Top-12': 0.4797843665768194, 'Top-13': 0.48517520215633425, 'Top-14': 0.49595687331536387, 'Top-15': 0.5094339622641509, 'Top-16': 0.5175202156334232, 'Top-17': 0.5283018867924528, 'Top-18': 0.5309973045822103, 'Top-19': 0.5336927223719676, 'Top-20': 0.5444743935309974, 'Top-21': 0.5444743935309974, 'Top-22': 0.5498652291105122, 'Top-23': 0.555256064690027, 'Top-24': 0.5660377358490566, 'Top-25': 0.568733153638814, 'Top-26': 0.5768194070080862, 'Top-27': 0.5822102425876011, 'Top-28': 0.5876010781671159, 'Top-29': 0.5876010781671159, 'Top-30': 0.5876010781671159, 'Top-31': 0.5956873315363881, 'Top-32': 0.601078167115903, 'Top-




# bỏ stop words

In [19]:
df['tokenized_metadata'] = df['tokenized_metadata'].apply(lambda tokens: filter_tokens(tokens, stop_words))
df_qa['tokenized_qs'] = df_qa['tokenized_qs'].apply(lambda tokens: filter_tokens(tokens, stop_words))
df_qa['tokenized_qa'] = df_qa['tokenized_qa'].apply(lambda tokens: filter_tokens(tokens, stop_words))

## Chỉ câu truy vấn

In [20]:
evaluation_results = evaluate_bm25_on_questions(df, df_qa, bm25,'tokenized_qs',top_k_values)
print("BM25 Evaluation Results:", evaluation_results)

100%|██████████| 371/371 [00:00<00:00, 494.96it/s]

BM25 Evaluation Results: {'Top-0': 0.0, 'Top-1': 0.29649595687331537, 'Top-2': 0.3719676549865229, 'Top-3': 0.41509433962264153, 'Top-4': 0.4528301886792453, 'Top-5': 0.4797843665768194, 'Top-6': 0.49865229110512127, 'Top-7': 0.5148247978436657, 'Top-8': 0.5336927223719676, 'Top-9': 0.5444743935309974, 'Top-10': 0.555256064690027, 'Top-11': 0.5606469002695418, 'Top-12': 0.5660377358490566, 'Top-13': 0.5768194070080862, 'Top-14': 0.5876010781671159, 'Top-15': 0.5929919137466307, 'Top-16': 0.5983827493261455, 'Top-17': 0.6118598382749326, 'Top-18': 0.6118598382749326, 'Top-19': 0.6172506738544474, 'Top-20': 0.6199460916442049, 'Top-21': 0.6253369272237197, 'Top-22': 0.6253369272237197, 'Top-23': 0.6253369272237197, 'Top-24': 0.6307277628032345, 'Top-25': 0.6361185983827493, 'Top-26': 0.6388140161725068, 'Top-27': 0.6442048517520216, 'Top-28': 0.6469002695417789, 'Top-29': 0.6495956873315364, 'Top-30': 0.6549865229110512, 'Top-31': 0.660377358490566, 'Top-32': 0.6630727762803235, 'Top-33'




## Cả câu truy vấn và câu trả lời

In [21]:

evaluation_results = evaluate_bm25_on_questions(df, df_qa, bm25,'tokenized_qa',top_k_values)
print("BM25 Evaluation Results:", evaluation_results)

100%|██████████| 371/371 [00:02<00:00, 181.14it/s]

BM25 Evaluation Results: {'Top-0': 0.0, 'Top-1': 0.3018867924528302, 'Top-2': 0.3719676549865229, 'Top-3': 0.42048517520215634, 'Top-4': 0.4555256064690027, 'Top-5': 0.4797843665768194, 'Top-6': 0.5067385444743935, 'Top-7': 0.5336927223719676, 'Top-8': 0.5417789757412399, 'Top-9': 0.5633423180592992, 'Top-10': 0.568733153638814, 'Top-11': 0.5768194070080862, 'Top-12': 0.5822102425876011, 'Top-13': 0.5902964959568733, 'Top-14': 0.5929919137466307, 'Top-15': 0.5956873315363881, 'Top-16': 0.601078167115903, 'Top-17': 0.6037735849056604, 'Top-18': 0.6037735849056604, 'Top-19': 0.6145552560646901, 'Top-20': 0.6253369272237197, 'Top-21': 0.628032345013477, 'Top-22': 0.633423180592992, 'Top-23': 0.6361185983827493, 'Top-24': 0.6361185983827493, 'Top-25': 0.6415094339622641, 'Top-26': 0.6442048517520216, 'Top-27': 0.6469002695417789, 'Top-28': 0.6469002695417789, 'Top-29': 0.6495956873315364, 'Top-30': 0.6522911051212938, 'Top-31': 0.6549865229110512, 'Top-32': 0.6630727762803235, 'Top-33': 0.




# xài embedding

In [36]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('dangvantuan/vietnamese-document-embedding', trust_remote_code=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.09k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/6.09k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dangvantuan/Vietnamese_impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/53.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dangvantuan/Vietnamese_impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [40]:
embeddings = model.encode(df['meta_data'])

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

In [39]:
embeddings_qa = model.encode(df_qa['qs_ans'])

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

In [46]:
len(df_qa)

371

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

def evaluate_embeddings_on_questions(df, qa_df, embeddings, qa_embeddings, top_k_values=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]):
    results = {f"Top-{k}": 0 for k in top_k_values}  # Khởi tạo dictionary với các mức đánh giá Top-k
    total_queries = len(qa_df)

    for i, row in tqdm(qa_df.iterrows(), total=total_queries):
        query_embedding = qa_embeddings[i]
        true_id = row['index']

        # Tính cosine similarity giữa truy vấn và tất cả embeddings
        similarities = cosine_similarity([query_embedding], embeddings)[0]
        top_indices = similarities.argsort()[::-1][:100]  # Lấy chỉ số của Top-100 kết quả

        # Lấy danh sách các id trả về
        top_ids = df.iloc[top_indices]['index'].tolist()

        # Đánh giá cho từng Top-k (Top-1, Top-3, Top-5, ...)
        for k in top_k_values:
            if true_id in top_ids[:k]:
                results[f"Top-{k}"] += 1

    # Tính độ chính xác cho mỗi Top-k
    for k in top_k_values:
        results[f"Top-{k}"] /= total_queries

    return results


In [44]:
top_k_values = range(0, 200)
results = evaluate_embeddings_on_questions(df, df_qa, embeddings, embeddings_qa,top_k_values)
print(results)

100%|██████████| 371/371 [00:01<00:00, 211.87it/s]

{'Top-0': 0.0, 'Top-1': 0.1293800539083558, 'Top-2': 0.18867924528301888, 'Top-3': 0.2183288409703504, 'Top-4': 0.2587601078167116, 'Top-5': 0.2857142857142857, 'Top-6': 0.30727762803234504, 'Top-7': 0.3288409703504043, 'Top-8': 0.3477088948787062, 'Top-9': 0.3584905660377358, 'Top-10': 0.3692722371967655, 'Top-11': 0.37735849056603776, 'Top-12': 0.38544474393531, 'Top-13': 0.3908355795148248, 'Top-14': 0.3935309973045822, 'Top-15': 0.39892183288409705, 'Top-16': 0.40431266846361186, 'Top-17': 0.41778975741239893, 'Top-18': 0.42857142857142855, 'Top-19': 0.4420485175202156, 'Top-20': 0.4474393530997305, 'Top-21': 0.4501347708894879, 'Top-22': 0.4555256064690027, 'Top-23': 0.4582210242587601, 'Top-24': 0.4609164420485175, 'Top-25': 0.4636118598382749, 'Top-26': 0.4716981132075472, 'Top-27': 0.4797843665768194, 'Top-28': 0.48787061994609165, 'Top-29': 0.48787061994609165, 'Top-30': 0.49056603773584906, 'Top-31': 0.49056603773584906, 'Top-32': 0.49326145552560646, 'Top-33': 0.498652291105


