embedding_beir

In [None]:
import os 
import joblib
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import time

# Download the NLTK data once.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Batch size raised for this run: number of queries handed to each worker task.
BATCH_SIZE = 128

# Build the directory from its components so it resolves on every OS; a
# literal backslash-separated path is only a two-level path on Windows.
base_path_beir = os.path.join("data", "beir")

embeddings_matrix_path_beir = os.path.join(base_path_beir, "embeddings_matrix.joblib")
query_embeddings_path_beir = os.path.join(base_path_beir, "query_embeddings_matrix_beir.joblib")
queries_path_beir = os.path.join(base_path_beir, "queries_beir.csv")
qrels_path_beir = os.path.join(base_path_beir, "qrels_beir.csv")
doc_id_mapping_path_beir = os.path.join(base_path_beir, "doc_id_mapping.joblib")
docs_beir_csv_path = os.path.join(base_path_beir, "docs_beir.csv")

print("Starting to load BEIR dataset files...")

docs_beir = pd.read_csv(docs_beir_csv_path)
print(f"Loaded docs_beir.csv, number of docs: {len(docs_beir)}")

# Row position in docs_beir.csv -> document id (as a string).
doc_id_mapping_beir = {i: str(doc_id) for i, doc_id in enumerate(docs_beir['doc_id'])}
print(f"Built doc_id_mapping_beir with {len(doc_id_mapping_beir)} entries")

# Persist the mapping next to the other BEIR artefacts.
joblib.dump(doc_id_mapping_beir, doc_id_mapping_path_beir)
print("Saved doc_id_mapping.joblib")

print("Loading embeddings matrix...")
embeddings_matrix_beir = joblib.load(embeddings_matrix_path_beir)
print(f"Embeddings matrix shape: {embeddings_matrix_beir.shape}")

print("Loading queries and qrels CSV files...")
queries_df_beir = pd.read_csv(queries_path_beir)
qrels_df_beir = pd.read_csv(qrels_path_beir)
print(f"Loaded {len(queries_df_beir)} queries and {len(qrels_df_beir)} qrels")

print("Loading query embeddings matrix...")
query_embeddings_beir = joblib.load(query_embeddings_path_beir)
print(f"Query embeddings shape: {query_embeddings_beir.shape}")



def process_batch(batch_idx, batch_queries, batch_query_ids, doc_embeddings, doc_id_mapping):
    """Return the ten most similar documents for every query in one batch.

    Args:
        batch_idx: Start offset of the batch; accepted for the caller's
            convenience and not used here.
        batch_queries: Iterable of query embedding vectors.
        batch_query_ids: Query ids aligned with ``batch_queries``.
        doc_embeddings: Matrix of document embeddings, one row per document.
        doc_id_mapping: Mapping from document row index to document id.

    Returns:
        dict mapping each query id to a list of up to ten document ids,
        ordered by decreasing cosine similarity.
    """
    batch_results = {}
    for query_embedding, query_id in zip(batch_queries, batch_query_ids):
        similarities = cosine_similarity(query_embedding.reshape(1, -1), doc_embeddings)[0]
        # A stable argsort on the negated scores ranks in descending order and
        # keeps tied documents in ascending index order, exactly like a stable
        # descending sort of (index, score) pairs, without building a Python
        # tuple per document for every query.
        top_indices = np.argsort(-similarities, kind="stable")[:10]
        batch_results[query_id] = [doc_id_mapping[int(idx)] for idx in top_indices]
    return batch_results

def process_similarities_in_batches(query_embeddings, doc_embeddings, queries_df, batch_size, doc_id_mapping):
    """Rank documents for every query, fanning batches out to a thread pool.

    Args:
        query_embeddings: Sequence of query embedding vectors.
        doc_embeddings: Matrix of document embeddings.
        queries_df: DataFrame whose ``query_id`` column is aligned row by row
            with ``query_embeddings``.
        batch_size: Number of queries submitted per worker task.
        doc_id_mapping: Mapping from document row index to document id.

    Returns:
        dict mapping query id (str) to the list produced by ``process_batch``,
        inserted in the order the queries appear in ``query_embeddings``.
    """
    batch_starts = range(0, len(query_embeddings), batch_size)
    total_batches = len(batch_starts)
    # Convert the id column once instead of once per batch.
    all_query_ids = queries_df['query_id'].astype(str).tolist()
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(
                process_batch,
                start,
                query_embeddings[start:start + batch_size],
                all_query_ids[start:start + batch_size],
                doc_embeddings,
                doc_id_mapping,
            )
            for start in batch_starts
        ]

        with tqdm(total=total_batches, desc="Processing queries", dynamic_ncols=True, unit="batch") as pbar:
            for completed, future in enumerate(as_completed(futures), 1):
                future.result()  # re-raise a worker error as soon as it surfaces
                elapsed = time.time() - start_time
                avg_time_per_batch = elapsed / completed
                remaining = avg_time_per_batch * (total_batches - completed)
                pbar.set_postfix({
                    "elapsed": f"{elapsed:.1f}s",
                    "remaining": f"{remaining:.1f}s"
                })
                pbar.update(1)

    # Merge in submission order so the key order of the result does not depend
    # on which worker happened to finish first.
    results = {}
    for future in futures:
        results.update(future.result())
    return results

# Evaluation functions

def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    """Return the share of the first ``k`` retrieved docs with a relevance above zero.

    ``relevant_docs`` maps document id (str) to a relevance score; the result
    is 0.0 when ``k`` is not positive.
    """
    hits = 0
    for candidate in retrieved_docs[:k]:
        key = str(candidate)
        if key in relevant_docs and float(relevant_docs[key]) > 0:
            hits += 1
    if k > 0:
        return hits / k
    return 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    """Return the share of all relevant docs that appear in the first ``k`` results.

    A document counts as relevant when its score in ``relevant_docs`` is above
    zero; the result is 0.0 when no document is relevant.
    """
    found = 0
    for candidate in retrieved_docs[:k]:
        key = str(candidate)
        if key in relevant_docs and float(relevant_docs[key]) > 0:
            found += 1
    total_relevant = len([score for score in relevant_docs.values() if float(score) > 0])
    if total_relevant > 0:
        return found / total_relevant
    return 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    """Return the reciprocal rank of the first relevant document, or 0.0 if none."""
    relevant_ranks = (
        rank
        for rank, doc_id in enumerate(retrieved_docs, 1)
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0
    )
    first_rank = next(relevant_ranks, None)
    if first_rank is None:
        return 0.0
    return 1.0 / first_rank

def calculate_map(retrieved_docs, relevant_docs):
    """Return the average precision of one ranked result list.

    Precision is accumulated at every rank that holds a relevant document
    (relevance above zero) and the sum is divided by the total number of
    relevant documents for the query.  Dividing by the number of relevant
    documents that happened to be retrieved would reward a ranking for missing
    relevant documents.

    Args:
        retrieved_docs: Ranked document ids, best first.
        relevant_docs: Mapping from document id (str) to relevance score.

    Returns:
        Average precision as a float; 0.0 when the query has no relevant
        document.
    """
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    if total_relevant == 0:
        return 0.0

    hits = 0
    precision_sum = 0.0
    for rank, doc_id in enumerate(retrieved_docs, 1):
        if float(relevant_docs.get(str(doc_id), 0)) > 0:
            hits += 1
            precision_sum += hits / rank
    return precision_sum / total_relevant

def load_qrels(qrels_df):
    """Turn a qrels table into ``{query_id: {doc_id: relevance}}``.

    The columns are walked in lockstep rather than row by row: a row-wise
    ``iterrows`` view upcasts an all-numeric row to float, which would render
    an integer id as ``'1.0'`` and stop it matching the string ids used
    elsewhere.

    Args:
        qrels_df: DataFrame with ``query_id``, ``doc_id`` and ``relevance``
            columns.

    Returns:
        defaultdict(dict) keyed by query id (str), then document id (str),
        holding the relevance as a float.
    """
    qrels = defaultdict(dict)
    columns = zip(qrels_df['query_id'], qrels_df['doc_id'], qrels_df['relevance'])
    for query_id, doc_id, relevance in columns:
        qrels[str(query_id)][str(doc_id)] = float(relevance)
    return qrels

# Relevance judgments for BEIR as a nested query-id -> doc-id -> score lookup.
qrels_beir = load_qrels(qrels_df_beir)

def evaluate_results(results, qrels, queries_df, dataset_name):
    """Score the retrieval results against the judgments and print a summary.

    Queries without any judgment in ``qrels`` are skipped.  The per-query
    scores come from ``calculate_map``, ``calculate_precision_at_k``,
    ``calculate_recall_at_k`` and ``calculate_rr``.

    Args:
        results: Mapping from query id to ranked list of document ids.
        qrels: Mapping from query id to ``{doc_id: relevance}``.
        queries_df: DataFrame with ``query_id`` and ``text`` columns, used to
            look up the text of poorly scoring queries.
        dataset_name: Label printed in front of the summary line.

    Returns:
        dict with the mean ``map``, ``precision@10``, ``recall@10`` and
        ``mrr`` plus ``low_map_queries``, a list of
        ``(query_id, text, score)`` tuples for queries scoring below 0.1.
    """
    map_scores, precision_scores, recall_scores, rr_scores = [], [], [], []
    low_map_queries = []
    query_texts = None  # id -> text lookup, built on first use only

    for query_id, retrieved_docs in results.items():
        relevant_docs = qrels.get(query_id, {})
        if not relevant_docs:
            continue

        ap = calculate_map(retrieved_docs, relevant_docs)
        map_scores.append(ap)
        if ap < 0.1:
            if query_texts is None:
                ids = queries_df['query_id'].astype(str).tolist()
                texts = queries_df['text'].tolist()
                # Filled back to front so the first row wins for a repeated id.
                query_texts = dict(zip(reversed(ids), reversed(texts)))
            key = str(query_id)
            if key in query_texts:
                low_map_queries.append((query_id, query_texts[key], ap))

        precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
        recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
        rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))

    map_score = float(np.mean(map_scores)) if map_scores else 0.0
    precision_score = float(np.mean(precision_scores)) if precision_scores else 0.0
    recall_score = float(np.mean(recall_scores)) if recall_scores else 0.0
    mrr_score = float(np.mean(rr_scores)) if rr_scores else 0.0

    print(f"{dataset_name} - MAP: {map_score:.4f}, Precision@10: {precision_score:.4f}, Recall@10: {recall_score:.4f}, MRR: {mrr_score:.4f}")

    return {
        "map": map_score,
        "precision@10": precision_score,
        "recall@10": recall_score,
        "mrr": mrr_score,
        "low_map_queries": low_map_queries,
    }



# Rank the documents for every BEIR query.
print("Starting batch processing of queries...")
results_beir = process_similarities_in_batches(
    query_embeddings_beir, embeddings_matrix_beir, queries_df_beir, BATCH_SIZE, doc_id_mapping_beir
)
print("Batch processing completed.")

# Print the aggregate retrieval metrics for the run.
evaluate_results(results_beir, qrels_beir, queries_df_beir, 'beir')

# Show the ranked ids of the first two entries in the results dict.
print("\nSample results for BEIR (first 2 queries):")
for query_id in list(results_beir.keys())[:2]:
    print(f"Query ID: {query_id}, Top Docs: {results_beir[query_id]}")

# Run an explicit garbage-collection pass at the end of the cell.
gc.collect()
print("Garbage collection done.")


  from .autonotebook import tqdm as notebook_tqdm


Starting to load BEIR dataset files...
Loaded docs_beir.csv, number of docs: 522931
Built doc_id_mapping_beir with 522931 entries
Saved doc_id_mapping.joblib
Loading embeddings matrix...
Embeddings matrix shape: (522931, 768)
Loading queries and qrels CSV files...
Loaded 10000 queries and 15675 qrels
Loading query embeddings matrix...
Query embeddings shape: (10000, 768)
Starting batch processing of queries...


Processing queries: 100%|██████████| 79/79 [3:15:00<00:00, 148.11s/batch, elapsed=11700.8s, remaining=0.0s]    


Batch processing completed.
beir - MAP: 0.6910, Precision@10: 0.1082, Recall@10: 0.8065, MRR: 0.7044

Sample results for BEIR (first 2 queries):
Query ID: 18595, Top Docs: ['141147', '509049', '357115', '349321', '182669', '58815', '11252', '11253', '77135', '87814']
Query ID: 18740, Top Docs: ['18739', '238735', '229953', '157043', '295694', '39478', '39477', '29316', '386708', '337175']
Garbage collection done.


embedding_ANTIQUE

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import time

# Download the NLTK data.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Number of queries handed to each worker task.
BATCH_SIZE = 16

# Paths.  The directory is built from its components so it resolves on every
# OS; a literal backslash-separated path is only a two-level path on Windows.
base_path_antique = os.path.join("data", "antique")

embeddings_matrix_path_antique = os.path.join(base_path_antique, "embeddings_matrix.joblib")
query_embeddings_path_antique = os.path.join(base_path_antique, "query_embeddings_matrix_antique.joblib")
queries_path_antique = os.path.join(base_path_antique, "queries_antique.csv")
qrels_path_antique = os.path.join(base_path_antique, "qrels_antique.csv")
doc_id_mapping_path_antique = os.path.join(base_path_antique, "doc_id_mapping.joblib")

# Load the documents and derive row position -> document id (as a string).
docs_antique = pd.read_csv(os.path.join(base_path_antique, "docs_antique.csv"))
doc_id_mapping_antique = {i: str(doc_id) for i, doc_id in enumerate(docs_antique['doc_id'])}

# Persist the mapping next to the other ANTIQUE artefacts.
joblib.dump(doc_id_mapping_antique, doc_id_mapping_path_antique)

embeddings_matrix_antique = joblib.load(embeddings_matrix_path_antique)

# Load the queries and the qrels.
queries_df_antique = pd.read_csv(queries_path_antique)
qrels_df_antique = pd.read_csv(qrels_path_antique)

query_embeddings_antique = joblib.load(query_embeddings_path_antique)




def process_batch(batch_idx, batch_queries, batch_query_ids, doc_embeddings, doc_id_mapping):
    """Find the ten best-matching documents for each query of a batch.

    Args:
        batch_idx: Start offset of the batch; kept in the signature for the
            caller and not used in the computation.
        batch_queries: Iterable of query embedding vectors.
        batch_query_ids: Query ids in the same order as ``batch_queries``.
        doc_embeddings: Matrix of document embeddings, one row per document.
        doc_id_mapping: Mapping from document row index to document id.

    Returns:
        dict of query id -> list of up to ten document ids, most similar
        first.
    """
    ranked = {}
    for query_embedding, query_id in zip(batch_queries, batch_query_ids):
        scores = cosine_similarity(query_embedding.reshape(1, -1), doc_embeddings)[0]
        # Stable argsort of the negated scores: descending similarity with
        # ties left in ascending row order, which matches a stable descending
        # sort of (index, score) pairs but avoids sorting a Python list of
        # tuples covering the whole corpus for every query.
        best_rows = np.argsort(-scores, kind="stable")[:10]
        ranked[query_id] = [doc_id_mapping[int(row)] for row in best_rows]
    return ranked

def process_similarities_in_batches(query_embeddings, doc_embeddings, queries_df, batch_size, doc_id_mapping):
    """Split the queries into batches and rank documents for each on a thread pool.

    Args:
        query_embeddings: Sequence of query embedding vectors.
        doc_embeddings: Matrix of document embeddings.
        queries_df: DataFrame whose ``query_id`` column lines up row by row
            with ``query_embeddings``.
        batch_size: Number of queries per submitted task.
        doc_id_mapping: Mapping from document row index to document id.

    Returns:
        dict of query id (str) -> list returned by ``process_batch``, with
        keys inserted in query order.
    """
    batch_starts = range(0, len(query_embeddings), batch_size)
    total_batches = len(batch_starts)
    # One string conversion of the id column for all batches.
    all_query_ids = queries_df['query_id'].astype(str).tolist()
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        for start in batch_starts:
            stop = start + batch_size
            futures.append(
                executor.submit(
                    process_batch, start, query_embeddings[start:stop],
                    all_query_ids[start:stop], doc_embeddings, doc_id_mapping
                )
            )

        with tqdm(total=total_batches, desc="Processing queries", dynamic_ncols=True, unit="batch") as pbar:
            for completed, future in enumerate(as_completed(futures), 1):
                future.result()  # surface a worker exception immediately
                elapsed = time.time() - start_time
                avg_time_per_batch = elapsed / completed
                remaining = avg_time_per_batch * (total_batches - completed)
                pbar.set_postfix({
                    "elapsed": f"{elapsed:.1f}s",
                    "remaining": f"{remaining:.1f}s"
                })
                pbar.update(1)

    # Collect in submission order: merging in completion order would make the
    # key order of the result differ from run to run.
    results = {}
    for future in futures:
        results.update(future.result())
    return results

# Rank the documents for every ANTIQUE query.
results_antique = process_similarities_in_batches(
    query_embeddings_antique, embeddings_matrix_antique, queries_df_antique, BATCH_SIZE, doc_id_mapping_antique
)


def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    """Precision@k: relevant documents among the first ``k`` results, divided by ``k``.

    A document is relevant when its score in ``relevant_docs`` is above zero.
    Returns 0.0 when ``k`` is not positive.
    """
    def is_relevant(doc_id):
        key = str(doc_id)
        return key in relevant_docs and float(relevant_docs[key]) > 0

    matched = len([doc_id for doc_id in retrieved_docs[:k] if is_relevant(doc_id)])
    return matched / k if k > 0 else 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    """Recall@k: relevant documents in the first ``k`` results over all relevant ones.

    Returns 0.0 when ``relevant_docs`` holds no score above zero.
    """
    def is_relevant(doc_id):
        key = str(doc_id)
        return key in relevant_docs and float(relevant_docs[key]) > 0

    matched = len([doc_id for doc_id in retrieved_docs[:k] if is_relevant(doc_id)])
    positives = [score for score in relevant_docs.values() if float(score) > 0]
    if not positives:
        return 0.0
    return matched / len(positives)

def calculate_rr(retrieved_docs, relevant_docs):
    """Reciprocal rank of the first relevant result (1-based); 0.0 when there is none."""
    rank = 0
    for doc_id in retrieved_docs:
        rank += 1
        key = str(doc_id)
        if key not in relevant_docs:
            continue
        if float(relevant_docs[key]) > 0:
            return 1.0 / rank
    return 0.0

def calculate_map(retrieved_docs, relevant_docs):
    """Compute the average precision of a single ranked list.

    The precision at each rank holding a relevant document (relevance above
    zero) is summed and divided by the query's total number of relevant
    documents.  Using the count of retrieved relevant documents as the divisor
    would give a perfect score to a list that finds one relevant document out
    of many.

    Args:
        retrieved_docs: Ranked document ids, best first.
        relevant_docs: Mapping from document id (str) to relevance score.

    Returns:
        Average precision as a float, 0.0 when nothing is relevant.
    """
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    if total_relevant == 0:
        return 0.0

    relevant_seen = 0
    precision_sum = 0.0
    for rank, doc_id in enumerate(retrieved_docs, 1):
        if float(relevant_docs.get(str(doc_id), 0)) > 0:
            relevant_seen += 1
            precision_sum += relevant_seen / rank
    return precision_sum / total_relevant

def load_qrels(qrels_df):
    """Build a ``{query_id: {doc_id: relevance}}`` lookup from a qrels table.

    Values are read column by column.  Row-wise iteration with ``iterrows``
    upcasts an all-numeric row to float, so an integer id would be stored as
    ``'1.0'`` and never match the string ids of the results.

    Args:
        qrels_df: DataFrame with ``query_id``, ``doc_id`` and ``relevance``
            columns.

    Returns:
        defaultdict(dict) with string keys at both levels and float relevance
        values.
    """
    qrels = defaultdict(dict)
    triples = zip(qrels_df['query_id'], qrels_df['doc_id'], qrels_df['relevance'])
    for query_id, doc_id, relevance in triples:
        qrels[str(query_id)][str(doc_id)] = float(relevance)
    return qrels

# Relevance judgments for ANTIQUE as a nested query-id -> doc-id -> score lookup.
qrels_antique = load_qrels(qrels_df_antique)


def evaluate_results(results, qrels, queries_df, dataset_name):
    """Evaluate ranked results against relevance judgments and print the means.

    Only queries that have at least one judgment in ``qrels`` contribute.
    Per-query values are computed by ``calculate_map``,
    ``calculate_precision_at_k``, ``calculate_recall_at_k`` and
    ``calculate_rr``.

    Args:
        results: Mapping from query id to ranked list of document ids.
        qrels: Mapping from query id to ``{doc_id: relevance}``.
        queries_df: DataFrame with ``query_id`` and ``text`` columns; read
            only to attach the text to poorly scoring queries.
        dataset_name: Label printed in front of the summary line.

    Returns:
        dict with the mean ``map``, ``precision@10``, ``recall@10``, ``mrr``
        and ``low_map_queries`` (``(query_id, text, score)`` for every query
        scoring below 0.1).
    """
    per_query = {"map": [], "precision": [], "recall": [], "rr": []}
    low_map_queries = []
    text_by_id = None  # built lazily, only if a low-scoring query shows up

    for query_id, retrieved_docs in results.items():
        relevant_docs = qrels.get(query_id, {})
        if not relevant_docs:
            continue

        ap = calculate_map(retrieved_docs, relevant_docs)
        per_query["map"].append(ap)
        if ap < 0.1:
            if text_by_id is None:
                ids = queries_df['query_id'].astype(str).tolist()
                texts = queries_df['text'].tolist()
                # Reverse order so that the first row wins for a repeated id.
                text_by_id = dict(zip(reversed(ids), reversed(texts)))
            if str(query_id) in text_by_id:
                low_map_queries.append((query_id, text_by_id[str(query_id)], ap))

        per_query["precision"].append(calculate_precision_at_k(retrieved_docs, relevant_docs))
        per_query["recall"].append(calculate_recall_at_k(retrieved_docs, relevant_docs))
        per_query["rr"].append(calculate_rr(retrieved_docs, relevant_docs))

    def mean_or_zero(values):
        return float(np.mean(values)) if values else 0.0

    map_score = mean_or_zero(per_query["map"])
    precision_score = mean_or_zero(per_query["precision"])
    recall_score = mean_or_zero(per_query["recall"])
    mrr_score = mean_or_zero(per_query["rr"])

    print(f"{dataset_name} - MAP: {map_score:.4f}, Precision@10: {precision_score:.4f}, Recall@10: {recall_score:.4f}, MRR: {mrr_score:.4f}")

    return {
        "map": map_score,
        "precision@10": precision_score,
        "recall@10": recall_score,
        "mrr": mrr_score,
        "low_map_queries": low_map_queries,
    }

# Print the aggregate retrieval metrics for the ANTIQUE run.
evaluate_results(results_antique, qrels_antique, queries_df_antique, 'antique')



# Show the ranked ids of the first two entries in the results dict.
print("\nSample results for ANTIQUE (first 2 queries):")
for query_id in list(results_antique.keys())[:2]:
    print(f"Query ID: {query_id}, Top Docs: {results_antique[query_id]}")

# Run an explicit garbage-collection pass at the end of the cell.
gc.collect()


Processing queries: 100%|██████████| 11/11 [02:43<00:00, 14.90s/batch, elapsed=164.1s, remaining=0.0s] 


antique - MAP: 0.4011, Precision@10: 0.1659, Recall@10: 0.0543, MRR: 0.4514

Sample results for ANTIQUE (first 2 queries):
Query ID: 312215, Top Docs: ['3079985_9', '1060042_14', '312215_4', '1060042_20', '107187_3', '3530023_3', '560479_3', '1080427_6', '3005801_14', '312215_10']
Query ID: 3363149, Top Docs: ['3363149_5', '2652261_7', '1552113_4', '937255_12', '4103061_4', '1611814_2', '2959818_4', '2957934_6', '1865077_2', '699780_7']


9

Embedding and vector store

EMBEDDING_VECTOR_STORE_BEIR


In [None]:


import os
import joblib
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import sqlite3
import faiss
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import time

# Number of queries handed to each worker task.
BATCH_SIZE = 16

# Paths
base_path_beir = r"data/beir"
query_embeddings_path_beir = os.path.join(base_path_beir, "query_embeddings_matrix_beir.joblib")
queries_path_beir = os.path.join(base_path_beir, "queries_beir.csv")
qrels_path_beir = os.path.join(base_path_beir, "qrels_beir.csv")

# Load the BEIR queries and qrels.  The ANTIQUE tables are no longer read
# here: their path variables are not defined in this cell (a NameError on a
# fresh kernel) and nothing in this cell uses them.
queries_df_beir = pd.read_csv(queries_path_beir)
qrels_df_beir = pd.read_csv(qrels_path_beir)

# Load the query embeddings.
query_embeddings_beir = joblib.load(query_embeddings_path_beir)

# Load the full document embedding matrix.
doc_embeddings_beir = joblib.load(os.path.join(base_path_beir, "embeddings_matrix.joblib"))

# Load the FAISS index.
faiss_index_beir = faiss.read_index(os.path.join(base_path_beir, "embedding_index.faiss"))

# Build row position -> document id.  Enumerating the column avoids creating
# a Series per row and keys the mapping by position, which is what the
# neighbour indices are later used for.
doc_mapping_beir = pd.read_csv(os.path.join(base_path_beir, "docs_beir.csv"))
doc_id_mapping_beir = {i: str(doc_id) for i, doc_id in enumerate(doc_mapping_beir['doc_id'])}


def process_batch_with_faiss(batch_idx, batch_queries, batch_query_ids, full_doc_embeddings, faiss_index, doc_id_mapping, top_k=20):
    """Retrieve candidates from a FAISS index and re-rank them by cosine similarity.

    Args:
        batch_idx: Start offset of the batch; accepted for the caller and not
            used here.
        batch_queries: Iterable of query embedding vectors.
        batch_query_ids: Query ids aligned with ``batch_queries``.
        full_doc_embeddings: Matrix of document embeddings, indexable by the
            row indices the FAISS index returns.
        faiss_index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``.
        doc_id_mapping: Mapping from document row index to document id.
        top_k: Number of candidates requested from the index per query.

    Returns:
        dict mapping each query id to up to ten document ids, best first.
    """
    batch_results = {}
    for query_embedding, query_id in zip(batch_queries, batch_query_ids):
        query_vector = np.array([query_embedding], dtype=np.float32)
        _, indices = faiss_index.search(query_vector, top_k)

        # FAISS pads the result with -1 when it finds fewer than top_k
        # neighbours; left in, -1 would silently select the last embedding row
        # and then fail the id lookup.
        candidates = np.asarray(indices[0])
        candidates = candidates[candidates >= 0]
        if candidates.size == 0:
            batch_results[query_id] = []
            continue

        similarities = cosine_similarity(query_vector, full_doc_embeddings[candidates])[0]
        # Stable descending order: ties keep the order FAISS returned them in.
        order = np.argsort(-similarities, kind="stable")[:10]
        batch_results[query_id] = [doc_id_mapping[int(candidates[pos])] for pos in order]
    return batch_results


def process_similarities_with_faiss(query_embeddings, queries_df, batch_size, full_doc_embeddings, faiss_index, doc_id_mapping, dataset_name):
    """Run FAISS retrieval plus cosine re-ranking for all queries on a thread pool.

    Args:
        query_embeddings: Sequence of query embedding vectors.
        queries_df: DataFrame whose ``query_id`` column is aligned row by row
            with ``query_embeddings``.
        batch_size: Number of queries per submitted task.
        full_doc_embeddings: Matrix of document embeddings.
        faiss_index: Index object passed through to
            ``process_batch_with_faiss``.
        doc_id_mapping: Mapping from document row index to document id.
        dataset_name: Label used in the progress-bar description.

    Returns:
        dict of query id (str) -> list returned by
        ``process_batch_with_faiss``, with keys inserted in query order.
    """
    batch_starts = range(0, len(query_embeddings), batch_size)
    # Convert the id column once instead of once per batch.
    all_query_ids = queries_df['query_id'].astype(str).tolist()

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(
                process_batch_with_faiss, start, query_embeddings[start:start + batch_size],
                all_query_ids[start:start + batch_size],
                full_doc_embeddings, faiss_index, doc_id_mapping
            )
            for start in batch_starts
        ]

        with tqdm(total=len(batch_starts), desc=f"{dataset_name} FAISS+Cosine", dynamic_ncols=True) as pbar:
            for future in as_completed(futures):
                future.result()  # re-raise a worker error as soon as it surfaces
                pbar.update(1)

    # Merge in submission order so the key order does not depend on which
    # worker finished first.
    results = {}
    for future in futures:
        results.update(future.result())
    return results

# Results using FAISS + cosine


results_beir = process_similarities_with_faiss(query_embeddings_beir, queries_df_beir, BATCH_SIZE,
                                               doc_embeddings_beir, faiss_index_beir, doc_id_mapping_beir,
                                               dataset_name="beir")

# Evaluation

def load_qrels(qrels_df):
    """Convert a qrels table into ``{query_id: {doc_id: relevance}}``.

    The three columns are iterated together instead of using ``iterrows``,
    which upcasts an all-numeric row to float and would turn an integer id
    into ``'1.0'``.

    Args:
        qrels_df: DataFrame with ``query_id``, ``doc_id`` and ``relevance``
            columns.

    Returns:
        defaultdict(dict) with string ids and float relevance values.
    """
    qrels = defaultdict(dict)
    for query_id, doc_id, relevance in zip(
        qrels_df['query_id'], qrels_df['doc_id'], qrels_df['relevance']
    ):
        qrels[str(query_id)][str(doc_id)] = float(relevance)
    return qrels

def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    """Return the fraction of the first ``k`` retrieved docs that are relevant.

    A document is relevant when its score in ``relevant_docs`` is above zero.
    A non-positive ``k`` yields 0.0 instead of a ZeroDivisionError.
    """
    if k <= 0:
        return 0.0
    hits = sum(1 for d in retrieved_docs[:k] if relevant_docs.get(d, 0) > 0)
    return hits / k

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    """Return relevant docs found in the first ``k`` results over all relevant docs.

    Yields 0 when ``relevant_docs`` has no score above zero.
    """
    total_relevant = len([s for s in relevant_docs.values() if s > 0])
    if not total_relevant:
        return 0
    found = 0
    for doc in retrieved_docs[:k]:
        if doc in relevant_docs and relevant_docs[doc] > 0:
            found += 1
    return found / total_relevant

def calculate_rr(retrieved_docs, relevant_docs):
    """Return 1 / rank of the first relevant result (1-based), or 0 if none is relevant."""
    first_rank = next(
        (rank for rank, doc in enumerate(retrieved_docs, 1)
         if doc in relevant_docs and relevant_docs[doc] > 0),
        None,
    )
    if first_rank is None:
        return 0
    return 1 / first_rank

def calculate_map(retrieved_docs, relevant_docs):
    """Return the average precision of one ranked result list.

    Precision is summed at each rank holding a relevant document (relevance
    above zero) and divided by the total number of relevant documents for the
    query, so relevant documents that were never retrieved lower the score.

    Args:
        retrieved_docs: Ranked document ids, best first.
        relevant_docs: Mapping from document id to relevance score.

    Returns:
        Average precision; 0 when the query has no relevant document.
    """
    total_relevant = sum(1 for score in relevant_docs.values() if score > 0)
    if not total_relevant:
        return 0

    rel, acc = 0, 0
    for i, d in enumerate(retrieved_docs, 1):
        if relevant_docs.get(d, 0) > 0:
            rel += 1
            acc += rel / i
    return acc / total_relevant

def evaluate_results(results, qrels_df, queries_df, name):
    """Print mean MAP, P@10, R@10 and MRR for the retrieval results.

    Queries with no relevance judgment are left out of the averages rather
    than being scored as zero, and an empty set of judged queries prints
    zeros instead of ``nan``.

    Args:
        results: Mapping from query id to ranked list of document ids.
        qrels_df: DataFrame of judgments, converted with ``load_qrels``.
        queries_df: Accepted for signature compatibility; not read here.
        name: Label printed in front of the summary line.
    """
    qrels = load_qrels(qrels_df)
    maps, precs, recalls, mrrs = [], [], [], []
    for qid, res in results.items():
        rel = qrels.get(qid, {})
        if not rel:
            continue  # nothing to score this query against
        maps.append(calculate_map(res, rel))
        precs.append(calculate_precision_at_k(res, rel))
        recalls.append(calculate_recall_at_k(res, rel))
        mrrs.append(calculate_rr(res, rel))

    def mean_or_zero(values):
        return float(np.mean(values)) if values else 0.0

    print(f"{name} -> MAP: {mean_or_zero(maps):.4f}, P@10: {mean_or_zero(precs):.4f}, R@10: {mean_or_zero(recalls):.4f}, MRR: {mean_or_zero(mrrs):.4f}")


# Print the aggregate retrieval metrics for the FAISS-based BEIR run.
evaluate_results(results_beir, qrels_df_beir, queries_df_beir, "BEIR")


beir FAISS+Cosine: 100%|██████████| 625/625 [06:07<00:00,  1.70it/s]


BEIR -> MAP: 0.6897, P@10: 0.1080, R@10: 0.8063, MRR: 0.7023


EMBEDDING_VECTOR_STORE_ANTIQUE


In [None]:


import os
import joblib
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import sqlite3
import faiss
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import time

# Number of queries handed to each worker task.
BATCH_SIZE = 16

# Paths
base_path_antique = r"data/antique"
query_embeddings_path_antique = os.path.join(base_path_antique, "query_embeddings_matrix_antique.joblib")
queries_path_antique = os.path.join(base_path_antique, "queries_antique.csv")
qrels_path_antique = os.path.join(base_path_antique, "qrels_antique.csv")

# Load the queries and qrels.
queries_df_antique = pd.read_csv(queries_path_antique)
qrels_df_antique = pd.read_csv(qrels_path_antique)

# Load the query embeddings.
query_embeddings_antique = joblib.load(query_embeddings_path_antique)

# Load the full document embedding matrix.
doc_embeddings_antique = joblib.load(os.path.join(base_path_antique, "embeddings_matrix.joblib"))

# Load the FAISS index.
faiss_index_antique = faiss.read_index(os.path.join(base_path_antique, "embedding_index.faiss"))

# Build row position -> document id.  Enumerating the column avoids creating
# a Series per row and keys the mapping by position, which is what the
# neighbour indices are later used for.
doc_mapping_antique = pd.read_csv(os.path.join(base_path_antique, "docs_antique.csv"))
doc_id_mapping_antique = {i: str(doc_id) for i, doc_id in enumerate(doc_mapping_antique['doc_id'])}


def process_batch_with_faiss(batch_idx, batch_queries, batch_query_ids, full_doc_embeddings, faiss_index, doc_id_mapping, top_k=20):
    """Fetch FAISS candidates for each query and order them by cosine similarity.

    Args:
        batch_idx: Start offset of the batch; kept in the signature for the
            caller and not used in the computation.
        batch_queries: Iterable of query embedding vectors.
        batch_query_ids: Query ids in the same order as ``batch_queries``.
        full_doc_embeddings: Matrix of document embeddings, indexable by the
            row indices the FAISS index returns.
        faiss_index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``.
        doc_id_mapping: Mapping from document row index to document id.
        top_k: Number of candidates requested from the index per query.

    Returns:
        dict of query id -> list of up to ten document ids, best first.
    """
    ranked = {}
    for query_embedding, query_id in zip(batch_queries, batch_query_ids):
        query_vector = np.array([query_embedding], dtype=np.float32)
        _, neighbours = faiss_index.search(query_vector, top_k)

        # Drop the -1 padding FAISS emits when fewer than top_k neighbours
        # exist; as an array index it would pick the last embedding row and
        # the id lookup would then fail.
        rows = np.asarray(neighbours[0])
        rows = rows[rows >= 0]
        if rows.size == 0:
            ranked[query_id] = []
            continue

        scores = cosine_similarity(query_vector, full_doc_embeddings[rows])[0]
        # Stable descending order: ties keep the order FAISS returned them in.
        best = np.argsort(-scores, kind="stable")[:10]
        ranked[query_id] = [doc_id_mapping[int(rows[pos])] for pos in best]
    return ranked


def process_similarities_with_faiss(query_embeddings, queries_df, batch_size, full_doc_embeddings, faiss_index, doc_id_mapping, dataset_name):
    """Batch the queries and run FAISS retrieval with cosine re-ranking on a thread pool.

    Args:
        query_embeddings: Sequence of query embedding vectors.
        queries_df: DataFrame whose ``query_id`` column lines up row by row
            with ``query_embeddings``.
        batch_size: Number of queries per submitted task.
        full_doc_embeddings: Matrix of document embeddings.
        faiss_index: Index object handed to ``process_batch_with_faiss``.
        doc_id_mapping: Mapping from document row index to document id.
        dataset_name: Label used in the progress-bar description.

    Returns:
        dict of query id (str) -> list returned by
        ``process_batch_with_faiss``, with keys inserted in query order.
    """
    batch_starts = range(0, len(query_embeddings), batch_size)
    # One string conversion of the id column for all batches.
    all_query_ids = queries_df['query_id'].astype(str).tolist()

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        for start in batch_starts:
            stop = start + batch_size
            futures.append(executor.submit(
                process_batch_with_faiss, start, query_embeddings[start:stop], all_query_ids[start:stop],
                full_doc_embeddings, faiss_index, doc_id_mapping
            ))

        with tqdm(total=len(batch_starts), desc=f"{dataset_name} FAISS+Cosine", dynamic_ncols=True) as pbar:
            for future in as_completed(futures):
                future.result()  # surface a worker exception immediately
                pbar.update(1)

    # Collect in submission order: merging in completion order would make the
    # key order of the result differ from run to run.
    results = {}
    for future in futures:
        results.update(future.result())
    return results

# Retrieve and re-rank documents for every ANTIQUE query.
results_antique = process_similarities_with_faiss(query_embeddings_antique, queries_df_antique, BATCH_SIZE,
                                                  doc_embeddings_antique, faiss_index_antique, doc_id_mapping_antique,
                                                  dataset_name="antique")


# Evaluation

def load_qrels(qrels_df):
    """Index the relevance judgments as ``{query_id: {doc_id: relevance}}``.

    Values are taken column by column; ``iterrows`` would upcast an
    all-numeric row to float and store an integer id as ``'1.0'``.

    Args:
        qrels_df: DataFrame with ``query_id``, ``doc_id`` and ``relevance``
            columns.

    Returns:
        defaultdict(dict) with string ids and float relevance values.
    """
    qrels = defaultdict(dict)
    rows = zip(qrels_df['query_id'], qrels_df['doc_id'], qrels_df['relevance'])
    for query_id, doc_id, relevance in rows:
        qrels[str(query_id)][str(doc_id)] = float(relevance)
    return qrels

def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    """Precision@k: relevant documents among the first ``k`` results, divided by ``k``.

    A document is relevant when its score in ``relevant_docs`` is above zero.
    Returns 0.0 for a non-positive ``k`` rather than dividing by zero.
    """
    if k <= 0:
        return 0.0
    relevant_in_top_k = sum(1 for d in retrieved_docs[:k] if relevant_docs.get(d, 0) > 0)
    return relevant_in_top_k / k

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    """Recall@k: relevant documents in the first ``k`` results over all relevant ones.

    Returns 0 when no score in ``relevant_docs`` is above zero.
    """
    positives = [score for score in relevant_docs.values() if score > 0]
    if len(positives) == 0:
        return 0
    top = retrieved_docs[:k]
    matched = len([doc for doc in top if doc in relevant_docs and relevant_docs[doc] > 0])
    return matched / len(positives)

def calculate_rr(retrieved_docs, relevant_docs):
    """Reciprocal rank of the first relevant result (1-based); 0 when there is none."""
    position = 0
    for doc in retrieved_docs:
        position += 1
        if doc not in relevant_docs:
            continue
        if relevant_docs[doc] > 0:
            return 1 / position
    return 0

def calculate_map(retrieved_docs, relevant_docs):
    """Compute the average precision of a single ranked list.

    The precision at every rank holding a relevant document (relevance above
    zero) is summed and divided by the query's total number of relevant
    documents, so a list that misses relevant documents cannot score 1.0.

    Args:
        retrieved_docs: Ranked document ids, best first.
        relevant_docs: Mapping from document id to relevance score.

    Returns:
        Average precision; 0 when nothing is relevant.
    """
    total_relevant = sum(1 for score in relevant_docs.values() if score > 0)
    if not total_relevant:
        return 0

    seen, acc = 0, 0
    for rank, doc in enumerate(retrieved_docs, 1):
        if relevant_docs.get(doc, 0) > 0:
            seen += 1
            acc += seen / rank
    return acc / total_relevant

def evaluate_results(results, qrels_df, queries_df, name):
    """Print the mean MAP, P@10, R@10 and MRR of the retrieval results.

    Queries that have no relevance judgment are excluded from the averages
    instead of counting as zero, and zeros are printed (not ``nan``) when no
    query could be scored.

    Args:
        results: Mapping from query id to ranked list of document ids.
        qrels_df: DataFrame of judgments, converted with ``load_qrels``.
        queries_df: Accepted for signature compatibility; not read here.
        name: Label printed in front of the summary line.
    """
    qrels = load_qrels(qrels_df)
    maps, precs, recalls, mrrs = [], [], [], []
    for qid, res in results.items():
        rel = qrels.get(qid, {})
        if not rel:
            continue  # no judgments for this query
        maps.append(calculate_map(res, rel))
        precs.append(calculate_precision_at_k(res, rel))
        recalls.append(calculate_recall_at_k(res, rel))
        mrrs.append(calculate_rr(res, rel))

    mean_map = float(np.mean(maps)) if maps else 0.0
    mean_prec = float(np.mean(precs)) if precs else 0.0
    mean_recall = float(np.mean(recalls)) if recalls else 0.0
    mean_rr = float(np.mean(mrrs)) if mrrs else 0.0
    print(f"{name} -> MAP: {mean_map:.4f}, P@10: {mean_prec:.4f}, R@10: {mean_recall:.4f}, MRR: {mean_rr:.4f}")

# Print the aggregate retrieval metrics for the FAISS-based ANTIQUE run.
evaluate_results(results_antique, qrels_df_antique, queries_df_antique, "ANTIQUE")



antique FAISS+Cosine: 100%|██████████| 11/11 [00:05<00:00,  1.84it/s]


ANTIQUE -> MAP: 0.4011, P@10: 0.1659, R@10: 0.0543, MRR: 0.4514


## EMBEDDING & QUERY_ANTIQUE

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import time

# Download the NLTK data.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Used both as the encoding batch size and as the number of queries per worker task.
BATCH_SIZE = 128

# Paths.  The directory is built from its components so it resolves on every
# OS; a literal backslash-separated path is only a two-level path on Windows.
base_path_antique = os.path.join("data", "antique")
embeddings_matrix_path_antique = os.path.join(base_path_antique, "embeddings_matrix.joblib")
queries_path_antique = os.path.join(base_path_antique, "enhanced_queries_antique.csv")
qrels_path_antique = os.path.join(base_path_antique, "qrels_antique.csv")
doc_id_mapping_path_antique = os.path.join(base_path_antique, "doc_id_mapping.joblib")
vectorizer_path = os.path.join(base_path_antique, "embeddings_vectorizer.joblib")

# Load the documents, derive row position -> document id and persist it.
docs_antique = pd.read_csv(os.path.join(base_path_antique, "docs_antique.csv"))
doc_id_mapping_antique = {i: str(doc_id) for i, doc_id in enumerate(docs_antique['doc_id'])}
joblib.dump(doc_id_mapping_antique, doc_id_mapping_path_antique)
embeddings_matrix_antique = joblib.load(embeddings_matrix_path_antique)

# Load the queries and the qrels.
queries_df_antique = pd.read_csv(queries_path_antique)
qrels_df_antique = pd.read_csv(qrels_path_antique)

# Load the model that turns text into embeddings.
# NOTE(review): presumably a SentenceTransformer saved with joblib - confirm.
vectorizer = joblib.load(vectorizer_path)

# Take the pre-processed query text from the processed_query column.
queries_texts = queries_df_antique['processed_query'].tolist()

# Encode the query texts into embeddings with the model.
query_embeddings_antique = vectorizer.encode(queries_texts, batch_size=BATCH_SIZE)


def process_batch(batch_idx, batch_queries, batch_query_ids, doc_embeddings, doc_id_mapping):
    """Rank all documents against each query of a batch and keep the top ten.

    Args:
        batch_idx: Start offset of the batch; part of the signature for the
            caller and not used in the computation.
        batch_queries: Iterable of query embedding vectors.
        batch_query_ids: Query ids aligned with ``batch_queries``.
        doc_embeddings: Matrix of document embeddings, one row per document.
        doc_id_mapping: Mapping from document row index to document id.

    Returns:
        dict mapping each query id to up to ten document ids, ordered by
        decreasing cosine similarity.
    """
    top_docs_by_query = {}
    for query_embedding, query_id in zip(batch_queries, batch_query_ids):
        similarities = cosine_similarity(query_embedding.reshape(1, -1), doc_embeddings)[0]
        # Negate and argsort with a stable sort: descending by score, with
        # equal scores left in ascending row order.  This is the same ranking
        # as a stable descending sort of (index, score) pairs without
        # materialising one Python tuple per document for every query.
        top_rows = np.argsort(-similarities, kind="stable")[:10]
        top_docs_by_query[query_id] = [doc_id_mapping[int(row)] for row in top_rows]
    return top_docs_by_query

def process_similarities_in_batches(query_embeddings, doc_embeddings, queries_df, batch_size, doc_id_mapping):
    """Rank documents for all queries by dispatching batches to a thread pool.

    Args:
        query_embeddings: Sequence of query embedding vectors.
        doc_embeddings: Matrix of document embeddings.
        queries_df: DataFrame whose ``query_id`` column is aligned row by row
            with ``query_embeddings``.
        batch_size: Number of queries per submitted task.
        doc_id_mapping: Mapping from document row index to document id.

    Returns:
        dict of query id (str) -> list returned by ``process_batch``, with
        keys inserted in query order.
    """
    batch_starts = range(0, len(query_embeddings), batch_size)
    total_batches = len(batch_starts)
    # Convert the id column once instead of once per batch.
    all_query_ids = queries_df['query_id'].astype(str).tolist()
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(
                process_batch,
                start,
                query_embeddings[start:start + batch_size],
                all_query_ids[start:start + batch_size],
                doc_embeddings,
                doc_id_mapping,
            )
            for start in batch_starts
        ]

        with tqdm(total=total_batches, desc="Processing queries", bar_format="{l_bar}{bar}") as pbar:
            for completed, future in enumerate(as_completed(futures), 1):
                future.result()  # re-raise a worker error as soon as it surfaces
                elapsed = time.time() - start_time
                avg_time_per_batch = elapsed / completed
                remaining = avg_time_per_batch * (total_batches - completed)
                pbar.set_postfix({
                    "elapsed": f"{elapsed:.1f}s",
                    "remaining": f"{remaining:.1f}s"
                })
                pbar.update(1)

    # Merge in submission order so the key order of the result does not depend
    # on which worker happened to finish first.
    results = {}
    for future in futures:
        results.update(future.result())
    return results

# Rank the documents for every enhanced ANTIQUE query.
results_antique = process_similarities_in_batches(
    query_embeddings_antique, embeddings_matrix_antique, queries_df_antique, BATCH_SIZE, doc_id_mapping_antique
)

def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    """Return how many of the first ``k`` results are relevant, as a fraction of ``k``.

    Relevance means a score above zero in ``relevant_docs``; the result is 0.0
    when ``k`` is not positive.
    """
    top_keys = [str(doc_id) for doc_id in retrieved_docs[:k]]
    relevant_hits = [
        key for key in top_keys
        if key in relevant_docs and float(relevant_docs[key]) > 0
    ]
    if k > 0:
        return len(relevant_hits) / k
    return 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    """Return the fraction of all relevant documents found in the first ``k`` results.

    Relevance means a score above zero in ``relevant_docs``; the result is 0.0
    when there is no relevant document.
    """
    top_keys = [str(doc_id) for doc_id in retrieved_docs[:k]]
    relevant_hits = [
        key for key in top_keys
        if key in relevant_docs and float(relevant_docs[key]) > 0
    ]
    total_relevant = 0
    for score in relevant_docs.values():
        if float(score) > 0:
            total_relevant += 1
    if total_relevant > 0:
        return len(relevant_hits) / total_relevant
    return 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    """Return 1 / rank of the first relevant result (1-based), or 0.0 if none is relevant."""
    reciprocal = 0.0
    for rank, doc_id in enumerate(retrieved_docs, 1):
        key = str(doc_id)
        if key in relevant_docs and float(relevant_docs[key]) > 0:
            reciprocal = 1.0 / rank
            break
    return reciprocal

def calculate_map(retrieved_docs, relevant_docs):
    """Return the average precision of one ranked result list.

    Precision is accumulated at each rank that holds a relevant document
    (relevance above zero) and normalised by the total number of relevant
    documents for the query.  Normalising by the number of relevant documents
    retrieved would overstate rankings that miss relevant documents.

    Args:
        retrieved_docs: Ranked document ids, best first.
        relevant_docs: Mapping from document id (str) to relevance score.

    Returns:
        Average precision as a float; 0.0 when the query has no relevant
        document.
    """
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    if total_relevant == 0:
        return 0.0

    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(retrieved_docs, 1):
        if float(relevant_docs.get(str(doc_id), 0)) > 0:
            relevant_count += 1
            precision_sum += relevant_count / i
    return precision_sum / total_relevant

def load_qrels(qrels_df):
    """Build a nested ``{query_id: {doc_id: relevance}}`` lookup from a qrels frame.

    The columns are iterated directly instead of via ``DataFrame.iterrows()``:
    ``iterrows`` packs each row into a single-dtype Series, so when every
    column is numeric and ``relevance`` is a float, integer ids are upcast and
    stringify as ``'46.0'`` instead of ``'46'``, which then never match the
    string ids used for the retrieval results.

    Args:
        qrels_df: DataFrame with ``query_id``, ``doc_id`` and ``relevance``
            columns.

    Returns:
        A ``defaultdict(dict)`` whose keys and inner keys are strings and whose
        values are floats.
    """
    qrels = defaultdict(dict)
    rows = zip(qrels_df['query_id'], qrels_df['doc_id'], qrels_df['relevance'])
    for query_id, doc_id, relevance in rows:
        qrels[str(query_id)][str(doc_id)] = float(relevance)
    return qrels

# Nested relevance lookup for ANTIQUE, built from the qrels DataFrame.
qrels_antique = load_qrels(qrels_df_antique)

def evaluate_results(results, qrels, queries_df, dataset_name):
    """Score retrieval results against relevance judgments and print a summary.

    Queries without any judgment in ``qrels`` are skipped and do not count
    towards the averages.

    Args:
        results: Mapping of query id to its ranked list of retrieved doc ids.
        qrels: Mapping of query id to ``{doc_id: relevance}``.
        queries_df: DataFrame with ``query_id`` and ``processed_query``
            columns; only consulted to look up the text of poorly scoring
            queries.
        dataset_name: Label used as the prefix of the printed summary line.

    Returns:
        A dict with the averaged metrics (``'map'``, ``'precision@10'``,
        ``'recall@10'``, ``'mrr'``) and ``'low_map_queries'``, a list of
        ``(query_id, query_text, map_score)`` tuples for queries whose average
        precision is below 0.1. Earlier versions computed these values but
        returned nothing.
    """
    map_scores = []
    precision_scores = []
    recall_scores = []
    rr_scores = []
    low_map_queries = []
    # Built lazily so the column is converted at most once, and only when a
    # low-scoring query actually needs its text looked up.
    query_ids_as_str = None

    for query_id, retrieved_docs in results.items():
        relevant_docs = qrels.get(query_id, {})
        if not relevant_docs:
            continue

        map_score = calculate_map(retrieved_docs, relevant_docs)
        map_scores.append(map_score)
        if map_score < 0.1:
            if query_ids_as_str is None:
                query_ids_as_str = queries_df['query_id'].astype(str)
            filtered_df = queries_df[query_ids_as_str == str(query_id)]
            if not filtered_df.empty:
                query_text = filtered_df['processed_query'].iloc[0]
                low_map_queries.append((query_id, query_text, map_score))
        precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
        recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
        rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))

    map_score = float(np.mean(map_scores)) if map_scores else 0.0
    precision_score = float(np.mean(precision_scores)) if precision_scores else 0.0
    recall_score = float(np.mean(recall_scores)) if recall_scores else 0.0
    mrr_score = float(np.mean(rr_scores)) if rr_scores else 0.0

    print(f"{dataset_name} - MAP: {map_score:.4f}, Precision@10: {precision_score:.4f}, Recall@10: {recall_score:.4f}, MRR: {mrr_score:.4f}")

    return {
        'map': map_score,
        'precision@10': precision_score,
        'recall@10': recall_score,
        'mrr': mrr_score,
        'low_map_queries': low_map_queries,
    }

# Print the aggregate retrieval metrics for the ANTIQUE run.
evaluate_results(results_antique, qrels_antique, queries_df_antique, 'antique')

# Show a sample of the results
print("\nSample results for ANTIQUE (first 2 queries):")
for query_id in list(results_antique.keys())[:2]:
    print(f"Query ID: {query_id}, Top Docs: {results_antique[query_id]}")

# Run a garbage-collection pass at the end of the cell.
gc.collect()

  from .autonotebook import tqdm as notebook_tqdm
Processing queries: 100%|██████████


antique - MAP: 0.4033, Precision@10: 0.1720, Recall@10: 0.0577, MRR: 0.4586

Sample results for ANTIQUE (first 2 queries):
Query ID: 312215, Top Docs: ['2198611_11', '1573436_0', '1060042_28', '1060042_20', '2981567_2', '2753255_2', '1658000_4', '1932479_9', '2131723_9', '312215_4']
Query ID: 3363149, Top Docs: ['2652261_7', '3363149_4', '2652261_6', '3766120_6', '3363149_5', '1081922_8', '1042498_4', '3886596_6', '4103061_4', '3491622_11']


0

## EMBEDDING & QUERY_BEIR

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import time

# Download the NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Number of queries per encoding call and per similarity batch.
BATCH_SIZE = 128

# Paths
# Built with os.path.join rather than a raw "data\beir" literal: the result is
# identical on Windows, and on POSIX it resolves to the data/beir directory
# instead of a single file name containing a backslash.
base_path_beir = os.path.join("data", "beir")
embeddings_matrix_path_beir = os.path.join(base_path_beir, "embeddings_matrix.joblib")
queries_path_beir = os.path.join(base_path_beir, "enhanced_queries_beir.csv")
qrels_path_beir = os.path.join(base_path_beir, "qrels_beir.csv")
doc_id_mapping_path_beir = os.path.join(base_path_beir, "doc_id_mapping.joblib")
vectorizer_path = os.path.join(base_path_beir, "embeddings_vectorizer.joblib")

# Load the data
docs_beir = pd.read_csv(os.path.join(base_path_beir, "docs_beir.csv"))
# Row position -> document id (as a string); persisted next to the embeddings.
doc_id_mapping_beir = {i: str(doc_id) for i, doc_id in enumerate(docs_beir['doc_id'])}
joblib.dump(doc_id_mapping_beir, doc_id_mapping_path_beir)
# NOTE(review): joblib.load unpickles arbitrary objects; only load artifacts
# that were produced by this project.
embeddings_matrix_beir = joblib.load(embeddings_matrix_path_beir)

# Load the queries and qrels
queries_df_beir = pd.read_csv(queries_path_beir)
qrels_df_beir = pd.read_csv(qrels_path_beir)

# Load the model used to convert texts into embeddings
vectorizer = joblib.load(vectorizer_path)

# Extract the processed texts from the processed_query field
queries_texts = queries_df_beir['processed_query'].tolist()

# Convert the texts into embeddings using the model
query_embeddings_beir = vectorizer.encode(queries_texts, batch_size=BATCH_SIZE)


def process_batch(batch_idx, batch_queries, batch_query_ids, doc_embeddings, doc_id_mapping, top_k=10):
    """Retrieve the most similar documents for each query in one batch.

    Each query is scored against every document with cosine similarity and the
    ``top_k`` best document ids are kept. Ranking uses a stable argsort on the
    negated scores, so ties keep ascending document-index order, the same
    order a stable descending sort of ``(index, score)`` pairs produces.
    Scores are computed one query at a time so only one score vector is alive
    per worker; no explicit ``gc.collect()`` is needed because the vector is
    released as soon as it is rebound.

    Args:
        batch_idx: Offset of this batch within the full query set. Unused;
            kept so existing callers do not have to change.
        batch_queries: Query embeddings for this batch, one row per query.
        batch_query_ids: Query ids aligned with ``batch_queries``.
        doc_embeddings: Document embedding matrix, one row per document.
        doc_id_mapping: Mapping of document row index (int) to document id.
        top_k: Number of documents to keep per query.

    Returns:
        Dict mapping each query id to its list of top document ids, best first.
    """
    batch_results = {}
    for query_embedding, query_id in zip(batch_queries, batch_query_ids):
        similarities = cosine_similarity(query_embedding.reshape(1, -1), doc_embeddings)[0]
        top_indices = np.argsort(-similarities, kind="stable")[:top_k]
        batch_results[query_id] = [doc_id_mapping[int(idx)] for idx in top_indices]
    return batch_results

def process_similarities_in_batches(query_embeddings, doc_embeddings, queries_df, batch_size, doc_id_mapping):
    """Fan the queries out to a thread pool in fixed-size batches and merge the results.

    Args:
        query_embeddings: Query embedding matrix, one row per query, aligned
            positionally with ``queries_df``.
        doc_embeddings: Document embedding matrix passed through to
            ``process_batch``.
        queries_df: DataFrame whose ``query_id`` column supplies the ids
            (converted to strings).
        batch_size: Number of queries handed to each worker task.
        doc_id_mapping: Mapping of document row index to document id, passed
            through to ``process_batch``.

    Returns:
        Dict mapping each query id (str) to the value ``process_batch``
        produced for it.
    """
    offsets = list(range(0, len(query_embeddings), batch_size))
    n_batches = len(offsets)
    merged = {}
    started_at = time.time()

    with ThreadPoolExecutor(max_workers=4) as pool:
        pending = [
            pool.submit(
                process_batch,
                offset,
                query_embeddings[offset:offset + batch_size],
                queries_df['query_id'].iloc[offset:offset + batch_size].astype(str).tolist(),
                doc_embeddings,
                doc_id_mapping,
            )
            for offset in offsets
        ]

        with tqdm(total=n_batches, desc="Processing queries", bar_format="{l_bar}{bar}") as pbar:
            # Batches are merged in completion order, not submission order.
            for done, future in enumerate(as_completed(pending), 1):
                merged.update(future.result())
                elapsed = time.time() - started_at
                remaining = (elapsed / done) * (n_batches - done)
                pbar.set_postfix({
                    "elapsed": f"{elapsed:.1f}s",
                    "remaining": f"{remaining:.1f}s"
                })
                pbar.update(1)
    return merged

# Run the batched similarity search for the BEIR queries; the returned mapping
# is keyed by query id (as a string) and is consumed by the evaluation below.
results_beir = process_similarities_in_batches(
    query_embeddings_beir, embeddings_matrix_beir, queries_df_beir, BATCH_SIZE, doc_id_mapping_beir
)

def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    """Return the fraction of the first ``k`` retrieved docs that are relevant.

    Args:
        retrieved_docs: Ranked sequence of document ids (compared as strings).
        relevant_docs: Mapping of document id (str) to a relevance score; a
            document counts as relevant when its score is greater than zero.
        k: Cut-off rank. The denominator is always ``k``, even when fewer than
            ``k`` documents were retrieved.

    Returns:
        Precision@k as a float, or 0.0 when ``k`` is not positive.
    """
    hits = 0
    for candidate in retrieved_docs[:k]:
        key = str(candidate)
        if key in relevant_docs and float(relevant_docs[key]) > 0:
            hits += 1
    if k > 0:
        return hits / k
    return 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    """Return the share of all relevant docs that appear in the first ``k`` results.

    Args:
        retrieved_docs: Ranked sequence of document ids (compared as strings).
        relevant_docs: Mapping of document id (str) to a relevance score; only
            scores greater than zero count as relevant.
        k: Cut-off rank.

    Returns:
        Recall@k as a float, or 0.0 when no document has a positive score.
    """
    found = 0
    for candidate in retrieved_docs[:k]:
        key = str(candidate)
        if key in relevant_docs and float(relevant_docs[key]) > 0:
            found += 1

    total_relevant = len([score for score in relevant_docs.values() if float(score) > 0])
    if total_relevant > 0:
        return found / total_relevant
    return 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    """Return the reciprocal rank of the first relevant document.

    Args:
        retrieved_docs: Ranked sequence of document ids (compared as strings).
        relevant_docs: Mapping of document id (str) to a relevance score; only
            scores greater than zero count as relevant.

    Returns:
        ``1 / rank`` of the first relevant hit (ranks start at 1), or 0.0 when
        no retrieved document is relevant.
    """
    relevant_ranks = (
        rank
        for rank, candidate in enumerate(retrieved_docs, 1)
        if str(candidate) in relevant_docs and float(relevant_docs[str(candidate)]) > 0
    )
    first_rank = next(relevant_ranks, None)
    if first_rank is None:
        return 0.0
    return 1.0 / first_rank

def calculate_map(retrieved_docs, relevant_docs):
    """Return the average precision of one ranked result list.

    Precision is accumulated at every rank where a relevant document appears
    and the sum is normalised by the total number of relevant documents in
    ``relevant_docs``. Normalising by the number of relevant documents that
    happened to be retrieved (as was done previously) rewards a system for
    missing relevant documents: one hit at rank 1 out of fifty relevant
    documents would score a perfect 1.0.

    Args:
        retrieved_docs: Ranked sequence of document ids (compared as strings).
        relevant_docs: Mapping of document id (str) to a relevance score; only
            scores greater than zero count as relevant.

    Returns:
        Average precision as a float, or 0.0 when no document has a positive
        relevance score.
    """
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    if total_relevant == 0:
        return 0.0

    relevant_count = 0
    precision_sum = 0.0
    for rank, doc_id in enumerate(retrieved_docs, 1):
        key = str(doc_id)
        if key in relevant_docs and float(relevant_docs[key]) > 0:
            relevant_count += 1
            # Precision at this rank, counted only at relevant positions.
            precision_sum += relevant_count / rank
    return precision_sum / total_relevant

def load_qrels(qrels_df):
    """Build a nested ``{query_id: {doc_id: relevance}}`` lookup from a qrels frame.

    The columns are iterated directly instead of via ``DataFrame.iterrows()``:
    ``iterrows`` packs each row into a single-dtype Series, so when every
    column is numeric and ``relevance`` is a float, integer ids are upcast and
    stringify as ``'46.0'`` instead of ``'46'``, which then never match the
    string ids used for the retrieval results.

    Args:
        qrels_df: DataFrame with ``query_id``, ``doc_id`` and ``relevance``
            columns.

    Returns:
        A ``defaultdict(dict)`` whose keys and inner keys are strings and whose
        values are floats.
    """
    qrels = defaultdict(dict)
    rows = zip(qrels_df['query_id'], qrels_df['doc_id'], qrels_df['relevance'])
    for query_id, doc_id, relevance in rows:
        qrels[str(query_id)][str(doc_id)] = float(relevance)
    return qrels

# Nested relevance lookup for BEIR, built from the qrels DataFrame.
qrels_beir = load_qrels(qrels_df_beir)

def evaluate_results(results, qrels, queries_df, dataset_name):
    """Score retrieval results against relevance judgments and print a summary.

    Queries without any judgment in ``qrels`` are skipped and do not count
    towards the averages.

    Args:
        results: Mapping of query id to its ranked list of retrieved doc ids.
        qrels: Mapping of query id to ``{doc_id: relevance}``.
        queries_df: DataFrame with ``query_id`` and ``processed_query``
            columns; only consulted to look up the text of poorly scoring
            queries.
        dataset_name: Label used as the prefix of the printed summary line.

    Returns:
        A dict with the averaged metrics (``'map'``, ``'precision@10'``,
        ``'recall@10'``, ``'mrr'``) and ``'low_map_queries'``, a list of
        ``(query_id, query_text, map_score)`` tuples for queries whose average
        precision is below 0.1. Earlier versions computed these values but
        returned nothing.
    """
    map_scores = []
    precision_scores = []
    recall_scores = []
    rr_scores = []
    low_map_queries = []
    # Built lazily so the column is converted at most once, and only when a
    # low-scoring query actually needs its text looked up.
    query_ids_as_str = None

    for query_id, retrieved_docs in results.items():
        relevant_docs = qrels.get(query_id, {})
        if not relevant_docs:
            continue

        map_score = calculate_map(retrieved_docs, relevant_docs)
        map_scores.append(map_score)
        if map_score < 0.1:
            if query_ids_as_str is None:
                query_ids_as_str = queries_df['query_id'].astype(str)
            filtered_df = queries_df[query_ids_as_str == str(query_id)]
            if not filtered_df.empty:
                query_text = filtered_df['processed_query'].iloc[0]
                low_map_queries.append((query_id, query_text, map_score))
        precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
        recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
        rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))

    map_score = float(np.mean(map_scores)) if map_scores else 0.0
    precision_score = float(np.mean(precision_scores)) if precision_scores else 0.0
    recall_score = float(np.mean(recall_scores)) if recall_scores else 0.0
    mrr_score = float(np.mean(rr_scores)) if rr_scores else 0.0

    print(f"{dataset_name} - MAP: {map_score:.4f}, Precision@10: {precision_score:.4f}, Recall@10: {recall_score:.4f}, MRR: {mrr_score:.4f}")

    return {
        'map': map_score,
        'precision@10': precision_score,
        'recall@10': recall_score,
        'mrr': mrr_score,
        'low_map_queries': low_map_queries,
    }

# Print the aggregate retrieval metrics for the BEIR run.
evaluate_results(results_beir, qrels_beir, queries_df_beir, 'beir')

# Show a sample of the results
print("\nSample results for BEIR (first 2 queries):")
for query_id in list(results_beir.keys())[:2]:
    print(f"Query ID: {query_id}, Top Docs: {results_beir[query_id]}")

# Run a garbage-collection pass at the end of the cell.
gc.collect()

Processing queries: 100%|██████████


beir - MAP: 0.7768, Precision@10: 0.1932, Recall@10: 0.3810, MRR: 0.5067

Sample results for BEIR (first 2 queries):
Query ID: 46, Top Docs: ['271267', '63734', '188177', '19261', '19262', '53617', '136358', '31586', '41567', '63729']
Query ID: 187, Top Docs: ['68732', '321224', '258305', '258306', '434871', '311017', '202157', '144864', '360881', '346186']


0