In [3]:
!pip install -q --upgrade transformers sentence-transformers


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import os
import torch
import joblib
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict

# التحقق من وجود GPU واستخدامه
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    print(f"✅ GPU متوفر: {gpu_name}")
else:
    print("❌ لا يوجد GPU متاح.")

BATCH_SIZE = 16
base_path_antique = "/content/data/antique"
base_path_beir = "/content/data/beir"

# تحميل بيانات الوثائق
docs_antique = pd.read_csv(os.path.join(base_path_antique, "docs_antique.csv"))
docs_beir = pd.read_csv(os.path.join(base_path_beir, "docs_beir.csv"))

doc_id_mapping_antique = {i: str(doc_id) for i, doc_id in enumerate(docs_antique['doc_id'])}
doc_id_mapping_beir = {i: str(doc_id) for i, doc_id in enumerate(docs_beir['doc_id'])}

# حفظ الـ doc_id_mapping لم يعد ضروريًا إذا لم تستخدمه في مكان آخر،
# لكن إذا تريد حفظه:
joblib.dump(doc_id_mapping_antique, os.path.join(base_path_antique, "doc_id_mapping.joblib"))
joblib.dump(doc_id_mapping_beir, os.path.join(base_path_beir, "doc_id_mapping.joblib"))

# تحميل تمثيلات الوثائق
embeddings_matrix_antique = joblib.load(os.path.join(base_path_antique, "embeddings_matrix.joblib"))
embeddings_matrix_beir = joblib.load(os.path.join(base_path_beir, "embeddings_matrix.joblib"))

# تحميل الاستعلامات المنظفة
queries_df_antique = pd.read_csv(os.path.join(base_path_antique, "enhanced_queries_antique.csv")).head(50)
queries_df_beir = pd.read_csv(os.path.join(base_path_beir, "enhanced_queries_beir.csv")).head(50)

queries_df_antique['processed_text'] = queries_df_antique['processed_query'].apply(lambda x: x.split())
queries_df_beir['processed_text'] = queries_df_beir['processed_query'].apply(lambda x: x.split())

# تحميل الموديل مباشرة من sentence-transformers (بدون حفظ/تحميل بالـ joblib)
def get_model(dataset_name):
    print(f"Loading model for {dataset_name} on {device}")
    return SentenceTransformer('all-mpnet-base-v2', device=device)

model_antique = get_model("antique")
model_beir = get_model("beir")

# تحويل الاستعلامات إلى تمثيلات embeddings
def encode_in_batches(texts, model, batch_size):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding queries"):
        batch_texts = [' '.join(text) if isinstance(text, list) else text for text in texts[i:i + batch_size]]
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=False, show_progress_bar=False)
        embeddings.append(batch_embeddings)
        gc.collect()
    return np.vstack(embeddings).astype(np.float32)

query_embeddings_antique = encode_in_batches(queries_df_antique['processed_text'].tolist(), model_antique, BATCH_SIZE)
query_embeddings_beir = encode_in_batches(queries_df_beir['processed_text'].tolist(), model_beir, BATCH_SIZE)

# حساب التشابهات
def process_batch(batch_idx, batch_queries, batch_query_ids, batch_texts, doc_embeddings, doc_id_mapping):
    batch_results = {}
    for j, (query_embedding, query_id, query_text) in enumerate(zip(batch_queries, batch_query_ids, batch_texts)):
        similarities = cosine_similarity(query_embedding.reshape(1, -1), doc_embeddings)[0]
        doc_scores = list(zip(doc_id_mapping.keys(), similarities))
        doc_scores.sort(key=lambda x: x[1], reverse=True)
        top_10_docs = [doc_id_mapping[idx] for idx, _ in doc_scores[:10]]
        batch_results[query_id] = top_10_docs
        del similarities
        gc.collect()
    return batch_results

def process_similarities_in_batches(query_embeddings, doc_embeddings, queries_df, batch_size, doc_id_mapping, dataset_base_path):
    results = {}
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        for i in range(0, len(query_embeddings), batch_size):
            batch_queries = query_embeddings[i:i + batch_size]
            batch_texts = queries_df['processed_text'].iloc[i:i + batch_size].tolist()
            batch_query_ids = queries_df['query_id'].iloc[i:i + batch_size].astype(str).tolist()
            futures.append(
                executor.submit(process_batch, i, batch_queries, batch_query_ids, batch_texts, doc_embeddings, doc_id_mapping)
            )
        for future in tqdm(futures, desc=f"Processing {dataset_base_path.split(os.sep)[-1]} queries"):
            batch_results = future.result()
            results.update(batch_results)
    return results

results_antique = process_similarities_in_batches(
    query_embeddings_antique, embeddings_matrix_antique, queries_df_antique, BATCH_SIZE, doc_id_mapping_antique, base_path_antique
)
results_beir = process_similarities_in_batches(
    query_embeddings_beir, embeddings_matrix_beir, queries_df_beir, BATCH_SIZE, doc_id_mapping_beir, base_path_beir
)

# تحميل qrels
qrels_df_antique = pd.read_csv(os.path.join(base_path_antique, "qrels_antique.csv"))
qrels_df_beir = pd.read_csv(os.path.join(base_path_beir, "qrels_beir.csv"))

def load_qrels(qrels_df):
    qrels = defaultdict(dict)
    for _, row in qrels_df.iterrows():
        qrels[str(row['query_id'])][str(row['doc_id'])] = float(row['relevance'])
    return qrels

qrels_antique = load_qrels(qrels_df_antique)
qrels_beir = load_qrels(qrels_df_beir)

# تقييم النتائج
def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    return sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0) / k

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    return sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0) / total_relevant if total_relevant > 0 else 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            return 1.0 / i
    return 0.0

def calculate_map(retrieved_docs, relevant_docs):
    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            relevant_count += 1
            precision_sum += relevant_count / i
    return precision_sum / relevant_count if relevant_count > 0 else 0.0

def evaluate_results(results, qrels, queries_df, dataset_name):
    map_scores, precision_scores, recall_scores, rr_scores = [], [], [], []
    for query_id in results:
        retrieved_docs = results[query_id]
        relevant_docs = qrels.get(query_id, {})
        if relevant_docs:
            map_scores.append(calculate_map(retrieved_docs, relevant_docs))
            precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
            recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
            rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))

    print(f"\n📊 {dataset_name.upper()} Evaluation:")
    print(f"MAP: {np.mean(map_scores):.4f}, Precision@10: {np.mean(precision_scores):.4f}, Recall@10: {np.mean(recall_scores):.4f}, MRR: {np.mean(rr_scores):.4f}")

evaluate_results(results_antique, qrels_antique, queries_df_antique, 'antique')
evaluate_results(results_beir, qrels_beir, queries_df_beir, 'beir')

print("\n🔍 Sample results for ANTIQUE (first 2 queries):")
for query_id in list(results_antique.keys())[:2]:
    print(f"Query ID: {query_id}, Top Docs: {results_antique[query_id]}")

print("\n🔍 Sample results for BEIR (first 2 queries):")
for query_id in list(results_beir.keys())[:2]:
    print(f"Query ID: {query_id}, Top Docs: {results_beir[query_id]}")

# تحرير الذاكرة
del query_embeddings_antique, query_embeddings_beir, embeddings_matrix_antique, embeddings_matrix_beir, model_antique, model_beir
gc.collect()


✅ GPU متوفر: Tesla T4
Loading model for antique on cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading model for beir on cuda


Encoding queries: 100%|██████████| 4/4 [00:02<00:00,  1.36it/s]
Encoding queries: 100%|██████████| 4/4 [00:01<00:00,  2.37it/s]
Processing antique queries: 100%|██████████| 4/4 [01:31<00:00, 22.92s/it]
Processing beir queries: 100%|██████████| 4/4 [01:49<00:00, 27.36s/it] 



📊 ANTIQUE Evaluation:
MAP: 0.4297, Precision@10: 0.1860, Recall@10: 0.0610, MRR: 0.4863

📊 BEIR Evaluation:
MAP: 0.4402, Precision@10: 0.0780, Recall@10: 0.5120, MRR: 0.4529

🔍 Sample results for ANTIQUE (first 2 queries):
Query ID: 3990512, Top Docs: ['2036065_1', '3265991_12', '1253368_5', '971642_3', '4087614_7', '3798202_0', '329228_8', '4273969_1', '1408062_3', '3313750_14']
Query ID: 714612, Top Docs: ['2781942_4', '721858_16', '218504_1', '2854298_9', '1178326_1', '3486436_1', '1396501_1', '3351230_3', '2497742_1', '2045066_1']

🔍 Sample results for BEIR (first 2 queries):
Query ID: 46, Top Docs: ['63734', '271267', '188177', '19261', '19262', '53617', '136358', '31586', '41567', '63729']
Query ID: 187, Top Docs: ['68732', '321224', '258305', '258306', '434871', '311017', '202157', '144864', '360881', '346186']


0