HYBRID

HYBRID_ANTIQUE

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import sqlite3
from scipy.sparse import csr_matrix
from collections import defaultdict
from math import log2
from datetime import datetime

# تعريف BATCH_SIZE
BATCH_SIZE = 16

# تحميل بيانات NLTK المطلوبة
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# تعريف دالة التوكنايزر المخصصة
stop_words = set(stopwords.words('english')) - {'not', 'no', 'never'}
lemmatizer = WordNetLemmatizer()

def custom_tokenizer(text):
    if not isinstance(text, str) or not text.strip():
        return []
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    return tokens


# دالة للحصول على doc_ids من inverted index
def get_doc_ids_from_index(processed_query, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    doc_ids = set()
    for term in processed_query:
        cursor.execute("SELECT doc_id FROM inverted_index WHERE term = ?", (term,))
        doc_ids.update(row[0] for row in cursor.fetchall())
    conn.close()
    return doc_ids

# مسارات الملفات
base_path_antique = r"data\antique"


tfidf_matrix_path_antique = os.path.join(base_path_antique, "tfidf_matrix.joblib")
queries_path_antique = os.path.join(base_path_antique, "queries_antique.csv")
docs_path_antique = os.path.join(base_path_antique, "docs_antique.csv")
doc_id_mapping_path_antique = os.path.join(base_path_antique, "doc_id_mapping.joblib")
embeddings_matrix_path_antique = os.path.join(base_path_antique, "embeddings_matrix.joblib")
embeddings_vectorizer_path_antique = os.path.join(base_path_antique, "embeddings_vectorizer.joblib")


qrels_path_antique = os.path.join(base_path_antique, "qrels_antique.csv")

# تحميل المستندات وإنشاء doc_id_mapping
docs_antique = pd.read_csv(docs_path_antique)

doc_id_mapping_antique = {i: str(doc_id) for i, doc_id in enumerate(docs_antique['doc_id'])}

joblib.dump(doc_id_mapping_antique, doc_id_mapping_path_antique)


# تحميل مصفوفات TF-IDF و التضمينات
tfidf_matrix_antique = joblib.load(tfidf_matrix_path_antique)

embeddings_matrix_antique = joblib.load(embeddings_matrix_path_antique)


# تحميل الاستعلامات
queries_df_antique = pd.read_csv(queries_path_antique)


# إزالة الاستعلامات الفارغة
queries_df_antique = queries_df_antique[queries_df_antique['text'].notna() & queries_df_antique['text'].str.strip() != '']




# تحميل محولات TF-IDF
tfidf_vectorizer_antique = joblib.load(os.path.join(base_path_antique, "tfidf_vectorizer.joblib"))


# تحميل نموذج embeddings_vectorizer المخزن
model_antique = joblib.load(embeddings_vectorizer_path_antique)



query_texts_antique = queries_df_antique['text'].tolist()

query_tfidf_antique = tfidf_vectorizer_antique.transform(query_texts_antique)




query_embeddings_antique = joblib.load(os.path.join(base_path_antique, "query_embeddings_matrix_antique.joblib"))



def rank_documents_hybrid(query_vector, dataset_name):
    try:
        ranked_docs = []
        base_path = f"data/{dataset_name}"
        db_path = os.path.join(base_path, "index.db")
        
        # تحميل قاعدة البيانات
        if not os.path.exists(db_path):
            return {"status": "error", "message": "Database not found"}
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT doc_id, text FROM documents")
        doc_mapping = {i: (doc_id, text) for i, (doc_id, text) in enumerate(cursor.fetchall())}
        conn.close()

        # تحميل مصفوفات TF-IDF والتضمينات
        tfidf_matrix = joblib.load(os.path.join(base_path, "tfidf_matrix.joblib"))
        embeddings_matrix = joblib.load(os.path.join(base_path, "embeddings_matrix.joblib"))
        doc_id_mapping = joblib.load(os.path.join(base_path, "doc_id_mapping.joblib"))

        # معالجة استعلام TF-IDF
        processed_terms = custom_tokenizer(query_vector['text'])
        doc_ids = get_doc_ids_from_index(processed_terms, db_path)
        filtered_indices = [idx for idx, doc_id in doc_id_mapping.items() if doc_id in doc_ids]
        if not filtered_indices:
            return {"status": "error", "message": "No documents found in inverted index"}

        # حساب التشابه باستخدام TF-IDF
        query_vector_tfidf = np.array(query_vector['tfidf']).reshape(1, -1)
        if query_vector_tfidf.shape[1] != tfidf_matrix.shape[1]:
            return {"status": "error", "message": "Dimension mismatch in TF-IDF"}
        filtered_tfidf_matrix = tfidf_matrix[filtered_indices]
        similarities_tfidf = cosine_similarity(query_vector_tfidf, filtered_tfidf_matrix)
        doc_indices_tfidf = np.argsort(similarities_tfidf[0])[::-1][:1000]

        # حساب التشابه باستخدام التضمينات
        query_vector_embedding = np.array(query_vector['embedding']).reshape(1, -1)
        if query_vector_embedding.shape[1] != embeddings_matrix.shape[1]:
            return {"status": "error", "message": "Dimension mismatch in embeddings"}
        filtered_embeddings = embeddings_matrix[filtered_indices]
        similarities_embedding = cosine_similarity(query_vector_embedding, filtered_embeddings)
        doc_indices_embedding = np.argsort(similarities_embedding[0])[::-1][:10]

        # دمج النتائج
        for idx in doc_indices_embedding:
            original_idx = filtered_indices[idx]
            if original_idx < tfidf_matrix.shape[0] and original_idx in doc_mapping:
                doc_id, text = doc_mapping[original_idx]
                tfidf_score = similarities_tfidf[0][idx] if idx < len(similarities_tfidf[0]) else 0.0
                ranked_docs.append({
                    "doc_id": doc_id,
                    "score": float(similarities_embedding[0][idx]),
                    "text": text,
                    "tfidf_score": float(tfidf_score)
                })

        if not ranked_docs:
            return {"status": "error", "message": "No valid documents found"}
        
        return {"status": "success", "ranked_docs": ranked_docs}
    
    except Exception as e:
        print(f"Ranking error: {str(e)}")
        return {"status": "error", "message": str(e)}

# دالة لمعالجة الاستعلامات بشكل دفعي
def process_hybrid_similarities_in_batches(query_tfidf, query_embeddings, tfidf_matrix, embeddings_matrix, queries_df, batch_size, doc_id_mapping, dataset_base_path):
    results = {}
    index_db_path = os.path.join(dataset_base_path, "index.db")
    
    total_batches = (query_tfidf.shape[0] + batch_size - 1) // batch_size
    progress_bar = tqdm(total=total_batches, desc=f"🔄 Processing queries ({dataset_base_path.split(os.sep)[-1]})", unit="batch")

    for i in range(0, query_tfidf.shape[0], batch_size):
        batch_queries_tfidf = query_tfidf[i:i + batch_size]
        batch_queries_embeddings = query_embeddings[i:i + batch_size]
        batch_texts = queries_df['text'].iloc[i:i + batch_size].tolist()
        batch_query_ids = queries_df['query_id'].iloc[i:i + batch_size].astype(str).tolist()

        for query_tfidf_vec, query_emb_vec, query_text, query_id in zip(batch_queries_tfidf, batch_queries_embeddings, batch_texts, batch_query_ids):
            query_vector = {
                'tfidf': query_tfidf_vec.toarray()[0],
                'embedding': query_emb_vec,
                'text': query_text
            }
            result = rank_documents_hybrid(query_vector, dataset_base_path.split(os.sep)[-1])
            if result['status'] == 'success':
                results[query_id] = [doc['doc_id'] for doc in result['ranked_docs']]
            else:
                results[query_id] = []
        
        gc.collect()
        progress_bar.update(1)
    
    progress_bar.close()
    return results

# دالة لحساب المقاييس
def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    return relevant_in_top_k / k if k > 0 else 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    return relevant_in_top_k / total_relevant if total_relevant > 0 else 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            return 1.0 / i
    return 0.0


def calculate_map(retrieved_docs, relevant_docs):
    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            relevant_count += 1
            precision_sum += relevant_count / i
    return precision_sum / relevant_count if relevant_count > 0 else 0.0

def load_qrels(qrels_df):
    qrels = defaultdict(dict)
    for _, row in qrels_df.iterrows():
        qrels[str(row['query_id'])][str(row['doc_id'])] = float(row['relevance'])
    return qrels

# تقييم النتائج
def evaluate_results(results, qrels, queries_df, dataset_name):
    map_scores = []
    precision_scores = []
    recall_scores = []
    rr_scores = []
    low_map_queries = []

    for query_id in results:
        retrieved_docs = results[query_id]
        relevant_docs = qrels.get(query_id, {})
        if relevant_docs:
            map_score = calculate_map(retrieved_docs, relevant_docs)
            map_scores.append(map_score)
            if map_score < 0.1:
                filtered_df = queries_df[queries_df['query_id'].astype(str) == str(query_id)]
                if not filtered_df.empty:
                    query_text = filtered_df['text'].iloc[0]
                    low_map_queries.append((query_id, query_text, map_score))
                else:
                    print(f"Query ID {query_id} not found in queries_df for {dataset_name}")
            precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
            recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
            rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))

    map_score = np.mean(map_scores) if map_scores else 0.0
    precision_score = np.mean(precision_scores) if precision_scores else 0.0
    recall_score = np.mean(recall_scores) if recall_scores else 0.0
    mrr_score = np.mean(rr_scores) if rr_scores else 0.0

    print(f"{dataset_name} - MAP: {map_score:.4f}, Precision@10: {precision_score:.4f}, Recall@10: {recall_score:.4f}, MRR: {mrr_score:.4f}")

# استدعاء الدالة لهجين ANTIQUE
results_antique = process_hybrid_similarities_in_batches(
    query_tfidf_antique, query_embeddings_antique, tfidf_matrix_antique, embeddings_matrix_antique,
    queries_df_antique, BATCH_SIZE, doc_id_mapping_antique, base_path_antique
)



# تحميل qrels
qrels_antique = load_qrels(pd.read_csv(qrels_path_antique))


# تقييم النتائج
evaluate_results(results_antique, qrels_antique, queries_df_antique, 'antique')





# تحرير الذاكرة
del query_tfidf_antique,query_embeddings_antique, 
del tfidf_matrix_antique, embeddings_matrix_antique, 
del model_antique, 
gc.collect()   

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
🔄 Processing queries (antique): 100%|██████████| 11/11 [06:35<00:00, 35.99s/batch]


antique - MAP: 0.4446, Precision@10: 0.2085, Recall@10: 0.0721, MRR: 0.4910


3690

HYBRID_BEIR

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import sqlite3
from scipy.sparse import csr_matrix
from collections import defaultdict
from math import log2
from datetime import datetime

# تعريف BATCH_SIZE
BATCH_SIZE = 16

# تحميل بيانات NLTK المطلوبة
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# تعريف دالة التوكنايزر المخصصة
stop_words = set(stopwords.words('english')) - {'not', 'no', 'never'}
lemmatizer = WordNetLemmatizer()

def custom_tokenizer(text):
    if not isinstance(text, str) or not text.strip():
        return []
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    return tokens


# دالة للحصول على doc_ids من inverted index
def get_doc_ids_from_index(processed_query, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    doc_ids = set()
    for term in processed_query:
        cursor.execute("SELECT doc_id FROM inverted_index WHERE term = ?", (term,))
        doc_ids.update(row[0] for row in cursor.fetchall())
    conn.close()
    return doc_ids

# مسارات الملفات

base_path_beir = r"data\beir"



tfidf_matrix_path_beir = os.path.join(base_path_beir, "tfidf_matrix.joblib")
queries_path_beir = os.path.join(base_path_beir, "queries_beir.csv")
docs_path_beir = os.path.join(base_path_beir, "docs_beir.csv")
doc_id_mapping_path_beir = os.path.join(base_path_beir, "doc_id_mapping.joblib")
embeddings_matrix_path_beir = os.path.join(base_path_beir, "embeddings_matrix.joblib")
embeddings_vectorizer_path_beir = os.path.join(base_path_beir, "embeddings_vectorizer.joblib")


qrels_path_beir = os.path.join(base_path_beir, "qrels_beir.csv")

# تحميل المستندات وإنشاء doc_id_mapping

docs_beir = pd.read_csv(docs_path_beir)

doc_id_mapping_beir = {i: str(doc_id) for i, doc_id in enumerate(docs_beir['doc_id'])}

joblib.dump(doc_id_mapping_beir, doc_id_mapping_path_beir)

# تحميل مصفوفات TF-IDF و التضمينات

tfidf_matrix_beir = joblib.load(tfidf_matrix_path_beir)

embeddings_matrix_beir = joblib.load(embeddings_matrix_path_beir)

# تحميل الاستعلامات

queries_df_beir = pd.read_csv(queries_path_beir)

# إزالة الاستعلامات الفارغة

queries_df_beir = queries_df_beir[queries_df_beir['text'].notna() & queries_df_beir['text'].str.strip() != '']



# تحميل محولات TF-IDF

tfidf_vectorizer_beir = joblib.load(os.path.join(base_path_beir, "tfidf_vectorizer.joblib"))

# تحميل نموذج embeddings_vectorizer المخزن

model_beir = joblib.load(embeddings_vectorizer_path_beir)



query_texts_beir = queries_df_beir['text'].tolist()

query_tfidf_beir = tfidf_vectorizer_beir.transform(query_texts_beir)

# دالة لتحويل الاستعلامات إلى تضمينات بشكل دفعي
def encode_in_batches(texts, vectorizer, batch_size=BATCH_SIZE):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding queries", bar_format="{l_bar}{bar}"):
        batch_texts = texts[i:i + batch_size]
        batch_embeddings = vectorizer.encode(batch_texts, convert_to_tensor=False, show_progress_bar=False)
        embeddings.append(batch_embeddings)
        gc.collect()
    return np.vstack(embeddings).astype(np.float32)


query_embeddings_beir = joblib.load(os.path.join(base_path_beir, "query_embeddings_matrix_beir.joblib"))

# دالة هجينة لترتيب المستندات
def rank_documents_hybrid(query_vector, dataset_name):
    try:
        ranked_docs = []
        base_path = f"data/{dataset_name}"
        db_path = os.path.join(base_path, "index.db")
        
        # تحميل قاعدة البيانات
        if not os.path.exists(db_path):
            return {"status": "error", "message": "Database not found"}
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT doc_id, text FROM documents")
        doc_mapping = {i: (doc_id, text) for i, (doc_id, text) in enumerate(cursor.fetchall())}
        conn.close()

        # تحميل مصفوفات TF-IDF والتضمينات
        tfidf_matrix = joblib.load(os.path.join(base_path, "tfidf_matrix.joblib"))
        embeddings_matrix = joblib.load(os.path.join(base_path, "embeddings_matrix.joblib"))
        doc_id_mapping = joblib.load(os.path.join(base_path, "doc_id_mapping.joblib"))

        # معالجة استعلام TF-IDF
        processed_terms = custom_tokenizer(query_vector['text'])
        doc_ids = get_doc_ids_from_index(processed_terms, db_path)
        filtered_indices = [idx for idx, doc_id in doc_id_mapping.items() if doc_id in doc_ids]
        if not filtered_indices:
            return {"status": "error", "message": "No documents found in inverted index"}

        # حساب التشابه باستخدام TF-IDF
        query_vector_tfidf = np.array(query_vector['tfidf']).reshape(1, -1)
        if query_vector_tfidf.shape[1] != tfidf_matrix.shape[1]:
            return {"status": "error", "message": "Dimension mismatch in TF-IDF"}
        filtered_tfidf_matrix = tfidf_matrix[filtered_indices]
        similarities_tfidf = cosine_similarity(query_vector_tfidf, filtered_tfidf_matrix)
        doc_indices_tfidf = np.argsort(similarities_tfidf[0])[::-1][:1000]

        # حساب التشابه باستخدام التضمينات
        query_vector_embedding = np.array(query_vector['embedding']).reshape(1, -1)
        if query_vector_embedding.shape[1] != embeddings_matrix.shape[1]:
            return {"status": "error", "message": "Dimension mismatch in embeddings"}
        filtered_embeddings = embeddings_matrix[filtered_indices]
        similarities_embedding = cosine_similarity(query_vector_embedding, filtered_embeddings)
        doc_indices_embedding = np.argsort(similarities_embedding[0])[::-1][:10]

        # دمج النتائج
        for idx in doc_indices_embedding:
            original_idx = filtered_indices[idx]
            if original_idx < tfidf_matrix.shape[0] and original_idx in doc_mapping:
                doc_id, text = doc_mapping[original_idx]
                tfidf_score = similarities_tfidf[0][idx] if idx < len(similarities_tfidf[0]) else 0.0
                ranked_docs.append({
                    "doc_id": doc_id,
                    "score": float(similarities_embedding[0][idx]),
                    "text": text,
                    "tfidf_score": float(tfidf_score)
                })

        if not ranked_docs:
            return {"status": "error", "message": "No valid documents found"}
        
        return {"status": "success", "ranked_docs": ranked_docs}
    
    except Exception as e:
        print(f"Ranking error: {str(e)}")
        return {"status": "error", "message": str(e)}

# دالة لمعالجة الاستعلامات بشكل دفعي
def process_hybrid_similarities_in_batches(query_tfidf, query_embeddings, tfidf_matrix, embeddings_matrix, queries_df, batch_size, doc_id_mapping, dataset_base_path):
    results = {}
    index_db_path = os.path.join(dataset_base_path, "index.db")
    
    total_batches = (query_tfidf.shape[0] + batch_size - 1) // batch_size
    progress_bar = tqdm(total=total_batches, desc=f"🔄 Processing queries ({dataset_base_path.split(os.sep)[-1]})", unit="batch")

    for i in range(0, query_tfidf.shape[0], batch_size):
        batch_queries_tfidf = query_tfidf[i:i + batch_size]
        batch_queries_embeddings = query_embeddings[i:i + batch_size]
        batch_texts = queries_df['text'].iloc[i:i + batch_size].tolist()
        batch_query_ids = queries_df['query_id'].iloc[i:i + batch_size].astype(str).tolist()

        for query_tfidf_vec, query_emb_vec, query_text, query_id in zip(batch_queries_tfidf, batch_queries_embeddings, batch_texts, batch_query_ids):
            query_vector = {
                'tfidf': query_tfidf_vec.toarray()[0],
                'embedding': query_emb_vec,
                'text': query_text
            }
            result = rank_documents_hybrid(query_vector, dataset_base_path.split(os.sep)[-1])
            if result['status'] == 'success':
                results[query_id] = [doc['doc_id'] for doc in result['ranked_docs']]
            else:
                results[query_id] = []
        
        gc.collect()
        progress_bar.update(1)
    
    progress_bar.close()
    return results

# دالة لحساب المقاييس
def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    return relevant_in_top_k / k if k > 0 else 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    return relevant_in_top_k / total_relevant if total_relevant > 0 else 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            return 1.0 / i
    return 0.0


def calculate_map(retrieved_docs, relevant_docs):
    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            relevant_count += 1
            precision_sum += relevant_count / i
    return precision_sum / relevant_count if relevant_count > 0 else 0.0

def load_qrels(qrels_df):
    qrels = defaultdict(dict)
    for _, row in qrels_df.iterrows():
        qrels[str(row['query_id'])][str(row['doc_id'])] = float(row['relevance'])
    return qrels

# تقييم النتائج     
def evaluate_results(results, qrels, queries_df, dataset_name):
    map_scores = []
    precision_scores = []
    recall_scores = []
    rr_scores = []
    low_map_queries = []

    for query_id in results:
        retrieved_docs = results[query_id]
        relevant_docs = qrels.get(query_id, {})
        if relevant_docs:
            map_score = calculate_map(retrieved_docs, relevant_docs)
            map_scores.append(map_score)
            if map_score < 0.1:
                filtered_df = queries_df[queries_df['query_id'].astype(str) == str(query_id)]
                if not filtered_df.empty:
                    query_text = filtered_df['text'].iloc[0]
                    low_map_queries.append((query_id, query_text, map_score))
                else:
                    print(f"Query ID {query_id} not found in queries_df for {dataset_name}")
            precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
            recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
            rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))

    map_score = np.mean(map_scores) if map_scores else 0.0
    precision_score = np.mean(precision_scores) if precision_scores else 0.0
    recall_score = np.mean(recall_scores) if recall_scores else 0.0
    mrr_score = np.mean(rr_scores) if rr_scores else 0.0

    print(f"{dataset_name} - MAP: {map_score:.4f}, Precision@10: {precision_score:.4f}, Recall@10: {recall_score:.4f}, MRR: {mrr_score:.4f}")


# استدعاء الدالة لهجين BEIR
results_beir = process_hybrid_similarities_in_batches(
    query_tfidf_beir, query_embeddings_beir, tfidf_matrix_beir, embeddings_matrix_beir,
    queries_df_beir, BATCH_SIZE, doc_id_mapping_beir, base_path_beir
)

# تحميل qrels

qrels_beir = load_qrels(pd.read_csv(qrels_path_beir))

# تقييم النتائج

evaluate_results(results_beir, qrels_beir, queries_df_beir, 'beir')




# تحرير الذاكرة
del  query_tfidf_beir, query_embeddings_beir
del  tfidf_matrix_beir , embeddings_matrix_beir
del  model_beir
gc.collect()   

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading stopwords: <urlopen error [Errno 11002]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading wordnet: <urlopen error [Errno 11002]
[nltk_data]     getaddrinfo failed>
  from .autonotebook import tqdm as notebook_tqdm
🔄 Processing queries (beir): 100%|██████████| 625/625 [6:41:51<00:00, 38.58s/batch]  


beir - MAP: 0.6929, Precision@10: 0.1088, Recall@10: 0.8113, MRR: 0.7059


3690

hybrid and vector store

hybrid and vector store ANTIQUE

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import sqlite3
from scipy.sparse import csr_matrix
from collections import defaultdict
from math import log2
from datetime import datetime

# تعريف BATCH_SIZE
BATCH_SIZE = 16

# تحميل بيانات NLTK المطلوبة
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# تعريف دالة التوكنايزر المخصصة
stop_words = set(stopwords.words('english')) - {'not', 'no', 'never'}
lemmatizer = WordNetLemmatizer()

def custom_tokenizer(text):
    if not isinstance(text, str) or not text.strip():
        return []
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    return tokens



# دالة للحصول على doc_ids من inverted index
def get_doc_ids_from_index(processed_query, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    doc_ids = set()
    for term in processed_query:
        cursor.execute("SELECT doc_id FROM inverted_index WHERE term = ?", (term,))
        doc_ids.update(row[0] for row in cursor.fetchall())
    conn.close()
    return doc_ids

# مسارات الملفات
base_path_antique = r"data\antique"


tfidf_matrix_path_antique = os.path.join(base_path_antique, "tfidf_matrix.joblib")
queries_path_antique = os.path.join(base_path_antique, "queries_antique.csv")
docs_path_antique = os.path.join(base_path_antique, "docs_antique.csv")
doc_id_mapping_path_antique = os.path.join(base_path_antique, "doc_id_mapping.joblib")
embeddings_matrix_path_antique = os.path.join(base_path_antique, "embeddings_matrix.joblib")
embeddings_vectorizer_path_antique = os.path.join(base_path_antique, "embeddings_vectorizer.joblib")


qrels_path_antique = os.path.join(base_path_antique, "qrels_antique.csv")

# تحميل المستندات وإنشاء doc_id_mapping
docs_antique = pd.read_csv(docs_path_antique)

doc_id_mapping_antique = {i: str(doc_id) for i, doc_id in enumerate(docs_antique['doc_id'])}

joblib.dump(doc_id_mapping_antique, doc_id_mapping_path_antique)


# تحميل مصفوفات TF-IDF و التضمينات
tfidf_matrix_antique = joblib.load(tfidf_matrix_path_antique)

embeddings_matrix_antique = joblib.load(embeddings_matrix_path_antique)


# تحميل الاستعلامات
queries_df_antique = pd.read_csv(queries_path_antique)


# إزالة الاستعلامات الفارغة
queries_df_antique = queries_df_antique[queries_df_antique['text'].notna() & queries_df_antique['text'].str.strip() != '']




# تحميل محولات TF-IDF
tfidf_vectorizer_antique = joblib.load(os.path.join(base_path_antique, "tfidf_vectorizer.joblib"))


# تحميل نموذج embeddings_vectorizer المخزن
model_antique = joblib.load(embeddings_vectorizer_path_antique)



query_texts_antique = queries_df_antique['text'].tolist()

query_tfidf_antique = tfidf_vectorizer_antique.transform(query_texts_antique)


# دالة لتحويل الاستعلامات إلى تضمينات بشكل دفعي
def encode_in_batches(texts, vectorizer, batch_size=BATCH_SIZE):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding queries", bar_format="{l_bar}{bar}"):
        batch_texts = texts[i:i + batch_size]
        batch_embeddings = vectorizer.encode(batch_texts, convert_to_tensor=False, show_progress_bar=False)
        embeddings.append(batch_embeddings)
        gc.collect()
    return np.vstack(embeddings).astype(np.float32)

query_embeddings_antique = joblib.load(os.path.join(base_path_antique, "query_embeddings_matrix_antique.joblib"))


# دالة هجينة لترتيب المستندات
import faiss
import numpy as np
import sqlite3
import os
import joblib
from sklearn.metrics.pairwise import cosine_similarity

def rank_documents_hybrid(query_vector, dataset_name):
    try:
        ranked_docs = []
        base_path = f"data/{dataset_name}"
        db_path = os.path.join(base_path, "index.db")
        
        # تحميل قاعدة البيانات
        if not os.path.exists(db_path):
            return {"status": "error", "message": "Database not found"}
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT doc_id, text FROM documents")
        doc_mapping = {i: (doc_id, text) for i, (doc_id, text) in enumerate(cursor.fetchall())}
        conn.close()

        # تحميل مصفوفات TF-IDF والتضمينات وفهرس FAISS
        tfidf_matrix = joblib.load(os.path.join(base_path, "tfidf_matrix.joblib"))
        embeddings_matrix = joblib.load(os.path.join(base_path, "embeddings_matrix.joblib"))
        doc_id_mapping = joblib.load(os.path.join(base_path, "doc_id_mapping.joblib"))
        index_path = os.path.join(base_path, "embedding_index.faiss")
        
        # تحميل فهرس FAISS
        if not os.path.exists(index_path):
            return {"status": "error", "message": "Vector store index not found"}
        index = faiss.read_index(index_path)

        # معالجة استعلام TF-IDF
        processed_terms = custom_tokenizer(query_vector['text'])
        doc_ids = get_doc_ids_from_index(processed_terms, db_path)
        filtered_indices = [idx for idx, doc_id in doc_id_mapping.items() if doc_id in doc_ids]
        if not filtered_indices:
            return {"status": "error", "message": "No documents found in inverted index"}

        # حساب التشابه باستخدام TF-IDF
        query_vector_tfidf = np.array(query_vector['tfidf']).reshape(1, -1)
        if query_vector_tfidf.shape[1] != tfidf_matrix.shape[1]:
            return {"status": "error", "message": "Dimension mismatch in TF-IDF"}
        filtered_tfidf_matrix = tfidf_matrix[filtered_indices]
        similarities_tfidf = cosine_similarity(query_vector_tfidf, filtered_tfidf_matrix)
        doc_indices_tfidf = np.argsort(similarities_tfidf[0])[::-1][:1000]

        # حساب التشابه باستخدام التضمينات مع FAISS
        query_vector_embedding = np.array(query_vector['embedding']).reshape(1, -1).astype(np.float32)
        if query_vector_embedding.shape[1] != embeddings_matrix.shape[1]:
            return {"status": "error", "message": "Dimension mismatch in embeddings"}
        
        # البحث باستخدام FAISS لاختيار أفضل 10 مستندات
        distances, indices = index.search(query_vector_embedding, k=10)
        doc_indices_embedding = [idx for idx in indices[0] if idx < len(doc_mapping) and idx in filtered_indices]

        # إذا لم يتم العثور على مستندات صالحة في FAISS
        if not doc_indices_embedding:
            return {"status": "error", "message": "No valid documents found in FAISS index"}

        # حساب التشابه الكوسيني للمستندات المختارة من FAISS
        selected_embeddings = embeddings_matrix[doc_indices_embedding]
        similarities_embedding = cosine_similarity(query_vector_embedding, selected_embeddings)
        sorted_indices = np.argsort(similarities_embedding[0])[::-1]

        # دمج النتائج
        for idx in sorted_indices:
            original_idx = doc_indices_embedding[idx]
            if original_idx < tfidf_matrix.shape[0] and original_idx in doc_mapping:
                doc_id, text = doc_mapping[original_idx]
                tfidf_score = similarities_tfidf[0][filtered_indices.index(original_idx)] if original_idx in filtered_indices else 0.0
                ranked_docs.append({
                    "doc_id": doc_id,
                    "score": float(similarities_embedding[0][idx]),
                    "text": text,
                    "tfidf_score": float(tfidf_score)
                })

        if not ranked_docs:
            return {"status": "error", "message": "No valid documents found"}
        
        return {"status": "success", "ranked_docs": ranked_docs}
    
    except Exception as e:
        print(f"Ranking error: {str(e)}")
        return {"status": "error", "message": str(e)}

# دالة لمعالجة الاستعلامات بشكل دفعي
def process_hybrid_similarities_in_batches(query_tfidf, query_embeddings, tfidf_matrix, embeddings_matrix, queries_df, batch_size, doc_id_mapping, dataset_base_path):
    results = {}
    index_db_path = os.path.join(dataset_base_path, "index.db")
    
    total_batches = (query_tfidf.shape[0] + batch_size - 1) // batch_size
    progress_bar = tqdm(total=total_batches, desc=f"🔄 Processing queries ({dataset_base_path.split(os.sep)[-1]})", unit="batch")

    for i in range(0, query_tfidf.shape[0], batch_size):
        batch_queries_tfidf = query_tfidf[i:i + batch_size]
        batch_queries_embeddings = query_embeddings[i:i + batch_size]
        batch_texts = queries_df['text'].iloc[i:i + batch_size].tolist()
        batch_query_ids = queries_df['query_id'].iloc[i:i + batch_size].astype(str).tolist()

        for query_tfidf_vec, query_emb_vec, query_text, query_id in zip(batch_queries_tfidf, batch_queries_embeddings, batch_texts, batch_query_ids):
            query_vector = {
                'tfidf': query_tfidf_vec.toarray()[0],
                'embedding': query_emb_vec,
                'text': query_text
            }
            result = rank_documents_hybrid(query_vector, dataset_base_path.split(os.sep)[-1])
            if result['status'] == 'success':
                results[query_id] = [doc['doc_id'] for doc in result['ranked_docs']]
            else:
                results[query_id] = []
        
        gc.collect()
        progress_bar.update(1)
    
    progress_bar.close()
    return results

# دالة لحساب المقاييس
def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    return relevant_in_top_k / k if k > 0 else 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    return relevant_in_top_k / total_relevant if total_relevant > 0 else 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            return 1.0 / i
    return 0.0


def calculate_map(retrieved_docs, relevant_docs):
    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            relevant_count += 1
            precision_sum += relevant_count / i
    return precision_sum / relevant_count if relevant_count > 0 else 0.0

def load_qrels(qrels_df):
    qrels = defaultdict(dict)
    for _, row in qrels_df.iterrows():
        qrels[str(row['query_id'])][str(row['doc_id'])] = float(row['relevance'])
    return qrels

# تقييم النتائج وتسجيل الاستعلامات ذات الأداء المنخفض
def evaluate_results(results, qrels, queries_df, dataset_name):
    map_scores = []
    precision_scores = []
    recall_scores = []
    rr_scores = []
    low_map_queries = []

    for query_id in results:
        retrieved_docs = results[query_id]
        relevant_docs = qrels.get(query_id, {})
        if relevant_docs:
            map_score = calculate_map(retrieved_docs, relevant_docs)
            map_scores.append(map_score)
            if map_score < 0.1:
                filtered_df = queries_df[queries_df['query_id'].astype(str) == str(query_id)]
                if not filtered_df.empty:
                    query_text = filtered_df['text'].iloc[0]
                    low_map_queries.append((query_id, query_text, map_score))
                else:
                    print(f"Query ID {query_id} not found in queries_df for {dataset_name}")
            precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
            recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
            rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))

    map_score = np.mean(map_scores) if map_scores else 0.0
    precision_score = np.mean(precision_scores) if precision_scores else 0.0
    recall_score = np.mean(recall_scores) if recall_scores else 0.0
    mrr_score = np.mean(rr_scores) if rr_scores else 0.0

    print(f"{dataset_name} - MAP: {map_score:.4f}, Precision@10: {precision_score:.4f}, Recall@10: {recall_score:.4f}, MRR: {mrr_score:.4f}")

# استدعاء الدالة لهجين ANTIQUE
results_antique = process_hybrid_similarities_in_batches(
    query_tfidf_antique, query_embeddings_antique, tfidf_matrix_antique, embeddings_matrix_antique,
    queries_df_antique, BATCH_SIZE, doc_id_mapping_antique, base_path_antique
)



# تحميل qrels
qrels_antique = load_qrels(pd.read_csv(qrels_path_antique))


# تقييم النتائج
evaluate_results(results_antique, qrels_antique, queries_df_antique, 'antique')





# تحرير الذاكرة
del query_tfidf_antique,query_embeddings_antique, 
del tfidf_matrix_antique, embeddings_matrix_antique, 
del model_antique, 
gc.collect()   

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
🔄 Processing queries (antique): 100%|██████████| 11/11 [07:57<00:00, 43.39s/batch]


antique - MAP: 0.4267, Precision@10: 0.1585, Recall@10: 0.0516, MRR: 0.4628


3690

hybrid and vector store BEIR

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import sqlite3
from scipy.sparse import csr_matrix
from collections import defaultdict
from math import log2
from datetime import datetime
import faiss

# إعداد
BATCH_SIZE = 16
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english')) - {'not', 'no', 'never'}
lemmatizer = WordNetLemmatizer()

def custom_tokenizer(text):
    if not isinstance(text, str) or not text.strip():
        return []
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    return tokens

def get_doc_ids_from_index(processed_query, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    doc_ids = set()
    for term in processed_query:
        cursor.execute("SELECT doc_id FROM inverted_index WHERE term = ?", (term,))
        doc_ids.update(row[0] for row in cursor.fetchall())
    conn.close()
    return doc_ids

# Paths
base_path_beir = r"data\beir"
tfidf_matrix_path_beir = os.path.join(base_path_beir, "tfidf_matrix.joblib")
queries_path_beir = os.path.join(base_path_beir, "queries_beir.csv")
docs_path_beir = os.path.join(base_path_beir, "docs_beir.csv")
doc_id_mapping_path_beir = os.path.join(base_path_beir, "doc_id_mapping.joblib")
embeddings_matrix_path_beir = os.path.join(base_path_beir, "embeddings_matrix.joblib")
embeddings_vectorizer_path_beir = os.path.join(base_path_beir, "embeddings_vectorizer.joblib")
qrels_path_beir = os.path.join(base_path_beir, "qrels_beir.csv")

# Load data
docs_beir = pd.read_csv(docs_path_beir)
doc_id_mapping_beir = {i: str(doc_id) for i, doc_id in enumerate(docs_beir['doc_id'])}
joblib.dump(doc_id_mapping_beir, doc_id_mapping_path_beir)

tfidf_matrix_beir = joblib.load(tfidf_matrix_path_beir)
embeddings_matrix_beir = joblib.load(embeddings_matrix_path_beir)
queries_df_beir = pd.read_csv(queries_path_beir)
queries_df_beir = queries_df_beir[queries_df_beir['text'].notna() & queries_df_beir['text'].str.strip() != '']

tfidf_vectorizer_beir = joblib.load(os.path.join(base_path_beir, "tfidf_vectorizer.joblib"))
model_beir = joblib.load(embeddings_vectorizer_path_beir)

query_texts_beir = queries_df_beir['text'].tolist()
query_tfidf_beir = tfidf_vectorizer_beir.transform(query_texts_beir)
query_embeddings_beir = joblib.load(os.path.join(base_path_beir, "query_embeddings_matrix_beir.joblib"))

# Hybrid ranking function
def rank_documents_hybrid(query_vector, dataset_name):
    try:
        ranked_docs = []
        base_path = f"data/{dataset_name}"
        db_path = os.path.join(base_path, "index.db")
        if not os.path.exists(db_path):
            return {"status": "error", "message": "Database not found"}
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT doc_id, text FROM documents")
        doc_mapping = {i: (doc_id, text) for i, (doc_id, text) in enumerate(cursor.fetchall())}
        conn.close()

        tfidf_matrix = joblib.load(os.path.join(base_path, "tfidf_matrix.joblib"))
        embeddings_matrix = joblib.load(os.path.join(base_path, "embeddings_matrix.joblib"))
        doc_id_mapping = joblib.load(os.path.join(base_path, "doc_id_mapping.joblib"))
        index_path = os.path.join(base_path, "embedding_index.faiss")

        if not os.path.exists(index_path):
            return {"status": "error", "message": "Vector store index not found"}
        index = faiss.read_index(index_path)

        processed_terms = custom_tokenizer(query_vector['text'])
        doc_ids = get_doc_ids_from_index(processed_terms, db_path)
        filtered_indices = [idx for idx, doc_id in doc_id_mapping.items() if doc_id in doc_ids]
        if not filtered_indices:
            return {"status": "error", "message": "No documents found in inverted index"}

        query_vector_tfidf = np.array(query_vector['tfidf']).reshape(1, -1)
        if query_vector_tfidf.shape[1] != tfidf_matrix.shape[1]:
            return {"status": "error", "message": "Dimension mismatch in TF-IDF"}
        filtered_tfidf_matrix = tfidf_matrix[filtered_indices]
        similarities_tfidf = cosine_similarity(query_vector_tfidf, filtered_tfidf_matrix)
        doc_indices_tfidf = np.argsort(similarities_tfidf[0])[::-1][:1000]

        query_vector_embedding = np.array(query_vector['embedding']).reshape(1, -1).astype(np.float32)
        if query_vector_embedding.shape[1] != embeddings_matrix.shape[1]:
            return {"status": "error", "message": "Dimension mismatch in embeddings"}
        distances, indices = index.search(query_vector_embedding, k=10)
        doc_indices_embedding = [idx for idx in indices[0] if idx < len(doc_mapping) and idx in filtered_indices]
        if not doc_indices_embedding:
            return {"status": "error", "message": "No valid documents found in FAISS index"}

        selected_embeddings = embeddings_matrix[doc_indices_embedding]
        similarities_embedding = cosine_similarity(query_vector_embedding, selected_embeddings)
        sorted_indices = np.argsort(similarities_embedding[0])[::-1]

        for idx in sorted_indices:
            original_idx = doc_indices_embedding[idx]
            if original_idx < tfidf_matrix.shape[0] and original_idx in doc_mapping:
                doc_id, text = doc_mapping[original_idx]
                tfidf_score = similarities_tfidf[0][filtered_indices.index(original_idx)] if original_idx in filtered_indices else 0.0
                ranked_docs.append({
                    "doc_id": doc_id,
                    "score": float(similarities_embedding[0][idx]),
                    "text": text,
                    "tfidf_score": float(tfidf_score)
                })

        if not ranked_docs:
            return {"status": "error", "message": "No valid documents found"}
        return {"status": "success", "ranked_docs": ranked_docs}
    
    except Exception as e:
        return {"status": "error", "message": str(e)}

# Batch processing
def process_hybrid_similarities_in_batches(query_tfidf, query_embeddings, tfidf_matrix, embeddings_matrix, queries_df, batch_size, doc_id_mapping, dataset_base_path):
    results = {}
    index_db_path = os.path.join(dataset_base_path, "index.db")
    total_batches = (query_tfidf.shape[0] + batch_size - 1) // batch_size
    progress_bar = tqdm(total=total_batches, desc=f"🔄 Processing queries ({dataset_base_path.split(os.sep)[-1]})", unit="batch")

    for i in range(0, query_tfidf.shape[0], batch_size):
        batch_queries_tfidf = query_tfidf[i:i + batch_size]
        batch_queries_embeddings = query_embeddings[i:i + batch_size]
        batch_texts = queries_df['text'].iloc[i:i + batch_size].tolist()
        batch_query_ids = queries_df['query_id'].iloc[i:i + batch_size].astype(str).tolist()

        for query_tfidf_vec, query_emb_vec, query_text, query_id in zip(batch_queries_tfidf, batch_queries_embeddings, batch_texts, batch_query_ids):
            query_vector = {
                'tfidf': query_tfidf_vec.toarray()[0],
                'embedding': query_emb_vec,
                'text': query_text
            }
            result = rank_documents_hybrid(query_vector, dataset_base_path.split(os.sep)[-1])
            results[query_id] = [doc['doc_id'] for doc in result['ranked_docs']] if result['status'] == 'success' else []
        
        gc.collect()
        progress_bar.update(1)
    
    progress_bar.close()
    return results

# Evaluation functions
def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    return relevant_in_top_k / k if k > 0 else 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    return relevant_in_top_k / total_relevant if total_relevant > 0 else 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            return 1.0 / i
    return 0.0

def calculate_map(retrieved_docs, relevant_docs):
    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            relevant_count += 1
            precision_sum += relevant_count / i
    return precision_sum / relevant_count if relevant_count > 0 else 0.0

def load_qrels(qrels_df):
    qrels = defaultdict(dict)
    for _, row in qrels_df.iterrows():
        qrels[str(row['query_id'])][str(row['doc_id'])] = float(row['relevance'])
    return qrels

def evaluate_results(results, qrels, queries_df, dataset_name):
    map_scores = []
    precision_scores = []
    recall_scores = []
    rr_scores = []
    for query_id in results:
        retrieved_docs = results[query_id]
        relevant_docs = qrels.get(query_id, {})
        if relevant_docs:
            map_scores.append(calculate_map(retrieved_docs, relevant_docs))
            precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
            recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
            rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))
    print(f"{dataset_name} - MAP: {np.mean(map_scores):.4f}, Precision@10: {np.mean(precision_scores):.4f}, Recall@10: {np.mean(recall_scores):.4f}, MRR: {np.mean(rr_scores):.4f}")

# Run
results_beir = process_hybrid_similarities_in_batches(
    query_tfidf_beir, query_embeddings_beir, tfidf_matrix_beir, embeddings_matrix_beir,
    queries_df_beir, BATCH_SIZE, doc_id_mapping_beir, base_path_beir
)
qrels_beir = load_qrels(pd.read_csv(qrels_path_beir))
evaluate_results(results_beir, qrels_beir, queries_df_beir, 'beir')

# Clean memory
del query_tfidf_beir, query_embeddings_beir
del tfidf_matrix_beir, embeddings_matrix_beir
del model_beir
gc.collect()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
🔄 Processing queries (beir): 100%|██████████| 63/63 [56:36<00:00, 53.91s/batch]


beir - MAP: 0.6614, Precision@10: 0.1354, Recall@10: 0.7571, MRR: 0.6867


3690

## query and hybrid

## HYBRID AND QUERY ANTIQUE

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import sqlite3
from scipy.sparse import csr_matrix
from collections import defaultdict
from math import log2
from datetime import datetime

# تعريف BATCH_SIZE
BATCH_SIZE = 16

# تحميل بيانات NLTK المطلوبة
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# تعريف دالة التوكنايزر المخصصة
stop_words = set(stopwords.words('english')) - {'not', 'no', 'never'}
lemmatizer = WordNetLemmatizer()

def custom_tokenizer(text):
    if not isinstance(text, str) or not text.strip():
        return []
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    return tokens

# دالة للحصول على doc_ids من inverted index
def get_doc_ids_from_index(processed_query, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    doc_ids = set()
    for term in processed_query:
        cursor.execute("SELECT doc_id FROM inverted_index WHERE term = ?", (term,))
        doc_ids.update(row[0] for row in cursor.fetchall())
    conn.close()
    return doc_ids

# مسارات الملفات
base_path_antique = r"data\antique"


tfidf_matrix_path_antique = os.path.join(base_path_antique, "tfidf_matrix.joblib")
enhanced_queries_path_antique = os.path.join(base_path_antique, "enhanced_queries_antique.csv")
docs_path_antique = os.path.join(base_path_antique, "docs_antique.csv")
doc_id_mapping_path_antique = os.path.join(base_path_antique, "doc_id_mapping.joblib")
embeddings_matrix_path_antique = os.path.join(base_path_antique, "embeddings_matrix.joblib")
embeddings_vectorizer_path_antique = os.path.join(base_path_antique, "embeddings_vectorizer.joblib")


qrels_path_antique = os.path.join(base_path_antique, "qrels_antique.csv")

# تحميل المستندات وإنشاء doc_id_mapping
docs_antique = pd.read_csv(docs_path_antique)

doc_id_mapping_antique = {i: str(doc_id) for i, doc_id in enumerate(docs_antique['doc_id'])}

joblib.dump(doc_id_mapping_antique, doc_id_mapping_path_antique)


# تحميل مصفوفات TF-IDF و التضمينات
tfidf_matrix_antique = joblib.load(tfidf_matrix_path_antique)

embeddings_matrix_antique = joblib.load(embeddings_matrix_path_antique)


# تحميل الاستعلامات
queries_df_antique = pd.read_csv(enhanced_queries_path_antique)

# إزالة الاستعلامات الفارغة
queries_df_antique = queries_df_antique[queries_df_antique['text'].notna() & queries_df_antique['text'].str.strip() != '']




# تحميل محولات TF-IDF
tfidf_vectorizer_antique = joblib.load(os.path.join(base_path_antique, "tfidf_vectorizer.joblib"))


# تحميل نموذج embeddings_vectorizer المخزن
model_antique = joblib.load(embeddings_vectorizer_path_antique)


query_texts_antique = queries_df_antique['text'].tolist()

query_tfidf_antique = tfidf_vectorizer_antique.transform(query_texts_antique)


queries_df_antique['processed_text'] = queries_df_antique['processed_query'].apply(lambda x: x.split())
def get_model(dataset_name):
    print(f"Loading model for {dataset_name}")
    return SentenceTransformer('all-mpnet-base-v2')

model_antique = get_model("antique")


# تحويل الاستعلامات إلى تمثيلات embeddings
def encode_in_batches(texts, model, batch_size):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding queries",bar_format='{l_bar}{bar}|'):
        batch_texts = [' '.join(text) if isinstance(text, list) else text for text in texts[i:i + batch_size]]
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=False, show_progress_bar=False)
        embeddings.append(batch_embeddings)
        gc.collect()
    return np.vstack(embeddings).astype(np.float32)


query_embeddings_antique = encode_in_batches(queries_df_antique['processed_text'].tolist(), model_antique, BATCH_SIZE)


# دالة هجينة لترتيب المستندات
def rank_documents_hybrid(query_vector, dataset_name):
    try:
        ranked_docs = []
        base_path = f"data/{dataset_name}"
        db_path = os.path.join(base_path, "index.db")
        
        # تحميل قاعدة البيانات
        if not os.path.exists(db_path):
            return {"status": "error", "message": "Database not found"}
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT doc_id, text FROM documents")
        doc_mapping = {i: (doc_id, text) for i, (doc_id, text) in enumerate(cursor.fetchall())}
        conn.close()

        # تحميل مصفوفات TF-IDF والتضمينات
        tfidf_matrix = joblib.load(os.path.join(base_path, "tfidf_matrix.joblib"))
        embeddings_matrix = joblib.load(os.path.join(base_path, "embeddings_matrix.joblib"))
        doc_id_mapping = joblib.load(os.path.join(base_path, "doc_id_mapping.joblib"))

        # معالجة استعلام TF-IDF
        processed_terms = custom_tokenizer(query_vector['text'])
        doc_ids = get_doc_ids_from_index(processed_terms, db_path)
        filtered_indices = [idx for idx, doc_id in doc_id_mapping.items() if doc_id in doc_ids]
        if not filtered_indices:
            return {"status": "error", "message": "No documents found in inverted index"}

        # حساب التشابه باستخدام TF-IDF
        query_vector_tfidf = np.array(query_vector['tfidf']).reshape(1, -1)
        if query_vector_tfidf.shape[1] != tfidf_matrix.shape[1]:
            return {"status": "error", "message": "Dimension mismatch in TF-IDF"}
        filtered_tfidf_matrix = tfidf_matrix[filtered_indices]
        similarities_tfidf = cosine_similarity(query_vector_tfidf, filtered_tfidf_matrix)
        doc_indices_tfidf = np.argsort(similarities_tfidf[0])[::-1][:1000]

        # حساب التشابه باستخدام التضمينات
        query_vector_embedding = np.array(query_vector['embedding']).reshape(1, -1)
        if query_vector_embedding.shape[1] != embeddings_matrix.shape[1]:
            return {"status": "error", "message": "Dimension mismatch in embeddings"}
        filtered_embeddings = embeddings_matrix[filtered_indices]
        similarities_embedding = cosine_similarity(query_vector_embedding, filtered_embeddings)
        doc_indices_embedding = np.argsort(similarities_embedding[0])[::-1][:10]

        # دمج النتائج
        for idx in doc_indices_embedding:
            original_idx = filtered_indices[idx]
            if original_idx < tfidf_matrix.shape[0] and original_idx in doc_mapping:
                doc_id, text = doc_mapping[original_idx]
                tfidf_score = similarities_tfidf[0][idx] if idx < len(similarities_tfidf[0]) else 0.0
                ranked_docs.append({
                    "doc_id": doc_id,
                    "score": float(similarities_embedding[0][idx]),
                    "text": text,
                    "tfidf_score": float(tfidf_score)
                })

        if not ranked_docs:
            return {"status": "error", "message": "No valid documents found"}
        
        return {"status": "success", "ranked_docs": ranked_docs}
    
    except Exception as e:
        print(f"Ranking error: {str(e)}")
        return {"status": "error", "message": str(e)}

# دالة لمعالجة الاستعلامات بشكل دفعي
def process_hybrid_similarities_in_batches(query_tfidf, query_embeddings, tfidf_matrix, embeddings_matrix, queries_df, batch_size, doc_id_mapping, dataset_base_path):
    results = {}
    index_db_path = os.path.join(dataset_base_path, "index.db")
    
    total_batches = (query_tfidf.shape[0] + batch_size - 1) // batch_size
    progress_bar = tqdm(total=total_batches, desc=f"🔄 Processing queries ({dataset_base_path.split(os.sep)[-1]})", unit="batch")

    for i in range(0, query_tfidf.shape[0], batch_size):
        batch_queries_tfidf = query_tfidf[i:i + batch_size]
        batch_queries_embeddings = query_embeddings[i:i + batch_size]
        batch_texts = queries_df['text'].iloc[i:i + batch_size].tolist()
        batch_query_ids = queries_df['query_id'].iloc[i:i + batch_size].astype(str).tolist()

        for query_tfidf_vec, query_emb_vec, query_text, query_id in zip(batch_queries_tfidf, batch_queries_embeddings, batch_texts, batch_query_ids):
            query_vector = {
                'tfidf': query_tfidf_vec.toarray()[0],
                'embedding': query_emb_vec,
                'text': query_text
            }
            result = rank_documents_hybrid(query_vector, dataset_base_path.split(os.sep)[-1])
            if result['status'] == 'success':
                results[query_id] = [doc['doc_id'] for doc in result['ranked_docs']]
            else:
                results[query_id] = []
        
        gc.collect()
        progress_bar.update(1)
    
    progress_bar.close()
    return results

# دالة لحساب المقاييس
def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    return relevant_in_top_k / k if k > 0 else 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    return relevant_in_top_k / total_relevant if total_relevant > 0 else 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            return 1.0 / i
    return 0.0


def calculate_map(retrieved_docs, relevant_docs):
    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            relevant_count += 1
            precision_sum += relevant_count / i
    return precision_sum / relevant_count if relevant_count > 0 else 0.0

def load_qrels(qrels_df):
    qrels = defaultdict(dict)
    for _, row in qrels_df.iterrows():
        qrels[str(row['query_id'])][str(row['doc_id'])] = float(row['relevance'])
    return qrels

# تقييم النتائج 
def evaluate_results(results, qrels, queries_df, dataset_name):
    map_scores = []
    precision_scores = []
    recall_scores = []
    rr_scores = []
    low_map_queries = []

    for query_id in results:
        retrieved_docs = results[query_id]
        relevant_docs = qrels.get(query_id, {})
        if relevant_docs:
            map_score = calculate_map(retrieved_docs, relevant_docs)
            map_scores.append(map_score)
            if map_score < 0.1:
                filtered_df = queries_df[queries_df['query_id'].astype(str) == str(query_id)]
                if not filtered_df.empty:
                    query_text = filtered_df['text'].iloc[0]
                    low_map_queries.append((query_id, query_text, map_score))
                else:
                    print(f"Query ID {query_id} not found in queries_df for {dataset_name}")
            precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
            recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
            rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))

    map_score = np.mean(map_scores) if map_scores else 0.0
    precision_score = np.mean(precision_scores) if precision_scores else 0.0
    recall_score = np.mean(recall_scores) if recall_scores else 0.0
    mrr_score = np.mean(rr_scores) if rr_scores else 0.0

    print(f"{dataset_name} - MAP: {map_score:.4f}, Precision@10: {precision_score:.4f}, Recall@10: {recall_score:.4f}, MRR: {mrr_score:.4f}")

# استدعاء الدالة لهجين ANTIQUE
results_antique = process_hybrid_similarities_in_batches(
    query_tfidf_antique, query_embeddings_antique, tfidf_matrix_antique, embeddings_matrix_antique,
    queries_df_antique, BATCH_SIZE, doc_id_mapping_antique, base_path_antique
)



# تحميل qrels
qrels_antique = load_qrels(pd.read_csv(qrels_path_antique))


# تقييم النتائج
evaluate_results(results_antique, qrels_antique, queries_df_antique, 'antique')





# تحرير الذاكرة
del query_tfidf_antique,query_embeddings_antique, 
del tfidf_matrix_antique, embeddings_matrix_antique, 
del model_antique, 
gc.collect()   

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading model for antique


Encoding queries: 100%|██████████|
🔄 Processing queries (antique): 100%|██████████| 5/5 [03:32<00:00, 42.60s/batch]


antique - MAP: 0.4024, Precision@10: 0.1960, Recall@10: 0.0648, MRR: 0.4627


3487

## HYBRID AND QUERY BEIR

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import sqlite3
from scipy.sparse import csr_matrix
from collections import defaultdict
from math import log2
from datetime import datetime

BATCH_SIZE = 128

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english')) - {'not', 'no', 'never'}
lemmatizer = WordNetLemmatizer()

def custom_tokenizer(text):
    if not isinstance(text, str) or not text.strip():
        return []
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    return tokens

def get_doc_ids_from_index(processed_query, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    doc_ids = set()
    for term in processed_query:
        cursor.execute("SELECT doc_id FROM inverted_index WHERE term = ?", (term,))
        doc_ids.update(row[0] for row in cursor.fetchall())
    conn.close()
    return doc_ids

base_path_beir = r"data\beir"

tfidf_matrix_path_beir = os.path.join(base_path_beir, "tfidf_matrix.joblib")
enhanced_queries_path_beir = os.path.join(base_path_beir, "enhanced_queries_beir.csv")
docs_path_beir = os.path.join(base_path_beir, "docs_beir.csv")
doc_id_mapping_path_beir = os.path.join(base_path_beir, "doc_id_mapping.joblib")
embeddings_matrix_path_beir = os.path.join(base_path_beir, "embeddings_matrix.joblib")
embeddings_vectorizer_path_beir = os.path.join(base_path_beir, "embeddings_vectorizer.joblib")
qrels_path_beir = os.path.join(base_path_beir, "qrels_beir.csv")

docs_beir = pd.read_csv(docs_path_beir)
doc_id_mapping_beir = {i: str(doc_id) for i, doc_id in enumerate(docs_beir['doc_id'])}
joblib.dump(doc_id_mapping_beir, doc_id_mapping_path_beir)

tfidf_matrix_beir = joblib.load(tfidf_matrix_path_beir)
embeddings_matrix_beir = joblib.load(embeddings_matrix_path_beir)

queries_df_beir = pd.read_csv(enhanced_queries_path_beir)
queries_df_beir = queries_df_beir[queries_df_beir['text'].notna() & queries_df_beir['text'].str.strip() != '']

tfidf_vectorizer_beir = joblib.load(os.path.join(base_path_beir, "tfidf_vectorizer.joblib"))
model_beir = joblib.load(embeddings_vectorizer_path_beir)

query_texts_beir = queries_df_beir['text'].tolist()
query_tfidf_beir = tfidf_vectorizer_beir.transform(query_texts_beir)
queries_df_beir['processed_text'] = queries_df_beir['processed_query'].apply(
    lambda x: x.split() if isinstance(x, str) else []
)


def get_model(dataset_name):
    print(f"Loading model for {dataset_name}")
    return SentenceTransformer('all-mpnet-base-v2')

model_beir = get_model("beir")

def encode_in_batches(texts, model, batch_size):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding queries", bar_format='{l_bar}{bar}|'):
        batch_texts = [' '.join(text) if isinstance(text, list) else text for text in texts[i:i + batch_size]]
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=False, show_progress_bar=False)
        embeddings.append(batch_embeddings)
        gc.collect()
    return np.vstack(embeddings).astype(np.float32)

query_embeddings_beir = encode_in_batches(queries_df_beir['processed_text'].tolist(), model_beir, BATCH_SIZE)

def rank_documents_hybrid(query_vector, dataset_name):
    try:
        ranked_docs = []
        base_path = f"data/{dataset_name}"
        db_path = os.path.join(base_path, "index.db")
        if not os.path.exists(db_path):
            return {"status": "error", "message": "Database not found"}
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT doc_id, text FROM documents")
        doc_mapping = {i: (doc_id, text) for i, (doc_id, text) in enumerate(cursor.fetchall())}
        conn.close()

        tfidf_matrix = joblib.load(os.path.join(base_path, "tfidf_matrix.joblib"))
        embeddings_matrix = joblib.load(os.path.join(base_path, "embeddings_matrix.joblib"))
        doc_id_mapping = joblib.load(os.path.join(base_path, "doc_id_mapping.joblib"))

        processed_terms = custom_tokenizer(query_vector['text'])
        doc_ids = get_doc_ids_from_index(processed_terms, db_path)
        filtered_indices = [idx for idx, doc_id in doc_id_mapping.items() if doc_id in doc_ids]
        if not filtered_indices:
            return {"status": "error", "message": "No documents found in inverted index"}

        query_vector_tfidf = np.array(query_vector['tfidf']).reshape(1, -1)
        if query_vector_tfidf.shape[1] != tfidf_matrix.shape[1]:
            return {"status": "error", "message": "Dimension mismatch in TF-IDF"}
        filtered_tfidf_matrix = tfidf_matrix[filtered_indices]
        similarities_tfidf = cosine_similarity(query_vector_tfidf, filtered_tfidf_matrix)
        doc_indices_tfidf = np.argsort(similarities_tfidf[0])[::-1][:1000]

        query_vector_embedding = np.array(query_vector['embedding']).reshape(1, -1)
        if query_vector_embedding.shape[1] != embeddings_matrix.shape[1]:
            return {"status": "error", "message": "Dimension mismatch in embeddings"}
        filtered_embeddings = embeddings_matrix[filtered_indices]
        similarities_embedding = cosine_similarity(query_vector_embedding, filtered_embeddings)
        doc_indices_embedding = np.argsort(similarities_embedding[0])[::-1][:10]

        for idx in doc_indices_embedding:
            original_idx = filtered_indices[idx]
            if original_idx < tfidf_matrix.shape[0] and original_idx in doc_mapping:
                doc_id, text = doc_mapping[original_idx]
                tfidf_score = similarities_tfidf[0][idx] if idx < len(similarities_tfidf[0]) else 0.0
                ranked_docs.append({
                    "doc_id": doc_id,
                    "score": float(similarities_embedding[0][idx]),
                    "text": text,
                    "tfidf_score": float(tfidf_score)
                })

        if not ranked_docs:
            return {"status": "error", "message": "No valid documents found"}
        
        return {"status": "success", "ranked_docs": ranked_docs}
    
    except Exception as e:
        print(f"Ranking error: {str(e)}")
        return {"status": "error", "message": str(e)}

def process_hybrid_similarities_in_batches(query_tfidf, query_embeddings, tfidf_matrix, embeddings_matrix, queries_df, batch_size, doc_id_mapping, dataset_base_path):
    results = {}
    index_db_path = os.path.join(dataset_base_path, "index.db")
    
    total_batches = (query_tfidf.shape[0] + batch_size - 1) // batch_size
    progress_bar = tqdm(total=total_batches, desc=f"🔄 Processing queries ({dataset_base_path.split(os.sep)[-1]})", bar_format="{l_bar}{bar}")

    for i in range(0, query_tfidf.shape[0], batch_size):
        batch_queries_tfidf = query_tfidf[i:i + batch_size]
        batch_queries_embeddings = query_embeddings[i:i + batch_size]
        batch_texts = queries_df['text'].iloc[i:i + batch_size].tolist()
        batch_query_ids = queries_df['query_id'].iloc[i:i + batch_size].astype(str).tolist()

        for query_tfidf_vec, query_emb_vec, query_text, query_id in zip(batch_queries_tfidf, batch_queries_embeddings, batch_texts, batch_query_ids):
            query_vector = {
                'tfidf': query_tfidf_vec.toarray()[0],
                'embedding': query_emb_vec,
                'text': query_text
            }
            result = rank_documents_hybrid(query_vector, dataset_base_path.split(os.sep)[-1])
            if result['status'] == 'success':
                results[query_id] = [doc['doc_id'] for doc in result['ranked_docs']]
            else:
                results[query_id] = []
        
        gc.collect()
        progress_bar.update(1)
    
    progress_bar.close()
    return results

def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    return relevant_in_top_k / k if k > 0 else 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    return relevant_in_top_k / total_relevant if total_relevant > 0 else 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            return 1.0 / i
    return 0.0

def calculate_map(retrieved_docs, relevant_docs):
    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            relevant_count += 1
            precision_sum += relevant_count / i
    return precision_sum / relevant_count if relevant_count > 0 else 0.0

def load_qrels(qrels_df):
    qrels = defaultdict(dict)
    for _, row in qrels_df.iterrows():
        qrels[str(row['query_id'])][str(row['doc_id'])] = float(row['relevance'])
    return qrels

def evaluate_results(results, qrels, queries_df, dataset_name):
    map_scores = []
    precision_scores = []
    recall_scores = []
    rr_scores = []
    low_map_queries = []

    for query_id in results:
        retrieved_docs = results[query_id]
        relevant_docs = qrels.get(query_id, {})
        if relevant_docs:
            map_score = calculate_map(retrieved_docs, relevant_docs)
            map_scores.append(map_score)
            if map_score < 0.1:
                filtered_df = queries_df[queries_df['query_id'].astype(str) == str(query_id)]
                if not filtered_df.empty:
                    query_text = filtered_df['text'].iloc[0]
                    low_map_queries.append((query_id, query_text, map_score))
                else:
                    print(f"Query ID {query_id} not found in queries_df for {dataset_name}")
            precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
            recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
            rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))

    map_score = np.mean(map_scores) if map_scores else 0.0
    precision_score = np.mean(precision_scores) if precision_scores else 0.0
    recall_score = np.mean(recall_scores) if recall_scores else 0.0
    mrr_score = np.mean(rr_scores) if rr_scores else 0.0

    print(f"{dataset_name} - MAP: {map_score:.4f}, Precision@10: {precision_score:.4f}, Recall@10: {recall_score:.4f}, MRR: {mrr_score:.4f}")

results_beir = process_hybrid_similarities_in_batches(
    query_tfidf_beir, query_embeddings_beir, tfidf_matrix_beir, embeddings_matrix_beir,
    queries_df_beir, BATCH_SIZE, doc_id_mapping_beir, base_path_beir
)

qrels_beir = load_qrels(pd.read_csv(qrels_path_beir))

evaluate_results(results_beir, qrels_beir, queries_df_beir, 'beir')

del query_tfidf_beir, query_embeddings_beir
del tfidf_matrix_beir, embeddings_matrix_beir
del model_beir
gc.collect()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading model for beir


Encoding queries: 100%|██████████|
🔄 Processing queries (beir): 100%|██████████


beir - MAP: 0.5291, Precision@10: 0.1333, Recall@10: 0.2540, MRR: 0.6667


3487