tfidf with inverted index

In [None]:



import os
import joblib
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gc
import sqlite3
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from collections import defaultdict
from math import log2
from datetime import datetime

# تحميل بيانات NLTK المطلوبة
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# تعريف دالة التوكنايزر المخصصة
stop_words = set(stopwords.words('english')) - {'not', 'no', 'never'}
lemmatizer = WordNetLemmatizer()

def custom_tokenizer(text):
    if not isinstance(text, str) or not text.strip():
        return []
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    return tokens

# دالة محسنة للحصول على doc_ids من inverted index باستخدام استعلام واحد
def get_doc_ids_from_index(processed_query, db_path):
    if not processed_query:
        return set()
    try:
        conn = sqlite3.connect(db_path, uri=True)
        cursor = conn.cursor()
        placeholders = ','.join('?' for _ in processed_query)
        cursor.execute(f"SELECT doc_id FROM inverted_index WHERE term IN ({placeholders})", processed_query)
        doc_ids = set(row[0] for row in cursor.fetchall())
    except sqlite3.Error as e:
        print(f"Database error: {e}") 
        doc_ids = set()
    finally:
        conn.close()
    return doc_ids

# دالة لمعالجة دفعة واحدة من الاستعلامات
def process_batch(batch_idx, batch_queries, batch_query_ids, batch_texts, doc_tfidf, doc_id_mapping, index_db_path):
    batch_results = {}
    try:
        for j, (query_vector, query_id, query_text) in enumerate(zip(batch_queries, batch_query_ids, batch_texts)):
            try:
                processed_terms = custom_tokenizer(query_text)
                doc_ids = get_doc_ids_from_index(processed_terms, index_db_path)

                if not doc_ids:
                    batch_results[query_id] = []
                    continue

                filtered_indices = [idx for idx, doc_id in doc_id_mapping.items() if doc_id in doc_ids]
                if not filtered_indices:
                    batch_results[query_id] = []
                    continue

                filtered_doc_matrix = doc_tfidf[filtered_indices]
                query_vector = query_vector.reshape(1, -1)
                similarities = cosine_similarity(query_vector, filtered_doc_matrix)
                doc_scores = list(zip(filtered_indices, similarities[0]))
                doc_scores.sort(key=lambda x: x[1], reverse=True)
                top_10_docs = [doc_id_mapping[idx] for idx, _ in doc_scores[:10]]
                batch_results[query_id] = top_10_docs

                del similarities, filtered_doc_matrix
                gc.collect()
            except Exception as e:
                print(f"Error processing query {query_id}: {e}")  # استبدال logger.error
                batch_results[query_id] = []
    except Exception as e:
        print(f"Error in batch {batch_idx}: {e}")  # استبدال logger.error
        return {}
    return batch_results

# دالة محسنة لمعالجة التشابه باستخدام المعالجة الموازية
def process_tfidf_similarities_in_batches(query_tfidf, doc_tfidf, queries_df, batch_size, doc_id_mapping, dataset_base_path):
    results = {}
    index_db_path = os.path.join(dataset_base_path, "index.db")
    batch_indices = list(range(0, query_tfidf.shape[0], batch_size))

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        for i in batch_indices:
            batch_queries = query_tfidf[i:i + batch_size]
            batch_texts = queries_df['text'].iloc[i:i + batch_size].tolist()
            batch_query_ids = queries_df['query_id'].iloc[i:i + batch_size].astype(str).tolist()
            futures.append(
                executor.submit(process_batch, i, batch_queries, batch_query_ids, batch_texts, doc_tfidf, doc_id_mapping, index_db_path)
            )
        
        for future in tqdm(futures, desc="Processing batches", bar_format="{l_bar}{bar}"):
            try:
                batch_results = future.result()
                results.update(batch_results)
            except Exception as e:
                print(f"Error in future: {e}")  # استبدال logger.error
    
    return results

# دوال لحساب المقاييس
def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    return relevant_in_top_k / k if k > 0 else 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    return relevant_in_top_k / total_relevant if total_relevant > 0 else 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            return 1.0 / i
    return 0.0


def calculate_map(retrieved_docs, relevant_docs):
    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            relevant_count += 1
            precision_sum += relevant_count / i
    return precision_sum / relevant_count if relevant_count > 0 else 0.0

def load_qrels(qrels_df):
    qrels = defaultdict(dict)
    for _, row in qrels_df.iterrows():
        qrels[str(row['query_id'])][str(row['doc_id'])] = float(row['relevance'])
    return qrels


def evaluate_results(results, qrels, queries_df, dataset_name):
    map_scores = []
    precision_scores = []
    recall_scores = []
    rr_scores = []  # قائمة لتخزين قيم RR
    low_map_queries = []

    for query_id in results:
        retrieved_docs = results[query_id]
        relevant_docs = qrels.get(query_id, {})
        if relevant_docs:
            map_score = calculate_map(retrieved_docs, relevant_docs)
            map_scores.append(map_score)
            if map_score < 0.1:
                filtered_df = queries_df[queries_df['query_id'].astype(str) == str(query_id)]
                if not filtered_df.empty:
                    query_text = filtered_df['text'].iloc[0]
                    low_map_queries.append((query_id, query_text, map_score))
                else:
                    print(f"Query ID {query_id} not found in queries_df for {dataset_name}")
            precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
            recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
            rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))  # إضافة RR

    map_score = np.mean(map_scores) if map_scores else 0.0

    precision_score = np.mean(precision_scores) if precision_scores else 0.0
    recall_score = np.mean(recall_scores) if recall_scores else 0.0
    mrr_score = np.mean(rr_scores) if rr_scores else 0.0  # حساب MRR

    print(f"{dataset_name} - MAP: {map_score:.4f}, "f"Precision@10: {precision_score:.4f}, Recall@10: {recall_score:.4f}, MRR: {mrr_score:.4f}")
    
    
    return map_score, precision_score, recall_score, mrr_score
# مسارات الملفات وقواعد البيانات
base_path_antique = r"data\antique"
base_path_beir = r"data\beir"

tfidf_matrix_path_antique = os.path.join(base_path_antique, "tfidf_matrix.joblib")
queries_path_antique = os.path.join(base_path_antique, "queries_antique.csv")
docs_path_antique = os.path.join(base_path_antique, "docs_antique.csv")
doc_id_mapping_path_antique = os.path.join(base_path_antique, "doc_id_mapping.joblib")
qrels_path_antique = os.path.join(base_path_antique, "qrels_antique.csv")

tfidf_matrix_path_beir = os.path.join(base_path_beir, "tfidf_matrix.joblib")
queries_path_beir = os.path.join(base_path_beir, "queries_beir.csv")
docs_path_beir = os.path.join(base_path_beir, "docs_beir.csv")
doc_id_mapping_path_beir = os.path.join(base_path_beir, "doc_id_mapping.joblib")
qrels_path_beir = os.path.join(base_path_beir, "qrels_beir.csv")

# تحميل المستندات لإنشاء mapping بين index و doc_id
try:
    docs_antique = pd.read_csv(docs_path_antique)
    docs_beir = pd.read_csv(docs_path_beir)
except FileNotFoundError as e:
    print(f"Error loading CSV files: {e}")  # استبدال logger.error
    raise

doc_id_mapping_antique = {i: str(doc_id) for i, doc_id in enumerate(docs_antique['doc_id'])}
doc_id_mapping_beir = {i: str(doc_id) for i, doc_id in enumerate(docs_beir['doc_id'])}

joblib.dump(doc_id_mapping_antique, doc_id_mapping_path_antique)
joblib.dump(doc_id_mapping_beir, doc_id_mapping_path_beir)

# تحميل مصفوفات TF-IDF
try:
    tfidf_matrix_antique = joblib.load(tfidf_matrix_path_antique)
    tfidf_matrix_beir = joblib.load(tfidf_matrix_path_beir)
except FileNotFoundError as e:
    print(f"Error loading TF-IDF matrices: {e}")  # استبدال logger.error
    raise

# تحميل ملفات الاستعلامات ومرجع التقييم
try:
    queries_df_antique = pd.read_csv(queries_path_antique)
    queries_df_beir = pd.read_csv(queries_path_beir)
    qrels_df_antique = pd.read_csv(qrels_path_antique)
    qrels_df_beir = pd.read_csv(qrels_path_beir)
except FileNotFoundError as e:
    print(f"Error loading CSV files: {e}")  # استبدال logger.error
    raise

# إزالة الاستعلامات الفارغة أو التي لا تحتوي نص
queries_df_antique = queries_df_antique[queries_df_antique['text'].notna() & queries_df_antique['text'].str.strip() != '']
queries_df_beir = queries_df_beir[queries_df_beir['text'].notna() & queries_df_beir['text'].str.strip() != '']


# تحميل محولات TF-IDF
try:
    tfidf_vectorizer_antique = joblib.load(os.path.join(base_path_antique, "tfidf_vectorizer.joblib"))
    tfidf_vectorizer_beir = joblib.load(os.path.join(base_path_beir, "tfidf_vectorizer.joblib"))
except FileNotFoundError as e:
    print(f"Error loading TF-IDF vectorizers: {e}")  # استبدال logger.error
    raise

# تحويل نصوص الاستعلام إلى تمثيلات TF-IDF
query_texts_antique = queries_df_antique['text'].tolist()
query_tfidf_antique = tfidf_vectorizer_antique.transform(query_texts_antique)
query_texts_beir = queries_df_beir['text'].tolist()
query_tfidf_beir = tfidf_vectorizer_beir.transform(query_texts_beir)

# حجم دفعة المعالجة
BATCH_SIZE = 16

# معالجة بيانات ANTIQUE
try:
    results_antique = process_tfidf_similarities_in_batches(
        query_tfidf_antique, tfidf_matrix_antique, queries_df_antique,
        BATCH_SIZE, doc_id_mapping_antique, base_path_antique
    )
except Exception as e:
    print(f"Error processing ANTIQUE dataset: {e}")  # استبدال logger.error
    raise

# معالجة بيانات BEIR
try:
    results_beir = process_tfidf_similarities_in_batches(
        query_tfidf_beir, tfidf_matrix_beir, queries_df_beir,
        BATCH_SIZE, doc_id_mapping_beir, base_path_beir
    )
except Exception as e:
    print(f"Error processing BEIR dataset: {e}")  # استبدال logger.error
    raise

# تحميل qrels
qrels_antique = load_qrels(qrels_df_antique)
qrels_beir = load_qrels(qrels_df_beir)

# تقييم النتائج
evaluate_results(results_antique, qrels_antique, queries_df_antique, 'antique')
evaluate_results(results_beir, qrels_beir, queries_df_beir, 'beir')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Processing batches: 100%|██████████
Processing batches: 100%|██████████


antique - MAP: 0.4816, Precision@10: 0.2645, Recall@10: 0.0894, MRR: 0.5624
beir - MAP: 0.5414, Precision@10: 0.0867, Recall@10: 0.6634, MRR: 0.5513


(np.float64(0.5413612826436129),
 np.float64(0.08673),
 np.float64(0.6633663701409881),
 np.float64(0.5513044047619047))

## TFIDF QUERY

In [None]:
import os
import joblib
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from tqdm import tqdm
import gc
import sqlite3
import numpy as np
from datetime import datetime
from collections import defaultdict

# تحميل بيانات NLTK المطلوبة
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Tokenizer مخصص
stop_words = set(stopwords.words('english')) - {'not', 'no', 'never'}
lemmatizer = WordNetLemmatizer()

def custom_tokenizer(text):
    if not isinstance(text, str) or not text.strip():
        return []
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2 and token not in ['example', 'test']]
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    return tokens

def get_doc_ids_from_index(processed_query, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    doc_ids = set()
    for term in processed_query:
        cursor.execute("SELECT doc_id FROM inverted_index WHERE term = ?", (term,))
        doc_ids.update(row[0] for row in cursor.fetchall())
    conn.close()
    return doc_ids

def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    return relevant_in_top_k / k if k > 0 else 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    return relevant_in_top_k / total_relevant if total_relevant > 0 else 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            return 1.0 / i
    return 0.0

def calculate_map(retrieved_docs, relevant_docs):
    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            relevant_count += 1
            precision_sum += relevant_count / i
    return precision_sum / relevant_count if relevant_count > 0 else 0.0

def load_qrels(qrels_df):
    qrels = defaultdict(dict)
    for _, row in qrels_df.iterrows():
        qrels[str(row['query_id'])][str(row['doc_id'])] = float(row['relevance'])
    return qrels

def process_tfidf_similarities_in_batches(queries_df, tfidf_vectorizer, doc_tfidf, batch_size, doc_id_mapping, dataset_base_path, dataset_name, enhance=False):
    results = {}
    index_db_path = os.path.join(dataset_base_path, "index.db")
    total_queries = len(queries_df)

    for i in tqdm(range(0, total_queries, batch_size), 
              desc=f"Processing {dataset_name} queries", 
              total=(total_queries + batch_size - 1) // batch_size, 
              bar_format="{l_bar}{bar} | {remaining}"):
        batch_texts = queries_df['text'].iloc[i:i + batch_size].tolist()
        batch_query_ids = queries_df['query_id'].iloc[i:i + batch_size].astype(str).tolist()

        for query_text, query_id in zip(batch_texts, batch_query_ids):
            query_to_process = query_text
            processed_terms = custom_tokenizer(query_text)

            query_vector = tfidf_vectorizer.transform([query_text])
            doc_ids = get_doc_ids_from_index(processed_terms, index_db_path)

            if not doc_ids:
                results[query_id] = []
                continue

            filtered_indices = [idx for idx, doc_id in doc_id_mapping.items() if doc_id in doc_ids]
            if not filtered_indices:
                results[query_id] = []
                continue

            filtered_doc_matrix = doc_tfidf[filtered_indices]
            similarities = cosine_similarity(query_vector, filtered_doc_matrix)
            doc_scores = list(zip(filtered_indices, similarities[0]))
            doc_scores.sort(key=lambda x: x[1], reverse=True)
            top_10_docs = [doc_id_mapping[idx] for idx, _ in doc_scores[:10]]
            results[query_id] = top_10_docs

            del similarities, filtered_doc_matrix
            gc.collect()

    return results, {}

def evaluate_results(results, qrels, queries_df, dataset_name):
    map_scores, precision_scores, recall_scores, rr_scores, low_map_queries = [], [], [], [], []

    for query_id in results:
        retrieved_docs = results[query_id]
        relevant_docs = qrels.get(query_id, {})
        if relevant_docs:
            map_score = calculate_map(retrieved_docs, relevant_docs)
            map_scores.append(map_score)
            if map_score < 0.1:
                filtered_df = queries_df[queries_df['query_id'].astype(str) == str(query_id)]
                if not filtered_df.empty:
                    query_text = filtered_df['text'].iloc[0]
                    low_map_queries.append((query_id, query_text, map_score))
            precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
            recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
            rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))

    print(f"{dataset_name} - MAP: {np.mean(map_scores):.4f}, Precision@10: {np.mean(precision_scores):.4f}, Recall@10: {np.mean(recall_scores):.4f}, MRR: {np.mean(rr_scores):.4f}")

# تحميل المسارات
base_path_antique = "data/antique"
base_path_beir = "data/beir"

queries_path_antique = os.path.join(base_path_antique, "enhanced_queries_antique.csv")
queries_path_beir = os.path.join(base_path_beir, "enhanced_queries_beir.csv")

docs_path_antique = os.path.join(base_path_antique, "docs_antique.csv")
docs_path_beir = os.path.join(base_path_beir, "docs_beir.csv")
qrels_path_antique = os.path.join(base_path_antique, "qrels_antique.csv")
qrels_path_beir = os.path.join(base_path_beir, "qrels_beir.csv")

tfidf_matrix_antique = joblib.load(os.path.join(base_path_antique, "tfidf_matrix.joblib"))
tfidf_matrix_beir = joblib.load(os.path.join(base_path_beir, "tfidf_matrix.joblib"))

tfidf_vectorizer_antique = joblib.load(os.path.join(base_path_antique, "tfidf_vectorizer.joblib"))
tfidf_vectorizer_beir = joblib.load(os.path.join(base_path_beir, "tfidf_vectorizer.joblib"))

queries_df_antique = pd.read_csv(queries_path_antique).dropna(subset=['text'])
queries_df_beir = pd.read_csv(queries_path_beir).dropna(subset=['text']).head(1000)

qrels_df_antique = pd.read_csv(qrels_path_antique)
qrels_df_beir = pd.read_csv(qrels_path_beir)

qrels_antique = load_qrels(qrels_df_antique)
qrels_beir = load_qrels(qrels_df_beir)

docs_antique = pd.read_csv(docs_path_antique)
docs_beir = pd.read_csv(docs_path_beir)

doc_id_mapping_antique = {i: str(doc_id) for i, doc_id in enumerate(docs_antique['doc_id'])}
doc_id_mapping_beir = {i: str(doc_id) for i, doc_id in enumerate(docs_beir['doc_id'])}

BATCH_SIZE = 16

results_antique, _ = process_tfidf_similarities_in_batches(
    queries_df_antique, tfidf_vectorizer_antique, tfidf_matrix_antique,
    BATCH_SIZE, doc_id_mapping_antique, base_path_antique, "antique", enhance=False
)

results_beir, _ = process_tfidf_similarities_in_batches(
    queries_df_beir, tfidf_vectorizer_beir, tfidf_matrix_beir,
    BATCH_SIZE, doc_id_mapping_beir, base_path_beir, "beir", enhance=False
)

evaluate_results(results_antique, qrels_antique, queries_df_antique, "antique")
evaluate_results(results_beir, qrels_beir, queries_df_beir, "beir")

print("\nSample results for ANTIQUE (first 2 queries):")
for query_id in list(results_antique.keys())[:2]:
    print(f"Query ID: {query_id}, Top Docs: {results_antique[query_id]}")

print("\nSample results for BEIR (first 2 queries):")
for query_id in list(results_beir.keys())[:2]:
    print(f"Query ID: {query_id}, Top Docs: {results_beir[query_id]}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Processing antique queries: 100%|██████████ | 00:00
Processing beir queries: 100%|██████████ | 00:00


antique - MAP: 0.5260, Precision@10: 0.3119, Recall@10: 0.1045, MRR: 0.5823
beir - MAP: 0.5496, Precision@10: 0.1328, Recall@10: 0.7462, MRR: 0.5712

Sample results for ANTIQUE (first 2 queries):
Query ID: 3990512, Top Docs: ['4087614_7', '4442135_2', '2960956_7', '1253368_5', '245498_1', '245498_7', '4099979_6', '4366141_8', '4009307_3', '506361_0']
Query ID: 714612, Top Docs: ['714612_0', '1136432_0', '3069999_1', '1178326_1', '817286_2', '1802263_13', '3561080_2', '1369513_10', '714612_7', '714612_1']

Sample results for BEIR (first 2 queries):
Query ID: 46, Top Docs: ['167865', '188177', '174045', '147009', '41567', '19261', '69441', '47309', '5632', '456168']
Query ID: 187, Top Docs: ['504714', '68733', '258306', '188', '171059', '68732', '144864', '360881', '58103', '225197']


In [4]:
import os
import joblib
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from tqdm import tqdm
import gc
import sqlite3
import numpy as np
from datetime import datetime
from collections import defaultdict

# تحميل بيانات NLTK المطلوبة
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Tokenizer مخصص
stop_words = set(stopwords.words('english')) - {'not', 'no', 'never'}
lemmatizer = WordNetLemmatizer()

def custom_tokenizer(text):
    if not isinstance(text, str) or not text.strip():
        return []
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2 and token not in ['example', 'test']]
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    return tokens

def get_doc_ids_from_index(processed_query, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    doc_ids = set()
    for term in processed_query:
        cursor.execute("SELECT doc_id FROM inverted_index WHERE term = ?", (term,))
        doc_ids.update(row[0] for row in cursor.fetchall())
    conn.close()
    return doc_ids

def calculate_precision_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    return relevant_in_top_k / k if k > 0 else 0.0

def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    relevant_in_top_k = sum(1 for doc_id in retrieved_docs[:k] if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0)
    total_relevant = sum(1 for score in relevant_docs.values() if float(score) > 0)
    return relevant_in_top_k / total_relevant if total_relevant > 0 else 0.0

def calculate_rr(retrieved_docs, relevant_docs):
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            return 1.0 / i
    return 0.0

def calculate_map(retrieved_docs, relevant_docs):
    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(retrieved_docs, 1):
        if str(doc_id) in relevant_docs and float(relevant_docs[str(doc_id)]) > 0:
            relevant_count += 1
            precision_sum += relevant_count / i
    return precision_sum / relevant_count if relevant_count > 0 else 0.0

def load_qrels(qrels_df):
    qrels = defaultdict(dict)
    for _, row in qrels_df.iterrows():
        qrels[str(row['query_id'])][str(row['doc_id'])] = float(row['relevance'])
    return qrels

def process_tfidf_similarities_in_batches(queries_df, tfidf_vectorizer, doc_tfidf, batch_size, doc_id_mapping, dataset_base_path, dataset_name, enhance=False):
    results = {}
    index_db_path = os.path.join(dataset_base_path, "index.db")
    total_queries = len(queries_df)

    for i in tqdm(range(0, total_queries, batch_size), 
              desc=f"Processing {dataset_name} queries", 
              total=(total_queries + batch_size - 1) // batch_size, 
              bar_format="{l_bar}{bar} | {remaining}"):
        batch_texts = queries_df['text'].iloc[i:i + batch_size].tolist()
        batch_query_ids = queries_df['query_id'].iloc[i:i + batch_size].astype(str).tolist()

        for query_text, query_id in zip(batch_texts, batch_query_ids):
            query_to_process = query_text
            processed_terms = custom_tokenizer(query_text)

            query_vector = tfidf_vectorizer.transform([query_text])
            doc_ids = get_doc_ids_from_index(processed_terms, index_db_path)

            if not doc_ids:
                results[query_id] = []
                continue

            filtered_indices = [idx for idx, doc_id in doc_id_mapping.items() if doc_id in doc_ids]
            if not filtered_indices:
                results[query_id] = []
                continue

            filtered_doc_matrix = doc_tfidf[filtered_indices]
            similarities = cosine_similarity(query_vector, filtered_doc_matrix)
            doc_scores = list(zip(filtered_indices, similarities[0]))
            doc_scores.sort(key=lambda x: x[1], reverse=True)
            top_10_docs = [doc_id_mapping[idx] for idx, _ in doc_scores[:10]]
            results[query_id] = top_10_docs

            del similarities, filtered_doc_matrix
            gc.collect()

    return results, {}

def evaluate_results(results, qrels, queries_df, dataset_name):
    map_scores, precision_scores, recall_scores, rr_scores, low_map_queries = [], [], [], [], []

    for query_id in results:
        retrieved_docs = results[query_id]
        relevant_docs = qrels.get(query_id, {})
        if relevant_docs:
            map_score = calculate_map(retrieved_docs, relevant_docs)
            map_scores.append(map_score)
            if map_score < 0.1:
                filtered_df = queries_df[queries_df['query_id'].astype(str) == str(query_id)]
                if not filtered_df.empty:
                    query_text = filtered_df['text'].iloc[0]
                    low_map_queries.append((query_id, query_text, map_score))
            precision_scores.append(calculate_precision_at_k(retrieved_docs, relevant_docs))
            recall_scores.append(calculate_recall_at_k(retrieved_docs, relevant_docs))
            rr_scores.append(calculate_rr(retrieved_docs, relevant_docs))

    print(f"{dataset_name} - MAP: {np.mean(map_scores):.4f}, Precision@10: {np.mean(precision_scores):.4f}, Recall@10: {np.mean(recall_scores):.4f}, MRR: {np.mean(rr_scores):.4f}")

# تحميل المسارات
base_path_antique = "data/antique"
base_path_beir = "data/beir"

queries_path_antique = os.path.join(base_path_antique, "enhanced_queries_antique.csv")
queries_path_beir = os.path.join(base_path_beir, "enhanced_queries_beir.csv")

docs_path_antique = os.path.join(base_path_antique, "docs_antique.csv")
docs_path_beir = os.path.join(base_path_beir, "docs_beir.csv")
qrels_path_antique = os.path.join(base_path_antique, "qrels_antique.csv")
qrels_path_beir = os.path.join(base_path_beir, "qrels_beir.csv")

tfidf_matrix_antique = joblib.load(os.path.join(base_path_antique, "tfidf_matrix.joblib"))
tfidf_matrix_beir = joblib.load(os.path.join(base_path_beir, "tfidf_matrix.joblib"))

tfidf_vectorizer_antique = joblib.load(os.path.join(base_path_antique, "tfidf_vectorizer.joblib"))
tfidf_vectorizer_beir = joblib.load(os.path.join(base_path_beir, "tfidf_vectorizer.joblib"))

queries_df_antique = pd.read_csv(queries_path_antique).dropna(subset=['text'])
queries_df_beir = pd.read_csv(queries_path_beir).dropna(subset=['text'])

qrels_df_antique = pd.read_csv(qrels_path_antique)
qrels_df_beir = pd.read_csv(qrels_path_beir)

qrels_antique = load_qrels(qrels_df_antique)
qrels_beir = load_qrels(qrels_df_beir)

docs_antique = pd.read_csv(docs_path_antique)
docs_beir = pd.read_csv(docs_path_beir)

doc_id_mapping_antique = {i: str(doc_id) for i, doc_id in enumerate(docs_antique['doc_id'])}
doc_id_mapping_beir = {i: str(doc_id) for i, doc_id in enumerate(docs_beir['doc_id'])}

BATCH_SIZE = 16

results_antique, _ = process_tfidf_similarities_in_batches(
    queries_df_antique, tfidf_vectorizer_antique, tfidf_matrix_antique,
    BATCH_SIZE, doc_id_mapping_antique, base_path_antique, "antique", enhance=False
)

results_beir, _ = process_tfidf_similarities_in_batches(
    queries_df_beir, tfidf_vectorizer_beir, tfidf_matrix_beir,
    BATCH_SIZE, doc_id_mapping_beir, base_path_beir, "beir", enhance=False
)

evaluate_results(results_antique, qrels_antique, queries_df_antique, "antique")
evaluate_results(results_beir, qrels_beir, queries_df_beir, "beir")

print("\nSample results for ANTIQUE (first 2 queries):")
for query_id in list(results_antique.keys())[:2]:
    print(f"Query ID: {query_id}, Top Docs: {results_antique[query_id]}")

print("\nSample results for BEIR (first 2 queries):")
for query_id in list(results_beir.keys())[:2]:
    print(f"Query ID: {query_id}, Top Docs: {results_beir[query_id]}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Processing antique queries: 100%|██████████ | 00:00
Processing beir queries: 100%|██████████ | 00:00


antique - MAP: 0.5260, Precision@10: 0.3119, Recall@10: 0.1045, MRR: 0.5823
beir - MAP: 0.4949, Precision@10: 0.0946, Recall@10: 0.7102, MRR: 0.5049

Sample results for ANTIQUE (first 2 queries):
Query ID: 3990512, Top Docs: ['4087614_7', '4442135_2', '2960956_7', '1253368_5', '245498_1', '245498_7', '4099979_6', '4366141_8', '4009307_3', '506361_0']
Query ID: 714612, Top Docs: ['714612_0', '1136432_0', '3069999_1', '1178326_1', '817286_2', '1802263_13', '3561080_2', '1369513_10', '714612_7', '714612_1']

Sample results for BEIR (first 2 queries):
Query ID: 46, Top Docs: ['300076', '19261', '53617', '75216', '104769', '48650', '301936', '353907', '93153', '296385']
Query ID: 187, Top Docs: ['68732', '144864', '360881', '4789', '202157', '68733', '4788', '346186', '202765', '288149']
