In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, average_precision_score
from gensim.models import Word2Vec
import joblib
from Data_Processing_Antique import data_processing_antique

print(" Download cleaned_antique.csv ...")
docs_df = pd.read_csv("cleaned_antique.csv").dropna(subset=["cleaned_text"]).reset_index(drop=True)

queries_df = pd.read_csv("antique_queries_train.csv").dropna(subset=["text"])

qrels_df = pd.read_csv("antique_qrels_train.csv")

print("Download qrels...")
qrels_dict = {}
for _, row in qrels_df.iterrows():
    qid, did = row["query_id"], row["doc_id"]
    qrels_dict.setdefault(qid, set()).add(did)

print("Download Word2Vec...")
doc_vectors = joblib.load("antique_doc_vectors_2000.joblib")

doc_ids = joblib.load("antique_doc_ids.joblib")

model = Word2Vec.load("word2vec_antique_200.model")

def get_w2v_vector(text):
    words = text.split()
    vecs = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

print("start evaluation.")
map_scores, mrr_scores, precision_scores, recall_scores = [], [], [], []

for _, query_row in queries_df.iterrows():
    qid = query_row["query_id"]
    query_text = data_processing_antique(query_row["text"])
    query_vector = get_w2v_vector(query_text).reshape(1, -1)

    # حساب التشابه
    sims = cosine_similarity(doc_vectors, query_vector).flatten()

    # أفضل top_k
    top_k = 10
    top_indices = sims.argsort()[-top_k:][::-1]
    top_doc_ids = [doc_ids[i] for i in top_indices]

    # القيم الحقيقية والمتوقعة
    relevant_docs = qrels_dict.get(qid, set())
    y_true = [1 if doc_id in relevant_docs else 0 for doc_id in top_doc_ids]
    y_scores = sims[top_indices]

    if sum(y_true) == 0:
        continue

    map_scores.append(average_precision_score(y_true, y_scores))
    precision_scores.append(precision_score(y_true, [1]*len(y_true)))
    recall_scores.append(recall_score(y_true, [1]*len(y_true)))
    mrr_scores.append(1 / (np.where(np.array(y_true) == 1)[0][0] + 1))

print("\n النتائج النهائية للتقييم:")
print(f" MAP  : {np.mean(map_scores):.4f}")
print(f" MRR  : {np.mean(mrr_scores):.4f}")
print(f" P@10 : {np.mean(precision_scores):.4f}")
print(f" R@10 : {np.mean(recall_scores):.4f}")


 Download cleaned_antique.csv ...
Download qrels...
Download Word2Vec...
start evaluation.

 النتائج النهائية للتقييم:
 MAP  : 0.5556
 MRR  : 0.6071
 P@10 : 0.1901
 R@10 : 1.0000
