In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, average_precision_score
from gensim.models import Word2Vec
import joblib
from Data_Processing_Quora import data_processing_quora  

print(" download docs")
docs_df = pd.read_csv("cleaned_quora.csv").dropna(subset=["cleaned_text"]).reset_index(drop=True)

print("download queries")
queries_df = pd.read_csv("queries.tsv", sep='\t').dropna(subset=["text"])

print("download qrels...")
qrels_df = pd.read_csv("qrels.tsv", sep='\t')

qrels_dict = {}
for _, row in qrels_df.iterrows():
    qid, did = row["query_id"], row["doc_id"]
    qrels_dict.setdefault(qid, set()).add(did)

print("download docs representation ")
doc_vectors = joblib.load("quora_doc_vectors.joblib")
doc_ids = docs_df["doc_id"].tolist()

print("download Word2Vec model...")
model = Word2Vec.load("word2vec_quora.model")

def get_w2v_vector(text):
    words = text.split()
    vecs = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

print("start evaluation")
map_scores, mrr_scores, precision_scores, recall_scores = [], [], [], []

for _, query_row in queries_df.iterrows():
    qid = query_row["query_id"]
    raw_text = query_row["text"]
    
    query_text = data_processing_quora(raw_text)
    query_vector = get_w2v_vector(query_text).reshape(1, -1)

    sims = cosine_similarity(doc_vectors, query_vector).flatten()
    top_k = 10
    top_indices = sims.argsort()[-top_k:][::-1]
    top_doc_ids = [doc_ids[i] for i in top_indices]

    relevant_docs = qrels_dict.get(qid, set())
    y_true = [1 if doc_id in relevant_docs else 0 for doc_id in top_doc_ids]
    y_scores = sims[top_indices]

    if sum(y_true) == 0:
        continue

    map_scores.append(average_precision_score(y_true, y_scores))
    precision_scores.append(precision_score(y_true, [1]*len(y_true)))
    recall_scores.append(recall_score(y_true, [1]*len(y_true)))
    mrr_scores.append(1 / (np.where(np.array(y_true) == 1)[0][0] + 1))

print("\n final results:")
print(f"🔹 MAP  : {np.mean(map_scores):.4f}")
print(f"🔹 MRR  : {np.mean(mrr_scores):.4f}")
print(f"🔹 P@10 : {np.mean(precision_scores):.4f}")
print(f"🔹 R@10 : {np.mean(recall_scores):.4f}")


 download docs
download queries
download qrels...
download docs representation 
download Word2Vec model...
start evaluation

 final results:
🔹 MAP  : 0.7504
🔹 MRR  : 0.7927
🔹 P@10 : 0.1234
🔹 R@10 : 1.0000
