# Clinical and CORD-19 Retrieval Evaluation
This notebook defines `evaluate_dataset`, which runs every query of the clinical-trials or CORD-19 retrieval system in TF-IDF, Word2Vec, or hybrid mode and reports MAP, mean recall, and MRR over the top-N results.

In [2]:
from tqdm import tqdm
from clinical_trials_retrieval import ClinicalTrialsRetrieval
from cord19_retrieval import Cord19Retrieval

In [3]:

def evaluate_dataset(retrieval_system, mode="tfidf", top_n=10, use_topic_filter=False):
    """Run every query of a retrieval system's dataset and report mean metrics.

    Each query from ``retrieval_system.dataset.queries_iter()`` is searched with
    the strategy selected by ``mode``; the returned document ids are scored by
    ``retrieval_system.evaluate`` and the per-query scores are averaged.

    Args:
        retrieval_system: Object exposing ``dataset.queries_iter()``,
            ``search``, ``search_word2vec``, ``search_hybrid`` and
            ``evaluate(query_id, doc_ids)``. Only the search method for the
            selected ``mode`` is actually used.
        mode: One of ``"tfidf"``, ``"word2vec"`` or ``"hybrid"``.
        top_n: Number of results requested from the search method per query.
        use_topic_filter: Forwarded to ``search_hybrid``; ignored by the
            other modes.

    Returns:
        dict: ``{"map", "mean_recall", "mrr", "num_queries"}``. All three
        scores are ``0.0`` when the dataset yields no queries.

    Raises:
        ValueError: If ``mode`` is not one of the supported values. Raised
            before any query is processed.
    """
    # Resolve the search strategy once, up front, so a bad mode fails
    # immediately instead of only when the first query is reached.
    if mode == "tfidf":
        def run_search(text):
            return retrieval_system.search(text, top_n)
    elif mode == "word2vec":
        def run_search(text):
            return retrieval_system.search_word2vec(text, top_n)
    elif mode == "hybrid":
        def run_search(text):
            return retrieval_system.search_hybrid(text, top_n, use_topic_filter)
    else:
        raise ValueError("Invalid mode selected.")

    def mean(values):
        # An empty dataset averages to 0.0 rather than raising ZeroDivisionError.
        return sum(values) / len(values) if values else 0.0

    ap_list = []
    recall_list = []
    rr_list = []

    for query in tqdm(retrieval_system.dataset.queries_iter()):
        # Some query types carry a single `text` field; others are assembled
        # from `title` and `description`.
        query_text = getattr(query, "text", None) or f"{query.title} {query.description}"

        results = run_search(query_text)
        doc_ids = [doc["doc_id"] for doc in results]
        eval_result = retrieval_system.evaluate(query.query_id, doc_ids)

        ap_list.append(eval_result["average_precision"])
        recall_list.append(eval_result["recall"])
        rr_list.append(eval_result["reciprocal_rank"])

    map_score = mean(ap_list)
    mean_recall = mean(recall_list)
    mrr = mean(rr_list)

    print(f"\n=== Evaluation Results ({retrieval_system.__class__.__name__}) ===")
    print(f"Mode: {mode}")
    print(f"Queries evaluated: {len(ap_list)}")
    print(f"MAP: {map_score:.4f}")
    print(f"Mean Recall: {mean_recall:.4f}")
    print(f"MRR: {mrr:.4f}")

    # Returned in addition to being printed so runs can be compared in code.
    return {
        "map": map_score,
        "mean_recall": mean_recall,
        "mrr": mrr,
        "num_queries": len(ap_list),
    }


Instantiate systems

In [4]:
# Build one retrieval system per collection. Judging by this cell's output,
# construction also checks NLTK resources and emits scikit-learn
# model-persistence warnings (presumably from loading saved models; TODO confirm).
clinical = ClinicalTrialsRetrieval()
cord19 = Cord19Retrieval()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\olexd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\olexd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\olexd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\olexd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\olexd\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package 

Run evaluations

In [5]:
# Evaluate both collections in hybrid mode on the top 10 results per query.
# NOTE(review): the reported MAP/recall are far lower than MRR. Presumably
# the top_n=10 cutoff caps how many relevant documents can be retrieved per
# query; confirm against the evaluate() implementations.
evaluate_dataset(clinical, mode="hybrid", top_n=10)
evaluate_dataset(cord19, mode="hybrid", top_n=10)

# Alternative runs, left disabled: Word2Vec-only on the clinical collection,
# and hybrid with the topic filter enabled on CORD-19.
# evaluate_dataset(clinical, mode="word2vec", top_n=10)
# evaluate_dataset(cord19, mode="hybrid", top_n=10, use_topic_filter=True)

75it [02:57,  2.36s/it]



=== Evaluation Results (ClinicalTrialsRetrieval) ===
Mode: hybrid
Queries evaluated: 75
MAP: 0.0303
Mean Recall: 0.0412
MRR: 0.6503


50it [00:46,  1.07it/s]


=== Evaluation Results (Cord19Retrieval) ===
Mode: hybrid
Queries evaluated: 50
MAP: 0.0097
Mean Recall: 0.0123
MRR: 0.6721



