# Imports

In [31]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import ndcg_score, precision_score
from sklearn.preprocessing import LabelBinarizer

# Load data

In [2]:
BM25_results = {filename.split(".")[0]: pd.read_csv(f"data/BM25_results/{filename}", index_col=0, header=0) for filename in os.listdir("data/BM25_results/")}

In [3]:
BM25_results["1"].head()

Unnamed: 0,doc_id,score,query_id,summary,detailed_description,unknown,relevance,bm25,bm25Plus,bm25L
0,NCT00003176,121.18297,1,\n \n RATIONALE: Drugs used in chemoth...,\n \n OBJECTIVES: I. Evaluate the acti...,0.0,1.0,178.913627,366.642656,301.413865
1,NCT03633552,118.12887,1,"\n \n This is a phase III, non-blinded...",\n \n This study aimed to compare the ...,0.0,1.0,191.140627,375.016619,163.067224
2,NCT00968240,115.93323,1,\n \n The high-grade malignant brain t...,\n \n The current standard of care for...,0.0,1.0,185.792705,364.606073,306.646563
3,NCT03896568,109.383545,1,\n \n This phase I trial studies best ...,\n \n PRIMARY OBJECTIVES:\r\n\r\n ...,0.0,1.0,167.600015,353.394314,165.924104
4,NCT02942264,109.16138,1,\n \n Background:\r\n\r\n Zotirac...,\n \n Background:\r\n\r\n - Zo...,0.0,1.0,177.752863,357.818129,230.682298


# Calculate NDCG@5 and NDCG@10

In [41]:
def get_ndcg_score(y_true, y_score, k):

    return ndcg_score(y_true, y_score, k=k)

    
metrics = {}

for query_id, query_results in BM25_results.items():

    metrics[query_id] = {
                            "ndcg@5": {},
                            "ndcg@10": {}
                        }

    query_results["relevance"].mask(query_results["relevance"] == 2, 1, inplace=True)

    y_true = np.asarray([query_results["relevance"].fillna(0).tolist()])
    
    for variant in ["bm25", "bm25Plus", "bm25L"]:

        y_score = np.asarray([query_results[["relevance", variant]].sort_values(by=variant, ascending=False)["relevance"].fillna(0).tolist()])
        
        metrics[query_id]["ndcg@5"][variant] = get_ndcg_score(y_true, y_score, 5)
        metrics[query_id]["ndcg@10"][variant] = get_ndcg_score(y_true, y_score, 10)

{'ndcg@5': {'bm25': 0.04819277108433735,
  'bm25Plus': 0.18072289156626503,
  'bm25L': 0.036144578313253004},
 'ndcg@10': {'bm25': 0.04819277108433735,
  'bm25Plus': 0.18072289156626506,
  'bm25L': 0.03614457831325301}}