# Imports

In [1]:
import os
import pandas as pd
import ir_datasets

from rank_bm25 import BM25Okapi, BM25Plus, BM25L
from sklearn.metrics import ndcg_score, precision_score

# Load data

## Queries

In [2]:
queries_df = pd.read_csv("data/queries_2021.tsv", sep="\t", header=None, names=['id', 'query'])

queries_df.head()

Unnamed: 0,id,query
0,1,Patient is a 45-year-old man with a history of...
1,2,"48 M with a h/o HTN hyperlipidemia, bicuspid a..."
2,3,A 32 yo woman who presents following a severe ...
3,4,"This is a 44 year old female with PMH of PCOS,..."
4,5,"74M hx of CAD s/p CABG, EF 60% prior CVA (no r..."


## Document relevance for query

In [3]:
query_relevance_df = pd.read_csv("data/qrels2021.txt", sep=" ", header=None, names = ["query_id", "unknown", "doc_id", "relevance"])
query_relevance_df["doc_id"] = query_relevance_df["doc_id"]
query_relevance_df.fillna(0, inplace=True)

query_relevance_df.head()

Unnamed: 0,query_id,unknown,doc_id,relevance
0,1,0,NCT00002569,1
1,1,0,NCT00002620,1
2,1,0,NCT00002806,0
3,1,0,NCT00002814,2
4,1,0,NCT00003022,1


## Full dataset

In [4]:
dataset = ir_datasets.load("clinicaltrials/2021")

dataset_df = pd.DataFrame.from_records([(document.doc_id, document.summary, document.detailed_description) for document in dataset.docs_iter()], columns=["doc_id", "summary", "detailed_description"])
    
dataset_df.head()

Unnamed: 0,doc_id,summary,detailed_description
0,NCT00000102,\n \n This study will test the ability...,\n \n This protocol is designed to ass...
1,NCT00000104,\n \n Inner city children are at an in...,
2,NCT00000105,\n \n The purpose of this study is to ...,\n \n Patients will receive each vacci...
3,NCT00000106,\n \n Recently a non-toxic system for ...,
4,NCT00000107,\n \n Adults with cyanotic congenital ...,


## Query results

In [7]:
ES_query_results = {}

for filename in os.listdir("data/ES_outputs"):

    query_id = str(filename.split(".")[0])

    ES_query_results[query_id] = pd.read_csv(f"data/ES_outputs/{filename}", header=None, names=['doc_id', 'score'])
    ES_query_results[query_id]["query_id"] = query_id
    ES_query_results[query_id]["query_id"] = ES_query_results[query_id]["query_id"].astype(int)

ES_query_results["1"].head()

Unnamed: 0,doc_id,score,query_id
0,NCT00003176,121.18297,1
1,NCT03633552,118.12887,1
2,NCT00968240,115.93323,1
3,NCT03896568,109.383545,1
4,NCT02942264,109.16138,1


## Join dataset to query results and relevance

In [8]:
for query_id, query_results in ES_query_results.items():
    
    ES_query_results[query_id] = ES_query_results[query_id].merge(dataset_df, how="left", on="doc_id")
    ES_query_results[query_id] = ES_query_results[query_id].merge(query_relevance_df, how="left", on=["doc_id", "query_id"])

ES_query_results["1"].head()

Unnamed: 0,doc_id,score,query_id,summary,detailed_description,unknown,relevance
0,NCT00003176,121.18297,1,\n \n RATIONALE: Drugs used in chemoth...,\n \n OBJECTIVES: I. Evaluate the acti...,0.0,1.0
1,NCT03633552,118.12887,1,"\n \n This is a phase III, non-blinded...",\n \n This study aimed to compare the ...,0.0,1.0
2,NCT00968240,115.93323,1,\n \n The high-grade malignant brain t...,\n \n The current standard of care for...,0.0,1.0
3,NCT03896568,109.383545,1,\n \n This phase I trial studies best ...,\n \n PRIMARY OBJECTIVES:\r\n\r\n ...,0.0,1.0
4,NCT02942264,109.16138,1,\n \n Background:\r\n\r\n Zotirac...,\n \n Background:\r\n\r\n - Zo...,0.0,1.0


# Generate BM25 rankings

## Index and rank BM25

In [31]:
for query_id, query_results in ES_query_results.items():
    
    print(query_id)
    bm25 = BM25Okapi([document.split(" ") for document in query_results["detailed_description"].tolist()])
    bm25Plus = BM25Plus([document.split(" ") for document in query_results["detailed_description"].tolist()])
    bm25L = BM25L([document.split(" ") for document in query_results["detailed_description"].tolist()])

    ES_query_results[query_id]["bm25"] = bm25.get_scores(queries_df[queries_df["id"] == int(query_id)]["query"].values[0].split(" "))
    ES_query_results[query_id]["bm25Plus"] = bm25Plus.get_scores(queries_df[queries_df["id"] == int(query_id)]["query"].values[0].split(" "))
    ES_query_results[query_id]["bm25L"] = bm25L.get_scores(queries_df[queries_df["id"] == int(query_id)]["query"].values[0].split(" "))

68
6
54
40
41
7
55
69
43
57
5
56
4
42
46
52
1
53
47
51
3
45
44
50
2
37
23
22
36
20
34
35
21
25
31
19
18
30
24
32
26
27
33
16
17
29
15
14
28
10
38
39
11
13
12
49
75
61
60
74
48
62
63
67
73
9
8
72
66
70
64
58
59
65
71


## Save results

In [32]:
for query_id, query_results in ES_query_results.items():

    query_results.to_csv(f"data/BM25_results/{query_id}.csv", header=True)