## Sparse Retrieval
Implementation of sparse passage retrieval using TF-IDF and BM25. Evaluated on the MS MARCO dataset using MRR and retrieval time.

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
import numpy as np
import time
from ranx import Run, Qrels, evaluate

## Load Dataset

In [1]:
from src.dataset_loader import DatasetLoader

# Load cranfield dataset for testing functions, use MS MARCO, HotpotQA, and potentially Climate-FEVER for real evaluation
loader = DatasetLoader("cranfield")
docs, queries, qrels = loader.get_all()
loader.print_info()

# # load multiple datasets for evaluation
# nameset = ["beir/msmarco/dev", "bier/hotpotqa/dev", "bier/climate-fever/dev"]
#
# # Dictionary to hold datasets
# datasets = {}
#
# for name in nameset:
#     loader = DatasetLoader(name)
#     docs, queries, qrels = loader.get_all()
#     datasets[name] = {
#         "docs": docs,
#         "queries": queries,
#         "qrels": qrels
#     }
#     loader.print_info()


DATASET: cranfield
DOCS (1400): ('1', 'experimental investigation of the aerodynamics of a\nwing in a slipstream .\n  an experimental study of a wing in a propeller slipstream was\nmade in order to determine the spanwise distribution of the lift\nincrease due to slipstream at different angles of attack of the wing\nand at different free stream to slipstream velocity ratios .  the\nresults were intended in part as an evaluation basis for different\ntheoretical treatments of this problem .\n  the comparative span loading curves, together with\nsupporting evidence, showed that a substantial part of the lift increment\nproduced by the slipstream was due to a /destalling/ or\nboundary-layer-control effect .  the integrated remaining lift\nincrement, after subtracting this destalling lift, was found to agree\nwell with a potential flow theory .\n  an empirical evaluation of the destalling effects was made for\nthe specific configuration of the experiment .') 

QUERIES (225): ('1', 'what simi

## Data Preprocessing

In [3]:
# extract document and query IDs + texts
doc_ids, doc_texts = list(docs.keys()), list(docs.values())
query_ids, query_texts = list(queries.keys()), list(queries.values())

## Build Retriever

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

# TF-IDF
tfidf = TfidfVectorizer(lowercase=True, stop_words="english")
tfidf_matrix = tfidf.fit_transform(doc_texts)
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
#print(similarity_matrix[0][:10]) # similarity scores for the first document

# # BM25
# Use the same preprocessor and tokenizer as TF-IDF
tokenized_docs = [tfidf.build_tokenizer()(tfidf.build_preprocessor()(doc)) for doc in doc_texts]
bm25 = BM25Okapi(tokenized_docs)
print(bm25.get_scores(tokenized_docs[0])) # BM25 scores for the first document


[455.6084743  128.73215535  58.3061003  ... 119.11721915 101.34736483
 115.11643683]


## Evaluation

In [14]:
import numpy as np

tfidf_results = {}
for i in range(len(doc_texts)):
    query_id = f"{i}"
    ranked_docs = {}
    # Get similarity scores and sort in descending order
    similarity_scores = similarity_matrix[i]
    sorted_indices = np.argsort(similarity_scores)[::-1]
    for rank, doc_index in enumerate(sorted_indices):
        if doc_index != i:  # Exclude the query document itself
            ranked_docs[str(doc_index)] = similarity_scores[doc_index]
    tfidf_results[query_id] = ranked_docs

print("TF-IDF Results (ranx format):")
print(tfidf_results)

# TO-DO: get BM25 results

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



### Retrieval Time

In [None]:
import time

# get retrieval time for tfidf, do the same for BM25
k = 10
start_time = time.time()
# do retrieval here
retrieval_time = (time.time() - start_time) / len(query_ids)
print(f"Retrieval time per query: {retrieval_time:.4f} seconds")

### Mean Reciprocal Rank (MRR)

In [None]:
from ranx import Run, evaluate

# calculate MRR

# Run: stores the relevance scores estimated by the model under evaluation
# map results for each query_id -> { doc_id: score }
run = {
    # TO-DO: edit this to use BM25 results
    query_ids[i]: {
        doc_ids[knn[i][j]]: -float(distances[i][j]) for j in range(k) # use negative distance as ranx interprets higher score = higher rank
    }
    for i in range(len(query_ids))
}
run_rx = Run(run)

# measure MRR
mrr = evaluate(qrels, run_rx, "mrr", make_comparable=True)
print(f"MRR: {mrr:.4f}")
