INFORMATION RETRIEVAL

ASSIGNMENT-6

PRANSHU PARATE

202211063

In [33]:
#Installing pyterrier library
! pip install python-terrier



In [34]:
#Importing packages
import pandas as pd
import numpy as np
import pyterrier as pt
import ir_measures
from ir_measures import *
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [35]:
#Initialize
if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

In [36]:
#Documents
document = pt.get_dataset("vaswani")

In [37]:
document

RemoteDataset for vaswani, with ['corpus', 'topics', 'qrels', 'index', 'info_url', 'corpus_iter']

In [38]:
#Indexing the collection of documents using the PyTerrier library.
columns = ['title', 'body']

indexer = pt.TRECCollectionIndexer('./index_fields', blocks = True, overwrite=True)

pt.ApplicationSetup.setProperty("indexer.meta.forward.keys", ",".join(columns))
pt.ApplicationSetup.setProperty("indexer.meta.reverse.keys", ",".join(columns))
pt.ApplicationSetup.setProperty("FieldTags.process", ",".join(columns))
pt.ApplicationSetup.setProperty("TrecDocTags.doctag", "DOC")
pt.ApplicationSetup.setProperty("TrecDocTags.idtag", "DOCNO")
pt.ApplicationSetup.setProperty("TrecDocTags.skip", "DOCHDR")

index = indexer.index(document.get_corpus())
doc_index = pt.IndexFactory.of(index)

In [39]:
#Retrieve the main topics from the document.
main_topics = document.get_topics()

In [40]:
#Retrieve the main qrels from the document.
main_qrels = document.get_qrels()

In [41]:
#Batch Retrieve object using the BM25 model.
BM25 = pt.BatchRetrieve(doc_index, wmodel='BM25')

In [42]:
#Perform an experiment to evaluate the BM25 retrieval model.
pt.Experiment(
    [BM25], main_topics, main_qrels,
    eval_metrics = [MRR@10, MRR@100, MAP@10, MAP@100, nDCG@10, nDCG@100],
    names = ['BM25']
)

Unnamed: 0,name,RR@10,RR@100,AP@10,AP@100,nDCG@10,nDCG@100
0,BM25,0.719875,0.725488,0.167809,0.272523,0.446609,0.502299


BM25 retrieval model performs reasonably well according to these metrics.

In [43]:
#Set up a pipeline that combines two PyTerrier components SDM and BatchRetrieve with the BM25 model.
sdm = pt.rewrite.SequentialDependence(doc_index)
bm25 = pt.BatchRetrieve(doc_index, wmodel="BM25")
pipeline = sdm >> bm25

In [44]:
#Perform an experiment to evaluate the retrieval performance of the pipeline.
pt.Experiment(
    [pipeline], main_topics, main_qrels,
    eval_metrics= [MRR@10, MRR@100, MAP@10, MAP@100, nDCG@10, nDCG@100],
    names =['BM25+SequentialDependence']
)

Unnamed: 0,name,RR@10,RR@100,AP@10,AP@100,nDCG@10,nDCG@100
0,BM25+SequentialDependence,0.729792,0.734333,0.170217,0.274253,0.453038,0.504336


The combination of Sequential Dependence Model and BM25 model in the pipeline appears to enhance retrieval performance as compared to BM25 model alone.

In [45]:
#Perform an experiment to evaluate the impact of different techniques on retrieval performance.
Bo1 = pt.rewrite.Bo1QueryExpansion(doc_index)
KL = pt.rewrite.KLQueryExpansion(doc_index)
RM3 = pt.rewrite.RM3(doc_index)
AQ = pt.rewrite.AxiomaticQE(doc_index)
pt.Experiment(
    [
            BM25,
            BM25 >> Bo1 >> BM25,
            BM25 >> KL >> BM25,
            BM25 >> RM3 >> BM25,
            BM25 >> AQ >> BM25,
    ],
    main_topics, main_qrels,
    eval_metrics=[MRR@10, MRR@100, MAP@10, MAP@100, nDCG@10, nDCG@100],
    names=["BM25", "+Bo1 Divergence", "+KLQueryExpansion", "+RM3", "+AxiomaticQE"]
    )

Unnamed: 0,name,RR@10,RR@100,AP@10,AP@100,nDCG@10,nDCG@100
0,BM25,0.719875,0.725488,0.167809,0.272523,0.446609,0.502299
1,+Bo1 Divergence,0.681315,0.6853,0.170858,0.280307,0.45221,0.509802
2,+KLQueryExpansion,0.674863,0.679529,0.168336,0.278259,0.44735,0.507592
3,+RM3,0.636973,0.64066,0.160819,0.268703,0.436146,0.490788
4,+AxiomaticQE,0.719875,0.725488,0.167809,0.272523,0.446609,0.502299


In this experiment, various query expansion techniques were evaluated in combination with the BM25 retrieval model. The results indicate that, for the given dataset and retrieval task, none of the query expansion methods significantly outperformed the baseline BM25 in terms of retrieval performance.

In [46]:
#Set up pipeline with the BM25 model and with term frequency (TF) and PL2.
pipeline = pt.FeaturesBatchRetrieve(doc_index, wmodel="BM25", features=["WMODEL:Tf", "WMODEL:PL2"])

In [47]:
#Using random forest regression model for learning to rank.
random = RandomForestRegressor(n_estimators=2)
pipe_random = pipeline >> pt.ltr.apply_learned_model(random)
pipe_random.fit(main_topics.iloc[:10], main_qrels.iloc[:10])

In [48]:
#Perform an experiment to evaluate the impact of different techniques and learned-to-rank.
Bo1 = pt.rewrite.Bo1QueryExpansion(doc_index)
KL = pt.rewrite.KLQueryExpansion(doc_index)
RM3 = pt.rewrite.RM3(doc_index)
AQ = pt.rewrite.AxiomaticQE(doc_index)
pt.Experiment(
    [
            BM25,
            BM25 >> Bo1 >> pipe_random,
            BM25 >> KL >> BM25 >> pipe_random,
            BM25 >> RM3 >> BM25 >> pipe_random,
            BM25 >> AQ >> BM25 >> pipe_random,
    ],
    main_topics, main_qrels,
    eval_metrics=[MRR@10, MRR@100, MAP@10, MAP@100, nDCG@10, nDCG@100],
    names=["BM25", "+Bo1 Divergence", "+KLQueryExpansion", "+RM3", "+AxiomaticQE"]
    )

Unnamed: 0,name,RR@10,RR@100,AP@10,AP@100,nDCG@10,nDCG@100
0,BM25,0.719875,0.725488,0.167809,0.272523,0.446609,0.502299
1,+Bo1 Divergence,0.046621,0.065866,0.002093,0.006525,0.020851,0.0516
2,+KLQueryExpansion,0.041709,0.063804,0.001086,0.005856,0.014998,0.050035
3,+RM3,0.033734,0.056024,0.001726,0.006287,0.018204,0.050346
4,+AxiomaticQE,0.034182,0.053625,0.001571,0.006103,0.016385,0.049761


I noticed that the combination of query expansion techniques, followed by learned-to-rank using a Random Forest model, generally leads to a significant decrease in retrieval performance as compared to the baseline BM25 model

In [49]:
#Train XGBoost Ranker model for learned-to-rank.
model = xgb.sklearn.XGBRanker(objective='rank:ndcg',
      learning_rate=0.01,
      gamma=1.0,
      min_child_weight=0.1,
      max_depth=6,
      verbose=2,
      random_state=42)

pipe_model = pipeline >> pt.ltr.apply_learned_model(model, form="ltr")
pipe_model.fit(main_topics, main_qrels,main_topics,main_qrels)

Parameters: { "verbose" } are not used.



In [50]:
#Create a pipeline that sequentially applies the Sequential Dependence Model.
sdm = pt.rewrite.SequentialDependence(doc_index)
BM25 = pt.BatchRetrieve(doc_index, wmodel="BM25")
pipeline = sdm >> BM25 >> pipe_model

In [51]:
#Perform an experiment to evaluate the performance of a pipeline.
pt.Experiment(
    [pipeline], main_topics, main_qrels,
    eval_metrics=[MRR@10, MRR@100, MAP@10, MAP@100, nDCG@10, nDCG@100],
    names=["BM25+SequentialDependence+LTR"]
)

Unnamed: 0,name,RR@10,RR@100,AP@10,AP@100,nDCG@10,nDCG@100
0,BM25+SequentialDependence+LTR,0.011585,0.01767,0.000688,0.001296,0.005375,0.012982


The results indicate that the combined approach of Sequential Dependence Model, BM25 model, and Learned-to-Rank yields significantly lower performance across all evaluated metrics as compared to the baseline BM25 model.

In [52]:
#Perform an experiment to assess the impact of different techniques, followed by a Learned-to-Rank model.
Bo1 = pt.rewrite.Bo1QueryExpansion(doc_index)
KL = pt.rewrite.KLQueryExpansion(doc_index)
RM3 = pt.rewrite.RM3(doc_index)
AQ = pt.rewrite.AxiomaticQE(doc_index)
pt.Experiment(
    [
            BM25,
            BM25 >> Bo1 >> pipe_model,
            BM25 >> KL >> pipe_model,
            BM25 >> RM3 >> pipe_model,
            BM25 >> AQ >> pipe_model,
    ],
    main_topics, main_qrels,
    eval_metrics=[MRR@10, MRR@100, MAP@10, MAP@100, nDCG@10, nDCG@100],
    names=["BM25", "+Bo1 Divergence+LTR", "+KLQueryExpansion+LTR", "+RM3+LTR", "+AxiomaticQE+LTR"]
    )

Unnamed: 0,name,RR@10,RR@100,AP@10,AP@100,nDCG@10,nDCG@100
0,BM25,0.719875,0.725488,0.167809,0.272523,0.446609,0.502299
1,+Bo1 Divergence+LTR,0.159916,0.181522,0.00915,0.032612,0.065199,0.159615
2,+KLQueryExpansion+LTR,0.147308,0.169053,0.009255,0.031497,0.057929,0.155575
3,+RM3+LTR,0.124172,0.145295,0.010165,0.036679,0.06276,0.167443
4,+AxiomaticQE+LTR,0.259754,0.275644,0.067479,0.101658,0.161166,0.251185


The results indicates that combining various query expansion techniques with Learned-to-Rank models significantly influences performance when integrated with the BM25 model. +AxiomaticQE+LTR have improved RR@10, RR@100, AP@10, and nDCG@10 values as compared to the baseline BM25 model,