# Dataset

In [17]:
from datasets import load_dataset
import pandas as pd


ds = load_dataset("microsoft/ms_marco", "v1.1")

df = ds['test'].to_pandas()
df["passage_text"] = df["passages"].apply(lambda x: x["passage_text"].tolist())  # Convert NumPy array to list
df["is_selected"] = df["passages"].apply(lambda x: x["is_selected"].tolist())    # Convert NumPy array to list

df.drop("passages", axis=1, inplace=True)
df.set_index("query_id", inplace=True)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


# MiniLM


In [None]:
from ranking.rank_minilm import rank_passages_minilm_knn, rank_passages_minilm_bm25, rank_passages_minilm_cosine
from evaluation import evaluate_metrics

### Cosine Similarity

In [20]:
df["ranked_passages_minilm_cosine"] = df.apply(lambda x: rank_passages_minilm_cosine(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]"


In [28]:
result1 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_minilm_cosine')
result1#results for minilm model with cosine similarity retrieval

{'MRR': 0.5514425528415166,
 'nDCG': 0.6502893932815138,
 'Precision at 3': 0.24359240069084628,
 'Recall at 3': 0.6643937823834196,
 'MAP': 0.5438316994270362}

### KNN

In [23]:
df["ranked_passages_minilm_knn"] = df.apply(lambda x: rank_passages_minilm_knn(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]"


In [24]:
ranked_indices_minilm_bm25 = rank_passages_minilm_bm25(df.iloc[0]["query"], df.iloc[0]["passage_text"])


In [29]:
result2 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_minilm_knn')
result2 #results for minilm model with knn similarity retrieval

{'MRR': 0.5348307426597582,
 'nDCG': 0.6045202426077834,
 'Precision at 3': 0.24359240069084628,
 'Recall at 3': 0.6643937823834196,
 'MAP': 0.5216455958549224}

### BM25

In [26]:
df["ranked_passages_minilm_bm25"] = df.apply(lambda x: rank_passages_minilm_bm25(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]"


In [30]:
result3 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_minilm_bm25')
result3 #results for minilm model with bm25 similarity retrieval

{'MRR': 0.43476400197384646,
 'nDCG': 0.5610625418080807,
 'Precision at 3': 0.18753022452504317,
 'Recall at 3': 0.5107962003454232,
 'MAP': 0.4292430689201414}

# USE (Universal Sentence Encoder)


In [None]:
from ranking.rank_use import rank_passages_use_dot, rank_passages_use_knn, rank_passages_use_bm25
from evaluation import evaluate_metrics

### Cosine Similarity

In [36]:
df["ranked_passages_use_cosine"] = df.apply(lambda x: rank_passages_use_cosine(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_cosine
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 5, 3, 1, 6, 0]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[3, 1, 4, 5, 7, 8, 2, 0, 6]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[6, 5, 2, 1, 8, 4, 3, 7, 0]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]"


In [44]:
result4 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_use_cosine')
result4#results for use model with cosine similarity retrieval

{'MRR': 0.4407627683197632,
 'nDCG': 0.5655166834843557,
 'Precision at 3': 0.1898791018998273,
 'Recall at 3': 0.5155630397236615,
 'MAP': 0.43496572086520274}

### Dot Product

In [39]:
df["ranked_passages_use_dot"] = df.apply(lambda x: rank_passages_use_dot(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_cosine,ranked_passages_use_dot
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1, 6, 0]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7, 8, 2, 0, 6]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8, 4, 3, 7, 0]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]"


In [45]:
result5 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_use_dot')
result5#results for use model with dot similarity retrieval

{'MRR': 0.4408049592894153,
 'nDCG': 0.5655484588032726,
 'Precision at 3': 0.1899481865284974,
 'Recall at 3': 0.5158221070811745,
 'MAP': 0.43499927625627105}

### KNN

In [41]:
df["ranked_passages_use_knn"] = df.apply(lambda x: rank_passages_use_knn(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_cosine,ranked_passages_use_dot,ranked_passages_use_knn
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2]"


In [46]:
result6 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_use_knn')
result6#results for use model with knn similarity retrieval

{'MRR': 0.4109015544041451,
 'nDCG': 0.4869620376502086,
 'Precision at 3': 0.1898791018998273,
 'Recall at 3': 0.5156148531951641,
 'MAP': 0.39828316062176167}

### BM25

In [43]:
df["ranked_passages_use_bm25"] = df.apply(lambda x: rank_passages_use_bm25(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_cosine,ranked_passages_use_dot,ranked_passages_use_knn,ranked_passages_use_bm25
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1]","[1, 2, 5, 4, 6, 3, 0]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7]","[1, 4, 8, 7, 5, 6, 0, 2, 3]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2]","[0, 5, 6, 1, 7, 4, 2, 8, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8]","[5, 6, 2, 1, 8, 0, 4, 3, 7]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]"


In [47]:
result7 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_use_bm25')
result7#results for use model with bm25 similarity retrieval

{'MRR': 0.43476400197384646,
 'nDCG': 0.5610625418080807,
 'Precision at 3': 0.18753022452504317,
 'Recall at 3': 0.5107962003454232,
 'MAP': 0.4292430689201414}

# Custom Models specially trained on MS Marco Dataset


### msmarco-MiniLM-L12-cos-v5

In [57]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List

# Load MiniLM model
model_minilm = SentenceTransformer("sentence-transformers/msmarco-MiniLM-L12-cos-v5")

def rank_passages_minilm_l12_cosine(query: str, passages: List[str]) -> List[int]:
    """Rank passages by cosine similarity with the query using MiniLM L12 embeddings."""
    # Generate embeddings
    query_embedding = model_minilm.encode([query])
    passage_embeddings = model_minilm.encode(passages)

    # Compute cosine similarity between query and all passages
    similarity_scores = cosine_similarity(query_embedding, passage_embeddings).flatten()

    # Rank passages by similarity score
    ranked_indices = np.argsort(similarity_scores)[::-1]
    return ranked_indices.tolist()




modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [63]:
df["ranked_passages_minilm_l12_cosine"] = df.apply(lambda x: rank_passages_minilm_l12_cosine(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_cosine,ranked_passages_use_dot,ranked_passages_use_knn,ranked_passages_use_bm25,ranked_passages_distilbert_cosine,ranked_passages_minilm_l12_cosine
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 1, 6, 5, 3, 0]","[2, 4, 3, 5, 6, 0, 1]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[4, 1, 3, 0, 2, 8, 5, 6, 7]","[1, 4, 3, 5, 0, 2, 8, 6, 7]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[8, 2, 0, 6, 5, 1, 7, 4, 3]","[8, 7, 0, 1, 2, 4, 5, 6, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[5, 1, 6, 2, 4, 7, 3, 8, 0]","[1, 5, 6, 2, 4, 8, 7, 3, 0]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[4, 0, 2, 3, 1, 5, 9, 7, 8, 6]","[1, 3, 2, 6, 4, 8, 0, 9, 5, 7]"


In [61]:
result8 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_distilbert_cosine')
result8#results for distilbert model with cosine similarity retrieval

{'MRR': 0.6187172876058886,
 'nDCG': 0.7017649146033434,
 'Precision at 3': 0.2737478411053541,
 'Recall at 3': 0.7492987910189982,
 'MAP': 0.6103371247635496}

### msmarco-distilbert-cos-v5

In [58]:
# Load DistilBERT model
model_distilbert = SentenceTransformer("sentence-transformers/msmarco-distilbert-cos-v5")

def rank_passages_distilbert_cosine(query: str, passages: List[str]) -> List[int]:
    """Rank passages by cosine similarity with the query using DistilBERT embeddings."""
    # Generate embeddings
    query_embedding = model_distilbert.encode([query])
    passage_embeddings = model_distilbert.encode(passages)

    # Compute cosine similarity between query and all passages
    similarity_scores = cosine_similarity(query_embedding, passage_embeddings).flatten()

    # Rank passages by similarity score
    ranked_indices = np.argsort(similarity_scores)[::-1]
    return ranked_indices.tolist()



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [60]:
df["ranked_passages_distilbert_cosine"] = df.apply(lambda x: rank_passages_distilbert_cosine(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_cosine,ranked_passages_use_dot,ranked_passages_use_knn,ranked_passages_use_bm25,ranked_passages_distilbert_cosine
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 1, 6, 5, 3, 0]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[4, 1, 3, 0, 2, 8, 5, 6, 7]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[8, 2, 0, 6, 5, 1, 7, 4, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[5, 1, 6, 2, 4, 7, 3, 8, 0]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[4, 0, 2, 3, 1, 5, 9, 7, 8, 6]"


In [64]:
result9 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_minilm_l12_cosine')
result9#results for MiniLm model L12 with cosine similarity retrieval

{'MRR': 0.6097618636401021,
 'nDCG': 0.6950429415907557,
 'Precision at 3': 0.26922279792746107,
 'Recall at 3': 0.7378652849740932,
 'MAP': 0.6014155365024536}