In [1]:
pip install sentence-transformers scikit-learn rank-bm25


Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


MiniLM


In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List

# Load the `all-MiniLM-L6-v2` SentenceTransformer checkpoint once.
# The rank_passages_minilm_cosine / rank_passages_minilm_knn functions read this global `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')

def rank_passages_minilm_cosine(query: str, passages: List[str]) -> List[int]:
    """Order passage indices from most to least similar to the query.

    The query and the passages are encoded with the module-level MiniLM
    ``model``; each passage is scored by the cosine similarity between its
    embedding and the query embedding.

    Args:
        query: Query text.
        passages: Candidate passage texts.

    Returns:
        Indices into ``passages``, highest similarity first.
    """
    # The query is encoded as a one-element batch so both inputs are 2-D.
    query_vec = model.encode([query])
    passage_vecs = model.encode(passages)

    # One similarity value per passage.
    scores = cosine_similarity(query_vec, passage_vecs).ravel()

    # argsort is ascending, so walk it backwards to get best-first order.
    ascending = np.argsort(scores).tolist()
    return list(reversed(ascending))


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:
from sklearn.neighbors import NearestNeighbors

def rank_passages_minilm_knn(query: str, passages: List[str], k: int = 5) -> List[int]:
    """Return the indices of the (at most) ``k`` passages nearest to the query.

    Passages and query are encoded with the module-level MiniLM ``model`` and
    a cosine-distance ``NearestNeighbors`` index is fitted on the passage
    embeddings for this single query.

    Note that, unlike the cosine ranker, the result is *truncated*: only
    ``min(k, len(passages))`` indices are returned, so passages outside the
    top ``k`` are absent from the ranking.

    Args:
        query: Query text.
        passages: Candidate passage texts.
        k: Maximum number of neighbours to return.

    Returns:
        Up to ``k`` indices into ``passages``, nearest first; an empty list
        when there are no passages.
    """
    # Nothing to rank: NearestNeighbors cannot be fitted on zero samples,
    # so bail out early (same guard as rank_passages_use_knn).
    if len(passages) == 0:
        return []

    # Never ask for more neighbours than there are passages.
    n_neighbors = min(k, len(passages))

    # Generate embeddings (the query as a one-element batch).
    query_embedding = model.encode([query])
    passage_embeddings = model.encode(passages)

    # Fit the kNN index on this query's candidate passages only.
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
    knn.fit(passage_embeddings)

    # Only the neighbour indices are needed; the distances are discarded.
    _, indices = knn.kneighbors(query_embedding)
    return indices.flatten().tolist()


In [4]:
!pip install nltk



In [5]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK tokenizer resources up front; presumably both are needed by
# word_tokenize in the BM25 ranker below (TODO confirm which one the installed NLTK uses).
nltk.download('punkt')
nltk.download('punkt_tab')

def rank_passages_minilm_bm25(query: str, passages: List[str]) -> List[int]:
    """Rank passages by BM25 (Okapi) lexical relevance to the query.

    Despite the ``minilm`` in the name, no embedding model is involved: the
    passages and the query are lower-cased, tokenized with NLTK's
    ``word_tokenize`` and scored with ``BM25Okapi`` fitted on this query's
    passages only.

    Args:
        query: Query text.
        passages: Candidate passage texts.

    Returns:
        Indices into ``passages``, highest BM25 score first; an empty list
        when there are no passages.
    """
    # BM25Okapi needs at least one document to compute its corpus statistics,
    # so an empty candidate list is answered directly.
    if len(passages) == 0:
        return []

    # Tokenize passages
    tokenized_passages = [word_tokenize(passage.lower()) for passage in passages]

    # Initialize BM25 on the per-query corpus
    bm25 = BM25Okapi(tokenized_passages)

    # Score every passage against the tokenized query
    query_tokens = word_tokenize(query.lower())
    scores = bm25.get_scores(query_tokens)

    # argsort is ascending; reverse it for best-first order
    ranked_indices = np.argsort(scores)[::-1]
    return ranked_indices.tolist()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [6]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [17]:
from datasets import load_dataset
import pandas as pd

# Load the MS MARCO v1.1 dataset from the Hugging Face Hub
ds = load_dataset("microsoft/ms_marco", "v1.1")

# Convert the *test* split to a pandas DataFrame
df = ds['test'].to_pandas()
# Flatten each row's `passages` record into two parallel list columns;
# `.tolist()` turns the NumPy arrays into plain Python lists.
df["passage_text"] = df["passages"].apply(lambda x: x["passage_text"].tolist())  # candidate passage texts
df["is_selected"] = df["passages"].apply(lambda x: x["is_selected"].tolist())    # per-passage relevance labels

# Drop the original passages column and set query_id as index
# (both calls mutate `df` in place)
df.drop("passages", axis=1, inplace=True)
df.set_index("query_id", inplace=True)

# Display the first few rows
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


In [18]:
# Smoke-test the MiniLM cosine ranker on the first query only.
ranked_indices_minilm_cosine = rank_passages_minilm_cosine(df.iloc[0]["query"], df.iloc[0]["passage_text"])

In [19]:
# Show the ordering for the first query (values are indices into its passage list).
print("Ranked Indices of MiniLM_cosine:", ranked_indices_minilm_cosine)

Ranked Indices of MiniLM_cosine: [4, 2, 0, 3, 5, 6, 1]


In [20]:
# Rank every query's passages with MiniLM + cosine similarity.
# The ranker is called once per row (it encodes that row's query and passages each time).
df["ranked_passages_minilm_cosine"] = df.apply(lambda x: rank_passages_minilm_cosine(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]"


In [21]:
# Smoke-test the MiniLM kNN ranker on the first query (default k=5).
ranked_indices_minilm_knn = rank_passages_minilm_knn(df.iloc[0]["query"], df.iloc[0]["passage_text"])


In [22]:
# Show the kNN result for the first query; at most k indices are returned.
print("Ranked Indices of MiniLM_knn:", ranked_indices_minilm_knn)

Ranked Indices of MiniLM_knn: [4, 2, 0, 3, 5]


In [23]:
# Apply the MiniLM kNN ranker to every query with the default k=5,
# so each stored list holds at most five passage indices.
df["ranked_passages_minilm_knn"] = df.apply(lambda x: rank_passages_minilm_knn(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]"


In [24]:
# Smoke-test the BM25 ranker on the first query only.
ranked_indices_minilm_bm25 = rank_passages_minilm_bm25(df.iloc[0]["query"], df.iloc[0]["passage_text"])


In [25]:
# Show the BM25 ordering for the first query.
print("Ranked Indices of MiniLM_bm25:", ranked_indices_minilm_bm25)

Ranked Indices of MiniLM_bm25: [1, 2, 5, 4, 6, 3, 0]


In [26]:
# Apply the BM25 ranker to every query; a separate BM25 index is built per row from that row's passages.
df["ranked_passages_minilm_bm25"] = df.apply(lambda x: rank_passages_minilm_bm25(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]"


In [27]:
###Evaluation

import numpy as np
import pandas as pd
from typing import List

def _dcg(gains: np.ndarray) -> float:
    """Discounted cumulative gain of graded relevances given in rank order."""
    if gains.size == 0:
        return 0.0
    # Position i (0-based) is discounted by log2(i + 2).
    discounts = np.log2(np.arange(gains.size) + 2)
    return float(np.sum((2.0 ** gains - 1.0) / discounts))


def evaluate_metrics(df: pd.DataFrame, relevance_col: str, rank_col: str, k: int = 3) -> dict:
    """Compute ranking-quality metrics averaged over all rows of ``df``.

    Each row supplies a relevance list (one label per passage) and a ranking
    (passage indices, best first). The ranking may be shorter than the
    relevance list, or empty; passages missing from it simply count as
    "not retrieved".

    Metrics:
        - MRR (Mean Reciprocal Rank): 1 / position of the first passage whose
          label equals 1, or 0 if none is ranked.
        - nDCG (Normalized Discounted Cumulative Gain): DCG of the ranking
          divided by the DCG of the ideally ordered full relevance list.
        - Precision@k: relevant passages in the top ``k`` divided by ``k``.
        - Recall@k: relevant passages in the top ``k`` divided by the sum of
          the relevance labels.
        - MAP (Mean Average Precision): precision at each relevant position,
          summed and divided by the sum of the relevance labels.

    Args:
        df: Frame with one row per query.
        relevance_col: Column holding the per-passage relevance labels.
        rank_col: Column holding the ranked passage indices.
        k: Cut-off for Precision@k and Recall@k; must be >= 1.

    Returns:
        Dict with keys ``"MRR"``, ``"nDCG"``, ``"Precision at {k}"``,
        ``"Recall at {k}"`` and ``"MAP"``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    mrr_scores = []
    ndcg_scores = []
    precision_at_k = []
    recall_at_k = []
    average_precisions = []

    for _, row in df.iterrows():
        relevance = np.asarray(row[relevance_col])
        # Force an integer dtype: an empty ranking would otherwise become a
        # float array, which NumPy refuses to use as an index.
        rank = np.asarray(row[rank_col], dtype=int)
        relevance_at_rank = relevance[rank]

        # Positions (0-based) in the ranking that hold a relevant passage.
        is_hit = relevance_at_rank == 1
        hit_positions = np.flatnonzero(is_hit)

        # MRR: reciprocal of the 1-based position of the first hit.
        mrr_scores.append(1 / (hit_positions[0] + 1) if hit_positions.size > 0 else 0)

        # nDCG: normalise by the DCG of the best possible ordering of all labels.
        idcg = _dcg(np.sort(relevance)[::-1])
        ndcg_scores.append(_dcg(relevance_at_rank) / idcg if idcg > 0 else 0)

        # Top-k relevance
        relevance_at_k = relevance_at_rank[:k]
        total_relevant = relevance.sum()

        # Precision@k (denominator is k even if fewer than k items were ranked).
        precision_at_k.append(relevance_at_k.sum() / k)

        # Recall@k
        recall_at_k.append(relevance_at_k.sum() / total_relevant if total_relevant > 0 else 0)

        # Average Precision: precision measured at every hit position.
        hits_so_far = np.cumsum(is_hit)[hit_positions]
        cumulative_precision = np.sum(hits_so_far / (hit_positions + 1))
        average_precisions.append(cumulative_precision / total_relevant if total_relevant > 0 else 0)

    # Aggregate results
    metrics = {
        "MRR": np.mean(mrr_scores),
        "nDCG": np.mean(ndcg_scores),
        f"Precision at {k}": np.mean(precision_at_k),
        f"Recall at {k}": np.mean(recall_at_k),
        "MAP": np.mean(average_precisions),
    }

    return metrics

In [28]:
# Score the MiniLM + cosine rankings against the `is_selected` labels (default k=3).
result1 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_minilm_cosine')
result1  # results for MiniLM model with cosine similarity retrieval

{'MRR': 0.5514425528415166,
 'nDCG': 0.6502893932815138,
 'Precision at 3': 0.24359240069084628,
 'Recall at 3': 0.6643937823834196,
 'MAP': 0.5438316994270362}

In [29]:
# Score the MiniLM kNN rankings. NOTE: this column holds at most 5 indices per query
# (rank_passages_minilm_knn default k=5), so MRR / nDCG / MAP are computed over truncated
# rankings and are not directly comparable with the full-ranking results.
result2 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_minilm_knn')
result2  # results for MiniLM model with kNN retrieval

{'MRR': 0.5348307426597582,
 'nDCG': 0.6045202426077834,
 'Precision at 3': 0.24359240069084628,
 'Recall at 3': 0.6643937823834196,
 'MAP': 0.5216455958549224}

In [30]:
# Score the BM25 rankings. BM25 is purely lexical: rank_passages_minilm_bm25 never calls the MiniLM encoder.
result3 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_minilm_bm25')
result3  # results for BM25 retrieval (column name carries the "minilm" prefix only)

{'MRR': 0.43476400197384646,
 'nDCG': 0.5610625418080807,
 'Precision at 3': 0.18753022452504317,
 'Recall at 3': 0.5107962003454232,
 'MAP': 0.4292430689201414}

USE (Universal Sentence Encoder) by Google


In [31]:
from sklearn.metrics.pairwise import cosine_similarity
from typing import List
import numpy as np
import tensorflow_hub as hub

# Load Universal Sentence Encoder v4 from TF Hub once.
# The rank_passages_use_cosine / _dot / _knn functions read this global `use_model`.
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def rank_passages_use_cosine(query: str, passages: List[str]) -> List[int]:
    """Order passage indices by cosine similarity of their USE embeddings to the query.

    Args:
        query: Query text.
        passages: Candidate passage texts.

    Returns:
        Indices into ``passages``, most similar first.
    """
    # Embed with the module-level USE model; `.numpy()` converts the model output to NumPy.
    query_vec = use_model([query]).numpy()
    passage_vecs = use_model(passages).numpy()

    # One similarity value per passage.
    scores = cosine_similarity(query_vec, passage_vecs).ravel()

    # Ascending argsort, reversed to obtain best-first order.
    order = np.argsort(scores)
    return order[::-1].tolist()


In [32]:
def rank_passages_use_dot(query: str, passages: List[str]) -> List[int]:
    """Order passage indices by the raw dot product of their USE embeddings with the query.

    Unlike the cosine variant, the scores are not normalised by vector length.

    Args:
        query: Query text.
        passages: Candidate passage texts.

    Returns:
        Indices into ``passages``, largest dot product first.
    """
    # Embed with the module-level USE model (query as a one-element batch).
    query_vec = use_model([query]).numpy()
    passage_vecs = use_model(passages).numpy()

    # Multiply the passage matrix by the transposed query batch, then flatten to one score per passage.
    scores = np.dot(passage_vecs, query_vec.T).reshape(-1)

    # Reverse the ascending argsort so the highest score comes first.
    ascending = np.argsort(scores).tolist()
    return list(reversed(ascending))


In [33]:
from sklearn.neighbors import NearestNeighbors

def rank_passages_use_knn(query: str, passages: List[str], k: int = 5) -> List[int]:
    """Return the indices of the (at most) ``k`` passages nearest to the query under USE embeddings.

    A cosine-distance ``NearestNeighbors`` index is fitted on this query's
    passage embeddings; only ``min(k, len(passages))`` indices are returned.

    Args:
        query: Query text.
        passages: Candidate passage texts.
        k: Maximum number of neighbours to return.

    Returns:
        Up to ``k`` indices into ``passages``; ``[]`` when ``passages`` is empty.
    """
    n_passages = len(passages)
    if n_passages == 0:
        # Nothing to rank.
        return []

    # Cap the neighbour count at the number of available passages.
    neighbor_count = min(k, n_passages)

    # Embed with the module-level USE model.
    query_vec = use_model([query]).numpy()
    passage_vecs = use_model(passages).numpy()

    # Build the index, then query it with the single query vector.
    searcher = NearestNeighbors(n_neighbors=neighbor_count, metric='cosine')
    searcher.fit(passage_vecs)
    _distances, neighbor_ids = searcher.kneighbors(query_vec)

    # kneighbors returns a 2-D array (one row per query); flatten it to a plain list.
    return neighbor_ids.ravel().tolist()



In [38]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk

# `punkt` is requested again here.
# NOTE(review): the earlier BM25 cell also downloads `punkt_tab`; confirm whether this cell
# needs it too when run on a fresh kernel.
nltk.download('punkt')

def rank_passages_use_bm25(query: str, passages: List[str]) -> List[int]:
    """Rank passages by BM25 (Okapi) lexical relevance to the query.

    Despite the ``use`` in the name, the Universal Sentence Encoder is not
    involved: the passages and the query are lower-cased, tokenized with
    NLTK's ``word_tokenize`` and scored with ``BM25Okapi`` fitted on this
    query's passages only.

    Args:
        query: Query text.
        passages: Candidate passage texts.

    Returns:
        Indices into ``passages``, highest BM25 score first; an empty list
        when there are no passages.
    """
    # BM25Okapi needs at least one document to compute its corpus statistics,
    # so an empty candidate list is answered directly.
    if len(passages) == 0:
        return []

    # Tokenize passages
    tokenized_passages = [word_tokenize(passage.lower()) for passage in passages]

    # Initialize BM25 on the per-query corpus
    bm25 = BM25Okapi(tokenized_passages)

    # Tokenize query
    query_tokens = word_tokenize(query.lower())

    # Compute BM25 scores (one per passage)
    scores = bm25.get_scores(query_tokens)

    # argsort is ascending; reverse it for best-first order
    ranked_indices = np.argsort(scores)[::-1]
    return ranked_indices.tolist()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
# Smoke-test the USE cosine ranker on the first query only.
ranked_indices_use_cosine = rank_passages_use_cosine(df.iloc[0]["query"], df.iloc[0]["passage_text"])


In [35]:
# Show the USE cosine ordering for the first query.
print("Ranked Indices of use_cosine:", ranked_indices_use_cosine)

Ranked Indices of use_cosine: [2, 4, 5, 3, 1, 6, 0]


In [36]:
# Rank every query's passages with USE + cosine similarity (the ranker is called once per row).
df["ranked_passages_use_cosine"] = df.apply(lambda x: rank_passages_use_cosine(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_cosine
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 5, 3, 1, 6, 0]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[3, 1, 4, 5, 7, 8, 2, 0, 6]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[6, 5, 2, 1, 8, 4, 3, 7, 0]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]"


In [37]:
# Smoke-test the USE dot-product ranker on the first query and show its ordering.
ranked_indices_use_dot = rank_passages_use_dot(df.iloc[0]["query"], df.iloc[0]["passage_text"])
print("Ranked Indices of use_dot:", ranked_indices_use_dot)


Ranked Indices of use_dot: [2, 4, 5, 3, 1, 6, 0]


In [39]:
# Rank every query's passages with USE + dot product (the ranker is called once per row).
df["ranked_passages_use_dot"] = df.apply(lambda x: rank_passages_use_dot(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_cosine,ranked_passages_use_dot
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1, 6, 0]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7, 8, 2, 0, 6]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8, 4, 3, 7, 0]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]"


In [40]:
# Smoke-test the USE kNN ranker on the first query (default k=5) and show the returned indices.
ranked_indices_use_knn = rank_passages_use_knn(df.iloc[0]["query"], df.iloc[0]["passage_text"])
print("Ranked Indices of use_knn:", ranked_indices_use_knn)


Ranked Indices of use_knn: [2, 4, 5, 3, 1]


In [41]:
# Apply the USE kNN ranker to every query with the default k=5,
# so each stored list holds at most five passage indices.
df["ranked_passages_use_knn"] = df.apply(lambda x: rank_passages_use_knn(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_cosine,ranked_passages_use_dot,ranked_passages_use_knn
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2]"


In [42]:
# Smoke-test the second BM25 ranker on the first query and show its ordering.
ranked_indices_use_bm25 = rank_passages_use_bm25(df.iloc[0]["query"], df.iloc[0]["passage_text"])
print("Ranked Indices of use_bm25:", ranked_indices_use_bm25)


Ranked Indices of use_bm25: [1, 2, 5, 4, 6, 3, 0]


In [43]:
# Apply the BM25 ranker to every query; a separate BM25 index is built per row from that row's passages.
df["ranked_passages_use_bm25"] = df.apply(lambda x: rank_passages_use_bm25(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_cosine,ranked_passages_use_dot,ranked_passages_use_knn,ranked_passages_use_bm25
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1]","[1, 2, 5, 4, 6, 3, 0]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7]","[1, 4, 8, 7, 5, 6, 0, 2, 3]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2]","[0, 5, 6, 1, 7, 4, 2, 8, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8]","[5, 6, 2, 1, 8, 0, 4, 3, 7]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]"


In [44]:
# Score the USE + cosine rankings against the `is_selected` labels (default k=3).
result4 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_use_cosine')
result4  # results for USE model with cosine similarity retrieval

{'MRR': 0.4407627683197632,
 'nDCG': 0.5655166834843557,
 'Precision at 3': 0.1898791018998273,
 'Recall at 3': 0.5155630397236615,
 'MAP': 0.43496572086520274}

In [45]:
# Score the USE + dot-product rankings against the `is_selected` labels (default k=3).
result5 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_use_dot')
result5  # results for USE model with dot-product retrieval

{'MRR': 0.4408049592894153,
 'nDCG': 0.5655484588032726,
 'Precision at 3': 0.1899481865284974,
 'Recall at 3': 0.5158221070811745,
 'MAP': 0.43499927625627105}

In [46]:
# Score the USE kNN rankings. NOTE: this column holds at most 5 indices per query
# (rank_passages_use_knn default k=5), so MRR / nDCG / MAP are computed over truncated
# rankings and are not directly comparable with the full-ranking results.
result6 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_use_knn')
result6  # results for USE model with kNN retrieval

{'MRR': 0.4109015544041451,
 'nDCG': 0.4869620376502086,
 'Precision at 3': 0.1898791018998273,
 'Recall at 3': 0.5156148531951641,
 'MAP': 0.39828316062176167}

In [47]:
# Score the BM25 rankings produced by rank_passages_use_bm25. That function never calls the USE
# model and has the same body as rank_passages_minilm_bm25, so the numbers are expected to match result3.
result7 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_use_bm25')
result7  # results for BM25 retrieval (column name carries the "use" prefix only)

{'MRR': 0.43476400197384646,
 'nDCG': 0.5610625418080807,
 'Precision at 3': 0.18753022452504317,
 'Recall at 3': 0.5107962003454232,
 'MAP': 0.4292430689201414}

Custom models trained specifically on the MS MARCO dataset


In [57]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List

# Load the `msmarco-MiniLM-L12-cos-v5` sentence-transformers checkpoint once;
# rank_passages_minilm_l12_cosine below reads this global `model_minilm`.
model_minilm = SentenceTransformer("sentence-transformers/msmarco-MiniLM-L12-cos-v5")

def rank_passages_minilm_l12_cosine(query: str, passages: List[str]) -> List[int]:
    """Order passage indices by cosine similarity to the query using the MiniLM-L12 encoder.

    Embeddings come from the module-level ``model_minilm``.

    Args:
        query: Query text.
        passages: Candidate passage texts.

    Returns:
        Indices into ``passages``, most similar first.
    """
    # Encode the query (as a one-element batch) and then the candidates.
    query_vec = model_minilm.encode([query])
    passage_vecs = model_minilm.encode(passages)

    # One similarity value per passage.
    scores = cosine_similarity(query_vec, passage_vecs).ravel()

    # Ascending argsort, reversed for best-first order.
    order = np.argsort(scores)
    return order[::-1].tolist()




modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [58]:
# Load the `msmarco-distilbert-cos-v5` sentence-transformers checkpoint once;
# rank_passages_distilbert_cosine below reads this global `model_distilbert`.
model_distilbert = SentenceTransformer("sentence-transformers/msmarco-distilbert-cos-v5")

def rank_passages_distilbert_cosine(query: str, passages: List[str]) -> List[int]:
    """Rank passages by cosine similarity to the query using `model_distilbert` embeddings.

    Args:
        query: The query text to embed.
        passages: Candidate passage texts to rank against the query.

    Returns:
        Indices into `passages`, ordered from the highest to the lowest
        cosine similarity with the query. An empty `passages` input yields
        an empty list.
    """
    # Nothing to rank: return early rather than handing an empty embedding
    # array to `cosine_similarity`. `len()` is used instead of truthiness so
    # that array-like inputs (e.g. a NumPy array of strings) are safe to check.
    if len(passages) == 0:
        return []

    # Embed the query as a one-element batch so both operands are 2-D.
    query_embedding = model_distilbert.encode([query])
    passage_embeddings = model_distilbert.encode(passages)

    # One row (the query) against every passage; flatten to a 1-D score vector.
    similarity_scores = cosine_similarity(query_embedding, passage_embeddings).flatten()

    # argsort is ascending, so reverse it to put the best match first.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    return ranked_indices.tolist()



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [59]:
# Sanity check: rank the first row's passages with the DistilBERT cosine ranker.
ranked_indices_distilbert_cosine = rank_passages_distilbert_cosine(
    df.iloc[0]["query"],
    df.iloc[0]["passage_text"],
)
print("Ranked Indices of ditilbert Cosine:", ranked_indices_distilbert_cosine)

Ranked Indices of ditilbert Cosine: [2, 4, 1, 6, 5, 3, 0]


In [60]:
# Apply the DistilBERT cosine ranker row by row and store each ranking in a new column.
df["ranked_passages_distilbert_cosine"] = df.apply(
    lambda row: rank_passages_distilbert_cosine(row["query"], row["passage_text"]),
    axis=1,
)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_cosine,ranked_passages_use_dot,ranked_passages_use_knn,ranked_passages_use_bm25,ranked_passages_distilbert_cosine
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 1, 6, 5, 3, 0]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[4, 1, 3, 0, 2, 8, 5, 6, 7]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[8, 2, 0, 6, 5, 1, 7, 4, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[5, 1, 6, 2, 4, 7, 3, 8, 0]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[4, 0, 2, 3, 1, 5, 9, 7, 8, 6]"


In [61]:
# Metrics for the DistilBERT model with cosine similarity retrieval.
result8 = evaluate_metrics(
    df,
    relevance_col='is_selected',
    rank_col='ranked_passages_distilbert_cosine',
)
result8

{'MRR': 0.6187172876058886,
 'nDCG': 0.7017649146033434,
 'Precision at 3': 0.2737478411053541,
 'Recall at 3': 0.7492987910189982,
 'MAP': 0.6103371247635496}

In [62]:
# Sanity check: rank the first row's passages with the MiniLM L12 cosine ranker.
ranked_indices_minilm_l12_cosine = rank_passages_minilm_l12_cosine(
    df.iloc[0]["query"],
    df.iloc[0]["passage_text"],
)
print("Ranked Indices of minilm_l12 Cosine:", ranked_indices_minilm_l12_cosine)

Ranked Indices of minilm_l12 Cosine: [2, 4, 3, 5, 6, 0, 1]


In [63]:
# Apply the MiniLM L12 cosine ranker row by row and store each ranking in a new column.
df["ranked_passages_minilm_l12_cosine"] = df.apply(
    lambda row: rank_passages_minilm_l12_cosine(row["query"], row["passage_text"]),
    axis=1,
)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_cosine,ranked_passages_use_dot,ranked_passages_use_knn,ranked_passages_use_bm25,ranked_passages_distilbert_cosine,ranked_passages_minilm_l12_cosine
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,[Yes],does human hair stop squirrels,description,[],[We have been feeding our back yard squirrels ...,"[0, 0, 1, 0, 0, 0, 0]","[4, 2, 0, 3, 5, 6, 1]","[4, 2, 0, 3, 5]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1, 6, 0]","[2, 4, 5, 3, 1]","[1, 2, 5, 4, 6, 3, 0]","[2, 4, 1, 6, 5, 3, 0]","[2, 4, 3, 5, 6, 0, 1]"
1,[Fossil fuels are basically the remains of ani...,what are the benefits of fossil fuels,description,[],[The biggest advantage of using fossil fuels i...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[4, 1, 3, 0, 5, 2, 6, 8, 7]","[4, 1, 3, 0, 5]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7, 8, 2, 0, 6]","[3, 1, 4, 5, 7]","[1, 4, 8, 7, 5, 6, 0, 2, 3]","[4, 1, 3, 0, 2, 8, 5, 6, 7]","[1, 4, 3, 5, 0, 2, 8, 6, 7]"
2,[The apothem of a regular polygon is a line se...,what is a apothem,description,[],[Apothem. The apothem of a regular polygon is ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[8, 1, 5, 7, 0, 4, 2, 6, 3]","[8, 1, 5, 7, 0]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2, 8, 6, 7, 3]","[1, 0, 5, 4, 2]","[0, 5, 6, 1, 7, 4, 2, 8, 3]","[8, 2, 0, 6, 5, 1, 7, 4, 3]","[8, 7, 0, 1, 2, 4, 5, 6, 3]"
3,[$45 to $210. 2],average cost for custom canopy,numeric,[],"[Congratulations! You have found BuyShade.com,...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[5, 1, 6, 2, 7, 4, 8, 3, 0]","[5, 1, 6, 2, 7]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8, 4, 3, 7, 0]","[6, 5, 2, 1, 8]","[5, 6, 2, 1, 8, 0, 4, 3, 7]","[5, 1, 6, 2, 4, 7, 3, 8, 0]","[1, 5, 6, 2, 4, 8, 7, 3, 0]"
4,[It is the collection of physical elements tha...,what is a hardware in a computer,description,[],"[Hardware is best described as a device, such ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 4, 8, 0, 2, 3, 1, 6, 5, 7]","[9, 4, 8, 0, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2, 6, 5, 7, 8, 3]","[9, 0, 1, 4, 2]","[6, 3, 1, 0, 2, 7, 8, 5, 4, 9]","[4, 0, 2, 3, 1, 5, 9, 7, 8, 6]","[1, 3, 2, 6, 4, 8, 0, 9, 5, 7]"


In [64]:
# Metrics for the MiniLM L12 model with cosine similarity retrieval.
result9 = evaluate_metrics(
    df,
    relevance_col='is_selected',
    rank_col='ranked_passages_minilm_l12_cosine',
)
result9

{'MRR': 0.6097618636401021,
 'nDCG': 0.6950429415907557,
 'Precision at 3': 0.26922279792746107,
 'Recall at 3': 0.7378652849740932,
 'MAP': 0.6014155365024536}