In [6]:
pip install sentence-transformers scikit-learn rank-bm25


Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


MiniLM


In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List

# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model once;
# the MiniLM ranking functions read it through the global name `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')

def rank_passages_minilm_cosine(query: str, passages: List[str]) -> List[int]:
    """Rank passages by cosine similarity to the query using MiniLM embeddings.

    Args:
        query: The search query text.
        passages: Candidate passage texts to rank.

    Returns:
        Indices into ``passages`` ordered from most to least similar to the
        query. An empty list is returned when ``passages`` is empty.
    """
    # Nothing to embed or compare when there are no candidates; returning early
    # avoids handing an empty batch to the encoder / similarity routine.
    if len(passages) == 0:
        return []

    # Embed the query (as a one-item batch) and every passage with the shared model.
    query_embedding = model.encode([query])
    passage_embeddings = model.encode(passages)

    # One similarity score per passage, flattened to a 1-D vector.
    similarity_scores = cosine_similarity(query_embedding, passage_embeddings).flatten()

    # argsort is ascending, so reverse it to put the best-scoring passage first.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    return ranked_indices.tolist()


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
from sklearn.neighbors import NearestNeighbors

def rank_passages_minilm_knn(query: str, passages: List[str], k: int = 5) -> List[int]:
    """Return the passages nearest to the query in MiniLM embedding space.

    Args:
        query: The search query text.
        passages: Candidate passage texts.
        k: Maximum number of neighbours to return; it is capped at
            ``len(passages)``.

    Returns:
        Indices into ``passages`` of the nearest neighbours under cosine
        distance, in the order returned by ``kneighbors``. At most ``k``
        indices are returned, so the result can be shorter than ``passages``.
        An empty list is returned when ``passages`` is empty.
    """
    # With no candidates the neighbour count would be 0, which the kNN index
    # cannot be queried with; there is nothing to rank anyway.
    if len(passages) == 0:
        return []

    # Embed the query (as a one-item batch) and every passage with the shared model.
    query_embedding = model.encode([query])
    passage_embeddings = model.encode(passages)

    # Index the passage embeddings; never request more neighbours than exist.
    knn = NearestNeighbors(n_neighbors=min(k, len(passages)), metric="cosine")
    knn.fit(passage_embeddings)

    # Only the neighbour indices are needed; the distances are discarded.
    _, indices = knn.kneighbors(query_embedding)
    return indices.flatten().tolist()


In [15]:
!pip install nltk



In [7]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK 'punkt' and 'punkt_tab' tokenizer resources used by word_tokenize.
nltk.download('punkt')
nltk.download('punkt_tab')

def rank_passages_minilm_bm25(query: str, passages: List[str]) -> List[int]:
    """Rank passages against the query with BM25 (Okapi) lexical scoring.

    Despite the name, no MiniLM embeddings are involved: query and passages are
    lower-cased, split with NLTK's ``word_tokenize`` and scored by term overlap.

    Args:
        query: The search query text.
        passages: Candidate passage texts to rank.

    Returns:
        Indices into ``passages`` ordered from highest to lowest BM25 score.
        An empty list is returned when ``passages`` is empty.
    """
    # BM25 statistics (e.g. average document length) are undefined for an empty
    # corpus, so bail out before building the index.
    if len(passages) == 0:
        return []

    # Tokenize passages (case-insensitive matching via lower-casing).
    tokenized_passages = [word_tokenize(passage.lower()) for passage in passages]

    # Build the BM25 index over this query's candidate passages only.
    bm25 = BM25Okapi(tokenized_passages)

    # Score every passage against the tokenized query.
    query_tokens = word_tokenize(query.lower())
    scores = bm25.get_scores(query_tokens)

    # argsort is ascending, so reverse it to put the best-scoring passage first.
    ranked_indices = np.argsort(scores)[::-1]
    return ranked_indices.tolist()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [11]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [12]:
from datasets import load_dataset
import pandas as pd

# Fetch MS MARCO v1.1 and work on its validation split as a pandas DataFrame.
ds = load_dataset("microsoft/ms_marco", "v1.1")
df = ds["validation"].to_pandas()

# Lift the two per-query fields used later out of the nested `passages` column,
# converting each NumPy array into a plain Python list.
df["passage_text"] = df["passages"].apply(lambda cell: cell["passage_text"].tolist())
df["is_selected"] = df["passages"].apply(lambda cell: cell["is_selected"].tolist())

# The nested column is no longer needed; index the frame by query_id.
df = df.drop(columns="passages").set_index("query_id")

# Preview the first few rows.
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/175M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10047 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/82326 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9650 [00:00<?, ? examples/s]

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9652,"[Approximately $15,000 per year.]",walgreens store sales average,numeric,[],[The average Walgreens salary ranges from appr...,"[1, 0, 0, 0, 0, 0]"
9653,"[$21,550 per year, The average hourly wage for...",how much do bartenders make,numeric,[],[A bartender’s income is comprised mostly of t...,"[0, 1, 0, 0, 0, 0, 0, 0]"
9654,"[A boil, also called a furuncle, is a deep fol...",what is a furuncle boil,description,[],"[Knowledge center. A boil, also known as a fur...","[0, 0, 0, 0, 0, 0, 1, 0]"
9655,"[Detect and assess a wide range of disorders, ...",what can urinalysis detect,description,[],[Urinalysis: One way to test for bladder cance...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]"
9656,"[Shigellosis, diseases of the nervous system, ...",what is vitamin a used for,description,[],[Since vitamin A is fat-soluble it is not need...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]"


In [18]:
# Smoke test: rank the passages of the first row with MiniLM + cosine similarity.
ranked_indices_minilm_cosine = rank_passages_minilm_cosine(df.iloc[0]["query"], df.iloc[0]["passage_text"])

In [19]:
# Show the ranking obtained for the first row (best passage index first).
print("Ranked Indices of MiniLM_cosine:", ranked_indices_minilm_cosine)

Ranked Indices of MiniLM_cosine: [3, 4, 2, 0, 5, 1]


In [20]:
# Rank every row's passages with MiniLM + cosine similarity and store the index lists in a new column.
df["ranked_passages_minilm_cosine"] = df.apply(lambda x: rank_passages_minilm_cosine(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_use_cosine,ranked_passages_minilm_cosine
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9652,"[Approximately $15,000 per year.]",walgreens store sales average,numeric,[],[The average Walgreens salary ranges from appr...,"[1, 0, 0, 0, 0, 0]","[5, 2, 0, 4, 3, 1]","[3, 4, 2, 0, 5, 1]"
9653,"[$21,550 per year, The average hourly wage for...",how much do bartenders make,numeric,[],[A bartender’s income is comprised mostly of t...,"[0, 1, 0, 0, 0, 0, 0, 0]","[5, 4, 3, 0, 7, 2, 1, 6]","[2, 6, 5, 4, 1, 7, 0, 3]"
9654,"[A boil, also called a furuncle, is a deep fol...",what is a furuncle boil,description,[],"[Knowledge center. A boil, also known as a fur...","[0, 0, 0, 0, 0, 0, 1, 0]","[0, 7, 5, 6, 1, 2, 4, 3]","[0, 3, 6, 4, 1, 7, 2, 5]"
9655,"[Detect and assess a wide range of disorders, ...",what can urinalysis detect,description,[],[Urinalysis: One way to test for bladder cance...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 6, 1, 4, 7, 3, 8, 5, 2]","[3, 5, 1, 4, 6, 7, 2, 8, 0]"
9656,"[Shigellosis, diseases of the nervous system, ...",what is vitamin a used for,description,[],[Since vitamin A is fat-soluble it is not need...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[8, 5, 0, 6, 7, 1, 3, 2, 4]","[4, 5, 7, 0, 6, 8, 1, 3, 2]"


In [21]:
# Smoke test: nearest-neighbour ranking of the first row's passages (default k=5).
ranked_indices_minilm_knn = rank_passages_minilm_knn(df.iloc[0]["query"], df.iloc[0]["passage_text"])


In [22]:
# Show the kNN result for the first row; it holds at most k indices.
print("Ranked Indices of MiniLM_knn:", ranked_indices_minilm_knn)

Ranked Indices of MiniLM_knn: [3, 4, 2, 0, 5]


In [23]:
# Apply the MiniLM kNN ranking to every row (default k=5, so each stored list has at most 5 indices).
df["ranked_passages_minilm_knn"] = df.apply(lambda x: rank_passages_minilm_knn(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_use_cosine,ranked_passages_minilm_cosine,ranked_passages_minilm_knn
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9652,"[Approximately $15,000 per year.]",walgreens store sales average,numeric,[],[The average Walgreens salary ranges from appr...,"[1, 0, 0, 0, 0, 0]","[5, 2, 0, 4, 3, 1]","[3, 4, 2, 0, 5, 1]","[3, 4, 2, 0, 5]"
9653,"[$21,550 per year, The average hourly wage for...",how much do bartenders make,numeric,[],[A bartender’s income is comprised mostly of t...,"[0, 1, 0, 0, 0, 0, 0, 0]","[5, 4, 3, 0, 7, 2, 1, 6]","[2, 6, 5, 4, 1, 7, 0, 3]","[2, 6, 5, 4, 1]"
9654,"[A boil, also called a furuncle, is a deep fol...",what is a furuncle boil,description,[],"[Knowledge center. A boil, also known as a fur...","[0, 0, 0, 0, 0, 0, 1, 0]","[0, 7, 5, 6, 1, 2, 4, 3]","[0, 3, 6, 4, 1, 7, 2, 5]","[0, 3, 6, 4, 1]"
9655,"[Detect and assess a wide range of disorders, ...",what can urinalysis detect,description,[],[Urinalysis: One way to test for bladder cance...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 6, 1, 4, 7, 3, 8, 5, 2]","[3, 5, 1, 4, 6, 7, 2, 8, 0]","[3, 5, 1, 4, 6]"
9656,"[Shigellosis, diseases of the nervous system, ...",what is vitamin a used for,description,[],[Since vitamin A is fat-soluble it is not need...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[8, 5, 0, 6, 7, 1, 3, 2, 4]","[4, 5, 7, 0, 6, 8, 1, 3, 2]","[4, 5, 7, 0, 6]"


In [24]:
# Smoke test: BM25 ranking of the first row's passages.
ranked_indices_minilm_bm25 = rank_passages_minilm_bm25(df.iloc[0]["query"], df.iloc[0]["passage_text"])


In [25]:
# Show the BM25 ranking obtained for the first row (best passage index first).
print("Ranked Indices of MiniLM_bm25:", ranked_indices_minilm_bm25)

Ranked Indices of MiniLM_bm25: [3, 2, 5, 4, 0, 1]


In [26]:
# Rank every row's passages with BM25 and store the index lists in a new column.
df["ranked_passages_minilm_bm25"] = df.apply(lambda x: rank_passages_minilm_bm25(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_use_cosine,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
9652,"[Approximately $15,000 per year.]",walgreens store sales average,numeric,[],[The average Walgreens salary ranges from appr...,"[1, 0, 0, 0, 0, 0]","[5, 2, 0, 4, 3, 1]","[3, 4, 2, 0, 5, 1]","[3, 4, 2, 0, 5]","[3, 2, 5, 4, 0, 1]"
9653,"[$21,550 per year, The average hourly wage for...",how much do bartenders make,numeric,[],[A bartender’s income is comprised mostly of t...,"[0, 1, 0, 0, 0, 0, 0, 0]","[5, 4, 3, 0, 7, 2, 1, 6]","[2, 6, 5, 4, 1, 7, 0, 3]","[2, 6, 5, 4, 1]","[5, 3, 4, 6, 2, 7, 0, 1]"
9654,"[A boil, also called a furuncle, is a deep fol...",what is a furuncle boil,description,[],"[Knowledge center. A boil, also known as a fur...","[0, 0, 0, 0, 0, 0, 1, 0]","[0, 7, 5, 6, 1, 2, 4, 3]","[0, 3, 6, 4, 1, 7, 2, 5]","[0, 3, 6, 4, 1]","[0, 3, 6, 2, 4, 7, 1, 5]"
9655,"[Detect and assess a wide range of disorders, ...",what can urinalysis detect,description,[],[Urinalysis: One way to test for bladder cance...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 6, 1, 4, 7, 3, 8, 5, 2]","[3, 5, 1, 4, 6, 7, 2, 8, 0]","[3, 5, 1, 4, 6]","[4, 6, 7, 3, 0, 8, 5, 1, 2]"
9656,"[Shigellosis, diseases of the nervous system, ...",what is vitamin a used for,description,[],[Since vitamin A is fat-soluble it is not need...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[8, 5, 0, 6, 7, 1, 3, 2, 4]","[4, 5, 7, 0, 6, 8, 1, 3, 2]","[4, 5, 7, 0, 6]","[2, 4, 0, 6, 5, 7, 3, 8, 1]"


In [27]:
### Evaluation

import numpy as np
import pandas as pd
from typing import List

def _dcg(relevance_in_order) -> float:
    """Discounted cumulative gain of relevance labels listed in ranked order."""
    return sum((2 ** rel - 1) / np.log2(pos + 2) for pos, rel in enumerate(relevance_in_order))


def evaluate_metrics(df: pd.DataFrame, relevance_col: str, rank_col: str, k: int = 3):
    """Compute mean ranking-quality metrics over all rows of ``df``.

    Metrics (each averaged over rows):
        - MRR (Mean Reciprocal Rank)
        - nDCG (Normalized Discounted Cumulative Gain)
        - Precision@k
        - Recall@k
        - MAP (Mean Average Precision)

    Args:
        df: Frame with one row per query.
        relevance_col: Column holding, per row, a sequence of relevance labels
            (a label of 1 counts as relevant), one per candidate.
        rank_col: Column holding, per row, a sequence of candidate indices in
            ranked order (best first). It may be shorter than the relevance
            sequence (a truncated ranking) or empty.
        k: Cut-off used for Precision@k and Recall@k; must be >= 1.

    Returns:
        Dict with keys ``"MRR"``, ``"nDCG"``, ``"Precision at {k}"``,
        ``"Recall at {k}"`` and ``"MAP"``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    mrr_scores = []
    ndcg_scores = []
    precision_at_k = []
    recall_at_k = []
    average_precisions = []

    # Iterate over the two columns directly; building a Series per row with
    # iterrows() is not needed here.
    for row_relevance, row_rank in zip(df[relevance_col], df[rank_col]):
        relevance = np.array(row_relevance)
        # Force an integer dtype: an empty ranking would otherwise become a
        # float64 array, which NumPy refuses to use as an index.
        rank = np.array(row_rank, dtype=int)
        relevance_at_rank = relevance[rank]  # labels reordered into ranked order

        # MRR: reciprocal of the 1-based position of the first relevant item.
        relevant_positions = np.where(relevance_at_rank == 1)[0]
        if len(relevant_positions) > 0:
            mrr_scores.append(1 / (relevant_positions[0] + 1))
        else:
            mrr_scores.append(0)

        # nDCG: DCG of the produced ranking over the DCG of the ideal ordering
        # of *all* labels (so a truncated ranking is penalised for misses).
        dcg = _dcg(relevance_at_rank)
        idcg = _dcg(sorted(relevance, reverse=True))
        ndcg_scores.append(dcg / idcg if idcg > 0 else 0)

        # Labels of the top-k ranked items.
        relevance_at_k = relevance_at_rank[:k]
        hits_at_k = relevance_at_k.sum()

        # Precision@k always divides by k, even when fewer than k items were ranked.
        precision_at_k.append(hits_at_k / k)

        # Recall@k relative to every relevant item of the row.
        total_relevant = relevance.sum()
        recall_at_k.append(hits_at_k / total_relevant if total_relevant > 0 else 0)

        # Average Precision: mean of precision values taken at each relevant hit.
        num_relevant_retrieved = 0
        cumulative_precision = 0
        for position, label in enumerate(relevance_at_rank):
            if label == 1:
                num_relevant_retrieved += 1
                cumulative_precision += num_relevant_retrieved / (position + 1)
        ap = cumulative_precision / total_relevant if total_relevant > 0 else 0
        average_precisions.append(ap)

    # Aggregate the per-row scores into their means.
    metrics = {
        "MRR": np.mean(mrr_scores),
        "nDCG": np.mean(ndcg_scores),
        f"Precision at {k}": np.mean(precision_at_k),
        f"Recall at {k}": np.mean(recall_at_k),
        "MAP": np.mean(average_precisions),
    }

    return metrics

In [28]:
# Evaluate the MiniLM + cosine-similarity rankings against the is_selected labels (default k=3).
result1 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_minilm_cosine')
result1  # results for the MiniLM model with cosine-similarity ranking

{'MRR': 0.5481979932412897,
 'nDCG': 0.648096832505137,
 'Precision at 3': 0.23973988918748548,
 'Recall at 3': 0.6585746989150991,
 'MAP': 0.5421654362064434}

In [29]:
# Evaluate the MiniLM kNN rankings. NOTE: rank_passages_minilm_knn was applied with its default k=5,
# so each ranking holds at most 5 indices and MRR/nDCG/MAP are computed on a truncated list.
result2 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_minilm_knn')
result2  # results for the MiniLM model with kNN ranking

{'MRR': 0.5321240171195383,
 'nDCG': 0.6051396573894309,
 'Precision at 3': 0.23973988918748548,
 'Recall at 3': 0.6585746989150991,
 'MAP': 0.5214484146732579}

In [30]:
# Evaluate the BM25 rankings. NOTE: rank_passages_minilm_bm25 scores tokens only and never calls the MiniLM model.
result3 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_minilm_bm25')
result3  # results for BM25 ranking (stored under the MiniLM column name)

{'MRR': 0.4256574654678567,
 'nDCG': 0.5534450098034668,
 'Precision at 3': 0.18320560034504493,
 'Recall at 3': 0.5027022991937892,
 'MAP': 0.41996561399517507}

USE (Universal Sentence Encoder) by Google


In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from typing import List
import numpy as np
import tensorflow_hub as hub

# Load Universal Sentence Encoder v4 from TF Hub once; the USE ranking
# functions call it through the global name `use_model`.
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def rank_passages_use_cosine(query: str, passages: List[str]) -> List[int]:
    """Rank passages by cosine similarity to the query using USE embeddings.

    Args:
        query: The search query text.
        passages: Candidate passage texts to rank.

    Returns:
        Indices into ``passages`` ordered from most to least similar to the
        query. An empty list is returned when ``passages`` is empty.
    """
    # Nothing to embed or compare when there are no candidates; returning early
    # avoids handing an empty batch to the encoder / similarity routine.
    if len(passages) == 0:
        return []

    # Embed the query (as a one-item batch) and every passage, as NumPy arrays.
    query_embedding = use_model([query]).numpy()
    passage_embeddings = use_model(passages).numpy()

    # One similarity score per passage, flattened to a 1-D vector.
    similarity_scores = cosine_similarity(query_embedding, passage_embeddings).flatten()

    # argsort is ascending, so reverse it to put the best-scoring passage first.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    return ranked_indices.tolist()


In [2]:
def rank_passages_use_dot(query: str, passages: List[str]) -> List[int]:
    """Rank passages by dot-product similarity to the query using USE embeddings.

    Args:
        query: The search query text.
        passages: Candidate passage texts to rank.

    Returns:
        Indices into ``passages`` ordered from highest to lowest dot product
        with the query embedding. An empty list is returned when ``passages``
        is empty.
    """
    # Nothing to embed or compare when there are no candidates.
    if len(passages) == 0:
        return []

    # Embed the query (as a one-item batch) and every passage, as NumPy arrays.
    query_embedding = use_model([query]).numpy()
    passage_embeddings = use_model(passages).numpy()

    # One raw (unnormalised) dot-product score per passage.
    similarity_scores = np.dot(passage_embeddings, query_embedding.T).flatten()

    # argsort is ascending, so reverse it to put the best-scoring passage first.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    return ranked_indices.tolist()


In [36]:
from sklearn.neighbors import NearestNeighbors

def rank_passages_use_knn(query: str, passages: List[str], k: int = 5) -> List[int]:
    """Return the passages nearest to the query in USE embedding space.

    At most ``k`` passage indices are returned (fewer when there are fewer
    passages), in the order produced by the cosine-distance neighbour search.
    An empty ``passages`` input yields an empty list.
    """
    n_passages = len(passages)
    if n_passages == 0:
        # Nothing to rank.
        return []

    # Never request more neighbours than there are passages.
    n_neighbors = min(k, n_passages)

    # Embed the query (one-item batch) and all passages as NumPy arrays.
    query_vector = use_model([query]).numpy()
    passage_vectors = use_model(passages).numpy()

    # Build the neighbour index over the passage embeddings.
    neighbor_index = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    neighbor_index.fit(passage_vectors)

    # kneighbors returns (distances, indices); only the indices are used.
    _, neighbor_ids = neighbor_index.kneighbors(query_vector)

    # The index array is 2-D (one row per query); flatten it to a plain list.
    return neighbor_ids.ravel().tolist()



In [8]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk

# Tokenizer data for word_tokenize. Fetch 'punkt_tab' as well as 'punkt' so this cell
# does not depend on another cell having downloaded it (newer NLTK releases look up 'punkt_tab').
nltk.download('punkt')
nltk.download('punkt_tab')

def rank_passages_use_bm25(query: str, passages: List[str]) -> List[int]:
    """Rank passages against the query with BM25 (Okapi) lexical scoring.

    Despite the name, the USE model is not involved: query and passages are
    lower-cased, split with NLTK's ``word_tokenize`` and scored by term overlap.

    Args:
        query: The search query text.
        passages: Candidate passage texts to rank.

    Returns:
        Indices into ``passages`` ordered from highest to lowest BM25 score.
        An empty list is returned when ``passages`` is empty.
    """
    # BM25 statistics (e.g. average document length) are undefined for an empty
    # corpus, so bail out before building the index.
    if len(passages) == 0:
        return []

    # Tokenize passages (case-insensitive matching via lower-casing).
    tokenized_passages = [word_tokenize(passage.lower()) for passage in passages]

    # Build the BM25 index over this query's candidate passages only.
    bm25 = BM25Okapi(tokenized_passages)

    # Tokenize the query the same way as the passages.
    query_tokens = word_tokenize(query.lower())

    # One BM25 score per passage.
    scores = bm25.get_scores(query_tokens)

    # argsort is ascending, so reverse it to put the best-scoring passage first.
    ranked_indices = np.argsort(scores)[::-1]
    return ranked_indices.tolist()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# Smoke test: rank the passages of the first row with USE + cosine similarity.
ranked_indices_use_cosine = rank_passages_use_cosine(df.iloc[0]["query"], df.iloc[0]["passage_text"])


In [14]:
# Show the ranking obtained for the first row (best passage index first).
print("Ranked Indices of use_cosine:", ranked_indices_use_cosine)

Ranked Indices of use_cosine: [5, 2, 0, 4, 3, 1]


In [31]:
# Rank every row's passages with USE + cosine similarity and store the index lists in a new column.
df["ranked_passages_use_cosine"] = df.apply(lambda x: rank_passages_use_cosine(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_use_cosine,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
9652,"[Approximately $15,000 per year.]",walgreens store sales average,numeric,[],[The average Walgreens salary ranges from appr...,"[1, 0, 0, 0, 0, 0]","[5, 2, 0, 4, 3, 1]","[3, 4, 2, 0, 5, 1]","[3, 4, 2, 0, 5]","[3, 2, 5, 4, 0, 1]"
9653,"[$21,550 per year, The average hourly wage for...",how much do bartenders make,numeric,[],[A bartender’s income is comprised mostly of t...,"[0, 1, 0, 0, 0, 0, 0, 0]","[5, 4, 3, 0, 7, 2, 1, 6]","[2, 6, 5, 4, 1, 7, 0, 3]","[2, 6, 5, 4, 1]","[5, 3, 4, 6, 2, 7, 0, 1]"
9654,"[A boil, also called a furuncle, is a deep fol...",what is a furuncle boil,description,[],"[Knowledge center. A boil, also known as a fur...","[0, 0, 0, 0, 0, 0, 1, 0]","[0, 7, 5, 6, 1, 2, 4, 3]","[0, 3, 6, 4, 1, 7, 2, 5]","[0, 3, 6, 4, 1]","[0, 3, 6, 2, 4, 7, 1, 5]"
9655,"[Detect and assess a wide range of disorders, ...",what can urinalysis detect,description,[],[Urinalysis: One way to test for bladder cance...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 6, 1, 4, 7, 3, 8, 5, 2]","[3, 5, 1, 4, 6, 7, 2, 8, 0]","[3, 5, 1, 4, 6]","[4, 6, 7, 3, 0, 8, 5, 1, 2]"
9656,"[Shigellosis, diseases of the nervous system, ...",what is vitamin a used for,description,[],[Since vitamin A is fat-soluble it is not need...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[8, 5, 0, 6, 7, 1, 3, 2, 4]","[4, 5, 7, 0, 6, 8, 1, 3, 2]","[4, 5, 7, 0, 6]","[2, 4, 0, 6, 5, 7, 3, 8, 1]"


In [32]:
# Smoke test: rank the passages of the first row with USE + dot product, then show the result.
ranked_indices_use_dot = rank_passages_use_dot(df.iloc[0]["query"], df.iloc[0]["passage_text"])
print("Ranked Indices of use_dot:", ranked_indices_use_dot)


Ranked Indices of use_dot: [5, 2, 0, 4, 3, 1]


In [33]:
# Rank every row's passages with USE + dot product and store the index lists in a new column.
df["ranked_passages_use_dot"] = df.apply(lambda x: rank_passages_use_dot(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_use_cosine,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_dot
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9652,"[Approximately $15,000 per year.]",walgreens store sales average,numeric,[],[The average Walgreens salary ranges from appr...,"[1, 0, 0, 0, 0, 0]","[5, 2, 0, 4, 3, 1]","[3, 4, 2, 0, 5, 1]","[3, 4, 2, 0, 5]","[3, 2, 5, 4, 0, 1]","[5, 2, 0, 4, 3, 1]"
9653,"[$21,550 per year, The average hourly wage for...",how much do bartenders make,numeric,[],[A bartender’s income is comprised mostly of t...,"[0, 1, 0, 0, 0, 0, 0, 0]","[5, 4, 3, 0, 7, 2, 1, 6]","[2, 6, 5, 4, 1, 7, 0, 3]","[2, 6, 5, 4, 1]","[5, 3, 4, 6, 2, 7, 0, 1]","[5, 4, 3, 0, 7, 2, 1, 6]"
9654,"[A boil, also called a furuncle, is a deep fol...",what is a furuncle boil,description,[],"[Knowledge center. A boil, also known as a fur...","[0, 0, 0, 0, 0, 0, 1, 0]","[0, 7, 5, 6, 1, 2, 4, 3]","[0, 3, 6, 4, 1, 7, 2, 5]","[0, 3, 6, 4, 1]","[0, 3, 6, 2, 4, 7, 1, 5]","[0, 7, 5, 6, 1, 2, 4, 3]"
9655,"[Detect and assess a wide range of disorders, ...",what can urinalysis detect,description,[],[Urinalysis: One way to test for bladder cance...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 6, 1, 4, 7, 3, 8, 5, 2]","[3, 5, 1, 4, 6, 7, 2, 8, 0]","[3, 5, 1, 4, 6]","[4, 6, 7, 3, 0, 8, 5, 1, 2]","[0, 6, 1, 4, 7, 3, 8, 5, 2]"
9656,"[Shigellosis, diseases of the nervous system, ...",what is vitamin a used for,description,[],[Since vitamin A is fat-soluble it is not need...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[8, 5, 0, 6, 7, 1, 3, 2, 4]","[4, 5, 7, 0, 6, 8, 1, 3, 2]","[4, 5, 7, 0, 6]","[2, 4, 0, 6, 5, 7, 3, 8, 1]","[8, 5, 0, 6, 7, 1, 3, 2, 4]"


In [37]:
# Smoke test: nearest-neighbour ranking of the first row's passages with USE (default k=5), then show the result.
ranked_indices_use_knn = rank_passages_use_knn(df.iloc[0]["query"], df.iloc[0]["passage_text"])
print("Ranked Indices of use_knn:", ranked_indices_use_knn)


Ranked Indices of use_knn: [5, 2, 0, 4, 3]


In [38]:
# Apply the USE kNN ranking to every row (default k=5, so each stored list has at most 5 indices).
df["ranked_passages_use_knn"] = df.apply(lambda x: rank_passages_use_knn(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_use_cosine,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_dot,ranked_passages_use_knn
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
9652,"[Approximately $15,000 per year.]",walgreens store sales average,numeric,[],[The average Walgreens salary ranges from appr...,"[1, 0, 0, 0, 0, 0]","[5, 2, 0, 4, 3, 1]","[3, 4, 2, 0, 5, 1]","[3, 4, 2, 0, 5]","[3, 2, 5, 4, 0, 1]","[5, 2, 0, 4, 3, 1]","[5, 2, 0, 4, 3]"
9653,"[$21,550 per year, The average hourly wage for...",how much do bartenders make,numeric,[],[A bartender’s income is comprised mostly of t...,"[0, 1, 0, 0, 0, 0, 0, 0]","[5, 4, 3, 0, 7, 2, 1, 6]","[2, 6, 5, 4, 1, 7, 0, 3]","[2, 6, 5, 4, 1]","[5, 3, 4, 6, 2, 7, 0, 1]","[5, 4, 3, 0, 7, 2, 1, 6]","[5, 4, 3, 0, 7]"
9654,"[A boil, also called a furuncle, is a deep fol...",what is a furuncle boil,description,[],"[Knowledge center. A boil, also known as a fur...","[0, 0, 0, 0, 0, 0, 1, 0]","[0, 7, 5, 6, 1, 2, 4, 3]","[0, 3, 6, 4, 1, 7, 2, 5]","[0, 3, 6, 4, 1]","[0, 3, 6, 2, 4, 7, 1, 5]","[0, 7, 5, 6, 1, 2, 4, 3]","[0, 7, 5, 6, 1]"
9655,"[Detect and assess a wide range of disorders, ...",what can urinalysis detect,description,[],[Urinalysis: One way to test for bladder cance...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 6, 1, 4, 7, 3, 8, 5, 2]","[3, 5, 1, 4, 6, 7, 2, 8, 0]","[3, 5, 1, 4, 6]","[4, 6, 7, 3, 0, 8, 5, 1, 2]","[0, 6, 1, 4, 7, 3, 8, 5, 2]","[0, 6, 1, 4, 7]"
9656,"[Shigellosis, diseases of the nervous system, ...",what is vitamin a used for,description,[],[Since vitamin A is fat-soluble it is not need...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[8, 5, 0, 6, 7, 1, 3, 2, 4]","[4, 5, 7, 0, 6, 8, 1, 3, 2]","[4, 5, 7, 0, 6]","[2, 4, 0, 6, 5, 7, 3, 8, 1]","[8, 5, 0, 6, 7, 1, 3, 2, 4]","[8, 5, 0, 6, 7]"


In [39]:
# Smoke test: BM25 ranking of the first row's passages, then show the result.
ranked_indices_use_bm25 = rank_passages_use_bm25(df.iloc[0]["query"], df.iloc[0]["passage_text"])
print("Ranked Indices of use_bm25:", ranked_indices_use_bm25)


Ranked Indices of use_bm25: [3, 2, 5, 4, 0, 1]


In [40]:
# Rank every row's passages with BM25 and store the index lists in a new column.
df["ranked_passages_use_bm25"] = df.apply(lambda x: rank_passages_use_bm25(x["query"], x["passage_text"]), axis=1)
df.head()

Unnamed: 0_level_0,answers,query,query_type,wellFormedAnswers,passage_text,is_selected,ranked_passages_use_cosine,ranked_passages_minilm_cosine,ranked_passages_minilm_knn,ranked_passages_minilm_bm25,ranked_passages_use_dot,ranked_passages_use_knn,ranked_passages_use_bm25
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
9652,"[Approximately $15,000 per year.]",walgreens store sales average,numeric,[],[The average Walgreens salary ranges from appr...,"[1, 0, 0, 0, 0, 0]","[5, 2, 0, 4, 3, 1]","[3, 4, 2, 0, 5, 1]","[3, 4, 2, 0, 5]","[3, 2, 5, 4, 0, 1]","[5, 2, 0, 4, 3, 1]","[5, 2, 0, 4, 3]","[3, 2, 5, 4, 0, 1]"
9653,"[$21,550 per year, The average hourly wage for...",how much do bartenders make,numeric,[],[A bartender’s income is comprised mostly of t...,"[0, 1, 0, 0, 0, 0, 0, 0]","[5, 4, 3, 0, 7, 2, 1, 6]","[2, 6, 5, 4, 1, 7, 0, 3]","[2, 6, 5, 4, 1]","[5, 3, 4, 6, 2, 7, 0, 1]","[5, 4, 3, 0, 7, 2, 1, 6]","[5, 4, 3, 0, 7]","[5, 3, 4, 6, 2, 7, 0, 1]"
9654,"[A boil, also called a furuncle, is a deep fol...",what is a furuncle boil,description,[],"[Knowledge center. A boil, also known as a fur...","[0, 0, 0, 0, 0, 0, 1, 0]","[0, 7, 5, 6, 1, 2, 4, 3]","[0, 3, 6, 4, 1, 7, 2, 5]","[0, 3, 6, 4, 1]","[0, 3, 6, 2, 4, 7, 1, 5]","[0, 7, 5, 6, 1, 2, 4, 3]","[0, 7, 5, 6, 1]","[0, 3, 6, 2, 4, 7, 1, 5]"
9655,"[Detect and assess a wide range of disorders, ...",what can urinalysis detect,description,[],[Urinalysis: One way to test for bladder cance...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 6, 1, 4, 7, 3, 8, 5, 2]","[3, 5, 1, 4, 6, 7, 2, 8, 0]","[3, 5, 1, 4, 6]","[4, 6, 7, 3, 0, 8, 5, 1, 2]","[0, 6, 1, 4, 7, 3, 8, 5, 2]","[0, 6, 1, 4, 7]","[4, 6, 7, 3, 0, 8, 5, 1, 2]"
9656,"[Shigellosis, diseases of the nervous system, ...",what is vitamin a used for,description,[],[Since vitamin A is fat-soluble it is not need...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]","[8, 5, 0, 6, 7, 1, 3, 2, 4]","[4, 5, 7, 0, 6, 8, 1, 3, 2]","[4, 5, 7, 0, 6]","[2, 4, 0, 6, 5, 7, 3, 8, 1]","[8, 5, 0, 6, 7, 1, 3, 2, 4]","[8, 5, 0, 6, 7]","[2, 4, 0, 6, 5, 7, 3, 8, 1]"


In [41]:
# Evaluate the USE + cosine-similarity rankings against the is_selected labels (default k=3).
result4 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_use_cosine')
result4  # results for the USE model with cosine-similarity ranking

{'MRR': 0.4369928005042965,
 'nDCG': 0.5624396630135898,
 'Precision at 3': 0.18685511429614146,
 'Recall at 3': 0.5112073255698218,
 'MAP': 0.43209226819135255}

In [42]:
# Evaluate the USE + dot-product rankings against the is_selected labels (default k=3).
result5 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_use_dot')
result5  # results for the USE model with dot-product ranking

{'MRR': 0.4370765734381739,
 'nDCG': 0.5625041678073792,
 'Precision at 3': 0.18682193689658605,
 'Recall at 3': 0.5111077933711556,
 'MAP': 0.43218433547511875}

In [45]:
# Evaluate the USE kNN rankings. NOTE: rank_passages_use_knn was applied with its default k=5,
# so each ranking holds at most 5 indices and MRR/nDCG/MAP are computed on a truncated list.
result6 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_use_knn')
result6  # results for the USE model with kNN ranking

{'MRR': 0.4062390099863973,
 'nDCG': 0.4822953337396435,
 'Precision at 3': 0.1868551142961415,
 'Recall at 3': 0.5112073255698218,
 'MAP': 0.3948263992568262}

In [46]:
# Evaluate the BM25 rankings. NOTE: rank_passages_use_bm25 scores tokens only and never calls the USE model.
result7 = evaluate_metrics(df, relevance_col='is_selected', rank_col='ranked_passages_use_bm25')
result7  # results for BM25 ranking (stored under the USE column name)

{'MRR': 0.4256574654678567,
 'nDCG': 0.5534450098034668,
 'Precision at 3': 0.18320560034504493,
 'Recall at 3': 0.5027022991937892,
 'MAP': 0.41996561399517507}