In [1]:
# import faiss
import sys
sys.path.append('..')
import math

import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

import os
# initialize tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
from sklearn.decomposition import TruncatedSVD


# ================================
# Initialization
# ================================

model = SentenceTransformer("all-MiniLM-L6-v2")

In [2]:
df = pd.read_csv("../data/questions.csv")


In [3]:
df['title'] .unique()

array(['University_of_Notre_Dame', 'Beyoncé', 'Montana', 'Genocide',
       'Antibiotics', 'Frédéric_Chopin',
       'Sino-Tibetan_relations_during_the_Ming_dynasty', 'IPod',
       'The_Legend_of_Zelda:_Twilight_Princess', 'Spectre_(2015_film)',
       '2008_Sichuan_earthquake', 'New_York_City',
       'To_Kill_a_Mockingbird', 'Solar_energy', 'Tajikistan',
       'Anthropology', 'Portugal', 'Kanye_West', 'Buddhism',
       'American_Idol', 'Dog', '2008_Summer_Olympics_torch_relay',
       'Alfred_North_Whitehead', 'Financial_crisis_of_2007%E2%80%9308',
       'Saint_Barth%C3%A9lemy', 'Genome', 'Comprehensive_school',
       'Republic_of_the_Congo', 'Prime_minister',
       'Institute_of_technology', 'Wayback_Machine', 'Dutch_Republic',
       'Symbiosis', 'Canadian_Armed_Forces', 'Cardinal_(Catholicism)',
       'Iranian_languages', 'Lighting',
       'Separation_of_powers_under_the_United_States_Constitution',
       'Architecture', 'Human_Development_Index', 'Southern_Europe',
     

In [4]:
df[df['title'] == 'Dog']

Unnamed: 0,id,title,context,question,answers
7753,56d47d7d2ccc5a1400d8314e,Dog,The domestic dog (Canis lupus familiaris or Ca...,What is the three word Latin name for domestic...,"{'text': ['Canis lupus familiaris'], 'answer_s..."
7754,56d99788dc89441400fdb580,Dog,The domestic dog (Canis lupus familiaris or Ca...,What is Canis familiaris?,"{'text': ['domestic dog'], 'answer_start': [4]}"
7755,56d99788dc89441400fdb581,Dog,The domestic dog (Canis lupus familiaris or Ca...,How long has the domestic dog been selectively...,"{'text': ['millennia'], 'answer_start': [122]}"
7756,56d99788dc89441400fdb582,Dog,The domestic dog (Canis lupus familiaris or Ca...,Along with various behaviors and physical attr...,"{'text': ['sensory capabilities'], 'answer_sta..."
7757,56d4a7a72ccc5a1400d83168,Dog,Although initially thought to have originated ...,What decade had significant studies of dog gen...,"{'text': ['2010s'], 'answer_start': [212]}"
...,...,...,...,...,...
95194,56d9b546dc89441400fdb715_aug,Dog,"In developing countries, the majority of dogs ...",Dog cognition has been studied on what kind of...,"{'text': ['pet dogs living in human homes.'], ..."
95195,56d9c243dc89441400fdb7aa_aug,Dog,"Wolves, and their dog descendants, would have ...",What would wolves have gotten from living with...,"{'text': ['significant benefits'], 'answer_sta..."
95196,56d9c649dc89441400fdb7e3_aug,Dog,Dogs and humans might have been able to coexis...,What has likely led to human success?,"{'text': ['the domestication of dogs'], 'answe..."
95197,56d9d357dc89441400fdb860_aug,Dog,There is mixed scientific evidence as to wheth...,Studies that people are better off with dogs h...,"{'text': ['poorly controlled'], 'answer_start'..."


In [5]:
# df = df.head(10000)
# df = df[df['title'] == "New_York_City"].copy()
df = df[df['title'] == 'Dog'].copy()
df = df.reset_index(drop=True)
titles = df["title"].tolist()
questions = df["question"].tolist()

In [6]:
question_embeddings_sbert = model.encode(questions, show_progress_bar=True)



Batches:   0%|          | 0/25 [00:00<?, ?it/s]

In [7]:
df

Unnamed: 0,id,title,context,question,answers
0,56d47d7d2ccc5a1400d8314e,Dog,The domestic dog (Canis lupus familiaris or Ca...,What is the three word Latin name for domestic...,"{'text': ['Canis lupus familiaris'], 'answer_s..."
1,56d99788dc89441400fdb580,Dog,The domestic dog (Canis lupus familiaris or Ca...,What is Canis familiaris?,"{'text': ['domestic dog'], 'answer_start': [4]}"
2,56d99788dc89441400fdb581,Dog,The domestic dog (Canis lupus familiaris or Ca...,How long has the domestic dog been selectively...,"{'text': ['millennia'], 'answer_start': [122]}"
3,56d99788dc89441400fdb582,Dog,The domestic dog (Canis lupus familiaris or Ca...,Along with various behaviors and physical attr...,"{'text': ['sensory capabilities'], 'answer_sta..."
4,56d4a7a72ccc5a1400d83168,Dog,Although initially thought to have originated ...,What decade had significant studies of dog gen...,"{'text': ['2010s'], 'answer_start': [212]}"
...,...,...,...,...,...
765,56d9b546dc89441400fdb715_aug,Dog,"In developing countries, the majority of dogs ...",Dog cognition has been studied on what kind of...,"{'text': ['pet dogs living in human homes.'], ..."
766,56d9c243dc89441400fdb7aa_aug,Dog,"Wolves, and their dog descendants, would have ...",What would wolves have gotten from living with...,"{'text': ['significant benefits'], 'answer_sta..."
767,56d9c649dc89441400fdb7e3_aug,Dog,Dogs and humans might have been able to coexis...,What has likely led to human success?,"{'text': ['the domestication of dogs'], 'answe..."
768,56d9d357dc89441400fdb860_aug,Dog,There is mixed scientific evidence as to wheth...,Studies that people are better off with dogs h...,"{'text': ['poorly controlled'], 'answer_start'..."


# Find the most relevant questions.

Manually go and check and find which is the closest question that can be used to test precision and recall.

In [8]:
# for this run I found 
query = "do does die eating chocolate"
relevant_indices = set([87, 475])

# SBERT

In [9]:


# New query
new_query = "do does die while eating choclate"
new_query_embedding = model.encode(new_query)

# Cosine similarity
cosine_scores = util.cos_sim(new_query_embedding, question_embeddings_sbert)[0]
top5_indices_sbert = cosine_scores.argsort(descending=True)[:10]

for i in top5_indices_sbert:
    print(f"Index {i}")
    print(f"Title: {titles[i]}")
    print(f"Question: {questions[i]}")
    print(f"Similarity: {cosine_scores[i].item()}")
    print("-" * 50)


Index 87
Title: Dog
Question: What is the chemical in chocolate that is poisonous to dogs?
Similarity: 0.3498760759830475
--------------------------------------------------
Index 475
Title: Dog
Question: What is the chemical in chocolate that is poisonous to dogs?
Similarity: 0.3498760759830475
--------------------------------------------------
Index 478
Title: Dog
Question: What form of chocolate is especially toxic to dogs?
Similarity: 0.3422899544239044
--------------------------------------------------
Index 90
Title: Dog
Question: What form of chocolate is especially toxic to dogs?
Similarity: 0.3422899544239044
--------------------------------------------------
Index 679
Title: Dog
Question: When is the Korean dog recipe usually eaten?
Similarity: 0.3003668785095215
--------------------------------------------------
Index 297
Title: Dog
Question: When is the Korean dog recipe usually eaten?
Similarity: 0.3003668785095215
--------------------------------------------------
Index 84

# TF-IDF

In [10]:
question_embeddings_tfidf = tfidf_vectorizer.fit_transform(questions).toarray()
svd = TruncatedSVD(n_components=2)
reduced_embeddings_tfidf = svd.fit_transform(question_embeddings_tfidf)

In [11]:

# New query
new_query = [query]
new_query_tfidf = tfidf_vectorizer.transform(new_query)

# Cosine similarity without SVD
similarities = cosine_similarity(new_query_tfidf, question_embeddings_tfidf)[0]
top5_indices_tfidf = similarities.argsort()[-5:][::-1]

for i in top5_indices_tfidf:
    print(f"Title: {titles[i]}")
    print(f"Question: {questions[i]}")
    print(f"Similarity: {similarities[i]}")
    print("-" * 50)


Title: Dog
Question: When did Bluey die?
Similarity: 0.27625703834659054
--------------------------------------------------
Title: Dog
Question: When did Bluey die?
Similarity: 0.27625703834659054
--------------------------------------------------
Title: Dog
Question: What is the chemical in chocolate that is poisonous to dogs?
Similarity: 0.24812316443779114
--------------------------------------------------
Title: Dog
Question: What is the chemical in chocolate that is poisonous to dogs?
Similarity: 0.24812316443779114
--------------------------------------------------
Title: Dog
Question: What form of chocolate is especially toxic to dogs?
Similarity: 0.2439169087802441
--------------------------------------------------


# BM25

In [12]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

# Tokenize all questions
tokenized_corpus = [word_tokenize(q.lower()) for q in questions]

# Build BM25 index
bm25 = BM25Okapi(tokenized_corpus)

# New query
new_query = "do does die eating chocolate"
tokenized_query = word_tokenize(new_query.lower())

# Get top 5 results
scores = bm25.get_scores(tokenized_query)
top5_indices_bm25 = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:5]

# Display
for i in top5_indices_bm25:
    print(f"Title: {titles[i]}")
    print(f"Question: {questions[i]}")
    print(f"Score: {scores[i]}")
    print("-" * 50)


Title: Dog
Question: Dogs do not require a very high level of what when eating?
Score: 6.7815415195816175
--------------------------------------------------
Title: Dog
Question: Dogs do not require a very high level of what when eating?
Score: 6.7815415195816175
--------------------------------------------------
Title: Dog
Question: When did Bluey die?
Score: 6.520119656594554
--------------------------------------------------
Title: Dog
Question: When did Bluey die?
Score: 6.520119656594554
--------------------------------------------------
Title: Dog
Question: What form of chocolate is especially toxic to dogs?
Score: 5.631209559413964
--------------------------------------------------


# Compare Results

**IMPORTANT**: Since i am using precision@5 but i only have 2 relvant question. The maximum possible precision@5 is 0.4, which is just $\frac{2}{5}$. So if the precision is 0.4, it means that the model is perfect. If the precision is 0.2, it means that the model is not good at all.

TF-IDF is impossible to run so i used a subset of data which only includes questions for *dogs*. Since, we already know BM25 and TF-IDF does not perform well, it might not be able to pick anything. The output here is slightly different from the one in the presentation becuase i used a different randomb subset but the results are similar.

> This can be improved in the future by using more number of relevant questions. But for now, this is the best we can do.

In [None]:

def precision_at_k(top_k, relevant):
    return len(set(top_k) & relevant) / len(top_k)

def recall_at_k(top_k, relevant):
    return len(set(top_k) & relevant) / len(relevant)

def average_precision(top_k, relevant):
    score = 0
    hits = 0
    for i, doc_id in enumerate(top_k, 1):
        if doc_id in relevant:
            hits += 1
            score += hits / i
    return score / len(relevant) if relevant else 0

def dcg(relevance_list):
    return sum((1 / math.log2(i + 2)) for i, rel in enumerate(relevance_list) if rel)

def ndcg_at_k(top_k, relevant):
    relevance_list = [1 if doc in relevant else 0 for doc in top_k]
    ideal_list = sorted(relevance_list, reverse=True)
    return dcg(relevance_list) / dcg(ideal_list) if dcg(ideal_list) > 0 else 0

def evaluate_all(tfidf_top, bm25_top, sbert_top, relevant, k=5):
    # Ensure integer indices (especially for SBERT if tensor or numpy)
    tfidf_top = [int(i) for i in tfidf_top[:k]]
    bm25_top = [int(i) for i in bm25_top[:k]]
    sbert_top = [int(i) for i in sbert_top[:k]]

    methods = {
        "TF-IDF": tfidf_top,
        "BM25": bm25_top,
        "SBERT": sbert_top,
    }

    print("SBERT Top-5 Indices:", sbert_top)
    print("Relevant Indices:", relevant)
    print("Intersection:", set(sbert_top) & relevant)
    print()

    for method, top_k in methods.items():
        print(f"{method}:")
        print("  Precision@k:", precision_at_k(top_k, relevant))
        print("  Recall@k:   ", recall_at_k(top_k, relevant))
        print("  MAP:        ", average_precision(top_k, relevant))
        print("  nDCG@k:     ", ndcg_at_k(top_k, relevant))
        print("-" * 40)


evaluate_all(top5_indices_tfidf, top5_indices_bm25, top5_indices_sbert, relevant_indices, k=5)


SBERT Top-5 Indices: [87, 475, 478, 90, 679]
Relevant Indices: {475, 87}
Intersection: {475, 87}

TF-IDF:
  Precision@k: 0.4
  Recall@k:    1.0
  MAP:         0.41666666666666663
  nDCG@k:      0.5706417189553201
----------------------------------------
BM25:
  Precision@k: 0.0
  Recall@k:    0.0
  MAP:         0.0
  nDCG@k:      0
----------------------------------------
SBERT:
  Precision@k: 0.4
  Recall@k:    1.0
  MAP:         1.0
  nDCG@k:      1.0
----------------------------------------
