In [1]:
import pandas as pd
import numpy as np

In [2]:
questions = pd.read_csv(r'data/questions.csv')

In [3]:
questions.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
questions.columns

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], dtype='object')

In [5]:
questions.shape

(404351, 6)

In [6]:
data = pd.concat([questions['question1'], questions['question2']], ignore_index=True)
data = pd.Series(data).dropna().drop_duplicates()

In [7]:
data

0         What is the step by step guide to invest in sh...
1         What is the story of Kohinoor (Koh-i-Noor) Dia...
2         How can I increase the speed of my internet co...
3         Why am I mentally very lonely? How can I solve...
4         Which one dissolve in water quikly sugar, salt...
                                ...                        
808695    What will the CPU upgrade to the 2016 Apple Ma...
808696    What does Jainism say about Gays and Homosexua...
808699                                    What's this coin?
808700    I am having little hairfall problem but I want...
808701        What is it like to have sex with your cousin?
Length: 537387, dtype: object

In [8]:
data.shape

(537387,)

In [9]:
data = data[:100000].tolist()

In [10]:
data

['What is the step by step guide to invest in share market in india?',
 'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
 'How can I increase the speed of my internet connection while using a VPN?',
 'Why am I mentally very lonely? How can I solve it?',
 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?',
 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?',
 'Should I buy tiago?',
 'How can I be a good geologist?',
 'When do you use シ instead of し?',
 'Motorola (company): Can I hack my Charter Motorolla DCX3400?',
 'Method to find separation of slits using fresnel biprism?',
 'How do I read and find my YouTube comments?',
 'What can make Physics easy to learn?',
 'What was your first sexual experience like?',
 'What are the laws to change your status from a student visa to a green card in the US, how do they compare to the immigration laws in Canada?',
 'What would a Trump presidency mean for current international 

In [11]:
len(data)

100000

## Embeddings (sentence transformer)

In [12]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
model = SentenceTransformer('all-MiniLM-L6-v2')
question_emd = model.encode(data, batch_size=34, show_progress_bar=True)

Batches: 100%|██████████| 2942/2942 [06:05<00:00,  8.04it/s]


In [14]:
np.save('question_embeddings.npy',question_emd)
pd.DataFrame({'question':data}).to_csv('ques_emd.csv', index=False)

In [15]:
print("embeddings shape:", question_emd.shape)

embeddings shape: (100000, 384)


## FAISS Index

In [16]:
import faiss

In [17]:
# normalize embeddings for cosine similarity
faiss.normalize_L2(question_emd)

In [18]:
# create faiss index

dimension = question_emd.shape[1]  # 384 for MiniLM
index = faiss.IndexFlatIP(dimension)  # inner product = cosine similarity after normalization of embds

In [19]:
# add embeddings to index

index.add(question_emd)

In [20]:
# save index

faiss.write_index(index, 'faiss_index.index')
print("index size:", index.ntotal)

index size: 100000


## Test Retrieval Function

In [21]:
def search_similar_questions(query, top_k=5):

    # encode query
    query_embedding = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)

    # search
    distances, indices = index.search(query_embedding, top_k+1)

    # get results
    result = []
    for i, (idx,score) in enumerate(zip(indices[0][1:], distances[0][1:])):
        if data[idx].lower() != query.lower():
            result.append({
                'rank': len(result)+1,
                'question': data[idx],
                'similarity_score': float(score)
            }
            )

        if len(result) == top_k:
            break

    return result



In [22]:
# test

test_query = "How do I get a Job?"
result = search_similar_questions(test_query)

for i in result:
    print(f"{i['rank']}. {i['question']}  (score: {i['similarity_score']:.3f})")

1. How can I find a job?  (score: 0.890)
2. How can I apply for a job?  (score: 0.832)
3. What is the easiest way to get a job?  (score: 0.778)
4. How can I get a good job?  (score: 0.769)
5. How do I find a good job?  (score: 0.753)


## Evaluation

In [23]:
test_queries = [
    "What are the healthy breakfast options?",
    "What is the best way to lose weight?",
    "How can I make money online?",
    "What are good books to read?",
    "How do I get better at public speaking?",
    "What is the meaning of life?",
    "How can I improve my credit score?",
    "What are the symptoms of depression?",
    "What are the ways to get rid of acne?",
    "How do I reduce hairfall?",
    "How do I learn guitar?",
    "What causes anxiety?",
    "How can I save money effectively?",
    "How do I get a job in data science?",
    "How should I protect my job from recession?",
]

In [24]:
def evaluate_relevance():
    precisions = []

    for query in test_queries:
        print(f"\n{'='*60}")
        print(f"Query: {query}")
        print('='*60)
        
        results = search_similar_questions(query, top_k=5)
        relevant_count = 0
        
        for i, r in enumerate(results, 1):
            print(f"{i}. {r['question']}")
            print(f"   Similarity: {r['similarity_score']:.3f}")
            is_relevant = input("   Relevant? (y/n): ").lower() == 'y'
            relevant_count += int(is_relevant)

        precision_q = relevant_count / 5
        precisions.append(precision_q)

        print(f"\nPrecision@5 for query: {precision_q:.2f}")

    mean_precision = sum(precisions) / len(precisions)
    print(f"\n{'='*60}")
    print(f"Mean Precision@5: {mean_precision:.2%}")
    print('='*60)


In [25]:
evaluate_relevance()


Query: What are the healthy breakfast options?
1. What is best for breakfast?
   Similarity: 0.869
2. What are the best healthy indian breakfast ideas?
   Similarity: 0.828
3. What should I eat for breakfast?
   Similarity: 0.807
4. What do you eat for breakfast?
   Similarity: 0.800
5. What should I eat in breakfast?
   Similarity: 0.797

Precision@5 for query: 1.00

Query: What is the best way to lose weight?
1. Which are the best ways to lose weight?
   Similarity: 0.964
2. What are some good ways to lose weight?
   Similarity: 0.935
3. What is the best method of losing weight?
   Similarity: 0.913
4. The best way for weight loss?
   Similarity: 0.900
5. What are the best was to lose weight?
   Similarity: 0.898

Precision@5 for query: 0.60

Query: How can I make money online?
1. How do you make money online?
   Similarity: 0.965
2. What is a way to make money online?
   Similarity: 0.964
3. What are ways I can make money online?
   Similarity: 0.962
4. How can I make money online 