In [1]:
from datasets import load_dataset
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import average_precision_score, recall_score
from gensim.models import KeyedVectors
import torch
import torch.nn.functional as F
import time



In [2]:
# Load the WikiQA dataset
dataset = load_dataset("wiki_qa")
train_data = dataset['train']
valid_data = dataset['validation']
test_data = dataset['test']

# Create DataFrames
train_df = pd.DataFrame({'question': train_data['question'], 'document': train_data['document_title'], 'answer': train_data['answer'], 'label': train_data['label']})
valid_df = pd.DataFrame({'question': valid_data['question'], 'document': valid_data['document_title'], 'answer': valid_data['answer'], 'label': valid_data['label']})
test_df = pd.DataFrame({'question': test_data['question'], 'document': test_data['document_title'], 'answer': test_data['answer'], 'label': test_data['label']})

# Display sample data
print(train_df.head())
print(valid_df.head())
print(test_df.head())

Downloading builder script:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/830 [00:00<?, ?B/s]

Downloading and preparing dataset wiki_qa/default (download: 6.77 MiB, generated: 6.10 MiB, post-processed: Unknown size, total: 12.87 MiB) to /root/.cache/huggingface/datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c...


Downloading data:   0%|          | 0.00/7.09M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/6165 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2733 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20360 [00:00<?, ? examples/s]

Dataset wiki_qa downloaded and prepared to /root/.cache/huggingface/datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

                        question      document  \
0  how are glacier caves formed?  Glacier cave   
1  how are glacier caves formed?  Glacier cave   
2  how are glacier caves formed?  Glacier cave   
3  how are glacier caves formed?  Glacier cave   
4  how are glacier caves formed?  Glacier cave   

                                              answer  label  
0  A partly submerged glacier cave on Perito More...      0  
1          The ice facade is approximately 60 m high      0  
2          Ice formations in the Titlis glacier cave      0  
3  A glacier cave is a cave formed within the ice...      1  
4  Glacier caves are often called ice caves , but...      0  
                                      question          document  \
0  How are epithelial tissues joined together?  Tissue (biology)   
1  How are epithelial tissues joined together?  Tissue (biology)   
2  How are epithelial tissues joined together?  Tissue (biology)   
3  How are epithelial tissues joined together?  Tissue 

In [3]:
# Text preprocessing
nltk.download('stopwords')
nltk.download('punkt')

# Load pre-trained Word2Vec model (if available)
# Replace with the actual path to your Word2Vec model
word2vec_model_path = '/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True, limit=500000)  # Limit for performance

# Function to obtain Word2Vec embeddings for a text
def get_word2vec_embedding(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    tokens = [token for token in tokens if token in word2vec_model]
    if not tokens:
        return np.zeros(word2vec_model.vector_size)
    embedding = np.mean([word2vec_model[token] for token in tokens], axis=0)
    return embedding

# Apply Word2Vec embedding to the DataFrames
train_df['question_embedding'] = train_df['question'].apply(get_word2vec_embedding)
train_df['document_embedding'] = train_df['document'].apply(get_word2vec_embedding)

valid_df['question_embedding'] = valid_df['question'].apply(get_word2vec_embedding)
valid_df['document_embedding'] = valid_df['document'].apply(get_word2vec_embedding)

test_df['question_embedding'] = test_df['question'].apply(get_word2vec_embedding)
test_df['document_embedding'] = test_df['document'].apply(get_word2vec_embedding)

# Display sample preprocessed data with Word2Vec embeddings
print(train_df.head())
print(valid_df.head())
print(test_df.head())

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
                        question      document  \
0  how are glacier caves formed?  Glacier cave   
1  how are glacier caves formed?  Glacier cave   
2  how are glacier caves formed?  Glacier cave   
3  how are glacier caves formed?  Glacier cave   
4  how are glacier caves formed?  Glacier cave   

                                              answer  label  \
0  A partly submerged glacier cave on Perito More...      0   
1          The ice facade is approximately 60 m high      0   
2          Ice formations in the Titlis glacier cave      0   
3  A glacier cave is a cave formed within the ice...      1   
4  Glacier caves are often called ice caves , but...      0   

                                  question_embedding  \
0  [-0.07259115, 0.11344

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_map(predictions, true_labels):
    num_relevant_documents = sum(true_labels)
    precision_at_k = 0.0
    num_correct_predictions = 0

    for i, (score, label) in enumerate(sorted(zip(predictions, true_labels), key=lambda x: x[0], reverse=True), 1):
        if label == 1:
            num_correct_predictions += 1
            precision_at_k += num_correct_predictions / i

    if num_relevant_documents == 0:
        return 0.0

    average_precision = precision_at_k / num_relevant_documents
    return average_precision


# Function to perform semantic search on a query using cosine similarity
def semantic_search(query_embedding, document_embeddings):
    similarities = cosine_similarity([query_embedding], document_embeddings)[0]
    return similarities

# Perform semantic search for 100 queries using the pre-trained Word2Vec model
num_queries = 100
map_scores = []
elapsed_times = []

for i in range(num_queries):
    # Randomly select a query from the test set
    query_row = test_df.sample(1).iloc[0]
    query_embedding = get_word2vec_embedding(query_row['question'])
    
    # Perform semantic search and measure time elapsed
    start_time = time.time()
    predictions = semantic_search(query_embedding, test_df['document_embedding'].tolist())
    elapsed_time = time.time() - start_time
    
    # Calculate MAP for the query
    true_labels = test_df['label'].values
    map_score = calculate_map(predictions, true_labels)
    
    # Append results to lists
    map_scores.append(map_score)
    elapsed_times.append(elapsed_time)

# Calculate average MAP and average time elapsed
average_map = np.mean(map_scores)
average_time_elapsed = np.mean(elapsed_times)

# Display results
print(f"Average MAP for {num_queries} queries: {average_map}")
print(f"Average time elapsed for each query: {average_time_elapsed} seconds")


Average MAP for 100 queries: 0.048732772473409236
Average time elapsed for each query: 0.03592027425765991 seconds


In [10]:
def semantic_search_top_answers(question, query_df, document_embeddings, top_n=3):
    # Get the embedding for the given question
    query_embedding = get_word2vec_embedding(question)
    
    # Perform semantic search
    predictions = semantic_search(query_embedding, document_embeddings)
    
    # Get the indices of the top N documents based on hit scores
    top_indices = predictions.argsort()[-top_n:][::-1]
    
    # Retrieve the top N answers and their scores
    top_answers = query_df.iloc[top_indices]['answer'].tolist()
    top_scores = predictions[top_indices]
    
    return top_answers, top_scores

# Example usage:
# Allow the user to input their own question
user_question = input("Enter your question: ")

# Perform semantic search and get the top 3 answers
top_answers, top_scores = semantic_search_top_answers(user_question, test_df, test_df['document_embedding'].tolist(), top_n=3)

# Display the results with the full text of the answers
print("User Question:", user_question)
print("Top 3 Answers:")
for i, (answer, score) in enumerate(zip(top_answers, top_scores), 1):
    print(f"{i}. Answer: {answer}\n   Score: {score}\n")


Enter your question:  What is the capital of the United States?


User Question: What is the capital of the United States?
Top 3 Answers:
1. Answer: Paleoindians migrated from Asia to what is now the United States mainland around 15,000 years ago.
   Score: 0.8587633137120406

2. Answer: The United States of America (USA or U.S.A.), commonly called the United States (US or U.S.) or America, is a federal republic consisting of fifty states and a federal district .
   Score: 0.8587633137120406

3. Answer: The state of Alaska is west of Canada and east of Russia across the Bering Strait, and the state of Hawaii is in the mid-North Pacific.
   Score: 0.8587633137120406

