In [34]:
from datasets import load_dataset
import pandas as pd
import nltk
import torch
from nltk.corpus import stopwords
from sklearn.metrics import average_precision_score
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import time
import numpy as np

In [35]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the WikiQA dataset
dataset = load_dataset("wiki_qa")
train_data = dataset['train']
valid_data = dataset['validation']
test_data = dataset['test']

# Create DataFrames
train_df = pd.DataFrame({'question': train_data['question'], 'document': train_data['document_title'], 'answer': train_data['answer'], 'label': train_data['label']})
valid_df = pd.DataFrame({'question': valid_data['question'], 'document': valid_data['document_title'], 'answer': valid_data['answer'], 'label': valid_data['label']})
test_df = pd.DataFrame({'question': test_data['question'], 'document': test_data['document_title'], 'answer': test_data['answer'], 'label': test_data['label']})

# Display sample data
print(train_df.head())
print(valid_df.head())
print(test_df.head())

  0%|          | 0/3 [00:00<?, ?it/s]

                        question      document  \
0  how are glacier caves formed?  Glacier cave   
1  how are glacier caves formed?  Glacier cave   
2  how are glacier caves formed?  Glacier cave   
3  how are glacier caves formed?  Glacier cave   
4  how are glacier caves formed?  Glacier cave   

                                              answer  label  
0  A partly submerged glacier cave on Perito More...      0  
1          The ice facade is approximately 60 m high      0  
2          Ice formations in the Titlis glacier cave      0  
3  A glacier cave is a cave formed within the ice...      1  
4  Glacier caves are often called ice caves , but...      0  
                                      question          document  \
0  How are epithelial tissues joined together?  Tissue (biology)   
1  How are epithelial tissues joined together?  Tissue (biology)   
2  How are epithelial tissues joined together?  Tissue (biology)   
3  How are epithelial tissues joined together?  Tissue 

In [36]:
# Text preprocessing
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model.to(device)

# Function to obtain BERT embeddings for a text
def get_bert_embedding(text, model=model, tokenizer=tokenizer, device=device):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return embeddings

# Apply BERT embedding to the DataFrames
train_df['question_embedding'] = train_df['question'].apply(lambda x: get_bert_embedding(x, model, tokenizer, device))
train_df['document_embedding'] = train_df['document'].apply(lambda x: get_bert_embedding(x, model, tokenizer, device))

valid_df['question_embedding'] = valid_df['question'].apply(lambda x: get_bert_embedding(x, model, tokenizer, device))
valid_df['document_embedding'] = valid_df['document'].apply(lambda x: get_bert_embedding(x, model, tokenizer, device))

test_df['question_embedding'] = test_df['question'].apply(lambda x: get_bert_embedding(x, model, tokenizer, device))
test_df['document_embedding'] = test_df['document'].apply(lambda x: get_bert_embedding(x, model, tokenizer, device))

# Display sample preprocessed data with BERT embeddings
print(train_df.head())
print(valid_df.head())
print(test_df.head())

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
                        question      document  \
0  how are glacier caves formed?  Glacier cave   
1  how are glacier caves formed?  Glacier cave   
2  how are glacier caves formed?  Glacier cave   
3  how are glacier caves formed?  Glacier cave   
4  how are glacier caves formed?  Glacier cave   

                                              answer  label  \
0  A partly submerged glacier cave on Perito More...      0   
1          The ice facade is approximately 60 m high      0   
2          Ice formations in the Titlis glacier cave      0   
3  A glacier cave is a cave formed within the ice...      1   
4  Glacier caves are often called ice caves , but...      0   

                                  question_embedding  \
0  [0.32571954, 0.11772122, 0.007693596, 0.038442...   
1  [0.32571954, 0.11772122, 0.007693596, 0.038442...   
2  [0.32571954, 0.11772122, 

In [37]:
# Function to perform semantic search on a query using cosine similarity with BERT embeddings
def semantic_search_bert(query_embedding, document_embeddings):
    similarities = cosine_similarity([query_embedding], document_embeddings)[0]
    return similarities

# Perform semantic search for 100 queries using BERT embeddings
num_queries = 100
map_scores = []
elapsed_times = []

for i in range(num_queries):
    # Randomly select a query from the test set
    query_row = test_df.sample(1).iloc[0]
    query_embedding = get_bert_embedding(query_row['question'])
    
    # Perform semantic search and measure time elapsed
    start_time = time.time()
    predictions = semantic_search_bert(query_embedding, np.vstack(test_df['document_embedding'].values))
    elapsed_time = time.time() - start_time
    
    # Calculate MAP for the query
    true_labels = test_df['label'].values
    map_score = average_precision_score(true_labels, predictions)
    
    # Append results to lists
    map_scores.append(map_score)
    elapsed_times.append(elapsed_time)

# Calculate average MAP and average time elapsed
average_map = np.mean(map_scores)
average_time_elapsed = np.mean(elapsed_times)

# Display results
print(f"Average MAP for {num_queries} queries: {average_map}")
print(f"Average time elapsed for each query: {average_time_elapsed} seconds")

Average MAP for 100 queries: 0.05074964931327312
Average time elapsed for each query: 0.048799877166748044 seconds
