In [70]:
from datasets import load_dataset
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import average_precision_score
import time


In [71]:
# Load the WikiQA dataset
dataset = load_dataset("wiki_qa")
train_data = dataset['train']
valid_data = dataset['validation']
test_data = dataset['test']

# Create DataFrames
train_df = pd.DataFrame({'question': train_data['question'], 'document': train_data['document_title'], 'answer': train_data['answer'], 'label': train_data['label']})
valid_df = pd.DataFrame({'question': valid_data['question'], 'document': valid_data['document_title'], 'answer': valid_data['answer'], 'label': valid_data['label']})
test_df = pd.DataFrame({'question': test_data['question'], 'document': test_data['document_title'], 'answer': test_data['answer'], 'label': test_data['label']})

# Display sample data
print(train_df.head())
print(valid_df.head())
print(test_df.head())

  0%|          | 0/3 [00:00<?, ?it/s]

                        question      document  \
0  how are glacier caves formed?  Glacier cave   
1  how are glacier caves formed?  Glacier cave   
2  how are glacier caves formed?  Glacier cave   
3  how are glacier caves formed?  Glacier cave   
4  how are glacier caves formed?  Glacier cave   

                                              answer  label  
0  A partly submerged glacier cave on Perito More...      0  
1          The ice facade is approximately 60 m high      0  
2          Ice formations in the Titlis glacier cave      0  
3  A glacier cave is a cave formed within the ice...      1  
4  Glacier caves are often called ice caves , but...      0  
                                      question          document  \
0  How are epithelial tissues joined together?  Tissue (biology)   
1  How are epithelial tissues joined together?  Tissue (biology)   
2  How are epithelial tissues joined together?  Tissue (biology)   
3  How are epithelial tissues joined together?  Tissue 

In [72]:
# Text preprocessing
nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Apply text preprocessing to the DataFrames
train_df['question'] = train_df['question'].apply(preprocess_text)
train_df['document'] = train_df['document'].apply(preprocess_text)

valid_df['question'] = valid_df['question'].apply(preprocess_text)
valid_df['document'] = valid_df['document'].apply(preprocess_text)

test_df['question'] = test_df['question'].apply(preprocess_text)
test_df['document'] = test_df['document'].apply(preprocess_text)

# Display sample preprocessed data
print(train_df.head())
print(valid_df.head())
print(test_df.head())

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
               question      document  \
0  glacier caves formed  glacier cave   
1  glacier caves formed  glacier cave   
2  glacier caves formed  glacier cave   
3  glacier caves formed  glacier cave   
4  glacier caves formed  glacier cave   

                                              answer  label  
0  A partly submerged glacier cave on Perito More...      0  
1          The ice facade is approximately 60 m high      0  
2          Ice formations in the Titlis glacier cave      0  
3  A glacier cave is a cave formed within the ice...      1  
4  Glacier caves are often called ice caves , but...      0  
                             question        document  \
0  epithelial tissues joined together  tissue biology   
1  epithelial tissues joine

In [73]:
pip install -U sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [74]:
# Load a pre-trained SentenceTransformer model
model = SentenceTransformer('paraphrase-distilroberta-base-v2')

# Tokenize and encode the questions and documents
embedding_dim = 768  # Update with the actual embedding dimension
train_questions_embeddings = model.encode(train_df['question'].tolist(), convert_to_tensor=True)
train_documents_embeddings = model.encode(train_df['document'].tolist(), convert_to_tensor=True)

valid_questions_embeddings = model.encode(valid_df['question'].tolist(), convert_to_tensor=True)
valid_documents_embeddings = model.encode(valid_df['document'].tolist(), convert_to_tensor=True)

test_questions_embeddings = model.encode(test_df['question'].tolist(), convert_to_tensor=True)
test_documents_embeddings = model.encode(test_df['document'].tolist(), convert_to_tensor=True)


Batches:   0%|          | 0/637 [00:00<?, ?it/s]

Batches:   0%|          | 0/637 [00:00<?, ?it/s]

Batches:   0%|          | 0/86 [00:00<?, ?it/s]

Batches:   0%|          | 0/86 [00:00<?, ?it/s]

Batches:   0%|          | 0/193 [00:00<?, ?it/s]

Batches:   0%|          | 0/193 [00:00<?, ?it/s]

In [91]:

# Function to calculate cosine similarity
def calculate_cosine_similarity(query_embeddings, document_embeddings):
    # Normalize embeddings along the second dimension (dim=1)
    query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
    document_embeddings = F.normalize(document_embeddings, p=2, dim=1)

    # Calculate cosine similarity
    similarity_scores = torch.matmul(query_embeddings, document_embeddings.T)
    return similarity_scores.numpy()

# Function to calculate Mean Average Precision (MAP)
def calculate_map(y_true, y_scores):
    sorted_indices = np.argsort(y_scores)[::-1]
    num_correct = 0
    precision_at_k = 0
    y_true_array = np.array(y_true)  # Convert y_true to a NumPy array
    for i, idx in enumerate(sorted_indices):
        num_correct += y_true_array[int(idx)].astype(int)
        precision_at_k += num_correct / (i + 1)

    if num_correct == 0:
        return 0

    return precision_at_k / num_correct

# Assuming you want to calculate MAP for the first 100 queries
num_queries = 100
valid_query_embeddings = torch.randn((len(valid_df), embedding_dim))[:num_queries]  # Replace with actual embeddings
valid_document_embeddings = torch.randn((len(valid_df), embedding_dim))[:num_queries]  # Replace with actual embeddings
valid_labels = valid_df['label'].tolist()[:num_queries]

# Calculate cosine similarities
cosine_similarities_matrix = calculate_cosine_similarity(valid_query_embeddings, valid_document_embeddings)

# Calculate MAP for each query
map_scores = []
for i in range(len(valid_query_embeddings)):
    query_scores = cosine_similarities_matrix[i]
    map_score = calculate_map(valid_labels, query_scores)
    map_scores.append(map_score)

# Calculate the overall MAP score
overall_map_score = np.mean(map_scores)
print(f"Overall MAP Score for {num_queries} queries: {overall_map_score}")

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time for evaluation: {elapsed_time} seconds")


Overall MAP Score for 100 queries: 0.9615902569915136
Elapsed time for evaluation: 3340.1074998378754 seconds
