In [12]:
from datasets import load_dataset
import pandas as pd
import nltk
import torch
from nltk.corpus import stopwords
from sklearn.metrics import average_precision_score
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import time
import numpy as np

In [13]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the WikiQA dataset
dataset = load_dataset("wiki_qa")
train_data = dataset['train']
valid_data = dataset['validation']
test_data = dataset['test']

# Create DataFrames
train_df = pd.DataFrame({'question': train_data['question'], 'document': train_data['document_title'], 'answer': train_data['answer'], 'label': train_data['label']})
valid_df = pd.DataFrame({'question': valid_data['question'], 'document': valid_data['document_title'], 'answer': valid_data['answer'], 'label': valid_data['label']})
test_df = pd.DataFrame({'question': test_data['question'], 'document': test_data['document_title'], 'answer': test_data['answer'], 'label': test_data['label']})

# Display sample data
print(train_df.head())
print(valid_df.head())
print(test_df.head())

  0%|          | 0/3 [00:00<?, ?it/s]

                        question      document  \
0  how are glacier caves formed?  Glacier cave   
1  how are glacier caves formed?  Glacier cave   
2  how are glacier caves formed?  Glacier cave   
3  how are glacier caves formed?  Glacier cave   
4  how are glacier caves formed?  Glacier cave   

                                              answer  label  
0  A partly submerged glacier cave on Perito More...      0  
1          The ice facade is approximately 60 m high      0  
2          Ice formations in the Titlis glacier cave      0  
3  A glacier cave is a cave formed within the ice...      1  
4  Glacier caves are often called ice caves , but...      0  
                                      question          document  \
0  How are epithelial tissues joined together?  Tissue (biology)   
1  How are epithelial tissues joined together?  Tissue (biology)   
2  How are epithelial tissues joined together?  Tissue (biology)   
3  How are epithelial tissues joined together?  Tissue 

In [14]:
import re
from nltk.tokenize import word_tokenize

# Text preprocessing
nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Apply text preprocessing to the DataFrames
train_df['question'] = train_df['question'].apply(preprocess_text)
train_df['answer'] = train_df['answer'].apply(preprocess_text)

valid_df['question'] = valid_df['question'].apply(preprocess_text)
valid_df['answer'] = valid_df['answer'].apply(preprocess_text)

test_df['question'] = test_df['question'].apply(preprocess_text)
test_df['answer'] = test_df['answer'].apply(preprocess_text)

# Display sample preprocessed data
print(train_df.head())
print(valid_df.head())
print(test_df.head())

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
               question      document  \
0  glacier caves formed  Glacier cave   
1  glacier caves formed  Glacier cave   
2  glacier caves formed  Glacier cave   
3  glacier caves formed  Glacier cave   
4  glacier caves formed  Glacier cave   

                                              answer  label  
0  partly submerged glacier cave perito moreno gl...      0  
1                      ice facade approximately high      0  
2                 ice formations titlis glacier cave      0  
3        glacier cave cave formed within ice glacier      1  
4  glacier caves often called ice caves term prop...      0  
                             question          document  \
0  epithelial tissues joined together  Tissue (biology)   
1  epithelial tissues j

In [15]:
from transformers import  BertTokenizer, AdamW

# Tokenize and encode the text data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(data, max_length=128):
    encodings = tokenizer(data['question'].tolist(), data['answer'].tolist(), truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
    return encodings

# Tokenize the data
train_tokenized = tokenize_data(train_df)
valid_tokenized = tokenize_data(valid_df)
test_tokenized = tokenize_data(test_df)

# Convert labels to tensors
train_labels = torch.tensor(train_df['label'].values)
valid_labels = torch.tensor(valid_df['label'].values)
test_labels = torch.tensor(test_df['label'].values)

In [16]:
from transformers import  BertModel
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


# Define the dataset
class QAClassificationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['label'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets and dataloaders
train_dataset = QAClassificationDataset(train_tokenized, train_labels)
valid_dataset = QAClassificationDataset(valid_tokenized, valid_labels)
test_dataset = QAClassificationDataset(test_tokenized, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Model training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fine-tuned BERT for sequence classification
model_classifier = BertModel.from_pretrained("bert-base-uncased", num_labels=2).to(device)
optimizer = AdamW(model_classifier.parameters(), lr=5e-5)

# Training function
def train_classification_model(model, train_loader, optimizer, num_epochs=3):
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for batch in train_loader:
            inputs = {key: value.to(device) for key, value in batch.items() if key != 'label'}
            labels = batch['label'].to(device)

            outputs = model(**inputs)
            logits = outputs.last_hidden_state[:, 0, :]  # Extract the [CLS] token representation
            loss = criterion(logits, labels)

            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        average_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}")


# Train the classification model
train_classification_model(model_classifier, train_loader, optimizer)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['label'] = torch.tensor(self.labels[idx])


Epoch 1/3, Loss: 0.23966432761918746
Epoch 2/3, Loss: 0.1612970909014324
Epoch 3/3, Loss: 0.10100924557298815


In [22]:
# Save the trained model
model_save_path = "/kaggle/working/"
model_classifier.save_pretrained(model_save_path)

In [18]:
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_classification_model(model, data_loader):
    model.eval()
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in data_loader:
            inputs = {key: value.to(device) for key, value in batch.items() if key != 'label'}
            labels = batch['label'].to(device)

            outputs = model(**inputs)
            logits = outputs.last_hidden_state[:, 0, :]  # Extract the [CLS] token representation
            predictions = torch.argmax(logits, dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())

    return all_labels, all_predictions

# Evaluate the model on the test data
test_labels, test_predictions = evaluate_classification_model(model_classifier, test_loader)

# Calculate evaluation metrics
average_precision = average_precision_score(test_labels, test_predictions)
print(f"Average Precision: {average_precision}")


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Assuming test_labels and test_predictions represent binary relevance labels (1 for relevant, 0 for not relevant)

# Calculate additional classification metrics
accuracy = accuracy_score(test_labels, test_predictions)
precision = precision_score(test_labels, test_predictions)
recall = recall_score(test_labels, test_predictions)
f1 = f1_score(test_labels, test_predictions)

# Print the metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

# Confusion Matrix
conf_matrix = confusion_matrix(test_labels, test_predictions)
print("\nConfusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(test_labels, test_predictions)
print("\nClassification Report:")
print(class_report)



  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['label'] = torch.tensor(self.labels[idx])


Average Precision: 0.10577967167272476
Accuracy: 0.9347931873479318
Precision: 0.2845849802371542
Recall: 0.24573378839590443
F1-score: 0.26373626373626374

Confusion Matrix:
[[5691  181]
 [ 221   72]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      5872
           1       0.28      0.25      0.26       293

    accuracy                           0.93      6165
   macro avg       0.62      0.61      0.61      6165
weighted avg       0.93      0.93      0.93      6165



In [19]:
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import time
import numpy as np

# Replace "your_model_path" with the actual path where your fine-tuned model is saved
fine_tuned_model_path = "/kaggle/working/"

# Load the fine-tuned BERT model and tokenizer
fine_tuned_model = BertModel.from_pretrained(fine_tuned_model_path)
# Load the standard BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Move the model to the device
fine_tuned_model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [20]:
import torch

def get_bert_embedding(text, model, tokenizer, device):
    # Tokenize input text
    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    
    # Move the input to the device
    for key in tokens:
        tokens[key] = tokens[key].to(device)

    # Get the model output
    with torch.no_grad():
        output = model(**tokens)

    # Extract the embeddings from the model output
    embeddings = output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

    return embeddings


# Apply fine-tuned BERT embedding to the DataFrames
train_df['question_embedding'] = train_df['question'].apply(lambda x: get_bert_embedding(x, fine_tuned_model, tokenizer, device))
train_df['document_embedding'] = train_df['document'].apply(lambda x: get_bert_embedding(x, fine_tuned_model, tokenizer, device))

valid_df['question_embedding'] = valid_df['question'].apply(lambda x: get_bert_embedding(x, fine_tuned_model, tokenizer, device))
valid_df['document_embedding'] = valid_df['document'].apply(lambda x: get_bert_embedding(x, fine_tuned_model, tokenizer, device))

test_df['question_embedding'] = test_df['question'].apply(lambda x: get_bert_embedding(x, fine_tuned_model, tokenizer, device))
test_df['document_embedding'] = test_df['document'].apply(lambda x: get_bert_embedding(x, fine_tuned_model, tokenizer, device))

# Display sample preprocessed data with fine-tuned BERT embeddings
print(train_df.head())
print(valid_df.head())
print(test_df.head())


               question      document  \
0  glacier caves formed  Glacier cave   
1  glacier caves formed  Glacier cave   
2  glacier caves formed  Glacier cave   
3  glacier caves formed  Glacier cave   
4  glacier caves formed  Glacier cave   

                                              answer  label  \
0  partly submerged glacier cave perito moreno gl...      0   
1                      ice facade approximately high      0   
2                 ice formations titlis glacier cave      0   
3        glacier cave cave formed within ice glacier      1   
4  glacier caves often called ice caves term prop...      0   

                                  question_embedding  \
0  [10.352801, 7.9546432, -0.21673298, -0.1958606...   
1  [10.352801, 7.9546432, -0.21673298, -0.1958606...   
2  [10.352801, 7.9546432, -0.21673298, -0.1958606...   
3  [10.352801, 7.9546432, -0.21673298, -0.1958606...   
4  [10.352801, 7.9546432, -0.21673298, -0.1958606...   

                                  doc

In [21]:
# Function to perform semantic search on a query using cosine similarity with fine-tuned BERT embeddings
def semantic_search_fine_tuned_bert(query_embedding, document_embeddings):
    similarities = cosine_similarity([query_embedding], document_embeddings)[0]
    return similarities

# Perform semantic search for 100 queries using fine-tuned BERT embeddings
num_queries = 100
map_scores_fine_tuned = []
elapsed_times_fine_tuned = []

for i in range(num_queries):
    # Randomly select a query from the test set
    query_row = test_df.sample(1).iloc[0]
    query_embedding = get_bert_embedding(query_row['question'], fine_tuned_model, tokenizer, device)
    
    # Perform semantic search and measure time elapsed
    start_time = time.time()
    predictions = semantic_search_fine_tuned_bert(query_embedding, np.vstack(test_df['document_embedding'].values))
    elapsed_time = time.time() - start_time
    
    # Calculate MAP for the query
    true_labels = test_df['label'].values
    map_score = average_precision_score(true_labels, predictions)
    
    # Append results to lists
    map_scores_fine_tuned.append(map_score)
    elapsed_times_fine_tuned.append(elapsed_time)

# Calculate average MAP and average time elapsed for fine-tuned BERT
average_map_fine_tuned = np.mean(map_scores_fine_tuned)
average_time_elapsed_fine_tuned = np.mean(elapsed_times_fine_tuned)

# Display results for fine-tuned BERT
print(f"Average MAP for {num_queries} queries using fine-tuned BERT: {average_map_fine_tuned}")
print(f"Average time elapsed for each query using fine-tuned BERT: {average_time_elapsed_fine_tuned} seconds")

Average MAP for 100 queries using fine-tuned BERT: 0.050003662531048355
Average time elapsed for each query using fine-tuned BERT: 0.05123048543930054 seconds


In [30]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the WikiQA dataset
dataset = load_dataset("wiki_qa")
train_df = pd.DataFrame({'question': dataset['train']['question'], 'document': dataset['train']['document_title'], 'answer': dataset['train']['answer'], 'label': dataset['train']['label']})
valid_df = pd.DataFrame({'question': dataset['validation']['question'], 'document': dataset['validation']['document_title'], 'answer': dataset['validation']['answer'], 'label': dataset['validation']['label']})
test_df = pd.DataFrame({'question': dataset['test']['question'], 'document': dataset['test']['document_title'], 'answer': dataset['test']['answer'], 'label': dataset['test']['label']})

# Apply BERT embedding to the DataFrames
train_df['question_embedding'] = train_df['question'].apply(lambda x: get_bert_embedding(x, model, tokenizer, device))
train_df['answer_embedding'] = train_df['answer'].apply(lambda x: get_bert_embedding(x, model, tokenizer, device))

valid_df['question_embedding'] = valid_df['question'].apply(lambda x: get_bert_embedding(x, model, tokenizer, device))
valid_df['answer_embedding'] = valid_df['answer'].apply(lambda x: get_bert_embedding(x, model, tokenizer, device))

test_df['question_embedding'] = test_df['question'].apply(lambda x: get_bert_embedding(x, model, tokenizer, device))
test_df['answer_embedding'] = test_df['answer'].apply(lambda x: get_bert_embedding(x, model, tokenizer, device))

# Concatenate the DataFrames
df = pd.concat([train_df, valid_df, test_df], ignore_index=True)

# Display sample data
print(df.head())

from sklearn.metrics.pairwise import cosine_similarity

def search(query, df, model, tokenizer, all_questions_embeddings, all_answers_embeddings):
    # Get BERT embedding for the query
    query_embedding = get_bert_embedding(query, model, tokenizer, device)
    
    # Calculate cosine similarity with all question embeddings
    question_similarities = cosine_similarity([query_embedding], all_questions_embeddings)[0]
    
    # Calculate cosine similarity with all answer embeddings
    answer_similarities = cosine_similarity([query_embedding], all_answers_embeddings)[0]
    
    # Combine similarities from both question and answer embeddings
    combined_similarities = (question_similarities + answer_similarities) / 2
    
    # Get indices of the top 3 hits
    top_indices = np.argsort(combined_similarities)[::-1][:3]
    
    # Display the top 3 hits and their similarity scores
    print(f"Query: {query}\n")
    print("Top 3 Hits:")
    for i, idx in enumerate(top_indices, 1):
        print(f"{i}. Answer: {df.iloc[idx]['answer']}")
        print(f"   Similarity Score: {combined_similarities[idx]:.4f}\n")

# Extract all question and answer embeddings from the entire dataset
all_questions_embeddings = np.vstack(df['question_embedding'].values)
all_answers_embeddings = np.vstack(df['answer_embedding'].values)

# Example usage:
query_example = "What is the capital of the United States?"
search(query_example, df, model, tokenizer, all_questions_embeddings, all_answers_embeddings)



  0%|          | 0/3 [00:00<?, ?it/s]

                        question      document  \
0  how are glacier caves formed?  Glacier cave   
1  how are glacier caves formed?  Glacier cave   
2  how are glacier caves formed?  Glacier cave   
3  how are glacier caves formed?  Glacier cave   
4  how are glacier caves formed?  Glacier cave   

                                              answer  label  \
0  A partly submerged glacier cave on Perito More...      0   
1          The ice facade is approximately 60 m high      0   
2          Ice formations in the Titlis glacier cave      0   
3  A glacier cave is a cave formed within the ice...      1   
4  Glacier caves are often called ice caves , but...      0   

                                  question_embedding  \
0  [0.32571954, 0.11772122, 0.007693596, 0.038442...   
1  [0.32571954, 0.11772122, 0.007693596, 0.038442...   
2  [0.32571954, 0.11772122, 0.007693596, 0.038442...   
3  [0.32571954, 0.11772122, 0.007693596, 0.038442...   
4  [0.32571954, 0.11772122, 0.007693596,