installing sentence BERT python library

In [None]:
!pip install sentence_transformers

Reading the duplicate questions and xml file (Similar to Assignment-3)

In [4]:
import csv
from post_parser_record import PostParserRecord

def read_tsv_test_data(file_path):
  # Takes in the file path for test file and generate a dictionary
  # of question id as the key and the list of question ids similar to it
  # as value. It also returns the list of all question ids that have
  # at least one similar question
  dic_similar_questions = {}
  lst_all_test = []
  with open(file_path) as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        question_id = int(row[0])
        lst_similar = list(map(int, row[1:]))
        dic_similar_questions[question_id] = lst_similar
        lst_all_test.append(question_id)
        lst_all_test.extend(lst_similar)
  return dic_similar_questions, lst_all_test

dic_similar_questions, lst_all_test = read_tsv_test_data("duplicate_questions.tsv")
post_reader = PostParserRecord("Posts_law.xml")

Using pre-trained Quora duplicate question to encode questions and find similar questions

In [6]:
from sentence_transformers import SentenceTransformer, util
import torch

# in question one, we are using the pre-trained model on quora with no further fine-tuning
model_name = 'distilbert-base-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)

# list of text to be indexed (encoded)
corpus = []
# this dictionary is used as key: corpus index [0, 1, 2, ...] and value: corresponding question id
index_to_question_id = {}
idx = 0

# indexing all the questions in the law stack exchange -- only using the question titles
for question_id in post_reader.map_questions:
  question = post_reader.map_questions[question_id]
  text = question.title
  q_id = question.post_id
  corpus.append(text)
  index_to_question_id[idx] = question_id
  idx += 1

# Indexing (embedding) the 
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

lst_test_question_ids = list(dic_similar_questions.keys())
top_k = 100

# Initialize variables for P@1 and MRR
total_p1 = 0
total_mrr = 0
num_questions = len(lst_test_question_ids)

for question_id in lst_test_question_ids:
  query_text = post_reader.map_questions[question_id].title
  query_embedding = model.encode(query_text, convert_to_tensor=True)

  # We use cosine-similarity and torch.topk to find the highest 5 scores
  cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
  top_results = torch.topk(cos_scores, k=top_k)

  # Calculate P@1
  top_result_ids = [index_to_question_id[int(idx)] for idx in top_results[1]]
  if question_id in top_result_ids[:1]:
    total_p1 += 1

    # Calculate MRR
  for rank, idx in enumerate(top_results[1]):
    index = int(idx)
    if question_id == index_to_question_id[index]:
      total_mrr += 1.0 / (rank + 1)
      break

  for score, idx in zip(top_results[0], top_results[1]):
    index = int(idx)
    # printing question id and similarity score
    # print(index_to_question_id[index], "(Score: {:.4f})".format(score))


# Calculate average P@1 and MRR
avg_p1 = (total_p1 / num_questions)
avg_mrr = (total_mrr / num_questions)

# Print the results
print("Average P@1: {:.4f}".format(avg_p1))
print("Average MRR: {:.4f}".format(avg_mrr))

Batches:   0%|          | 0/756 [00:00<?, ?it/s]

Average P@1: 0.0106
Average MRR: 0.0053


Step 2: Fine-tune Sentence-BERT Model

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sentence_transformers import SentenceTransformer, InputExample, losses

# Load the pre-trained Sentence-BERT model
model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

# Split the training data into train and test sets
split_ratio = 0.1
split_index = int(len(train_data['sentences']) * split_ratio)
train_sentences = train_data['sentences'][split_index:]
train_labels = train_data['labels'][split_index:]
test_sentences = train_data['sentences'][:split_index]
test_labels = train_data['labels'][:split_index]

# Define the optimizer, loss function, and any other hyperparameters
optimizer = transformers.AdamW(model.parameters(), lr=2e-5)
loss = losses.CosineSimilarityLoss()
epochs = 5
batch_size = 16

# Convert the training and test data into InputExamples
train_examples = [InputExample(texts=[train_sentences[i]], label=train_labels[i]) for i in range(len(train_sentences))]
test_examples = [InputExample(texts=[test_sentences[i]], label=test_labels[i]) for i in range(len(test_sentences))]

# Train the model on the training data and evaluate it on the test data
for epoch in range(epochs):
    train_dataloader = transformers.DataLoader(train_examples, batch_size=batch_size, shuffle=True)
    test_dataloader = transformers.DataLoader(test_examples, batch_size=batch_size, shuffle=True)
    model.train()
    train_loss = 0
    for step, batch in enumerate(train_dataloader):
        features = model.encode(batch['texts'], convert_to_tensor=True)
        labels = torch.tensor(batch['labels'], dtype=torch.float)
        output = loss(features, labels)
        train_loss += output.item()
        output.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss /= (step + 1)
    model.eval()
    with torch.no_grad():
        predictions = []
        for step, batch in enumerate(test_dataloader):
            features = model.encode(batch['texts'], convert_to_tensor=True)
            labels = torch.tensor(batch['labels'], dtype=torch.float)
            similarities = features @ features.T
            similarities = similarities.cpu().numpy()
            for i in range(similarities.shape[0]):
                sorted_indices = np.argsort(similarities[i])[::-1]
                predictions.append(sorted_indices[0])
        test_accuracy = accuracy_score(test_labels, predictions)
        test_precision = precision_score(test_labels, predictions)
        test_recall = recall_score(test_labels, predictions)
        test_f1_score = f1_score(test_labels, predictions)
    # Compute P@1 and MRR
    p_at_1 = 0
    mrr = 0
    for i in range(len(test_sentences)):
        features = model.encode([test_sentences[i]], convert_to_tensor=True)
        similarities = features @ features.T
        similarities = similarities.cpu().numpy()
        sorted_indices = np.argsort(similarities[0])[::-1]
        if sorted_indices[0] == test_labels[i]:
            p_at_1 += 1
            mrr += 1 / (sorted_indices.tolist().index(test_labels[i]) + 1)
    p_at_1 /= len(test_sentences)
    mrr /= len(test_sentences)

    print(f'Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Test Accuracy = {test_accuracy:.4f}, '
          f'Test Precision = {test_precision:.4f}, Test Recall = {test_recall:.4f}, '
          f'Test F1-Score = {test_f1_score:.4f}, P@1 = {p_at_1:.4f}, MRR = {mrr:.4f}')