installing sentence BERT python library

In [278]:
!pip install sentence_transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading the duplicate questions and xml file (Similar to Assignment-3)

In [279]:
import csv
from post_parser_record import PostParserRecord

def read_tsv_test_data(file_path):
  # Takes in the file path for test file and generate a dictionary
  # of question id as the key and the list of question ids similar to it
  # as value. It also returns the list of all question ids that have
  # at least one similar question
  dic_similar_questions = {}
  lst_all_test = []
  with open(file_path) as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        question_id = int(row[0])
        lst_similar = list(map(int, row[1:]))
        dic_similar_questions[question_id] = lst_similar
        lst_all_test.append(question_id)
        lst_all_test.extend(lst_similar)
  return dic_similar_questions, lst_all_test

dic_similar_questions, lst_all_test = read_tsv_test_data("duplicate_questions.tsv")
post_reader = PostParserRecord("Posts_law.xml")

Using pre-trained Quora duplicate question to encode questions and find similar questions

In [280]:
#part 1

In [281]:
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np

# in question one, we are using the pre-trained model on quora with no further fine-tuning
model_name = 'distilbert-base-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)

# list of text to be indexed (encoded)
corpus = []
# this dictionary is used as key: corpus index [0, 1, 2, ...] and value: corresponding question id
index_to_question_id = {}
idx = 0

# indexing all the questions in the law stack exchange -- only using the question titles
for question_id in post_reader.map_questions:
    question = post_reader.map_questions[question_id]
    text = question.title
    q_id = question.post_id
    corpus.append(text)
    index_to_question_id[idx] = question_id
    idx += 1
    
# Indexing (embedding) the 
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)


Batches:   0%|          | 0/756 [00:00<?, ?it/s]

In [282]:
# Initialize the lists to store P@1 and MRR values
p_at_1_list = []
mrr_list = []
# Find top 100 similar questions for each question in the corpus
for idx in index_to_question_id:
    q_id = index_to_question_id[idx]
    if q_id in dic_similar_questions:
        emb = corpus_embeddings[idx]
        # Use cosine-similarity and torch.topk to find the highest 100 scores
        cos_scores = util.cos_sim(emb, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=100)

        # Extract the indices of the top 100 matches
        top_indices = top_results[1].tolist()


        # Calculate P@1 and MRR for the top 100 matches
        p_at_1 = 0
        reciprocal_ranks = []
        for i, index in enumerate(top_indices):
            # If the top match is the original question, record P@1 and MRR
            if index_to_question_id[index] in dic_similar_questions[q_id]:
                if i == 0:
                    p_at_1 = 1
                reciprocal_ranks.append(1 / (i+1))
                break
          
        # Append the P@1 and MRR values for this question to the lists
        p_at_1_list.append(p_at_1)
        if reciprocal_ranks:
            mrr_list.append(np.mean(reciprocal_ranks))
        else:
            mrr_list.append(0)

# Calculate the average P@1 and MRR values across all questions in the corpus
avg_p_at_1 = np.mean(p_at_1_list)
avg_mrr = np.mean(mrr_list)

print("Average P@1:", avg_p_at_1)
print("Average Mean Reciprocal Rank:", avg_mrr)

Average P@1: 0.0070921985815602835
Average Mean Reciprocal Rank: 0.12549970266865101


In [283]:
# part 2

In [284]:
import random
# this function uses the key for every positive sample and randomly selects a new value that isn't in the positive samples dictionary
def generate_negative_samples(dic, lst_of_ids):
  new_dic = {}
  lst = []
  cnt = 0
  for id in dic:
    for value in dic[id]:
      cnt += 1
      lst.append(value)
    new_dic[id] = []
    for _ in range(cnt):
      while True:
        random_item = random.choice(lst_of_ids)
        if random_item not in lst:
          break
      new_dic[id].append(random_item)
    cnt = 0
  return new_dic

In [285]:
# helper code to generate a list ids for all the questions
lst_of_ids = []
for id in post_reader.map_questions:
  lst_of_ids.append(id)

In [286]:
# positive and negative sample id dictionaries
negative_samples_id = generate_negative_samples(dic_similar_questions, lst_of_ids)
positive_samples_id = dic_similar_questions

In [287]:
# get data for training and testing, 90/10 split
train_data = []
test_data = {}
test_size = int(len(positive_samples_id) * 0.1)
cnt = 0
for q_id in positive_samples_id:
    if cnt < len(positive_samples_id) - test_size:
        key_text = post_reader.map_questions[q_id].title
        sim_q_id = positive_samples_id[q_id][0]
        value_text = post_reader.map_questions[sim_q_id].title
        train_data.append((key_text, value_text, 1))
        sim_q_id = negative_samples_id[q_id][0]
        value_text = post_reader.map_questions[sim_q_id].title
        train_data.append((key_text, value_text, 0))
    else:
        test_data[q_id] = positive_samples_id[q_id][0]
    cnt += 1


In [288]:
from sentence_transformers import SentenceTransformer, InputExample, losses, SentencesDataset
from torch.utils.data import DataLoader
# using quora pre-trained model
model_name = 'distilbert-base-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)

# build samples using both loss functions
train_samples_MultipleNegativesRankingLoss = []
train_samples_ConstrativeLoss = []

for data in train_data:
    train_samples_ConstrativeLoss.append(InputExample(texts=[data[0], data[1]], label=data[2]))
    if data[2] == 1:
        train_samples_MultipleNegativesRankingLoss.append(InputExample(texts=[data[0], data[1]], label=1))
        train_samples_MultipleNegativesRankingLoss.append(InputExample(texts=[data[1], data[0]], label=0))  # if A is a duplicate of B, then B is a duplicate of A

# Create data loader and loss for MultipleNegativesRankingLoss
train_dataset_MultipleNegativesRankingLoss = SentencesDataset(train_samples_MultipleNegativesRankingLoss, model=model)
train_dataloader_MultipleNegativesRankingLoss = DataLoader(train_dataset_MultipleNegativesRankingLoss, batch_size=train_batch_size)
train_loss_MultipleNegativesRankingLoss = losses.MultipleNegativesRankingLoss(model)

distance_metric = util.pytorch_cos_sim
margin = 0.5
# Create data loader and loss for OnlineContrastiveLoss
train_dataset_ConstrativeLoss = SentencesDataset(train_samples_ConstrativeLoss, model=model)
train_dataloader_ConstrativeLoss = DataLoader(train_dataset_ConstrativeLoss, shuffle = True, batch_size=train_batch_size)
train_loss_ConstrativeLoss = losses.OnlineContrastiveLoss(model=model, distance_metric=distance_metric, margin=margin)

# Train the model
model.fit(train_objectives=[(train_dataloader_MultipleNegativesRankingLoss, train_loss_MultipleNegativesRankingLoss), (train_dataloader_ConstrativeLoss, train_loss_ConstrativeLoss)],
          epochs=3,
          warmup_steps=1000,
          output_path=model_save_path
          )

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

In [289]:
# list of text to be indexed (encoded)
corpus = []
# this dictionary is used as key: corpus index [0, 1, 2, ...] and value: corresponding question id
index_to_question_id = {}
idx = 0

# indexing all the questions in the law stack exchange -- only using the question titles
for question_id in post_reader.map_questions:
    question = post_reader.map_questions[question_id]
    text = question.title
    q_id = question.post_id
    corpus.append(text)
    index_to_question_id[idx] = question_id
    idx += 1
    
# Indexing (embedding) the 
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/756 [00:00<?, ?it/s]

In [290]:
# Initialize the lists to store P@1 and MRR values
p_at_1_list = []
mrr_list = []

# Find top 100 similar questions for each question in the corpus
for idx in index_to_question_id:
    q_id = index_to_question_id[idx]
    if q_id in test_data:
        emb = corpus_embeddings[idx]
        
        # Use cosine-similarity and torch.topk to find the highest 100 scores
        cos_scores = util.cos_sim(emb, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=100)
        
        # Extract the indices of the top 100 matches
        top_indices = top_results[1].tolist()

        # Calculate P@1 and MRR for the top 100 matches
        p_at_1 = 0
        reciprocal_ranks = []
        for i, index in enumerate(top_indices):
            if index_to_question_id[index] in dic_similar_questions[q_id]:
                if i == 0:
                    p_at_1 = 1
                reciprocal_ranks.append(1 / (i+1))
                break
          
        # Append the P@1 and MRR values for this question to the lists
        p_at_1_list.append(p_at_1)
        if reciprocal_ranks:
            mrr_list.append(np.mean(reciprocal_ranks))
        else:
            mrr_list.append(0)

# Calculate the average P@1 and MRR values across all questions in the corpus
avg_p_at_1 = np.mean(p_at_1_list)
avg_mrr = np.mean(mrr_list)

print("Average P@1:", avg_p_at_1)
print("Average Mean Reciprocal Rank:", avg_mrr)

Average P@1: 0.0
Average Mean Reciprocal Rank: 0.1706378281750524


In [291]:
#part 3

In [292]:
# quora model
model_name = 'distilbert-base-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)

# list of text to be indexed (encoded)
corpus = []
# this dictionary is used as key: corpus index [0, 1, 2, ...] and value: corresponding question id
index_to_question_id = {}
idx = 0

# indexing all the questions in the law stack exchange -- only using the question titles
for question_id in post_reader.map_questions:
    question = post_reader.map_questions[question_id]
    text = question.title
    q_id = question.post_id
    corpus.append(text)
    index_to_question_id[idx] = question_id
    idx += 1
    
# Indexing (embedding) the 
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)


Batches:   0%|          | 0/756 [00:00<?, ?it/s]

In [293]:
# Initialize the lists to store P@1 and MRR values
p_at_1_list = []
mrr_list = []

# Find top 100 similar questions for each question in the corpus
for idx in index_to_question_id:
    q_id = index_to_question_id[idx]
    if q_id in test_data:
        query_embedding = corpus_embeddings[idx]
        
        # Use cosine-similarity and torch.topk to find the highest 100 scores
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=100)
        
        # Extract the indices of the top 100 matches
        top_indices = top_results[1].tolist()

        # Calculate P@1 and MRR for the top 100 matches
        p_at_1 = 0
        reciprocal_ranks = []
        for i, index in enumerate(top_indices):
            if index_to_question_id[index] in dic_similar_questions[q_id]:
                if i == 0:
                    p_at_1 = 1
                reciprocal_ranks.append(1 / (i+1))
                break
          
        # Append the P@1 and MRR values for this question to the lists
        p_at_1_list.append(p_at_1)
        if reciprocal_ranks:
            mrr_list.append(np.mean(reciprocal_ranks))
        else:
            mrr_list.append(0)

# Calculate the average P@1 and MRR values across all questions in the corpus
avg_p_at_1 = np.mean(p_at_1_list)
avg_mrr = np.mean(mrr_list)

print("Average P@1:", avg_p_at_1)
print("Average Mean Reciprocal Rank:", avg_mrr)

Average P@1: 0.0
Average Mean Reciprocal Rank: 0.17041136844808843


In [294]:
#extra crediit legal

In [295]:
# use legal bert as pre-trained model
model_name = 'nlpaueb/legal-bert-base-uncased'
model = SentenceTransformer(model_name)

# list of text to be indexed (encoded)
corpus = []
# this dictionary is used as key: corpus index [0, 1, 2, ...] and value: corresponding question id
index_to_question_id = {}
idx = 0

# indexing all the questions in the law stack exchange -- only using the question titles
for question_id in post_reader.map_questions:
    question = post_reader.map_questions[question_id]
    text = question.title
    q_id = question.post_id
    corpus.append(text)
    index_to_question_id[idx] = question_id
    idx += 1
    
# embedded corpus
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)


No sentence-transformers model found with name /home/shea.durgin/.cache/torch/sentence_transformers/nlpaueb_legal-bert-base-uncased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/shea.durgin/.cache/torch/sentence_transformers/nlpaueb_legal-bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be 

Batches:   0%|          | 0/756 [00:00<?, ?it/s]

In [296]:
# Initialize the lists to store P@1 and MRR values
p_at_1_list = []
mrr_list = []

# Find top 100 similar questions for each question in the corpus
for idx in index_to_question_id:
    q_id = index_to_question_id[idx]
    if q_id in test_data:
        query_embedding = corpus_embeddings[idx]
        
        # Use cosine-similarity and torch.topk to find the highest 100 scores
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=100)
        
        # Extract the indices of the top 100 matches
        top_indices = top_results[1].tolist()

        # Calculate P@1 and MRR for the top 100 matches
        p_at_1 = 0
        reciprocal_ranks = []
        for i, index in enumerate(top_indices):
            if index_to_question_id[index] in dic_similar_questions[q_id]:
                if i == 0:
                    p_at_1 = 1
                reciprocal_ranks.append(1 / (i+1))
                break
          
        # Append the P@1 and MRR values for this question to the lists
        p_at_1_list.append(p_at_1)
        if reciprocal_ranks:
            mrr_list.append(np.mean(reciprocal_ranks))
        else:
            mrr_list.append(0)

# Calculate the average P@1 and MRR values across all questions in the corpus
avg_p_at_1 = np.mean(p_at_1_list)
avg_mrr = np.mean(mrr_list)

print("Average P@1:", avg_p_at_1)
print("Average Mean Reciprocal Rank:", avg_mrr)

Average P@1: 0.0
Average Mean Reciprocal Rank: 0.10229937076634055
