In [1]:
import json
import torch
import numpy as np
import random
import pandas as pd
from transformers import (
    DPRContextEncoder,
    DPRQuestionEncoder,
    DPRContextEncoderTokenizer,
    DPRQuestionEncoderTokenizer,
)
import torch.nn.functional as F
from tqdm import tqdm

In [2]:
# Load the evaluation set. JSON is UTF-8 by specification, so the encoding is
# pinned instead of depending on the platform's locale default.
with open("./data/test_data.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Results table; the evaluation cells each add one column per retrieval method.
comparison_data = pd.DataFrame()

# BM25

In [3]:
from rank_bm25 import BM25Okapi

In [4]:
# BM25 baseline: for every test query, rank its candidate passages lexically and
# count how many of the top-k retrieved passages are labelled positive.
TOP_K = 5  # passages retrieved per query
shuffle_rng = random.Random(0)  # dedicated seeded RNG -> reproducible candidate order

total_num_correct = 0
total_num_pos = 0
labels = []
accuracies = []

for data in test_data:
    label = data["label"]
    job_description = data["description"]
    pos = data["pos"]
    neg = data["neg"]
    positives = set(pos)  # O(1) membership checks below

    # Named `candidates` (not `all`) so the builtin all() is not shadowed.
    # `pos + neg` builds a new list, so shuffling leaves `pos`/`neg` untouched.
    candidates = pos + neg
    # Shuffle so the positives-first construction order cannot influence the
    # ranking (e.g. through tie-breaking).
    shuffle_rng.shuffle(candidates)

    # Whitespace tokenisation for both the corpus and the query.
    tokenized_candidates = [doc.split(" ") for doc in candidates]
    bm25 = BM25Okapi(tokenized_candidates)
    tokenized_query = job_description.split(" ")
    scores = bm25.get_scores(tokenized_query)

    # Clamp k: torch.topk raises if k exceeds the number of candidates.
    k = min(TOP_K, len(candidates))
    _, indices = torch.topk(torch.from_numpy(scores), k)
    relevant_passages = [candidates[i] for i in indices.tolist()]

    num_correct = sum(1 for passage in relevant_passages if passage in positives)
    total_num_correct += num_correct
    # Count what was actually retrieved (equals TOP_K whenever enough candidates exist).
    total_num_pos += len(relevant_passages)
    labels.append(label)
    accuracies.append(num_correct)
    print(f"Accuracy ({label}): {num_correct}/{len(relevant_passages)}")

# Final row carries the aggregate over all queries.
labels.append("Total")
accuracies.append(total_num_correct)
comparison_data["Label"] = labels
comparison_data["BM25"] = accuracies
print(f"Total accuracy: {total_num_correct}/{total_num_pos}")

Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 3/5
Accuracy (Project_manager): 2/5
Accuracy (Database_Administrator): 2/5
Accuracy (Software_Developer): 4/5
Accuracy (Front_End_Developer): 2/5
Accuracy (Web_Developer): 4/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 4/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 2/5
Accuracy (Software_Developer): 4/5
Accuracy (Front_End_Developer): 4/5
Accuracy (Web_Developer): 2/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 4/5
Accuracy (Python_Developer): 4/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 4/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 5/5
Accuracy (Front_End_Developer): 1/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 5/5
Accuracy (Network_Administrator): 3/

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [5]:
# Tokenizers matching the two DPR encoder checkpoints.
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)
# NOTE(review): this notebook's recorded output shows a tokenizer-class mismatch
# warning on this load; presumably harmless, but confirm.
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)


def encode(tokenizer, encoder, text):
    """Embed `text` with `encoder` and return the encoder's pooled output.

    Args:
        tokenizer: Tokenizer paired with `encoder`.
        encoder: Model whose forward output exposes `pooler_output`.
        text: A single string or a list of strings to embed.

    Returns:
        The `pooler_output` tensor, on the global `device`. It is computed
        without autograd tracking, since this notebook only runs inference.
    """
    # Every input is padded/truncated to exactly 512 tokens.
    tokenized_output = tokenizer(
        text, return_tensors="pt", padding="max_length", max_length=512, truncation=True
    )
    input_ids = tokenized_output["input_ids"].to(device)
    attention_mask = tokenized_output["attention_mask"].to(device)

    # Inference only: skip building the autograd graph so activations for the
    # whole padded batch are not kept alive in (GPU) memory.
    with torch.no_grad():
        return encoder(input_ids, attention_mask).pooler_output

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


# DPR Default

In [7]:
# Baseline: score candidates with the off-the-shelf DPR encoders (no fine-tuning)
# and count how many of the top-k retrieved passages are labelled positive.
TOP_K = 5  # passages retrieved per query
shuffle_rng = random.Random(0)  # dedicated seeded RNG -> reproducible candidate order

question_encoder = DPRQuestionEncoder.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
).to(device)
context_encoder = DPRContextEncoder.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
).to(device)

total_num_correct = 0
total_num_pos = 0
accuracies = []

for data in test_data:
    label = data["label"]
    job_description = data["description"]
    pos = data["pos"]
    neg = data["neg"]
    positives = set(pos)  # O(1) membership checks below

    # Named `candidates` (not `all`) so the builtin all() is not shadowed.
    candidates = pos + neg
    # Shuffle so the positives-first construction order cannot influence the
    # ranking (e.g. through tie-breaking).
    shuffle_rng.shuffle(candidates)

    # Evaluation only: no autograd graph is needed for encoding or scoring.
    with torch.no_grad():
        question_output = encode(question_tokenizer, question_encoder, job_description)
        context_output = encode(context_tokenizer, context_encoder, candidates)
        # One cosine score per candidate against the query embedding.
        scores = F.cosine_similarity(question_output, context_output)

    # Clamp k: torch.topk raises if k exceeds the number of candidates.
    k = min(TOP_K, len(candidates))
    _, indices = torch.topk(scores, k)
    relevant_passages = [candidates[i] for i in indices.tolist()]

    num_correct = sum(1 for passage in relevant_passages if passage in positives)
    total_num_correct += num_correct
    # Count what was actually retrieved (equals TOP_K whenever enough candidates exist).
    total_num_pos += len(relevant_passages)

    accuracies.append(num_correct)
    print(f"Accuracy ({label}): {num_correct}/{len(relevant_passages)}")

# Final entry carries the aggregate over all queries.
accuracies.append(total_num_correct)
comparison_data["DPR No Finetuning"] = accuracies
print(f"Total accuracy: {total_num_correct}/{total_num_pos}")

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the

Accuracy (Security_Analyst): 2/5
Accuracy (Systems_Administrator): 3/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 2/5
Accuracy (Web_Developer): 0/5
Accuracy (Java_Developer): 3/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 2/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 3/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 2/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 1/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 2/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 1/5
Accuracy (Project_manager): 2/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 1/5
Accuracy (Web_Developer): 2/5
Accuracy (Java_Developer): 2/5
Accuracy (Network_Administrator): 3/

# DPR Finetuned

In [8]:
# Evaluate the fine-tuned DPR encoders: score candidates by cosine similarity and
# count how many of the top-k retrieved passages are labelled positive.
TOP_K = 5  # passages retrieved per query
shuffle_rng = random.Random(0)  # dedicated seeded RNG -> reproducible candidate order

# NOTE(review): absolute, machine-specific checkpoint paths; consider making
# them relative to the project so the notebook runs elsewhere.
question_encoder = DPRQuestionEncoder.from_pretrained(
    "/home/sujay/Code/544-final-project/models/finetune_question_encoder"
).to(device)
context_encoder = DPRContextEncoder.from_pretrained(
    "/home/sujay/Code/544-final-project/models/finetune_context_encoder"
).to(device)

total_num_correct = 0
total_num_pos = 0
accuracies = []

for data in test_data:
    label = data["label"]
    job_description = data["description"]
    pos = data["pos"]
    neg = data["neg"]
    positives = set(pos)  # O(1) membership checks below

    # Named `candidates` (not `all`) so the builtin all() is not shadowed.
    candidates = pos + neg
    # Shuffle (as the baseline cells do) so the positives-first construction
    # order cannot influence the ranking, e.g. through tie-breaking.
    shuffle_rng.shuffle(candidates)

    # Evaluation only: no autograd graph is needed for encoding or scoring.
    with torch.no_grad():
        question_output = encode(question_tokenizer, question_encoder, job_description)
        context_output = encode(context_tokenizer, context_encoder, candidates)
        # One cosine score per candidate against the query embedding.
        scores = F.cosine_similarity(question_output, context_output)

    # Clamp k: torch.topk raises if k exceeds the number of candidates.
    k = min(TOP_K, len(candidates))
    _, indices = torch.topk(scores, k)
    relevant_passages = [candidates[i] for i in indices.tolist()]

    num_correct = sum(1 for passage in relevant_passages if passage in positives)
    total_num_correct += num_correct
    # Count what was actually retrieved (equals TOP_K whenever enough candidates exist).
    total_num_pos += len(relevant_passages)
    accuracies.append(num_correct)
    print(f"Accuracy ({label}): {num_correct}/{len(relevant_passages)}")

# Final entry carries the aggregate over all queries.
accuracies.append(total_num_correct)
comparison_data["DPR Finetuning"] = accuracies
print(f"Total accuracy: {total_num_correct}/{total_num_pos}")

Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 3/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 5/5
Accuracy (Software_Developer): 4/5
Accuracy (Front_End_Developer): 2/5
Accuracy (Web_Developer): 2/5
Accuracy (Java_Developer): 3/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 2/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 4/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 5/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 2/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 4/5
Accuracy (Security_Analyst): 4/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 1/5
Accuracy (Software_Developer): 4/5
Accuracy (Front_End_Developer): 2/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 3/5
Accuracy (Network_Administrator): 4/

# DPR + BM25

In [6]:
# Hybrid retrieval: combine the fine-tuned DPR cosine scores with BM25 scores and
# count how many of the top-k retrieved passages are labelled positive.
lambda_val = 1.1  # weight on the dense (DPR) score; the BM25 score has weight 1
TOP_K = 5  # passages retrieved per query
shuffle_rng = random.Random(0)  # dedicated seeded RNG -> reproducible candidate order


def _scale_by_max(values):
    """Divide `values` by their maximum when it is positive, else return them as-is.

    Guards the two failure modes of a bare `values / max`: a zero maximum
    (division by zero -> NaN scores) and a negative maximum (which would flip
    the ranking order).
    """
    peak = np.max(values)
    return values / peak if peak > 0 else values


# NOTE(review): absolute, machine-specific checkpoint paths; consider making
# them relative to the project so the notebook runs elsewhere.
question_encoder = DPRQuestionEncoder.from_pretrained(
    "/home/sujay/Code/544-final-project/models/finetune_question_encoder"
).to(device)
context_encoder = DPRContextEncoder.from_pretrained(
    "/home/sujay/Code/544-final-project/models/finetune_context_encoder"
).to(device)

total_num_correct = 0
total_num_pos = 0
accuracies = []

for data in test_data:
    label = data["label"]
    job_description = data["description"]
    pos = data["pos"]
    neg = data["neg"]
    positives = set(pos)  # O(1) membership checks below

    # Named `candidates` (not `all`) so the builtin all() is not shadowed.
    candidates = pos + neg
    # Shuffle (as the baseline cells do) so the positives-first construction
    # order cannot influence the ranking, e.g. through tie-breaking.
    shuffle_rng.shuffle(candidates)

    # Dense scores. Evaluation only, so no autograd graph is built.
    with torch.no_grad():
        question_output = encode(question_tokenizer, question_encoder, job_description)
        context_output = encode(context_tokenizer, context_encoder, candidates)
        scores = F.cosine_similarity(question_output, context_output).cpu().numpy()
    scores = _scale_by_max(scores)

    # Lexical scores over the same candidates, whitespace-tokenised.
    tokenized_candidates = [doc.split(" ") for doc in candidates]
    bm25 = BM25Okapi(tokenized_candidates)
    tokenized_query = job_description.split(" ")
    scores_bm25 = _scale_by_max(bm25.get_scores(tokenized_query))

    # Weighted sum of the two max-scaled score vectors.
    consolidated_scores = torch.from_numpy(lambda_val * scores + scores_bm25)
    # Clamp k: torch.topk raises if k exceeds the number of candidates.
    k = min(TOP_K, len(candidates))
    _, indices = torch.topk(consolidated_scores, k)
    relevant_passages = [candidates[i] for i in indices.tolist()]

    num_correct = sum(1 for passage in relevant_passages if passage in positives)
    total_num_correct += num_correct
    # Count what was actually retrieved (equals TOP_K whenever enough candidates exist).
    total_num_pos += len(relevant_passages)
    accuracies.append(num_correct)
    print(f"Accuracy ({label}): {num_correct}/{len(relevant_passages)}")

# Final entry carries the aggregate over all queries.
accuracies.append(total_num_correct)
comparison_data["DPR Finetuning + BM25"] = accuracies
print(f"Total accuracy: {total_num_correct}/{total_num_pos}")

Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 3/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 4/5
Accuracy (Front_End_Developer): 2/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 4/5
Accuracy (Python_Developer): 4/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 4/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 5/5
Accuracy (Front_End_Developer): 4/5
Accuracy (Web_Developer): 2/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 4/5
Accuracy (Python_Developer): 5/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 4/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 5/5
Accuracy (Front_End_Developer): 1/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 5/5
Accuracy (Network_Administrator): 4/

In [9]:
# Remove pandas' row-display cap so the whole comparison table is rendered.
pd.options.display.max_rows = None
comparison_data

Unnamed: 0,Label,BM25,DPR No Finetuning,DPR Finetuning
0,Security_Analyst,3,2,3
1,Systems_Administrator,3,3,3
2,Project_manager,2,3,3
3,Database_Administrator,2,3,5
4,Software_Developer,4,3,4
5,Front_End_Developer,2,2,2
6,Web_Developer,4,0,2
7,Java_Developer,4,3,3
8,Network_Administrator,3,3,3
9,Python_Developer,4,2,2


In [10]:
# # Clear gpu
# def clear_memory():
#     import gc
#     gc.collect()
#     torch.cuda.empty_cache()
    
# del question_tokenizer
# del context_tokenizer
# del question_encoder
# del context_encoder
# clear_memory()