In [11]:
import json
import torch
import numpy as np
import random

In [12]:
# Load the evaluation set. The encoding is pinned to UTF-8 (the JSON default) so
# that reading the file does not depend on the platform's locale encoding.
with open("./data/test_data.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# BM25

In [13]:
from rank_bm25 import BM25Okapi

In [14]:
# Evaluate BM25 retrieval: for every test query, rank its candidate passages
# (positives + negatives) against the job description and count how many of the
# top-k hits are positives. The printed "accuracy" is therefore precision@k.
TOP_K = 5  # number of passages retrieved per query
SHUFFLE_SEED = 0  # fixed seed so a re-run reproduces the same candidate order

shuffle_rng = random.Random(SHUFFLE_SEED)
total_num_correct = 0
total_num_pos = 0

for data in test_data:
    label = data["label"]
    job_description = data["description"]
    pos = data["pos"]
    neg = data["neg"]

    # Keep the relevance flag attached to each passage so hits are judged by
    # position instead of text equality (a negative whose text also occurs among
    # the positives can no longer be miscounted). Using `candidates` also avoids
    # rebinding the builtin `all` in the kernel namespace.
    candidates = [(doc, True) for doc in pos] + [(doc, False) for doc in neg]
    if not candidates:
        # BM25Okapi cannot be built from an empty corpus; nothing to rank here.
        continue
    # Shuffle so the pos-then-neg construction order cannot influence how ties
    # between equal scores are resolved.
    shuffle_rng.shuffle(candidates)

    tokenized_candidates = [doc.split(" ") for doc, _ in candidates]
    bm25 = BM25Okapi(tokenized_candidates)

    tokenized_query = job_description.split(" ")
    scores = bm25.get_scores(tokenized_query)

    # Never request more hits than there are candidates; torch.topk raises otherwise.
    k = min(TOP_K, len(candidates))
    _, indices = torch.topk(torch.from_numpy(scores), k)
    num_correct = sum(candidates[i][1] for i in indices.tolist())

    total_num_correct += num_correct
    # Count what was actually retrieved so the total matches the per-query lines.
    total_num_pos += k
    print(f"Accuracy ({label}): {num_correct}/{k}")
print(f"Total accuracy: {total_num_correct}/{total_num_pos}")

Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 1/5
Accuracy (Project_manager): 2/5
Accuracy (Database_Administrator): 2/5
Accuracy (Software_Developer): 4/5
Accuracy (Front_End_Developer): 2/5
Accuracy (Web_Developer): 4/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 4/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 4/5
Accuracy (Database_Administrator): 2/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 4/5
Accuracy (Web_Developer): 1/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 3/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 2/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 1/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 5/5
Accuracy (Network_Administrator): 3/

# DPR Default

In [15]:
from transformers import (
    DPRContextEncoder,
    DPRQuestionEncoder,
    DPRContextEncoderTokenizer,
    DPRQuestionEncoderTokenizer,
)
import torch.nn.functional as F

In [16]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [17]:
# Display the selected device as this cell's output.
device

device(type='cuda')

In [18]:
# Evaluate the off-the-shelf DPR checkpoints: embed each job description with the
# question encoder and its candidate passages with the context encoder, rank the
# passages by cosine similarity, and count how many of the top-k are positives.
# The printed "accuracy" is therefore precision@k.
TOP_K = 5  # number of passages retrieved per query
MAX_LENGTH = 512  # token budget per text; longer inputs are truncated
SHUFFLE_SEED = 0  # fixed seed so a re-run reproduces the same candidate order

question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)
question_encoder = DPRQuestionEncoder.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
).to(device)
context_encoder = DPRContextEncoder.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
).to(device)

# Make inference mode explicit (disables dropout) for both encoders.
question_encoder.eval()
context_encoder.eval()

shuffle_rng = random.Random(SHUFFLE_SEED)
total_num_correct = 0
total_num_pos = 0

for data in test_data:
    label = data["label"]
    job_description = data["description"]
    pos = data["pos"]
    neg = data["neg"]

    # Keep the relevance flag attached to each passage so hits are judged by
    # position instead of text equality. Using `candidates` also avoids
    # rebinding the builtin `all` in the kernel namespace.
    candidates = [(doc, True) for doc in pos] + [(doc, False) for doc in neg]
    if not candidates:
        # Nothing to rank for this query.
        continue
    # Shuffle so the pos-then-neg construction order cannot influence how ties
    # between equal scores are resolved.
    shuffle_rng.shuffle(candidates)
    passages = [doc for doc, _ in candidates]

    # Tokenize the question and the context
    tokenized_question = question_tokenizer(
        job_description,
        return_tensors="pt",
        padding="max_length",
        max_length=MAX_LENGTH,
        truncation=True,
    )
    question_input_ids = tokenized_question["input_ids"]
    question_attention_mask = tokenized_question["attention_mask"]

    tokenized_context = context_tokenizer(
        passages,
        return_tensors="pt",
        padding="max_length",
        max_length=MAX_LENGTH,
        truncation=True,
    )
    context_input_ids = tokenized_context["input_ids"]
    context_attention_mask = tokenized_context["attention_mask"]

    # Encode the question and the context. Gradients are not needed for
    # evaluation; disabling them avoids keeping the autograd graph in GPU memory.
    with torch.no_grad():
        question_output = question_encoder(
            question_input_ids.to(device), question_attention_mask.to(device)
        ).pooler_output
        context_output = context_encoder(
            context_input_ids.to(device), context_attention_mask.to(device)
        ).pooler_output
        # The single question embedding is broadcast against every passage embedding.
        scores = F.cosine_similarity(question_output, context_output)

    # Never request more hits than there are candidates; torch.topk raises otherwise.
    k = min(TOP_K, len(candidates))
    _, indices = torch.topk(scores, k)
    num_correct = sum(candidates[i][1] for i in indices.tolist())

    total_num_correct += num_correct
    # Count what was actually retrieved so the total matches the per-query lines.
    total_num_pos += k
    print(f"Accuracy ({label}): {num_correct}/{k}")
print(f"Total accuracy: {total_num_correct}/{total_num_pos}")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequence

Accuracy (Security_Analyst): 2/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 4/5
Accuracy (Software_Developer): 1/5
Accuracy (Front_End_Developer): 2/5
Accuracy (Web_Developer): 2/5
Accuracy (Java_Developer): 3/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 1/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 4/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 0/5
Accuracy (Front_End_Developer): 1/5
Accuracy (Web_Developer): 1/5
Accuracy (Java_Developer): 1/5
Accuracy (Network_Administrator): 2/5
Accuracy (Python_Developer): 1/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 2/5
Accuracy (Database_Administrator): 2/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 1/5
Accuracy (Web_Developer): 2/5
Accuracy (Java_Developer): 2/5
Accuracy (Network_Administrator): 3/

# DPR Fine-tuned (evaluation)

In [20]:
# Evaluate the fine-tuned DPR encoders: embed each job description with the
# question encoder and its candidate passages with the context encoder, rank the
# passages by cosine similarity, and count how many of the top-k are positives.
# The printed "accuracy" is therefore precision@k.
TOP_K = 5  # number of passages retrieved per query
MAX_LENGTH = 512  # token budget per text; longer inputs are truncated
SHUFFLE_SEED = 0  # fixed seed so a re-run reproduces the same candidate order

# NOTE(review): these are machine-specific absolute paths; point them at your own
# checkpoint directories (ideally relative to the project) before re-running.
FINETUNED_QUESTION_ENCODER_DIR = (
    "/home/sujay/Code/544-final-project/models/finetune_question_encoder"
)
FINETUNED_CONTEXT_ENCODER_DIR = (
    "/home/sujay/Code/544-final-project/models/finetune_context_encoder"
)

# The tokenizers come from the base checkpoints.
# NOTE(review): assumes fine-tuning left the vocabulary unchanged; confirm.
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)
question_encoder = DPRQuestionEncoder.from_pretrained(
    FINETUNED_QUESTION_ENCODER_DIR
).to(device)
context_encoder = DPRContextEncoder.from_pretrained(
    FINETUNED_CONTEXT_ENCODER_DIR
).to(device)

# Make inference mode explicit (disables dropout) for both encoders.
question_encoder.eval()
context_encoder.eval()

shuffle_rng = random.Random(SHUFFLE_SEED)
total_num_correct = 0
total_num_pos = 0

for data in test_data:
    label = data["label"]
    job_description = data["description"]
    pos = data["pos"]
    neg = data["neg"]

    # Keep the relevance flag attached to each passage so hits are judged by
    # position instead of text equality. Using `candidates` also avoids
    # rebinding the builtin `all` in the kernel namespace.
    candidates = [(doc, True) for doc in pos] + [(doc, False) for doc in neg]
    if not candidates:
        # Nothing to rank for this query.
        continue
    # Use a seeded shuffle rather than a fixed neg-then-pos order: if many
    # scores tie, a fixed order would let list position alone decide which
    # passages land in the top-k.
    shuffle_rng.shuffle(candidates)
    passages = [doc for doc, _ in candidates]

    # Tokenize the question and the context
    tokenized_question = question_tokenizer(
        job_description,
        return_tensors="pt",
        padding="max_length",
        max_length=MAX_LENGTH,
        truncation=True,
    )
    question_input_ids = tokenized_question["input_ids"]
    question_attention_mask = tokenized_question["attention_mask"]

    tokenized_context = context_tokenizer(
        passages,
        return_tensors="pt",
        padding="max_length",
        max_length=MAX_LENGTH,
        truncation=True,
    )
    context_input_ids = tokenized_context["input_ids"]
    context_attention_mask = tokenized_context["attention_mask"]

    # Encode the question and the context. Gradients are not needed for
    # evaluation; disabling them avoids keeping the autograd graph in GPU memory.
    with torch.no_grad():
        question_output = question_encoder(
            question_input_ids.to(device), question_attention_mask.to(device)
        ).pooler_output
        context_output = context_encoder(
            context_input_ids.to(device), context_attention_mask.to(device)
        ).pooler_output
        # The single question embedding is broadcast against every passage embedding.
        scores = F.cosine_similarity(question_output, context_output)

    # Never request more hits than there are candidates; torch.topk raises otherwise.
    k = min(TOP_K, len(candidates))
    _, indices = torch.topk(scores, k)
    num_correct = sum(candidates[i][1] for i in indices.tolist())

    total_num_correct += num_correct
    # Count what was actually retrieved so the total matches the per-query lines.
    total_num_pos += k
    print(f"Accuracy ({label}): {num_correct}/{k}")
print(f"Total accuracy: {total_num_correct}/{total_num_pos}")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Accuracy (Security_Analyst): 0/5
Accuracy (Systems_Administrator): 0/5
Accuracy (Project_manager): 1/5
Accuracy (Database_Administrator): 0/5
Accuracy (Software_Developer): 0/5
Accuracy (Front_End_Developer): 0/5
Accuracy (Web_Developer): 1/5
Accuracy (Java_Developer): 1/5
Accuracy (Network_Administrator): 0/5
Accuracy (Python_Developer): 1/5
Accuracy (Security_Analyst): 0/5
Accuracy (Systems_Administrator): 0/5
Accuracy (Project_manager): 0/5
Accuracy (Database_Administrator): 0/5
Accuracy (Software_Developer): 0/5
Accuracy (Front_End_Developer): 0/5
Accuracy (Web_Developer): 1/5
Accuracy (Java_Developer): 0/5
Accuracy (Network_Administrator): 0/5
Accuracy (Python_Developer): 0/5
Accuracy (Security_Analyst): 0/5
Accuracy (Systems_Administrator): 0/5
Accuracy (Project_manager): 0/5
Accuracy (Database_Administrator): 0/5
Accuracy (Software_Developer): 0/5
Accuracy (Front_End_Developer): 0/5
Accuracy (Web_Developer): 0/5
Accuracy (Java_Developer): 0/5
Accuracy (Network_Administrator): 0/

In [8]:
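# Clear GPU memory (currently disabled). If re-enabled, run this only after the
# tokenizers and encoders exist; otherwise the `del` statements raise NameError.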
# def clear_memory():
#     import gc
#     gc.collect()
#     torch.cuda.empty_cache()
    
# del question_tokenizer
# del context_tokenizer
# del question_encoder
# del context_encoder
# clear_memory()

NameError: name 'question_tokenizer' is not defined