In [1]:
import pypandoc
import os
import torch
from striprtf.striprtf import rtf_to_text

from transformers import (
    DPRContextEncoder,
    DPRQuestionEncoder,
    DPRContextEncoderTokenizer,
    DPRQuestionEncoderTokenizer,
)
import torch.nn.functional as F
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
search_corpus_names = []
search_corpus = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
def rank(job_description, resumes, resume_names):
    question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        "facebook/dpr-question_encoder-single-nq-base"
    )
    context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        "facebook/dpr-ctx_encoder-single-nq-base"
    )
    question_encoder = DPRQuestionEncoder.from_pretrained(
        "/home/sujay/Code/544-final-project/models/finetune_question_encoder"
    ).to(device)
    context_encoder = DPRContextEncoder.from_pretrained(
        "/home/sujay/Code/544-final-project/models/finetune_context_encoder"
    ).to(device)

    # Tokenize the question and the context
    tokenized_question = question_tokenizer(
        job_description,
        return_tensors="pt",
        padding="max_length",
        max_length=512,
        truncation=True,
    )
    question_input_ids = tokenized_question["input_ids"]
    question_attention_mask = tokenized_question["attention_mask"]

    tokenized_context = context_tokenizer(
        resumes, return_tensors="pt", padding="max_length", max_length=512, truncation=True
    )
    context_input_ids = tokenized_context["input_ids"]
    context_attention_mask = tokenized_context["attention_mask"]

    # Encode the question and the context
    question_output = question_encoder(
        question_input_ids.to(device), question_attention_mask.to(device)
    ).pooler_output
    context_output = context_encoder(
        context_input_ids.to(device), context_attention_mask.to(device)
    ).pooler_output
    sorted_scores, indices = torch.sort(F.cosine_similarity(question_output, context_output), descending=True)
    sorted_scores = sorted_scores.cpu().detach().numpy()
    indices = indices.cpu().detach().numpy()
    # from IPython import embed; embed()
    return np.array(resume_names)[indices], np.array(resumes)[indices], sorted_scores

In [13]:
# Define the directory
dir_path = "./data/production"

# Get a list of all files in the directory
files = os.listdir(dir_path)

# Iterate over each file
for file in files:
    file_path = os.path.join(dir_path, file)
    output = pypandoc.convert_file(file_path, "rtf")
    search_corpus_names.append(file)
    search_corpus.append(rtf_to_text(output))

In [14]:
job_description = open("./data/prod_job_description.txt").read()

In [15]:
rank(job_description, search_corpus, search_corpus_names)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


(array(['Ashish_khandelwal_4963_8078.docx', 'WI-WI-89150.docx',
        'NE-BA-08590_formatted-88.docx', 'Arlene_Mccarthy_8272_13253.docx',
        'Mohammad_Jordan_7624_12195.docx',
        'Shrikkanth_Cadambi_9108_14884.docx',
        'Lakhwinder_Singh_4968_8082.docx'], dtype='<U34'),
 array(["Ashish Khandelwal\nAgile Coach\nCertification\n•\tCertified SAFe 5 Program Consultant (SPC5), 2020\n•\tCertified SAFe 4.6 Program Consultant (SPC4.6), 2019\n•\tAdvanced Certified Scrum Master (A-CSM), 2018\n•\tCertified Scrum Master (CSM), 2017\n•\tCertified Scrum Product Owner (CSPO), 2016\n•\tProject Management Professional (PMP®), 2015\n•\tITIL® V3 (Foundation Certificate in IT Service Management), 2015\n•\tSAS® Certified Base Programmer for SAS 9, 2014\nExperience\nSr. Agile Coach May’19 - Till Date\nNorthrop Grumman / Peraton, Washington DC Metro Area\n•\tTransformed product teams to agile methodology by implementing scrum and kanban frameworks.\n•\tLed adoption of agile processes and prac