In [35]:
import json
import torch
import numpy as np
import random
import pandas as pd
from transformers import (
    DPRContextEncoder,
    DPRQuestionEncoder,
    DPRContextEncoderTokenizer,
    DPRQuestionEncoderTokenizer,
)
import torch.nn.functional as F
from tqdm.notebook import tqdm
import faiss  # make faiss available

In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [37]:
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)


def encode(tokenizer, encoder, text):
    with torch.no_grad():
        tokenized_output = tokenizer(
            text, return_tensors="pt", padding="max_length", max_length=512, truncation=True
        )
        input_ids = tokenized_output["input_ids"]
        attention_mask = tokenized_output["attention_mask"]

        return F.normalize(encoder(input_ids.to(device), attention_mask.to(device)).pooler_output)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


In [38]:
data = {"resume": [], "categories": []}
df = pd.DataFrame(data)
START_INDEX = 10000

# Iterate through a folder of text files and extract the text
import os
import codecs

# Define the directory
dir_path = "./data/resumes_corpus"

# Get a list of all files in the directory
files = os.listdir(dir_path)[START_INDEX:]

# Iterate over each file
for file in files:
    if file.endswith(".txt"):
        # Construct full file path
        file_path = os.path.join(dir_path, file)
        file_path_label = file_path.replace(".txt", ".lab")
        data = {"resume": [], "categories": []}

        # Open the file
        with codecs.open(file_path, "r", encoding="utf8", errors="ignore") as f:
            # Read the file's contents
            resume = f.read()
            data["resume"].append(resume)

        with codecs.open(file_path_label, "r", encoding="utf8", errors="ignore") as f:
            # Read the file's contents
            content = f.read()
            content = content.split("\n")
            if "" in content and len(content) == 1:
                continue
            elif "" in content:
                content.remove("")
            data["categories"].append(content)

        new_row_df = pd.DataFrame(data)
        # Add the new row to the DataFrame
        df = pd.concat([df, new_row_df], ignore_index=True)

In [39]:
def print_memory_usage(tensor):
    print(tensor.element_size() * tensor.nelement() / 1024 / 1024)

In [40]:
def memory_stats():
    print(torch.cuda.memory_allocated() / 1024**2)
    print(torch.cuda.memory_cached() / 1024**2)

In [41]:
def count_labels(label, result):
    return sum([label in x for x in result["categories"]])

In [42]:
unique_labels = df["categories"].explode().unique()
unique_labels

array(['Security_Analyst', 'Systems_Administrator', 'Project_manager',
       'Database_Administrator', 'Software_Developer',
       'Front_End_Developer', 'Web_Developer', 'Java_Developer',
       'Network_Administrator', 'Python_Developer'], dtype=object)

In [43]:
def compute_distance_subset(index, xq, subset):
    n, _ = xq.shape
    _, k = subset.shape
    distances = np.empty((n, k), dtype=np.float32)
    index.compute_distance_subset(
        n, faiss.swig_ptr(xq), k, faiss.swig_ptr(distances), faiss.swig_ptr(subset)
    )
    return distances

# DPR + BM25

In [44]:
from rank_bm25 import BM25Okapi

In [45]:
job_descriptions = {
    "Security_Analyst": "Security Analyst: As a Security Analyst, you will play a critical role in safeguarding our organization's digital assets and ensuring the integrity of our systems. Your primary responsibilities will include continuously monitoring and analyzing security alerts and events to detect and respond to potential threats. By conducting thorough investigations, you will identify the root causes of security incidents and develop strategies to prevent their recurrence. Collaborating closely with cross-functional teams, including IT, network operations, and software development, you will assess and enhance our security infrastructure. This involves evaluating existing security measures, such as firewalls, intrusion detection systems, and endpoint protection solutions, and recommending improvements to strengthen our defenses against cyber threats. Your role will also involve staying abreast of emerging security trends, vulnerabilities, and attack techniques. By keeping our security policies and procedures up to date, you will ensure that our organization remains compliant with industry regulations and standards. Additionally, you will participate in security awareness training programs to educate employees about best practices for maintaining information security. Through your expertise and diligence, you will contribute to creating a secure environment where our stakeholders can trust that their data is protected from unauthorized access, disclosure, and manipulation.",
    "Systems_Administrator": "A Systems Administrator is a vital IT professional responsible for the maintenance, configuration, and reliable operation of computer systems, particularly multi-user computers, such as servers. They ensure that the infrastructure of an organization runs smoothly by managing, upgrading, and setting up hardware and software, installing patches, and ensuring the security of data from internal and external threats. This role includes developing and maintaining networks, backups, and system security strategies. Systems Administrators must quickly diagnose and resolve problems in IT systems, making them essential for minimizing downtime and enhancing performance in business operations. They also provide technical support and guidance to users, which may include training non-technical staff on how to use business systems. Applicants should possess strong problem-solving skills, experience with various operating systems including Windows and Linux, and understanding of network management and security. Certifications such as Microsoft Certified Systems Administrator (MCSA), CompTIA A+, and Cisco Certified Network Associate (CCNA) are highly regarded. Effective communication and the ability to work in a team are crucial for collaborating with IT staff and other departments to facilitate seamless IT operations.",
    "Project_manager": "A Project Manager is a critical role responsible for planning, executing, and finalizing projects within set deadlines and budgets. This position involves coordinating efforts of team members, consultants, and contractors to deliver projects according to plan. The Project Manager must define project objectives, create schedules, and oversee quality control throughout the project life cycle. Key responsibilities include managing project resources, leading team meetings, and ensuring clear communication among project stakeholders. The role demands strong leadership to motivate and direct diverse teams, and an ability to anticipate and mitigate risks that could impact project timelines or outcomes. Applicants should have proven experience in project management methodologies, such as Agile or Waterfall, and be adept at using project management software like Microsoft Project or Atlassian JIRA. Strong organizational skills, attention to detail, and the capacity to handle multiple projects simultaneously are essential. Ideal candidates will possess a Bachelor’s degree in a related field and certifications such as PMP (Project Management Professional) or PRINCE2. Excellent interpersonal and communication skills are crucial for negotiating with clients and vendors, as well as for fostering a collaborative team environment. This role suits someone with a proactive approach and a knack for problem-solving in dynamic project environments.",
    "Database_Administrator": "A Database Administrator (DBA) is responsible for the performance, integrity, and security of databases in an organization. They ensure that data remains consistent across the database, is clearly defined, and efficiently accessed by users. DBAs are also involved in planning and development of the database, as well as in troubleshooting any issues on behalf of the users. Key responsibilities include installing, configuring, upgrading, and maintaining database servers. A DBA will set and enforce policies for the access and use of the database, and ensure compliance with data security and privacy mandates. They perform regular backups to prevent data loss in case of power failure or other issues. Additionally, DBAs optimize database performance through monitoring, tuning, and managing database parameters. Candidates should have a strong understanding of database languages like SQL, experience with database management systems like Oracle, SQL Server, or MySQL, and familiarity with operating systems such as Linux and Windows. Relevant certifications, such as Oracle Certified Professional or Microsoft Certified Database Administrator, are advantageous. This role requires excellent problem-solving skills and attention to detail. Effective communication skills are also important, as DBAs need to collaborate with IT staff and management to align database plans with organizational goals.",
    "Software_Developer": "A Software Developer is responsible for designing, coding, testing, and maintaining software applications that meet user needs and business requirements. They collaborate with other developers, project managers, UX designers, and sometimes clients to create complex software systems. This role involves writing scalable and efficient code, often in various programming languages such as Java, Python, C#, or JavaScript. Key responsibilities include analyzing user needs and developing software solutions, often modifying existing software to improve performance or adapt to new hardware. A developer must also monitor the performance of software applications to ensure they are error-free and efficient. Documenting the development process, code changes, and other technical information is essential for ongoing maintenance and updates. Candidates should possess a bachelor's degree in computer science, software engineering, or a related field, along with a strong foundation in computer science principles, data structures, and algorithms. Experience with development environments, version control systems like Git, and methodologies such as Agile or Scrum is beneficial. Effective problem-solving skills, attention to detail, and the ability to work in a team environment are crucial. Good communication skills are also important for interacting with non-technical team members and stakeholders to gather requirements and explain technical details.",
    "Front_End_Developer": "A Front End Developer is responsible for implementing visual elements that users see and interact with in a web application. They are tasked with combining the art of design with the science of programming, translating UI/UX design wireframes into actual code that will produce visual elements of the application. Key responsibilities include developing new user-facing features and optimizing applications for maximum speed and scalability. Front End Developers ensure the technical feasibility of UI/UX designs and maintain the consistency of visual design language across a digital ecosystem. They are also expected to manage responsive design, ensuring websites function across different devices and platforms. Candidates should be proficient in web markup, including HTML5, CSS3, and client-side scripting languages like JavaScript. Knowledge of JavaScript frameworks such as React, Angular, or Vue.js is highly advantageous. Experience with version control tools like Git, and familiarity with SEO principles are also desirable. A bachelor’s degree in computer science, information technology, or a related field is typically required. Strong problem-solving skills, attention to detail, and the ability to work collaboratively in cross-functional teams are essential. Effective communication skills are necessary to explain complex concepts to team members who may not have a technical background.",
    "Web_Developer": "A Web Developer is responsible for the coding, design, and layout of websites according to a company’s specifications. As the role takes into consideration user experience and function, a certain level of both graphic design and computer programming is necessary. They work closely with project managers and designers to ensure the final product adheres to the predetermined budget, scope, and design. Web developers typically build the framework of a website using coding languages such as HTML, CSS, JavaScript, and frameworks like React or Angular. They must also test and document software for web sites, work with graphics and other designers to determine the website’s layout, and integrate graphics, audio, and video into the website. Additionally, they are responsible for monitoring website performance and ensuring that the site is user-friendly and up-to-date. Candidates should have strong analytical skills and be detail-oriented with the ability to solve problems effectively. Knowledge of search engine optimization (SEO) practices is a plus, helping to ensure that websites meet optimal architecture, content, linking, and other factors to rank well in search engines. A bachelor’s degree in computer science or a related field is often required, along with experience in web development or a similar role. Strong communication and teamwork skills are essential to collaborate effectively with various departments.",
    "Java_Developer": "A Java Developer is responsible for the design, development, and management of Java-based applications. Because Java is widely used in large systems, the role requires a deep understanding of the language as well as the underlying frameworks and libraries such as Spring and Hibernate. Key responsibilities include writing well-designed, efficient code that fits in with the rest of the project’s architecture. A Java Developer must also conduct software analysis, programming, testing, and debugging, ensuring that standards and project requirements are met. Further, they engage in maintaining, expanding, and scaling existing software systems and may assist in the design of new applications. Candidates should have a strong grasp of Java development and object-oriented design patterns, as well as experience with concepts like full-stack development, microservices, and cloud services. Familiarity with web technologies like HTML, CSS, JavaScript, and popular frameworks like Angular or React is beneficial. A bachelor’s degree in computer science, engineering, or a related field is typically required. Java Developers must have strong analytical skills and the ability to work independently or as part of a team. They should also possess good communication skills to effectively collaborate with team members and stakeholders to define user requirements and provide business solutions.",
    "Network_Administrator": "A Network Administrator is responsible for maintaining an organization's computer networks, ensuring that they run efficiently and securely. This role involves setting up, administering, and troubleshooting network hardware and software systems to ensure optimal operations and connectivity. Key duties include installing and configuring network equipment such as routers, switches, firewalls, and load balancers. Network Administrators also manage IP addresses, monitor network performance to identify and resolve bottlenecks, and perform security measures to protect data and systems from external and internal threats. They are tasked with regularly updating network infrastructure, conducting routine maintenance, and providing support for network issues to users across the organization. Candidates should possess a solid understanding of network infrastructure and network hardware. They must be capable of quickly learning new technologies and procedures, and able to implement and maintain emergency backup and restore systems for mission-critical network servers. Certifications such as CompTIA Network+, Cisco Certified Network Associate (CCNA), or Certified Information Systems Security Professional (CISSP) can be advantageous. A bachelor’s degree in computer science, network administration, or a related field is typically required. Strong problem-solving skills, attention to detail, and effective communication capabilities are essential for diagnosing network issues, explaining complex information to non-technical colleagues, and ensuring reliable network infrastructure.",
    "Python_Developer": "A Python Developer is responsible for writing server-side web application logic, developing backend components, connecting applications with third-party web services, and supporting frontend developers by integrating their work with the Python application. Often, Python developers are involved in system development projects, requiring them to handle data interchange between servers and users. Key responsibilities include writing and testing scalable code, developing back-end components to improve responsiveness and overall performance, and integrating user-facing elements into applications. A Python Developer also often handles the integration of data storage solutions and may assist in designing and implementing low-latency, high-availability applications. Candidates should have a strong understanding of the Python programming language and familiarity with some ORM (Object Relational Mapper) libraries. They should be proficient in understanding of threading limitations of Python, and multi-process architecture. Experience with server-side templating languages and basic understanding of front-end technologies, such as JavaScript, HTML5, and CSS3, is highly beneficial. A bachelor’s degree in computer science, engineering, or a relevant field is typically required. Strong problem-solving skills and the ability to work in a team or independently are crucial. Good communication skills are also important for collaborating with other team members and stakeholders to deliver effective software solutions."
}

In [46]:
resumes = df["resume"].values.tolist()[0:10000]

In [47]:
question_encoder = DPRContextEncoder.from_pretrained(
    "./models/finetune_question_encoder"
).to(device)

index = faiss.read_index("./models/faiss_index_finetuning.index")
indices = np.arange(0, 10000).reshape(1, 10000)

total_correct = 0
total = 0
top_k = 100
lambda_val = 0.1

for role, description in job_descriptions.items():
    # BM 25
    tokenized_all = [doc.split(" ") for doc in resumes]
    bm25 = BM25Okapi(tokenized_all)
    tokenized_query = description.split(" ")
    scores_bm25 = bm25.get_scores(tokenized_query)
    scores_bm25 = scores_bm25 / np.max(scores_bm25)

    # FAISS
    indices = np.arange(0, 10000).reshape(1, 10000)
    encoded_description = (
        encode(question_tokenizer, question_encoder, description).detach().cpu().numpy()
    )
    scores = compute_distance_subset(index, encoded_description, indices)[0]
    scores = scores / np.max(scores)

    consolidated_scores = torch.from_numpy(lambda_val * scores + scores_bm25)

    _, indices = torch.topk(consolidated_scores, top_k)
    result = df.iloc[indices]
    count = count_labels(role, result)
    total_correct += count
    total += top_k
    print(f"{role}: {count}/{top_k}")
print(f"Total: {total_correct}/{total}")

Some weights of DPRContextEncoder were not initialized from the model checkpoint at ./models/finetune_question_encoder and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.interme

Security_Analyst: 51/100
Systems_Administrator: 48/100
Project_manager: 88/100
Database_Administrator: 70/100
Software_Developer: 85/100
Front_End_Developer: 93/100
Web_Developer: 55/100
Java_Developer: 25/100
Network_Administrator: 67/100
Python_Developer: 99/100
Total: 681/1000


# Create FAISS Index

In [49]:
context_encoder = DPRContextEncoder.from_pretrained(
    "./models/finetune_context_encoder"
).to(device)

In [50]:
start_index = 0
end_index = 10000

index = faiss.IndexFlatIP(768)  # build the index

for i in tqdm(range(start_index, end_index, 500)):
    data = resumes[i : i + 500]
    encoded_resumes = encode(context_tokenizer, context_encoder, data)
    index.add(encoded_resumes.detach().cpu().numpy())

faiss.write_index(index, "./models/faiss_index_finetuning.index")

  0%|          | 0/20 [00:00<?, ?it/s]

KeyboardInterrupt: 