# Analyzign Models Best Suited for Gaps Project

### Notebook Code Author: 
- Austin Youngren

### Data Collection Performed by: 
- Spencer Baloga Loufek 
- Ayush Adhikari

### Resources Used for Code Creation: 
- [Hugging Face](huggingface.co)
- [Facebook Research DPR GitHub Repository](https://github.com/facebookresearch/DPR)
- [Deepseek](https://www.deepseek.com/)

Deepseek AI chatbot was used due to limited access of DPR training documentation and examples.

## Imports

In [1]:
import gc
import re
import csv
import json
import torch
import wandb
import random
import pandas as pd
import torch.nn.functional as F

# from torch import nn
# from itertools import product
from torch.optim import AdamW
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import precision_recall_fscore_support
from transformers import ( DPRQuestionEncoder, 
                            DPRQuestionEncoderTokenizer, 
                            DPRContextEncoder, 
                            DPRContextEncoderTokenizer,
                            get_linear_schedule_with_warmup )

pd.set_option('display.max_rows', None) 
pd.set_option('display.max_columns', None) 
pd.set_option('display.width', None) 
pd.set_option('display.max_colwidth', None) 
pd.set_option('display.max_seq_item', None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.__version__)  # e.g., 2.2.0+cu121
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.get_device_name(0))

2.5.1
True
NVIDIA GeForce RTX 3080


## Data Preprocessing

In [3]:
with open ('data/all-csu-codes.csv', 'r') as c_data:
    csv_reader = csv.reader(c_data) 
    courses_data = list(csv_reader)
    
courses_df = pd.DataFrame(columns=['Courses', 'Skills'])

for idx in range(0, len(courses_data)):
    skill_list = courses_data[idx][1:-1]
    skill_list = [skill.title() for skill in skill_list]
    skill_list = [re.sub(r'\b(vs|Vs)\b', 'VS', skill) for skill in skill_list]

    row = pd.DataFrame({'Courses': courses_data[idx][0], 'Skills':[skill_list]})
    courses_df = pd.concat([courses_df, row], ignore_index=True)

courses_df.to_csv('data/dpr_courses_data.csv', index=False)

In [4]:
with open('data/descriptions.txt', 'r') as j_data:
    csv_reader = csv.reader(j_data, delimiter='|')
    jobs_data = list(csv_reader)

jobs_df = pd.DataFrame(columns=['Job_Title', 'Job_Description', 'Required_Skills'])
                    
for row in jobs_data:
    if len(row) == 3:
        job_title = row[0].strip().strip('"') 

        job_description = row[1].strip().strip('"')
        job_description = re.sub(r'\bDESCRIPTION\b', '', job_description)
    
        skills = row[2].strip().strip('"')
        skill_list = [skill.strip().strip('"') for skill in skills.split(',')]
        cap_skill_list = [skill.title() for skill in skill_list]
        cleaned_skills = [re.sub(r'\s?\(.*?\)', '', skill) for skill in cap_skill_list]

        row = pd.DataFrame({'Job_Title': job_title, 'Job_Description': job_description, 'Required_Skills': [skill_list]})
        jobs_df = pd.concat([jobs_df, row], ignore_index=True)
            
jobs_df.to_csv('data/dpr_jobs.csv', index=False)

In [5]:
check_course = next(item for item in courses_data if item[0] == 'CS470')
print(check_course)
print('********************************************')
print(courses_df.head(1))

['CS470', 'Assembly', 'C Language', 'Digital Gates', 'Instruction Set Architectures', 'Number Representation']
********************************************
  Courses  \
0   CS462   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Skills  
0  [3D Modeling, Animations, Assigning 3D Object Properties, Blender, Calculating Lights, Shades,

In [6]:
courses_df.head(1)

Unnamed: 0,Courses,Skills
0,CS462,"[3D Modeling, Animations, Assigning 3D Object Properties, Blender, Calculating Lights, Shades, And, Camera Rendering, Character Animation, Character Design, Collisions, Constructing 3D Scenes, Design 3D, Develop 3D, Event Triggering, Finite State Machine, Fourier Transform, Game Design, Geometric Image Manipulation, Human Aware Ai, Immersive 3D Worlds, Implementing Animation To Chara, Implementing Movement To Charac, Manipulate Lighting, Manipulate Rendering, Manipulating 3D Object Properti, Movement, Principles Of Lighting, Principles Of Rendering, Real Time Rendering Pipeline, Scene Composition, Scene Layout, Scripting Interactive Behaviors, Scripting Interactive Elements, Tangent Space, Unity, Vectors]"


In [7]:
check_job = [item for item in jobs_data if item[0] == 'Yahoo_Software_Dev_Engineer']
print(check_job)
print('********************************************')
print(jobs_df.head(1))

[['Yahoo_Software_Dev_Engineer', "Who We're Looking For- Junior Software Engineer We hire engineers who love the web, but can see its cracks and its future, too. We look for people who are exceptionally who are exceptionally imaginative, collaborative, and truly excited about tech. Our DSP Reporting team is currently looking for talented full-stack engineers to design, implement, and support robust, scalable, and high-quality reporting solutions Your Responsibilities - Develop and enhance a state-of-the-art reporting and analytics platform. - Build intuitive front-end UIs for reporting and analytics using React. - Develop microservices to power reporting and analytics solutions. - Write clean, maintainable, and performant code, including unit tests and refactoring when needed. - Collaborate with designers and developers to define and deliver new features. - Participate in system architecture reviews, code reviews, performance tuning, and production support. - Own and deliver projects r

In [8]:
jobs_df.head(1)

Unnamed: 0,Job_Title,Job_Description,Required_Skills
0,Adobe_AI_ML_Engineer,"The Opportunity?Adobe is seeking talented and passionate Software Engineer across all organizations to help plan, design, develop, and test software systems or applications for software enhancements and new products used in local, networked, cloud-based or Internet-related computer programs and products. What You'll Do - Develop high-performance, reliable, testable and maintainable code. - Participating in all aspects of software development activities, including design, coding, code review, testing, bug fixing, and code/API documentation. - Collaborate with engineers and participate in daily or weekly stand ups and meetings. - Grow with the support of your team and help others on the team grow by providing thoughtful feedback and uplifting those around you. - Work both independently and collaboratively within a fast-paced development team, with clear, positive, and constructive communication. - Additional responsibilities as needed based on specific role or team What You Need to Succeed - Bachelor's or Master's in Computer Science, Computer Engineering, Electrical Engineer, or equivalent experience required - 1-3+ years of experience in specific skill/field(s) - Proficient in programming languages such as Ruby, Python, Javascript, etc. - Strong technical background with analytical and problem-solving skills - Familiarity with client-side frameworks and libraries like React, Vue, Angular, Web Components. - Excellent problem solving and debugging skills, and direct experience with DevOps in a SaaS environment is a plus. - Customer focused and have real passion for quality and engineering excellence at scale. - Excellent communication and collaboration skills.","[Programming, Ruby, JavaScript, Python, Analytical skills, Knowledge sharing, Angular, Problem Solving Skills, React.js, Strong communication skills, Vue, Front-end frameworks, Collaboration, Communication, Development, Quality Assurance, Customer needs, Efficiency, Bug fixing, Code/API documentation, Code Review, Design, High performance, Reliability, Testing, Maintaining software applications, Feedback, Teamwork, Debugging, DevOps, Planning]"


In [9]:
del check_course, check_job, j_data, jobs_data, csv_reader, c_data, courses_data
gc.collect()
courses_df.shape, jobs_df.shape

((35, 2), (79, 3))

### Collect All Skills from CSU Course Data

In [10]:
def get_all_acquired_skills(courses_df):
    all_acquired_skills = set() 
    
    for skills in courses_df['Skills']:
        all_acquired_skills.update(skills)

    return all_acquired_skills

In [11]:
all_acquired_skills = get_all_acquired_skills(courses_df)
list(all_acquired_skills)[0:9]

['Webpack',
 'Bittorrent',
 'Grid Computing',
 'Non-Deterministic Polynomial Co',
 'Packet Switching',
 'Scripting Interactive Elements',
 'Combiner Functions',
 'Function Regression',
 'Mocking Concepts']

### Create Corpus for CSU Courses 

In [12]:
# def create_corpus(courses_df):
#     corpus = []
#     for idx, row in courses_df.iterrows():
#         document = {
#             "id": f"csu_course_{idx}",
#             "title": row['Courses'],
#             "text": f"Skills: {', '.join(row['Skills'])}",
#             "metadata": {"course_name": row['Courses'], "skills_acquired": row['Skills']}
#         }
#         corpus.append(document)
#     return corpus


# courses_corpus = create_corpus(courses_df)

# with open("data/dpr_course_corpus.json", "w") as f:
#     json.dump(courses_corpus, f, indent=4)

# json.dumps(courses_corpus[0], indent=4)

### Create the Training Dataset

In [13]:
def all_class_comparison(jobs_df, courses_df, all_acquired_skills):
    training_data = []
    course_entries = list(zip(courses_df["Courses"], courses_df["Skills"]))


    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']} -- Job Description: {job['Job_Description']}"
        job_skills = set(job["Required_Skills"])
        missing_skills = job_skills - all_acquired_skills
        
        pos_courses = []
        neg_courses = []

        for course, skills in course_entries:
            if any(skill in job_skills for skill in skills):
                pos_courses.append(f"[COURSE] {course} [SKILLS] {', '.join(skills)}")
            else:
                neg_courses.append(f"[COURSE] {course} [SKILLS] {', '.join(skills)}")

        pos_context = "\n".join(pos_courses) if pos_courses else "No relevant courses found."
        neg_context = "\n".join(neg_courses) if neg_courses else "No irrelevant courses found."

        if len(missing_skills) == 0:
            training_data.append({
                'query': job_text,
                'pos_context': pos_context,
                'neg_context':  neg_context,
                'answer': f"You qualify as an applicant for the job position, {job['Job_Title']}. No required skills are missing",
                'label': 0
            })
        else:
            training_data.append({
                'query': job_text,
                'pos_context': pos_context,
                'neg_context':  neg_context,
                'answer': f"Missing Skills: ".join(list(missing_skills)),
                'label': 1
            })
        
    return training_data

In [14]:
def compare_individual_course(jobs_df, courses_df, neg_per_pos=1):
    training_data = []
     
    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']} -- Job Description: {job['Job_Description']}"
        job_skills = set(job['Required_Skills'])  
        
        for _, course in courses_df.iterrows():
            course_entries = list(zip(course["Courses"], course["Skills"]))
            pos_courses = []
            neg_courses = []
            missing_skills = list(job_skills - set(course["Skills"]))
            
            for course, skills in course_entries:
                if any(skill in job_skills for skill in skills):
                    pos_courses.append(f"[COURSE] {course} [SKILLS] {', '.join(skills)}")
                else:
                    neg_courses.append(f"[COURSE] {course} [SKILLS] {', '.join(skills)}")

            pos_context = "\n".join(pos_courses) if pos_courses else "No relevant courses found."
            neg_context = "\n".join(neg_courses) if neg_courses else "No irrelevant courses found."

            if len(missing_skills) == 0:
                training_data.append({
                    'query': job_text,
                    'pos_context': pos_context,
                    'neg_context': neg_context,
                    'answer': f"You qualify as an applicant for the job position, {job['Job_Title']}. No required skills are missing",
                    'label': 0
                })
            else:
                training_data.append({
                    'query': job_text,
                    'pos_context': pos_context,
                    'neg_context': neg_context,
                    'answer': f"Missing Skills: ".join(list(missing_skills)),
                    'label': 1
                })

    return training_data

In [15]:
def create_schedule_data(jobs_df, schedules):
    training_data = []

    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']} -- Job Description: {job['Job_Description']}"
        job_skills = set(job["Required_Skills"])

        for sched in schedules:
            course_entries = list(zip(sched["Courses"], sched["Skills"]))
            pos_courses = []
            neg_courses = []
            sched_skills = set([item for sublist in sched["Skills"].tolist() for item in sublist])
            missing_skills = list(job_skills - sched_skills)

            for course, skills in course_entries:
                if any(skill in job_skills for skill in skills):
                    pos_courses.append(f"[COURSE] {course} [SKILLS] {', '.join(skills)}")
                else:
                    neg_courses.append(f"[COURSE] {course} [SKILLS] {', '.join(skills)}")

            pos_context = "\n".join(pos_courses) if pos_courses else "No relevant courses found."
            neg_context = "\n".join(neg_courses) if neg_courses else "No irrelevant courses found."

            if len(missing_skills) == 0:
                training_data.append({
                    'query': job_text,
                    'pos_context': pos_context,
                    'neg_context': neg_context,
                    'answer': f"You qualify as an applicant for the job position, {job['Job_Title']}. No required skills are missing",
                    'label': 0
                })
            else:
                training_data.append({
                    'query': job_text,
                    'pos_context': pos_context,
                    'neg_context': neg_context,
                    'answer': f"Missing Skills: ".join(list(missing_skills)),
                    'label': 1
                })

    return training_data


def get_courseloads(jobs_df, courses_df, number_of_schedules=20):
    core_classes = ['CS150', 'CS164', 'CS152', 'CS162', 'CS201', 'CS165', 'CS220', 
                    'CS270', 'CS250', 'CS314', 'CS370', 'CS320', 'CS214']
    
    elective_courses_df = courses_df[~courses_df['Courses'].isin(core_classes)]

    schedules_df = []
    used_schedules = set()

    while len(schedules_df) < number_of_schedules:
        l_4_courses = elective_courses_df[elective_courses_df['Courses'].str.startswith('CS4')]
        l_3_4_courses = elective_courses_df[elective_courses_df['Courses'].str.startswith('CS3') | elective_courses_df['Courses'].str.startswith('CS4')]
        other_courses = elective_courses_df[~elective_courses_df['Courses'].str.startswith('CS3') & ~elective_courses_df['Courses'].str.startswith('CS4')]

        l_4_sample = random.sample(l_4_courses['Courses'].tolist(), 2)

        l_3_4_filtered = l_3_4_courses[~l_3_4_courses['Courses'].isin(l_4_sample)]
        l_3_4_sample = random.sample(l_3_4_filtered['Courses'].tolist(), 2)

        all_sampled_courses = l_4_sample + l_3_4_sample
        other_courses_filtered = other_courses[~other_courses['Courses'].isin(all_sampled_courses)]
        other_sample = random.sample(other_courses_filtered['Courses'].tolist(), 1)

        sched_courses = core_classes + l_4_sample + l_3_4_sample + other_sample
        sched_df = courses_df[courses_df['Courses'].isin(sched_courses)].copy()

        sched_tuple = tuple(sorted(sched_df['Courses'].tolist()))
        if sched_tuple not in used_schedules:
            schedules_df.append(sched_df)
            used_schedules.add(sched_tuple)

    training_data = create_schedule_data(jobs_df, schedules_df)

    return training_data

In [16]:
def create_training_data(jobs_df, courses_df, all_acquired_skills):
    training_data = all_class_comparison(jobs_df, courses_df, all_acquired_skills)
    print(f"All: {training_data[-1]}\n")

    training_data = training_data + compare_individual_course(jobs_df, courses_df)
    print(f"Individual: {training_data[-1]}\n")
    
    training_data = training_data + get_courseloads(jobs_df, courses_df)
    print(f"Course load: {training_data[-1]}\n")

    return training_data

training_data = create_training_data(jobs_df, courses_df, all_acquired_skills)

All: {'query': "Job Title: Yahoo_Software_Dev_Engineer -- Job Description: Who We're Looking For- Junior Software Engineer We hire engineers who love the web, but can see its cracks and its future, too. We look for people who are exceptionally who are exceptionally imaginative, collaborative, and truly excited about tech. Our DSP Reporting team is currently looking for talented full-stack engineers to design, implement, and support robust, scalable, and high-quality reporting solutions Your Responsibilities - Develop and enhance a state-of-the-art reporting and analytics platform. - Build intuitive front-end UIs for reporting and analytics using React. - Develop microservices to power reporting and analytics solutions. - Write clean, maintainable, and performant code, including unit tests and refactoring when needed. - Collaborate with designers and developers to define and deliver new features. - Participate in system architecture reviews, code reviews, performance tuning, and product

In [17]:
td_df = pd.DataFrame(training_data, columns=['question', 'pos_context', 'neg_context', 'answer', 'label'])
print(td_df.shape)

td_df.to_csv('data/dpr_training_data.csv', index=False)

del training_data, jobs_df, courses_df
gc.collect()

(4424, 5)


23

In [18]:
td_df.head(1)

Unnamed: 0,question,pos_context,neg_context,answer,label
0,,"[COURSE] CS314 [SKILLS] Agile, Black Box Testing, Burndown Charts, Clean Code, Cmmi, Code Climate, Code Quality, Communication, Compatability Standards, Concurrency, Configuration Management, Continuous Integration, Databases, Development Environments, Devops, Docker, Docker Container, Establishing Interpersonal Rela, Git, Github, Github Actions, Github Projects, Github Repository, Individual Metrics, Integration Tests, Intellij, Java, Java Concurrency, Java Spark, Json, Junit, Kml, Linux, Mariadb, Maven, Networking, Npm, Optimization, Peer Evaluation, Peer Review, Port Forwarding, Postman, Problem Solving, Product Integration, Project Management, Project Planning, Refactoring, Remote Development, Rest Api, Retrospectives, Scrum, Slack, Slf4J, Software Development Practices, Source Control, Sprint Planning, Sql, Story Boards, Story Sizing, Task Breakdown, Team Development Experience, Team Diversity, Team Metrics, Teamwork, Test Driven Development, Tuckman'S Model, Unix, Use Case Testing, Verification, VS Code, Webpack, White Box Testing\n[COURSE] CS150 [SKILLS] Conditionals, Data Analysis, Data Visualization, Dictionaries, Functions, Hcc, Html, I O Console, I O File, Libraries, Lists, Loops, Operator, Privacy, Python, Research, Research Design, Security, Sets\n[COURSE] CS345 [SKILLS] Anaconda, Characterizing Data, Classifier Accuracy, Classifying Data, Conduct Experiments, Convolutional Neural Networks, Cross-Validation, Data Manipulation, Data Representation, Data Visualization, Decision Trees, Derivatives, Dot Products, Ensemble Methods, Experimental Design, Experimental Development, Function Regression, Gradient Descent, Hyperparameters, Hyperplanes, Jupyter Notebook, Keras, Kernels, Linear Regression, Machine Learning, Machine Learning Algorithms, Matplotlib, Matrices, Multiclass Classification, Multivariate Linear Regression, Nearest Neighbors, Neural Networks, Numpy, Overfitting, Pandas, Partial Derivatives, Perceptron, Principle Components Analysis, Python, Regularization, Scikit-Learn, Support Vector Machine, Tensorflow, Validation Sets\n[COURSE] CS320 [SKILLS] Bipartite Graph, Breadth First, Depth First, Dictionaries, Divide & Conquer, Dynamic Multi-Threading, Dynamic Programming, Graph Algorithm, Greedy Algorithms, Greedy Proofs, Heaps, Heapsort, Knapsack, Line Of Sight Algorithm, Master Theorem, Memoization, Memory-Efficient Knapsack, Minimum Spanning Tree, Non-Deterministic Polynomial Cl, Non-Deterministic Polynomial Co, Orders Of Magnitude, Parallel Algorithms, Parallel Scans, Polynomial Class, Polynomial Time Reduction, Prefix Sums, Python, Recursive Substructure, Shortest Paths, Topological Sort, Tree Algorithm\n[COURSE] CS164 [SKILLS] 2D Arrays, Abstract Classes, Algorithms, Arraylists, Arrays, Branching, Classes, Code Comprehention, Code Design, Code Reading Analysis, Conditionals, Datatypes, Debugging, Exceptions, File Input, File Output, Git, Github, Ide, Intellij, Interfaces, Java, Java Development Environment, Javadoc, Linux, Logical Operators, Loops, Methods, Obects, Object Oriented Programming, Polymorphism, Problem Solving, Putty, Quality Assurance, Recursion, Searching, Sorting, Ssh, String Manipulation, Strings, Testing, Uml, Version Control, VS Code, Windows\n[COURSE] CS220 [SKILLS] Anaconda, Big O, Combinations, Counting, Directed Graphs, Functions, Graph, Induction Proof, Jupyter Notebook, Logical Inference, Loop Invariants, Mathematical Proof, Matplotlib, Miniconda, Permutations, Predicate Logic, Propositional Logic, Python, Relations, Sets\n[COURSE] CS214 [SKILLS] Agile, Artifact Management, Code Coverage, Coding Standards, Debugging, Documentation, Functional Programming, Git, Individual Software Development, Java, Junit Testing, Memory Efficientcy, Project Management, Project Planning, Refactoring, Software Design, Software Engineering Process, Software Implementation, Software Maintenance, Software Testing, Test Driven Development, Time Efficiency\n[COURSE] CS445 [SKILLS] Classification, Deep Learning, Jupyter Notebook, Multivariable Data, Pattern Classification Algorith, Prediction, Python, Regression Algorithms, Reinforcement Learning Algorith, Scientific Reports, Statistical Analyses, Visualization","[COURSE] CS462 [SKILLS] 3D Modeling, Animations, Assigning 3D Object Properties, Blender, Calculating Lights, Shades, And, Camera Rendering, Character Animation, Character Design, Collisions, Constructing 3D Scenes, Design 3D, Develop 3D, Event Triggering, Finite State Machine, Fourier Transform, Game Design, Geometric Image Manipulation, Human Aware Ai, Immersive 3D Worlds, Implementing Animation To Chara, Implementing Movement To Charac, Manipulate Lighting, Manipulate Rendering, Manipulating 3D Object Properti, Movement, Principles Of Lighting, Principles Of Rendering, Real Time Rendering Pipeline, Scene Composition, Scene Layout, Scripting Interactive Behaviors, Scripting Interactive Elements, Tangent Space, Unity, Vectors\n[COURSE] CS165 [SKILLS] Algorithms, Assertions, B+ Trees, Binary Search Trees, Black Box Testing, Branching Recursion, Data Structures, Dequeues, Expression Trees, Generics, Graph, Hashmap, Infix, Inheritance, Java, Linkedlists, Object Oriented Principles, Object Oriented Programming, Pcre, Polymorphism, Postfix, Prefix, Priority Queues, Problem Solving, Queues, Regex, Stack, Unit Testing\n[COURSE] CS201 [SKILLS] Argument Construction, Computer Solution Designing, Computer Solution Implementatio, Computer Solution Operations, Decision Making, Ethical Dilemma Analysis, Ethical Dilemma Problem Solving, Ethics Analysis, Legal Obligations, Moral Obligations, Philosophy, Professional Code Of Ethics\n[COURSE] CS370 [SKILLS] Commercial Operating Systems, Containers, Deadlocks, Deadlocks Management, Design Threaded Programs, File System Architecture, Interprocess Communication, Kernel Threads, Memory Management, Open Source Operating Systems, Operating Systems, Process Synchronization, Processes Management, Resource Management, Scheduling Algorithms, Storage Architecture, Symmetric Multiprocessing, Synchronization, Task Synchronization, Thread Management, Threads, Type-1 Hypervisors, Type-2 Hypervisors, User Threads\n[COURSE] CS110 [SKILLS] Data Analysis, Data Manipulation, Data Visualization, Hardware, Privacy, Research, Security\n[COURSE] CS455 [SKILLS] Abstraction, Algorithms Underpinning P2P Sys, Amazon Dynamo, Architectural Styles For Distri, Berkley Algorithm, Bittorrent, Brewer'S Cap Theorem, Build Distributed Systems, Build Scalable Servers, Casual Consistency, Central-Server Algorithm, Centralized Single Cpu Systems, Chord, Clock Synchronization In Distri, Combiner Functions, Compound Actions, Concurrency Primitives, Concurrent Collections, Concurrent Programming, Conditions Requirements For Dis, Confinement, Consensus, Consistency, Consistency Protocol, Consistent Ordering Of Operatio, Core Architecture Framework, Cristian'S Algorithm, Data And Client Centric Consist, Data Synchronization, Design And Build Distrubuted Fa, Design Cloud Scale Storage Syst, Design Efficient Data Represent, Design Small Scale Storage Syst, Design Systems That Can Recover, Distributed Coordination, Distributed Deadlock Detection, Distributed Graph Algorithms, Distributed Hash Tables, Distributed Mutual Exclusion, Distributed Objects, Distributed Shared Memory, Election Algorithms, Elections In Wireless Environme, Exclusions, Extreme Scale Storage Systems, Failure Detectors, File System Design, Foundational Issues, Gnutella, Google File System, Grid Computing, Hadoop, Heaps, Hpc, Intrinsic Locks, Intrinsic Reentracy, Lamport'S Clocks, Lazy Evaluations, Locking Strategies, Maekawa Algorithm, Map And Reduce Functions, Mapreduce, Mapreduce Data Flow, Monotonic Read, Monotonic Write, Multivariable Invariants, Multivariable Thread Safety, Napster, Narrow And Wide Transformations, Non-Blocking I O, Overlays, Pair Resilient Distributed Data, Partitioning Functions, Partitioning Schemes, Pastry, Peer To Peer, Performance Considerations, Pipelining Schemes, Race Conditions, Rdbms, Read-Your-Writes, Replica Placements, Resilient Distrubuted Datasets, Ricarat And Agarwala'S Algorith, Ring-Based Algorithm, Scalable Server Design, Scale Entities In Scalable Syst, Sequential Consistency, Sharing Objects, Software Stack, Spark, Stacks, Stateful And Stateless Transfor, Structured Peer To Peer, Synchronization In Wireless Set, Synchronized Collections, Tapestry, Tasks And Split Strategies, Thread Lifecycle, Thread Management, Thread Pools, Thread Safety, Thread-Safe Classes, Time And Global Positioning Sys, Time Synchronization Algorithms, Token Permission Based Approach, Transformations, Unstructured Peer To Peer, Vector And Matrix Clocks, Volunteer Computing, Voting Sets, Windowed Operations, Write Mapreduce Programs That E\n[COURSE] CS470 - Modules [SKILLS] Accelerators, Activation Records, Analysts Memory Heirarchy, Assembly, Cache, Combination Circuits, Custom Accellerators, Datapaths, Design Memory Heirarchy, Disk Memory Heirarchy, Elements Of Compilation, Implementation Memory Heirarchy, Instruction Set Processors, Instruction Set Processors Micr, Pipelining, Prefetching Data, Prefetching Instructions, Ram Memory Heirarchy, Rationale Memory, Register Transfer Notation, Registers, Sequential Circuits, Ssd Memory Heirarchy, Stack Frame, State Machine, Structure Memory, Systolic Matric Multipliers\n[COURSE] CS453 [SKILLS] Activation Records, Asm Code Generation, Ast Lowering, Ast Translation, Compiler Algorithms, Compiler Back-End Design, Compiler Construction, Compiler Front-End Design, Compiler Middle-End Design, Compiler Testing, Control Flow Analysis, Dataflow Analysis, Develope A Compiler From Java, Develope A Compiler From Mips, Grammars, Instructor Scheduling, Intermediate Representation Con, Lexical Analysis, Linear Scan Algorithm, Liveness Analysis, Ll Parsing, Lr Bison, Lr Flex, Lr Parsing, Mips Assembly, Parsers, Register Allocation, Semantic Analysis, Simd Vectorization, Translation, Type Analysis, Type Checking\n[COURSE] CS425 [SKILLS] Algorithm Implementation On Bio, Bioinformatics, Biological Data Analysis, Deep Learning, Genomic Assembly Sequence, Machine Learning, Motif Finding, Predicting Protein-Coding Regio\n[COURSE] CS435 [SKILLS] Big Data, Big Data Organization, Data Storage Systems, Distributed Storage Architectur, Experience Using Real-World Big, In-Memory Data Analytics, Knowledge Discovery At Scale, Large Scale Data Analysis Frame, Large Scale Data Analysis Model, Scalable Data Abalytics Framewo, Self-Descriptive Data Represent\n[COURSE] CS162 [SKILLS] Abstraction, Arrays, Assignment, Booleans, Characters, Classes, Conditionals, Encapsulations, Expressions, I O File, Inheritance, Interfaces, Java, Lists, Loops, Object Oriented Programming, Objects, Operator, Polymorphism, Recursion, Sorting, Strings\n[COURSE] CS270 [SKILLS] Activation Records, Arithmetic Logic Unit, Assembly, Bitwise Operators, C, C++, Circuits, Computer Architecture, Datatype Conversion, Digital Logic, Dynamic Memory Management, Gates, Global Variables, Instruction Set Architectures, Lc3, Logism, Memory Management, Number Formats, Point, Registers, Stack, State Machine, Subroutines, Transistors\n[COURSE] CS250 [SKILLS] Assembly, B+ Trees, Bits, Boolean Algebra, Boolean Logic, Border Gateway Protocol, Cache, Computer Architecture, Data Structures, Data Structures For Storage Sys, Databases, Datatypes, Distributed Systems, Domain Name System, File Systems, Frameworks, Graphic Processing Unit, Harvard Architectue, Internet Protocol, Logged Structure Merge, Logic Design, Machine Language, Main Memory, Memory Efficientcy, Memory Hierarchy, Memory Management, Nand Gates, Networks, Neuromorphic Computing, Numeric Datatypes, Parallel Systems, Parallelization, Processors, Registers, Signed Unsigned Numbers, Sorted Strings Tables, Storage, Transmission Control Protocol, User Datagram Protocol\n[COURSE] CS464 [SKILLS] Experimental Design, Human Centered Interaction Foun, Human Computer Interaction, Human Computer Interaction Rese, Human Factors, Hypothesis Testing, Interaction Design Principles, Latex, Perception, Principles Of User Experience, Principles Of User Usability, Prototyping, Publishing Research Paper, Qualitiative Analysis, Scientific Foundations, User Centered Interaction Desig, Writing Research Paper\n[COURSE] CS440 [SKILLS] Analyse Implications Of Ai, Apply Ai To Real-World Problems, Conceptual Frameworks, Deep Learning, Designing, Training, & Evaluati, Ethical And Societal Implicatio, Fundamental Principles Of Ai, Knowledge Representation, Machine Learning Technique, Neural Networks, Practical Aspects Of Artificial, Probabilistic Models, Reasoning, Scikit-Learn, Search Algorithms, Software Artifacs, Software Process Evaluation, Software Process Improvement, Software Processes, Software Product Evaluation, Software Product Improvement, Supervised Learning, Symbolic Reasoning, Tensorflow, Theoretical Aspects Of Artifici\n[COURSE] CS458 [SKILLS] Blockchain, Consensus Protocols, Cryptocurrency, Cryptographic Algorithms, Decent, Decentralized Exchanges (Uniswa, Distributed Application, Ethereum Virtual Machine, Fault Tolerance, Hyperledger, Iota, Mining Algorithms, Mining Game Theory, Naivechain, Naivecoin, Network Models, Oracles, Proof Of Importance, Proof Of Stake, Proof Of Work, Rchain, React, Smart Contracts, Solidity, Sybil Resistance\n[COURSE] CS456 [SKILLS] Anonymity, Art Of Invisibility, Attack Surfaces, Computer Log Scanners, Crypto Key Management, Cyber Kill Chain, Cyber Security, Cyber Security Threats, Cyber Security Vulnerabilities, Cyber Threat Intelligence, Cyber-Physical Security, Data Science, Differential Privacy, Dns Over Https, Dns Over Tls, Dns Privacy, Elliptic Curve Crypto, End-To-End Encryption, Homomorphoc Encryption, Https, Incident Response, Log Analysis, Malware Analysis, Malware Detection Using Ml, Malware Relationship Analysis, Owasp, Privacy, Privacy By Design, Privacy Engineering, Privacy Enhancing Technologies, Protecting Digital Data, Protecting Sensitive Data, Quantum Computing, Quantum Cryptography, Securing Data At Rest, Securing Data In Motion, Securing Data-In-Use, Security Testing, Siem Systems, Signal Protocol, Socks, Tls, Vpn, Zero-Knowledge Proofs, Zero-Trust, Zero-Trust Security\n[COURSE] CS414 [SKILLS] Architechure Design, Backlogs Refinement, Code Idioms, Code Styles, Cohesion, Conceptual Modeling, Coupling, Crc Modeling, Design Modeling, Design Models, Design Patterns, Domain Modeling, Gather Requirements, Inheritance, Interfaces, Object Oriented Analysis, Object Oriented Design, Object Oriented Principles, Problem Analysis, Refactoring, Requirement Breakdown, Requirements Engineering, Retroactives, Software Engineering Ethics, Software Process Models, Story Mapping, Team Development Experience, Team Interaction, Team Project, Test Driven Development\n[COURSE] CS457 [SKILLS] Arp, Arp Addresses, Bgp, Circuit Switching, Congestion Control, Connected-Oriented Transport, Connectionless Transport, Content Distribution Networks, Cryptography, Data Center Networking, Demultiplexing, Digital Signatures, Distance-Vector, Dns, End-Point Authentication, Error-Corrections, Error-Detection, Ethernet Protocol, Forwarding, Http, Interdomain Routing, Internetworking, Intra-As Routing, Ip Networking Stack, Ip-Anycast, Ipv6, Link Layer Access Control, Link Virtualization, Linked-State, Mac Addresses, Message Integrity, Middleboxes, Multiple Access Links Protocol, Multiplexing, Nat, Netowrk Nfv, Network Application Protocols, Network Architecture, Network Delay, Network Layer Control Plane, Network Layers, Network Loss, Network Sdn, Network Security, Network Throughput, Ospf, P2P File Distribution, Packet Switching, Pv4, Reliable Data Transfer, Routing, Routing Algorithms, Routing Policy, Service Models, Smtp, Socket Programming, Software Defined Networking, Switched Ethernet Mpls, Switched Local Area Networks, Switches, Tcp, Transport Layer Services\n[COURSE] CS415 [SKILLS] Automatic Test Generation, Control Flow Graphs, Data Flow Criteria, Dbconnector, Fitnesse, Fuzz Testing, Gamification Of Learning, Graph Coverage Criteria, Graph Coverage For Specificatio, Graph Coverage For State Charts, Graph Coverage For Use Cases, Graph Fundamentals, Input Space Partitioning, Junit, Junit Parameterized Classes, Junit Theory, Mocking Concepts, Mockito, Mutation Analysis, Regression Testing, Restcontroller, Selenium, Static Analysis\n[COURSE] CS475 [SKILLS] Compiling Programs On Clusters, Cuda, Mpi, Openmp, Parallel Program Analysis, Parallel Program Debugging, Parallel Program Design, Parallel Programming\n[COURSE] CS110 - Modules [SKILLS] Computer Hardware, Computer Input, Computer Output, Computer Security, Computer Software, Computer Storage, Data Analytics, Databases, Digital Identity, Excel, File Management, Mac Os, Mobile Devices, Networking And Connecting To Th, Social Media, Spreadsheets\n[COURSE] CS152 [SKILLS] Algorithms, Data Analysis, Data Visualization, Functions, I O File, Libraries, Logic, Problem Solving\n[COURSE] CS430 [SKILLS] Application Development, Concurrency, Create, Manage, And Optimize Da, Creating Database, Data Modeling, Database Design, Database Guis, Database Recovery, Database Security, Database Systems, E R Diagram, Hash Table, Indexing, Locking, Migrating Databases, Mysql, Normalization, Queries, Query Processing, Relational Algebra, Robust Implementation Of Databa, Sql, Storage Management, Transaction Management, Transactions, Updating Databases\n[COURSE] CS312 [SKILLS] Asychronous, Cascading Style Sheets, Databases, Full-Stack Development, Hypertext Markup Language, Javascript, Microservices, Rest Api, Service-Based Architecture, Usability Assessment And Enhanc, User Account Creation, Ux Design, Web Applications\n[COURSE] CS356 [SKILLS] Access Control, Buffer Overflow Attacks, Cryptographic Tools, Database Security, Defensive Programming Technique, Denial Of Service Attacks, Ethical Hacking Experiments, Hacking Demos, Internet Authentication, Intrusion Detection, Malicious Software, Prevention Systems, User Authentication\n[COURSE] CS470 [SKILLS] Assembly, C Language, Digital Gates, Instruction Set Architectures",RubyMissing Skills: Strong communication skillsMissing Skills: Front-end frameworksMissing Skills: Customer needsMissing Skills: ProgrammingMissing Skills: JavaScriptMissing Skills: ReliabilityMissing Skills: React.jsMissing Skills: EfficiencyMissing Skills: Bug fixingMissing Skills: PlanningMissing Skills: CollaborationMissing Skills: FeedbackMissing Skills: High performanceMissing Skills: AngularMissing Skills: DevelopmentMissing Skills: Maintaining software applicationsMissing Skills: Code/API documentationMissing Skills: VueMissing Skills: Knowledge sharingMissing Skills: Problem Solving SkillsMissing Skills: Analytical skillsMissing Skills: DesignMissing Skills: Code ReviewMissing Skills: DevOps,1


## Question (Jobs) and Context (Courses) Encoding for DPR Training

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# question_encoder = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
# context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')

# question_encoder = question_encoder.to(device)
# context_encoder = context_encoder.to(device)

# pos_weight = torch.tensor([(len(td_df) - td_df['label'].sum()) / td_df['label'].sum()]).to(device)
# loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
# print(td_df['label'].value_counts())

In [20]:
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')

def encode_data(batch):
    max_length = 512

    question_texts = batch['question'] if isinstance(batch['question'], list) else [str(batch['question'])]
    question_texts = [str(text) for text in question_texts]
    
    pos_contexts = batch['pos_context'] if isinstance(batch['pos_context'], list) else [str(batch['pos_context'])]
    neg_contexts = batch['neg_context'] if isinstance(batch['neg_context'], list) else [str(batch['neg_context'])]
    
    pos_contexts = [str(text) for text in pos_contexts]
    neg_contexts = [str(text) for text in neg_contexts]

    question_encodings = question_tokenizer(
        question_texts, 
        padding="max_length", 
        truncation=True, 
        max_length=max_length, 
        return_tensors="pt"
    )
    pos_context_encodings = context_tokenizer(
        pos_contexts, 
        padding="max_length", 
        truncation=True, 
        max_length=max_length, 
        return_tensors="pt"
    )
    neg_context_encodings = context_tokenizer(
        neg_contexts, 
        padding="max_length", 
        truncation=True, 
        max_length=max_length, 
        return_tensors="pt"
    )

    # Handle labels (unchanged)
    if 'label' in batch:
        labels = torch.tensor(batch['label']).to(torch.float16)
    else:
        labels = torch.zeros(len(question_texts), dtype=torch.long)
        
    return {
        'question_input_ids': question_encodings['input_ids'],
        'question_attention_mask': question_encodings['attention_mask'],
        'pos_context_input_ids': pos_context_encodings['input_ids'],
        'pos_context_attention_mask': pos_context_encodings['attention_mask'],
        'neg_context_input_ids': neg_context_encodings['input_ids'],
        'neg_context_attention_mask': neg_context_encodings['attention_mask'],
        'labels': labels
    }

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


In [21]:
train_df, temp_df = train_test_split(td_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
    "test": Dataset.from_pandas(test_df)
})

encoded_dataset = dataset.map(
    encode_data, 
    batched=True,
    remove_columns=dataset["train"].column_names  # Remove original columns
)

encoded_dataset.set_format(type='torch', columns=[
    'question_input_ids',
    'question_attention_mask',
    'pos_context_input_ids',
    'pos_context_attention_mask',
    'neg_context_input_ids',
    'neg_context_attention_mask',
    'labels'
])


Map: 100%|██████████| 3096/3096 [00:28<00:00, 107.08 examples/s]
Map: 100%|██████████| 664/664 [00:06<00:00, 109.05 examples/s]
Map: 100%|██████████| 664/664 [00:05<00:00, 112.94 examples/s]


## WandB Evaluation Setup

In [22]:
key_file = rf'D:\Development\cs580\CSU-Industry-Skills\dpr-model\WANDB_API_KEY.txt' 

with open(key_file, "r") as f:
    api_key = f.read().strip()

wandb.login(key=api_key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\ayoun\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mayoungren94[0m ([33mayoungren-colostate[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [23]:
def compute_metrics(preds, labels):
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels.cpu(), preds.cpu(), average='binary', zero_division=0
    )
    acc = (preds == labels).float().mean()
    return {
        'accuracy': acc.item(),
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [24]:
def evaluate(model_tuple, dataloader, device):
    question_encoder, context_encoder = model_tuple
    question_encoder.eval()
    context_encoder.eval()
    
    all_preds, all_labels = [], []
    
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        query_out = question_encoder(
            input_ids=batch['question_input_ids'],
            attention_mask=batch['question_attention_mask']
        )
        pos_ctx_out = context_encoder(
            input_ids=batch['pos_context_input_ids'],
            attention_mask=batch['pos_context_attention_mask']
        )
        neg_ctx_out = context_encoder(
            input_ids=batch['neg_context_input_ids'],
            attention_mask=batch['neg_context_attention_mask']
        )

        pos_scores = torch.sum(query_out.pooler_output * pos_ctx_out.pooler_output, dim=1)
        neg_scores = torch.sum(query_out.pooler_output * neg_ctx_out.pooler_output, dim=1)
        
        preds = torch.stack([pos_scores, neg_scores], dim=1).argmax(dim=1)

        if preds.dim() == 0:
            preds = preds.unsqueeze(0)
        if batch['labels'].dim() == 0:
            batch['labels'] = batch['labels'].unsqueeze(0)
        
        all_preds.append(preds.cpu())
        all_labels.append(batch['labels'].cpu())

    if not all_preds or not all_labels:
        raise ValueError("No predictions or labels collected during evaluation.")
    
    return compute_metrics(torch.cat(all_preds), torch.cat(all_labels))

In [25]:
torch.cuda.empty_cache()
gc.collect()

159

In [26]:
def contrastive_loss(pos_scores, neg_scores, labels, margin=0.2):
    loss = torch.where(
        labels == 0,
        torch.relu(margin - pos_scores + neg_scores),  
        torch.relu(pos_scores - neg_scores + margin)
    )
    return loss.mean()

In [27]:
def train_dpr(config):
    model_name = f"dpr--lr:{config['lr']}--{config['batch_size']}--wd:{config['weight_decay']}"
    run = wandb.init(
        project="Gaps-DPR",
        entity="ayoungren-colostate",
        name= model_name,
        config={
            'learning_rate': config['lr'],
            'batch_size': config['batch_size'],
            'weight_decay': config['weight_decay'],
        }
    )

    torch.cuda.empty_cache()
    gc.collect()

    scaler = torch.amp.GradScaler('cuda',)
    autocast = torch.amp.autocast(device_type='cuda', dtype=torch.float16, enabled=True)
    
    question_encoder = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base').to(device)
    context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base').to(device)
    
    grad_accum_steps = max(1, config['batch_size']//8)
    effective_batch = config['batch_size'] // grad_accum_steps

    train_loader = DataLoader(
        encoded_dataset["train"], 
        batch_size=effective_batch, 
        shuffle=True,
        pin_memory=True,
        num_workers=2,        
        persistent_workers=True
    )
    
    val_loader = DataLoader(
        encoded_dataset["validation"], 
        batch_size=effective_batch, 
        shuffle=False,
        pin_memory=True
    )

    optimizer = AdamW(
        list(question_encoder.parameters()) + list(context_encoder.parameters()),
        lr=config['lr'],
        weight_decay=config['weight_decay']
    )

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config['warmup_steps'],
        num_training_steps=len(train_loader) * config['epochs'] // grad_accum_steps
    )

    best_val_f1 = 0
    early_stop_counter = 0
    patience = 30

    for epoch in range(1, config['epochs']+1):
        print(f"\nBeginning Epoch {epoch} for model architecture {model_name}")
        question_encoder.train()
        context_encoder.train()
        
        epoch_loss = 0
        all_preds, all_labels = [], []
        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            # print(torch.cuda.memory_summary())
            batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
            
            with autocast:
                query_out = question_encoder(
                    input_ids=batch['question_input_ids'],
                    attention_mask=batch['question_attention_mask']
                )
                pos_ctx_out = context_encoder(
                    input_ids=batch['pos_context_input_ids'],
                    attention_mask=batch['pos_context_attention_mask']
                )
                neg_ctx_out = context_encoder(
                    input_ids=batch['neg_context_input_ids'],
                    attention_mask=batch['neg_context_attention_mask']
                )

                pos_sim = torch.sum(query_out.pooler_output * pos_ctx_out.pooler_output, dim=1)
                neg_sim = torch.sum(query_out.pooler_output * neg_ctx_out.pooler_output, dim=1)

                targets = batch['labels'].float()
                loss = contrastive_loss(pos_sim, neg_sim, targets) / grad_accum_steps

                scores = torch.stack([pos_sim, neg_sim], dim=1)
                preds = scores.argmax(dim=1)

            scaler.scale(loss).backward()

            if (step + 1) % grad_accum_steps == 0:
                scaler.step(optimizer)
                scaler.update() 
                scheduler.step() 
                optimizer.zero_grad()

            epoch_loss += loss.item() * grad_accum_steps
            
            with torch.no_grad():
                preds = scores.argmax(dim=-1).detach().cpu()
                all_preds.append(preds)
                all_labels.append(batch['labels'].cpu())

            if step % 10 == 0:
                run.log({
                    "batch_loss": loss.item() * grad_accum_steps,
                    "batch_lr": scheduler.get_last_lr()[0],
                    "gpu_mem": torch.cuda.memory_allocated()/1e9
                }, commit=False) 
                torch.cuda.empty_cache()
                gc.collect()

        train_metrics = compute_metrics(torch.cat(all_preds), torch.cat(all_labels))
        avg_train_loss = epoch_loss / len(train_loader)

        val_metrics = evaluate((question_encoder, context_encoder), val_loader, device)

        run.log({
            "epoch": epoch,
            "train_loss": avg_train_loss,
            "train_accuracy": train_metrics['accuracy'],
            "train_f1": train_metrics['f1'],
            "train_precision": train_metrics['precision'],
            "train_recall": train_metrics['recall'],
            "val_accuracy": val_metrics['accuracy'],
            "val_f1": val_metrics['f1'],
            "val_precision": val_metrics['precision'],
            "val_recall": val_metrics['recall'],
            "learning_rate": scheduler.get_last_lr()[0],
            "gpu_mem": torch.cuda.memory_allocated()/1e9
        }, commit=True)

        if val_metrics['f1'] > best_val_f1:
            best_val_f1 = val_metrics['f1']
            early_stop_counter = 0
            torch.save({
                "question_encoder": question_encoder.state_dict(),
                "context_encoder": context_encoder.state_dict(),
                "config": config,
                "val_metrics": val_metrics
            }, f"best_dpr_model_{wandb.run.id}.pt")
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

    checkpoint = torch.load(f"best_dpr_model_{wandb.run.id}.pt", map_location=device)
    question_encoder.load_state_dict(checkpoint["question_encoder"])
    context_encoder.load_state_dict(checkpoint["context_encoder"])
    
    test_metrics = evaluate((question_encoder, context_encoder), val_loader, device)
    run.log({
        "val_accuracy": test_metrics['accuracy'],
        "val_recall": test_metrics['recall'],
        "val_precision": test_metrics['precision'],
        "val_f1": test_metrics['f1'],
        "final_gpu_mem": torch.cuda.memory_allocated()/1e9
    })
    run.finish()

    return test_metrics['f1']

In [None]:
hyperparams = {
    'epochs': [150],
    'lr': [2e-5, 3e-5, 4e-4, 4e-5],
    'batch_size': [128, 64, 32, 16],
    'weight_decay': [0.01],
    'warmup_steps': [500]
}

best_score = 0
best_config = None

for config in ParameterGrid(hyperparams):
    print(f"Training with config: {config}")
    current_score = train_dpr(config)
    
    if current_score > best_score:
        best_score = current_score
        best_config = config
        print(f"New best config: {best_config} with F1: {best_score}")

print(f"Best configuration: {best_config}")
print(f"Best validation F1: {best_score}")

Training with config: {'batch_size': 128, 'epochs': 150, 'lr': 2e-05, 'warmup_steps': 500, 'weight_decay': 0.01}


Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the


Beginning Epoch 1 for model architecture dpr--lr:2e-05--128--wd:0.01




In [None]:
def evaluate_final_model(run_id):
    """Run this separately after selecting best run"""
    run = wandb.init(project="Gaps-DPR", name=f"final-test-{run_id}")
    
    checkpoint = torch.load(f"best_dpr_model_{run_id}.pt")
    

    question_encoder = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base').to(device)
    context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base').to(device)
    question_encoder.load_state_dict(checkpoint["question_encoder"])
    context_encoder.load_state_dict(checkpoint["context_encoder"])
    
    test_loader = DataLoader(
        encoded_dataset["test"], 
        batch_size=checkpoint["config"]["batch_size"], 
        shuffle=False,
        pin_memory=True,
        num_workers=4,
        persistent_workers=True
    )

    with torch.inference_mode():  
        test_metrics = evaluate((question_encoder, context_encoder), test_loader, device)
    
    run.log({
        "final_test_accuracy": test_metrics['accuracy'],
        "final_test_f1": test_metrics['f1']
    })
    run.finish()
    return test_metrics

In [None]:
if best_config:
    print("\n=== Running Final Test Evaluation ===")
    test_results = evaluate_final_model(config)
    print(f"Final Test F1: {test_results['f1']:.4f}")