# Analyzing Models Best Suited for Gaps Project

## Imports

In [1]:
import gc
import re
import csv
import json
import torch
import wandb
import random

import pandas as pd
# import numpy as np

from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer, DPRContextEncoder, DPRContextEncoderTokenizer, DPRReader, Trainer, TrainingArguments

# Set every listed pandas display option to None so the DataFrames printed
# further down are not cut short by the default row/column/width limits.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
    'display.max_seq_item',
):
    pd.set_option(_display_option, None)

## Data Preprocessing

In [2]:
# Load the raw course table. Each CSV row is read as
# [course_code, field, field, ...]; the fields after the code are treated
# as skill names below.
# newline='' leaves line-ending handling to the csv module, as its
# documentation recommends for files passed to csv.reader.
with open('data/all-csu-codes.csv', 'r', newline='') as c_data:
    csv_reader = csv.reader(c_data)
    courses_data = list(csv_reader)

# Collect one record per course and build the DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every earlier
# row on each iteration.
course_records = []
for course_row in courses_data:
    if not course_row:
        # csv.reader yields [] for blank lines; there is no course code to read.
        continue

    # NOTE(review): the [1:-1] slice skips the course code AND the final field
    # of every row. Kept as-is to preserve the generated CSV, but confirm the
    # last field really is something to discard and not a skill.
    #
    # Skills are title-cased; a standalone "Vs" left behind by title-casing is
    # then upper-cased (e.g. "Vs Code" -> "VS Code").
    skill_list = [
        re.sub(r'\b(vs|Vs)\b', 'VS', skill.title())
        for skill in course_row[1:-1]
    ]
    course_records.append({'Courses': course_row[0], 'Skills': skill_list})

# columns= keeps the two expected columns even when no rows were read.
courses_df = pd.DataFrame(course_records, columns=['Courses', 'Skills'])

courses_df.to_csv('data/dpr_courses_data.csv', index=False)

In [3]:
# Load the raw job postings. The file is pipe-delimited and each usable row
# has exactly three fields: title | description | comma-separated skills.
# newline='' leaves line-ending handling to the csv module, as its
# documentation recommends for files passed to csv.reader.
with open('data/descriptions.txt', 'r', newline='') as j_data:
    csv_reader = csv.reader(j_data, delimiter='|')
    jobs_data = list(csv_reader)

# Collect one record per posting and build the DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every earlier
# row on each iteration.
job_records = []
for fields in jobs_data:
    # Rows that do not split into exactly three fields are skipped silently,
    # matching the original behaviour.
    if len(fields) != 3:
        continue

    raw_title, raw_description, raw_skills = fields

    # Trim whitespace and any stray double quotes around each field.
    job_title = raw_title.strip().strip('"')

    job_description = raw_description.strip().strip('"')
    # Remove the standalone word "DESCRIPTION" from the text.
    job_description = re.sub(r'\bDESCRIPTION\b', '', job_description)

    skills = raw_skills.strip().strip('"')
    skill_list = [skill.strip().strip('"') for skill in skills.split(',')]

    # NOTE(review): the previous version also computed a title-cased copy of
    # the skills with parentheticals removed (re.sub(r'\s?\(.*?\)', '', ...))
    # but never stored it; the DataFrame has always held the skills as
    # written in the file. That stored value is unchanged here. If normalised
    # skills are wanted (e.g. to line up with the title-cased course skills),
    # apply that transformation to skill_list before appending.
    job_records.append({
        'Job_Title': job_title,
        'Job_Description': job_description,
        'Required_Skills': skill_list,
    })

# columns= keeps the three expected columns even when no rows qualified.
jobs_df = pd.DataFrame(
    job_records,
    columns=['Job_Title', 'Job_Description', 'Required_Skills'],
)

jobs_df.to_csv('data/dpr_jobs.csv', index=False)

In [4]:
# Spot check: print the first raw CSV row whose course code is CS470, then
# the first processed row of courses_df, separated by a divider line.
# next() has no default here, so a missing CS470 row raises StopIteration.
cs470_rows = filter(lambda item: item[0] == 'CS470', courses_data)
check_course = next(cs470_rows)

print(check_course)
print('********************************************')
print(courses_df.head(1))

['CS470', 'Assembly', 'C Language', 'Digital Gates', 'Instruction Set Architectures', 'Number Representation']
********************************************
  Courses  \
0   CS462   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Skills  
0  [3D Modeling, Animations, Assigning 3D Object Properties, Blender, Calculating Lights, Shades,

In [5]:
# Bare expression as the cell's last statement: the notebook renders the
# processed course/skills DataFrame as the cell output.
courses_df

Unnamed: 0,Courses,Skills
0,CS462,"[3D Modeling, Animations, Assigning 3D Object Properties, Blender, Calculating Lights, Shades, And, Camera Rendering, Character Animation, Character Design, Collisions, Constructing 3D Scenes, Design 3D, Develop 3D, Event Triggering, Finite State Machine, Fourier Transform, Game Design, Geometric Image Manipulation, Human Aware Ai, Immersive 3D Worlds, Implementing Animation To Chara, Implementing Movement To Charac, Manipulate Lighting, Manipulate Rendering, Manipulating 3D Object Properti, Movement, Principles Of Lighting, Principles Of Rendering, Real Time Rendering Pipeline, Scene Composition, Scene Layout, Scripting Interactive Behaviors, Scripting Interactive Elements, Tangent Space, Unity, Vectors]"
1,CS314,"[Agile, Black Box Testing, Burndown Charts, Clean Code, Cmmi, Code Climate, Code Quality, Communication, Compatability Standards, Concurrency, Configuration Management, Continuous Integration, Databases, Development Environments, Devops, Docker, Docker Container, Establishing Interpersonal Rela, Git, Github, Github Actions, Github Projects, Github Repository, Individual Metrics, Integration Tests, Intellij, Java, Java Concurrency, Java Spark, Json, Junit, Kml, Linux, Mariadb, Maven, Networking, Npm, Optimization, Peer Evaluation, Peer Review, Port Forwarding, Postman, Problem Solving, Product Integration, Project Management, Project Planning, Refactoring, Remote Development, Rest Api, Retrospectives, Scrum, Slack, Slf4J, Software Development Practices, Source Control, Sprint Planning, Sql, Story Boards, Story Sizing, Task Breakdown, Team Development Experience, Team Diversity, Team Metrics, Teamwork, Test Driven Development, Tuckman'S Model, Unix, Use Case Testing, Verification, VS Code, Webpack, White Box Testing]"
2,CS165,"[Algorithms, Assertions, B+ Trees, Binary Search Trees, Black Box Testing, Branching Recursion, Data Structures, Dequeues, Expression Trees, Generics, Graph, Hashmap, Infix, Inheritance, Java, Linkedlists, Object Oriented Principles, Object Oriented Programming, Pcre, Polymorphism, Postfix, Prefix, Priority Queues, Problem Solving, Queues, Regex, Stack, Unit Testing]"
3,CS201,"[Argument Construction, Computer Solution Designing, Computer Solution Implementatio, Computer Solution Operations, Decision Making, Ethical Dilemma Analysis, Ethical Dilemma Problem Solving, Ethics Analysis, Legal Obligations, Moral Obligations, Philosophy, Professional Code Of Ethics]"
4,CS370,"[Commercial Operating Systems, Containers, Deadlocks, Deadlocks Management, Design Threaded Programs, File System Architecture, Interprocess Communication, Kernel Threads, Memory Management, Open Source Operating Systems, Operating Systems, Process Synchronization, Processes Management, Resource Management, Scheduling Algorithms, Storage Architecture, Symmetric Multiprocessing, Synchronization, Task Synchronization, Thread Management, Threads, Type-1 Hypervisors, Type-2 Hypervisors, User Threads]"
5,CS150,"[Conditionals, Data Analysis, Data Visualization, Dictionaries, Functions, Hcc, Html, I O Console, I O File, Libraries, Lists, Loops, Operator, Privacy, Python, Research, Research Design, Security, Sets]"
6,CS110,"[Data Analysis, Data Manipulation, Data Visualization, Hardware, Privacy, Research, Security]"
7,CS455,"[Abstraction, Algorithms Underpinning P2P Sys, Amazon Dynamo, Architectural Styles For Distri, Berkley Algorithm, Bittorrent, Brewer'S Cap Theorem, Build Distributed Systems, Build Scalable Servers, Casual Consistency, Central-Server Algorithm, Centralized Single Cpu Systems, Chord, Clock Synchronization In Distri, Combiner Functions, Compound Actions, Concurrency Primitives, Concurrent Collections, Concurrent Programming, Conditions Requirements For Dis, Confinement, Consensus, Consistency, Consistency Protocol, Consistent Ordering Of Operatio, Core Architecture Framework, Cristian'S Algorithm, Data And Client Centric Consist, Data Synchronization, Design And Build Distrubuted Fa, Design Cloud Scale Storage Syst, Design Efficient Data Represent, Design Small Scale Storage Syst, Design Systems That Can Recover, Distributed Coordination, Distributed Deadlock Detection, Distributed Graph Algorithms, Distributed Hash Tables, Distributed Mutual Exclusion, Distributed Objects, Distributed Shared Memory, Election Algorithms, Elections In Wireless Environme, Exclusions, Extreme Scale Storage Systems, Failure Detectors, File System Design, Foundational Issues, Gnutella, Google File System, Grid Computing, Hadoop, Heaps, Hpc, Intrinsic Locks, Intrinsic Reentracy, Lamport'S Clocks, Lazy Evaluations, Locking Strategies, Maekawa Algorithm, Map And Reduce Functions, Mapreduce, Mapreduce Data Flow, Monotonic Read, Monotonic Write, Multivariable Invariants, Multivariable Thread Safety, Napster, Narrow And Wide Transformations, Non-Blocking I O, Overlays, Pair Resilient Distributed Data, Partitioning Functions, Partitioning Schemes, Pastry, Peer To Peer, Performance Considerations, Pipelining Schemes, Race Conditions, Rdbms, Read-Your-Writes, Replica Placements, Resilient Distrubuted Datasets, Ricarat And Agarwala'S Algorith, Ring-Based Algorithm, Scalable Server Design, Scale Entities In Scalable Syst, Sequential Consistency, Sharing Objects, Software Stack, Spark, 
Stacks, Stateful And Stateless Transfor, Structured Peer To Peer, Synchronization In Wireless Set, Synchronized Collections, Tapestry, Tasks And Split Strategies, Thread Lifecycle, Thread Management, Thread Pools, Thread Safety, Thread-Safe Classes, Time And Global Positioning Sys, Time Synchronization Algorithms, Token Permission Based Approach, Transformations, Unstructured Peer To Peer, Vector And Matrix Clocks, Volunteer Computing, Voting Sets, Windowed Operations, Write Mapreduce Programs That E]"
8,CS470 - Modules,"[Accelerators, Activation Records, Analysts Memory Heirarchy, Assembly, Cache, Combination Circuits, Custom Accellerators, Datapaths, Design Memory Heirarchy, Disk Memory Heirarchy, Elements Of Compilation, Implementation Memory Heirarchy, Instruction Set Processors, Instruction Set Processors Micr, Pipelining, Prefetching Data, Prefetching Instructions, Ram Memory Heirarchy, Rationale Memory, Register Transfer Notation, Registers, Sequential Circuits, Ssd Memory Heirarchy, Stack Frame, State Machine, Structure Memory, Systolic Matric Multipliers]"
9,CS453,"[Activation Records, Asm Code Generation, Ast Lowering, Ast Translation, Compiler Algorithms, Compiler Back-End Design, Compiler Construction, Compiler Front-End Design, Compiler Middle-End Design, Compiler Testing, Control Flow Analysis, Dataflow Analysis, Develope A Compiler From Java, Develope A Compiler From Mips, Grammars, Instructor Scheduling, Intermediate Representation Con, Lexical Analysis, Linear Scan Algorithm, Liveness Analysis, Ll Parsing, Lr Bison, Lr Flex, Lr Parsing, Mips Assembly, Parsers, Register Allocation, Semantic Analysis, Simd Vectorization, Translation, Type Analysis, Type Checking]"


In [6]:
# Spot check: gather every raw row whose title field is
# 'Yahoo_Software_Dev_Engineer' (an empty list if there is none), print it,
# then print the first processed row of jobs_df after a divider line.
check_job = list(
    filter(lambda item: item[0] == 'Yahoo_Software_Dev_Engineer', jobs_data)
)

print(check_job)
print('********************************************')
print(jobs_df.head(1))

[['Yahoo_Software_Dev_Engineer', "Who We're Looking For- Junior Software Engineer We hire engineers who love the web, but can see its cracks and its future, too. We look for people who are exceptionally who are exceptionally imaginative, collaborative, and truly excited about tech. Our DSP Reporting team is currently looking for talented full-stack engineers to design, implement, and support robust, scalable, and high-quality reporting solutions Your Responsibilities - Develop and enhance a state-of-the-art reporting and analytics platform. - Build intuitive front-end UIs for reporting and analytics using React. - Develop microservices to power reporting and analytics solutions. - Write clean, maintainable, and performant code, including unit tests and refactoring when needed. - Collaborate with designers and developers to define and deliver new features. - Participate in system architecture reviews, code reviews, performance tuning, and production support. - Own and deliver projects r

In [7]:
# Bare expression as the cell's last statement: the notebook renders the
# processed job postings DataFrame as the cell output.
jobs_df

Unnamed: 0,Job_Title,Job_Description,Required_Skills
0,Adobe_AI_ML_Engineer,"The Opportunity?Adobe is seeking talented and passionate Software Engineer across all organizations to help plan, design, develop, and test software systems or applications for software enhancements and new products used in local, networked, cloud-based or Internet-related computer programs and products. What You'll Do - Develop high-performance, reliable, testable and maintainable code. - Participating in all aspects of software development activities, including design, coding, code review, testing, bug fixing, and code/API documentation. - Collaborate with engineers and participate in daily or weekly stand ups and meetings. - Grow with the support of your team and help others on the team grow by providing thoughtful feedback and uplifting those around you. - Work both independently and collaboratively within a fast-paced development team, with clear, positive, and constructive communication. - Additional responsibilities as needed based on specific role or team What You Need to Succeed - Bachelor's or Master's in Computer Science, Computer Engineering, Electrical Engineer, or equivalent experience required - 1-3+ years of experience in specific skill/field(s) - Proficient in programming languages such as Ruby, Python, Javascript, etc. - Strong technical background with analytical and problem-solving skills - Familiarity with client-side frameworks and libraries like React, Vue, Angular, Web Components. - Excellent problem solving and debugging skills, and direct experience with DevOps in a SaaS environment is a plus. - Customer focused and have real passion for quality and engineering excellence at scale. 
- Excellent communication and collaboration skills.","[Programming, Ruby, JavaScript, Python, Analytical skills, Knowledge sharing, Angular, Problem Solving Skills, React.js, Strong communication skills, Vue, Front-end frameworks, Collaboration, Communication, Development, Quality Assurance, Customer needs, Efficiency, Bug fixing, Code/API documentation, Code Review, Design, High performance, Reliability, Testing, Maintaining software applications, Feedback, Teamwork, Debugging, DevOps, Planning]"
1,Adobe_Junior_SDE,"The Opportunity The Adobe Commerce Services team is looking for a Junior Software Engineer to build secure and scalable high-traffic SaaS services. This group builds and maintains our collection of SaaS-based products for the Adobe Commerce merchants. We utilize the advantages of promoting containerization, clusters, and continuous integration/continuous deployment (CI/CD) methodologies. We're extending the Adobe Commerce product offering through SaaS design principles, moving more and more of the storefront into the cloud, facilitating effortless upgrades on the merchant's side, and enhance our capabilities to deliver quality insights to both the merchant and the shopper. Our team of 15+ people works in smaller product engineering teams. This allows us to be agile while working as part of a larger organization. Though we are established, in production, and iterating, there are plenty of challenges to overcome, decisions to be made, and new development to do. The team follows a hybrid working model, with regular office presence required at our office location. What You'll Do - Implement and deliver high-quality features for Adobe Commerce SaaS Services and Adobe Commerce. - Help with designing features and architectural decisions - Ensure high quality by following coding standard methodologies, code reviews, and providing automated tests. - Help improve the availability, performance, scalability, and security of the product. Adding monitoring and metrics. - Actively participate in the development and update of production infrastructure to the latest changes. - Keep high levels of Security and Compliance with the Adobe Standards. - Work in a Lean-Agile environment. What you need to succeed - Align with our values: Create the future, Own the Outcome, Raise the Bar, and Be Genuine. - Participate in the full lifecycle of a project from concept to completion. - Knowledgeable in Java (Spring Boot). - Knowledgeable in PHP. - Understanding of the SQL. 
- Shift Quality, Security and Observability left in the development process (unit, integration, performance, and UI; logging, tracing, monitoring, alerting). - Good understanding of DevOps practices and the Cloud Tech stack (AWS preferred). - Knowledgeable (or eager to learn) in distributed systems architecture. - Effective communication skills. - Fluent English. Are a plus - Experience with Magento or Adobe Commerce. - Knowledge of e-commerce and payments - Experience in DevOps practices: IaC (Terraform preferred), CI systems (Jenkins preferred) - Experience in k8s","[Security, Improvement, Performance, Product development, Metrics, Monitoring, Scalability, Teamwork, Development, Agile development, Decision making, Knowledge sharing, CI systems, Terraform, DevOps, Efficiency, Lean-Agile environment, Compliance, Programming, Software Development, Shift Quality, PHP, Observability, SQL, Innovation, Quality Insights, SaaS Design Principles, Cloud computing, Quality control, Automated tests, Magento, Adobe Commerce, Java, Spring Boot, Clusters, CI/CD methodologies, Containerization, Designing Features, Architecture Design, Cloud Technology, Distributed systems, Infrastructure Development]"
2,Adobe_Software_Engineering_Intern,"As an Adobe Intern, you'll access socials, corporate social responsibility events, learning, and networking opportunities! You'll also be assigned a dedicated Manager to support you throughout your time within the business. - Location: Basel, Switzerland - Duration: 6 months - Start date: 1st of July 2025 - Eligibility: Open to candidates who are graduating in 2026 - Hybrid Work In Basel we build Adobe Experience Manager (AEM), the industry leader in Experience Management. AEM helps our customers design, manage, and deliver high-quality digital experiences for the world's biggest brands. We are having multiple internship projects open in the following areas: - Backend development (Java and/or NodeJS) - Frontend development (HTTP, HTML, JavaScript) - Cloud Engineering - Machine Learning / AI With our AI-First approach to problem-solving, knowledge about AI is a great asset orthogonal to the areas above. What you will do: - Integrate into a highly collaborative team and work with a mentor to pursue your internship project; - Analyze and design an approach ? possibly using AI ? to solve challenging problems of your project; - Apply your programming skills and best engineering practices to realize the project; - Leverage cloud technologies to deploy and operate at scale; - Share your approach and results with the team and community; - Work on a project that will reach production. What you need to succeed: - Pursuing a BS, MS, or Ph.D. 
in Computer Science or Machine Learning at a Swiss or EU University; - Passionate about programming with a solid understanding of concepts such as data structures, algorithms, OOP, AI; - Good understanding of web technologies; - Excellent interpersonal skills; - Fluent in English, both written and verbal.","[Communication, Interpersonal Skills, Teamwork, Machine learning, Cloud Engineering, Artificial Intelligence (AI), JavaScript, Programming, Algorithms, C++, Data structures, Java, OOP/Func Paradigms, Python, Networking, Analytical skills, Design, Cloud Technology, Adobe Experience Manager, Backend development, Node.js]"
3,Adobe_Software_Quality_Engineer,"What you'll do - Successfully implement and test features and workflows in Acrobat Web, ensuring flawless performance and user experience. - Develop and maintain test automation frameworks, bringing to bear your hands-on expertise in programming languages such as JavaScript, React, or TypeScript. - Implement various testing methodologies, including integration, performance, acceptance, and system testing, to maintain high standards of quality. - Apply your sophisticated knowledge of HTML/JS web technologies, web debugging tools, and open source JS libraries (node, jQuery, etc.) to improve the testing process. - Collaborate with cross-functional teams to determine and address testing requirements and ensure seamless integration and deployment. What you need to succeed - A B.E/B.Tech degree in computer science engineering, coupled with 3 ? proven experience in a known testing role is essential. - Proven analytical skills and excellent scripting/programming knowledge are required to thrive in this engineering position. - Outstanding problem-solving abilities and a proactive attitude to manage releases within tight deadlines, really committing to the task. - Excellent written and verbal communication skills to effectively interact with team members, collaborators, customers, and partners. - A passion for software testing and a dedication to delivering world-class digital experiences that compete on a global scale.","[JavaScript, TypeScript, Programming, React.js, Analytical skills, Scripting, Communication, High performance, Problem Solving Skills, Time management, Quality Assurance, Testing, Testing Workflows, Implementation, Web Debugging Tools, HTML, Open source tools, Testing Methodologies, Collaboration, Deployment]"
4,Adobe_Software_Solutions_Architect,"The Opportunity Hello, We're the Customer Engineering Team (CET) at Adobe, a group of creative problem-solvers who love helping B2B marketing automation teams crush their business goals by adding features and capabilities to our products. We're in a unique spot where we get to build software products with guidance from some of our biggest customers in the world. If you're passionate about tech and love a good challenge, you'll fit right in! We're on the hunt for an experienced Solution Architect in the Product Development Organization. we are looking for you to be a key driver in ensuring the success of our customers, field teams, and partners with their implementation and adoption of Adobe Marketo and Journey Optimizer B2B. You'll collaborate with Adobe's strategic customers to guarantee technical success, product adoption, and continuous improvement. By harnessing customer usage and feedback, you'll identify and drive product innovation throughout the development lifecycle. Additionally, you'll help establish standard methodologies and technical guides to enhance product adoption, enablement, and customer success across the broader Adobe ecosystem. As part of our team you will build shape and increase our products to help customer implement their sophisticated requirements of performance, automation and features while working with a team of excellent engineers inside and around the Adobe engineering organization. What You'll Do - Collaborate with customers and implementation teams to understand and address functional requirements. - Tackle the most sophisticated technical challenges, from design to launch. - Uphold and help define architectural standards while solving intricate architecture solutions. - Partner with engineering teams on company-wide initiatives spanning multiple projects to help solve for the problem. 
- Provide solution architecture recommendations and enablement across applications and services, acting as a trusted advisor to ensure customer success and business scalability. - Work with customers, support teams, product, and engineering to resolve critical issues and drive product requests to resolution. - Author and deliver use case and implementation guides for customer and partner reference. - Find opportunities for product improvements and new solutions during the implementation process. Collaborate with Product and Engineering teams to drive product advancements aligned with customer use cases. - Translate the product roadmap into technical, implementation focused, customer-facing materials for ongoing alignment. - Track and present status changes and recommendations to executives and customers based on product state and roadmap working with Product Managers and Engineering Leads to help set the right direction. What You'll Need to Succeed - Deep Experience with Enterprise Architecture: Proficiency with Adobe cloud solutions, Data retention and ensuring seamless integration and optimization. - Interpersonal: Ability to work across business and technical partners with high energy, personal drive, and positivity, fostering strong relationships. - Team Orientation: Adaptability to the needs and demands of internal and external partners, promoting collaboration and partnership. - Effective Communication: Ability to convey sophisticated ideas both verbally and in writing, ensuring comprehension and acceptance among collaborators. - Leadership: Capability to lead customer meetings and work streams, coordinating across teams to manage expectations and progress. - Passion for Learning: Enthusiasm for new challenges and staying updated on industry and technology trends, driving continuous improvement. - Technical Curiosity: Willingness to dive deep into technology to resolve issues and establish technical credibility, enhancing problem-solving capabilities. 
- Analytical and Problem-Solving Skills: Strong research, hypothesis development, and recommendation synthesis abilities, providing actionable insights. - Strategic and Tactical Direction: Providing comprehensive strategic direction with a deep understanding of tactical details to ensure project success and alignment. - Implementation Experience: Leading implementation efforts of business applications at large enterprise customers, ensuring successful deployment and adoption. - Engineering Leadership: Experience collaborating with product managers and engineering teams to drive features and innovation. - Technical Familiarity: Knowledge of REST APIs, web and mobile technologies (browsers, cookies, HTML, JavaScript), enabling effective technical solutions. - Advertising and Customer Experience Technology: Experience with DMPs, DSPs, Ad servers, Ad Exchanges, Cross-Device Tracking platforms, Email systems, customer data marts, etc., optimizing customer engagement. - Data Warehousing and Customer Data Environments: Familiarity with data storage, movement, security, and CRM systems, ensuring robust data management.","[Collaboration, Teamwork, Problem Solving Skills, Customer engagement, Adobe Marketo, Product adoption, Continuous improvement (CI), Product development, Technical leadership, Product Advancements, Implementation, Team collaboration, Leadership, Innovation, Progress Management, Programming, Mobile technologies, HTML, JavaScript, REST APIs, Web development, Data management, Data storage, CRM systems, Data Mining, Security, Data Management Platforms (DMPs), Cross-Device Tracking platforms, Email systems, Demand-side Platforms (DSPs), Strategic planning, Architecture Solutions, Architecture Design, Communication, Effective Communication, Analytical skills, Actionable Insights, Research, Interpersonal Skills, Solution architecture, Data Retention, Enterprise Architecture, Adobe Cloud Solutions, Optimization, Automation, Customer needs, Feedback, Performance, 
Resolution Drive, Creative Solutions, Recommendation Systems, Product Management, Tooling]"
5,Amazon_Cloud_Support_Engineer_ETL,"As a Cloud Support Engineer you will learn at an accelerated pace how to use and leverage many different cloud technologies to help our customers succeed. You will act as the Cloud Ambassador across AWS products, providing our customers with required tools and tactics to scale their impact in world-wide markets. The Big Data (Distributed Processing) role supports our services that leverage data and produce business insights, which include hadoop ecosystem and component/applications: Apache Spark, Apache Hive, Presto, Map-Reduce, Zookeeper, HBASE, HDFS, YARN and Pig. In this role, engineers interact with customers that reach out to AWS via email, chat or phone, helping our customers use and integrate Big Data services in what is arguably our industry's most exciting space. Engineers develop Subject Matter Expertise in one or more services. Engineers also collaborate with internal teams to provide the required support services to AWS customers. In addition, engineers also work on team functions, automation/optimization projects and continually work on improving operational efficiency. A day in the life?Your day as a Cloud Support Engineer will include, but not be limited to, the following activities: ? You will be primarily responsible for solving customer's cases through a variety of contact channels (telephone, email, and web/live chat), applying advanced troubleshooting techniques to provide tailored solutions and working with them to dive deep into the root cause of an issue.?? You will drive initiatives that improve support-related processes and our customers' experience. These can include tutorials, how-to videos, technical articles, trainings, among others.?? You will leverage your customer support experience to provide feedback to internal AWS teams on how to improve our services, and work on critical, highly complex customer problems that may span multiple AWS services.?? 
You will be continuously learning innovative technologies, and developing new technical skills and other professional competencies.?? You will act as interviewer in hiring processes, and coach/mentor new team members.?? PLEASE NOTE: This role requires the flexibility to work 5 days a week (occasionally on weekends) on a rotational basis. Schedules may align to Sunday - Thursday, Tuesday ? Saturday or Monday - Friday. Want to know what's it like to be a Support Engineer at AWS? Take a look https://www.youtube.com/watch?v=GC3bWcFFZTo About the team?AWS Support Engineering is a customer-facing global organization that provides technical support to our customers as well as our internal teams. As a member our team, you will be at the forefront of this transformational technology, operating on a follow-the-sun model. You will be assisting a global list of companies and developers that are taking advantage of a growing set of services and features to run their mission-critical applications. About AWS Diverse Experiences?AWS values diverse experiences. Even if you do not meet all of the preferred qualifications and skills listed in the job description, we encourage candidates to apply. If your career is just starting, hasn't followed a traditional path, or includes alternative experiences, don't let it stop you from applying. Why AWS??Amazon Web Services (AWS) is the world's most comprehensive and broadly adopted cloud platform. We pioneered cloud computing and never stopped innovating ? that's why customers from the most successful startups to Global 500 companies trust our robust suite of products and services to power their businesses.?Inclusive Team Culture?Here at AWS, it's in our nature to learn and be curious. Our employee-led affinity groups foster a culture of inclusion that empower us to be proud of our differences. 
Ongoing events and learning experiences, including our Conversations on Race and Ethnicity (CORE) and AmazeCon (gender diversity) conferences, inspire us to never stop embracing our uniqueness.?Mentorship & Career Growth?We're continuously raising our performance bar as we strive to become Earth's Best Employer. That's why you'll find endless knowledge-sharing, mentorship and other career-advancing resources here to help you develop into a better-rounded professional. Sales, Marketing and Global Services (SMGS)?AWS Sales, Marketing, and Global Services (SMGS) is responsible for driving revenue, adoption, and growth from the largest and fastest growing small- and mid-market accounts to enterprise-level customers including public sector. The AWS Global Support team interacts with leading companies and believes that world-class support is critical to customer success. AWS Support also partners with a global list of customers that are building mission-critical applications on top of AWS services. BASIC QUALIFICATIONS - 1+ years of software development, or 1+ years of technical support experience?- Experience troubleshooting and debugging technical systems?- Bachelor's degree in Engineering/Computer Science/ Mathematics or any related field or a minimum of 1+ years of related professional and/or military experience and an in-depth understanding of ETL (Extract, Transform, Load), able to create ETL pipelines to ingest data into datalake/warehouse with simple to complex transformations and troubleshoot ETL job issues. Intermediate to advanced expertise in ETL tools such as Talend, Informatica or similar. Knowledge or Experience with Networking and troubleshooting (TCP/IP, DNS, OSI model, routing, switching, firewalls, LAN/WAN, traceroute, iperf, dig, cURL or related). Understanding of OS concepts(memory, storage and CPU usage). PREFERRED QUALIFICATIONS - Knowledge or experience with Hadoop ecosystems (Apache Spark, Apache Hive), Presto/Trino. 
Advanced SQL and query performance tuning skills.?- Knowledge or experience in data lake architecture, administration and data analysis techniques such as quantitative or qualitative analysis.?- Knowledge or experience in various big data or distributed systems (NoSQL, search and streaming).?- Be able to read and understand Python, Scala, Java and Shell code.?- Understanding of cloud computing concepts and/or experience with any cloud platforms (AWS, Azure, Google Cloud).","[Innovation, Customer engagement, Technical leadership, Software Development, Data analysis, Problem Solving Skills, Programming, Quality Assurance, Big data, Automation, Efficiency, Continuous improvement (CI), Advanced Troubleshooting Techniques, Mentorship, Extract, Transform, Load (ETL), Networking, Operating Systems, Troubleshooting, HDFS, Pig, Apache Hive, HBASE, Apache Spark, Operational efficiency, Collaboration, Customer needs, Knowledge sharing, Data lake architecture, Hadoop ecosystems, Shell, Scala, Cloud computing, Java, NoSQL, Python, SQL]"
6,Amazon_Graduate_Software_Engineer,"Do you want to solve business challenges through innovative technology? Do you enjoy working on cutting-edge, scalable services technology in a team environment? Do you like working on industry-defining projects that move the needle? At Amazon, we hire the best minds in technology to innovate and build on behalf of our customers. The intense focus we have on our customers is why we are one of the world's most beloved brands ? customer obsession is part of our company DNA. Our Software Development Engineers (SDEs) use cutting-edge technology to solve complex problems and get to see the impact of their work first-hand. If this is you, come chart your own path at Amazon! The challenges SDEs solve for at Amazon are big and impact millions of customers, sellers, and products around the world. We're looking for individuals who are excited by the idea of creating new products, features, and services from scratch while managing ambiguity and the pace of a company whose ship cycles are measured in weeks, not years. Key job responsibilities?- Collaborate with experienced cross-disciplinary Amazonians to conceive, design, and bring to market innovative products and services.?- Design and build innovative technologies in a large distributed computing environment and help lead fundamental changes in the industry.?- Create solutions to run predictions on distributed systems with exposure to innovative technologies at incredible scale and speed.?- Build distributed storage, index, and query systems that are scalable, fault-tolerant, low cost, and easy to manage/use.?- Work in an agile environment to deliver high quality software. BASIC QUALIFICATIONS - Graduated less than 24 months ago or about to complete a Bachelor's or Master's Degree in Computer Science, Computer Engineering, or related fields at time of application?- Although no specific programming language is required ? 
you should be familiar with the syntax of languages such as Java, C/C++, or Python?- Knowledge of Computer Science fundamentals such as object-oriented design, algorithm design, data structures, problem solving and complexity analysis. PREFERRED QUALIFICATIONS - Previous technical internship(s) if applicable?- Experience with distributed, multi-tiered systems, algorithms, and relational databases?- Experience in optimization mathematics such as linear programming and nonlinear optimisation?- Ability to effectively articulate technical challenges and solutions?- Adept at handling ambiguous or undefined problems as well as ability to think abstractly.","[Communication, Analytical skills, Abstraction, Technical articulation, Algorithms, Distributed systems, Optimization, Problem Solving Skills, Relational databases, Efficiency, Agile environment, Distributed storage, Fault Tolerance, Scalability, Collaboration, Leadership, Innovation, Teamwork, Customer engagement, Creating solutions for distributed systems, Designing and building innovative technologies, Complexity analysis, Algorithm design, C++, Data structures, Java, Object-oriented design, Python]"
7,Amazon_SDE_AI_ML,"We're on the lookout for the curious, those who think big and want to define the world of tomorrow. At Amazon, you will grow into the high impact, visionary person you know you're ready to be. Every day will be filled with exciting new challenges, developing new skills, and achieving personal growth. How often can you say that your work changes the world? At Amazon, you'll say it often. Join us and define tomorrow. At Amazon, we hire the best minds in technology to innovate and build on behalf of our customers. The intense focus we have on our customers is why we are one of the world's most beloved brands ? customer obsession is part of our company DNA. Our interns write real software and collaborate with experienced software development engineers (SDEs) who guide interns on projects that matter to our customers. Key job responsibilities?? Collaborate with experienced cross-disciplinary Amazonians to conceive, design, and bring innovative products and services to market.?? Design and build innovative technologies in a large distributed computing environment, and help lead fundamental changes in the industry.?? Create solutions to run predictions on distributed systems with exposure to innovative technologies at incredible scale and speed.?? Build distributed storage, index, and query systems that are scalable, fault-tolerant, low cost, and easy to manage/use.?? Ability to design and code the right solutions starting with broadly defined problems.?? Work in an agile environment to deliver high-quality software. A day in the life?As an intern, you will be matched to a manager and a mentor. You will have the opportunity to influence the evolution of Amazon technology and lead mission critical projects early in your career. Your design, code, and raw smarts will contribute to solving some of the most complex technical challenges in the areas of distributed systems, data mining, automation, optimization, scalability, and security ? 
just to name a few. In addition to working on an impactful project, you will have the opportunity to engage with Amazonians for both personal and professional development, expand your network, and participate in activities with other interns throughout your internship. No matter the location of your internship, we give you the tools to own your project and learn in a real-world setting. Many of our technologies overlap, and you would be hard pressed to find a team that is not using Amazon Web Services (AWS), touching the catalogue, or iterating services to better personalize for customers. We make the impossible, possible. About the team?At Amazon, we're at the forefront of transformative AI, shaping the next generation of intelligent technologies. For over 25 years, we've been pioneering state-of-the-art AI and machine learning (ML) models to revolutionize customer experiences worldwide. Now, we're on the cusp of a new era, where AI holds the promise to reshape society and business in unprecedented ways. As part of our world-class team of AI experts?scientists, engineers, researchers, product builders, and public policy experts?here, you'll have the opportunity to unlock the next frontiers of Artificial General Intelligence (AGI). We are looking for the brightest minds from a wide range of backgrounds and experiences. Join us in creating transformative AI solutions that will improve lives, solve global challenges, and open up new realms of possibility?from reinventing commerce and accelerating enterprise productivity to advancing universal agents and shaping the future of robotics. 
BASIC QUALIFICATIONS - Knowledge of computer science fundamentals such as object-oriented design, operating systems, algorithms, data structures, and complexity analysis?- Are 18 years of age or older?- Experience with at least one modern language such as Java, Python, C++, or C# including object-oriented design?- Currently enrolled in a Bachelors, Masters, or PhD in Computer Science, Computer Engineering, Data Science, Electrical Engineering, or majors relating to these fields, with an expected graduation date between 10/2025 - 9/2028 PREFERRED QUALIFICATIONS - Experience in optimization mathematics such as linear programming and nonlinear optimization?- Experience with distributed, multi-tiered systems, algorithms, and relational databases?- Candidates with strong interests and academic qualifications/research focus in: Artificial Intelligence, machine learning, and/or Generative AI","[Innovation, Data Mining, Amazon Web Services (AWS), Automation, Cloud computing, Distributed systems, Optimization, Scalability, Security, Machine learning, Algorithms, Generative AI, Relational databases, Artificial Intelligence (AI), Research, Coding skills, Collaboration, Problem Solving Skills, System Design, Transformative AI Solutions, Complexity analysis, C++, C#, Data structures, Java, Object-oriented design, Operating Systems, Python]"
8,Amazon_SDE_Cloud,"Amazon Web Services (AWS) internships are full-time (40 hours/week) for 12 consecutive weeks during summer. By applying to this position, your application will be considered for all locations we hire for in the United States. This position requires that the candidate selected be a U.S. citizen and be eligible to obtain and maintain an active TS/SCI security clearance with polygraph. At Amazon, we hire the best minds in technology to innovate and build on behalf of our customers. The focus we have on our customers is why we are one of the world's most beloved brands ? customer obsession is part of our company DNA. Our interns write real software and collaborate with a select group of experienced software development engineers (SDEs) who guide interns on projects that matter to our customers. As an intern, you will be matched to a manager and a mentor. You will have the opportunity to influence the evolution of Amazon technology and lead mission critical projects early in your career. Your design, code, and raw smarts will contribute to solving some of the most complex technical challenges in the areas of distributed systems, data mining, automation, optimization, scalability, and security ? just to name a few. In addition to working on an impactful project, you will have the opportunity to engage with Amazonians for both personal and professional development, expand your network, and participate in activities with other interns throughout your internship. No matter the location of your internship, we give you the tools to own your project and learn in a real-world setting. Many of our technologies overlap, and you would be hard pressed to find a team that is not using Amazon Web Services (AWS), touching the catalogue, or iterating services to better personalize for customers. If this opportunity interests you, apply and come chart your own path at Amazon. Key job responsibilities?? 
Collaborate with experienced cross-disciplinary Amazonians to conceive, design, and bring innovative products and services to market.?? Design and build innovative technologies in a large distributed computing environment and help lead fundamental changes in the industry.?? Create solutions to run predictions on distributed systems with exposure to innovative technologies at incredible scale and speed. BASIC QUALIFICATIONS - Are enrolled in a Bachelor's degree or above in Computer Science, Computer Engineering, Data Science, Electrical Engineering, or majors relating to these fields with a graduation date between December 2025 - September 2026?- Experience with at least one modern language such as Java, Python, C++, or C# including object-oriented design PREFERRED QUALIFICATIONS - Knowledge of the syntax of languages such as Java, C/C++ or Python. ? Knowledge of Computer Science fundamentals such as object-oriented design, algorithm design, data structures, problem solving, and complexity analysis.","[C++, C#, Java, Object-oriented design, Python, Quality Assurance, Innovation, Collaboration, Innovative technologies, Amazon Web Services (AWS), Distributed systems, Leadership, Networking, Software Development, Customer engagement, Automation, Data Mining, Optimization, Scalability, Security, Programming, Algorithm design, Complexity analysis, Compute technologies, Data structures, Problem Solving Skills]"
9,Amazon_SDE_Embedded_Dev,"Do you want to solve business challenges through innovative technology? Do you enjoy working on cutting-edge, scalable services technology in a team environment? Do you like working on industry-defining projects that move the needle? At Amazon, we hire the best minds in technology to innovate and build on behalf of our customers. The intense focus we have on our customers is why we are one of the world's most beloved brands ? customer obsession is part of our company DNA. Our Software Development Engineers (SDEs) use cutting-edge technology to solve complex problems and get to see the impact of their work first-hand. The challenges SDEs solve for at Amazon are big and impact millions of customers, sellers, and products around the world. We're looking for individuals who are excited by the idea of creating new products, features, and services from scratch while managing ambiguity and the pace of a company whose ship cycles are measured in weeks, not years. If this is you, come chart your own path at Amazon! Key job responsibilities?- Collaborate with experienced cross-disciplinary Amazonians to conceive, design, and bring to market innovative products and services.?- Design and build innovative technologies in a large distributed computing environment and help lead fundamental changes in the industry.?- Create solutions to run predictions on distributed systems with exposure to innovative technologies at incredible scale and speed.?- Build distributed storage, index, and query systems that are scalable, fault-tolerant, low cost, and easy to manage/use.?- Work in an agile environment to deliver high quality software. 
BASIC QUALIFICATIONS - Graduated less than 24 months ago or about to complete a Bachelor's or Master's Degree in Computer Science, Computer Engineering, or related fields at time of application?- Knowledge of Computer Science fundamentals - Programming experience in C or Java / Rust - Knowledge in databases - Experience with Operating systems and Kernel Internals, Rust programming PREFERRED QUALIFICATIONS - Previous technical internship(s) if applicable - Experience with distributed, multi-tiered systems, algorithms, and relational databases - Experience such as linear programming and nonlinear optimisation - Ability to effectively articulate technical challenges and solutions - Adept at handling ambiguous or undefined problems as well as ability to think abstractly - Experience with Distributed Systems: Large Scale Database, Multi-tenant, Highly Available Systems, Fault Tolerance, Disaster Recover, or Transactional Systems","[Efficiency, Teamwork, Distributed storage, Agile environment, Fault Tolerance, Scalability, Innovation, Creating new products, Creative Solutions, Linear Programming, Abstraction, Disaster Recovery, Large Scale Databases, Multi-tenant Systems, Transactional Systems, Nonlinear Optimization, Algorithms, Distributed systems, Problem Solving Skills, Relational databases, Availability, Programming, C, Compute technologies, Databases, Java, Kernel, Operating Systems, Rust]"


In [8]:
# Release the temporary names created by earlier cells; the cells below only
# reference courses_df and jobs_df.
del check_course, check_job, j_data, jobs_data, csv_reader, c_data, courses_data
# Trigger a garbage-collection pass now that those references are gone.
gc.collect()
# Last expression of the cell: displays the (rows, columns) shape of both frames.
courses_df.shape, jobs_df.shape

((35, 2), (79, 3))

### Collect All Skills from CSU Course Data

In [None]:
def get_all_acquired_skills(courses_df):
    """Return the set of every skill listed in the 'Skills' column.

    Each entry of ``courses_df['Skills']`` is treated as an iterable of
    skills; all of them are merged into a single set, so a skill taught by
    several courses appears once.
    """
    # set.union accepts any number of iterables and folds them left to right.
    return set().union(*courses_df['Skills'])

In [10]:
# Union of the skills of every course in courses_df (a set, so duplicates collapse).
all_acquired_skills = get_all_acquired_skills(courses_df)
# Display up to nine entries as a quick sanity check; a set has no defined
# order, so which entries show up may differ between runs.
list(all_acquired_skills)[0:9]

['Moral Obligations',
 'Code Styles',
 'Prefetching Instructions',
 'Smart Contracts',
 'Cyber Threat Intelligence',
 'Computer Output',
 'Debugging',
 'C',
 'Intrusion Detection']

### Create Corpus for CSU Courses 

In [11]:
def create_corpus(courses_df):
    """Convert every course row into a retrieval document.

    Returns a list with one dict per row of ``courses_df``. Each dict holds
    an ``id`` derived from the row's index label, the course name as
    ``title``, a comma-joined skills sentence as ``text`` and the raw
    name/skills under ``metadata``.
    """
    def _as_document(label, record):
        # Build the document for a single (index label, row) pair.
        name = record['Courses']
        skill_list = record['Skills']
        return {
            "id": f"csu_course_{label}",
            "title": name,
            "text": f"Skills: {', '.join(skill_list)}",
            "metadata": {"course_name": name, "skills_acquired": skill_list},
        }

    return [_as_document(label, record) for label, record in courses_df.iterrows()]


# One document dict per course row.
courses_corpus = create_corpus(courses_df)

# Persist the corpus as pretty-printed JSON.
with open("data/dpr_course_corpus.json", "w") as f:
    json.dump(courses_corpus, f, indent=4)

# Last expression of the cell: displays the first document as a JSON string.
json.dumps(courses_corpus[0], indent=4)

'{\n    "id": "csu_course_0",\n    "title": "CS462",\n    "text": "Skills: 3D Modeling, Animations, Assigning 3D Object Properties, Blender, Calculating Lights, Shades, And, Camera Rendering, Character Animation, Character Design, Collisions, Constructing 3D Scenes, Design 3D, Develop 3D, Event Triggering, Finite State Machine, Fourier Transform, Game Design, Geometric Image Manipulation, Human Aware Ai, Immersive 3D Worlds, Implementing Animation To Chara, Implementing Movement To Charac, Manipulate Lighting, Manipulate Rendering, Manipulating 3D Object Properti, Movement, Principles Of Lighting, Principles Of Rendering, Real Time Rendering Pipeline, Scene Composition, Scene Layout, Scripting Interactive Behaviors, Scripting Interactive Elements, Tangent Space, Unity, Vectors",\n    "metadata": {\n        "course_name": "CS462",\n        "skills_acquired": [\n            "3D Modeling",\n            "Animations",\n            "Assigning 3D Object Properties",\n            "Blender",\n 

### Create the Training Dataset

In [None]:

def all_class_comparison(jobs_df, courses_df, all_acquired_skills):
    """Build one example per job, using every course as the context.

    The context is a newline-separated listing of all courses with their
    skills. The answer for a job is the list of its required skills that
    are absent from ``all_acquired_skills``.

    Returns a list of dicts with keys 'question', 'context' and 'answer'.
    """
    # The full-catalogue context is identical for every job, so build it once.
    course_lines = []
    for name, skill_list in zip(courses_df["Courses"], courses_df["Skills"]):
        course_lines.append(f"Course: {name} -- Skills: {', '.join(skill_list)}")
    shared_context = "\n".join(course_lines)

    return [
        {
            'question': f"Job Title: {posting['Job_Title']} -- Job Description: {posting['Job_Description']}",
            'context': shared_context,
            'answer': list(set(posting["Required_Skills"]) - all_acquired_skills),
        }
        for _, posting in jobs_df.iterrows()
    ]


def compare_individual_course(jobs_df, courses_df):
    """Build one training example per (job, course) pair.

    For every job and every single course, the answer is the list of the
    job's required skills that the course does not teach (all of them when
    nothing overlaps, an empty list when the course covers everything).

    Args:
        jobs_df: DataFrame with 'Job_Title', 'Job_Description' and
            'Required_Skills' (an iterable of skill strings per row).
        courses_df: DataFrame with 'Courses' and 'Skills' (an iterable of
            skill strings per row).

    Returns:
        A list of dicts with keys 'question', 'context' and 'answer'. The
        key is 'question' (not 'query') so the rows line up with
        all_class_comparison and with a DataFrame built using
        columns=['question', 'context', 'answer'].
    """
    # A course's context and skill set do not depend on the job: build them once.
    course_entries = []
    for course_name, skills in zip(courses_df['Courses'], courses_df['Skills']):
        # dict.fromkeys de-duplicates while keeping the listed order, so the
        # context text is reproducible (iterating a set of strings is not).
        unique_skills = list(dict.fromkeys(skills))
        course_entries.append((
            f"Course: {course_name} -- Skills: {', '.join(unique_skills)}",
            set(unique_skills),
        ))

    training_data = []
    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']} -- Job Description: {job['Job_Description']}"
        # De-duplicated required skills in their original order.
        job_skills = list(dict.fromkeys(job['Required_Skills']))

        for context, course_skills in course_entries:
            # A single expression covers the "no overlap", "fully covered"
            # and "partially covered" cases alike.
            training_data.append({
                'question': job_text,
                'context': context,
                'answer': [skill for skill in job_skills if skill not in course_skills],
            })

    return training_data


def create_schedule_data(jobs_df, schedules):
    """Build one training example per (job, schedule) pair.

    A schedule is a collection of courses; the answer for a pair is the
    list of the job's required skills that none of the schedule's courses
    teach.

    Args:
        jobs_df: DataFrame with 'Job_Title', 'Job_Description' and
            'Required_Skills' (an iterable of skill strings per row).
        schedules: Iterable of DataFrames, each with 'Courses' and 'Skills'
            columns ('Skills' holding an iterable of skill strings per row).

    Returns:
        A list of dicts with keys 'question', 'context' and 'answer'. The
        key is 'question' (not 'query') so the rows line up with
        all_class_comparison and with a DataFrame built using
        columns=['question', 'context', 'answer'].
    """
    # Context text and pooled skills depend only on the schedule, so compute
    # them once instead of once per job.
    schedule_entries = []
    for sched in schedules:
        context = "\n".join([f"Course: {course} -- Skills: {', '.join(skills)}"
                             for course, skills in zip(sched["Courses"], sched["Skills"])])
        sched_skills = {skill for skills in sched["Skills"] for skill in skills}
        schedule_entries.append((context, sched_skills))

    training_data = []
    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']} -- Job Description: {job['Job_Description']}"
        # De-duplicated required skills in their original order, so the
        # answer list is reproducible (iterating a set of strings is not).
        job_skills = list(dict.fromkeys(job["Required_Skills"]))

        for context, sched_skills in schedule_entries:
            # A single expression covers the "no overlap", "fully covered"
            # and "partially covered" cases alike.
            training_data.append({
                'question': job_text,
                'context': context,
                'answer': [skill for skill in job_skills if skill not in sched_skills],
            })

    return training_data


def get_courseloads(jobs_df, courses_df, number_of_schedules=20):
    """Generate random course schedules and turn them into training examples.

    Every schedule is made of the hard-coded core classes plus five
    electives drawn from the non-core rows of ``courses_df``: two whose
    name starts with 'CS4', two more whose name starts with 'CS3' or 'CS4',
    and one whose name starts with neither. Schedules are de-duplicated
    before being passed to create_schedule_data.

    Args:
        jobs_df: Jobs DataFrame, forwarded to create_schedule_data.
        courses_df: DataFrame with 'Courses' and 'Skills' columns.
        number_of_schedules: How many distinct schedules to generate.

    Returns:
        The training examples produced by create_schedule_data for the
        generated schedules.

    Raises:
        ValueError: If there are too few elective courses to draw a
            schedule, or if distinct schedules stop turning up (the request
            most likely exceeds the number of possible combinations).
    """
    core_classes = ['CS150', 'CS164', 'CS152', 'CS162', 'CS201', 'CS165', 'CS220',
                    'CS270', 'CS250', 'CS314', 'CS370', 'CS320', 'CS214']
    # Stop after this many duplicate draws in a row rather than looping forever
    # when fewer than number_of_schedules distinct schedules exist.
    max_consecutive_duplicates = 10_000

    elective_names = courses_df[~courses_df['Courses'].isin(core_classes)]['Courses']

    if number_of_schedules <= 0:
        return create_schedule_data(jobs_df, [])

    # The elective pools never change between draws, so build them once.
    is_level_3 = elective_names.str.startswith('CS3')
    is_level_4 = elective_names.str.startswith('CS4')
    level_4 = elective_names[is_level_4].tolist()
    level_3_4 = elective_names[is_level_3 | is_level_4].tolist()
    lower_level = elective_names[~is_level_3 & ~is_level_4].tolist()

    if len(level_4) < 2 or len(level_3_4) < 4 or not lower_level:
        raise ValueError(
            "Not enough elective courses to build a schedule: need at least two "
            "'CS4' courses, four 'CS3'/'CS4' courses in total and one other elective "
            f"(found {len(level_4)}, {len(level_3_4)} and {len(lower_level)})."
        )

    schedules_df = []
    used_schedules = set()
    duplicate_streak = 0

    while len(schedules_df) < number_of_schedules:
        l_4_sample = random.sample(level_4, 2)
        # Do not pick the same upper-level course twice.
        remaining_upper = [name for name in level_3_4 if name not in l_4_sample]
        l_3_4_sample = random.sample(remaining_upper, 2)
        # The lower-level pool is disjoint from the upper-level picks by
        # construction, so it needs no extra filtering.
        other_sample = random.sample(lower_level, 1)

        electives = l_4_sample + l_3_4_sample + other_sample
        # The core classes are the same in every schedule, so the set of
        # electives alone identifies a schedule.
        schedule_key = frozenset(electives)
        if schedule_key in used_schedules:
            duplicate_streak += 1
            if duplicate_streak >= max_consecutive_duplicates:
                raise ValueError(
                    f"Could only generate {len(schedules_df)} unique schedules out of the "
                    f"{number_of_schedules} requested; the elective pool is probably too small."
                )
            continue

        duplicate_streak = 0
        used_schedules.add(schedule_key)
        sched_courses = core_classes + electives
        schedules_df.append(courses_df[courses_df['Courses'].isin(sched_courses)].copy())

    return create_schedule_data(jobs_df, schedules_df)


def create_training_data(jobs_df, courses_df, all_acquired_skills, num_negatives=4):
    """Assemble the full training set from the three example generators.

    Concatenates, in order, the examples from all_class_comparison,
    compare_individual_course and get_courseloads, printing the last
    example of each stage as a quick visual check.

    Args:
        jobs_df: Jobs DataFrame passed to every generator.
        courses_df: Courses DataFrame passed to every generator.
        all_acquired_skills: Set of all course skills, used by
            all_class_comparison.
        num_negatives: Currently unused; kept so existing calls keep working.

    Returns:
        A single list containing the examples of all three stages.
    """
    def _preview(label, examples):
        # Show the stage's own last example; an empty stage must neither
        # raise IndexError nor display another stage's sample.
        sample = examples[-1] if examples else "<no examples generated>"
        print(f"{label}: {sample}\n")

    training_data = []

    stage_examples = all_class_comparison(jobs_df, courses_df, all_acquired_skills)
    _preview("All", stage_examples)
    training_data = training_data + stage_examples

    stage_examples = compare_individual_course(jobs_df, courses_df)
    _preview("Individual", stage_examples)
    training_data = training_data + stage_examples

    stage_examples = get_courseloads(jobs_df, courses_df)
    _preview("Course load", stage_examples)
    training_data = training_data + stage_examples

    return training_data

# Earlier variant that also returned the per-job skill lists (kept for reference):
# training_data, all_job_skills = create_training_data(jobs_df, courses_df, all_acquired_skills)

# Build the combined list of training examples (all-courses, per-course and per-schedule).
training_data = create_training_data(jobs_df, courses_df, all_acquired_skills)


# Disabled sanity check comparing the first job's skills with the first answer:
# check = set(all_job_skills[0]) - set(training_data[0]["answer"])
# check2 = set(training_data[0]["answer"]) - set(all_job_skills[0])
# print(check) 
# print(check2)

(2765, 3)

In [None]:
# Tabulate the examples. With an explicit `columns` list only these three keys
# are kept from each dict; a dict that lacks one of them gets NaN in that column.
td_df = pd.DataFrame(training_data, columns=['question', 'context', 'answer'])
print(td_df.shape)

# Write the table to CSV without the index column.
td_df.to_csv('data/dpr_training_data.csv', index=False)

# The DataFrame copy is no longer needed once it is on disk; free it.
del td_df
gc.collect()

(1694, 3)


66

## Question (Jobs) and Context (Courses) Encoding for DPR Training

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# question_encoder = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-multiset-base')
# context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-multiset-base')

# question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-multiset-base')
# context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-multiset-base')

# train_data = {
#     'question': [entry['question'] for entry in dpr_training_data],
#     'context': [entry['context'] for entry in dpr_training_data],
#     'answer': [entry['answer'] for entry in dpr_training_data]
# }

# train_dataset = Dataset.from_dict(train_data)

# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     logging_dir='./logs',
#     logging_steps=100,
#     evaluation_strategy="steps",
#     save_steps=1000,
#     save_total_limit=2,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset
# )

# trainer.train()

## Create and Initialize Dataloader Class

In [None]:
# def collate_fn(batch):
#     question_input_ids = [item['question_input_ids'] for item in batch]
#     context_input_ids = [item['context_input_ids'] for item in batch]
    
#     question_input_ids_padded = pad_sequence(question_input_ids, batch_first=True, padding_value=0)  # Padding value 0 (or any value as required)
#     context_input_ids_padded = pad_sequence(context_input_ids, batch_first=True, padding_value=0)
    
#     return {
#         'question_input_ids': question_input_ids_padded,
#         'context_input_ids': context_input_ids_padded,
#     }

# class DPRDataset(Dataset):
#     def __init__(self, data):
#         self.data = data  # List of (question, context) pairs
#         self.tokenizer = tokenizer  # Your tokenizer
    
#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, idx):
#         question, context = self.data[idx]
#         question_encoding = self.tokenizer(question, truncation=True, padding='max_length', return_tensors="pt")
#         context_encoding = self.tokenizer(context, truncation=True, padding='max_length', return_tensors="pt")

#         return {
#             'question_input_ids': question_encoding['input_ids'].squeeze(0),
#             'context_input_ids': context_encoding['input_ids'].squeeze(0),
#         }

# train_dataloader = DataLoader(training_data, batch_size=32, collate_fn=collate_fn)

## WandB Evaluation Setup

In [None]:
# key_file = r'C:\Development\cs580B3\term_project_model\WANDB_API_KEY.txt' 

# with open(key_file, "r") as f:
#     api_key = f.read().strip()

# # Log into WandB with the API key
# wandb.login(key=api_key)

## Set-Up HuggingFace Evaluations

In [None]:
# learning_rate = 1e-5
# batch_size = 4
# epochs = 3
# neg_sample_size = 4
# warmup_steps = 1000

# # Initialize wandb for tracking
# wandb.init(
#     project="Gaps-DPR", 
#     entity="ayoungren-colostate",
#     name=f"DPR_lr-{learning_rate}_bs-{batch_size}_epochs-{epochs}",
#     config={
#         "learning_rate": learning_rate,
#         "batch_size": batch_size,
#         "epochs": epochs,
#         "neg_sample_size": neg_sample_size,
#         "warmup_steps": warmup_steps
#     }
# )

# optimizer = torch.optim.AdamW(list(question_encoder.parameters()) + list(context_encoder.parameters()), lr=1e-5)

# epochs = 3
# for epoch in range(epochs):
#     question_encoder.train()
#     context_encoder.train()
#     total_loss = 0
#     for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
#         # Move data to device (GPU/CPU)
#         question_input_ids = batch['question_input_ids'].to(device)
#         context_input_ids = batch['context_input_ids'].to(device)
        
#         # Forward pass
#         question_outputs = question_encoder(input_ids=question_input_ids)
#         context_outputs = context_encoder(input_ids=context_input_ids)
        
#         # Get embeddings (use the [CLS] token embeddings for simplicity)
#         question_embeddings = question_outputs.last_hidden_state[:, 0, :]
#         context_embeddings = context_outputs.last_hidden_state[:, 0, :]
        
#         # Compute loss
#         loss = compute_loss(question_embeddings, context_embeddings)
#         total_loss += loss.item()

#         # Backpropagation
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#     print(f"Epoch {epoch + 1} - Loss: {total_loss / len(train_dataloader)}")

# # 5. Save the model
# torch.save(question_encoder.state_dict(), "question_encoder.pth")
# torch.save(context_encoder.state_dict(), "context_encoder.pth")