In [None]:
import re
import os
import gc
import csv
import torch
import wandb
import random
import numpy as np
import pandas as pd

from tqdm import tqdm
from torch.cuda import amp
from torch.utils.data import DataLoader
from sklearn.model_selection import ParameterGrid, train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sentence_transformers import SentenceTransformer, losses, util, InputExample, evaluation

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open ('data/all-csu-codes.csv', 'r') as c_data:
    csv_reader = csv.reader(c_data) 
    courses_data = list(csv_reader)
    
courses_df = pd.DataFrame(columns=['Courses', 'Skills'])

for idx in range(0, len(courses_data)):
    skill_list = courses_data[idx][1:-1]
    skill_list = [skill.title() for skill in skill_list]
    skill_list = [re.sub(r'\b(vs|Vs)\b', 'VS', skill) for skill in skill_list]

    row = pd.DataFrame({'Courses': courses_data[idx][0], 'Skills':[skill_list]})
    courses_df = pd.concat([courses_df, row], ignore_index=True)

courses_df.to_csv('data/courses_data.csv', index=False)

In [3]:
with open('data/descriptions.txt', 'r') as j_data:
    csv_reader = csv.reader(j_data, delimiter='|')
    jobs_data = list(csv_reader)

jobs_df = pd.DataFrame(columns=['Job_Title', 'Job_Description', 'Required_Skills'])
                    
for row in jobs_data:
    if len(row) == 3:
        job_title = row[0].strip().strip('"') 

        job_description = row[1].strip().strip('"')
        job_description = re.sub(r'\bDESCRIPTION\b', '', job_description)
    
        skills = row[2].strip().strip('"')
        skill_list = [skill.strip().strip('"') for skill in skills.split(',')]
        cap_skill_list = [skill.title() for skill in skill_list]
        cleaned_skills = [re.sub(r'\s?\(.*?\)', '', skill) for skill in cap_skill_list]

        row = pd.DataFrame({'Job_Title': job_title, 'Job_Description': job_description, 'Required_Skills': [skill_list]})
        jobs_df = pd.concat([jobs_df, row], ignore_index=True)
            
jobs_df.to_csv('data/jobs_data.csv', index=False)

In [4]:
def get_all_acquired_skills(courses_df):
    all_acquired_skills = set() 
    
    for skills in courses_df['Skills']:
        all_acquired_skills.update(skills)

    return all_acquired_skills

In [None]:
def all_class_comparison(jobs_df, courses_df, all_acquired_skills):
    training_data = []
    course_entries = list(zip(courses_df["Courses"], courses_df["Skills"]))

    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']}\n Job Description: {job['Job_Description']}"
        job_skills = set(job["Required_Skills"])
        missing_skills = job_skills - all_acquired_skills
        num_missing = float(len(missing_skills))
        num_job_skills = float(len(job_skills))
        
        course_info = "COURSES TAKEN:\n"
        for course, skills in course_entries:
            course_info += f"\tCOURSE: {course}\n\t\t{course} SKILLS: {', '.join(skills)}\n"

        query_text = "What skills am I lacking for the following job position, given the classes I have taken?\n\n" + job_text
        query_text += "\n\n" + course_info
        if len(missing_skills) == 0:
            acquired_skills = {', '.join(job['Required_Skills'])}
            training_data.append({
                'query': query_text,
                'answer': f"You qualify as an applicant for the job position, {job['Job_Title']}.\ Your courses provided all listed required skills: {acquired_skills}",
                'label': 1.0
            })
        elif num_missing == num_job_skills:
            answer = ", ".join(list(missing_skills))
            training_data.append({
                'query': query_text,
                'answer': (f"You are missing the following skills required by the postion: " + answer),
                'label': 0.0
            })
        else:
            answer = ", ".join(list(missing_skills))
            training_data.append({
                'query': query_text,
                'answer': (f"You are missing the following skills required by the postion: " + answer),
                'label': (1.0 - (num_missing/num_job_skills))
            })
        
    return training_data

In [None]:
def compare_individual_course(jobs_df, courses_df, neg_per_pos=1):
    training_data = []
     
    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']}\n Job Description: {job['Job_Description']}"
        job_skills = set(job['Required_Skills']) 
        num_job_skills = float(len(job_skills))
        
        for _, course in courses_df.iterrows():
            missing_skills = list(job_skills - set(course["Skills"]))
            num_missing = float(len(missing_skills))
            course_info = f"COURSE TAKEN: {course['Courses']}\n {course['Courses']} SKILLS: {', '.join(course['Skills'])}"

        query_text = "What skills am I lacking for the following job position, given the classes I have taken?\n\n" + job_text
        query_text += "\n\n" + course_info
        if len(missing_skills) == 0:
            acquired_skills = {', '.join(job['Required_Skills'])}
            training_data.append({
                'query': query_text,
                'answer': f"You qualify as an applicant for the job position, {job['Job_Title']}.\ Your courses provided all listed required skills: {acquired_skills}",
                'label': 1.0
            })
        elif num_missing == num_job_skills:
            answer = ", ".join(list(missing_skills))
            training_data.append({
                'query': query_text,
                'answer': (f"You are missing all of the following skills required by the postion: " + answer),
                'label': 0.0
            })
        else:
            answer = ", ".join(list(missing_skills))
            training_data.append({
                'query': query_text,
                'answer': (f"You are missing the following skills required by the postion: " + answer),
                'label': (1.0 - (num_missing/num_job_skills))
            })

    return training_data

In [None]:
def create_schedule_data(jobs_df, schedules):
    training_data = []

    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']}\n Job Description: {job['Job_Description']}"
        job_skills = set(job["Required_Skills"])
        num_job_skills = float(len(job_skills))

        for sched in schedules:
            course_entries = list(zip(sched["Courses"], sched["Skills"]))
            sched_skills = set([item for sublist in sched["Skills"].tolist() for item in sublist])
            missing_skills = list(job_skills - sched_skills)
            num_missing = float(len(missing_skills))
            
            course_info = "COURSES TAKEN:\n"
            for course, skills in course_entries:
                course_info += f"\tCOURSE: {course}\n\t\t{course} SKILLS: {', '.join(skills)}\n"

            query_text = "What skills am I lacking for the following job position, given the classes I have taken?\n\n" + job_text
            query_text += "\n\n" + course_info
            if len(missing_skills) == 0:
                acquired_skills = {', '.join(job['Required_Skills'])}
                training_data.append({
                    'query': query_text,
                    'answer': f"You qualify as an applicant for the job position, {job['Job_Title']}.\ Your courses provided all listed required skills: {acquired_skills}",
                    'label': 1.0
                })
            elif num_missing == num_job_skills:
                answer = ", ".join(list(missing_skills))
                training_data.append({
                    'query': query_text,
                    'answer': (f"You are missing all of the following skills required by the postion: " + answer),
                    'label': 0.0
                })
            else:
                answer = ", ".join(list(missing_skills))
                training_data.append({
                    'query': query_text,
                    'answer': (f"You are missing the following skills required by the postion: " + answer),
                    'label': (1.0 - (num_missing/num_job_skills))
                })

    return training_data


def get_courseloads(jobs_df, courses_df, number_of_schedules=20):
    core_classes = ['CS150', 'CS164', 'CS152', 'CS162', 'CS201', 'CS165', 'CS220', 
                    'CS270', 'CS250', 'CS314', 'CS370', 'CS320', 'CS214']
    
    elective_courses_df = courses_df[~courses_df['Courses'].isin(core_classes)]

    schedules_df = []
    used_schedules = set()

    while len(schedules_df) < number_of_schedules:
        l_4_courses = elective_courses_df[elective_courses_df['Courses'].str.startswith('CS4')]
        l_3_4_courses = elective_courses_df[elective_courses_df['Courses'].str.startswith('CS3') | elective_courses_df['Courses'].str.startswith('CS4')]
        other_courses = elective_courses_df[~elective_courses_df['Courses'].str.startswith('CS3') & ~elective_courses_df['Courses'].str.startswith('CS4')]

        l_4_sample = random.sample(l_4_courses['Courses'].tolist(), 2)

        l_3_4_filtered = l_3_4_courses[~l_3_4_courses['Courses'].isin(l_4_sample)]
        l_3_4_sample = random.sample(l_3_4_filtered['Courses'].tolist(), 2)

        all_sampled_courses = l_4_sample + l_3_4_sample
        other_courses_filtered = other_courses[~other_courses['Courses'].isin(all_sampled_courses)]
        other_sample = random.sample(other_courses_filtered['Courses'].tolist(), 1)

        sched_courses = core_classes + l_4_sample + l_3_4_sample + other_sample
        sched_df = courses_df[courses_df['Courses'].isin(sched_courses)].copy()

        sched_tuple = tuple(sorted(sched_df['Courses'].tolist()))
        if sched_tuple not in used_schedules:
            schedules_df.append(sched_df)
            used_schedules.add(sched_tuple)

    training_data = create_schedule_data(jobs_df, schedules_df)

    return training_data

In [8]:
def create_training_data(jobs_df, courses_df, all_acquired_skills):
    training_data = all_class_comparison(jobs_df, courses_df, all_acquired_skills)
    print(f"All: {training_data[-1]}\n")

    training_data = training_data + compare_individual_course(jobs_df, courses_df)
    print(f"Individual: {training_data[-1]}\n")
    
    training_data = training_data + get_courseloads(jobs_df, courses_df)
    print(f"Course load: {training_data[-1]}\n")

    return training_data

all_acquired_skills = get_all_acquired_skills(courses_df)
training_data = create_training_data(jobs_df, courses_df, all_acquired_skills)

All: {'query': "Job Title: Yahoo_Software_Dev_Engineer\nJob Description: Who We're Looking For- Junior Software Engineer We hire engineers who love the web, but can see its cracks and its future, too. We look for people who are exceptionally who are exceptionally imaginative, collaborative, and truly excited about tech. Our DSP Reporting team is currently looking for talented full-stack engineers to design, implement, and support robust, scalable, and high-quality reporting solutions Your Responsibilities - Develop and enhance a state-of-the-art reporting and analytics platform. - Build intuitive front-end UIs for reporting and analytics using React. - Develop microservices to power reporting and analytics solutions. - Write clean, maintainable, and performant code, including unit tests and refactoring when needed. - Collaborate with designers and developers to define and deliver new features. - Participate in system architecture reviews, code reviews, performance tuning, and productio

In [9]:
print(training_data[0])
td_df = pd.DataFrame(training_data, columns=['query', 'answer'])
print(td_df.shape)

td_df.to_csv('data/bert_training_data.csv', index=False)

del td_df, training_data, jobs_df, courses_df
gc.collect()


{'query': "Job Title: Adobe_AI_ML_Engineer\nJob Description: The Opportunity?Adobe is seeking talented and passionate Software Engineer across all organizations to help plan, design, develop, and test software systems or applications for software enhancements and new products used in local, networked, cloud-based or Internet-related computer programs and products. What You'll Do - Develop high-performance, reliable, testable and maintainable code. - Participating in all aspects of software development activities, including design, coding, code review, testing, bug fixing, and code/API documentation. - Collaborate with engineers and participate in daily or weekly stand ups and meetings. - Grow with the support of your team and help others on the team grow by providing thoughtful feedback and uplifting those around you. - Work both independently and collaboratively within a fast-paced development team, with clear, positive, and constructive communication. - Additional responsibilities as

64

In [10]:
with open ('data/bert_training_data.csv', 'r') as t_data:
    csv_reader = csv.reader(t_data) 
    training_data = list(csv_reader)

print(training_data[0])
td_df = pd.DataFrame(training_data, columns=['query', 'answer'])
print(td_df.shape)
td_df

['query', 'answer']
(1739, 2)


Unnamed: 0,query,answer
0,query,answer
1,Job Title: Adobe_AI_ML_Engineer\nJob Descripti...,You are missing the following skills required ...
2,Job Title: Adobe_Junior_SDE\nJob Description: ...,You are missing the following skills required ...
3,Job Title: Adobe_Software_Engineering_Intern\n...,You are missing the following skills required ...
4,Job Title: Adobe_Software_Quality_Engineer\nJo...,You are missing the following skills required ...
...,...,...
1734,Job Title: Yahoo_Software_Dev_Engineer\nJob De...,You are missing the following skills required ...
1735,Job Title: Yahoo_Software_Dev_Engineer\nJob De...,You are missing the following skills required ...
1736,Job Title: Yahoo_Software_Dev_Engineer\nJob De...,You are missing the following skills required ...
1737,Job Title: Yahoo_Software_Dev_Engineer\nJob De...,You are missing the following skills required ...


In [None]:
def prepare_datasets(td_df):
    td_df['label_bin'] = td_df['label'].round(1)
    train_df, temp_df = train_test_split(td_df, test_size=0.4, random_state=42, stratify=td_df['label_bin'])
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label_bin'])
    
    return {
        'train': [InputExample(texts=[row['query']], label=row['label']) for _,row in train_df.iterrows()],
        'val': [InputExample(texts=[row['query']], label=row['label']) for _,row in val_df.iterrows()],
        'test': [InputExample(texts=[row['query']], label=row['label']) for _,row in test_df.iterrows()]
    }

In [None]:
def compute_metrics(labels, preds):
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted'),
        'precision': precision_score(labels, preds, average='weighted'),
        'recall': recall_score(labels, preds, average='weighted')
    }

In [None]:
key_file = rf'D:\Development\cs580\CSU-Industry-Skills\WANDB_API_KEY.txt' 

with open(key_file, "r") as f:
    api_key = f.read().strip()

wandb.login(key=api_key)

MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
EARLY_STOPPING_PATIENCE = 5
MIN_DELTA = 0.005

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True
print(f"Using device: {device}")

In [None]:
def train_and_validate(config, datasets, run):
    model = SentenceTransformer(MODEL_NAME).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), 
                                lr=config.learning_rate, 
                                weight_decay=config.weight_decay)
    loss_fn = losses.CosineSimilarityLoss(model).to(device)
    
    train_loader = DataLoader(datasets['train'], shuffle=True, 
                            batch_size=config.batch_size, pin_memory=True)
    val_loader = DataLoader(datasets['val'], 
                          batch_size=config.batch_size, pin_memory=True)

    best_val_metric = -np.inf
    patience_counter = 0

    for epoch in range(1, 101):
        model.train()
        epoch_train_loss = []
        train_preds, train_labels = [], []
        
        for batch in train_loader:
            optimizer.zero_grad()
            with amp.autocast():
                embeddings = model(batch['texts'])
                loss = loss_fn(embeddings, batch['labels'])
            loss.backward()
            optimizer.step()
            
            epoch_train_loss.append(loss.item())
            train_preds.extend(torch.argmax(embeddings, dim=1).cpu().numpy())
            train_labels.extend(batch['labels'].numpy())

        train_metrics = compute_metrics(train_labels, train_preds)
        
        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                embeddings = model(batch['texts'].to(device))
                val_preds.extend(torch.argmax(embeddings, dim=1).cpu().numpy())
                val_labels.extend(batch['labels'].numpy())

        val_metrics = compute_metrics(val_labels, val_preds)
        
        run.log({
            'epoch': epoch,
            'train/loss': np.mean(epoch_train_loss),
            'train/accuracy': train_metrics['accuracy'],
            'train/f1': train_metrics['f1'],
            'train/precision': train_metrics['precision'],
            'train/recall': train_metrics['recall'],
            'val/accuracy': val_metrics['accuracy'],
            'val/f1': val_metrics['f1'],
            'val/precision': val_metrics['precision'],
            'val/recall': val_metrics['recall'],
            'hp/learning_rate': config.learning_rate,
            'hp/batch_size': config.batch_size,
            'hp/weight_decay': config.weight_decay
        })

        if val_metrics['f1'] > best_val_metric + MIN_DELTA:
            best_val_metric = val_metrics['f1']
            patience_counter = 0
            torch.save(model.state_dict(), "temp_val_best.pth")
            run.log({'best_val/f1': best_val_metric}) 
        else:
            patience_counter += 1

        if patience_counter >= EARLY_STOPPING_PATIENCE:
            break

    return best_val_metric, "temp_val_best.pth"

In [None]:
def evaluate_test_set(model, test_data):
    test_preds, test_labels = [], []
    test_loader = DataLoader(test_data, batch_size=32)
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            embeddings = model(batch['texts'].to(device))
            test_preds.extend(torch.argmax(embeddings, dim=1).cpu().numpy())
            test_labels.extend(batch['labels'].numpy())
    return compute_metrics(test_labels, test_preds)

In [None]:
def train_model(best_model, datasets, config=None):
    
    with wandb.init(config=config) as run:
        
        val_f1, model_path = train_and_validate(config, datasets, run)
        
        wandb.log({
            'val_f1': val_f1,
            'learning_rate': config.learning_rate,
            'batch_size': config.batch_size,
            'weight_decay': config.weight_decay
        })
        
        if val_f1 > best_model['metric'] + MIN_DELTA:
            if best_model['model_path']:
                os.remove(best_model['model_path'])
            
            best_model = {
                'metric': val_f1,
                'model_path': f"best_model_{run.id}.pth",
                'config': config
            }
            os.rename(model_path, best_model['model_path'])
        else:
            os.remove(model_path)

In [None]:
sweep_config = {
    'method': 'grid',
    'metric': {'name': 'val_f1', 'goal': 'maximize'},
    'parameters': {
        'learning_rate': {'values': [2e-5, 3e-5, 5e-5]},
        'batch_size': {'values': [16, 32]},
        'weight_decay': {'values': [0.0, 0.01]}
    }
}

best_model = []
datasets = prepare_datasets(td_df)
sweep_id = wandb.sweep(sweep_config, entity="ayoungren-colostate", project='SenBERT-Skill-Gaps')

wandb.agent(sweep_id, train_model(best_model, datasets), count=15)

if best_model['model_path']:
    final_model = SentenceTransformer(MODEL_NAME).to(device)
    final_model.load_state_dict(torch.load(best_model['model_path']))
    
    
    test_metrics = evaluate_test_set(final_model, datasets['test'])
    
    with wandb.init(project='sbert-arch-search', job_type='final_eval') as run:
        run.config.update(best_model['config'])
        wandb.log({
            'final_test_f1': test_metrics['f1'],
            'final_test_accuracy': test_metrics['accuracy'],
            'final_test_precision': test_metrics['precision'],
            'final_test_recall': test_metrics['recall']
        })
    
    print(f"=== BEST MODEL RESULTS ===")
    print(f"Config: {best_model['config']}")
    print(f"Validation F1: {best_model['metric']:.4f}")
    print(f"Test F1: {test_metrics['f1']:.4f}")