# Sentence-BERT: Academic-Industry Skill Gaps Through Semantic Identification

## Imports

In [1]:
import re
import os
import gc
import csv
import torch
import wandb
import random
import numpy as np
import pandas as pd

from torch.utils.data import DataLoader
from scipy.stats import pearsonr, spearmanr
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


  from .autonotebook import tqdm as notebook_tqdm


## Data Retrieval

In [None]:
# with open ('data/all-csu-codes.csv', 'r') as c_data:
#     csv_reader = csv.reader(c_data) 
#     courses_data = list(csv_reader)
    
# courses_df = pd.DataFrame(columns=['Courses', 'Skills'])

# for idx in range(0, len(courses_data)):
#     skill_list = courses_data[idx][1:-1]
#     skill_list = [skill.title() for skill in skill_list]
#     skill_list = [re.sub(r'\b(vs|Vs)\b', 'VS', skill) for skill in skill_list]

#     row = pd.DataFrame({'Courses': courses_data[idx][0], 'Skills':[skill_list]})
#     courses_df = pd.concat([courses_df, row], ignore_index=True)

# courses_df.to_csv('data/courses_data.csv', index=False)

In [None]:
# with open('data/descriptions.txt', 'r') as j_data:
#     csv_reader = csv.reader(j_data, delimiter='|')
#     jobs_data = list(csv_reader)

# jobs_df = pd.DataFrame(columns=['Job_Title', 'Job_Description', 'Required_Skills'])
                    
# for row in jobs_data:
#     if len(row) == 3:
#         job_title = row[0].strip().strip('"') 

#         job_description = row[1].strip().strip('"')
#         job_description = re.sub(r'\bDESCRIPTION\b', '', job_description)
    
#         skills = row[2].strip().strip('"')
#         skill_list = [skill.strip().strip('"') for skill in skills.split(',')]
#         cap_skill_list = [skill.title() for skill in skill_list]
#         cleaned_skills = [re.sub(r'\s?\(.*?\)', '', skill) for skill in cap_skill_list]

#         row = pd.DataFrame({'Job_Title': job_title, 'Job_Description': job_description, 'Required_Skills': [skill_list]})
#         jobs_df = pd.concat([jobs_df, row], ignore_index=True)
            
# jobs_df.to_csv('data/jobs_data.csv', index=False)

### Gather all course skills in one structure

In [2]:
def get_all_acquired_skills(courses_df):
    all_acquired_skills = set() 
    
    for skills in courses_df['Skills']:
        all_acquired_skills.update(skills)

    return all_acquired_skills

## Create Training Data
### Job to All Courses Comparison

In [3]:
def all_class_comparison(jobs_df, courses_df, all_acquired_skills):
    training_data = []
    course_entries = list(zip(courses_df["Courses"], courses_df["Skills"]))

    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']}\n Job Description: {job['Job_Description']}"
        job_skills = set(job["Required_Skills"])
        missing_skills = job_skills - all_acquired_skills
        num_missing = float(len(missing_skills))
        num_job_skills = float(len(job_skills))
        
        course_info = "COURSES TAKEN:\n"
        for course, skills in course_entries:
            course_info += f"\tCOURSE: {course}\n\t\t{course} SKILLS: {', '.join(skills)}\n"

        if len(missing_skills) == 0:
            training_data.append({
                'job': job_text,
                'courses': course_info,
                'label': 1.0
            })

        elif num_missing == num_job_skills:
            training_data.append({
                'job': job_text,
                'courses': course_info,
                'label': 0.0
            })
        else:
            training_data.append({
                'job': job_text,
                'courses': course_info,
                'label': (1.0 - (num_missing/num_job_skills))
            })

        
    return training_data

### Job to Individual Course Comaprisons

In [4]:
def compare_individual_course(jobs_df, courses_df):
    training_data = []
     
    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']}\n Job Description: {job['Job_Description']}"
        job_skills = set(job['Required_Skills']) 
        num_job_skills = float(len(job_skills))
        
        for _, course in courses_df.iterrows():
            missing_skills = list(job_skills - set(course["Skills"]))
            num_missing = float(len(missing_skills))
            course_info = f"COURSE TAKEN: {course['Courses']}\n {course['Courses']} SKILLS: {', '.join(course['Skills'])}"

            if len(missing_skills) == 0:
                training_data.append({
                    'job': job_text,
                    'courses': course_info,
                    'label': 1.0
                })

            elif num_missing == num_job_skills:
                training_data.append({
                    'job': job_text,
                    'courses': course_info,
                    'label': 0.0
                })
            else:
                training_data.append({
                    'job': job_text,
                    'courses': course_info,
                    'label': (1.0 - (num_missing/num_job_skills))
                })

    return training_data

### Creation of Psuedo Schedules & Job to Psuedo Schedule Comparisons

In [5]:
def create_schedule_data(jobs_df, schedules):
    training_data = []

    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']}\n Job Description: {job['Job_Description']}"
        job_skills = set(job["Required_Skills"])
        num_job_skills = float(len(job_skills))

        for sched in schedules:
            course_entries = list(zip(sched["Courses"], sched["Skills"]))
            sched_skills = set([item for sublist in sched["Skills"].tolist() for item in sublist])
            missing_skills = list(job_skills - sched_skills)
            num_missing = float(len(missing_skills))
            
            course_info = "COURSES TAKEN:\n"
            for course, skills in course_entries:
                course_info += f"\tCOURSE: {course}\n\t\t{course} SKILLS: {', '.join(skills)}\n"

            if len(missing_skills) == 0:
                training_data.append({
                    'job': job_text,
                    'courses': course_info,
                    'label': 1.0
                })
            elif num_missing == num_job_skills:
                training_data.append({
                    'job': job_text,
                    'courses': course_info,
                    'label': 0.0
                })
            else:
                training_data.append({
                    'job': job_text,
                    'courses': course_info,
                    'label': (1.0 - (num_missing/num_job_skills))
                })


    return training_data


def get_courseloads(jobs_df, courses_df, number_of_schedules=20):
    core_classes = ['CS150', 'CS164', 'CS152', 'CS162', 'CS201', 'CS165', 'CS220', 
                    'CS270', 'CS250', 'CS314', 'CS370', 'CS320', 'CS214']
    
    elective_courses_df = courses_df[~courses_df['Courses'].isin(core_classes)]

    schedules_df = []
    used_schedules = set()

    while len(schedules_df) < number_of_schedules:
        l_4_courses = elective_courses_df[elective_courses_df['Courses'].str.startswith('CS4')]
        l_3_4_courses = elective_courses_df[elective_courses_df['Courses'].str.startswith('CS3') | elective_courses_df['Courses'].str.startswith('CS4')]
        other_courses = elective_courses_df[~elective_courses_df['Courses'].str.startswith('CS3') & ~elective_courses_df['Courses'].str.startswith('CS4')]

        l_4_sample = random.sample(l_4_courses['Courses'].tolist(), 2)

        l_3_4_filtered = l_3_4_courses[~l_3_4_courses['Courses'].isin(l_4_sample)]
        l_3_4_sample = random.sample(l_3_4_filtered['Courses'].tolist(), 2)

        all_sampled_courses = l_4_sample + l_3_4_sample
        other_courses_filtered = other_courses[~other_courses['Courses'].isin(all_sampled_courses)]
        other_sample = random.sample(other_courses_filtered['Courses'].tolist(), 1)

        sched_courses = core_classes + l_4_sample + l_3_4_sample + other_sample
        sched_df = courses_df[courses_df['Courses'].isin(sched_courses)].copy()

        sched_tuple = tuple(sorted(sched_df['Courses'].tolist()))
        if sched_tuple not in used_schedules:
            schedules_df.append(sched_df)
            used_schedules.add(sched_tuple)

    training_data = create_schedule_data(jobs_df, schedules_df)

    return training_data

### Training Data Creation Caller Function

In [6]:
def create_training_data(jobs_df, courses_df, all_acquired_skills):
    training_data = all_class_comparison(jobs_df, courses_df, all_acquired_skills)
    print(f"{len(training_data) } --All: {training_data[-1]}\n")

    training_data = training_data + compare_individual_course(jobs_df, courses_df)
    print(f"{len(training_data) } --Individual: {training_data[-1]}\n")
    
    training_data = training_data + get_courseloads(jobs_df, courses_df)
    print(f"{len(training_data) } --Course load: {training_data[-1]}\n")

    return training_data

# all_acquired_skills = get_all_acquired_skills(courses_df)
# training_data = create_training_data(jobs_df, courses_df, all_acquired_skills)

### Save/Export Training Data for Consisency and Reusability

Uncommenting and using the cell below will overwrite the current saved training data. This will change the psuedo schedules, providing a different dataset.

In [None]:
# print(training_data[0])
# td_df = pd.DataFrame(training_data, columns=['job', 'courses', 'label'])
# print(td_df.shape)

# td_df.to_csv('data/bert_training_data.csv', index=False)

# del td_df, training_data, jobs_df, courses_df
# gc.collect()


{'job': "Job Title: Adobe_AI_ML_Engineer\n Job Description: The Opportunity?Adobe is seeking talented and passionate Software Engineer across all organizations to help plan, design, develop, and test software systems or applications for software enhancements and new products used in local, networked, cloud-based or Internet-related computer programs and products. What You'll Do - Develop high-performance, reliable, testable and maintainable code. - Participating in all aspects of software development activities, including design, coding, code review, testing, bug fixing, and code/API documentation. - Collaborate with engineers and participate in daily or weekly stand ups and meetings. - Grow with the support of your team and help others on the team grow by providing thoughtful feedback and uplifting those around you. - Work both independently and collaboratively within a fast-paced development team, with clear, positive, and constructive communication. - Additional responsibilities as 

### Retrieve/Import Saved Training Data

In [7]:
with open ('data/bert_training_data.csv', 'r') as t_data:
    csv_reader = csv.reader(t_data) 
    training_data = list(csv_reader)

print(training_data[0])
td_df = pd.DataFrame(training_data, columns=['job', 'courses', 'label'])
print(td_df.shape)
td_df

['job', 'courses', 'label']
(4425, 3)


Unnamed: 0,job,courses,label
0,job,courses,label
1,Job Title: Adobe_AI_ML_Engineer\n Job Descript...,COURSES TAKEN:\n\tCOURSE: CS462\n\t\tCS462 SKI...,0.19354838709677424
2,Job Title: Adobe_Junior_SDE\n Job Description:...,COURSES TAKEN:\n\tCOURSE: CS462\n\t\tCS462 SKI...,0.0714285714285714
3,Job Title: Adobe_Software_Engineering_Intern\n...,COURSES TAKEN:\n\tCOURSE: CS462\n\t\tCS462 SKI...,0.33333333333333337
4,Job Title: Adobe_Software_Quality_Engineer\n J...,COURSES TAKEN:\n\tCOURSE: CS462\n\t\tCS462 SKI...,0.15000000000000002
...,...,...,...
4420,Job Title: Yahoo_Software_Dev_Engineer\n Job D...,COURSES TAKEN:\n\tCOURSE: CS314\n\t\tCS314 SKI...,0.13157894736842102
4421,Job Title: Yahoo_Software_Dev_Engineer\n Job D...,COURSES TAKEN:\n\tCOURSE: CS462\n\t\tCS462 SKI...,0.13157894736842102
4422,Job Title: Yahoo_Software_Dev_Engineer\n Job D...,COURSES TAKEN:\n\tCOURSE: CS314\n\t\tCS314 SKI...,0.13157894736842102
4423,Job Title: Yahoo_Software_Dev_Engineer\n Job D...,COURSES TAKEN:\n\tCOURSE: CS314\n\t\tCS314 SKI...,0.13157894736842102


## Model Training
### Preparation Function to Convert Dataframe Structure to Dataset Structure with Train, Test, and Validation Split

In [8]:
def prepare_datasets(td_df):
    td_df = td_df.copy()
    
    td_df.loc[:, 'label'] = pd.to_numeric(td_df['label'], errors='coerce')
    
    td_df = td_df.dropna(subset=['label'])
    
    td_df.loc[:, 'job'] = td_df['job'].astype(str)
    td_df.loc[:, 'courses'] = td_df['courses'].astype(str)
    
    train_df, temp_df = train_test_split(td_df, test_size=0.4, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    
    def create_dataset(df):
        return [
            {
                'job': str(row['job']),
                'courses': str(row['courses']),
                'label': float(row['label'])
            }
            for _, row in df.iterrows()
        ]
    
    return {
        'train': create_dataset(train_df),
        'val': create_dataset(val_df),
        'test': create_dataset(test_df)
    }


### Function for Computing and Returning Regression Metrics

In [9]:
def compute_metrics(labels, preds):
    metrics = {
        'spearman_rho': spearmanr(labels, preds)[0],
        'pearson_r': pearsonr(labels, preds)[0],
        'mae': mean_absolute_error(labels, preds),
        'r2': r2_score(labels, preds),
        'mse': mean_squared_error(labels, preds),
    }
    
    return metrics

### Set-up for WANDB, Torch, Model Saving, and Early Stopping

In [13]:
key_file = rf'C:\Development\cs580B3\term_project_model\CSU-Industry-Skills\WANDB_API_KEY.txt' 

with open(key_file, "r") as f:
    api_key = f.read().strip()

wandb.login(key=api_key)

MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
EARLY_STOPPING_PATIENCE = 5
MIN_DELTA = 0.005

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True
print(f"Using device: {device}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\ayoun\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mayoungren94[0m ([33mayoungren-colostate[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using device: cuda


### Evaluating Base Model Performance
#### Default Model Evaluation Function

In [14]:
def evaluate_dataset(dataset, batch_size=32):
    preds, labels = [], []
    model = SentenceTransformer(MODEL_NAME).to(device)
    
    model.eval()
    with torch.no_grad():
        for i in range(0, len(dataset), batch_size):
            batch = dataset[i:i + batch_size]
            
            batch_queries = [item['job'] for item in batch]
            batch_answers = [item['courses'] for item in batch]
            batch_labels = torch.tensor([item['label'] for item in batch], 
                                      dtype=torch.float).to(device)
            
            query_emb = model.encode(batch_queries, convert_to_tensor=True)
            answer_emb = model.encode(batch_answers, convert_to_tensor=True)
            batch_cos_sim = torch.nn.functional.cosine_similarity(query_emb, answer_emb)
            
            preds.extend(batch_cos_sim.cpu().numpy())
            labels.extend(batch_labels.cpu().numpy())
    
    return compute_metrics(labels, preds)

#### Default Model Evaluation

In [15]:
datasets = prepare_datasets(td_df)

run = wandb.init(
    entity="ayoungren-colostate",
    project="sbert-param-search-2",
    name="untrained_model"
)

val_metrics = evaluate_dataset(datasets['val'])


run.log({
    'val_mse': val_metrics['mse'],
    'val_mae': val_metrics['mae'],
    'val_r2': val_metrics['r2'],
    'val_pearson_r': val_metrics['pearson_r'],
    'val_spearman_rho': val_metrics['spearman_rho']
})

test_metrics = evaluate_dataset(datasets['test'])

run.log({
    'test_mse': test_metrics['mse'],
    'test_mae': test_metrics['mae'],
    'test_r2': test_metrics['r2'],
    'test_pearson_r': test_metrics['pearson_r'],
    'test_spearman_rho': test_metrics['spearman_rho']
})

run.finish()


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
test_mae,▁
test_mse,▁
test_pearson_r,▁
test_r2,▁
test_spearman_rho,▁
val_mae,▁
val_mse,▁
val_pearson_r,▁
val_r2,▁
val_spearman_rho,▁

0,1
test_mae,0.2318
test_mse,0.06551
test_pearson_r,0.29358
test_r2,-4.28678
test_spearman_rho,0.31257
val_mae,0.23748
val_mse,0.06841
val_pearson_r,0.25789
val_r2,-4.5314
val_spearman_rho,0.28723


### Model Fine Tuning and Evaluation Function

In [14]:
def train_and_evaluate(config, datasets):
    best_val_metric = -1 
    patience = 0
    best_state = None
    arch_name = f"lr_{config['lr']}_bs_{config['batch_size']}"
    wandb.init(
        entity="ayoungren-colostate",
        project="sbert-param-search-2",
        name=arch_name,
        config=config,
        reinit=True
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SentenceTransformer(MODEL_NAME).to(device)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config['lr'],
        weight_decay=config['weight_decay']
    )

    save_dir = "saved_models"
    os.makedirs(save_dir, exist_ok=True)

    print(f'Starting Training for {arch_name}')

    for epoch in range(1, 101):
        print(f"Beginning Epoch {epoch}")
        model.train()
        epoch_loss = 0
        all_train_preds = []
        all_train_labels = []
        
        random.shuffle(datasets['train'])
        
        for i in range(0, len(datasets['train']), config['batch_size']):
            batch = datasets['train'][i:i+config['batch_size']]
            
            queries = [item['job'] for item in batch]
            answers = [item['courses'] for item in batch]
            labels = torch.tensor([item['label'] for item in batch], 
                                dtype=torch.float).to(device)

            tokenizer = model.tokenizer
            query_inputs = tokenizer(queries, padding=True, truncation=True, 
                                   return_tensors='pt').to(device)
            answer_inputs = tokenizer(answers, padding=True, truncation=True,
                                    return_tensors='pt').to(device)
            
            query_features = model(query_inputs)['sentence_embedding']
            answer_features = model(answer_inputs)['sentence_embedding']

            cos_sim = torch.nn.functional.cosine_similarity(query_features, answer_features)

            loss = torch.nn.functional.mse_loss(cos_sim, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            all_train_preds.extend(cos_sim.detach().cpu().numpy())
            all_train_labels.extend(labels.cpu().numpy())
            
            torch.cuda.empty_cache()

        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for i in range(0, len(datasets['val']), 32):
                batch = datasets['val'][i:i+32]
                queries = [item['job'] for item in batch]
                answers = [item['courses'] for item in batch]
                labels = torch.tensor([item['label'] for item in batch], 
                                    dtype=torch.float).to(device)
                
                query_emb = model.encode(queries, convert_to_tensor=True, show_progress_bar=False)
                answer_emb = model.encode(answers, convert_to_tensor=True, show_progress_bar=False)
                cos_sim = torch.nn.functional.cosine_similarity(query_emb, answer_emb)
                
                val_preds.extend(cos_sim.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        train_metrics = compute_metrics(all_train_labels, all_train_preds)
        val_metrics = compute_metrics(val_labels, val_preds)
        
        wandb.log({
            'epoch': epoch,
            'train_loss': epoch_loss / len(datasets['train']),
            'train_mse': train_metrics['mse'],
            'train_mae': train_metrics['mae'],
            'train_r2': train_metrics['r2'],
            'train_pearson_r': train_metrics['pearson_r'],
            'train_spearman_rho': train_metrics['spearman_rho'],
            'val_mse': val_metrics['mse'],
            'val_mae': val_metrics['mae'],
            'val_r2': val_metrics['r2'],
            'val_pearson_r': val_metrics['pearson_r'],
            'val_spearman_rho': val_metrics['spearman_rho'],
            'learning_rate': optimizer.param_groups[0]['lr']
        })

        if val_metrics['spearman_rho'] > best_val_metric + MIN_DELTA:
            best_val_metric = val_metrics['spearman_rho']  # Track Spearman instead of Pearson
            patience = 0
            best_state = model.state_dict()
            torch.save({
                'epoch': epoch,
                'model_state_dict': best_state,
                'spearman_rho': best_val_metric,
            }, os.path.join(save_dir, f"lr_{config['lr']}_bs_{config['batch_size']}_best_model.pth"))
        else:
            patience += 1
            if patience >= EARLY_STOPPING_PATIENCE:
                print(f"Early stopping at epoch {epoch}")
                break
    
    wandb.finish()
    return best_val_metric, best_state

### Grid Search and Fine-Tuning Run Cell

In [15]:
datasets = prepare_datasets(td_df)
    
param_grid = ParameterGrid({
    'lr': [2e-5, 3e-5, 5e-5],
    'batch_size': [16, 32, 64, 128],
    'weight_decay': [0.01]
})

best_spearman = -1
best_model = None
best_config = None

for config in param_grid:
    val_spearman, state_dict = train_and_evaluate(config, datasets)
    if val_spearman > best_spearman:
        best_spearman = val_spearman
        best_model = state_dict
        best_config = config

print(f"The best performing model used the architecture: {best_config}")

Starting Training for lr_2e-05_bs_16
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Early stopping at epoch 10


0,1
epoch,▁▂▃▃▄▅▆▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁
train_loss,█▃▂▁▁▁▁▁▁▁
train_mae,█▄▃▂▂▁▁▁▁▁
train_mse,█▃▂▁▁▁▁▁▁▁
train_pearson_r,▁▆▇▇██████
train_r2,▁▆▇███████
train_spearman_rho,▁▆▆▇▇▇████
val_mae,█▂▃▂▁▁▃▁▁▁
val_mse,█▂▂▂▁▁▂▁▁▁

0,1
epoch,10.0
learning_rate,2e-05
train_loss,3e-05
train_mae,0.01591
train_mse,0.00045
train_pearson_r,0.98146
train_r2,0.96302
train_spearman_rho,0.90333
val_mae,0.01592
val_mse,0.00047


Starting Training for lr_3e-05_bs_16
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Early stopping at epoch 13


0,1
epoch,▁▂▂▃▃▄▅▅▆▆▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▃▂▂▁▁▁▁▁▁▁▁▁
train_mae,█▄▃▂▂▂▁▁▁▁▁▁▁
train_mse,█▃▂▂▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▆▇▇█████████
train_r2,▁▆▇▇█████████
train_spearman_rho,▁▅▆▆▇▇▇██████
val_mae,█▄▅▃▂▁▁▃▁▂▄▁▁
val_mse,█▃▄▂▂▁▁▂▁▂▃▁▁

0,1
epoch,13.0
learning_rate,3e-05
train_loss,2e-05
train_mae,0.01261
train_mse,0.0003
train_pearson_r,0.98772
train_r2,0.97555
train_spearman_rho,0.914
val_mae,0.01414
val_mse,0.00043


Starting Training for lr_5e-05_bs_16
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Early stopping at epoch 9


0,1
epoch,▁▂▃▄▅▅▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▁▁▁▁▁▁
train_mae,█▃▂▂▁▁▁▁▁
train_mse,█▂▂▁▁▁▁▁▁
train_pearson_r,▁▇▇██████
train_r2,▁▇▇██████
train_spearman_rho,▁▆▇▇▇████
val_mae,██▂▂▃▁▁▃▁
val_mse,▇█▂▂▂▁▁▂▁

0,1
epoch,9.0
learning_rate,5e-05
train_loss,2e-05
train_mae,0.01405
train_mse,0.00036
train_pearson_r,0.98495
train_r2,0.97005
train_spearman_rho,0.90783
val_mae,0.01364
val_mse,0.0004


Starting Training for lr_2e-05_bs_32
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Early stopping at epoch 11


0,1
epoch,▁▂▂▃▄▅▅▆▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▂▁▁▁▁▁▁▁
train_mae,█▄▃▂▂▂▂▁▁▁▁
train_mse,█▂▂▂▁▁▁▁▁▁▁
train_pearson_r,▁▇▇▇███████
train_r2,▁▇▇▇███████
train_spearman_rho,▁▆▇▇▇▇▇████
val_mae,█▆▄▅▂▄▂▂▂▁▁
val_mse,█▅▃▄▁▃▂▂▁▁▁

0,1
epoch,11.0
learning_rate,2e-05
train_loss,1e-05
train_mae,0.01469
train_mse,0.00039
train_pearson_r,0.98405
train_r2,0.96829
train_spearman_rho,0.9061
val_mae,0.01406
val_mse,0.00043


Starting Training for lr_3e-05_bs_32
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Early stopping at epoch 10


0,1
epoch,▁▂▃▃▄▅▆▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▁▁▁▁▁▁▁
train_mae,█▃▂▂▂▂▁▁▁▁
train_mse,█▂▂▁▁▁▁▁▁▁
train_pearson_r,▁▇▇███████
train_r2,▁▇▇███████
train_spearman_rho,▁▆▇▇▇▇▇███
val_mae,█▄▄▃▁▁▂▂▂▂
val_mse,█▃▃▂▁▁▂▂▁▁

0,1
epoch,10.0
learning_rate,3e-05
train_loss,1e-05
train_mae,0.01494
train_mse,0.0004
train_pearson_r,0.98334
train_r2,0.96678
train_spearman_rho,0.9074
val_mae,0.01729
val_mse,0.00058


Starting Training for lr_5e-05_bs_32
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Early stopping at epoch 12


0,1
epoch,▁▂▂▃▄▄▅▅▆▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▁▁▁▁▁▁▁▁▁
train_mae,█▃▃▂▂▂▁▁▁▁▁▁
train_mse,█▂▂▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▇▇█████████
train_r2,▁▇▇█████████
train_spearman_rho,▁▆▆▇▇▇▇█████
val_mae,█▅▅▂▃▁▂▃▁▁▁▁
val_mse,█▅▄▂▂▁▂▂▁▁▁▁

0,1
epoch,12.0
learning_rate,5e-05
train_loss,1e-05
train_mae,0.01388
train_mse,0.00035
train_pearson_r,0.98572
train_r2,0.97155
train_spearman_rho,0.91278
val_mae,0.01446
val_mse,0.00047


Starting Training for lr_2e-05_bs_64
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Beginning Epoch 14
Beginning Epoch 15
Beginning Epoch 16
Beginning Epoch 17
Beginning Epoch 18
Beginning Epoch 19
Early stopping at epoch 19


0,1
epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▄▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
train_mse,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▆▇▇▇██████████████
train_r2,▁▇▇▇███████████████
train_spearman_rho,▁▆▇▇▇▇▇▇▇██████████
val_mae,█▇▄▄▄▄▃▃▃▃▂▃▃▂▂▁▁▂▁
val_mse,█▆▃▃▂▂▂▂▂▂▁▂▂▁▂▁▁▂▁

0,1
epoch,19.0
learning_rate,2e-05
train_loss,1e-05
train_mae,0.01433
train_mse,0.00037
train_pearson_r,0.98473
train_r2,0.96964
train_spearman_rho,0.90796
val_mae,0.01441
val_mse,0.00045


Starting Training for lr_3e-05_bs_64
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Early stopping at epoch 13


0,1
epoch,▁▂▂▃▃▄▅▅▆▆▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▁▁▁▁▁▁▁▁▁▁
train_mae,█▄▃▂▂▂▂▂▁▁▁▁▁
train_mse,█▂▂▁▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▆▇▇▇████████
train_r2,▁▇▇██████████
train_spearman_rho,▁▆▇▇▇▇▇▇█████
val_mae,█▅▃▅▄▃▂▂▂▁▁▁▁
val_mse,█▄▃▄▃▂▂▁▂▁▁▁▁

0,1
epoch,13.0
learning_rate,3e-05
train_loss,1e-05
train_mae,0.01479
train_mse,0.0004
train_pearson_r,0.98345
train_r2,0.96704
train_spearman_rho,0.905
val_mae,0.01527
val_mse,0.0005


Starting Training for lr_5e-05_bs_64
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Beginning Epoch 14
Early stopping at epoch 14


0,1
epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▃▃▂▂▂▂▁▁▁▁▁▁▁
train_mse,█▂▂▁▁▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▇▇▇██████████
train_r2,▁▇▇███████████
train_spearman_rho,▁▆▇▇▇▇▇███████
val_mae,█▅▄▆▂▃▄▂▁▃▂▁▂▁
val_mse,█▄▃▄▂▂▃▂▁▂▁▁▂▁

0,1
epoch,14.0
learning_rate,5e-05
train_loss,1e-05
train_mae,0.01402
train_mse,0.00036
train_pearson_r,0.98511
train_r2,0.97032
train_spearman_rho,0.91259
val_mae,0.01357
val_mse,0.00038


Starting Training for lr_2e-05_bs_128
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Early stopping at epoch 13


0,1
epoch,▁▂▂▃▃▄▅▅▆▆▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▃▂▂▁▁▁▁▁▁▁▁▁
train_mae,█▄▃▂▂▂▂▁▁▁▁▁▁
train_mse,█▃▂▂▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▆▇▇▇████████
train_r2,▁▆▇▇█████████
train_spearman_rho,▁▆▇██████████
val_mae,█▅▅▄▃▃▂▂▂▁▂▁▁
val_mse,█▄▃▃▂▂▂▁▂▁▁▁▁

0,1
epoch,13.0
learning_rate,2e-05
train_loss,1e-05
train_mae,0.01978
train_mse,0.00065
train_pearson_r,0.97307
train_r2,0.9463
train_spearman_rho,0.88457
val_mae,0.02026
val_mse,0.00073


Starting Training for lr_3e-05_bs_128
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Beginning Epoch 14
Beginning Epoch 15
Beginning Epoch 16
Beginning Epoch 17
Beginning Epoch 18
Beginning Epoch 19
Beginning Epoch 20
Beginning Epoch 21
Beginning Epoch 22
Beginning Epoch 23
Early stopping at epoch 23


0,1
epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇██
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▄▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
train_mse,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▆▇▇▇██████████████████
train_r2,▁▇▇▇███████████████████
train_spearman_rho,▁▇▇▇▇▇▇████████████████
val_mae,█▆▅▅▄▄▄▃▃▂▃▃▂▂▂▂▂▂▂▁▁▃▁
val_mse,█▆▄▄▃▂▃▂▂▂▂▂▁▂▁▂▂▁▁▁▁▂▁

0,1
epoch,23.0
learning_rate,3e-05
train_loss,0.0
train_mae,0.01491
train_mse,0.0004
train_pearson_r,0.98366
train_r2,0.9675
train_spearman_rho,0.90728
val_mae,0.0142
val_mse,0.00042


Starting Training for lr_5e-05_bs_128
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Beginning Epoch 14
Early stopping at epoch 14


0,1
epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▄▃▂▂▂▁▁▁▁▁▁▁▁
train_mse,█▂▂▁▁▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▆▇▇██████████
train_r2,▁▇▇███████████
train_spearman_rho,▁▇▇▇▇▇████████
val_mae,█▅▄▂▃▂▂▂▂▁▃▃▃▁
val_mse,█▄▃▂▂▂▂▂▁▁▂▂▂▁

0,1
epoch,14.0
learning_rate,5e-05
train_loss,0.0
train_mae,0.01716
train_mse,0.0005
train_pearson_r,0.9793
train_r2,0.95871
train_spearman_rho,0.89791
val_mae,0.01743
val_mse,0.0006


The best performing model used the architecture: {'batch_size': 32, 'lr': 5e-05, 'weight_decay': 0.01}


#### SBERT Model That Performed The Best on Validation Set 

In [16]:
print(best_config)

{'batch_size': 32, 'lr': 5e-05, 'weight_decay': 0.01}


### Evaluate Best Model on the Test Set

In [None]:

checkpoint = torch.load(f"saved_models/lr_{best_config['lr']}_bs_{best_config['batch_size']}_best_model.pth")

model = SentenceTransformer(MODEL_NAME) 

model.load_state_dict(checkpoint['model_state_dict'])

print(f"Loaded model from epoch {checkpoint['epoch']} with Spearman ρ = {checkpoint['spearman_rho']:.3f}")

if best_model:
    model = SentenceTransformer(MODEL_NAME)
    model.load_state_dict(best_model)
    
    test_loader = DataLoader(datasets['test'], batch_size=32)
    test_preds, test_labels = [], []

    run = wandb.init(
        entity="ayoungren-colostate",
        project="sbert-param-search-2",
        name=f"Best Model Arch: lr--{best_config['lr']}__bs--{best_config['batch_size']}",
        config=best_config,
        reinit=True
    )

    torch.cuda.empty_cache()
    gc.collect()
    
    model.eval()
    test_preds, test_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            queries = batch['job']
            courses = batch['courses']
            labels = batch['label'].to(device)
            query_emb = model.encode(queries, convert_to_tensor=True, show_progress_bar=False)
            answer_emb = model.encode(courses, convert_to_tensor=True, show_progress_bar=False)
            
            cos_sim = torch.nn.functional.cosine_similarity(query_emb, answer_emb)
            
            test_preds.extend(cos_sim.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())

    test_metrics = compute_metrics(test_labels, test_preds)
    print("Test Metrics:", test_metrics)

    run.log({
        'test_mse': test_metrics['mse'],
        'test_mae': test_metrics['mae'],
        'test_r2': test_metrics['r2'],
        'test_pearson_r': test_metrics['pearson_r'],
        'test_spearman_rho': test_metrics['spearman_rho'],
    })

    run.finish()
    


  checkpoint = torch.load(f"saved_models/lr_{best_config['lr']}_bs_{best_config['batch_size']}_best_model.pth")


Loaded model from epoch 7 with Spearman ρ = 0.900


Test Metrics: {'spearman_rho': np.float64(0.8928012228326971), 'pearson_r': np.float64(0.9808994267769434), 'mae': 0.0141974287004188, 'r2': 0.9603095997721003, 'mse': 0.0004918067361434823}


0,1
test_mae,▁
test_mse,▁
test_pearson_r,▁
test_r2,▁
test_spearman_rho,▁

0,1
test_mae,0.0142
test_mse,0.00049
test_pearson_r,0.9809
test_r2,0.96031
test_spearman_rho,0.8928
