In [1]:
import re
import os
import gc
import csv
import torch
import wandb
import random
import numpy as np
import pandas as pd

from torch.utils.data import DataLoader
from scipy.stats import pearsonr, spearmanr
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# with open ('data/all-csu-codes.csv', 'r') as c_data:
#     csv_reader = csv.reader(c_data) 
#     courses_data = list(csv_reader)
    
# courses_df = pd.DataFrame(columns=['Courses', 'Skills'])

# for idx in range(0, len(courses_data)):
#     skill_list = courses_data[idx][1:-1]
#     skill_list = [skill.title() for skill in skill_list]
#     skill_list = [re.sub(r'\b(vs|Vs)\b', 'VS', skill) for skill in skill_list]

#     row = pd.DataFrame({'Courses': courses_data[idx][0], 'Skills':[skill_list]})
#     courses_df = pd.concat([courses_df, row], ignore_index=True)

# courses_df.to_csv('data/courses_data.csv', index=False)

In [3]:
# with open('data/descriptions.txt', 'r') as j_data:
#     csv_reader = csv.reader(j_data, delimiter='|')
#     jobs_data = list(csv_reader)

# jobs_df = pd.DataFrame(columns=['Job_Title', 'Job_Description', 'Required_Skills'])
                    
# for row in jobs_data:
#     if len(row) == 3:
#         job_title = row[0].strip().strip('"') 

#         job_description = row[1].strip().strip('"')
#         job_description = re.sub(r'\bDESCRIPTION\b', '', job_description)
    
#         skills = row[2].strip().strip('"')
#         skill_list = [skill.strip().strip('"') for skill in skills.split(',')]
#         cap_skill_list = [skill.title() for skill in skill_list]
#         cleaned_skills = [re.sub(r'\s?\(.*?\)', '', skill) for skill in cap_skill_list]

#         row = pd.DataFrame({'Job_Title': job_title, 'Job_Description': job_description, 'Required_Skills': [skill_list]})
#         jobs_df = pd.concat([jobs_df, row], ignore_index=True)
            
# jobs_df.to_csv('data/jobs_data.csv', index=False)

In [4]:
def get_all_acquired_skills(courses_df):
    all_acquired_skills = set() 
    
    for skills in courses_df['Skills']:
        all_acquired_skills.update(skills)

    return all_acquired_skills

In [5]:
def all_class_comparison(jobs_df, courses_df, all_acquired_skills):
    training_data = []
    course_entries = list(zip(courses_df["Courses"], courses_df["Skills"]))

    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']}\n Job Description: {job['Job_Description']}"
        job_skills = set(job["Required_Skills"])
        missing_skills = job_skills - all_acquired_skills
        num_missing = float(len(missing_skills))
        num_job_skills = float(len(job_skills))
        
        course_info = "COURSES TAKEN:\n"
        for course, skills in course_entries:
            course_info += f"\tCOURSE: {course}\n\t\t{course} SKILLS: {', '.join(skills)}\n"

        query_text = "What skills am I lacking for the following job position, given the classes I have taken?\n\n" + job_text
        query_text += "\n\n" + course_info
        if len(missing_skills) == 0:
            acquired_skills = {', '.join(job['Required_Skills'])}
            training_data.append({
                'query': query_text,
                'answer': f"You qualify as an applicant for the job position, {job['Job_Title']}.\ Your courses provided all listed required skills: {acquired_skills}",
                'label': 1.0
            })
        elif num_missing == num_job_skills:
            answer = ", ".join(list(missing_skills))
            training_data.append({
                'query': query_text,
                'answer': (f"You are missing the following skills required by the postion: " + answer),
                'label': 0.0
            })
        else:
            answer = ", ".join(list(missing_skills))
            training_data.append({
                'query': query_text,
                'answer': (f"You are missing the following skills required by the postion: " + answer),
                'label': (1.0 - (num_missing/num_job_skills))
            })
        
    return training_data

In [6]:
def compare_individual_course(jobs_df, courses_df, neg_per_pos=1):
    training_data = []
     
    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']}\n Job Description: {job['Job_Description']}"
        job_skills = set(job['Required_Skills']) 
        num_job_skills = float(len(job_skills))
        
        for _, course in courses_df.iterrows():
            missing_skills = list(job_skills - set(course["Skills"]))
            num_missing = float(len(missing_skills))
            course_info = f"COURSE TAKEN: {course['Courses']}\n {course['Courses']} SKILLS: {', '.join(course['Skills'])}"

        query_text = "What skills am I lacking for the following job position, given the classes I have taken?\n\n" + job_text
        query_text += "\n\n" + course_info
        if len(missing_skills) == 0:
            acquired_skills = {', '.join(job['Required_Skills'])}
            training_data.append({
                'query': query_text,
                'answer': f"You qualify as an applicant for the job position, {job['Job_Title']}.\ Your courses provided all listed required skills: {acquired_skills}",
                'label': 1.0
            })
        elif num_missing == num_job_skills:
            answer = ", ".join(list(missing_skills))
            training_data.append({
                'query': query_text,
                'answer': (f"You are missing all of the following skills required by the postion: " + answer),
                'label': 0.0
            })
        else:
            answer = ", ".join(list(missing_skills))
            training_data.append({
                'query': query_text,
                'answer': (f"You are missing the following skills required by the postion: " + answer),
                'label': (1.0 - (num_missing/num_job_skills))
            })

    return training_data

In [7]:
def create_schedule_data(jobs_df, schedules):
    training_data = []

    for _, job in jobs_df.iterrows():
        job_text = f"Job Title: {job['Job_Title']}\n Job Description: {job['Job_Description']}"
        job_skills = set(job["Required_Skills"])
        num_job_skills = float(len(job_skills))

        for sched in schedules:
            course_entries = list(zip(sched["Courses"], sched["Skills"]))
            sched_skills = set([item for sublist in sched["Skills"].tolist() for item in sublist])
            missing_skills = list(job_skills - sched_skills)
            num_missing = float(len(missing_skills))
            
            course_info = "COURSES TAKEN:\n"
            for course, skills in course_entries:
                course_info += f"\tCOURSE: {course}\n\t\t{course} SKILLS: {', '.join(skills)}\n"

            query_text = "What skills am I lacking for the following job position, given the classes I have taken?\n\n" + job_text
            query_text += "\n\n" + course_info
            if len(missing_skills) == 0:
                acquired_skills = {', '.join(job['Required_Skills'])}
                training_data.append({
                    'query': query_text,
                    'answer': f"You qualify as an applicant for the job position, {job['Job_Title']}.\ Your courses provided all listed required skills: {acquired_skills}",
                    'label': 1.0
                })
            elif num_missing == num_job_skills:
                answer = ", ".join(list(missing_skills))
                training_data.append({
                    'query': query_text,
                    'answer': (f"You are missing all of the following skills required by the postion: " + answer),
                    'label': 0.0
                })
            else:
                answer = ", ".join(list(missing_skills))
                training_data.append({
                    'query': query_text,
                    'answer': (f"You are missing the following skills required by the postion: " + answer),
                    'label': (1.0 - (num_missing/num_job_skills))
                })

    return training_data


def get_courseloads(jobs_df, courses_df, number_of_schedules=20):
    core_classes = ['CS150', 'CS164', 'CS152', 'CS162', 'CS201', 'CS165', 'CS220', 
                    'CS270', 'CS250', 'CS314', 'CS370', 'CS320', 'CS214']
    
    elective_courses_df = courses_df[~courses_df['Courses'].isin(core_classes)]

    schedules_df = []
    used_schedules = set()

    while len(schedules_df) < number_of_schedules:
        l_4_courses = elective_courses_df[elective_courses_df['Courses'].str.startswith('CS4')]
        l_3_4_courses = elective_courses_df[elective_courses_df['Courses'].str.startswith('CS3') | elective_courses_df['Courses'].str.startswith('CS4')]
        other_courses = elective_courses_df[~elective_courses_df['Courses'].str.startswith('CS3') & ~elective_courses_df['Courses'].str.startswith('CS4')]

        l_4_sample = random.sample(l_4_courses['Courses'].tolist(), 2)

        l_3_4_filtered = l_3_4_courses[~l_3_4_courses['Courses'].isin(l_4_sample)]
        l_3_4_sample = random.sample(l_3_4_filtered['Courses'].tolist(), 2)

        all_sampled_courses = l_4_sample + l_3_4_sample
        other_courses_filtered = other_courses[~other_courses['Courses'].isin(all_sampled_courses)]
        other_sample = random.sample(other_courses_filtered['Courses'].tolist(), 1)

        sched_courses = core_classes + l_4_sample + l_3_4_sample + other_sample
        sched_df = courses_df[courses_df['Courses'].isin(sched_courses)].copy()

        sched_tuple = tuple(sorted(sched_df['Courses'].tolist()))
        if sched_tuple not in used_schedules:
            schedules_df.append(sched_df)
            used_schedules.add(sched_tuple)

    training_data = create_schedule_data(jobs_df, schedules_df)

    return training_data

In [8]:
def create_training_data(jobs_df, courses_df, all_acquired_skills):
    training_data = all_class_comparison(jobs_df, courses_df, all_acquired_skills)
    print(f"All: {training_data[-1]}\n")

    training_data = training_data + compare_individual_course(jobs_df, courses_df)
    print(f"Individual: {training_data[-1]}\n")
    
    training_data = training_data + get_courseloads(jobs_df, courses_df)
    print(f"Course load: {training_data[-1]}\n")

    return training_data

# all_acquired_skills = get_all_acquired_skills(courses_df)
# training_data = create_training_data(jobs_df, courses_df, all_acquired_skills)

In [9]:
# print(training_data[0])
# td_df = pd.DataFrame(training_data, columns=['query', 'answer', 'label'])
# print(td_df.shape)

# td_df.to_csv('data/bert_training_data.csv', index=False)

# del td_df, training_data, jobs_df, courses_df
# gc.collect()


In [10]:
with open ('data/bert_training_data.csv', 'r') as t_data:
    csv_reader = csv.reader(t_data) 
    training_data = list(csv_reader)

print(training_data[0])
td_df = pd.DataFrame(training_data, columns=['query', 'answer', 'label'])
print(td_df.shape)
td_df

['query', 'answer', 'label']
(1739, 3)


Unnamed: 0,query,answer,label
0,query,answer,label
1,What skills am I lacking for the following job...,You are missing the following skills required ...,0.19354838709677424
2,What skills am I lacking for the following job...,You are missing the following skills required ...,0.0714285714285714
3,What skills am I lacking for the following job...,You are missing the following skills required ...,0.33333333333333337
4,What skills am I lacking for the following job...,You are missing the following skills required ...,0.15000000000000002
...,...,...,...
1734,What skills am I lacking for the following job...,You are missing the following skills required ...,0.13157894736842102
1735,What skills am I lacking for the following job...,You are missing the following skills required ...,0.1578947368421053
1736,What skills am I lacking for the following job...,You are missing the following skills required ...,0.1578947368421053
1737,What skills am I lacking for the following job...,You are missing the following skills required ...,0.13157894736842102


In [11]:
def prepare_datasets(td_df):
    td_df = td_df.copy()
    
    td_df.loc[:, 'label'] = pd.to_numeric(td_df['label'], errors='coerce')
    
    td_df = td_df.dropna(subset=['label'])
    
    td_df.loc[:, 'query'] = td_df['query'].astype(str)
    td_df.loc[:, 'answer'] = td_df['answer'].astype(str)
    
    train_df, temp_df = train_test_split(td_df, test_size=0.4, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    
    def create_dataset(df):
        return [
            {
                'query': str(row['query']),
                'answer': str(row['answer']),
                'label': float(row['label'])
            }
            for _, row in df.iterrows()
        ]
    
    return {
        'train': create_dataset(train_df),
        'val': create_dataset(val_df),
        'test': create_dataset(test_df)
    }


In [12]:
def compute_metrics(labels, preds):
    metrics = {
        'spearman_rho': spearmanr(labels, preds)[0],
        'pearson_r': pearsonr(labels, preds)[0],

        'mae': mean_absolute_error(labels, preds),
        'r2': r2_score(labels, preds),
        'mse': mean_squared_error(labels, preds),
    }
    
    # Optional: Add threshold-based metrics if you have binary labels
    if len(np.unique(labels)) == 2:  # Binary classification
        metrics.update({
            'accuracy': accuracy_score(labels, preds > 0.5),
            'f1': f1_score(labels, preds > 0.5)
        })
    
    return metrics

In [13]:
key_file = rf'D:\Development\cs580\CSU-Industry-Skills\WANDB_API_KEY.txt' 

with open(key_file, "r") as f:
    api_key = f.read().strip()

wandb.login(key=api_key)

MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
EARLY_STOPPING_PATIENCE = 5
MIN_DELTA = 0.005

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True
print(f"Using device: {device}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\ayoun\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mayoungren94[0m ([33mayoungren-colostate[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using device: cuda


In [14]:
def train_and_evaluate(config, datasets):
    best_val_metric = -1 
    patience = 0
    best_state = None
    arch_name = f"lr_{config['lr']}_bs_{config['batch_size']}"
    wandb.init(
        entity="ayoungren-colostate",
        project="sbert-param-search",
        name=arch_name,
        config=config,
        reinit=True
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SentenceTransformer(MODEL_NAME).to(device)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config['lr'],
        weight_decay=config['weight_decay']
    )

    save_dir = "saved_models"
    os.makedirs(save_dir, exist_ok=True)

    print(f'Starting Training for {arch_name}')

    for epoch in range(1, 101):
        print(f"Beginning Epoch {epoch}")
        model.train()
        epoch_loss = 0
        all_train_preds = []
        all_train_labels = []
        
        random.shuffle(datasets['train'])
        
        for i in range(0, len(datasets['train']), config['batch_size']):
            batch = datasets['train'][i:i+config['batch_size']]
            
            queries = [item['query'] for item in batch]
            answers = [item['answer'] for item in batch]
            labels = torch.tensor([item['label'] for item in batch], 
                                dtype=torch.float).to(device)

            tokenizer = model.tokenizer
            query_inputs = tokenizer(queries, padding=True, truncation=True, 
                                   return_tensors='pt').to(device)
            answer_inputs = tokenizer(answers, padding=True, truncation=True,
                                    return_tensors='pt').to(device)
            
            query_features = model(query_inputs)['sentence_embedding']
            answer_features = model(answer_inputs)['sentence_embedding']

            cos_sim = torch.nn.functional.cosine_similarity(query_features, answer_features)

            loss = torch.nn.functional.mse_loss(cos_sim, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            all_train_preds.extend(cos_sim.detach().cpu().numpy())
            all_train_labels.extend(labels.cpu().numpy())
            
            torch.cuda.empty_cache()

        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for i in range(0, len(datasets['val']), 32):
                batch = datasets['val'][i:i+32]
                queries = [item['query'] for item in batch]
                answers = [item['answer'] for item in batch]
                labels = torch.tensor([item['label'] for item in batch], 
                                    dtype=torch.float).to(device)
                
                query_emb = model.encode(queries, convert_to_tensor=True, show_progress_bar=False)
                answer_emb = model.encode(answers, convert_to_tensor=True, show_progress_bar=False)
                cos_sim = torch.nn.functional.cosine_similarity(query_emb, answer_emb)
                
                val_preds.extend(cos_sim.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        train_metrics = compute_metrics(all_train_labels, all_train_preds)
        val_metrics = compute_metrics(val_labels, val_preds)
        
        wandb.log({
            'epoch': epoch,
            'train_loss': epoch_loss / len(datasets['train']),
            'train_mse': train_metrics['mse'],
            'train_mae': train_metrics['mae'],
            'train_r2': train_metrics['r2'],
            'train_pearson_r': train_metrics['pearson_r'],
            'train_spearman_rho': train_metrics['spearman_rho'],
            'val_mse': val_metrics['mse'],
            'val_mae': val_metrics['mae'],
            'val_r2': val_metrics['r2'],
            'val_pearson_r': val_metrics['pearson_r'],
            'val_spearman_rho': val_metrics['spearman_rho'],
            'learning_rate': optimizer.param_groups[0]['lr']
        })

        if val_metrics['spearman_rho'] > best_val_metric + MIN_DELTA:
            best_val_metric = val_metrics['spearman_rho']  # Track Spearman instead of Pearson
            patience = 0
            best_state = model.state_dict()
            torch.save({
                'epoch': epoch,
                'model_state_dict': best_state,
                'spearman_rho': best_val_metric,
            }, os.path.join(save_dir, "best_model.pth"))
        else:
            patience += 1
            if patience >= EARLY_STOPPING_PATIENCE:
                print(f"Early stopping at epoch {epoch}")
                break
    
    wandb.finish()
    return best_val_metric, best_state

In [None]:
datasets = prepare_datasets(td_df)
    
param_grid = ParameterGrid({
    'lr': [2e-5, 3e-5, 5e-5],
    'batch_size': [16, 32, 64, 128],
    'weight_decay': [0.01]
})

best_f1 = -1
best_model = None
best_config = None

for config in param_grid:
    val_f1, state_dict = train_and_evaluate(config, datasets)
    if val_f1 > best_f1:
        best_f1 = val_f1
        best_model = state_dict
        best_config = config


Starting Training for lr_2e-05_bs_16
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Beginning Epoch 14
Beginning Epoch 15
Early stopping at epoch 15


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▃▃▂▂▂▂▂▁▁▁▁▁▁▁
train_mse,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▇▇████████████
train_r2,▁▇▇████████████
train_spearman_rho,▁▆▇████████████
val_mae,██▅▄▃▄▃▄▂▂▁▁▁▂▅
val_mse,█▆▃▂▂▂▂▂▁▁▁▁▁▁▃

0,1
epoch,15.0
learning_rate,2e-05
train_loss,2e-05
train_mae,0.01507
train_mse,0.00038
train_pearson_r,0.98054
train_r2,0.96086
train_spearman_rho,0.97859
val_mae,0.02954
val_mse,0.00111


Starting Training for lr_3e-05_bs_16
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Early stopping at epoch 11


0,1
epoch,▁▂▂▃▄▅▅▆▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▁▁▁▁▁▁▁▁▁
train_mae,█▃▂▂▂▁▁▁▁▁▁
train_mse,█▂▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▇█████████
train_r2,▁▇█████████
train_spearman_rho,▁▇▇████████
val_mae,█▂▂▂▁▃▁▃▁▁▁
val_mse,█▂▁▂▁▂▁▂▁▁▁

0,1
epoch,11.0
learning_rate,3e-05
train_loss,3e-05
train_mae,0.01583
train_mse,0.00042
train_pearson_r,0.97835
train_r2,0.95713
train_spearman_rho,0.9745
val_mae,0.01428
val_mse,0.00043


Starting Training for lr_5e-05_bs_16
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Early stopping at epoch 10


0,1
epoch,▁▂▃▃▄▅▆▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▁▁▁▁▁▁▁▁
train_mae,█▂▂▂▂▁▁▁▁▁
train_mse,█▂▁▁▁▁▁▁▁▁
train_pearson_r,▁▇████████
train_r2,▁▇████████
train_spearman_rho,▁▇█▇██████
val_mae,█▄▃▄▃▁▄▁▁▁
val_mse,█▃▃▃▂▁▃▁▁▁

0,1
epoch,10.0
learning_rate,5e-05
train_loss,2e-05
train_mae,0.01518
train_mse,0.00037
train_pearson_r,0.98083
train_r2,0.96172
train_spearman_rho,0.97737
val_mae,0.01336
val_mse,0.00039


Starting Training for lr_2e-05_bs_32
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Beginning Epoch 14
Early stopping at epoch 14


0,1
epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▃▂▂▂▁▂▁▁▁▁▁▁▁
train_mse,█▂▂▁▁▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▆▇███████████
train_r2,▁▇▇███████████
train_spearman_rho,▁▆▇███████████
val_mae,█▅▅▃▃▂▃▃▃▂▁▁▁▃
val_mse,█▄▃▂▂▁▂▂▂▁▁▁▁▂

0,1
epoch,14.0
learning_rate,2e-05
train_loss,2e-05
train_mae,0.01927
train_mse,0.0006
train_pearson_r,0.96943
train_r2,0.93815
train_spearman_rho,0.965
val_mae,0.02146
val_mse,0.00071


Starting Training for lr_3e-05_bs_32
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Early stopping at epoch 9


0,1
epoch,▁▂▃▄▅▅▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁
train_loss,█▂▁▁▁▁▁▁▁
train_mae,█▃▂▁▁▁▁▁▁
train_mse,█▂▁▁▁▁▁▁▁
train_pearson_r,▁▆▇██████
train_r2,▁▇███████
train_spearman_rho,▁▆███████
val_mae,█▂▂▂▂▂▁▁▁
val_mse,█▃▂▂▁▁▁▁▁

0,1
epoch,9.0
learning_rate,3e-05
train_loss,2e-05
train_mae,0.01994
train_mse,0.00067
train_pearson_r,0.96522
train_r2,0.93064
train_spearman_rho,0.96005
val_mae,0.01583
val_mse,0.00052


Starting Training for lr_5e-05_bs_32
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Beginning Epoch 14
Early stopping at epoch 14


0,1
epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▃▂▂▂▂▁▁▁▁▁▁▁▁
train_mse,█▂▁▁▁▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▆▇███████████
train_r2,▁▇████████████
train_spearman_rho,▁▆▇███████████
val_mae,█▅▅▅▃▃▂▂▁▅▂▁▁▃
val_mse,█▄▃▃▂▂▁▁▁▂▁▁▁▂

0,1
epoch,14.0
learning_rate,5e-05
train_loss,1e-05
train_mae,0.0163
train_mse,0.00044
train_pearson_r,0.97734
train_r2,0.95441
train_spearman_rho,0.97273
val_mae,0.01829
val_mse,0.00055


Starting Training for lr_2e-05_bs_64
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Early stopping at epoch 13


0,1
epoch,▁▂▂▃▃▄▅▅▆▆▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▁▁▁▁▁▁▁▁▁▁
train_mae,█▄▂▂▂▁▁▁▁▁▁▁▁
train_mse,█▂▂▁▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▅▇▇▇████████
train_r2,▁▇▇██████████
train_spearman_rho,▁▄▆▇▇████████
val_mae,█▅▄▂▂▂▂▂▁▃▁▁▁
val_mse,█▄▃▂▂▂▁▁▁▂▁▁▁

0,1
epoch,13.0
learning_rate,2e-05
train_loss,1e-05
train_mae,0.02196
train_mse,0.0008
train_pearson_r,0.95868
train_r2,0.91807
train_spearman_rho,0.95596
val_mae,0.01917
val_mse,0.00066


Starting Training for lr_3e-05_bs_64
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Early stopping at epoch 12


0,1
epoch,▁▂▂▃▄▄▅▅▆▇▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▁▁▁▁▁▁▁▁▁
train_mae,█▄▂▂▁▁▁▁▁▁▁▁
train_mse,█▂▂▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▅▇▇████████
train_r2,▁▇▇█████████
train_spearman_rho,▁▅▇▇████████
val_mae,█▃▂▃▂▁▁▁▁▁▂▂
val_mse,█▃▂▂▁▁▁▁▁▁▁▁

0,1
epoch,12.0
learning_rate,3e-05
train_loss,1e-05
train_mae,0.02174
train_mse,0.00078
train_pearson_r,0.95946
train_r2,0.91932
train_spearman_rho,0.9552
val_mae,0.02488
val_mse,0.00093


Starting Training for lr_5e-05_bs_64
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Early stopping at epoch 10


0,1
epoch,▁▂▃▃▄▅▆▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▁▁▁▁▁▁▁▁
train_mae,█▃▂▂▂▁▁▁▁▁
train_mse,█▂▁▁▁▁▁▁▁▁
train_pearson_r,▁▆▇▇██████
train_r2,▁▇████████
train_spearman_rho,▁▆▇▇██████
val_mae,█▅▅▂▂▂▁▂▁▄
val_mse,█▄▃▁▁▁▁▁▁▂

0,1
epoch,10.0
learning_rate,5e-05
train_loss,1e-05
train_mae,0.01976
train_mse,0.00065
train_pearson_r,0.96699
train_r2,0.93273
train_spearman_rho,0.965
val_mae,0.02395
val_mse,0.00089


Starting Training for lr_2e-05_bs_128
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Beginning Epoch 14
Beginning Epoch 15
Beginning Epoch 16
Beginning Epoch 17
Beginning Epoch 18
Beginning Epoch 19
Beginning Epoch 20
Beginning Epoch 21
Beginning Epoch 22
Early stopping at epoch 22


0,1
epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇██
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_mse,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▃▅▆▇▇▇▇▇█████████████
train_r2,▁▆▇▇██████████████████
train_spearman_rho,▁▃▅▆▇▇▇▇██████████████
val_mae,█▅▄▃▂▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁
val_mse,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,22.0
learning_rate,2e-05
train_loss,1e-05
train_mae,0.02149
train_mse,0.00077
train_pearson_r,0.96
train_r2,0.92082
train_spearman_rho,0.95362
val_mae,0.01821
val_mse,0.00061


Starting Training for lr_3e-05_bs_128
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Beginning Epoch 14
Beginning Epoch 15
Beginning Epoch 16
Early stopping at epoch 16


0,1
epoch,▁▁▂▂▃▃▄▄▅▅▆▆▇▇██
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁
train_mse,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▄▆▇▇▇▇▇████████
train_r2,▁▇▇▇████████████
train_spearman_rho,▁▃▆▇▇▇██████████
val_mae,█▆▄▃▂▃▄▂▂▂▂▁▁▂▂▂
val_mse,█▅▃▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,16.0
learning_rate,3e-05
train_loss,1e-05
train_mae,0.02134
train_mse,0.00078
train_pearson_r,0.95951
train_r2,0.9197
train_spearman_rho,0.95682
val_mae,0.02485
val_mse,0.00093


Starting Training for lr_5e-05_bs_128
Beginning Epoch 1
Beginning Epoch 2
Beginning Epoch 3
Beginning Epoch 4
Beginning Epoch 5
Beginning Epoch 6
Beginning Epoch 7
Beginning Epoch 8
Beginning Epoch 9
Beginning Epoch 10
Beginning Epoch 11
Beginning Epoch 12
Beginning Epoch 13
Beginning Epoch 14
Beginning Epoch 15
Beginning Epoch 16
Beginning Epoch 17
Beginning Epoch 18
Early stopping at epoch 18


0,1
epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_mae,█▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train_mse,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_pearson_r,▁▄▆▇▇▇████████████
train_r2,▁▇▇▇██████████████
train_spearman_rho,▁▄▆▇▇▇████████████
val_mae,█▇▃▆▅▄▂▂▂▃▄▃▂▂▁▁▁▃
val_mse,██▃▄▃▃▂▁▁▂▂▂▁▁▁▁▁▂

0,1
epoch,18.0
learning_rate,5e-05
train_loss,1e-05
train_mae,0.01883
train_mse,0.00062
train_pearson_r,0.9683
train_r2,0.93631
train_spearman_rho,0.96701
val_mae,0.02855
val_mse,0.00111


In [21]:
checkpoint = torch.load("saved_models/best_model.pth")

model = SentenceTransformer(MODEL_NAME) 

model.load_state_dict(checkpoint['model_state_dict'])

print(f"Loaded model from epoch {checkpoint['epoch']} with Spearman ρ = {checkpoint['spearman_rho']:.3f}")

if best_model:
    model = SentenceTransformer(MODEL_NAME)
    model.load_state_dict(best_model)
    
    test_loader = DataLoader(datasets['test'], batch_size=32)
    test_preds, test_labels = [], []

    wandb.init(
        entity="ayoungren-colostate",
        project="sbert-param-search",
        name=f"Best Model Arch: lr--{best_config['lr']}__bs--{best_config['batch_size']}",
        config=best_config,
        reinit=True
    )

    torch.cuda.empty_cache()
    gc.collect()
    
    model.eval()
    test_preds, test_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            queries = batch['query']
            answers = batch['answer']
            labels = batch['label'].to(device)
            query_emb = model.encode(queries, convert_to_tensor=True, show_progress_bar=False)
            answer_emb = model.encode(answers, convert_to_tensor=True, show_progress_bar=False)
            
            cos_sim = torch.nn.functional.cosine_similarity(query_emb, answer_emb)
            
            test_preds.extend(cos_sim.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())

    test_metrics = compute_metrics(test_labels, test_preds)
    print("Test Metrics:", test_metrics)

    wandb.log({
        'test_mse': test_metrics['mse'],
        'test_mae': test_metrics['mae'],
        'test_r2': test_metrics['r2'],
        'test_pearson_r': test_metrics['pearson_r'],
        'test_spearman_rho': test_metrics['spearman_rho'],
    })

    wandb.finish()
    


  checkpoint = torch.load("saved_models/best_model.pth")


Loaded model from epoch 13 with Spearman ρ = 0.984


Test Metrics: {'spearman_rho': np.float64(0.9881507906394621), 'pearson_r': np.float64(0.9869179341449975), 'mae': 0.01843222628654548, 'r2': 0.9440507836431647, 'mse': 0.0005618930258482958}


0,1
test_mae,▁
test_mse,▁
test_pearson_r,▁
test_r2,▁
test_spearman_rho,▁

0,1
test_mae,0.01843
test_mse,0.00056
test_pearson_r,0.98692
test_r2,0.94405
test_spearman_rho,0.98815


In [None]:
from transformers import pipeline

checkpoint = torch.load("saved_models/best_model.pth")
model = SentenceTransformer(MODEL_NAME) 
model.load_state_dict(checkpoint['model_state_dict'])
text_generator = pipeline("text-generation", model="gpt2")

def generate_response(prompt):
    embedding = model.encode(prompt)
    
    response = text_generator(
        f"Based on these embeddings: {embedding[:3]}...\n\nResponse:",
        max_length=1024,
        do_sample=True
    )[0]['generated_text']
    
    return response.split("Response:")[-1].strip()

random_row = td_df.sample(n=1)
prompt = random_row['query'].iloc[0]
expected_answer = random_row['answer'].iloc[0]
model_response = generate_response(prompt)
print()
print(f"Prompt given to the model with Sentence BERT Embeddings:\n{prompt}")
print(f'Model Response:\n{model_response}')
print()
print(f'Expected Response:\n{expected_answer}')


  checkpoint = torch.load("saved_models/best_model.pth")
Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt given to the model with Sentence BERT Embeddings:
What skills am I lacking for the following job position, given the classes I have taken?

Job Title: Amazon_Graduate_Software_Engineer
 Job Description:  Do you want to solve business challenges through innovative technology? Do you enjoy working on cutting-edge, scalable services technology in a team environment? Do you like working on industry-defining projects that move the needle? At Amazon, we hire the best minds in technology to innovate and build on behalf of our customers. The intense focus we have on our customers is why we are one of the world's most beloved brands ? customer obsession is part of our company DNA. Our Software Development Engineers (SDEs) use cutting-edge technology to solve complex problems and get to see the impact of their work first-hand. If this is you, come chart your own path at Amazon! The challenges SDEs solve for at Amazon are big and impact millions of customers, sellers, and products around t

In [None]:
training_data = []
for _, row in td_df.iterrows():
    training_data.append({
        "input": row["query"],
        "output": row["answer"]
    })

import json
with open("ollama_training_data.jsonl", "w") as f:
    for item in training_data:
        f.write(json.dumps(item) + "\n")

In [None]:
!echo 'FROM mistral  # or llama3
!SYSTEM "You identify missing skills between job descriptions and courses. Use the format: \"You are missing: [skills].\""
!PARAMETER temperature 0.3' > Modelfile

!ollama create missing_skills -f Modelfile
!ollama train missing_skills -f ollama_training_data.jsonl

In [None]:
import ollama

def generate_response(job_desc, courses_taken):
    prompt = f"""
    {job_desc}
    {courses_taken}
    
    What skills are missing for this job given these courses?
    """
    
    response = ollama.generate(
        model="missing_skills",  # Your fine-tuned model
        prompt=prompt
    )
    return response["response"]


job_desc = "Seeks Python developer with Django and AWS experience."
courses_taken = ["Intro to Python", "Databases 101"]
print(generate_response(job_desc, courses_taken))