In [9]:
import csv
import torch
import wandb
import numpy as np
import pandas as pd

from torch.utils.data import DataLoader
from scipy.stats import pearsonr, spearmanr
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [10]:
with open ('data/bert_training_data.csv', 'r') as t_data:
    csv_reader = csv.reader(t_data) 
    training_data = list(csv_reader)

print(training_data[0])
td_df = pd.DataFrame(training_data, columns=['job', 'courses', 'label'])
print(td_df.shape)
td_df

['job', 'courses', 'label']
(1739, 3)


Unnamed: 0,job,courses,label
0,job,courses,label
1,Job Title: Adobe_AI_ML_Engineer\n Job Descript...,COURSES TAKEN:\n\tCOURSE: CS462\n\t\tCS462 SKI...,0.19354838709677424
2,Job Title: Adobe_Junior_SDE\n Job Description:...,COURSES TAKEN:\n\tCOURSE: CS462\n\t\tCS462 SKI...,0.0714285714285714
3,Job Title: Adobe_Software_Engineering_Intern\n...,COURSES TAKEN:\n\tCOURSE: CS462\n\t\tCS462 SKI...,0.33333333333333337
4,Job Title: Adobe_Software_Quality_Engineer\n J...,COURSES TAKEN:\n\tCOURSE: CS462\n\t\tCS462 SKI...,0.15000000000000002
...,...,...,...
1734,Job Title: Yahoo_Software_Dev_Engineer\n Job D...,COURSES TAKEN:\n\tCOURSE: CS314\n\t\tCS314 SKI...,0.13157894736842102
1735,Job Title: Yahoo_Software_Dev_Engineer\n Job D...,COURSES TAKEN:\n\tCOURSE: CS314\n\t\tCS314 SKI...,0.13157894736842102
1736,Job Title: Yahoo_Software_Dev_Engineer\n Job D...,COURSES TAKEN:\n\tCOURSE: CS314\n\t\tCS314 SKI...,0.13157894736842102
1737,Job Title: Yahoo_Software_Dev_Engineer\n Job D...,COURSES TAKEN:\n\tCOURSE: CS462\n\t\tCS462 SKI...,0.13157894736842102


In [11]:
def prepare_datasets(td_df):
    td_df = td_df.copy()
    
    td_df.loc[:, 'label'] = pd.to_numeric(td_df['label'], errors='coerce')
    
    td_df = td_df.dropna(subset=['label'])
    
    td_df.loc[:, 'job'] = td_df['job'].astype(str)
    td_df.loc[:, 'courses'] = td_df['courses'].astype(str)
    
    train_df, temp_df = train_test_split(td_df, test_size=0.4, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    
    def create_dataset(df):
        return [
            {
                'job': str(row['job']),
                'courses': str(row['courses']),
                'label': float(row['label'])
            }
            for _, row in df.iterrows()
        ]
    
    return {
        'train': create_dataset(train_df),
        'val': create_dataset(val_df),
        'test': create_dataset(test_df)
    }

In [12]:
def compute_metrics(labels, preds):
    metrics = {
        'spearman_rho': spearmanr(labels, preds)[0],
        'pearson_r': pearsonr(labels, preds)[0],
        'mae': mean_absolute_error(labels, preds),
        'r2': r2_score(labels, preds),
        'mse': mean_squared_error(labels, preds),
    }
    
    return metrics

In [13]:
key_file = rf'D:\Development\cs580\CSU-Industry-Skills\WANDB_API_KEY.txt' 

with open(key_file, "r") as f:
    api_key = f.read().strip()

wandb.login(key=api_key)

MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True
print(f"Using device: {device}")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\ayoun\_netrc


Using device: cuda


In [14]:
def evaluate_dataset(dataset, batch_size=32):
    preds, labels = [], []
    model = SentenceTransformer(MODEL_NAME).to(device)
    
    model.eval()
    with torch.no_grad():
        for i in range(0, len(dataset), batch_size):
            batch = dataset[i:i + batch_size]
            
            batch_queries = [item['job'] for item in batch]
            batch_answers = [item['courses'] for item in batch]
            batch_labels = torch.tensor([item['label'] for item in batch], 
                                      dtype=torch.float).to(device)
            
            query_emb = model.encode(batch_queries, convert_to_tensor=True)
            answer_emb = model.encode(batch_answers, convert_to_tensor=True)
            batch_cos_sim = torch.nn.functional.cosine_similarity(query_emb, answer_emb)
            
            preds.extend(batch_cos_sim.cpu().numpy())
            labels.extend(batch_labels.cpu().numpy())
    
    return compute_metrics(labels, preds)

In [15]:
datasets = prepare_datasets(td_df)

run = wandb.init(
    entity="ayoungren-colostate",
    project="sbert-param-search",
    name="untrained_model"
)

val_metrics = evaluate_dataset(datasets['val'])


run.log({
    'val_mse': val_metrics['mse'],
    'val_mae': val_metrics['mae'],
    'val_r2': val_metrics['r2'],
    'val_pearson_r': val_metrics['pearson_r'],
    'val_spearman_rho': val_metrics['spearman_rho']
})

test_metrics = evaluate_dataset(datasets['test'])

run.log({
    'test_mse': test_metrics['mse'],
    'test_mae': test_metrics['mae'],
    'test_r2': test_metrics['r2'],
    'test_pearson_r': test_metrics['pearson_r'],
    'test_spearman_rho': test_metrics['spearman_rho']
})

run.finish()


0,1
test_mae,▁
test_mse,▁
test_pearson_r,▁
test_r2,▁
test_spearman_rho,▁
val_mae,▁
val_mse,▁
val_pearson_r,▁
val_r2,▁
val_spearman_rho,▁

0,1
test_mae,0.1628
test_mse,0.03819
test_pearson_r,0.00157
test_r2,-2.77522
test_spearman_rho,-0.04
val_mae,0.15911
val_mse,0.03729
val_pearson_r,-0.08662
val_r2,-3.12034
val_spearman_rho,-0.11851
