# Embeddings research

### Purpose

1. To load the given Sentence embedding models and test on given data,
2. Identify suitability of different SBERT models for given task and select the relevant,
3. Generate training, validation and testing datasets for finetuning a custom SBERT model,
4. Define and test the custom/reusable data embedding methods,
5. Train and store custom SBERT model. Prepare training pipeline for the future,
6. Test the custom SBERT-based embedding with semantic search as a classifier.

In [1]:
import pandas as pd
import csv
import os
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load train and test dataframes

In [2]:
train_df = pd.read_csv("../artifacts/data_ingestion/raw_train/train_data.csv")
train_df = pd.read_csv("../artifacts/data_ingestion/raw_train/train_data.csv")


### Define Common Functions for data

In [3]:
def get_special_tokens(path="../data/raw/special_words.txt"):
    """ 
    This method returns list of special words to be removed from the text.
    
    Input: path: str --> Path to the word file

    Output: : special_tokens: list[str] --> all words/special tokens to be removed
    """
    special_tokens=[]
    if not os.path.exists(path):
        return []

    with open(path, "r") as f:
        special_tokens = f.readlines()
    special_tokens = [line.rstrip('\n') for line in special_tokens]
    
    return special_tokens

def clean_text(text, stop_words=stopwords.words('english'), punct=string.punctuation, special_tokens=[]):
        """ 
        This method returns cleaned String from an input string. 
        Removes stop words, punctuations, numbers and any special tokens given.
        
        Input:  text : str                  --> input string to be cleaned
                stop_words : list[str]      --> (Optional) list of stop words to be removed from the text
                punct : list[str]           --> (Optional) list of punctuations to be removed from the text
                special_tokens : list[str]  --> (Optional) list of special words to be removed from the text

        Output: text: string:  cleaned text
        """

        text= text.lower()
        
        text = text.replace("\n"," ")
        
        text = text.replace(r'[0-9]+', ' ')
        text = text.replace(r'[^\w\s]', ' ')
        text = text.replace(r'[^a-zA-Z]', ' ')
        for p in punct:
            text = text.replace(p," ") 
            
        text = ' '.join([word for word in text.split() if word not in stop_words])
        text = ' '.join([word for word in text.split() if word not in special_tokens])
        text = ''.join([i for i in text if not i.isdigit()])
        text = text.replace(r'\s+', ' ')
        text = ' '.join([i for i in text.split() if len(i)>1])
        
        text = text.replace(r'\s+', ' ')
        return text

def get_clean_job_str(job_title, job_post):
    """ 
    This method returns cleaned Job Posting from Job Title and Job String. 
    Appends title and body tokens and concatenates the two.
    
    Input: Job Title Raw : string
            Job Body Raw : string
    Output: job_str: string:  cleaned and concatenated job details
    """
    title_token = "[TTL] "
    body_token = " [DESC] "

    job_title = clean_text(job_title, special_tokens=get_special_tokens())
    job_post = clean_text(job_post, special_tokens=get_special_tokens())

    job_str = title_token + job_title + body_token + job_post

    return job_str
def get_all_onets(onet_data_path="../data/raw/All_Occupations.csv"):
    """ 
    This method returns list of all ONETs available on the official site
    
    Input: path: str -->  (Optional) Path to the onet csv file

    Output: : all_onets_original: list[str] --> all ONETs available
    """
    all_occupations_df = pd.read_csv(onet_data_path)
    all_onets_original = all_occupations_df.Occupation.to_list()
    return all_onets_original

def get_onet_dicts(all_onets_original=get_all_onets()):
    """ 
    This method returns 2 dictionaries used to map standard ONET Names to string IDs. 
    
    Input: all_onets_original: list[str] --> (Optional) list of all ONETs

    Output: : id_to_onet_dict: dict[str, str] --> standard mapping of string id to ONETs --> "id" : "ONET_NAME"
              onet_to_id_dict: dict[str, str] --> standard mapping of ONETs to string id --> "ONET_NAME" : "id"
    """
    id_to_onet_dict = {str(id):onet for id, onet in enumerate(all_onets_original)}
    onet_to_id_dict = {onet:id for id,onet in id_to_onet_dict.items()}
    return id_to_onet_dict, onet_to_id_dict

### Calculate Sentence embeddings using SBERT models

In [6]:
from transformers import pipeline, set_seed, AutoTokenizer, AutoModel
from datasets import load_dataset, load_from_disk, load_metric
import torch
from torch.utils.data import DataLoader
import tqdm

from sentence_transformers import SentenceTransformer, models, InputExample, losses, evaluation, util

In [4]:
def get_embd(input_docs, model=None, model_ckpt=None, save_embd=False, save_path=None):
    """ 
    This method computes embedding of the given string or list of strings using the given SBERT model. 
    
    Input: input_docs: str or list[str] --> list of input string. Can be a single string which will be converted to a list.
           model: SentenceTransformers() --> (Optional) pre-trained SBERT model, if None, checks for path 
           model_ckpt: str --> (Optional) pre-trained SBERT model checkpoint path, if None, loads basic SBERT from HF library 
           save_embd: Boolean --> (Optional) Flag to save computed embeddings.
           save_path: str --> (Optional) Path to save computed embeddings. If empty, new path will be created

    Output: simple_embd: numpy.ndarray (len(input), embd_size) --> array of Computed sentence embeddings. 
    """

    if not input_docs:
        return None
    if not isinstance(input_docs, list):
        print("convert string to list")
        input_docs = [input_docs]       

    if model:
        print("loading model as it is")
        sbert_model = model
    elif model_ckpt:
        if not os.path.exists(model_ckpt):
            model_ckpt = "shriadke/adept-job-msmarco-distilbert-base-v4"
        print("loading model from ckpt: ", model_ckpt)
        sbert_model = SentenceTransformer(model_ckpt)
    else:
        print("loading HF base model")
        sbert_model = SentenceTransformer("msmarco-distilbert-base-v4")
    
    simple_embd = sbert_model.encode(input_docs, show_progress_bar=True)
    
    if save_embd and save_path:
        if not os.path.exists(save_path):
            os.makedirs(save_path, exist_ok=True)
            #os.chmod(save_path+"embd.pkl", 0o777)
        with open(save_path+"embd.pkl", "wb") as fOut:
            print("lenght of docs: ", len(input_docs))
            print("lenght of embd: ", len(simple_embd))
            pickle.dump({'input': input_docs, 'embeddings': simple_embd}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

    return simple_embd

In [9]:
# Testing get embeddings
id_to_onet_dict, onet_to_id_dict = get_onet_dicts()
all_onets_original = get_all_onets()
print("O*NET: ", all_onets_original[0], " is converted to a vector of shape: ", get_embd(all_onets_original[0]).shape)

convert string to list
loading HF base model


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

O*NET:  Accountants and Auditors  is converted to a vector of shape:  (1, 768)


In [10]:
def get_onet_embeddings(model=None, model_ckpt=None, onet_embd_path=None, save_embd=False, save_path="../data/processed/embeddings/onet/"):
    """ 
    This method computes embedding for all ONETs available. 
    
    Input: model: SentenceTransformers() --> (Optional) pre-trained SBERT model, if None, checks for path 
           model_ckpt: str --> (Optional) pre-trained SBERT model checkpoint path, if None, loads basic SBERT from HF library 
           onet_embd_path: str --> (Optional) Path to load computed embeddings from pickle. If empty, Embeddings will be computed from scratch. 
           save_embd: Boolean --> (Optional) Flag to save computed embeddings.
           save_path: str --> (Optional) Path to save computed embeddings. If empty, new path will be created

    Output: onet_embd_df: pandas.DataFrame --> Dataframe with 2 columns:["ONET_NAME", "ONET_EMBD"] 
                                               Computed sentence embeddings can be stored as key, val pair. 
    """
    onet_embd_df = pd.DataFrame(columns=["ONET_NAME", "ONET_EMBD"])
    
    if not onet_embd_path:
        # Create onet embeddings from all onet data
        # Get list of all ONETs
        all_onets_original = get_all_onets()

        # Compute Embeddings of the entire list 
        simple_embd = get_embd(all_onets_original, model=model, model_ckpt=model_ckpt, save_embd=save_embd, save_path=save_path)

        # Save Embds as dataframe
        onet_embd_df["ONET_NAME"] = pd.Series(all_onets_original)
        onet_embd_df["ONET_EMBD"] = pd.Series([arr for arr in simple_embd])
        
    elif os.path.exists(onet_embd_path):
        #Load sentences & embeddings from disc
        with open(onet_embd_path, "rb") as fIn:
            stored_data = pickle.load(fIn)
            onet_embd_df["ONET_NAME"] = stored_data['input']
            onet_embd_df["ONET_EMBD"] = pd.Series([arr for arr in stored_data['embeddings']])
    
    print("Total ONETs available: ",len(onet_embd_df))
    return onet_embd_df

In [12]:
def get_job_embed_df_from_df(job_df=None,model=None, model_ckpt=None, job_embd_path=None, save_embd=False, save_path="../data/processed/embeddings/job/"):
    """ 
    This method computes embedding for all ONETs available. 
    
    Input: job_df: pandas.DataFrame --> (Optional) Raw job data Dataframe with at least 2 columns:["TITLE_RAW", "BODY"], if empty, loads precomputed.
           model: SentenceTransformers() --> (Optional) pre-trained SBERT model, if None, checks for path 
           model_ckpt: str --> (Optional) pre-trained SBERT model checkpoint path, if None, loads basic SBERT from HF library 
           job_embd_path: str --> (Optional) Path to load computed embeddings from pickle. If empty, Embeddings will be computed from scratch. 
           save_embd: Boolean --> (Optional) Flag to save computed embeddings.
           save_path: str --> (Optional) Path to save computed embeddings. If empty, new path will be created

    Output: onet_embd_df: pandas.DataFrame --> Dataframe with 2 columns:["ONET_NAME", "ONET_EMBD"] 
                                               Computed sentence embeddings can be stored as key, val pair. 
    """
    if job_df is None:
        print("Path to Job DF Given")
        job_df = pd.DataFrame(columns=["TITLE_RAW","BODY", "CLEANED_JOB", "JOB_EMBD"])
        # load embeddings from stored DF embeddings
        if os.path.exists(job_embd_path):
            #Load sentences & embeddings from disc
            with open(job_embd_path, "rb") as fIn:
                stored_data = pickle.load(fIn)
                # Loads cleaned job str and its embeddings
                job_df["CLEANED_JOB"] = stored_data['input']
                job_df["JOB_EMBD"] = pd.Series([arr for arr in stored_data['embeddings']])
                job_df["TITLE_RAW"] = job_df["CLEANED_JOB"].apply(lambda x:x[6:x.find(" [DESC] ")])
                job_df["BODY"] = job_df["CLEANED_JOB"].apply(lambda x:x[x.find(" [DESC] ")+1:])
        else:
            print("Path to Job DF does not exists")
            return job_df
    elif len(job_df) > 0:
        # DF present, compute from Raw DF
        if not ("TITLE_RAW" in job_df.columns and "BODY" in job_df.columns):
            print("Incomplete DataFrame, please try again")
            return None
        if not "CLEANED_JOB" in job_df.columns:
            job_df["CLEANED_JOB"] = job_df.apply(lambda x:get_clean_job_str(x["TITLE_RAW"], x["BODY"]), axis=1)
        
        simple_embd = get_embd(job_df["CLEANED_JOB"].to_list(), model=model, model_ckpt=model_ckpt, save_embd=save_embd, save_path=str(save_path)+str(len(job_df))+"/")

        job_df["JOB_EMBD"] = pd.Series([arr for arr in simple_embd])
    else:
        print("Unexpected Input Job DF, please try again")
    
    print("Total Jobs available: ",len(job_df)) 
    return job_df

def get_job_embd_df_frm_title_body(job_title, job_body, model=None, model_ckpt=None):
    job_df = pd.DataFrame({ "TITLE_RAW" : [job_title],
                            "BODY"      : [job_body], })
                            #"CLEANED_JOB": get_clean_job_str(job_title, job_body)
    job_df = get_job_embed_df_from_df(job_df=job_df, model=model, model_ckpt=model_ckpt)

    return job_df

In [14]:
# Generating embeddings from dummy dataframe of 5 examples
temp_train_df = get_job_embed_df_from_df(job_df=train_df.copy().head())
temp_train_df.head()

loading HF base model


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Total Jobs available:  5


Unnamed: 0,ID,POSTED,TITLE_RAW,BODY,ONET_NAME,ONET,CLEANED_JOB,JOB_EMBD
0,3a9bc988d77e46507f6753429dd848a816d0b9b9,2023-05-03,Executive Meeting Manager,Executive Meeting Manager Marriott La Jolla - ...,"Meeting, Convention, and Event Planners",13-1121.00,[TTL] executive meeting manager [DESC] executi...,"[-0.08524054, 0.20759732, 0.20139317, -0.12825..."
1,eb3a017370d55577e892ff8207a640b7d7136f31,2023-05-03,Rehabilitation Technician-Outpatient Rehab-Fle...,Rehabilitation Technician-Outpatient Rehab-Fle...,Occupational Therapy Aides,31-2012.00,[TTL] rehabilitation technician outpatient reh...,"[0.13869289, -0.4502431, 0.76077837, -0.099650..."
2,8717d2213055d39271bd12490263a7fbe603aedb,2023-05-03,Office/Bookkeeping Assistant,"Office/Bookkeeping Assistant\nSanta Barbara, C...","Office Clerks, General",43-9061.00,[TTL] office bookkeeping assistant [DESC] offi...,"[0.51665556, 0.4805767, 0.16331044, -0.1097672..."
3,43b55e4334835e20e1c64d9ac7bb0a0267369b9e,2023-05-03,Administrative Support Coordinator - VA - (REM...,Find Jobs Administrative Support Coordinator -...,"Secretaries and Administrative Assistants, Exc...",43-6014.00,[TTL] administrative support coordinator va re...,"[-0.22788213, -0.17873518, 0.07261607, -0.1571..."
4,afa355a328687ddb88d6265a237d0375bb36eae7,2023-05-03,Receptionist/Administrative Assistant,Receptionist/Administrative Assistant Burgess ...,"Secretaries and Administrative Assistants, Exc...",43-6014.00,[TTL] receptionist administrative assistant [D...,"[0.33224586, 0.061779127, 0.43059358, -0.18563..."


In [15]:
# Generating embeddings from Single record
temp_train_df = get_job_embd_df_frm_title_body(train_df["TITLE_RAW"][0],train_df["BODY"][0])
temp_train_df.head()

loading HF base model


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Total Jobs available:  1


Unnamed: 0,TITLE_RAW,BODY,CLEANED_JOB,JOB_EMBD
0,Executive Meeting Manager,Executive Meeting Manager Marriott La Jolla - ...,[TTL] executive meeting manager [DESC] executi...,"[-0.08524054, 0.20759732, 0.20139317, -0.12825..."


### Finetuning SBERT for custom data

In [16]:
# Load pretrained model 
model_ckpt = "msmarco-distilbert-base-v4"

sbert_model = SentenceTransformer(model_ckpt)

word_embedding_model = sbert_model._first_module()

tokens = ["[TTL] ", " [DESC] "]
word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

Embedding(30524, 768)

In [19]:
# Data transformation into train, val, test splits

def get_dataloader(data, split):
        examples = []
        data = data[split]
        n_examples = data.num_rows

        for i in range(n_examples):
            example = data[i]
            examples.append(InputExample(texts=[example['CLEAN_JOB'], example['ONET_NAME']], label=float(1)))
        logger.info(f"in {split}, We have a {type(examples)} of length {len(examples)} containing {type(examples[0])}'s.")
        dataloader = DataLoader(examples, shuffle=True, batch_size=16)
        return examples, dataloader

def convert(self):

    # get csv data to df
    train_df = pd.read_csv(self.config.data_path+"raw_train/train_data.csv").head()
    print("TITLE_RAW"  in train_df.columns)
    test_df = pd.read_csv(self.config.data_path+"raw_test/test_data.csv").head()

    # add clean col to df
    train_df["CLEAN_JOB"] = train_df.apply(lambda x:get_clean_job_str(x["TITLE_RAW"], x["BODY"]), axis=1)
    test_df["CLEAN_JOB"] = test_df.apply(lambda x:get_clean_job_str(x["TITLE_RAW"], x["BODY"]), axis=1)

    # split train/val/test data
    train_ratio = 0.85
    val_ratio = 0.15
    train_df, val_df = train_test_split(train_df, test_size=1 - train_ratio, random_state=42, shuffle=True)

    final_data = DatasetDict({
        "train" : Dataset.from_pandas(train_df).remove_columns(["__index_level_0__"]),
        "val" : Dataset.from_pandas(val_df).remove_columns(["__index_level_0__"]),
        "test" : Dataset.from_pandas(test_df)
    })
    #final_data.save_to_disk( os.path.join(self.config.data_path,"final_data/"))

    dataset = final_data#load_from_disk( os.path.join(self.config.data_path,"final_data/"))

    
    train_examples, train_dataloader = self.get_dataloader(dataset, "train")
    torch.save(train_dataloader, os.path.join(self.config.root_dir,"train.pth"))

    
    val_examples, val_dataloader = self.get_dataloader(dataset, "val")
    val_evaluator = evaluation.EmbeddingSimilarityEvaluator([],[],[]).from_input_examples(examples=val_examples)
    torch.save(val_dataloader, os.path.join(self.config.root_dir,"val.pth"))
    torch.save(val_evaluator, os.path.join(self.config.root_dir,"val_eval.pth"))

    test_examples, test_dataloader = self.get_dataloader(dataset, "test")
    test_evaluator = evaluation.EmbeddingSimilarityEvaluator([],[],[]).from_input_examples(examples=test_examples)
    torch.save(test_dataloader, os.path.join(self.config.root_dir,"test.pth"))
    torch.save(test_evaluator, os.path.join(self.config.root_dir,"test_eval.pth"))

### Further Transformation Moved to [`research/03_model_transformation.ipynb`](https://github.com/shriadke/JobClassification/blob/master/research/03_model_transformation.ipynb)

In [None]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_examples, shuffle=True, batch_size=16)
test_dataloader = DataLoader(test_examples, shuffle=True, batch_size=16)

# training Loss used when all examples are positive pairs
loss = losses.MultipleNegativesRankingLoss(model=sbert_model)

# TRAINING ARGS
num_epochs = 2
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data
weight_decay = 0.01
output_path = "./models/"

In [None]:
# change test_dataloader while training
sbert_model.fit(train_objectives=[(train_dataloader, loss)], epochs = num_epochs, warmup_steps= warmup_steps, weight_decay=weight_decay, output_path= output_path)

# Here the model is trained on 85% of training data for 2 iterations to get the finetuned model

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/953 [00:00<?, ?it/s]

Iteration:   0%|          | 0/953 [00:01<?, ?it/s]

### Further Training Moved to [`research/04_model_trainer.ipynb`](https://github.com/shriadke/JobClassification/blob/master/research/04_model_trainer.ipynb)

In [None]:
from huggingface_hub import login
login()

In [None]:
# Store model to hub
sbert_model.save_to_hub(
    "shriadke/adept-job-msmarco-distilbert-base-v4", 
    organization="",
    exist_ok=True, 
    )

### Further Evaluation Moved to [`research/05_model_evaluation.ipynb`](https://github.com/shriadke/JobClassification/blob/master/research/05_model_evaluation.ipynb)

### Generating Embeddings for input

In [None]:
# Generate ONETs to corresponding embeddings and save it to pickle
onet_embd_df = get_onet_embeddings(onet_embd_path=None, save_embd=True, save_path="../data/processed/embeddings/onet/")

In [None]:
onet_embd_df.head()

Unnamed: 0,ONET_NAME,ONET_EMBD
0,Accountants and Auditors,"[-0.81990623, 0.36192688, 0.5624391, 0.611101,..."
1,Actors,"[0.33532786, 0.5877481, -0.66991186, -0.950445..."
2,Actuaries,"[-0.1727865, -0.7303153, 0.28830737, -0.694946..."
3,Acupuncturists,"[-0.4327046, 0.42423505, -0.465206, 0.6233071,..."
4,Acute Care Nurses,"[-0.562286, -0.8178085, -0.08535522, -1.267615..."


In [None]:
# Load saved embeddings from above pickle
onet_embd_df = get_onet_embeddings(onet_embd_path="./data/processed/embeddings/onet/embd.pkl", save_embd=False, save_path=None)
onet_embd_df.head()

1017


Unnamed: 0,ONET_NAME,ONET_EMBD
0,Accountants and Auditors,"[-0.81990623, 0.36192688, 0.5624391, 0.611101,..."
1,Actors,"[0.33532786, 0.5877481, -0.66991186, -0.950445..."
2,Actuaries,"[-0.1727865, -0.7303153, 0.28830737, -0.694946..."
3,Acupuncturists,"[-0.4327046, 0.42423505, -0.465206, 0.6233071,..."
4,Acute Care Nurses,"[-0.562286, -0.8178085, -0.08535522, -1.267615..."


### Compute embeddings on train and test data to store them for later use

In [None]:
temp_train_df = pd.read_csv("../artifacts/data_ingestion/raw_train/train_data.csv")
temp_train_df = get_job_embed_df_from_df(job_df=temp_train_df,model_ckpt="models/", save_embd=True, save_path="./data/processed/embeddings/job/custom_model_1/train/")

loading model from ckpt


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Batches:   0%|          | 0/561 [00:00<?, ?it/s]

lenght of docs:  17927
lenght of embd:  17927
Total Jobs available:  17927


In [None]:
temp_test_df = pd.read_csv("../artifacts/data_ingestion/raw_test/test_data.csv")
temp_test_df = get_job_embed_df_from_df(job_df=temp_test_df,model_ckpt="models/", save_embd=True, save_path="./data/processed/embeddings/job/custom_model_1/test/")

loading model from ckpt


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Batches:   0%|          | 0/607 [00:00<?, ?it/s]

lenght of docs:  19394
lenght of embd:  19394
Total Jobs available:  19394


#### Load Saved dataframe embeddings

In [None]:
temp_test_df = test_df.drop(["ID", "POSTED","ONET"], axis=1)

test_job_df = get_job_embed_df_from_df(job_embd_path="data/processed/embeddings/job/custom_model_1/test/19394/embd.pkl")

temp_test_df["CLEANED_JOB"] = test_job_df["CLEANED_JOB"]
temp_test_df["JOB_EMBD"] = test_job_df["JOB_EMBD"]
id_to_onet_dict, onet_to_id_dict = get_onet_dicts()
temp_test_df["ONET_ID"] = temp_test_df["ONET_NAME"].apply(lambda x:int(onet_to_id_dict[x]))
temp_test_df.head()

Path to Job DF Given
Total Jobs available:  19394


Unnamed: 0,TITLE_RAW,BODY,ONET_NAME,CLEANED_JOB,JOB_EMBD,ONET_ID
0,Grocery Order Writer (Buyer / Inventory Replen...,Grocery Order Writer (Buyer / Inventory Replen...,"Purchasing Agents, Except Wholesale, Retail, a...",[TTL] grocery order writer buyer inventory rep...,"[-0.98015654, -0.2370125, -0.054709036, 0.5909...",794
1,Superintendent,Apply to this job. \nThink you're the perfect ...,"Education Administrators, Kindergarten through...",[TTL] superintendent [DESC] think perfect cand...,"[-0.7707803, 0.068227395, -0.3073499, 0.680984...",277
2,Software Developer IV,Software Developer IV\nJob Locations\nUS-NE-Om...,Software Developers,[TTL] software developer iv [DESC] software de...,"[-0.62012154, 0.47546214, 0.3680548, 0.3727675...",889
3,Auto Glass Technician,Auto Glass Technician Gerber Collision & Glass...,Automotive Service Technicians and Mechanics,[TTL] auto glass technician [DESC] auto glass ...,"[0.7393341, 0.5710749, 0.3004027, 0.2613186, 0...",76
4,Food and Beverage Operations Manager,Food and Beverage Operations Manager Wavetroni...,Food Service Managers,[TTL] food beverage operations manager [DESC] ...,"[-0.7466557, 0.78241795, -0.22036402, -0.59654...",404


### Testing the Semantic search on Test Data

In [None]:
# Prepare Test DataLoader from saved dataset
test_examples = []
test_data = dataset['test']

n_examples = dataset['test'].num_rows
test_queries = {}
test_relevant_docs = {}

for i in range(n_examples):
  example = test_data[i]
  test_examples.append(InputExample(texts=[example['job_post'], example['onet_name']]))
  test_queries[str(i)] = example['job_post']
  test_relevant_docs[str(i)] = onet_to_id_dict[example["onet_name"]]
print(f"We have a {type(test_examples)} of length {len(test_examples)} containing {type(test_examples[0])}'s.")

In [None]:
test_queries_ids = []
for qid in test_queries:
    if qid in test_relevant_docs and len(test_relevant_docs[qid]) > 0:
        test_queries_ids.append(qid)

test_queries = [test_queries[qid] for qid in test_queries_ids]


In [None]:
test_q_embd = get_embd(test_queries, model=sbert_model)
test_hits = util.semantic_search(test_q_embd, np.array(onet_embd_df["ONET_EMBD"].to_list()), top_k=10)


loading model as it is


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Custom code to calculate Performance metrics

acc_at_k = [1,5,10]
num_hits_at_k = {k:0 for k in acc_at_k}

precision_recall_at_k = [1,5,10]
precisions_at_k = {k:[] for k in precision_recall_at_k}
recall_at_k = {k:[] for k in precision_recall_at_k}

mrr_at_k = [1,5,10]
MRR = {k:0 for k in mrr_at_k}


for test_qid in range(len(test_hits)):

    qid = test_queries_ids[test_qid]
    # Sort scores
    top_hits = test_hits[test_qid]
    query_relevant_docs = [int(test_relevant_docs[qid])]

    # Accuracy@k - We count the result correct, if at least one relevant doc is accross the top-k documents
    for k_val in acc_at_k:
        for hit in top_hits[0:k_val]:
            if hit['corpus_id'] in query_relevant_docs:
                num_hits_at_k[k_val] += 1
                break

    # Precision and Recall@k
    for k_val in precision_recall_at_k:
        num_correct = 0
        for hit in top_hits[0:k_val]:
            if hit['corpus_id'] in query_relevant_docs:
                num_correct += 1

        precisions_at_k[k_val].append(num_correct / k_val)
        recall_at_k[k_val].append(num_correct / len(query_relevant_docs))

    # MRR@k
    for k_val in mrr_at_k:
        for rank, hit in enumerate(top_hits[0:k_val]):
            if hit['corpus_id'] in query_relevant_docs:
                MRR[k_val] += 1.0 / (rank + 1)
                break

In [None]:
for k in num_hits_at_k:
    num_hits_at_k[k] /= len(test_queries)

for k in precisions_at_k:
    precisions_at_k[k] = np.mean(precisions_at_k[k])

for k in recall_at_k:
    recall_at_k[k] = np.mean(recall_at_k[k])

for k in MRR:
    MRR[k] /= len(test_queries)

op_dict =  {'accuracy@k': num_hits_at_k, 'precision@k': precisions_at_k, 'recall@k': recall_at_k, 'mrr@k': MRR}
for key,val in op_dict.items():
    print(key, val)



accuracy@k {1: 0.5129533678756477, 5: 0.7461139896373057, 10: 0.8134715025906736}
precision@k {1: 0.5129533678756477, 5: 0.14922279792746113, 10: 0.08134715025906736}
recall@k {1: 0.5129533678756477, 5: 0.7461139896373057, 10: 0.8134715025906736}
mrr@k {1: 0.5129533678756477, 5: 0.6051813471502591, 10: 0.6137326260383255}
