# Classifier Research

### Purpose

1. To generate/load the sentence embeddings in run-time or from stored location,
2. Identify different classifier heads suitable for this classification task,
3. Train and test classifier model,
4. Evaluate/compare the performance on test data,
5. Define and test the common functions necessary for actual prediction pipeline i.e. `predict()` method. 

In [1]:
import pandas as pd
import csv
import os
import numpy as np
import pickle
import string

from nltk.corpus import stopwords
import nltk
from nltk.tokenize import sent_tokenize
import tqdm

from sklearn.model_selection import train_test_split
from transformers import pipeline, set_seed, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, models, InputExample, losses, evaluation, util
from sentence_transformers.util import cos_sim, dot_score


import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, load_from_disk, load_metric

### Define Common Functions for data, embedding processing

In [None]:
def get_special_tokens(path="../data/raw/special_words.txt"):
    """ 
    This method returns list of special words to be removed from the text.
    
    Input: path: str --> Path to the word file

    Output: : special_tokens: list[str] --> all words/special tokens to be removed
    """
    special_tokens=[]
    if not os.path.exists(path):
        return []

    with open(path, "r") as f:
        special_tokens = f.readlines()
    special_tokens = [line.rstrip('\n') for line in special_tokens]
    
    return special_tokens

def clean_text(text, stop_words=stopwords.words('english'), punct=string.punctuation, special_tokens=[]):
        """ 
        This method returns cleaned String from an input string. 
        Removes stop words, punctuations, numbers and any special tokens given.
        
        Input:  text : str                  --> input string to be cleaned
                stop_words : list[str]      --> (Optional) list of stop words to be removed from the text
                punct : list[str]           --> (Optional) list of punctuations to be removed from the text
                special_tokens : list[str]  --> (Optional) list of special words to be removed from the text

        Output: text: string:  cleaned text
        """

        text= text.lower()
        
        text = text.replace("\n"," ")
        
        text = text.replace(r'[0-9]+', ' ')
        text = text.replace(r'[^\w\s]', ' ')
        text = text.replace(r'[^a-zA-Z]', ' ')
        for p in punct:
            text = text.replace(p," ") 
            
        text = ' '.join([word for word in text.split() if word not in stop_words])
        text = ' '.join([word for word in text.split() if word not in special_tokens])
        text = ''.join([i for i in text if not i.isdigit()])
        text = text.replace(r'\s+', ' ')
        text = ' '.join([i for i in text.split() if len(i)>1])
        
        text = text.replace(r'\s+', ' ')
        return text

def get_clean_job_str(job_title, job_post):
    """ 
    This method returns cleaned Job Posting from Job Title and Job String. 
    Appends title and body tokens and concatenates the two.
    
    Input: Job Title Raw : string
            Job Body Raw : string
    Output: job_str: string:  cleaned and concatenated job details
    """
    title_token = "[TTL] "
    body_token = " [DESC] "

    job_title = clean_text(job_title, special_tokens=get_special_tokens())
    job_post = clean_text(job_post, special_tokens=get_special_tokens())

    job_str = title_token + job_title + body_token + job_post

    return job_str
def get_all_onets(onet_data_path="../data/raw/All_Occupations.csv"):
    """ 
    This method returns list of all ONETs available on the official site
    
    Input: path: str -->  (Optional) Path to the onet csv file

    Output: : all_onets_original: list[str] --> all ONETs available
    """
    all_occupations_df = pd.read_csv(onet_data_path)
    all_onets_original = all_occupations_df.Occupation.to_list()
    return all_onets_original

def get_onet_dicts(all_onets_original=get_all_onets()):
    """ 
    This method returns 2 dictionaries used to map standard ONET Names to string IDs. 
    
    Input: all_onets_original: list[str] --> (Optional) list of all ONETs

    Output: : id_to_onet_dict: dict[str, str] --> standard mapping of string id to ONETs --> "id" : "ONET_NAME"
              onet_to_id_dict: dict[str, str] --> standard mapping of ONETs to string id --> "ONET_NAME" : "id"
    """
    id_to_onet_dict = {str(id):onet for id, onet in enumerate(all_onets_original)}
    onet_to_id_dict = {onet:id for id,onet in id_to_onet_dict.items()}
    return id_to_onet_dict, onet_to_id_dict

def get_embd(input_docs, model=None, model_ckpt=None, save_embd=False, save_path=None):
    """ 
    This method computes embedding of the given string or list of strings using the given SBERT model. 
    
    Input: input_docs: str or list[str] --> list of input string. Can be a single string which will be converted to a list.
           model: SentenceTransformers() --> (Optional) pre-trained SBERT model, if None, checks for path 
           model_ckpt: str --> (Optional) pre-trained SBERT model checkpoint path, if None, loads basic SBERT from HF library 
           save_embd: Boolean --> (Optional) Flag to save computed embeddings.
           save_path: str --> (Optional) Path to save computed embeddings. If empty, new path will be created

    Output: simple_embd: numpy.ndarray (len(input), embd_size) --> array of Computed sentence embeddings. 
    """

    if not input_docs:
        return None
    if not isinstance(input_docs, list):
        print("convert string to list")
        input_docs = [input_docs]       

    if model:
        print("loading model as it is")
        sbert_model = model
    elif model_ckpt:
        print("loading model from ckpt")
        if not os.path.exists(model_ckpt):
            model_ckpt = "shriadke/adept-job-msmarco-distilbert-base-v4"
        print("loading model from ckpt: ", model_ckpt)
        sbert_model = SentenceTransformer(model_ckpt)
    else:
        print("loading HF base model")
        sbert_model = SentenceTransformer("msmarco-distilbert-base-v4")
    
    simple_embd = sbert_model.encode(input_docs, show_progress_bar=True)
    
    if save_embd and save_path:
        if not os.path.exists(save_path):
            os.makedirs(save_path, exist_ok=True)
            #os.chmod(save_path+"embd.pkl", 0o777)
        with open(save_path+"embd.pkl", "wb") as fOut:
            print("lenght of docs: ", len(input_docs))
            print("lenght of embd: ", len(simple_embd))
            pickle.dump({'input': input_docs, 'embeddings': simple_embd}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

    return simple_embd

def get_onet_embeddings(model=None, model_ckpt=None, onet_embd_path=None, save_embd=False, save_path="./data/processed/embeddings/onet/"):
    """ 
    This method computes embedding for all ONETs available. 
    
    Input: model: SentenceTransformers() --> (Optional) pre-trained SBERT model, if None, checks for path 
           model_ckpt: str --> (Optional) pre-trained SBERT model checkpoint path, if None, loads basic SBERT from HF library 
           onet_embd_path: str --> (Optional) Path to load computed embeddings from pickle. If empty, Embeddings will be computed from scratch. 
           save_embd: Boolean --> (Optional) Flag to save computed embeddings.
           save_path: str --> (Optional) Path to save computed embeddings. If empty, new path will be created

    Output: onet_embd_df: pandas.DataFrame --> Dataframe with 2 columns:["ONET_NAME", "ONET_EMBD"] 
                                               Computed sentence embeddings can be stored as key, val pair. 
    """
    onet_embd_df = pd.DataFrame(columns=["ONET_NAME", "ONET_EMBD"])
    
    if not onet_embd_path:
        # Create onet embeddings from all onet data
        # Get list of all ONETs
        all_onets_original = get_all_onets()

        # Compute Embeddings of the entire list 
        simple_embd = get_embd(all_onets_original, model=model, model_ckpt=model_ckpt, save_embd=save_embd, save_path=save_path)

        # Save Embds as dataframe
        onet_embd_df["ONET_NAME"] = pd.Series(all_onets_original)
        onet_embd_df["ONET_EMBD"] = pd.Series([arr for arr in simple_embd])
        
    elif os.path.exists(onet_embd_path):
        #Load sentences & embeddings from disc
        with open(onet_embd_path, "rb") as fIn:
            stored_data = pickle.load(fIn)
            onet_embd_df["ONET_NAME"] = stored_data['input']
            onet_embd_df["ONET_EMBD"] = pd.Series([arr for arr in stored_data['embeddings']])
    
    print("Total ONETs available: ",len(onet_embd_df))
    return onet_embd_df

def get_job_embed_df_from_df(job_df=None,model=None, model_ckpt=None, job_embd_path=None, save_embd=False, save_path="./data/processed/embeddings/job/"):
    """ 
    This method computes embedding for all ONETs available. 
    
    Input: job_df: pandas.DataFrame --> (Optional) Raw job data Dataframe with at least 2 columns:["TITLE_RAW", "BODY"], if empty, loads precomputed.
           model: SentenceTransformers() --> (Optional) pre-trained SBERT model, if None, checks for path 
           model_ckpt: str --> (Optional) pre-trained SBERT model checkpoint path, if None, loads basic SBERT from HF library 
           job_embd_path: str --> (Optional) Path to load computed embeddings from pickle. If empty, Embeddings will be computed from scratch. 
           save_embd: Boolean --> (Optional) Flag to save computed embeddings.
           save_path: str --> (Optional) Path to save computed embeddings. If empty, new path will be created

    Output: onet_embd_df: pandas.DataFrame --> Dataframe with 2 columns:["ONET_NAME", "ONET_EMBD"] 
                                               Computed sentence embeddings can be stored as key, val pair. 
    """
    if job_df is None:
        print("Path to Job DF Given")
        job_df = pd.DataFrame(columns=["TITLE_RAW","BODY", "CLEANED_JOB", "JOB_EMBD"])
        # load embeddings from stored DF embeddings
        if os.path.exists(job_embd_path):
            #Load sentences & embeddings from disc
            with open(job_embd_path, "rb") as fIn:
                stored_data = pickle.load(fIn)
                # Loads cleaned job str and its embeddings
                job_df["CLEANED_JOB"] = stored_data['input']
                job_df["JOB_EMBD"] = pd.Series([arr for arr in stored_data['embeddings']])
                job_df["TITLE_RAW"] = job_df["CLEANED_JOB"].apply(lambda x:x[6:x.find(" [DESC] ")])
                job_df["BODY"] = job_df["CLEANED_JOB"].apply(lambda x:x[x.find(" [DESC] ")+1:])
        else:
            print("Path to Job DF does not exists")
            return job_df
    elif len(job_df) > 0:
        # DF present, compute from Raw DF
        if not ("TITLE_RAW" in job_df.columns and "BODY" in job_df.columns):
            print("Incomplete DataFrame, please try again")
            return None
        if not "CLEANED_JOB" in job_df.columns:
            job_df["CLEANED_JOB"] = job_df.apply(lambda x:get_clean_job_str(x["TITLE_RAW"], x["BODY"]), axis=1)
        
        simple_embd = get_embd(job_df["CLEANED_JOB"].to_list(), model=model, model_ckpt=model_ckpt, save_embd=save_embd, save_path=str(save_path)+str(len(job_df))+"/")

        job_df["JOB_EMBD"] = pd.Series([arr for arr in simple_embd])
    else:
        print("Unexpected Input Job DF, please try again")
    
    print("Total Jobs available: ",len(job_df)) 
    return job_df

def get_job_embd_df_frm_title_body(job_title, job_body, model_ckpt=None):
    job_df = pd.DataFrame({ "TITLE_RAW" : [job_title],
                            "BODY"      : [job_body], })
                            #"CLEANED_JOB": get_clean_job_str(job_title, job_body)
    job_df = get_job_embed_df_from_df(job_df=job_df, model_ckpt=None)

    return job_df  

### Get Preprocessed embeddings for training data

In [11]:
# Load data
temp_train_df = pd.read_csv("../artifacts/data_ingestion/raw_train/train_data.csv")
temp_train_df = temp_train_df.drop(["ID", "POSTED","ONET"], axis=1)
temp_train_df.head()
id_to_onet_dict, onet_to_id_dict = get_onet_dicts()

In [12]:
# Get stored data frame along with embeddings
job_df = get_job_embed_df_from_df(job_embd_path="data/processed/embeddings/job/custom_model_1/train/17927/embd.pkl")
job_df.head()

Path to Job DF Given
Total Jobs available:  17927


Unnamed: 0,TITLE_RAW,BODY,CLEANED_JOB,JOB_EMBD
0,executive meeting manager,[DESC] executive meeting manager marriott la j...,[TTL] executive meeting manager [DESC] executi...,"[-0.8965845, 0.21626818, 0.4062995, 0.25944906..."
1,rehabilitation technician outpatient rehab fle...,[DESC] rehabilitation technician outpatient re...,[TTL] rehabilitation technician outpatient reh...,"[-0.3449566, -0.01683965, -0.33390927, -0.0695..."
2,office bookkeeping assistant,[DESC] office bookkeeping assistant santa barb...,[TTL] office bookkeeping assistant [DESC] offi...,"[-0.4091166, 1.4821604, 0.5247243, -0.10980953..."
3,administrative support coordinator va remote,[DESC] find jobs administrative support coordi...,[TTL] administrative support coordinator va re...,"[-0.8193301, 0.6742436, 0.6131675, -0.277765, ..."
4,receptionist administrative assistant,[DESC] receptionist administrative assistant b...,[TTL] receptionist administrative assistant [D...,"[-0.3198364, 1.0830945, 0.35040185, -0.0614275..."


In [13]:
temp_train_df["CLEANED_JOB"] = job_df["CLEANED_JOB"]
temp_train_df["JOB_EMBD"] = job_df["JOB_EMBD"]
temp_train_df.head()

Unnamed: 0,TITLE_RAW,BODY,ONET_NAME,CLEANED_JOB,JOB_EMBD
0,Executive Meeting Manager,Executive Meeting Manager Marriott La Jolla - ...,"Meeting, Convention, and Event Planners",[TTL] executive meeting manager [DESC] executi...,"[-0.8965845, 0.21626818, 0.4062995, 0.25944906..."
1,Rehabilitation Technician-Outpatient Rehab-Fle...,Rehabilitation Technician-Outpatient Rehab-Fle...,Occupational Therapy Aides,[TTL] rehabilitation technician outpatient reh...,"[-0.3449566, -0.01683965, -0.33390927, -0.0695..."
2,Office/Bookkeeping Assistant,"Office/Bookkeeping Assistant\nSanta Barbara, C...","Office Clerks, General",[TTL] office bookkeeping assistant [DESC] offi...,"[-0.4091166, 1.4821604, 0.5247243, -0.10980953..."
3,Administrative Support Coordinator - VA - (REM...,Find Jobs Administrative Support Coordinator -...,"Secretaries and Administrative Assistants, Exc...",[TTL] administrative support coordinator va re...,"[-0.8193301, 0.6742436, 0.6131675, -0.277765, ..."
4,Receptionist/Administrative Assistant,Receptionist/Administrative Assistant Burgess ...,"Secretaries and Administrative Assistants, Exc...",[TTL] receptionist administrative assistant [D...,"[-0.3198364, 1.0830945, 0.35040185, -0.0614275..."


### Prepare the data for classifier

In [27]:
# Convert O*NETS to ids as prediction classes 
temp_train_df["ONET_ID"] = temp_train_df["ONET_NAME"].apply(lambda x:int(onet_to_id_dict[x]))
temp_train_df.head()

Unnamed: 0,TITLE_RAW,BODY,ONET_NAME,CLEANED_JOB,JOB_EMBD,ONET_ID
0,Executive Meeting Manager,Executive Meeting Manager Marriott La Jolla - ...,"Meeting, Convention, and Event Planners",[TTL] executive meeting manager [DESC] executi...,"[-0.8965845, 0.21626818, 0.4062995, 0.25944906...",613
1,Rehabilitation Technician-Outpatient Rehab-Fle...,Rehabilitation Technician-Outpatient Rehab-Fle...,Occupational Therapy Aides,[TTL] rehabilitation technician outpatient reh...,"[-0.3449566, -0.01683965, -0.33390927, -0.0695...",675
2,Office/Bookkeeping Assistant,"Office/Bookkeeping Assistant\nSanta Barbara, C...","Office Clerks, General",[TTL] office bookkeeping assistant [DESC] offi...,"[-0.4091166, 1.4821604, 0.5247243, -0.10980953...",678
3,Administrative Support Coordinator - VA - (REM...,Find Jobs Administrative Support Coordinator -...,"Secretaries and Administrative Assistants, Exc...",[TTL] administrative support coordinator va re...,"[-0.8193301, 0.6742436, 0.6131675, -0.277765, ...",855
4,Receptionist/Administrative Assistant,Receptionist/Administrative Assistant Burgess ...,"Secretaries and Administrative Assistants, Exc...",[TTL] receptionist administrative assistant [D...,"[-0.3198364, 1.0830945, 0.35040185, -0.0614275...",855


In [12]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics

import matplotlib.pyplot as plt
import datetime
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

#### Split the data into train and validation sets

In [29]:
X= temp_train_df["JOB_EMBD"].to_list()
y = temp_train_df["ONET_ID"].to_list()

In [79]:
xtrain,xtest,ytrain,ytest = train_test_split(X,y ,test_size = 0.15, random_state=42)

In [80]:
all_classes = list(range(0,len(onet_to_id_dict)))
print("No of classes : ", len(all_classes))

No of classes :  1017


## Train and save LR Model

In [81]:
model1 = LogisticRegression(solver = 'sag' , max_iter =20, verbose=1, random_state=42)
model1.fit(xtrain , ytrain)

max_iter reached after 740 seconds


In [83]:
with open("./classifier/LR/model_20_sag.pkl", "wb") as fOut:
    pickle.dump(model1, fOut, protocol=pickle.HIGHEST_PROTOCOL)

### Check performance on Validation data

In [84]:
y_pred = loaded_model.predict(xtest)

acc=metrics.accuracy_score(ytest,y_pred)
print("accuracy : ",acc)

# Convert available classes to all classes.
# This is due to the missing output classes from training. This skews/ommits the missing class ids and needs to be fixed.
all_classes = np.array(sorted(all_classes))
# Get the probabilities for learnt classes
prob = loaded_model.predict_proba(xtest)
# Create the result matrix, where all values are initially zero
new_prob = np.zeros((prob.shape[0], all_classes.size))
# Set the columns corresponding to clf.classes_
new_prob[:, all_classes.searchsorted(loaded_model.classes_)] = prob

# Calculate top_k accuracies
k_acc = metrics.top_k_accuracy_score(ytest,new_prob,k=1, labels=all_classes)
print("top 1 accuracy : ",k_acc)
k_acc = metrics.top_k_accuracy_score(ytest,new_prob,k=5, labels=all_classes)
print("top 5 accuracy : ",k_acc)
k_acc = metrics.top_k_accuracy_score(ytest,new_prob,k=10, labels=all_classes)
print("top 10 accuracy : ",k_acc)

accuracy :  0.7112597547380156
top 1 accuracy :  0.7112597547380156
top 5 accuracy :  0.8807134894091416
top 10 accuracy :  0.9141583054626533


In [78]:
# Sample Output of prediction
# data_id, Actual O*NET label, Predicted list of top 5 O*NETs
# data_id, Actual O*NET ID, Index/ Rank at which it is found in predicted list

for i in range(10):
    preds_idx = np.argsort(new_prob[i])[::-1][:5]
    print(i, id_to_onet_dict[str(ytest[i])], [id_to_onet_dict[str(idx)] for idx in preds_idx])
    if ytest[i] in preds_idx:
        print(i, ytest[i], np.where(preds_idx == ytest[i])[0])
        

0 Accountants and Auditors ['Accountants and Auditors', 'Sales Representatives, Wholesale and Manufacturing, Except Technical and Scientific Products', 'Information Technology Project Managers', 'Sales Managers', 'Public Relations Managers']
0 0 [0]
1 Occupational Health and Safety Specialists ['General and Operations Managers', 'Managers, All Other', 'Occupational Health and Safety Specialists', 'Training and Development Specialists', 'Emergency Management Directors']
1 672 [2]
2 Automotive Service Technicians and Mechanics ['Environmental Science and Protection Technicians, Including Health', 'Janitors and Cleaners, Except Maids and Housekeeping Cleaners', 'Heating, Air Conditioning, and Refrigeration Mechanics and Installers', 'Landscaping and Groundskeeping Workers', 'Maintenance and Repair Workers, General']
3 Human Resources Specialists ['Human Resources Specialists', 'Public Relations Specialists', 'Market Research Analysts and Marketing Specialists', 'Document Management Specia

### Performance of LR Model on test data

In [85]:
temp_test_df = pd.read_csv("../artifacts/data_ingestion/raw_train/train_data.csv")
temp_test_df = temp_test_df.drop(["ID", "POSTED","ONET"], axis=1)

# Load Precomputed test data embeddings
test_job_df = get_job_embed_df_from_df(job_embd_path="../data/processed/embeddings/job/custom_model_1/test/19394/embd.pkl")

temp_test_df["CLEANED_JOB"] = test_job_df["CLEANED_JOB"]
temp_test_df["JOB_EMBD"] = test_job_df["JOB_EMBD"]
id_to_onet_dict, onet_to_id_dict = get_onet_dicts()
temp_test_df["ONET_ID"] = temp_test_df["ONET_NAME"].apply(lambda x:int(onet_to_id_dict[x]))
temp_test_df.head()

Path to Job DF Given
Total Jobs available:  19394


Unnamed: 0,TITLE_RAW,BODY,ONET_NAME,CLEANED_JOB,JOB_EMBD,ONET_ID
0,Grocery Order Writer (Buyer / Inventory Replen...,Grocery Order Writer (Buyer / Inventory Replen...,"Purchasing Agents, Except Wholesale, Retail, a...",[TTL] grocery order writer buyer inventory rep...,"[-0.98015654, -0.2370125, -0.054709036, 0.5909...",794
1,Superintendent,Apply to this job. \nThink you're the perfect ...,"Education Administrators, Kindergarten through...",[TTL] superintendent [DESC] think perfect cand...,"[-0.7707803, 0.068227395, -0.3073499, 0.680984...",277
2,Software Developer IV,Software Developer IV\nJob Locations\nUS-NE-Om...,Software Developers,[TTL] software developer iv [DESC] software de...,"[-0.62012154, 0.47546214, 0.3680548, 0.3727675...",889
3,Auto Glass Technician,Auto Glass Technician Gerber Collision & Glass...,Automotive Service Technicians and Mechanics,[TTL] auto glass technician [DESC] auto glass ...,"[0.7393341, 0.5710749, 0.3004027, 0.2613186, 0...",76
4,Food and Beverage Operations Manager,Food and Beverage Operations Manager Wavetroni...,Food Service Managers,[TTL] food beverage operations manager [DESC] ...,"[-0.7466557, 0.78241795, -0.22036402, -0.59654...",404


In [86]:
# Split features and labels for test data
X= temp_test_df["JOB_EMBD"].to_list()
y = temp_test_df["ONET_ID"].to_list()

In [87]:
# model 1 metrics
with open("classifier/LR/model_20_sag.pkl", "rb") as fIn:
    loaded_model = pickle.load(fIn)
 
y_pred = loaded_model.predict(X)
#print(metrics.classification_report(ytest , y_pred))
acc=metrics.accuracy_score(y,y_pred)
print("accuracy : ",acc)
all_classes = np.array(sorted(all_classes))
# Get the probabilities for learnt classes
prob = loaded_model.predict_proba(X)
# Create the result matrix, where all values are initially zero
new_prob = np.zeros((prob.shape[0], all_classes.size))
# Set the columns corresponding to clf.classes_
new_prob[:, all_classes.searchsorted(loaded_model.classes_)] = prob
k_acc = metrics.top_k_accuracy_score(y,new_prob,k=1, labels=all_classes)
print("top 1 accuracy : ",k_acc)
k_acc = metrics.top_k_accuracy_score(y,new_prob,k=5, labels=all_classes)
print("top 5 accuracy : ",k_acc)
k_acc = metrics.top_k_accuracy_score(y,new_prob,k=10, labels=all_classes)
print("top 10 accuracy : ",k_acc)

accuracy :  0.6844900484685985
top 1 accuracy :  0.6844900484685985
top 5 accuracy :  0.8764050737341446
top 10 accuracy :  0.9127565226358668


#### The accuracies are much higher than a simple embedding based model and can be further improved with the advanced classifiers or by implementing neural network classifier head.

## Semantic Search Classifier/Recommender

To demonstrate use of embedding search as a classifier on an entire dataframe or on a single example case, following code is re-written with modular functions. 

This is the same process as described in embeddings research and testing.

### Define common functions used for result processing

In [2]:
def compare_docs_and_queries(job_embeddings, onet_embeddings, top_k=1):
    """ 
    This method performs semantic search operation that finds top_k ONET embeddings for the given job embeddings 
    using SBERT util's semantic_search method with cosine similarity. 
    
    Input: job_embeddings: ndarray --> List/array of input job embeddings.
           onet_embeddings: ndarray --> List/array of input ONET embeddings.
           top_k: int --> (Optional) top K results to return

    Output: hits: list[list[{"corpus_id", "score"}]] --> Returns a list with one entry for each query. 
                             Each entry is a list of dictionaries with the keys ‘corpus_id’ (id of ONETs) and ‘score’, 
                             sorted by decreasing cosine similarity scores
    """
    
    hits = util.semantic_search(job_embeddings, onet_embeddings, top_k=top_k)

    return hits

In [3]:
def process_hits(all_hits,job_df, onet_embd_df):
    """ 
    This method processes the hits returned by semantic search and maps them to corresponding inputs. 
    
    Input: all_hits: ndarray --> List/array of hits.
           job_df: pd.DataFrame --> Input Job Dataframe.
           onet_embd_df: pd.DataFrame --> Input O*NET Dataframe.

    Output: result_df: pd.DataFrame --> Output Dataframe with PRED_ONETS
    """
    result_df = job_df.copy()
    results = []
    
    for id in range(len(all_hits)):
        hits = all_hits[id]
        k_res = []
        for hit in hits:
            pred_hit = onet_embd_df["ONET_NAME"][hit['corpus_id']]
            #print("\t{:.3f}\t{}".format(hit['score'], pred_hit))
            k_res.append(pred_hit)
        results.append(k_res)
    result_df["PRED_ONETS"] = pd.Series([arr for arr in results])

    return result_df

In [4]:
def predict_frm_file(file_path=None, data_set=None, top_k=1, save_results=False, evaluate=False, model=None, model_ckpt=None, onet_embd_path=None):
    """
    Prediction function for pridicting over entire input file
    Returns: result_df: pd.DataFrame --> Output Dataframe with PRED_ONETS
    """

    # Get results for given dataframe
    from_pkl = False
    job_df = pd.DataFrame()
    if not file_path and data_set in ["train", "test"]:
        fname = str(data_set) + "_data.csv"
        file_path = os.part.join("../data/raw/",fname)
    
    if os.path.exists(file_path):
        if file_path.endswith(".csv"):
            # Take CSV file
            # load data into DF
            job_df = pd.read_csv(file_path)
            from_pkl = False
            job_df_file = None 
            # Make sure title and body present
            if not ("TITLE_RAW" in job_df.columns and "BODY" in job_df.columns):
                print("Incomplete DataFrame, please try again")
                return None
        elif file_path.endswith(".pkl"):
            #load df from pickle
            from_pkl = True
            job_df_file = file_path 
            job_df = None            
    #Check on sample data
    
    #job_df = job_df.head(2)    
    

    # Get Job_Embd DF
    job_df = get_job_embed_df_from_df(job_df=job_df, job_embd_path=job_df_file, model=model, model_ckpt=model_ckpt, save_embd=save_results)

    # Get ONET Embd DF
    onet_embd_df = get_onet_embeddings(model=model, model_ckpt=model_ckpt, onet_embd_path=onet_embd_path, save_embd=save_results)


    all_hits = compare_docs_and_queries(np.array(job_df["JOB_EMBD"].to_list()), np.array(onet_embd_df["ONET_EMBD"].to_list()), top_k=top_k)

    result_df = process_hits(all_hits,job_df, onet_embd_df)

    return result_df


In [None]:
def predict_frm_input_string(job_title=None, job_body=None, top_k=1, save_results=False, evaluate=False, model=None, model_ckpt=None, onet_embd_path=None):
    """
    Prediction function for predicting for a single entry
    Returns: result_df: pd.DataFrame --> Output Dataframe with PRED_ONETS
    """
    # Get results for given dataframe
    # Get Job_Embd DF
    if job_body == "" or job_body is None or job_title =="" or job_title is None:
        print("Invalid Input, Please provide both Job Title and Body")
        return None

    job_df = get_job_embd_df_frm_title_body(job_title, job_body)

    # Get ONET Embd DF
    onet_embd_df = get_onet_embeddings(model=model, model_ckpt=model_ckpt, onet_embd_path=onet_embd_path, save_embd=save_results)


    all_hits = compare_docs_and_queries(np.array(job_df["JOB_EMBD"].to_list()), np.array(onet_embd_df["ONET_EMBD"].to_list()), top_k=top_k)

    result_df = process_hits(all_hits,job_df, onet_embd_df)

    return result_df


In [None]:
def predict(job_title=None, job_body=None, top_k=1):
    """
    Prediction function for predicting for a single entry and used in the prediction() pipeline
    Returns: result: list --> Output list of predicted O*NETs
    """
    onet_embd_path="../data/processed/embeddings/onet_custom_model_1/embd.pkl"
    model_ckpt = "models/"
    result_df = predict_frm_input_string(job_title, job_body, model_ckpt=model_ckpt, top_k=top_k, onet_embd_path=onet_embd_path)

    if result_df is None:
        return None
    pred_onets = result_df["PRED_ONETS"].to_list()[0]
    return pred_onets

In [None]:
# Predicting output on 2 train examples without giving custom O*NET embeddings 
predict_frm_file(file_path="../artifacts/data_ingestion/raw_train/train_data.csv", model_ckpt="models/", top_k=3)

loading model from ckpt


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Total Jobs available:  2
loading model from ckpt


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Total ONETs available:  1017


Unnamed: 0,ID,POSTED,TITLE_RAW,BODY,ONET_NAME,ONET,CLEANED_JOB,JOB_EMBD,PRED_ONETS
0,3a9bc988d77e46507f6753429dd848a816d0b9b9,2023-05-03,Executive Meeting Manager,Executive Meeting Manager Marriott La Jolla - ...,"Meeting, Convention, and Event Planners",13-1121.00,[TTL] executive meeting manager [DESC] executi...,"[-0.8965845, 0.21626818, 0.4062995, 0.25944906...","[Meeting, Convention, and Event Planners, Lodg..."
1,eb3a017370d55577e892ff8207a640b7d7136f31,2023-05-03,Rehabilitation Technician-Outpatient Rehab-Fle...,Rehabilitation Technician-Outpatient Rehab-Fle...,Occupational Therapy Aides,31-2012.00,[TTL] rehabilitation technician outpatient reh...,"[-0.3449566, -0.016839795, -0.3339095, -0.0695...","[Rehabilitation Counselors, Occupational Thera..."


In [None]:
# Saving O*NET embeddings for reuse
temp_onet = get_onet_embeddings(model_ckpt="models/", save_embd=True, save_path="data/processed/embeddings/onet_custom_model_1/")

loading model from ckpt


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

lenght of docs:  1017
lenght of embd:  1017
Total ONETs available:  1017


In [None]:
# Predicting again with saved O*NETs
predict_frm_file(file_path="../artifacts/data_ingestion/raw_train/train_data.csv", model_ckpt="models/", top_k=3, onet_embd_path="data/processed/embeddings/onet_custom_model_1/embd.pkl")

loading model from ckpt


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Total Jobs available:  2
Total ONETs available:  1017


Unnamed: 0,ID,POSTED,TITLE_RAW,BODY,ONET_NAME,ONET,CLEANED_JOB,JOB_EMBD,PRED_ONETS
0,3a9bc988d77e46507f6753429dd848a816d0b9b9,2023-05-03,Executive Meeting Manager,Executive Meeting Manager Marriott La Jolla - ...,"Meeting, Convention, and Event Planners",13-1121.00,[TTL] executive meeting manager [DESC] executi...,"[-0.8965845, 0.21626818, 0.4062995, 0.25944906...","[Meeting, Convention, and Event Planners, Lodg..."
1,eb3a017370d55577e892ff8207a640b7d7136f31,2023-05-03,Rehabilitation Technician-Outpatient Rehab-Fle...,Rehabilitation Technician-Outpatient Rehab-Fle...,Occupational Therapy Aides,31-2012.00,[TTL] rehabilitation technician outpatient reh...,"[-0.3449566, -0.016839795, -0.3339095, -0.0695...","[Rehabilitation Counselors, Occupational Thera..."


#### Predicting for sample train dataframe with 5 examples.

In [None]:
temp_5_df = predict_frm_file(file_path="data/processed/embeddings/job/custom_model_1/train/5/embd.pkl", model_ckpt="models/", top_k=2, onet_embd_path="data/processed/embeddings/onet_custom_model_1/embd.pkl")
temp_5_df

Path to Job DF Given
Total Jobs available:  5
Total ONETs available:  1017


Unnamed: 0,TITLE_RAW,BODY,CLEANED_JOB,JOB_EMBD,PRED_ONETS
0,executive meeting manager,[DESC] executive meeting manager marriott la j...,[TTL] executive meeting manager [DESC] executi...,"[-0.8965845, 0.21626818, 0.4062995, 0.25944906...","[Meeting, Convention, and Event Planners, Lodg..."
1,rehabilitation technician outpatient rehab fle...,[DESC] rehabilitation technician outpatient re...,[TTL] rehabilitation technician outpatient reh...,"[-0.3449566, -0.016839795, -0.3339095, -0.0695...","[Rehabilitation Counselors, Occupational Thera..."
2,office bookkeeping assistant,[DESC] office bookkeeping assistant santa barb...,[TTL] office bookkeeping assistant [DESC] offi...,"[-0.4091167, 1.4821604, 0.52472407, -0.1098092...","[Office Clerks, General, Bookkeeping, Accounti..."
3,administrative support coordinator va remote,[DESC] find jobs administrative support coordi...,[TTL] administrative support coordinator va re...,"[-0.81933033, 0.6742432, 0.61316764, -0.277764...","[Office and Administrative Support Workers, Al..."
4,receptionist administrative assistant,[DESC] receptionist administrative assistant b...,[TTL] receptionist administrative assistant [D...,"[-0.3198364, 1.0830945, 0.35040185, -0.0614275...","[Secretaries and Administrative Assistants, Ex..."


following code is copied from 00_02_embeddings.ipynb

### Testing the Semantic search on Test Data

In [None]:
# Prepare Test DataLoader from saved dataset
test_examples = []
test_data = dataset['test']

n_examples = dataset['test'].num_rows
test_queries = {}
test_relevant_docs = {}

for i in range(n_examples):
  example = test_data[i]
  test_examples.append(InputExample(texts=[example['job_post'], example['onet_name']]))
  test_queries[str(i)] = example['job_post']
  test_relevant_docs[str(i)] = onet_to_id_dict[example["onet_name"]]
print(f"We have a {type(test_examples)} of length {len(test_examples)} containing {type(test_examples[0])}'s.")

In [None]:
test_queries_ids = []
for qid in test_queries:
    if qid in test_relevant_docs and len(test_relevant_docs[qid]) > 0:
        test_queries_ids.append(qid)

test_queries = [test_queries[qid] for qid in test_queries_ids]


In [None]:
test_q_embd = get_embd(test_queries, model=sbert_model)
test_hits = util.semantic_search(test_q_embd, np.array(onet_embd_df["ONET_EMBD"].to_list()), top_k=10)


loading model as it is


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Custom code to calculate Performance metrics

acc_at_k = [1,5,10]
num_hits_at_k = {k:0 for k in acc_at_k}

precision_recall_at_k = [1,5,10]
precisions_at_k = {k:[] for k in precision_recall_at_k}
recall_at_k = {k:[] for k in precision_recall_at_k}

mrr_at_k = [1,5,10]
MRR = {k:0 for k in mrr_at_k}


for test_qid in range(len(test_hits)):

    qid = test_queries_ids[test_qid]
    # Sort scores
    top_hits = test_hits[test_qid]
    query_relevant_docs = [int(test_relevant_docs[qid])]

    # Accuracy@k - We count the result correct, if at least one relevant doc is accross the top-k documents
    for k_val in acc_at_k:
        for hit in top_hits[0:k_val]:
            if hit['corpus_id'] in query_relevant_docs:
                num_hits_at_k[k_val] += 1
                break

    # Precision and Recall@k
    for k_val in precision_recall_at_k:
        num_correct = 0
        for hit in top_hits[0:k_val]:
            if hit['corpus_id'] in query_relevant_docs:
                num_correct += 1

        precisions_at_k[k_val].append(num_correct / k_val)
        recall_at_k[k_val].append(num_correct / len(query_relevant_docs))

    # MRR@k
    for k_val in mrr_at_k:
        for rank, hit in enumerate(top_hits[0:k_val]):
            if hit['corpus_id'] in query_relevant_docs:
                MRR[k_val] += 1.0 / (rank + 1)
                break

In [None]:
for k in num_hits_at_k:
    num_hits_at_k[k] /= len(test_queries)

for k in precisions_at_k:
    precisions_at_k[k] = np.mean(precisions_at_k[k])

for k in recall_at_k:
    recall_at_k[k] = np.mean(recall_at_k[k])

for k in MRR:
    MRR[k] /= len(test_queries)

op_dict =  {'accuracy@k': num_hits_at_k, 'precision@k': precisions_at_k, 'recall@k': recall_at_k, 'mrr@k': MRR}
for key,val in op_dict.items():
    print(key, val)



accuracy@k {1: 0.5129533678756477, 5: 0.7461139896373057, 10: 0.8134715025906736}
precision@k {1: 0.5129533678756477, 5: 0.14922279792746113, 10: 0.08134715025906736}
recall@k {1: 0.5129533678756477, 5: 0.7461139896373057, 10: 0.8134715025906736}
mrr@k {1: 0.5129533678756477, 5: 0.6051813471502591, 10: 0.6137326260383255}
