### Authenticate your notebook environment (Colab only)

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [None]:
import sys

if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

### Import Libraries

In [16]:
import pandas as pd
import os
from model import ResumeFields, Skill, Experience, Qualification, PersonalityTrait
import pickle
from typing import List, Dict
from rich import print as rprint
from tqdm import tqdm
import numpy as np

import asyncio
import vertexai
from langchain_google_vertexai import VertexAIEmbeddings

### Customized weights for assessment criteria

In [3]:
key_skills_wght = 0.4
exp_wght = 0.3
qualifications_wght = 0.2
personality_traits_wght = 0.1

### Prepare documents

In [4]:
skill_delimiter = "\t"

In [5]:
def create_document(page_content, metadata):
    document = Document(
        page_content=page_content,
        metadata=metadata
    ) 
    return document

def create_skill_document(skills: List[Skill], parent_doc_id):
    page_content = skill_delimiter.join([skill.skill.lower() for skill in skills])
    # return create_document(page_content, {"doc_id": parent_doc_id})
    return page_content

def create_experience_document(experiences: List[Experience], parent_doc_id):
    page_content = "\n".join([experience.experience for experience in experiences])
    return page_content

def create_qualification_document(qualifications: List[Qualification], parent_doc_id):
    page_content = "\n".join([qualification.degree for qualification in qualifications])
    return page_content

def create_personality_trait_document(personality_traits: List[PersonalityTrait], parent_doc_id):
    page_content = "\n".join([pt.trait for pt in personality_traits])
    return page_content

##### Candidate resume docs

In [6]:
candidates_ds_path = "./outputs/candidates"
candidate_resumes = {}

for file in os.listdir(candidates_ds_path):
    if ".pkl" in file: 
        file_path = f"{candidates_ds_path}/{file}"
        doc_id = file.split(".")[0]
        with open(file_path, 'rb') as f:
            resume_fields = pickle.load(f)
            if resume_fields is not None:
                candidate_resumes[doc_id] = {}
                candidate_resumes[doc_id]["skills"] = create_skill_document(resume_fields.skills, doc_id)
                if len(resume_fields.experiences) > 0:
                    candidate_resumes[doc_id]["exp"] = create_experience_document(resume_fields.experiences, doc_id)
                if len(resume_fields.qualifications) > 0:
                    candidate_resumes[doc_id]["qualification"] = create_qualification_document(resume_fields.qualifications, doc_id)
                if len(resume_fields.personality_traits) > 0:
                    candidate_resumes[doc_id]["trait"] = create_personality_trait_document(resume_fields.personality_traits, doc_id)

candidate_resumes_df = pd.DataFrame(candidate_resumes).T
candidate_resumes_df = candidate_resumes_df.fillna("")

In [7]:
candidate_resumes_df.head()

Unnamed: 0,skills,exp,qualification,trait
3019,electronic & mechanical technology\tmaintenanc...,7 years of experience in Engineering\n7 years ...,MBA\nBachelor of Science\nAssociate of Science,
9370,business development & sales leadership\tsales...,4 years experience in vendor relationships\n4 ...,Bachelor of Arts,
703,microsoft office\tword\texcel\toutlook\tsharep...,2+ years of experience in Data entry\n2+ years...,B.S. Degree,
2636,data science\tdata analysis\tdeep learning\tma...,0 years in Recommender System,BCA\nMCA,
7790,statistical modeling\tnatural language process...,6 years of experience in Graphic Visualization...,B.Tech in Information Technology,


##### Target job description doc

In [8]:
# Target document
trgt_file_path = "./outputs/linkedin_cv.pkl"
trgt_query_docs = {}

if os.path.exists(trgt_file_path):
    doc_id = trgt_file_path.split(".")[0]
    with open(file_path, 'rb') as f:
        resume_fields = pickle.load(f)
        if resume_fields is not None:
            trgt_query_docs["skills"] = skill_delimiter.join([skill.skill.lower() for skill in resume_fields.skills])
            trgt_query_docs["exp"] = "\n".join([experience.experience for experience in resume_fields.experiences])
            trgt_query_docs["qualification"] = "\n".join([qualification.degree for qualification in resume_fields.qualifications])
            trgt_query_docs["trait"] = "\n".join([pt.trait for pt in resume_fields.personality_traits])
            

trgt_query_docs

{'skills': 'software development\tmachine learning\tdeep learning\trisk assessment\trequirement gathering\tapplication support\tjavascript\tpython\tdocker\thtml\thive\tcss\tc\tc++',
 'exp': '7 years of experience in Unified Payment Interface\n7 years of experience in Risk Prediction\n7 years of experience in Spark\n7 years of experience in PySpark\n7 years of experience in Big Data',
 'qualification': 'B.Tech in Electronics/Telecommunication from Birla Institute of Technology (BIT), Ranchi',
 'trait': ''}

### Define embeddings

#### Sparse emebddings

Instantiate TfidfVectorizer 

In [None]:
# Utility function to transform text into a TF-IDF Sparse Vector
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TFIDF Vectorizer (This is usually done on a very large corpus of data to make sure that word statistics generalize well on new data)
def word_tokenize(page_content):
    return page_content.split(skill_delimiter) 
    
vectorizer = TfidfVectorizer(tokenizer=word_tokenize, max_features = 1000)
texts = candidate_resumes_df["skills"].values
vectorizer.fit(texts)

Calculate similarity using sparse embedding vector

In [10]:
def get_sparse_embedding_sim(tfidf_vectorizer, target, text):
    trgt_vector = tfidf_vectorizer.transform([target]).toarray()
    text_vector = tfidf_vectorizer.transform([text]).toarray()
    sim = text_vector @ trgt_vector.T
    return sim[0,0]

candidate_resumes_df.loc[:, "skills_sparse_sim"] = candidate_resumes_df.skills.apply(lambda x:get_sparse_embedding_sim(vectorizer
                                                                                                            , target=trgt_query_docs["skills"]
                                                                                                           , text=x))

#### Dense Embeddings

Initialize verex ai module

In [None]:
vertexai.init(project=PROJECT_ID, location=LOCATION)
embedding_model = VertexAIEmbeddings(model_name="text-embedding-005")

DEfine utility functions for embeddings and calculate similarity using dense embeddings

In [13]:
req_per_min = 1000
time_per_req = 60 // req_per_min
async def get_embeddings(doc:Dict):
    docs_keys = list(doc.keys())
    docs_vals = list(doc.values())
    dummy_embedding = [0] * 768
    embeddings = {}
    try:
        embeddings = embedding_model.embed_documents([
            text if text != "" else "dummy"
            for text in docs_vals
        ])
        
        embeddings = {
            docs_keys[index]: result if docs_vals[index] != "" else dummy_embedding
            for index, result in enumerate(embeddings)
        }
        
    except Exception as ex:
        print(index, ex)

    return embeddings

async def get_embeddings_sim(doc: Dict, trgt_embeddings: Dict):
    assert doc.keys() == trgt_embeddings.keys(), "Both the dict should have same keys"

    src_embeddings = await get_embeddings(doc)
    src_embeddings_matrix = np.array(list(src_embeddings.values()))
    src_embeddings_matrix = src_embeddings_matrix[:,np.newaxis,:]

    trgt_embeddings_matrix = np.array(list(trgt_embeddings.values()))
    trgt_embeddings_matrix = np.transpose(trgt_embeddings_matrix[:,np.newaxis,:],(0,2,1))

    # print(src_embeddings_matrix.shape, trgt_embeddings_matrix.shape)

    sim = np.einsum('bmn, bnk->bmk', src_embeddings_matrix, trgt_embeddings_matrix)

    result_keys = [key+"_sim" for key in doc.keys()]
    return dict(zip(result_keys, sim.squeeze()))
    

Calcualte target embeddings

In [14]:
# Calculate target embeddings 
target_dense_embeddings = await get_embeddings(trgt_query_docs)

Validate embeddings module

In [17]:
sim = await get_embeddings_sim(trgt_query_docs, target_dense_embeddings)
assert np.isclose(sim["trait_sim"], 0.0), "trait should have 0 similarity"
assert np.isclose(sim["exp_sim"], 1), "exp should have 1 similarity"
assert np.isclose(sim["qualification_sim"], 1), "qualification should have 1 similarity"

Calculate source embeddings (<b>This will take close to 10 mins</b>)

In [18]:
bg_tasks={}
req_cols = ["skills","exp","qualification","trait"]
for index, row in tqdm(candidate_resumes_df.iterrows()):
    bg_tasks[index] = asyncio.create_task(get_embeddings_sim(row[req_cols].to_dict(), target_dense_embeddings))
    await asyncio.sleep(time_per_req)

results = await asyncio.gather(*bg_tasks.values())
exp_emb = dict(zip(bg_tasks.keys(), results))

9543it [12:08, 13.11it/s]


Calcualte score and rank based on score

In [36]:
candidate_evaluation_df = candidate_resumes_df.join(pd.DataFrame(exp_emb).T)
candidate_evaluation_df.loc[:,"score"] = candidate_evaluation_df.apply(lambda x:
                                                                       x.skills_sparse_sim * key_skills_wght+
                                                                       x.exp_sim * exp_wght +
                                                                       x.qualification_sim * qualifications_wght + 
                                                                       x.trait_sim * personality_traits_wght
                                                                      ,axis = 1)

candidate_evaluation_df.loc[:, "rank"] = candidate_evaluation_df["score"].rank(method='min', na_option='bottom',ascending=False)

#### Top 3 Candidates
- Reason being there CV matched exactly with job description. Looks like they have updated it based on the resume

In [38]:
top_candidates = candidate_evaluation_df[candidate_evaluation_df["rank"] <4.0]
top_candidates

Unnamed: 0,skills,exp,qualification,trait,skills_sparse_sim,skills_sim,exp_sim,qualification_sim,trait_sim,score,rank
27,software development\tmachine learning\tdeep l...,7 years of experience in Unified Payment Inter...,B.Tech in Electronics/Telecommunication,,1.0,0.999998,0.999998,0.903071,0.0,0.880614,3.0
8877,software development\tmachine learning\tdeep l...,7 years in Unified Payment Interface\n7 years ...,B.Tech in Electronics/Telecommunication from B...,,1.0,0.95037,0.989085,0.999999,0.0,0.896725,2.0
778,software development\tmachine learning\tdeep l...,7 years of experience in Unified Payment Inter...,B.Tech in Electronics/Telecommunication from B...,,1.0,0.999998,0.999998,0.999999,0.0,0.899999,1.0


In [54]:
index = "27"

rprint(top_candidates.loc[index].skills)
rprint(top_candidates.loc[index].exp)
rprint(top_candidates.loc[index].qualification)

rprint("-"*100)

rprint(trgt_query_docs["skills"])
rprint(trgt_query_docs["exp"])
rprint(trgt_query_docs["qualification"])

#### Unit tests

In [20]:
import unittest

class TestNotebook(unittest.TestCase):
    
    async def test_trgt_sim(self):
        sim = await get_embeddings_sim(trgt_query_docs, target_dense_embeddings)
        assert np.isclose(sim["trait_sim"], 0.0), "trait should have 0 similarity"
        assert np.isclose(sim["exp_sim"], 1), "exp should have 1 similarity"
        assert np.isclose(sim["qualification_sim"], 1), "qualification should have 1 similarity"

    async def test_src_sim(self):
        src_row = candidate_resumes_df.iloc[1]
        src_embeddings = await get_embeddings(src_row)
        sim = await get_embeddings_sim(src_row, src_embeddings)
        assert np.isclose(sim["trait_sim"], 0.0), "trait should have 0 similarity"
        assert np.isclose(sim["exp_sim"], 1), "exp should have 1 similarity"
        assert np.isclose(sim["qualification_sim"], 1), "qualification should have 1 similarity"
        

unittest.main(argv=[''], verbosity=2, exit=False)

  method()
ok
  method()
ok

----------------------------------------------------------------------
Ran 2 tests in 0.004s

OK


<unittest.main.TestProgram at 0x7f8155717a30>