In [2]:
!pip install -q langchain-community
!pip install --upgrade --quiet  rank_bm25 > /dev/null

In [28]:
import pandas as pd
import os
from model import ResumeFields, Skill, Experience, Qualification, PersonalityTrait
import pickle
from typing import List
from rich import print as rprint

from langchain_core.documents import Document

### Define weights for assessment criteria

In [67]:
key_skills_wght = 0.4
exp_wght = 0.3
qualifications_wght = 0.2
personality_traits_wght = 0.1

### Define documents

In [5]:
# from rich import print as rprint
# index = 5600
# with open(f'./outputs/candidates/{index}.pkl', 'rb') as f:
#     tmp = pickle.load(f)
# rprint(tmp.json())

In [19]:
skill_delimiter = "\t"

def create_document(page_content, metadata):
    document = Document(
        page_content=page_content,
        metadata=metadata
    ) 
    return document

def create_skill_document(skills: List[Skill], parent_doc_id):
    page_content = skill_delimiter.join([skill.skill.lower() for skill in skills])
    return create_document(page_content, {"doc_id": parent_doc_id})

def create_experience_document(experiences: List[Experience], parent_doc_id):
    page_content = "\n".join([experience.experience for experience in experiences])
    return create_document(page_content, {"doc_id": parent_doc_id})

def create_qualification_document(qualifications: List[Qualification], parent_doc_id):
    page_content = "\n".join([qualification.degree for qualification in qualifications])
    return create_document(page_content, {"doc_id": parent_doc_id})

def create_personality_trait_document(personality_traits: List[PersonalityTrait], parent_doc_id):
    page_content = "\n".join([pt.trait for pt in personality_traits])
    return create_document(page_content, {"doc_id": parent_doc_id})

In [53]:
candidates_ds_path = "./outputs/candidates"
skill_documents = []
experience_documents = []
qualification_documents = []
personality_trait_documents = []

for file in os.listdir(candidates_ds_path):
    if ".pkl" in file: 
        file_path = f"{candidates_ds_path}/{file}"
        doc_id = file.split(".")[0]
        with open(file_path, 'rb') as f:
            resume_fields = pickle.load(f)
            if resume_fields is not None:
                skill_documents.append(create_skill_document(resume_fields.skills, doc_id))
                if len(resume_fields.experiences) > 0:
                    experience_documents.append(create_experience_document(resume_fields.experiences, doc_id))
                if len(resume_fields.qualifications) > 0:
                    qualification_documents.append(create_qualification_document(resume_fields.qualifications, doc_id))
                if len(resume_fields.personality_traits) > 0:
                    personality_trait_documents.append(create_personality_trait_document(resume_fields.personality_traits, doc_id))

In [57]:
# rprint(personality_trait_documents[:2])

In [37]:
# Target document
trgt_file_path = "./outputs/linkedin_cv.pkl"
trgt_query_docs = []

if os.path.exists(trgt_file_path):
    doc_id = trgt_file_path.split(".")[0]
    with open(file_path, 'rb') as f:
        resume_fields = pickle.load(f)
        if resume_fields is not None:
            trgt_query_docs.extend([
                skill_delimiter.join([skill.skill.lower() for skill in resume_fields.skills])
                ,"\n".join([experience.experience for experience in resume_fields.experiences])
                ,"\n".join([qualification.degree for qualification in resume_fields.qualifications])
                ,"\n".join([pt.trait for pt in resume_fields.personality_traits])
            ])

trgt_query = "\n".join(trgt_query_docs)

In [38]:
print(trgt_query)

software development	machine learning	deep learning	risk assessment	requirement gathering	application support	javascript	python	docker	html	hive	css	c	c++
7 years of experience in Unified Payment Interface
7 years of experience in Risk Prediction
7 years of experience in Spark
7 years of experience in PySpark
7 years of experience in Big Data
B.Tech in Electronics/Telecommunication from Birla Institute of Technology (BIT), Ranchi



### Define retrievers

In [21]:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS

In [42]:
import vertexai
from langchain_google_vertexai import VertexAIEmbeddings

PROJECT_ID = "sidproj-354203"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

vertexai.init(project=PROJECT_ID, location=LOCATION)
embedding_model = VertexAIEmbeddings(model_name="text-embedding-005")

In [22]:
def word_tokenize(page_content):
    return page_content.split(skill_delimiter) 

#### Skills Retriever

In [None]:
skills_retriever = BM25Retriever.from_documents(
    skill_documents,
    k=3,
    preprocess_func=word_tokenize,
)

#### Personality trait retriever

In [61]:
trait_vectorstore = FAISS.from_documents(
    personality_trait_documents, embedding_model
)
trait_retriever = trait_vectorstore.as_retriever(search_kwargs={"k": 3})

#### Qualification Retriever

In [60]:
qualification_vectorstore = FAISS.from_documents(
    qualification_documents, embedding_model
)
qualification_retriever = qualification_vectorstore.as_retriever(search_kwargs={"k": 3})

#### Experience Retriever

In [59]:
exp_vectorstore = FAISS.from_documents(
    experience_documents, embedding_model
)
exp_retriever = exp_vectorstore.as_retriever(search_kwargs={"k": 3})

#### Ensemble Retreiver

In [68]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[skills_retriever, exp_retriever, qualification_retriever, trait_retriever]
    , weights=[key_skills_wght
               , exp_wght
              , qualifications_wght
              , personality_traits_wght]
)

In [69]:
result = ensemble_retriever.invoke(trgt_query)
result

[Document(metadata={'doc_id': '778'}, page_content='software development\tmachine learning\tdeep learning\trisk assessment\trequirement gathering\tapplication support\tjavascript\tpython\tdocker\thtml\thive\tcss\tc\tc++'),
 Document(id='ba600247-ccd9-4951-b880-a3076b4c38ec', metadata={'doc_id': '27'}, page_content='7 years of experience in Unified Payment Interface\n7 years of experience in Risk Prediction\n7 years of experience in Spark\n7 years of experience in PySpark\n7 years of experience in Big Data'),
 Document(id='70c894fc-7aa9-4e6e-8220-1abc5c235d15', metadata={'doc_id': '8739'}, page_content='B.Tech(Computers)\nMasters (Data Science)'),
 Document(id='285e0e43-bc7e-4d3d-b96c-b13d6d7ec201', metadata={'doc_id': '7805'}, page_content='M.Tech from Indian Institute of Technology\nB.E in Electrical Engineering from Devi Ahilya University\nCourse in Machine learning from Coursera - Stanford'),
 Document(id='946d0bfa-a678-4d19-8a7b-86a7e4d4ca6b', metadata={'doc_id': '5176'}, page_cont

In [65]:
result = exp_vectorstore.similarity_search(trgt_query)
result

[Document(id='ba600247-ccd9-4951-b880-a3076b4c38ec', metadata={'doc_id': '27'}, page_content='7 years of experience in Unified Payment Interface\n7 years of experience in Risk Prediction\n7 years of experience in Spark\n7 years of experience in PySpark\n7 years of experience in Big Data'),
 Document(id='0b4acc47-631d-4dd2-adfc-c8659e26c464', metadata={'doc_id': '5705'}, page_content='7 years of experience in Unified Payment Interface\n7 years of experience in Risk Prediction\n7 years of experience in Spark\n7 years of experience in PySpark\n7 years of experience in Big Data'),
 Document(id='cff61925-43c6-4259-807f-cb27b89c8837', metadata={'doc_id': '9038'}, page_content='7 years of experience in Unified Payment Interface\n7 years of experience in Risk Prediction\n7 years of experience in Spark\n7 years of experience in PySpark\n7 years of experience in Big Data'),
 Document(id='4be9cbea-2f7e-4203-aa6d-734c56d47071', metadata={'doc_id': '3866'}, page_content='7 years of experience in Un

In [None]:
skills_retriever