In [1]:
# Load PDF file

from langchain_community.document_loaders import PyMuPDFLoader

my_cv_path = "/Users/rsukumar/Downloads/Sukumar_RAGHAVAN_CV.pdf" 

loader = PyMuPDFLoader(my_cv_path)
pdf_loaded_data = loader.load()

In [3]:
# Store vector embeddings into the vector db for later use

import chromadb
from chromadb.utils.embedding_functions import create_langchain_embedding
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

# Using Huggingface embeddings
from langchain_huggingface import HuggingFaceEmbeddings
langchain_embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

# Using Ollama embeddings
# from langchain_ollama import OllamaEmbeddings
# langchain_embeddings = OllamaEmbeddings(model="llama3.2")

ef = create_langchain_embedding(langchain_embeddings)

# Keeping option to switch between Ephemeral / Persistent chroma client, if wanted
chroma_db_persist_path = "chroma-db-job-coverletter"
vector_db_collection_name = "job-cv-vector-db-collection"

# ephemeral_chroma_client = chromadb.EphemeralClient()
persistent_chroma_client = chromadb.PersistentClient(path=chroma_db_persist_path)

vector_db_client = persistent_chroma_client

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
chunked_docs = text_splitter.split_documents(pdf_loaded_data)

vector_store_from_client = Chroma(
    client=vector_db_client,
    collection_name=vector_db_collection_name,
    embedding_function=ef,
)

vector_store_from_client.add_documents(chunked_docs)

['d090bd95-53f5-4a64-9b85-6eab13d5d746',
 'e6a7347a-971d-4163-b66e-0deaee562c02',
 '6268b12a-45cd-4bee-930e-9c773e0836ab']

In [4]:
# validating the created index
from IPython.display import Markdown, display

from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings
from chromadb.utils.embedding_functions import create_langchain_embedding
from langchain_ollama import ChatOllama


# See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

chroma_db_persist_path = "chroma-db-job-coverletter"
vector_db_collection_name = "job-cv-vector-db-collection"
vector_store_client = chromadb.PersistentClient(path=chroma_db_persist_path)
langchain_embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
ef = create_langchain_embedding(langchain_embeddings)

vector_store_from_client = Chroma(
    client=vector_store_client,
    collection_name=vector_db_collection_name,
    embedding_function=ef,
)

llm_instance = ChatOllama(model="llama3.2", temperature=0.7)

qa_chain = (
    {
        "context": vector_store_from_client.as_retriever() | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm_instance
    | StrOutputParser()
)

query = "What is the document all about?"
response = qa_chain.invoke(query)

display(Markdown(f"<b>{response}</b>"))



<b>The document appears to be the professional summary of Sukumar Raghavan, a Machine Learning Engineer, detailing his experience in developing and deploying cutting-edge AI solutions, including ContagionNET, which detects viral load of COVID-19 using computer vision pipelines and deep learning models.</b>

In [5]:
from IPython.display import Markdown, display

def my_display(display_text):
  display(Markdown(f"<b>{display_text}</b>"))

In [6]:
# addtional context enrichment input using the job description
job_description = """
"""

In [7]:
from operator import itemgetter

from langchain_chroma import Chroma
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from chromadb.utils.embedding_functions import create_langchain_embedding

def preprocess_job_description(job_description):
    """Extract key requirements and skills from job description"""
    # You could use an LLM to extract key requirements
    extraction_prompt = """
    Analyze the following job description and extract:
    1. Required technical skills
    2. Required soft skills
    3. Main responsibilities
    4. Key technologies mentioned
    
    Job Description:
    {job_description}
    
    Provide a structured summary of the key requirements.
    """

    llm = ChatOllama(model="llama3.2", temperature=0)
    requirements = llm.invoke(extraction_prompt.format(job_description=job_description))

    return requirements


def get_relevant_experience(vector_db, job_description):
    # First, extract key requirements from job description
    job_query = f"Find experience and skills related to: {job_description}"

    # Get relevant experience matching job requirements
    relevant_docs = vector_db.similarity_search(
        job_query,
        k=4,  # Increase number of relevant chunks
    )

    # Get general professional summary
    summary_docs = vector_db.similarity_search(
        "professional summary and key achievements", k=2
    )

    return relevant_docs + summary_docs


def build_context(retrieved_docs, job_description):
    cv_context = "\n".join([doc.page_content for doc in retrieved_docs])

    return {
        "context": cv_context,
        "job_description": job_description,
        "query": "Write a tailored cover letter for this specific job position.",
    }

# Generate cover letter
# Use in main flow
def generate_tailored_cover_letter(vector_store_client, job_description):
    # Preprocess job description to extract key requirements
    key_requirements = preprocess_job_description(job_description)

    lc_embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
    ef = create_langchain_embedding(lc_embeddings)
    vector_db = Chroma(
        client=vector_store_client,
        collection_name=vector_db_collection_name,
        embedding_function=ef,
    )

    # Use these requirements to guide relevant experience retrieval
    relevant_docs = get_relevant_experience(vector_db, key_requirements)

    # Generate cover letter with explicit focus on matching requirements
    input_context = build_context(relevant_docs, job_description)

    # Add to the chain configuration
    chain_config = {
        "document_separator": "\n\n",
        "max_tokens_limit": 2000,
        "return_source_documents": True,
        "token_overlap": 200,
    }

    # Adjust LLM parameters
    llm_config = {
        "temperature": 0.7,
        "top_p": 0.9,
        "presence_penalty": 0.6,
        "frequency_penalty": 0.6,
    }

    # Use local Ollama model to generate the cover letter.
    # llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", **llm_config)
    llm = ChatOllama(model="llama3.2", **llm_config)
    retriever = vector_db.as_retriever(search_kwargs={"k": 4},
            search_type="mmr",  # Use Maximum Marginal Relevance for better diversity
            **chain_config)

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    # Prompt template
    prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer just say "I don't know", don't try to make up an answer.
    Context: {context}

    Job Description: {job_description}

    Question: {query}

    Answer:"""
    
    QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt_template)

    qa_chain = (
        {
            "context": itemgetter('retrieved_context') | retriever | format_docs,
            "query": itemgetter('query'),
            "job_description": itemgetter('job_description')
        }
        | QA_CHAIN_PROMPT
        | llm
        | StrOutputParser()
    )

    # query
    question = "Write a cover letter for me to apply for the job described in the given job description using the given context."

    response = qa_chain.invoke({
        "retrieved_context": input_context['context'],
        "job_description": input_context['job_description'],
        "query": question,
    })

    return response

In [None]:
# %debug

import chromadb

chroma_db_persist_path = "chroma-db-job-coverletter"
vector_db_collection_name = "job-cv-vector-db-collection"

persistent_chroma_client = chromadb.PersistentClient(path=chroma_db_persist_path)

generated_cover_letter = generate_tailored_cover_letter(persistent_chroma_client, job_description)

my_display(generated_cover_letter)

Number of requested results 20 is greater than number of elements in index 6, updating n_results = 6
