In [None]:
# Load PDF file

import os
from pathlib import Path

import pymupdf4llm

# Location of the CV to index. Set the CV_PDF_PATH environment variable to point
# at a different file without editing the notebook; the fallback keeps the
# original location so existing runs behave the same.
my_cv_path = os.environ.get(
    "CV_PDF_PATH", "/Users/rsukumar/Downloads/Sukumar_RAGHAVAN_CV.pdf"
)

# Fail early with an actionable message instead of a reader-internal error.
if not Path(my_cv_path).is_file():
    raise FileNotFoundError(
        f"CV PDF not found at {my_cv_path!r}; set CV_PDF_PATH to the correct file."
    )

# Convert the PDF into LlamaIndex documents (markdown text) for indexing below.
llama_reader = pymupdf4llm.LlamaMarkdownReader()
llama_docs = llama_reader.load_data(my_cv_path)

In [4]:
# Store vector embeddings into the vector db for later use

import chromadb
from chromadb.utils.embedding_functions import create_langchain_embedding

from langchain_chroma import Chroma
# from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import ChatOllama, OllamaEmbeddings
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

# Using Huggingface embeddings
# langchain_embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

# Using Ollama embeddings
langchain_embeddings = OllamaEmbeddings(model="llama3.2")
# Two adapters around the same LangChain embedding object:
# `ef` is handed to Chroma, `embed_model` is handed to LlamaIndex.
ef = create_langchain_embedding(langchain_embeddings)
embed_model = LangchainEmbedding(langchain_embeddings)

# Keeping option to switch between Ephemeral / Persistent chroma client, if wanted
chroma_db_persist_path = "chroma-db-job-coverletter"
vector_db_collection_name = "job-cv-vector-db-collection"

# ephemeral_chroma_client = chromadb.EphemeralClient()
persistent_chroma_client = chromadb.PersistentClient(path=chroma_db_persist_path)
vector_db_client = persistent_chroma_client

chroma_collection = vector_db_client.get_or_create_collection(
    vector_db_collection_name, embedding_function=ef
)

vector_store_from_client = Chroma(
    client=vector_db_client,
    collection_name=vector_db_collection_name,
    embedding_function=ef,
)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The collection lives on disk, so re-running this cell used to embed the CV
# again and append duplicate chunks on every run. Only ingest when the
# collection is empty; otherwise attach to what is already stored.
# To force a rebuild (e.g. after updating the CV), delete the collection or the
# persist directory and re-run this cell.
if chroma_collection.count() == 0:
    index = VectorStoreIndex.from_documents(
        llama_docs,
        storage_context=storage_context,
        embed_model=embed_model,
    )
else:
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )

In [None]:
# validating the index
from IPython.display import Markdown, display

# Local Ollama chat model (temperature=0) used for this sanity-check query.
llm_instance = ChatOllama(model="llama3.2", temperature=0)

# Query Data
query_engine = index.as_query_engine(streaming=True, llm=llm_instance)
response = query_engine.query("What is the document all about?")

# Interpolating the response into the f-string converts it to text, which is
# then rendered in bold in the cell output.
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

In [8]:
# context enrichment input
# Paste the full text of the target job posting between the triple quotes below.
# This value is passed to generate_tailored_cover_letter() in the main-flow cell;
# as written it contains only a newline, i.e. no job description at all.
add_job_description_context = """
"""

In [9]:
def preprocess_job_description(job_description):
    """Ask the local Ollama model for a structured summary of a job posting.

    Args:
        job_description: Raw text of the job posting; it is substituted into
            the extraction prompt below.

    Returns:
        The object returned by ``ChatOllama.invoke`` for the filled-in prompt.
        It is returned as-is (not unwrapped to plain text).
    """
    # Prompt asking for skills, responsibilities and technologies.
    template = """
    Analyze the following job description and extract:
    1. Required technical skills
    2. Required soft skills
    3. Main responsibilities
    4. Key technologies mentioned
    
    Job Description:
    {job_description}
    
    Provide a structured summary of the key requirements.
    """

    extractor = ChatOllama(model="llama3.2", temperature=0)
    filled_prompt = template.format(job_description=job_description)

    return extractor.invoke(filled_prompt)


def get_relevant_experience(vector_db, job_description):
    """Retrieve CV chunks relevant to a job description plus a general summary.

    Two similarity searches are run against ``vector_db``: one targeted at the
    job requirements (k=4) and one for the professional summary (k=2). The
    results are concatenated in that order with duplicate chunks removed.

    Args:
        vector_db: Object exposing ``similarity_search(query, k=...)`` that
            returns documents with a ``page_content`` attribute.
        job_description: The requirements text, either as a plain string or as
            a message-like object carrying the text in ``.content`` (which is
            what ``preprocess_job_description`` hands back).

    Returns:
        List of unique documents, job-specific hits first.
    """
    # Interpolating a message object directly would embed its repr (metadata
    # and all) into the search query, so use its text when one is available.
    requirements_text = getattr(job_description, "content", job_description)
    job_query = f"Find experience and skills related to: {requirements_text}"

    # Get relevant experience matching job requirements
    relevant_docs = vector_db.similarity_search(
        job_query,
        k=4,  # Increase number of relevant chunks
    )

    # Get general professional summary
    summary_docs = vector_db.similarity_search(
        "professional summary and key achievements", k=2
    )

    # Both searches hit the same collection, so the same chunk can come back
    # twice; keep the first occurrence only to avoid repeating it in the context.
    seen_texts = set()
    unique_docs = []
    for doc in relevant_docs + summary_docs:
        if doc.page_content in seen_texts:
            continue
        seen_texts.add(doc.page_content)
        unique_docs.append(doc)

    return unique_docs


def build_context(retrieved_docs, job_description):
    """Bundle retrieved CV text and the job description into one mapping.

    Args:
        retrieved_docs: Iterable of documents exposing ``page_content``.
        job_description: Job posting text, stored unchanged.

    Returns:
        Dict with ``context`` (page contents joined by newlines),
        ``job_description`` and a fixed ``query`` instruction.
    """
    page_texts = (doc.page_content for doc in retrieved_docs)
    payload = dict(
        context="\n".join(page_texts),
        job_description=job_description,
        query="Write a tailored cover letter for this specific job position.",
    )
    return payload


# Generate cover letter
# Use in main flow
def generate_tailored_cover_letter(vector_db, job_description):
    """Draft a cover letter from the CV chunks that best match a job posting.

    Steps: summarise the posting's requirements, retrieve matching CV chunks,
    then ask the chain's document-combining step to write the letter from those
    chunks and the job description.

    Relies on the module-level ``qa_chain`` (a ``RetrievalQA`` built in the
    main-flow cell) being defined before this function is called.

    Args:
        vector_db: Vector store queried via ``get_relevant_experience``.
        job_description: Full text of the job posting.

    Returns:
        The generated cover letter text.
    """
    # Preprocess job description to extract key requirements
    key_requirements = preprocess_job_description(job_description)
    # The chat model hands back a message object; use its text for retrieval so
    # the search query is not polluted with the object's repr.
    requirements_text = getattr(key_requirements, "content", key_requirements)

    # Use these requirements to guide relevant experience retrieval
    relevant_docs = get_relevant_experience(vector_db, requirements_text)

    # RetrievalQA only reads the "query" key, runs its own retrieval and fills
    # just {context} and {question} in the prompt. Invoking it with the
    # build_context() mapping therefore dropped both the curated documents and
    # the job description. Put the job description into the question and feed
    # the curated documents straight to the document-combining chain instead.
    input_context = build_context(relevant_docs, job_description)
    question = (
        f"{input_context['query']}\n\n"
        f"Job description:\n{input_context['job_description']}"
    )

    result = qa_chain.combine_documents_chain.invoke(
        {"input_documents": relevant_docs, "question": question}
    )

    return result["output_text"]

In [None]:
# main flow

from langchain.prompts import PromptTemplate
from langchain_ollama import ChatOllama, OllamaEmbeddings
from chromadb.utils.embedding_functions import create_langchain_embedding
from IPython.display import Markdown, display
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
# from langchain_huggingface import HuggingFaceEmbeddings

# Prompt template
# The chain fills {context} with the retrieved CV chunks and {question} with the
# query text; these are the only two placeholders in the template.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer just say "I don't know", don't try to make up an answer.
Context: {context}

Question: {question}

Answer as a jobseeker:"""

# query
# NOTE(review): `question` is not referenced again in the visible cells; the
# query text actually sent to the chain comes from build_context().
question = "Write a cover letter for applying for the job role as described in the context matching the job description exactly in the additional context."

# Add to the chain configuration
# Only `document_separator` and `return_source_documents` are applied below.
# `max_tokens_limit` and `token_overlap` are kept for reference but are not
# passed anywhere, since RetrievalQA with the "stuff" chain has no such options.
chain_config = {
    "document_separator": "\n\n",
    "max_tokens_limit": 2000,
    "return_source_documents": True,
    "token_overlap": 200,
}

# Adjust LLM parameters
# NOTE(review): presence_penalty / frequency_penalty are OpenAI-style names;
# confirm ChatOllama honours them (Ollama's usual knob is repeat_penalty).
llm_config = {
    "temperature": 0.7,
    "top_p": 0.9,
    "presence_penalty": 0.6,
    "frequency_penalty": 0.6,
}

# Use local Ollama model to generate the cover letter.
llm = ChatOllama(model="llama3.2", **llm_config)

# Declared variables must match the placeholders used in the template above.
QA_CHAIN_PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

chroma_db_persist_path = "chroma-db-job-coverletter"
vector_db_collection_name = "job-cv-vector-db-collection"

lc_embeddings = OllamaEmbeddings(model="llama3.2")
# lc_embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
ef2 = create_langchain_embedding(lc_embeddings)

# Re-open the persisted collection that the indexing cell populated.
vector_db = Chroma(
    collection_name=vector_db_collection_name,
    persist_directory=chroma_db_persist_path,
    embedding_function=ef2,
)

# The chain options used to be splatted into as_retriever(), where none of them
# is a retriever setting. Each supported option now goes where it is accepted.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vector_db.as_retriever(
        search_kwargs={"k": 4},
        search_type="mmr",  # Use Maximum Marginal Relevance for better diversity
    ),
    chain_type_kwargs={
        "prompt": QA_CHAIN_PROMPT,
        "verbose": True,  # Helpful for debugging
        "document_separator": chain_config["document_separator"],
    },
    chain_type="stuff",  # other techniques to experiment: "map_reduce", "refine", "map_rerank"
    return_source_documents=chain_config["return_source_documents"],
)

# Example usage
cover_letter = generate_tailored_cover_letter(vector_db, add_job_description_context)
display(Markdown(f"<b>{cover_letter}</b>"))