In [56]:
import os
import uuid
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader, PyPDFLoader, PyMuPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from typing import List, Dict, Tuple, Any
from sklearn.metrics.pairwise import cosine_similarity
from langchain_groq import ChatGroq

load_dotenv()

dir_loader = DirectoryLoader(
    '../data/pdf',
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False
)

pdf_documents = dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'macOS Version 14.5 (Build 23F79) Quartz PDFContext', 'creator': '', 'creationdate': "D:20260102150824Z00'00'", 'source': '../data/pdf/prasanth narasimhan.pdf', 'file_path': '../data/pdf/prasanth narasimhan.pdf', 'total_pages': 2, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': "D:20260102150824Z00'00'", 'trapped': '', 'modDate': "D:20260102150824Z00'00'", 'creationDate': "D:20260102150824Z00'00'", 'page': 0}, page_content='+91-7259913174 \nprasanth.iyer94@gmail.com \nhttps://www.linkedin.c\nom/in/prasanthiyer \nPRASANTH \nNARASIMHAN \nBangalore, India \n \nSUMMARY \nSenior Full Stack Developer with 9+ years of experience building \nweb applications from ground up to production. Proven track \nrecord of leading teams, architecting robust solutions, and \ndriving product innovation. Particularly skilled in React, Python, \nMySQL and AWS. \nHave led a 7-member team and provided mentorship and \nguidance to the team

In [3]:
class EmbeddingManager:
    def __init__(self, model_name:str = "all-MiniLM-l6-v2"):
        
        self.model_name = model_name
        self.model = None
        self._load_model()
    
    def _load_model(self):
        try:
            self.model = SentenceTransformer(self.model_name)
            print("dimension", self.model.get_sentence_embedding_dimension())
        except Exception as e:
            print(f"Exception : {e}")
            raise
    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Shape generated: {embeddings.shape}")
        return embeddings
    
    def get_embedding_dimension(self) -> int:
        if not self.model:
            raise ValueError("Model not loaded")
        return self.model.get_sentence_embedding_dimension()

embedding_manager = EmbeddingManager()
embedding_manager

dimension 384


<__main__.EmbeddingManager at 0x15606fe50>

In [4]:
class VectorStore:
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialise_store()
    
    def _initialise_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(name=self.collection_name, metadata={"description": "RAG store for my resume"})
            print("Count in the collection" ,self.collection.count())
        except Exception:
            raise
            
    def add_documents(self, document:List[any], embeddings:np.ndarray):
        if len(document) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(document)} documets")
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(document, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            metadata = dict(doc.metadata)
            metadata["doc_index"] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Added {len(document)} to the vector store")
            print(f'Total documents: {self.collection.count()}')
        except Exception:
            raise Exception

vector_store = VectorStore()
vector_store
        
            

Count in the collection 12


<__main__.VectorStore at 0x120f0afd0>

In [5]:
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

chunks = split_documents(pdf_documents)
chunks

Split 2 documents into 6 chunks

Example chunk:
Content: +91-7259913174 
prasanth.iyer94@gmail.com 
https://www.linkedin.c
om/in/prasanthiyer 
PRASANTH 
NARASIMHAN 
Bangalore, India 
 
SUMMARY 
Senior Full Stack Developer with 9+ years of experience buildin...
Metadata: {'producer': 'macOS Version 14.5 (Build 23F79) Quartz PDFContext', 'creator': '', 'creationdate': "D:20260102150824Z00'00'", 'source': '../data/pdf/prasanth narasimhan.pdf', 'file_path': '../data/pdf/prasanth narasimhan.pdf', 'total_pages': 2, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': "D:20260102150824Z00'00'", 'trapped': '', 'modDate': "D:20260102150824Z00'00'", 'creationDate': "D:20260102150824Z00'00'", 'page': 0}


[Document(metadata={'producer': 'macOS Version 14.5 (Build 23F79) Quartz PDFContext', 'creator': '', 'creationdate': "D:20260102150824Z00'00'", 'source': '../data/pdf/prasanth narasimhan.pdf', 'file_path': '../data/pdf/prasanth narasimhan.pdf', 'total_pages': 2, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': "D:20260102150824Z00'00'", 'trapped': '', 'modDate': "D:20260102150824Z00'00'", 'creationDate': "D:20260102150824Z00'00'", 'page': 0}, page_content='+91-7259913174 \nprasanth.iyer94@gmail.com \nhttps://www.linkedin.c\nom/in/prasanthiyer \nPRASANTH \nNARASIMHAN \nBangalore, India \n \nSUMMARY \nSenior Full Stack Developer with 9+ years of experience building \nweb applications from ground up to production. Proven track \nrecord of leading teams, architecting robust solutions, and \ndriving product innovation. Particularly skilled in React, Python, \nMySQL and AWS. \nHave led a 7-member team and provided mentorship and \nguidance to the team

In [6]:
texts = [doc.page_content for doc in chunks]
embeddings = embedding_manager.generate_embeddings(texts=texts)
vector_store.add_documents(chunks, embeddings)

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.45it/s]

Shape generated: (6, 384)
Adding 6 documets
Added 6 to the vector store
Total documents: 18





In [24]:
class RagRetreiver:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
    
    def reterieve(self, query:str, top_k:int=5, score_threshload:float=0.0) -> List[Dict[str, Any]]:
        print(f'query: {query}')
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            print(results)
            retreived_docs = []
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    similarity_score = 1 - distance
                    if similarity_score >= score_threshload:
                        retreived_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                print(f'retrieved docs {retreived_docs}')
            return retreived_docs                
        except Exception as e:
            print(f'errr {e}')
            return []
rag_retriever = RagRetreiver(vector_store, embedding_manager)


In [54]:
rag_retriever.reterieve("node")

query: node


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.95it/s]

Shape generated: (1, 384)
{'ids': [['doc_f5620bd2_5', 'doc_a167545f_5', 'doc_3ed465a1_5', 'doc_87d5828b_4', 'doc_2bad5876_4']], 'embeddings': None, 'documents': [['synchronise user data with Firebase \n• Collaborated with design and product teams to translate \nrequirements into technical solutions \n \nFront-end Developer \nPlash Digital Media PVT. Ltd, Bangalore, India | 2016- 2017       \n• Developed CMS and CRM frontends for Tru-Next and Alore \nproducts \n• Built comprehensive POS system dashboard featuring \nadvanced data visualisation and analytics \n• Partnered with management to implement new product \nfeatures, resulting in 25% increase in user engagement \n \nFront-end Developer  \nNanolocal Technologies PVT. Ltd. Bangalore, India | 2015- 2015    \n• Independently developed company website from concept \nto launch \n• Created digital marketing materials like promotional \ngraphics for events \n \n \nEDUCATION \n \n \nBachelor’s Degree in Commerce \nBharathiar University, Dr.




[]

In [None]:
grop_api_key = os.getenv("GROQ_API")
llm = ChatGroq(
    api_key=grop_api_key,
    model="openai/gpt-oss-120b",
    temperature=0.1,
    max_tokens=1024
)

def rag_simple(query, retreiver, llm, top_k=3):
    results = retreiver.reterieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not results:
        return "No useful data"
    prompt = f""" 
        Use the following context to answer the question concisely.
        Context: {context}
        question: {query}
        Answer:  """
    
    response = llm.invoke([prompt.format(context=context, query=query)])
    return response.content


In [None]:
answer = rag_simple("", rag_retriever, llm)
print(answer )

query: Generate a summary on this file


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.37it/s]

Shape generated: (1, 384)
{'ids': [['doc_665df1f3_1', 'doc_ce722f4a_1', 'doc_6ffdc8dd_1']], 'embeddings': None, 'documents': [['Functional Programming, \nAWS, Elasticsearch \n \nEngineering Lead \nRecruiterflow, Bangalore, India | 2024 - Present  \n• Conducted daily stand-ups to improve transparency and \nsprint velocity, resulting in a 15% improvement in team \ndelivery consistency. \n• Architected database designs and APIs for an AI-driven \nnote-taking app, enabling scalable adoption across 200+ \nclient organisations. \n• Designed backend systems to orchestrate LLM API calls, \nmanage prompt inputs, validate outputs, and persist AI-\ngenerated notes reliably. \n• Led and mentored a 7-member development team, \nconducting code reviews and ensuring high-quality, on-\ntime releases. \n• Introduced best practices and QA processes, cutting bug \nfrequency by 40% and improving release stability. \n• Partnered with leadership to scale engineering headcount \nfrom 5 to 20+ and established 




In [76]:
def aadvanced_rag(query, retreiver, llm, top_k=3, min_score=0.2, return_context=False):
    results = retreiver.reterieve(query, top_k=top_k, score_threshload=min_score)
    if not results:
       return {'answer': '', 'sources': [], 'confidence': 0.0, 'context': ''}
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...',
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    prompt = f"""
        Use the following context to answer the question concisely. \n Context: {context} \n\n Question: {query} \n\n Answer: 
    """
    response = llm.invoke([prompt.format(context=context, query=query)])
    output = {
        'answer': response.context,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = return_context
    return output


In [81]:
rag_adv = aadvanced_rag("skills", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print(rag_adv)

query: skills


Batches: 100%|██████████| 1/1 [00:00<00:00,  4.75it/s]

Shape generated: (1, 384)
{'ids': [['doc_498b1c74_0', 'doc_b4772594_0', 'doc_4332ccaf_0']], 'embeddings': None, 'documents': [['+91-7259913174 \nprasanth.iyer94@gmail.com \nhttps://www.linkedin.c\nom/in/prasanthiyer \nPRASANTH \nNARASIMHAN \nBangalore, India \n \nSUMMARY \nSenior Full Stack Developer with 9+ years of experience building \nweb applications from ground up to production. Proven track \nrecord of leading teams, architecting robust solutions, and \ndriving product innovation. Particularly skilled in React, Python, \nMySQL and AWS. \nHave led a 7-member team and provided mentorship and \nguidance to the team in their career progression. \n \n \nKEY SKILLS \n \nWORK EXPERIENCE \nPython, React, Angular, \nJavaScript,  \nHTML5, CSS3, Chrome \nExtension,  \nFlask, NodeJS, MySQL, \nCelery, REST APIs,  \nCloud Services, JIRA, \nDatabase Architecture, \nCode Reviews, Team lead, \nSaaS,  \nProblem Solving, JSON, \nAutomation,  \nCelery, CI/CD, Agile, Scrum, \nFunctional Programming,


