In [49]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [51]:
def process_all_pdfs(pdf_directory):
    all_documents = []
    pdf_dir = Path(pdf_directory)
    pdf_files = list(pdf_dir.glob('**/*.pdf'))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")

        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdfs("../data")

Found 2 PDF files to process

Processing: camcom_2.pdf
  ✓ Loaded 2 pages

Processing: camcom.pdf
  ✓ Loaded 5 pages

Total documents loaded: 7


In [52]:
all_pdf_documents

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': 'ReportLab PDF Library - www.reportlab.com', 'creationdate': '2025-09-09T15:09:15+00:00', 'author': 'anonymous', 'keywords': '', 'moddate': '2025-09-09T15:09:15+00:00', 'subject': 'unspecified', 'title': 'untitled', 'trapped': '/False', 'source': '../data/camcom_2.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'camcom_2.pdf', 'file_type': 'pdf'}, page_content="CamCom Technologies Pvt Ltd — Company Profile\nExecutive Summary\nCamCom Technologies Pvt Ltd (CamCom) is a Bengaluru-based, award-winning, industry-agnostic deep-learning\ncomputer vision company that provides visual inspection and damage/defect assessment solutions across\nautomotive, insurance, manufacturing, logistics and other sectors. They have developed large vision models for\nsurface-agnostic defect assessment and offer cloud & mobile-enabled deployments for both manufacturing and\naftermarket workflows.\nKey Fac

In [53]:
### Text splitting get into chunks
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs



In [54]:
chunks = split_documents(all_pdf_documents)
chunks

Split 7 documents into 15 chunks

Example chunk:
Content: CamCom Technologies Pvt Ltd — Company Profile
Executive Summary
CamCom Technologies Pvt Ltd (CamCom) is a Bengaluru-based, award-winning, industry-agnostic deep-learning
computer vision company that p...
Metadata: {'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': 'ReportLab PDF Library - www.reportlab.com', 'creationdate': '2025-09-09T15:09:15+00:00', 'author': 'anonymous', 'keywords': '', 'moddate': '2025-09-09T15:09:15+00:00', 'subject': 'unspecified', 'title': 'untitled', 'trapped': '/False', 'source': '../data/camcom_2.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'camcom_2.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': 'ReportLab PDF Library - www.reportlab.com', 'creationdate': '2025-09-09T15:09:15+00:00', 'author': 'anonymous', 'keywords': '', 'moddate': '2025-09-09T15:09:15+00:00', 'subject': 'unspecified', 'title': 'untitled', 'trapped': '/False', 'source': '../data/camcom_2.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'camcom_2.pdf', 'file_type': 'pdf'}, page_content='CamCom Technologies Pvt Ltd — Company Profile\nExecutive Summary\nCamCom Technologies Pvt Ltd (CamCom) is a Bengaluru-based, award-winning, industry-agnostic deep-learning\ncomputer vision company that provides visual inspection and damage/defect assessment solutions across\nautomotive, insurance, manufacturing, logistics and other sectors. They have developed large vision models for\nsurface-agnostic defect assessment and offer cloud & mobile-enabled deployments for both manufacturing and\naftermarket workflows.\nKey Fac

In [55]:
### embedding
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [56]:
class EmbeddingManager:

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise


    def generate_embeddings(self, texts: List[str]) -> np.ndarray:

        if not self.model:
            raise ValueError("Model not loaded")
        
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

embeddings_manager = EmbeddingManager()
embeddings_manager
    

Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x167bf30e0>

In [58]:
class VectorStore:
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):

        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )

            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):

            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

            try:
                self.collection.add(
                    ids=ids,
                    embeddings=embeddings_list,
                    metadatas=metadatas,
                    documents=documents_text
                )
            except Exception as e:
                print(f"Error adding documents to vector store: {e}")
                raise

vector_store = VectorStore()
vector_store

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 30


<__main__.VectorStore at 0x167c75550>

In [59]:
chunks

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': 'ReportLab PDF Library - www.reportlab.com', 'creationdate': '2025-09-09T15:09:15+00:00', 'author': 'anonymous', 'keywords': '', 'moddate': '2025-09-09T15:09:15+00:00', 'subject': 'unspecified', 'title': 'untitled', 'trapped': '/False', 'source': '../data/camcom_2.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'camcom_2.pdf', 'file_type': 'pdf'}, page_content='CamCom Technologies Pvt Ltd — Company Profile\nExecutive Summary\nCamCom Technologies Pvt Ltd (CamCom) is a Bengaluru-based, award-winning, industry-agnostic deep-learning\ncomputer vision company that provides visual inspection and damage/defect assessment solutions across\nautomotive, insurance, manufacturing, logistics and other sectors. They have developed large vision models for\nsurface-agnostic defect assessment and offer cloud & mobile-enabled deployments for both manufacturing and\naftermarket workflows.\nKey Fac

In [60]:
texts = [doc.page_content for doc in chunks]

embeddings = embeddings_manager.generate_embeddings(texts)

vector_store.add_documents(chunks, embeddings)

Batches: 100%|██████████| 1/1 [00:00<00:00,  7.81it/s]


Generated embeddings with shape: (15, 384)


In [62]:
# ## retriever pipeline from vector store

class RAGRetriever:
    def __init__(self, vector_store=VectorStore, embeddings_manager=EmbeddingManager):

        self.vector_store=vector_store
        self.embeddings_manager=embeddings_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        
        query_embedding = self.embeddings_manager.generate_embeddings([query])[0]

        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc,meta,dist,id) in enumerate(zip(documents, metadatas, distances, ids)):

                    similarity_score = 1 - dist

                    if similarity_score >= score_threshold:
                        retrieved_docs.append(
                            {
                                'id': id,
                                'content': doc,
                                'metadata': meta,
                                'similarity_score': similarity_score,
                                'distance': dist,
                                'rank': i + 1
                            }
                        )
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")

            else:
                print("No documents found")

            return retrieved_docs

        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever = RAGRetriever(vector_store, embeddings_manager)

In [63]:
rag_retriever

<__main__.RAGRetriever at 0x167c75160>

In [64]:
rag_retriever.retrieve("Financial Information of camcom")

Batches: 100%|██████████| 1/1 [00:00<00:00, 24.27it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_a5e9698f_7',
  'content': 'halli, Arakere, off Bannerghatta Road, Bangalore, Karnataka, India - 560076\n• Email: umesh@camcom.ai\n• Website: https://camcom.ai\n• Listing Status: Unlisted\n• Company Status: Active (as of July 2023)\n• NIC Code: 7499 (Other business activities n.e.c., service activities for com-\nmercial clients)\n2 Financial Information\nCamCom Technologies Pvt. Ltd. operates with a modest financial structure, as\nreported by various sources up to June 2025.\n• Authorized Share Capital: ₹1,700,000\n• Paid-up Capital: ₹1,078,045\n• Open Charges: ₹5,000,000\n• Satisfied Charges: ₹1,000,000\n• Revenue Growth (FY 2023): 39.70%\n• Profit Growth (FY 2023): -237.57%\n• EBITDA (FY 2023): -235.77%\n• Net Worth (FY 2023): 10.98%\n• Total Assets Growth (FY 2023): 40.90%\n• Last Annual General Meeting: September 27, 2024\n1',
  'metadata': {'source_file': 'camcom.pdf',
   'source': '../data/camcom.pdf',
   'page': 1,
   'creator': 'LaTeX with hyperref',
   'creationdat

In [83]:
import os
from dotenv import load_dotenv
load_dotenv()


True

In [84]:
### integration vectordb with llm
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage, SystemMessage

In [86]:
class GroqLLM:

    def __init__(self, model_name: str="gemma2-9b-it", api_key: str=None):

        self.model_name = model_name
        self.api_key = api_key or os.environ.get("GROQ_API_KEY")

        if not self.api_key:
            raise ValueError("Groq API key is required. Set GROQ_API_KEY environment variable or pass api_key parameter.")
        
        self.llm = ChatGroq(
            groq_api_key=self.api_key,
            model_name=self.model_name,
            temperature=0.1,
            max_tokens=1024
        )
        print(f"Initialized Groq LLM with model: {self.model_name}")

    def generate_response(self, query: str, context: str, maxlength: int= 500) -> str:
        
        prompt_template = PromptTemplate(
            imput_variables = ["context", "question"],
            template = """You are a helpful AI assistant. Use the following context to answer the question accurately and concisely.
                            Context: {context}
                            Question: {question}
                            Answer: Provide a clear and informative answer based on the context above. If the context doesn't contain enough information to answer the question, say so.""",
        )

        formatted_prompt = prompt_template.format(context=context, question=query)

        try:

            messages = [HumanMessage(content=formatted_prompt)]
            response = self.llm.invoke(messages)
            return response.content
        
        except Exception as e:
            return f"Error generating response: {str(e)}"
        
    def generate_response_simple(self, query: str, context: str) -> str:
        
        simple_prompt = f"""Based on this context: {context}

                                Question: {query}

                                Answer:"""
        try:

            messages = [HumanMessage(content=simple_prompt)]
            response = self.llm.invoke(messages)
            return response
        
        except Exception as e:
            return f"Error: {str(e)}"


In [87]:
# Initialize Groq LLM 

try:
    groq_llm = GroqLLM(api_key=os.getenv("GROQ_API_KEY"))
    print("Groq LLM initialized successfully!")
except Exception as e:
    print(f"Warning: {e}")
    print("Please set your GROQ_API_KEY environment variable to use the LLM.")
    groq_llm = None

Initialized Groq LLM with model: gemma2-9b-it
Groq LLM initialized successfully!


In [82]:
rag_retriever.retrieve("camcom")

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.66it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_c6af2c4b_0',
  'content': 'CamCom Technologies Pvt Ltd — Company Profile\nExecutive Summary\nCamCom Technologies Pvt Ltd (CamCom) is a Bengaluru-based, award-winning, industry-agnostic deep-learning\ncomputer vision company that provides visual inspection and damage/defect assessment solutions across\nautomotive, insurance, manufacturing, logistics and other sectors. They have developed large vision models for\nsurface-agnostic defect assessment and offer cloud & mobile-enabled deployments for both manufacturing and\naftermarket workflows.\nKey Facts\nLegal name: CamCom Technologies Private Limited CIN: U74999KA2017PTC106196 Headquarters: No.70, 2nd Floor, 1st\nCross, Shayadri Layout Billekahalli, Arakere, off Bannerghatta Road, Bangalore 560076, India Founded: 2017\n(company sources and profiles list founders and early dates) Website: https://camcom.ai\nLeadership & Founders\nAjith Nayar — Co-founder & CEO Uma (Umesh) Mahesh — Co-founder & President (Americas) Mahesh Subr