In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [5]:
def process_all_pdfs(pdf_directory):
    all_documents = []
    pdf_dir = Path(pdf_directory)
    pdf_files = list(pdf_dir.glob('**/*.pdf'))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")

        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdfs("../data")

Found 2 PDF files to process

Processing: camcom_2.pdf
  ✓ Loaded 2 pages

Processing: camcom.pdf
  ✓ Loaded 5 pages

Total documents loaded: 7


In [6]:
all_pdf_documents

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': 'ReportLab PDF Library - www.reportlab.com', 'creationdate': '2025-09-09T15:09:15+00:00', 'author': 'anonymous', 'keywords': '', 'moddate': '2025-09-09T15:09:15+00:00', 'subject': 'unspecified', 'title': 'untitled', 'trapped': '/False', 'source': '../data/camcom_2.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'camcom_2.pdf', 'file_type': 'pdf'}, page_content="CamCom Technologies Pvt Ltd — Company Profile\nExecutive Summary\nCamCom Technologies Pvt Ltd (CamCom) is a Bengaluru-based, award-winning, industry-agnostic deep-learning\ncomputer vision company that provides visual inspection and damage/defect assessment solutions across\nautomotive, insurance, manufacturing, logistics and other sectors. They have developed large vision models for\nsurface-agnostic defect assessment and offer cloud & mobile-enabled deployments for both manufacturing and\naftermarket workflows.\nKey Fac

In [7]:
### Text splitting get into chunks
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs



In [8]:
chunks = split_documents(all_pdf_documents)
chunks

Split 7 documents into 15 chunks

Example chunk:
Content: CamCom Technologies Pvt Ltd — Company Profile
Executive Summary
CamCom Technologies Pvt Ltd (CamCom) is a Bengaluru-based, award-winning, industry-agnostic deep-learning
computer vision company that p...
Metadata: {'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': 'ReportLab PDF Library - www.reportlab.com', 'creationdate': '2025-09-09T15:09:15+00:00', 'author': 'anonymous', 'keywords': '', 'moddate': '2025-09-09T15:09:15+00:00', 'subject': 'unspecified', 'title': 'untitled', 'trapped': '/False', 'source': '../data/camcom_2.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'camcom_2.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': 'ReportLab PDF Library - www.reportlab.com', 'creationdate': '2025-09-09T15:09:15+00:00', 'author': 'anonymous', 'keywords': '', 'moddate': '2025-09-09T15:09:15+00:00', 'subject': 'unspecified', 'title': 'untitled', 'trapped': '/False', 'source': '../data/camcom_2.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'camcom_2.pdf', 'file_type': 'pdf'}, page_content='CamCom Technologies Pvt Ltd — Company Profile\nExecutive Summary\nCamCom Technologies Pvt Ltd (CamCom) is a Bengaluru-based, award-winning, industry-agnostic deep-learning\ncomputer vision company that provides visual inspection and damage/defect assessment solutions across\nautomotive, insurance, manufacturing, logistics and other sectors. They have developed large vision models for\nsurface-agnostic defect assessment and offer cloud & mobile-enabled deployments for both manufacturing and\naftermarket workflows.\nKey Fac

In [10]:
### embedding
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
class EmbeddingManager:

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise


    def generate_embeddings(self, texts: List[str]) -> np.ndarray:

        if not self.model:
            raise ValueError("Model not loaded")
        
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

embeddings_manager = EmbeddingManager()
embeddings_manager
    

Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x14b5286e0>