### RAG pipelines - Data ingestion to vector DB pipeline

In [6]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [23]:
def process_all_pdfs(pdf_directory):
    """Load every PDF found under ``pdf_directory`` into a single list.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    loaded with ``PyMuPDFLoader`` and every resulting document gets two extra
    metadata keys: ``source_file`` (the file's base name) and ``file_type``
    (``'pdf'``). Loading is best-effort: a file that raises is reported and
    skipped so the remaining files are still processed.

    Args:
        pdf_directory: Path (``str`` or ``Path``) of the directory to scan.

    Returns:
        A flat list with the documents of all successfully loaded files, in
        the order the files were found. The list is empty when the directory
        does not exist or no PDF could be loaded.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)
    print(pdf_dir)

    # A wrong path would otherwise be indistinguishable from an empty folder.
    if not pdf_dir.is_dir():
        print(f" Warning: {pdf_dir} is not a directory, no PDFs loaded")
        return all_documents

    # Find all the pdf files (recursively, including sub-directories)
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(pdf_files)

    processed_count = 0
    failed_files = []

    for pdf_file in pdf_files:
        print(f"\n Processing {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            processed_count += 1
            print(f"\n Processed {pdf_file.name}")
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the batch.
            failed_files.append(pdf_file.name)
            print(f" Error occured while processing {pdf_file.name}, {e}")

    # Report files that were actually loaded, not merely discovered.
    print(f"Processed total {processed_count}")
    if failed_files:
        print(f" Failed to process {len(failed_files)} file(s): {failed_files}")
    return all_documents
                
    
# Load every PDF under ../data/pdf (path is relative to the notebook's working directory).
all_pdf_documents = process_all_pdfs("../data/pdf")

../data/pdf
[PosixPath('../data/pdf/objectdetection.pdf'), PosixPath('../data/pdf/embeddings.pdf'), PosixPath('../data/pdf/attention.pdf'), PosixPath('../data/pdf/proposal.pdf')]

 Processing objectdetection.pdf

 Processed objectdetection.pdf

 Processing embeddings.pdf

 Processed embeddings.pdf

 Processing attention.pdf

 Processed attention.pdf

 Processing proposal.pdf

 Processed proposal.pdf
Processed total 4


In [24]:
# Show the loaded documents as this cell's output.
all_pdf_documents

[Document(metadata={'producer': 'Skia/PDF m147 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/objectdetection.pdf', 'file_path': '../data/pdf/objectdetection.pdf', 'total_pages': 15, 'format': 'PDF 1.4', 'title': 'objectdetection', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0, 'source_file': 'objectdetection.pdf', 'file_type': 'pdf'}, page_content='Title: A Comparative Study of CNN-Based Object Detection Models \n \nAbstract: \nThis research analyzes single-stage and two-stage object detection models using a benchmark \nimage dataset. \n \nIntroduction: \nObject detection identifies and localizes objects within images using bounding boxes. \n \nMethodology: \n \nEvaluated Faster R-CNN (two-stage model). \n \nEvaluated YOLO (single-stage model). \n \nCompared inference speed and detection accuracy. \n \nResults: \n \nYOLO achieved faster inference. \n \nFaster R-CNN produced high

In [29]:
### Text splitting: break the documents into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks with ``RecursiveCharacterTextSplitter``.

    Prints how many chunks were produced and, when there is at least one,
    a preview of the first chunk (first 200 characters and its metadata).

    Args:
        documents: Sized collection of documents to split; passed straight to
            the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``; lengths are
            measured with ``len``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks returned by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Separators are listed from coarsest (blank line) to finest (empty string).
        separators=["\n\n", "\n", " ", ""],
    )
    split_docs = text_splitter.split_documents(documents)
    print(f" split {len(documents)} into {len(split_docs)} chunks")

    if split_docs:
        first_chunk = split_docs[0]
        print("\nExample chunk")
        print(f"content: {first_chunk.page_content[:200]}...")
        # This line shows the metadata, so it must not be labelled "content".
        print(f"metadata: {first_chunk.metadata}")

    return split_docs
        
# Chunk the loaded PDF documents using the default chunk_size and chunk_overlap.
chunks= split_documents(all_pdf_documents)

 split 48 into 52 chunks

Example chunk
content: Title: A Comparative Study of CNN-Based Object Detection Models 
 
Abstract: 
This research analyzes single-stage and two-stage object detection models using a benchmark 
image dataset. 
 
Introductio...
content: {'producer': 'Skia/PDF m147 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/objectdetection.pdf', 'file_path': '../data/pdf/objectdetection.pdf', 'total_pages': 15, 'format': 'PDF 1.4', 'title': 'objectdetection', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0, 'source_file': 'objectdetection.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Skia/PDF m147 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/objectdetection.pdf', 'file_path': '../data/pdf/objectdetection.pdf', 'total_pages': 15, 'format': 'PDF 1.4', 'title': 'objectdetection', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0, 'source_file': 'objectdetection.pdf', 'file_type': 'pdf'}, page_content='Title: A Comparative Study of CNN-Based Object Detection Models \n \nAbstract: \nThis research analyzes single-stage and two-stage object detection models using a benchmark \nimage dataset. \n \nIntroduction: \nObject detection identifies and localizes objects within images using bounding boxes. \n \nMethodology: \n \nEvaluated Faster R-CNN (two-stage model). \n \nEvaluated YOLO (single-stage model). \n \nCompared inference speed and detection accuracy. \n \nResults: \n \nYOLO achieved faster inference. \n \nFaster R-CNN produced high