In [9]:
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from pathlib import Path

In [10]:
def process_all_pdfs(pdf_directory) -> list:
    """Load every PDF found under a directory into one flat list of documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    read with ``PyPDFLoader``, and every document returned by ``loader.load()``
    is tagged with two extra metadata keys: ``source_file`` (the PDF's file
    name) and ``file_type`` (always ``'pdf'``).

    Args:
        pdf_directory: Directory to search (string or path-like).

    Returns:
        A list containing the documents of every PDF that loaded successfully.
        The list is empty when no PDF is found or none could be loaded.

    Note:
        A file that fails to load is reported on stdout and skipped, so one
        bad PDF does not abort the whole batch.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively; sort so the processing order is
    # reproducible across runs and platforms.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\n Processing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f" Loaded {len(documents)} pages")

        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the
            # next file instead of aborting the batch.
            print(f" Error : {e}")

    # Summarise and return only after *all* files have been processed.
    # (Previously these two statements sat inside the loop, so the function
    # returned after the first PDF and returned None when there were none.)
    print(f"\n Total documents loaded : {len(all_documents)}")
    return all_documents

# Forward slashes are understood by pathlib on every OS; the previous
# backslash-separated literal only resolved to a nested directory on Windows.
all_pdf_documents = process_all_pdfs("data/pdf")


Found 1 PDF files to process

 Processing: GenericAI.pdf
 Loaded 39 pages

 Total documents loaded : 39


In [11]:
#### Text splitting: break the loaded documents into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Cut documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split; passed straight to the splitter's
            ``split_documents`` method.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, passed to the
            splitter as ``chunk_overlap``.

    Returns:
        The chunks produced by the splitter. A short summary, plus a preview
        of the first chunk when there is one, is printed to stdout.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separators are listed from coarsest (blank line) to finest (any
        # character boundary).
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)

    chunked = chunker.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunked)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not chunked:
        return chunked

    # Show the first chunk as a sample of the result.
    first_chunk = chunked[0]
    print(f"\nExample chunk:")
    print(f"Content: {first_chunk.page_content[:200]}...")
    print(f"Metadata: {first_chunk.metadata}")
    return chunked

In [12]:
chunks = split_documents(all_pdf_documents)
# Preview only the first few chunks: dumping the whole list floods the cell
# output and bloats the saved notebook.
chunks[:3]

Split 39 documents into 139 chunks

Example chunk:
Content: AGENTIC RETRIEVAL -AUGMENTED GENERATION : A S URVEY ON
AGENTIC RAG
Aditi Singh
Department of Computer Science
Cleveland State University
Cleveland, OH, USA
a.singh22@csuohio.edu
Abul Ehtesham
The Dave...
Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-02-05T01:26:00+00:00', 'author': '', 'keywords': '', 'moddate': '2025-02-05T01:26:00+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data\\pdf\\GenericAI.pdf', 'total_pages': 39, 'page': 0, 'page_label': '1', 'source_file': 'GenericAI.pdf', 'file_type': 'pdf'}
[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-02-05T01:26:00+00:00', 'author': '', 'keywords': '', 'moddate': '2025-02-05T01:26:00+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415

In [None]:
# Embedding and vector db
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any , Tuple
from sklearn.metrics.pairwise import cosine_similarity