### RAG Pipeline: Data Ingestion to Vector DB

In [11]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from pathlib import Path

In [None]:
def process_all_pdfs(pdf_directory : str):
    """Load the PDFs under ``pdf_directory`` and tag every loaded document.

    A ``DirectoryLoader`` is pointed at ``pdf_directory`` with the glob
    ``**/*.pdf`` and ``PyMuPDFLoader`` as the per-file loader class. Each
    document it returns gets two extra metadata entries:

    * ``source_file`` -- the final path component of ``metadata['source']``
    * ``file_type``   -- always the string ``'pdf'``

    Args:
        pdf_directory: Directory handed to ``DirectoryLoader``.

    Returns:
        The documents returned by the loader, with their metadata updated
        in place. A one-line summary with the document count is printed
        before returning.
    """
    pdf_loader = DirectoryLoader(
        pdf_directory,
        glob="**/*.pdf",
        loader_cls=PyMuPDFLoader,
        show_progress=False,
    )
    loaded_docs = pdf_loader.load()

    for loaded in loaded_docs:
        # Keep only the file name so later stages can cite a short source label.
        origin = loaded.metadata['source']
        loaded.metadata.update(
            {'source_file': Path(origin).name, 'file_type': 'pdf'}
        )

    print(f"\n✅ Loaded {len(loaded_docs)} documents from {pdf_directory}")
    return loaded_docs

# Ingest every PDF under the data directory; later cells read `all_pdf_documents`.
all_pdf_documents = process_all_pdfs(pdf_directory="../data")


✅ Loaded 129 documents from ../data


In [13]:
# Preview only the first few documents: echoing the whole list floods the
# cell output and bloats the saved notebook.
all_pdf_documents[:3]

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2022-12-17T08:32:38+00:00', 'source': '../data/pdf_files/unit2.pdf', 'file_path': '../data/pdf_files/unit2.pdf', 'total_pages': 23, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2022-12-17T08:32:38+00:00', 'trapped': '', 'modDate': "D:20221217083238+00'00'", 'creationDate': "D:20221217083238+00'00'", 'page': 0, 'source_file': 'unit2.pdf', 'file_type': 'pdf'}, page_content='1 \n \n \n \n \n \n \nSCHOOL OF COMPUTING \n \nDEPARTMENT OF COMPUTER SCIENCE AND \nENGINEERING \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n     \n \n    UNIT – I  - DISTRIBUTED DATABASE AND INFORMATION SYSTEMS- SCSA3008'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2022-12-17T08:32:38+00:00', 'source': '../data/pdf_fil

In [None]:
# ### Disabled alternative: load every PDF in a directory one file at a time with PyPDFLoader

# def process_all_pdfs(pdf_directory):
#     ## process all pdf in the directory 
#     all_documents = []
#     pdf_dir = Path(pdf_directory)

#     ##find all pdf files recursively
#     pdf_files = list(pdf_dir.glob("**/*.pdf"))

#     print(f"found {len(pdf_files)} PDF files to process")

#     for pdf_file in pdf_files:
#         print(f"\nProcessing: {pdf_file.name}")
#         try:
#             loader = PyPDFLoader(str(pdf_file))
#             documents = loader.load()

#             ## add source info to metadata
#             for doc in documents:
#                 doc.metadata['source_file'] = pdf_file.name
#                 doc.metadata['file_type'] = 'pdf'
            
#             all_documents.extend(documents)
#             print(f" Loaded {len(documents)} pages")
        
#         except Exception as e:
#             print(f" Error: {e}")
            
#     print(f"\nTotal documents loaded: {len(all_documents)}")
#     return all_documents

# # process all documents in data directory
# all_pdf_documents = process_all_pdfs("../data")


found 5 PDF files to process

Processing: unit2.pdf
 Loaded 23 pages

Processing: unit3.pdf
 Loaded 38 pages

Processing: unit1.pdf
 Loaded 38 pages

Processing: unit4.pdf
 Loaded 12 pages

Processing: unit5.pdf
 Loaded 18 pages

Total documents loaded: 129


In [16]:
# Sanity check: print the metadata dict of the first five loaded documents.
for doc in all_pdf_documents[:5]:  # the slice simply yields fewer items if fewer than five were loaded
    print(doc.metadata)


{'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2022-12-17T08:32:38+00:00', 'source': '../data/pdf_files/unit2.pdf', 'file_path': '../data/pdf_files/unit2.pdf', 'total_pages': 23, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2022-12-17T08:32:38+00:00', 'trapped': '', 'modDate': "D:20221217083238+00'00'", 'creationDate': "D:20221217083238+00'00'", 'page': 0, 'source_file': 'unit2.pdf', 'file_type': 'pdf'}
{'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2022-12-17T08:32:38+00:00', 'source': '../data/pdf_files/unit2.pdf', 'file_path': '../data/pdf_files/unit2.pdf', 'total_pages': 23, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2022-12-17T08:32:38+00:00', 'trapped': '', 'modDate': "D:20221217083238+00'00'", 'creationDate': "D:20221217083238+00'00'", 'page': 1, 'sourc

In [21]:
### text splitting into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split ``documents`` into smaller chunks and print a short report.

    A ``RecursiveCharacterTextSplitter`` is configured with the requested
    ``chunk_size`` and ``chunk_overlap``, the built-in ``len`` as its length
    function, and the separator list ``["\\n\\n", "\\n", " ", ""]``.

    Args:
        documents: Documents passed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_documents`` call returns. The input
        and output counts are printed; when the result is non-empty, the
        first 200 characters of the first chunk and its metadata are
        printed as an example.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunked = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunked)} chunks")

    # Nothing to showcase when the splitter produced no chunks.
    if not chunked:
        return chunked

    sample = chunked[0]
    print("\nExample chunk:")
    print(f"Content: {sample.page_content[:200]}...")
    print(f"Metadata: {sample.metadata}")
    return chunked

In [23]:
# Chunk every loaded document with the default size/overlap settings.
chunks = split_documents(all_pdf_documents)
# Show a small preview instead of echoing every chunk into the cell output.
chunks[:3]

Split 129 documents into 319 chunks

Example chunk:
Content: 1 
 
 
 
 
 
 
SCHOOL OF COMPUTING 
 
DEPARTMENT OF COMPUTER SCIENCE AND 
ENGINEERING 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
     
 
    UNIT – I  - DISTRIBUTED DATABASE AND INFORMATION SY...
Metadata: {'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2022-12-17T08:32:38+00:00', 'source': '../data/pdf_files/unit2.pdf', 'file_path': '../data/pdf_files/unit2.pdf', 'total_pages': 23, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2022-12-17T08:32:38+00:00', 'trapped': '', 'modDate': "D:20221217083238+00'00'", 'creationDate': "D:20221217083238+00'00'", 'page': 0, 'source_file': 'unit2.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2022-12-17T08:32:38+00:00', 'source': '../data/pdf_files/unit2.pdf', 'file_path': '../data/pdf_files/unit2.pdf', 'total_pages': 23, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2022-12-17T08:32:38+00:00', 'trapped': '', 'modDate': "D:20221217083238+00'00'", 'creationDate': "D:20221217083238+00'00'", 'page': 0, 'source_file': 'unit2.pdf', 'file_type': 'pdf'}, page_content='1 \n \n \n \n \n \n \nSCHOOL OF COMPUTING \n \nDEPARTMENT OF COMPUTER SCIENCE AND \nENGINEERING \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n     \n \n    UNIT – I  - DISTRIBUTED DATABASE AND INFORMATION SYSTEMS- SCSA3008'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2022-12-17T08:32:38+00:00', 'source': '../data/pdf_fil