In [None]:
from langchain_core.documents import Document
from langchain.text_splitter import (RecursiveCharacterTextSplitter,
                                     CharacterTextSplitter,
                                     TokenTextSplitter)

In [2]:
# Example Document: the text lives in page_content, everything else in metadata.
example_metadata = dict(source="example.txt", page=1, date_creation="19/10/2025")
doc = Document(page_content="This is an example text file", metadata=example_metadata)

# Show the text first, then the metadata dict.
for part in (doc.page_content, doc.metadata):
    print(part)

This is an example text file
{'source': 'example.txt', 'page': 1, 'date_creation': '19/10/2025'}


In [8]:
from langchain_community.document_loaders import TextLoader

# Load one UTF-8 text file; load() returns a list of Document objects.
# The path uses single "/" separators (same spelling as the directory-loading
# cell) so that metadata['source'] does not carry doubled slashes.
loader = TextLoader("data/text_files/machine_learning.txt", encoding="utf-8")
docs = loader.load()
print(f"Number of documents {len(docs)}")
# Preview only the first 100 characters of the first document.
print(f"Content preview:\n {docs[0].page_content[:100]}")

Number of documents 1
Content preview:
 Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables system


In [None]:
### Read all documents from a particular directory ###
## Disadvantage - only one type of file (a single loader_cls is applied to every match) ##
from langchain_community.document_loaders import DirectoryLoader

loader_dir = DirectoryLoader(
    path='data/text_files',
    glob="**/*.txt",  # glob pattern selecting the .txt files under `path`
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'},  # forwarded to each TextLoader
    show_progress=True,
)
docs = loader_dir.load()
# Fixed: a stray "8" after this call made the whole cell a SyntaxError.
print("Number of documents loaded:", len(docs))
for position, doc in enumerate(docs, start=1):
    print(f"\nDocument {position}:")
    print(f"Source: {doc.metadata['source']}")
    print(f"Length: {len(doc.page_content)}")

100%|██████████| 2/2 [00:00<00:00, 1055.57it/s]

Number of documents loaded: 2

Document 1:
Source: data\text_files\machine_learning.txt
Length: 575

Document 2:
Source: data\text_files\python_intro.txt
Length: 489





### BASIC CHUNKING STRATEGIES

In [14]:
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter,TokenTextSplitter
# Show the full text of the first loaded document before chunking it.
first_doc_text = docs[0].page_content
print(first_doc_text)

Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems


    


In [None]:
print("CHARACTER TEXT SPLITTER")
# Split on single newlines only; sizes are counted in characters because
# length_function is len.
char_splitter = CharacterTextSplitter(
    separator='\n', chunk_size=200, chunk_overlap=70, length_function=len
)
chunks = char_splitter.split_text(docs[0].page_content)
print("Number of chunks created:", len(chunks))
# Print every chunk followed by a divider line.
for chunk in chunks:
    print(chunk)
    print("-------------")
    print("-------------")

Number of chunks created: 5
Machine Learning Basics
Machine learning is a subset of artificial intelligence that enables systems to learn and improve
-------------
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.
Types of Machine Learning:
-------------
Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
-------------
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties
-------------
3. Reinforcement Learning: Learning through rewards and penalties
Applications include image recognition, speech processing, and recommendation systems
-------------


In [25]:
print("RECURSIVE CHARACTER SPLITTER")
# Separators are listed from coarsest (paragraph break) to finest (single
# character); sizes are counted in characters because length_function is len.
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=200,
    chunk_overlap=50,
    length_function=len,
)
chunks = recursive_splitter.split_text(docs[0].page_content)
print("Number of chunks created:", len(chunks))
# Print every chunk followed by a divider line.
for chunk in chunks:
    print(chunk)
    print("-------------")

RECURSIVE CHARACTER SPLITTER
Number of chunks created: 6
Machine Learning Basics
-------------
Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
-------------
that can access data and use it to learn for themselves.
-------------
Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
-------------
3. Reinforcement Learning: Learning through rewards and penalties
-------------
Applications include image recognition, speech processing, and recommendation systems
-------------


### PDF INGESTION

In [53]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_community.document_loaders.parsers import TesseractBlobParser

print("PyPDFLoader")
# Best-effort load: any failure is reported instead of stopping the notebook.
# NOTE: if loading fails, this cell leaves pypdf_doc unassigned.
try:
    pypdf_loader = PyPDFLoader("data/pdf/attention.pdf")
    pypdf_doc = pypdf_loader.load()
    print(pypdf_doc)
except Exception as err:
    print(f"Error : {err}")

PyPDFLoader
[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/pdf/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\n

In [48]:
# The page count is read from the metadata of the first loaded Document.
first_page_meta = pypdf_doc[0].metadata
print("number of pages :", first_page_meta['total_pages'])
# Index 2 is the third entry of the loaded list.
third_page = pypdf_doc[2]
print("Page 3 content:", third_page.page_content)

number of pages : 15
Page 3 content: Figure 1: The Transformer - model architecture.
The Transformer follows this overall architecture using stacked self-attention and point-wise, fully
connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,
respectively.
3.1 Encoder and Decoder Stacks
Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two
sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-
wise fully connected feed-forward network. We employ a residual connection [11] around each of
the two sub-layers, followed by layer normalization [ 1]. That is, the output of each sub-layer is
LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer
itself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding
layers, produce outputs of dimension dmodel = 512.
Decoder: The decoder is also composed 

In [57]:
print("PyMuPDFLoader")
# OCR parser handed to the loader for images embedded in the PDF.
# `langs` takes an iterable of language codes; a bare "eng" string would be
# consumed character by character, so pass a list.
tesseract_parser = TesseractBlobParser(langs=["eng"])
# Best-effort load: any failure is reported instead of stopping the notebook.
try:
    pymupdf_loader = PyMuPDFLoader(
        'data/pdf/attention.pdf',
        # "text" inserts the parser output as plain text; it is NOT wrapped in
        # markdown/HTML image markup (that would be "markdown-img"/"html-img").
        images_inner_format="text",
        images_parser=tesseract_parser,
    )
    pdf_doc = pymupdf_loader.load()
    print(pdf_doc)
except Exception as e:
    print(f'Error:{e}')

PyMuPDFLoader
[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'source': 'data/pdf/attention.pdf', 'file_path': 'data/pdf/attention.pdf', 'total_pages': 15, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'trapped': '', 'modDate': 'D:20240410211143Z', 'creationDate': 'D:20240410211143Z', 'page': 0}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoog

In [58]:
# The page count is read from the metadata of the first loaded Document.
first_page_meta = pdf_doc[0].metadata
print("number of pages :", first_page_meta['total_pages'])
# Index 2 is the third entry of the loaded list.
third_page = pdf_doc[2]
print("Page 3 content:", third_page.page_content)

number of pages : 15
Page 3 content: Figure 1: The Transformer - model architecture.
The Transformer follows this overall architecture using stacked self-attention and point-wise, fully
connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,
respectively.
3.1
Encoder and Decoder Stacks
Encoder:
The encoder is composed of a stack of N = 6 identical layers. Each layer has two
sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-
wise fully connected feed-forward network. We employ a residual connection [11] around each of
the two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is
LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer
itself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding
layers, produce outputs of dimension dmodel = 512.
Decoder:
The decoder is also composed o

In [63]:
######## Creating a PDF Parsing pipeline ###########
from typing import List
class PDFParser:
    """Load a PDF, normalise the whitespace of each page and split it into chunks.

    Args:
        chunk_size: Maximum chunk length in characters (length_function is len).
        chunk_overlap: Number of characters shared between neighbouring chunks.
        min_page_chars: Pages whose cleaned text is shorter than this many
            characters are skipped. Defaults to 50.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=100, min_page_chars=50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            # _clean_text leaves single spaces as the only whitespace, so " "
            # is the word boundary. "" is the last-resort fallback so that a
            # space-free run longer than chunk_size is still cut down to size.
            separators=[" ", ""],
        )

    def pdf_process(self, pdf_path: str) -> List[Document]:
        """Load ``pdf_path`` with PyPDFLoader and return its chunks.

        Each page is cleaned with ``_clean_text``; pages with fewer than
        ``min_page_chars`` cleaned characters are skipped. Every chunk carries
        the page's own metadata plus ``page_number`` (1-based position in the
        loaded list), ``total_pages`` (number of loaded pages; overrides a
        loader-provided value of the same key) and ``character_count`` (length
        of the cleaned page text).

        Args:
            pdf_path: Path of the PDF file to load.

        Returns:
            The chunk Documents of all kept pages, in page order.
        """
        pdf_loader = PyPDFLoader(pdf_path)
        pdf_doc_pages = pdf_loader.load()
        total_pages = len(pdf_doc_pages)

        chunks_list = []
        for page_number, page in enumerate(pdf_doc_pages, start=1):
            clean_text = self._clean_text(page.page_content)
            # clean_text never has leading/trailing whitespace, so its length
            # can be compared directly.
            if len(clean_text) < self.min_page_chars:
                continue

            page_metadata = {
                **page.metadata,
                'page_number': page_number,
                'total_pages': total_pages,
                'character_count': len(clean_text),
            }
            chunks_list.extend(
                self.text_splitter.create_documents(
                    texts=[clean_text],
                    metadatas=[page_metadata],
                )
            )

        return chunks_list

    def _clean_text(self, text: str) -> str:
        """Collapse every run of whitespace (including newlines) into one space."""
        return " ".join(text.split())

In [70]:
# Run the pipeline with its default settings on the sample paper.
pdf_parser = PDFParser()
chunks_list = pdf_parser.pdf_process("data/pdf/attention.pdf")
print("Total number of chunks:", len(chunks_list))
# For each chunk: its text, its metadata, then a divider, one per line.
for chunk in chunks_list:
    print(chunk.page_content, chunk.metadata, "-------------------", sep="\n")

Total number of chunks: 49
Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani∗ Google Brain avaswani@google.com Noam Shazeer∗ Google Brain noam@google.com Niki Parmar∗ Google Research nikip@google.com Jakob Uszkoreit∗ Google Research usz@google.com Llion Jones∗ Google Research llion@google.com Aidan N. Gomez∗ † University of Toronto aidan@cs.toronto.edu Łukasz Kaiser∗ Google Brain lukaszkaiser@google.com Illia Polosukhin∗ ‡ illia.polosukhin@gmail.com Abstract The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and

### WORD DOCUMENT PARSING

In [71]:
from langchain_community.document_loaders import Docx2txtLoader,UnstructuredWordDocumentLoader

In [76]:
# Load the Word file with Docx2txtLoader; load() returns a list of Documents.
docx_loader = Docx2txtLoader("data/word_files/proposal.docx")
documents = docx_loader.load()
print("Number of word docs loaded:", len(documents))
print("------------")
# The preview is a character slice, so the label states characters (it used to
# say "100 words" while slicing 101 characters). One variable drives both the
# label and the slice so they cannot drift apart.
preview_chars = 100
print(f"Content (first {preview_chars} characters):\n", documents[0].page_content[:preview_chars])
print("------------")
print('Metadata:', documents[0].metadata)

Number of word docs loaded: 1
------------
Content(100 words):
 Project Proposal: RAG Implementation

Executive Summary

This proposal outlines the implementation of
------------
Metadata: {'source': 'data/word_files/proposal.docx'}


In [78]:
# mode='elements' returns one Document per detected element (title, narrative
# text, ...), so the file arrives already split into pieces.
unstruct_doc_loader = UnstructuredWordDocumentLoader(
    "data/word_files/proposal.docx",
    mode='elements',
)
docs = unstruct_doc_loader.load()
print("Documents loaded:", len(docs))

Documents loaded: 20


In [86]:
# Dump every element: 1-based position, category, full metadata and text.
for element_number, doc in enumerate(docs, start=1):
    element_meta = doc.metadata
    print("Element", element_number)
    print("Type:", element_meta.get('category'))
    print("Metadata:", element_meta)
    print("Content:", doc.page_content)
    print("\n")

Element 1
Type: Title
Metadata: {'source': 'data/word_files/proposal.docx', 'category_depth': 0, 'file_directory': 'data/word_files', 'filename': 'proposal.docx', 'last_modified': '2025-10-19T11:32:29', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': 'bb0410bfd160ef866f8d4357b0949db2'}
Content: Project Proposal: RAG Implementation


Element 2
Type: Title
Metadata: {'source': 'data/word_files/proposal.docx', 'category_depth': 0, 'file_directory': 'data/word_files', 'filename': 'proposal.docx', 'last_modified': '2025-10-19T11:32:29', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': 'c0f844859abf08d9506856b3aed4a719'}
Content: Executive Summary


Element 3
Type: NarrativeText
Metadata: {'source': 'data/word_files/proposal.docx', 'category_depth': 0, 'file_directory': 'data/word_files', 'filename': 'pr