In [None]:
# For LlamaIndex
!pip install llama-parse llama-index llama-index-embeddings-openai nltk

# For LangChain
!pip install langchain langchain-text-splitters langchain-community langchain-openai tiktoken
!pip install pypdf pymupdf pdfplumber unstructured


Collecting pymupdf
  Using cached pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Using cached pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
Collecting unstructured
  Using cached unstructured-0.18.26-py3-none-any.whl.metadata (25 kB)
Collecting pdfminer.six==20251230 (from pdfplumber)
  Using cached pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Using cached pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
Collecting python-magic (from unstructured)
  Using cached python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Using cached emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Using cached python_iso639-2025.11.16-py3-none-any.whl.metadata (15 kB)
Collecting langdetect (from unstructured)
  Using cached langdetect-1.0.9.tar.gz (981 kB)
  Preparing

In [None]:
!pip install python-dotenv



In [None]:
from pprint import pprint

In [None]:
from dotenv import load_dotenv
import os
import warnings

load_dotenv()  # loads .env into environment variables

# Read the LlamaCloud API key from the environment. os.getenv returns None
# when the variable is absent, so surface that here with a clear message
# instead of leaving a silent None for later cells to trip over.
secret_key = os.getenv("LLAMA_CLOUD_API_KEY")
if not secret_key:
    warnings.warn(
        "LLAMA_CLOUD_API_KEY is not set; add it to your .env file or "
        "environment before running the LlamaParse cells."
    )


**PDF Document Parsing Using LlamaParse**

In [None]:
import os
from llama_parse import LlamaParse

# Configure the parser client. result_type selects the output format:
# "markdown" is used here, "text" is the alternative.
# NOTE(review): no api_key is passed; presumably LlamaParse picks up
# LLAMA_CLOUD_API_KEY from the environment — confirm.
parser = LlamaParse(verbose=True, result_type="markdown")

# Parse the sample PDF; load_data returns a list of Document objects.
documents = parser.load_data("/content/sample-local-pdf.pdf")

# Cell output: (document count, container type, element type).
(len(documents), type(documents), type(documents[0]))



Started parsing the file under job_id b5d5d9c8-a3dd-487e-b0a5-3d14b9cf8c69


(3, list, llama_index.core.schema.Document)

In [None]:
# Pretty-print the first parsed Document as a dict to see every field it
# carries (id_, metadata, text_resource, relationships, ...).
pprint(dict(documents[0]))

{'audio_resource': None,
 'embedding': None,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': '09942564-dfa8-42c8-bfcc-a49cc4983fdb',
 'image_resource': None,
 'metadata': {},
 'metadata_separator': '\n',
 'metadata_template': '{key}: {value}',
 'relationships': {},
 'text_resource': MediaResource(embeddings=None, data=None, text='Sample PDF\nCreated for testing PDFObject\n\nThis PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all minutes are the same duration, and one cannot possibly be longer than the other? If these pages are all the same size, can one possibly be longer than the other?\n\nI digress. Here’s some Latin. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante dapibus diam. Sed nisi. Nulla quis sem at nibh elementum imperdiet. Duis sagittis ipsum. Praesent mauris. Fusce nec tellus sed augue 

**LlamaIndex Chunking Techniques**

LlamaIndex uses Node Parsers that convert Documents into Node objects, where each node is a chunk inheriting metadata from the parent document.

1. **SentenceSplitter (Basic & Recommended)**

In [None]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document

# Build two splitters that are identical except for the target chunk size
# (in tokens): splitter1 uses 1024, splitter2 uses 256. Both keep a 20-token
# overlap between consecutive chunks and share the same paragraph separator
# and secondary (sub-sentence) chunking regex.
splitter1, splitter2 = (
    SentenceSplitter(
        chunk_size=target_tokens,
        chunk_overlap=20,
        paragraph_separator="\n\n\n",
        secondary_chunking_regex="[^,.;。]+[,.;。]?",
    )
    for target_tokens in (1024, 256)
)

In [None]:
# Show splitter1's repr to inspect its full configuration, including the
# settings that were not passed explicitly (e.g. separator, include_metadata).
splitter1

SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7a3971caea20>, id_func=<function default_id_func at 0x7a397c980fe0>, chunk_size=1024, chunk_overlap=20, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。]+[,.;。]?')

In [None]:
# Run both splitters over the same parsed documents so the effect of the
# chunk size on the number of nodes can be compared side by side.
nodes_with_splitter1, nodes_with_splitter2 = (
    node_parser.get_nodes_from_documents(documents)
    for node_parser in (splitter1, splitter2)
)

# Report the container type and node count for each splitter.
print(
    f"Nodes with Splitter 1: \nType: {type(nodes_with_splitter1)}, \nNum_of_nodes: {len(nodes_with_splitter1)}",
    end="\n\n",
)
print(
    f"Nodes with Splitter 2: \nType: {type(nodes_with_splitter2)}, \nNum_of_nodes: {len(nodes_with_splitter2)}"
)


Nodes with Splitter 1: 
Type: <class 'list'>, 
Num_of_nodes: 4

Nodes with Splitter 2: 
Type: <class 'list'>, 
Num_of_nodes: 15


In [None]:
# Pretty-print every field of the first node produced by splitter1
# (character offsets, relationships, text, ...).
pprint(dict(nodes_with_splitter1[0]))

{'embedding': None,
 'end_char_idx': 2970,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': '36900db6-1d83-4ed6-b871-6b6870b270c5',
 'metadata': {},
 'metadata_separator': '\n',
 'metadata_seperator': '\n',
 'metadata_template': '{key}: {value}',
 'mimetype': 'text/plain',
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='09942564-dfa8-42c8-bfcc-a49cc4983fdb', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='92477fef8bfaa85668c659571b7bd25684c419d76f41bcbb48bb49adecc8082c')},
 'start_char_idx': 0,
 'text': 'Sample PDF\n'
         'Created for testing PDFObject\n'
         '\n'
         'This PDF is three pages long. Three long pages. Or three short pages '
         'if you’re optimistic. Is it the same as saying “three long minutes”, '
         'knowing that all minutes are the same duration, and one cannot '
         'possibly be longer than the other? If these pages are all the same '
         'size, can one possibly be

In [None]:
# Pretty-print every field of the first node produced by splitter2; compare
# its end_char_idx and relationships with the splitter1 node.
pprint(dict(nodes_with_splitter2[0]))

{'embedding': None,
 'end_char_idx': 869,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': '87c12de1-381a-4c0c-baf2-d2b635660a07',
 'metadata': {},
 'metadata_separator': '\n',
 'metadata_seperator': '\n',
 'metadata_template': '{key}: {value}',
 'mimetype': 'text/plain',
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='09942564-dfa8-42c8-bfcc-a49cc4983fdb', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='92477fef8bfaa85668c659571b7bd25684c419d76f41bcbb48bb49adecc8082c'),
                   <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='94893b92-e9f1-4058-ad76-8c803c7b63d9', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='f9030d151e905b83458cb3ceab0b58d6aa9a6abe2e261037f8a3ee32a9ef378a')},
 'start_char_idx': 0,
 'text': 'Sample PDF\n'
         'Created for testing PDFObject\n'
         '\n'
         'This PDF is three pages long. Three long pages. Or three short pages '
         'if you’re optimistic. Is it

In [None]:
# Pretty-print the second splitter2 node to see how a node in the middle of a
# document records its neighbouring nodes in `relationships`.
pprint(dict(nodes_with_splitter2[1]))

{'embedding': None,
 'end_char_idx': 1512,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': '94893b92-e9f1-4058-ad76-8c803c7b63d9',
 'metadata': {},
 'metadata_separator': '\n',
 'metadata_seperator': '\n',
 'metadata_template': '{key}: {value}',
 'mimetype': 'text/plain',
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='09942564-dfa8-42c8-bfcc-a49cc4983fdb', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='92477fef8bfaa85668c659571b7bd25684c419d76f41bcbb48bb49adecc8082c'),
                   <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='87c12de1-381a-4c0c-baf2-d2b635660a07', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='c7e0d9657a10dc7ff95928b556c33f5939e31edcacde7c7b26acee50e3359fab'),
                   <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='9531154a-162a-4776-8bf1-491d37954bb4', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='03e1cf68d62801060897ccaf9b7246811bef3776dfca4c65d00

In [None]:
# Inspect nodes : nodes_with_splitter1
# Print a summary of the chunking result, then details of the first node:
# its id, text length, metadata and the first 300 characters of its text.
print(f"Total nodes created: {len(nodes_with_splitter1)}")
print("\n=== First Node ===")
print(f"Node ID: {nodes_with_splitter1[0].node_id}")
print(f"Text length: {len(nodes_with_splitter1[0].text)} chars")
print(f"Metadata: {nodes_with_splitter1[0].metadata}")
print(f"\nContent preview:\n{nodes_with_splitter1[0].text[:300]}")

# Analyze relationships
print("\n=== Node Relationships ===")
print(f"Source doc: {nodes_with_splitter1[0].source_node}")
if len(nodes_with_splitter1) > 1:
    # `relationships` is keyed by NodeRelationship members, not plain strings,
    # so the previous `.relationships.get('next')` lookup could never match.
    # `next_node` resolves the NEXT relationship and is None when the node
    # has no successor.
    print(f"Next node: {nodes_with_splitter1[0].next_node}")

Total nodes created: 4

=== First Node ===
Node ID: 815cc067-83df-4a99-9de2-c8a3b1679844
Text length: 2970 chars
Metadata: {}

Content preview:
Sample PDF
Created for testing PDFObject

This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all minutes are the same duration, and one cannot possibly be longer than the other? If these pages are all

=== Node Relationships ===
Source doc: node_id='a138ae6d-a5a8-4b52-9e9d-4d3e68f58f92' node_type=<ObjectType.DOCUMENT: '4'> metadata={} hash='92477fef8bfaa85668c659571b7bd25684c419d76f41bcbb48bb49adecc8082c'
Next node: None


In [None]:
# Inspect nodes : nodes_with_splitter2
# Print a summary of the chunking result, then details of the first node:
# its id, text length, metadata and the first 300 characters of its text.
print(f"Total nodes created: {len(nodes_with_splitter2)}")
print("\n=== First Node ===")
print(f"Node ID: {nodes_with_splitter2[0].node_id}")
print(f"Text length: {len(nodes_with_splitter2[0].text)} chars")
print(f"Metadata: {nodes_with_splitter2[0].metadata}")
print(f"\nContent preview:\n{nodes_with_splitter2[0].text[:300]}")

# Analyze relationships
print("\n=== Node Relationships ===")
print(f"Source doc: {nodes_with_splitter2[0].source_node}")
if len(nodes_with_splitter2) > 1:
    # `relationships` is keyed by NodeRelationship members, not plain strings,
    # so the previous `.relationships.get('next')` lookup printed None even
    # though this node does have a NEXT relationship. `next_node` resolves
    # that relationship and is None only when the node has no successor.
    print(f"Next node: {nodes_with_splitter2[0].next_node}")

Total nodes created: 15

=== First Node ===
Node ID: 5833dd49-f87d-429f-866b-1e81975d89c4
Text length: 869 chars
Metadata: {}

Content preview:
Sample PDF
Created for testing PDFObject

This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all minutes are the same duration, and one cannot possibly be longer than the other? If these pages are all

=== Node Relationships ===
Source doc: node_id='a138ae6d-a5a8-4b52-9e9d-4d3e68f58f92' node_type=<ObjectType.DOCUMENT: '4'> metadata={} hash='92477fef8bfaa85668c659571b7bd25684c419d76f41bcbb48bb49adecc8082c'
Next node: None
