In [1]:
!pip install llama-parse llama-index langchain langchain-community



**Basic LlamaParse Implementation**

LlamaParse extracts each page's content as markdown, preserving headers and recreating tables when the source PDF contains them.

In [2]:
import os
from getpass import getpass

from llama_parse import LlamaParse

# SECURITY: an API key used to be hardcoded in this cell. Treat that key as
# leaked and revoke/rotate it. The key is now taken from the environment, with
# an interactive prompt as a fallback, so the secret is never saved in the
# notebook source or its outputs.
if not os.environ.get("LLAMA_CLOUD_API_KEY"):
    os.environ["LLAMA_CLOUD_API_KEY"] = getpass("LlamaCloud API key: ")

# Initialize parser. No api_key argument is passed, so the parser is expected
# to pick the key up from the LLAMA_CLOUD_API_KEY environment variable.
parser = LlamaParse(
    result_type="markdown",  # or "text"
    verbose=True,
)

# Parse PDF (network call to the LlamaParse service)
documents = parser.load_data("/content/sample-local-pdf.pdf")

# Inspect structure
print(f"Number of documents: {len(documents)}")
if documents:
    print(f"\nFirst 1000 characters:\n{documents[0].text[:1000]}")
else:
    # Avoid an IndexError on documents[0] when nothing was parsed.
    print("\nNo documents were returned by the parser.")

# Save to markdown file for detailed inspection; documents are separated by a
# horizontal rule so boundaries stay visible in the rendered file.
with open("parsed_output.md", "w", encoding="utf-8") as f:
    for doc in documents:
        f.write(doc.text)
        f.write("\n\n---\n\n")




Started parsing the file under job_id 684926b3-3445-4460-ac72-8b6ad89402b7
Number of documents: 3

First 1000 characters:
Sample PDF
Created for testing PDFObject

This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all minutes are the same duration, and one cannot possibly be longer than the other? If these pages are all the same size, can one possibly be longer than the other?

I digress. Here’s some Latin. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante dapibus diam. Sed nisi. Nulla quis sem at nibh elementum imperdiet. Duis sagittis ipsum. Praesent mauris. Fusce nec tellus sed augue semper porta. Mauris massa. Vestibulum lacinia arcu eget nulla. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Curabitur sodales ligula in libero.

Sed dignissim lacinia nunc. Curabitur tortor. Pellente

In [23]:
from pprint import pprint

# Show the class of the parsed documents. A bare `type(...)` expression in the
# middle of a cell is evaluated and discarded, so it has to be printed to be
# visible; the guard avoids an IndexError when nothing was parsed.
if documents:
    print(type(documents[0]))

# Dump every field of each parsed document, followed by the fields of its
# nested text resource.
for page in documents:
    fields = dict(page)  # field name -> value, computed once per document
    pprint(fields)
    print()
    pprint(dict(fields["text_resource"]))
    print("-------------------------------------")

{'audio_resource': None,
 'embedding': None,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': 'a064dc3f-1c79-407f-af11-5928742b0d63',
 'image_resource': None,
 'metadata': {},
 'metadata_separator': '\n',
 'metadata_template': '{key}: {value}',
 'relationships': {},
 'text_resource': MediaResource(embeddings=None, data=None, text='Sample PDF\nCreated for testing PDFObject\n\nThis PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all minutes are the same duration, and one cannot possibly be longer than the other? If these pages are all the same size, can one possibly be longer than the other?\n\nI digress. Here’s some Latin. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante dapibus diam. Sed nisi. Nulla quis sem at nibh elementum imperdiet. Duis sagittis ipsum. Praesent mauris. Fusce nec tellus sed augue 

**LangChain Integration**
To use LlamaParse output with LangChain, wrap each parsed document in a LangChain `Document` and split it into chunks:

In [5]:
# NOTE: the legacy import paths `langchain.text_splitter` and `langchain.schema`
# failed with "module not found" in this environment, so the split-out
# packages are imported instead.
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Wrap every LlamaParse document in a LangChain Document, tagging each one
# with the path of the PDF it was parsed from.
langchain_docs = [
    Document(
        page_content=parsed.text,
        metadata={"source": "/content/sample-local-pdf.pdf"},
    )
    for parsed in documents
]

# Chunk the documents for RAG: 1000-character chunks with 200 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(langchain_docs)

# Report the chunk count and preview the first three chunks.
print(f"Total chunks created: {len(splits)}")
for number, chunk in enumerate(splits[:3], start=1):
    body = chunk.page_content
    print(f"\nChunk {number}:")
    print(f"Length: {len(body)}")
    print(f"Preview: {body[:200]}...")

Total chunks created: 13

Chunk 1:
Length: 822
Preview: Sample PDF
Created for testing PDFObject

This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all min...

Chunk 2:
Length: 781
Preview: Sed dignissim lacinia nunc. Curabitur tortor. Pellentesque nibh. Aenean quam. In scelerisque sem at dolor. Maecenas mattis. Sed convallis tristique sem. Proin ut ligula vel nunc egestas porttitor. Mor...

Chunk 3:
Length: 597
Preview: Suspendisse in justo eu magna luctus suscipit. Sed lectus. Integer euismod lacus luctus magna. Quisque cursus, metus vitae pharetra auctor, sem massa mattis sem, at interdum magna augue eget diam. Ves...


**Inspecting Parsed Structure**

In [6]:
import re

# Matches a markdown table delimiter row such as `|---|---|` or `| :--- | ---: |`.
# At least one pipe is required, so a plain `---` horizontal rule is not counted.
TABLE_DELIMITER_ROW = re.compile(r"^\s*\|?(\s*:?-{3,}:?\s*\|)+(\s*:?-{3,}:?\s*)?\s*$")

# Detailed structure analysis: per-document length, table count, headers and metadata
for idx, doc in enumerate(documents):
    lines = doc.text.split('\n')

    print(f"\n=== Document {idx+1} ===")
    print(f"Total length: {len(doc.text)} characters")

    # Check for tables (markdown format). Each table has exactly one delimiter
    # row, so count those rows; counting "|---" substrings would report one
    # hit per column and miss rows written with spaces (`| --- |`).
    tables = sum(1 for line in lines if TABLE_DELIMITER_ROW.match(line))
    print(f"Tables detected: {tables}")

    # Check for headers (lines starting with '#')
    headers = [line for line in lines if line.startswith('#')]
    print(f"Headers found: {len(headers)}")
    if headers:
        print("Sample headers:", headers[:5])

    # Check metadata if available
    if hasattr(doc, 'metadata'):
        print(f"Metadata: {doc.metadata}")



=== Document 1 ===
Total length: 2971 characters
Tables detected: 0
Headers found: 0
Metadata: {}

=== Document 2 ===
Total length: 3579 characters
Tables detected: 0
Headers found: 0
Metadata: {}

=== Document 3 ===
Total length: 2634 characters
Tables detected: 0
Headers found: 0
Metadata: {}


In [24]:
import os
from getpass import getpass

from llama_parse import LlamaParse

# SECURITY: an API key used to be hardcoded in this cell. Treat that key as
# leaked and revoke/rotate it. The key is now taken from the environment, with
# an interactive prompt as a fallback, so the secret is never saved in the
# notebook source or its outputs.
if not os.environ.get("LLAMA_CLOUD_API_KEY"):
    os.environ["LLAMA_CLOUD_API_KEY"] = getpass("LlamaCloud API key: ")

# Initialize parser. No api_key argument is passed, so the parser is expected
# to pick the key up from the LLAMA_CLOUD_API_KEY environment variable.
parser = LlamaParse(
    result_type="markdown",  # or "text"
    verbose=True,
)

# Parse PDF (network call to the LlamaParse service).
# NOTE: this rebinds `documents`; cells that read `documents` will see this
# PDF's pages from here on.
documents = parser.load_data("/content/file-example_PDF_1MB.pdf")

# Inspect structure
print(f"Number of documents: {len(documents)}")
if documents:
    print(f"\nFirst 1000 characters:\n{documents[0].text[:1000]}")
else:
    # Avoid an IndexError on documents[0] when nothing was parsed.
    print("\nNo documents were returned by the parser.")

# Save to markdown file for detailed inspection. Opening with "w" replaces any
# existing parsed_output.md in the working directory.
with open("parsed_output.md", "w", encoding="utf-8") as f:
    for doc in documents:
        f.write(doc.text)
        f.write("\n\n---\n\n")


Started parsing the file under job_id 3dad96a7-b219-49aa-bf0e-29765f07b81e
Number of documents: 30

First 1000 characters:

Lorem ipsum

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio.

Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla.

Maecenas mauris lectus, lobortis et purus mattis, b

In [26]:
from pprint import pprint

# Show the class of the parsed documents. A bare `type(...)` expression in the
# middle of a cell is evaluated and discarded, so it has to be printed to be
# visible; the guard avoids an IndexError when nothing was parsed.
if documents:
    print(type(documents[0]))

# Dump every field of the first five parsed documents, each followed by the
# fields of its nested text resource.
for page in documents[:5]:
    fields = dict(page)  # field name -> value, computed once per document
    pprint(fields)
    print()
    pprint(dict(fields["text_resource"]))
    print("-------------------------------------")

{'audio_resource': None,
 'embedding': None,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': '64842ca0-daf4-4cc9-b97d-5257df134249',
 'image_resource': None,
 'metadata': {},
 'metadata_separator': '\n',
 'metadata_template': '{key}: {value}',
 'relationships': {},
 'text_resource': MediaResource(embeddings=None, data=None, text='\nLorem ipsum\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio.\n\nVestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehi

In [27]:
# Install required packages
!pip install langchain langchain-community pypdf pymupdf pdfplumber unstructured

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unstructured
  Downloading unstructured-0.18.26-py3-none-any.whl.metadata (25 kB)
Collecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.15.0-py3-none-any.whl

In [33]:
from langchain_community.document_loaders import PyMuPDFLoader

# Read the PDF through LangChain's PyMuPDF loader.
# (Point the loader at "/content/sample-local-pdf.pdf" to try the smaller sample.)
loader = PyMuPDFLoader("/content/file-example_PDF_1MB.pdf")
docs = loader.load()

# Report how many Document objects the loader produced.
doc_total = len(docs)
print(f"Total documents: {doc_total}")


Total documents: 30


In [36]:
# Pretty-print every field of the second loaded document (index 1).
second_doc_fields = dict(docs[1])
pprint(second_doc_fields)

{'id': None,
 'metadata': {'author': '',
              'creationDate': "D:20170811232209+02'00'",
              'creationdate': '2017-08-11T23:22:09+02:00',
              'creator': 'Writer',
              'file_path': '/content/file-example_PDF_1MB.pdf',
              'format': 'PDF 1.4',
              'keywords': '',
              'modDate': '',
              'moddate': '',
              'page': 1,
              'producer': 'LibreOffice 4.2',
              'source': '/content/file-example_PDF_1MB.pdf',
              'subject': '',
              'title': '',
              'total_pages': 30,
              'trapped': ''},
 'page_content': 'In non mauris justo. Duis vehicula mi vel mi pretium, a '
                 'viverra erat efficitur. Cras aliquam\n'
                 'est ac eros varius, id iaculis dui auctor. Duis pretium '
                 'neque ligula, et pulvinar mi placerat\n'
                 'et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id '
                 'neque

In [None]:
# Inspect first document
doc = docs[0]
print("\n=== Document Metadata ===")
for key, value in doc.metadata.items():
    print(f"{key}: {value}")

print("\n=== Content Structure ===")
content = doc.page_content
print(f"Total content length: {len(content)} characters")

# Check for common elements
print(f"Number of lines: {len(content.split(chr(10)))}")
print(f"Number of words: {len(content.split())}")

# Preview content
print("\n=== First 500 characters ===")
print(content[:500])

# Extract by page mode.
# Reload the same PDF that `docs` came from instead of a placeholder path
# (a nonexistent "your_file.pdf" makes the loader fail). The loader records
# the path under the "source" metadata key; fall back to the path used when
# `docs` was loaded if that key is missing.
pdf_path = doc.metadata.get("source", "/content/file-example_PDF_1MB.pdf")
loader_by_page = PyMuPDFLoader(
    pdf_path,
    mode="page",  # Each page as separate document
)
pages = loader_by_page.load()

print("\n=== Per-Page Analysis ===")
print(f"Total pages: {len(pages)}")
for page_number, page in enumerate(pages[:5], start=1):  # First 5 pages
    print(f"Page {page_number}: {len(page.page_content)} chars")